1 |
#include "GArgs.h" |
2 |
#include "GStr.h" |
3 |
#include "GHash.hh" |
4 |
#include "GList.hh" |
5 |
#include <ctype.h> |
6 |
|
7 |
#define USAGE "Usage:\n\ |
8 |
fqtrim [-5 <5adapter>] [-3 <3adapter>] [-a <min_matchlen>] [-p {64|33}] [-q <minq> [-t <trim_max>]]\\\n\ |
9 |
[-n <rename_prefix>] [-o <outsuffix>] [-z <zcmd>] [-r <discarded.lst>]\\\n\ |
10 |
[-l <minlen>] [-C] [-D] [-Q] <input.fq>[,<input_mates.fq>\n\ |
11 |
\n\ |
12 |
Trim low quality bases at the 3' end, optionally trim adapter sequence, filter\n\ |
13 |
for low complexity and collapse duplicate reads\n\ |
14 |
If read pairs should be trimmed and kept together (i.e. without discarding\n\ |
15 |
one read in a pair), the two file names should be given delimited by a comma\n\ |
16 |
or a colon character\n\ |
17 |
\n\ |
18 |
Options:\n\ |
19 |
-n rename all the reads using the <prefix> followed by a read counter;\n\ |
20 |
if -C option was given, the suffix \"_x<N>\" is appended, with <N> being\n\ |
21 |
the read duplication count\n\ |
22 |
-o write the trimmed/filtered reads to file(s) named <input>.<outsuffix>\n\ |
23 |
which will be created in the current (working) directory\n\ |
24 |
-5 trim the given adapter or primer sequence at the 5' end of each read\n\ |
25 |
(e.g. -5 CGACAGGTTCAGAGTTCTACAGTCCGACGATC)\n\ |
26 |
-3 trim the given adapter sequence at the 3' end of each read\n\ |
27 |
(e.g. -3 TCGTATGCCGTCTTCTGCTTG)\n\ |
28 |
-a minimum bases to match to adaptor sequence (default 5)\n\ |
29 |
-q trim bases with quality value lower than <minq> (starting at the 3' end)\n\ |
30 |
-t for -q option, maximum trimming at the 3' end is limited to <trim_max>\n\ |
31 |
-m maximum percentage of Ns allowed in a read after trimming (default 7)\n\ |
32 |
-l minimum \"clean\" length after trimming that a read must have\n\ |
33 |
in order to pass the filter (default: 16)\n\ |
34 |
-r write a simple \"trash report\" file listing the discarded reads with a\n\ |
35 |
one letter code for the reason why they were trashed\n\ |
36 |
-D apply a low-complexity (dust) filter and discard any read that has over\n\ |
37 |
50% of its length detected as low complexity\n\ |
38 |
-C collapse duplicate reads and add a prefix with their count to the read\n\ |
39 |
name (e.g. for microRNAs)\n\ |
40 |
-p input is phred64/phred33 (use -p64 or -p33)\n\ |
41 |
-Q convert quality values to the other Phred qv type\n\ |
42 |
-V verbose processing\n\ |
43 |
" |
44 |
|
45 |
//-z for -o option, the output stream(s) will be first piped into the given\n |
46 |
// <zcmd> command, which must output to stdout (e.g. -z 'bzip2 -9 -c')\n |
47 |
|
48 |
|
49 |
// example 3' adapter for miRNAs: TCGTATGCCGTCTTCTGCTTG |
50 |
|
51 |
//For pair ends sequencing: |
52 |
//3' : ACACTCTTTCCCTACACGACGCTCTTCCGATCT |
53 |
//5' : GATCGGAAGAGCGGTTCAGCAGGAATGCCGAG |
54 |
//FILE* f_out=NULL; //stdout if not provided |
55 |
//FILE* f_out2=NULL; //for paired reads |
56 |
//FILE* f_in=NULL; //input fastq (stdin if not provided) |
57 |
//FILE* f_in2=NULL; //for paired reads |
58 |
FILE* freport=NULL; |
59 |
|
60 |
bool debug=false; |
61 |
bool verbose=false; |
62 |
bool doCollapse=false; |
63 |
bool doDust=false; |
64 |
bool fastaOutput=false; |
65 |
bool trashReport=false; |
66 |
//bool rawFormat=false; |
67 |
int min_read_len=16; |
68 |
double max_perc_N=7.0; |
69 |
int dust_cutoff=16; |
70 |
bool isfasta=false; |
71 |
bool convert_phred=false; |
72 |
GStr outsuffix; // -o |
73 |
GStr adapter3; |
74 |
GStr adapter5; |
75 |
GStr prefix; |
76 |
GStr zcmd; |
77 |
int num_trimmed5=0; |
78 |
int num_trimmed3=0; |
79 |
int num_trimmedN=0; |
80 |
int num_trimmedQ=0; |
81 |
int min_trimmed5=INT_MAX; |
82 |
int min_trimmed3=INT_MAX; |
83 |
|
84 |
int qvtrim_qmin=0; |
85 |
int qvtrim_max=0; //for -q, do not trim at 3'-end more than this number of bases |
86 |
int qv_phredtype=0; // could be 64 or 33 (0 means undetermined yet) |
87 |
int qv_cvtadd=0; //could be -31 or +31 |
88 |
|
89 |
int a3len=0; |
90 |
int a5len=0; |
91 |
// adaptor matching metrics -- for extendMatch() function |
92 |
const int a_m_score=2; //match score |
93 |
const int a_mis_score=-3; //mismatch |
94 |
const int a_dropoff_score=7; |
95 |
int a_min_score=10; //an exact match of 5 bases at the proper ends WILL be trimmed |
96 |
const int a_min_chain_score=15; //for gapped alignments |
97 |
|
98 |
class CSegChain; |
99 |
|
100 |
class CSegPair { |
101 |
public: |
102 |
GSeg a; |
103 |
GSeg b; //the adapter segment |
104 |
int score; |
105 |
int flags; |
106 |
CSegChain* chain; |
107 |
CSegPair(int astart=0, int aend=0, int bstart=0, int bend=0, int mscore=0):a(astart,aend),b(bstart, bend) { |
108 |
score=mscore; |
109 |
if (score==0) score=a.len()*a_m_score; |
110 |
flags=0; |
111 |
chain=NULL; |
112 |
} |
113 |
int len() { return a.len(); } |
114 |
bool operator==(CSegPair& d){ |
115 |
//return (a.start==d.a.start && a.end==d.a.end && b.start==d.b.start && b.end==d.b.end); |
116 |
//make equal even segments that are included into one another: |
117 |
return (d.a.start>=a.start && d.a.end<=a.end && d.b.start>=b.start && d.b.end<=b.end); |
118 |
} |
119 |
bool operator>(CSegPair& d){ //ordering based on b (adaptor) start coord and score |
120 |
if (b.start==d.b.start) { |
121 |
if (score==d.score) { |
122 |
//just try to be consistent: |
123 |
if (b.end==d.b.end) { |
124 |
return (a.start==d.a.start)?(a.end<d.a.end):(a.start<d.a.start); |
125 |
} |
126 |
return (b.end>d.b.end); |
127 |
} |
128 |
else return (score<d.score); |
129 |
} |
130 |
return (b.start>d.b.start); |
131 |
} |
132 |
bool operator<(CSegPair& d){ //ordering based on b (adaptor) coord |
133 |
/*if (b.start==d.b.start && b.end==d.b.end) { |
134 |
return (a.start==d.a.start)?(a.end<d.a.end):(a.start<d.a.start); |
135 |
} |
136 |
return (b.start==d.b.start)?(b.end<d.b.end):(b.start<d.b.start);*/ |
137 |
if (b.start==d.b.start) { |
138 |
if (score==d.score) { |
139 |
//just try to be consistent: |
140 |
if (b.end==d.b.end) { |
141 |
return (a.start==d.a.start)?(a.end>d.a.end):(a.start>d.a.start); |
142 |
} |
143 |
return (b.end<d.b.end); |
144 |
} |
145 |
else return (score>d.score); |
146 |
} |
147 |
return (b.start<d.b.start); |
148 |
} |
149 |
}; |
150 |
|
151 |
int cmpSegEnds(pointer sa, pointer sb) { //sort by adaptor seg ends AND score |
152 |
CSegPair& x = *(CSegPair *)sa; |
153 |
CSegPair& y = *(CSegPair *)sb; |
154 |
/* |
155 |
if (x.b.end==y.b.end) { |
156 |
if (x.b.start==y.b.start) { |
157 |
if (x.a.end==y.a.end) { |
158 |
if (x.a.start==y.a.start) return 0; |
159 |
return ((x.a.start>y.a.start) ? -1 : 1); |
160 |
} |
161 |
else { |
162 |
return ((x.a.end>y.a.end) ? -1 : 1); |
163 |
} |
164 |
} |
165 |
else { |
166 |
return ((x.b.start>y.b.start) ? -1 : 1); |
167 |
} |
168 |
} |
169 |
else { |
170 |
return ((x.b.end>y.b.end) ? -1 : 1); |
171 |
} |
172 |
*/ |
173 |
if (x.b.end==y.b.end) { |
174 |
if (x.score==y.score) { |
175 |
if (x.b.start==y.b.start) { |
176 |
if (x.a.end==y.a.end) { |
177 |
if (x.a.start==y.a.start) return 0; |
178 |
return ((x.a.start<y.a.start) ? -1 : 1); |
179 |
} |
180 |
else { |
181 |
return ((x.a.end<y.a.end) ? -1 : 1); |
182 |
} |
183 |
} |
184 |
else { |
185 |
return ((x.b.start<y.b.start) ? -1 : 1); |
186 |
} |
187 |
} else return ((x.score>y.score) ? -1 : 1); |
188 |
} |
189 |
else { |
190 |
return ((x.b.end>y.b.end) ? -1 : 1); |
191 |
} |
192 |
|
193 |
} |
194 |
|
195 |
class CSegChain:public GList<CSegPair> { |
196 |
public: |
197 |
uint astart; |
198 |
uint aend; |
199 |
uint bstart; |
200 |
uint bend; |
201 |
int score; |
202 |
bool endSort; |
203 |
CSegChain(bool aln5=false):GList<CSegPair>(true,true,true) {//sorted, free elements, unique |
204 |
//as SegPairs are inserted, they will be sorted by a.start coordinate |
205 |
score=0; |
206 |
astart=MAX_UINT; |
207 |
aend=0; |
208 |
bstart=MAX_UINT; |
209 |
bend=0; |
210 |
endSort=aln5; |
211 |
if (aln5) { setSorted(cmpSegEnds); } |
212 |
} |
213 |
bool operator==(CSegChain& d) { |
214 |
//return (score==d.score); |
215 |
return (astart==d.astart && aend==d.aend && bstart==d.bstart && bend==d.bend); |
216 |
} |
217 |
bool operator>(CSegChain& d) { // order based on b (adaptor) coordinate |
218 |
//return (score<d.score); |
219 |
if (bstart==d.bstart && bend==d.bend) { |
220 |
return (astart==d.astart)?(aend>d.aend):(astart>d.astart); |
221 |
} |
222 |
return (bstart==d.bstart)?(bend>d.bend):(bstart>d.bstart); |
223 |
} |
224 |
bool operator<(CSegChain& d) { |
225 |
//return (score>d.score); |
226 |
if (bstart==d.bstart && bend==d.bend) { |
227 |
return (astart==d.astart)?(aend<d.aend):(astart<d.astart); |
228 |
} |
229 |
return (bstart==d.bstart)?(bend<d.bend):(bstart<d.bstart); |
230 |
} |
231 |
void addSegPair(CSegPair* segp) { |
232 |
if (AddIfNew(segp)!=segp) return; |
233 |
score+=segp->score; |
234 |
if (astart>segp->a.start) astart=segp->a.start; |
235 |
if (aend<segp->a.end) aend=segp->a.end; |
236 |
if (bstart>segp->b.start) bstart=segp->b.start; |
237 |
if (bend<segp->b.end) bend=segp->b.end; |
238 |
} |
239 |
//for building actual chains: |
240 |
bool extendChain(CSegPair* segp) { //segp expected to be "Greater Than" current chain |
241 |
int bgap=0; |
242 |
int agap=0; |
243 |
//if (endSort) { |
244 |
if (bstart>segp->b.start) { |
245 |
bgap = (int)(bstart-segp->b.end); |
246 |
if (abs(bgap)>2) return false; |
247 |
agap = (int)(astart-segp->a.end); |
248 |
if (abs(agap)>2) return false; |
249 |
} |
250 |
else { |
251 |
bgap = (int) (segp->b.start-bend); |
252 |
if (abs(bgap)>2) return false; |
253 |
agap = (int)(segp->a.start-aend); |
254 |
if (abs(agap)>2) return false; |
255 |
} |
256 |
if (agap*bgap<0) return false; |
257 |
addSegPair(segp); |
258 |
score-=abs(agap)+abs(bgap); |
259 |
return true; |
260 |
} |
261 |
}; |
262 |
|
263 |
// element in dhash: |
264 |
class FqDupRec { |
265 |
public: |
266 |
int count; //how many of these reads are the same |
267 |
int len; //length of qv |
268 |
char* firstname; //optional, only if we want to keep the original read names |
269 |
char* qv; |
270 |
FqDupRec(GStr* qstr=NULL, const char* rname=NULL) { |
271 |
len=0; |
272 |
qv=NULL; |
273 |
firstname=NULL; |
274 |
count=0; |
275 |
if (qstr!=NULL) { |
276 |
qv=Gstrdup(qstr->chars()); |
277 |
len=qstr->length(); |
278 |
count++; |
279 |
} |
280 |
if (rname!=NULL) firstname=Gstrdup(rname); |
281 |
} |
282 |
~FqDupRec() { |
283 |
GFREE(qv); |
284 |
GFREE(firstname); |
285 |
} |
286 |
void add(GStr& d) { //collapse another record into this one |
287 |
if (d.length()!=len) |
288 |
GError("Error at FqDupRec::add(): cannot collapse reads with different length!\n"); |
289 |
count++; |
290 |
for (int i=0;i<len;i++) |
291 |
qv[i]=(qv[i]+d[i])/2; |
292 |
} |
293 |
}; |
294 |
|
295 |
void openfw(FILE* &f, GArgs& args, char opt) { |
296 |
GStr s=args.getOpt(opt); |
297 |
if (!s.is_empty()) { |
298 |
if (s=='-') f=stdout; |
299 |
else { |
300 |
f=fopen(s.chars(),"w"); |
301 |
if (f==NULL) GError("Error creating file: %s\n", s.chars()); |
302 |
} |
303 |
} |
304 |
} |
305 |
|
306 |
#define FWCLOSE(fh) if (fh!=NULL && fh!=stdout) fclose(fh) |
307 |
#define FRCLOSE(fh) if (fh!=NULL && fh!=stdin) fclose(fh) |
308 |
|
309 |
GHash<FqDupRec> dhash; //hash to keep track of duplicates |
310 |
|
311 |
void setupFiles(FILE*& f_in, FILE*& f_in2, FILE*& f_out, FILE*& f_out2, |
312 |
GStr& s, GStr& infname, GStr& infname2); |
313 |
// uses outsuffix to generate output file names and open file handles as needed |
314 |
|
315 |
void writeRead(FILE* f_out, GStr& rname, GStr& rinfo, GStr& rseq, GStr& rqv, int& outcounter); |
316 |
void trash_report(char trashcode, GStr& rname, FILE* freport); |
317 |
|
318 |
bool getFastxRec(GLineReader& fq, GStr& rseq, GStr& rqv, |
319 |
GStr& rname, GStr& rinfo, GStr& infname); |
320 |
|
321 |
char process_read(GStr& rname, GStr& rseq, GStr& rqv, int &l5, int &l3); |
322 |
//returns 0 if the read was untouched, 1 if it was trimmed and a trash code if it was trashed |
323 |
|
324 |
bool ntrim(GStr& rseq, int &l5, int &l3); //returns true if any trimming occured |
325 |
bool qtrim(GStr& qvs, int &l5, int &l3); //return true if any trimming occured |
326 |
int dust(GStr& seq); |
327 |
bool trim_adapter3(GStr& seq, int &l5, int &l3); //returns true if any trimming occured |
328 |
bool trim_adapter5(GStr& seq, int &l5, int &l3); //returns true if any trimming occured |
329 |
|
330 |
void convertPhred(char* q, int len); |
331 |
void convertPhred(GStr& q); |
332 |
|
333 |
int main(int argc, char * const argv[]) { |
334 |
GArgs args(argc, argv, "YQDCVl:d:3:5:m:n:r:p:q:t:o:z:a:"); |
335 |
int e; |
336 |
if ((e=args.isError())>0) { |
337 |
GMessage("%s\nInvalid argument: %s\n", USAGE, argv[e]); |
338 |
exit(224); |
339 |
} |
340 |
debug=(args.getOpt('Y')!=NULL); |
341 |
verbose=(args.getOpt('V')!=NULL); |
342 |
convert_phred=(args.getOpt('Q')!=NULL); |
343 |
doCollapse=(args.getOpt('C')!=NULL); |
344 |
doDust=(args.getOpt('D')!=NULL); |
345 |
/* |
346 |
rawFormat=(args.getOpt('R')!=NULL); |
347 |
if (rawFormat) { |
348 |
GError("Sorry, raw qseq format parsing is not implemented yet!\n"); |
349 |
} |
350 |
*/ |
351 |
prefix=args.getOpt('n'); |
352 |
GStr s=args.getOpt('l'); |
353 |
if (!s.is_empty()) |
354 |
min_read_len=s.asInt(); |
355 |
s=args.getOpt('m'); |
356 |
if (!s.is_empty()) |
357 |
max_perc_N=s.asDouble(); |
358 |
s=args.getOpt('d'); |
359 |
if (!s.is_empty()) { |
360 |
dust_cutoff=s.asInt(); |
361 |
doDust=true; |
362 |
} |
363 |
s=args.getOpt('q'); |
364 |
if (!s.is_empty()) { |
365 |
qvtrim_qmin=s.asInt(); |
366 |
} |
367 |
s=args.getOpt('t'); |
368 |
if (!s.is_empty()) { |
369 |
qvtrim_max=s.asInt(); |
370 |
} |
371 |
s=args.getOpt('p'); |
372 |
if (!s.is_empty()) { |
373 |
int v=s.asInt(); |
374 |
if (v==33) { |
375 |
qv_phredtype=33; |
376 |
qv_cvtadd=31; |
377 |
} |
378 |
else if (v==64) { |
379 |
qv_phredtype=64; |
380 |
qv_cvtadd=-31; |
381 |
} |
382 |
else |
383 |
GMessage("%s\nInvalid value for -p option (can only be 64 or 33)!\n",USAGE); |
384 |
} |
385 |
if (args.getOpt('3')!=NULL) { |
386 |
adapter3=args.getOpt('3'); |
387 |
adapter3.upper(); |
388 |
a3len=adapter3.length(); |
389 |
} |
390 |
if (args.getOpt('5')!=NULL) { |
391 |
adapter5=args.getOpt('5'); |
392 |
adapter5.upper(); |
393 |
a5len=adapter5.length(); |
394 |
} |
395 |
s=args.getOpt('a'); |
396 |
if (!s.is_empty()) { |
397 |
int a_minmatch=s.asInt(); |
398 |
a_min_score=a_minmatch<<1; |
399 |
} |
400 |
|
401 |
if (args.getOpt('o')!=NULL) outsuffix=args.getOpt('o'); |
402 |
trashReport= (args.getOpt('r')!=NULL); |
403 |
int fcount=args.startNonOpt(); |
404 |
if (fcount==0) { |
405 |
GMessage(USAGE); |
406 |
exit(224); |
407 |
} |
408 |
if (fcount>1 && doCollapse) { |
409 |
GError("%s Sorry, the -C option only works with a single input.\n", USAGE); |
410 |
} |
411 |
//openfw(f_out, args, 'o'); |
412 |
//if (f_out==NULL) f_out=stdout; |
413 |
if (trashReport) |
414 |
openfw(freport, args, 'r'); |
415 |
char* infile=NULL; |
416 |
while ((infile=args.nextNonOpt())!=NULL) { |
417 |
int incounter=0; //counter for input reads |
418 |
int outcounter=0; //counter for output reads |
419 |
int trash_s=0; //too short from the get go |
420 |
int trash_Q=0; |
421 |
int trash_N=0; |
422 |
int trash_D=0; |
423 |
int trash_A3=0; |
424 |
int trash_A5=0; |
425 |
s=infile; |
426 |
GStr infname; |
427 |
GStr infname2; |
428 |
FILE* f_in=NULL; |
429 |
FILE* f_in2=NULL; |
430 |
FILE* f_out=NULL; |
431 |
FILE* f_out2=NULL; |
432 |
bool paired_reads=false; |
433 |
setupFiles(f_in, f_in2, f_out, f_out2, s, infname, infname2); |
434 |
GLineReader fq(f_in); |
435 |
GLineReader* fq2=NULL; |
436 |
if (f_in2!=NULL) { |
437 |
fq2=new GLineReader(f_in2); |
438 |
paired_reads=true; |
439 |
} |
440 |
GStr rseq, rqv, seqid, seqinfo; |
441 |
GStr rseq2, rqv2, seqid2, seqinfo2; |
442 |
while (getFastxRec(fq, rseq, rqv, seqid, seqinfo, infname)) { |
443 |
incounter++; |
444 |
int a5=0, a3=0, b5=0, b3=0; |
445 |
char tcode=0, tcode2=0; |
446 |
tcode=process_read(seqid, rseq, rqv, a5, a3); |
447 |
//if (!doCollapse) { |
448 |
if (fq2!=NULL) { |
449 |
getFastxRec(*fq2, rseq2, rqv2, seqid2, seqinfo2, infname2); |
450 |
if (seqid.substr(0,seqid.length()-1)!=seqid2.substr(0,seqid2.length()-1)) { |
451 |
GError("Error: no paired match for read %s vs %s (%s,%s)\n", |
452 |
seqid.chars(), seqid2.chars(), infname.chars(), infname2.chars()); |
453 |
} |
454 |
tcode2=process_read(seqid2, rseq2, rqv2, b5, b3); |
455 |
//decide what to do with this pair and print rseq2 if the pair makes it |
456 |
if (tcode>1 && tcode2<=1) { |
457 |
//"untrash" rseq |
458 |
if (a3-a5+1<min_read_len) { |
459 |
a5=1; |
460 |
if (a3<min_read_len) { a3= GMIN(rseq.length()-1, min_read_len+1); } |
461 |
} |
462 |
tcode=1; |
463 |
} |
464 |
else if (tcode<=1 && tcode2>1) { |
465 |
//"untrash" rseq2 |
466 |
if (b3-b5+1<min_read_len) { |
467 |
b5=1; |
468 |
if (b3<min_read_len) { b3= GMIN((rseq2.length()-1),(min_read_len+1)); } |
469 |
} |
470 |
tcode2=1; |
471 |
} |
472 |
if (tcode<=1) { //trimmed or left intact -- write it! |
473 |
if (tcode>0) { |
474 |
rseq2=rseq2.substr(b5,b3-b5+1); |
475 |
if (!rqv2.is_empty()) rqv2=rqv2.substr(b5,b3-b5+1); |
476 |
} |
477 |
int nocounter=0; |
478 |
writeRead(f_out2, seqid2, seqinfo2, rseq2, rqv2, nocounter); |
479 |
} |
480 |
} //paired read |
481 |
// } |
482 |
if (tcode>1) { //trashed |
483 |
if (tcode=='s') trash_s++; |
484 |
else if (tcode=='Q') trash_Q++; |
485 |
else if (tcode=='N') trash_N++; |
486 |
else if (tcode=='D') trash_D++; |
487 |
else if (tcode=='3') trash_A3++; |
488 |
else if (tcode=='5') trash_A5++; |
489 |
if (trashReport) trash_report(tcode, seqid, freport); |
490 |
} |
491 |
else if (!doCollapse) { //write it |
492 |
if (tcode>0) { |
493 |
rseq=rseq.substr(a5,a3-a5+1); |
494 |
if (!rqv.is_empty()) rqv=rqv.substr(a5,a3-a5+1); |
495 |
} |
496 |
writeRead(f_out, seqid, seqinfo, rseq, rqv, outcounter); |
497 |
} |
498 |
} //for each fastq record |
499 |
delete fq2; |
500 |
FRCLOSE(f_in); |
501 |
FRCLOSE(f_in2); |
502 |
if (doCollapse) { |
503 |
outcounter=0; |
504 |
int maxdup_count=1; |
505 |
char* maxdup_seq=NULL; |
506 |
dhash.startIterate(); |
507 |
FqDupRec* qd=NULL; |
508 |
char* seq=NULL; |
509 |
while ((qd=dhash.NextData(seq))!=NULL) { |
510 |
GStr rseq(seq); |
511 |
//do the dusting here |
512 |
if (doDust) { |
513 |
int dustbases=dust(rseq); |
514 |
if (dustbases>(rseq.length()>>1)) { |
515 |
if (trashReport && qd->firstname!=NULL) { |
516 |
fprintf(freport, "%s_x%d\tD\n",qd->firstname, qd->count); |
517 |
} |
518 |
trash_D+=qd->count; |
519 |
continue; |
520 |
} |
521 |
} |
522 |
outcounter++; |
523 |
if (qd->count>maxdup_count) { |
524 |
maxdup_count=qd->count; |
525 |
maxdup_seq=seq; |
526 |
} |
527 |
if (isfasta) { |
528 |
if (prefix.is_empty()) { |
529 |
fprintf(f_out, ">%s_x%d\n%s\n", qd->firstname, qd->count, |
530 |
rseq.chars()); |
531 |
} |
532 |
else { //use custom read name |
533 |
fprintf(f_out, ">%s%08d_x%d\n%s\n", prefix.chars(), outcounter, |
534 |
qd->count, rseq.chars()); |
535 |
} |
536 |
} |
537 |
else { //fastq format |
538 |
if (convert_phred) convertPhred(qd->qv, qd->len); |
539 |
if (prefix.is_empty()) { |
540 |
fprintf(f_out, "@%s_x%d\n%s\n+\n%s\n", qd->firstname, qd->count, |
541 |
rseq.chars(), qd->qv); |
542 |
} |
543 |
else { //use custom read name |
544 |
fprintf(f_out, "@%s%08d_x%d\n%s\n+\n%s\n", prefix.chars(), outcounter, |
545 |
qd->count, rseq.chars(), qd->qv); |
546 |
} |
547 |
} |
548 |
}//for each element of dhash |
549 |
if (maxdup_count>1) { |
550 |
GMessage("Maximum read multiplicity: x %d (read: %s)\n",maxdup_count, maxdup_seq); |
551 |
} |
552 |
} //collapse entries |
553 |
if (verbose) { |
554 |
if (paired_reads) { |
555 |
GMessage(">Input files : %s , %s\n", infname.chars(), infname2.chars()); |
556 |
GMessage("Number of input pairs :%9d\n", incounter); |
557 |
GMessage(" Output pairs :%9d\n", outcounter); |
558 |
} |
559 |
else { |
560 |
GMessage(">Input file : %s\n", infname.chars()); |
561 |
GMessage("Number of input reads :%9d\n", incounter); |
562 |
GMessage(" Output reads :%9d\n", outcounter); |
563 |
} |
564 |
GMessage("------------------------------------\n"); |
565 |
if (num_trimmed5) |
566 |
GMessage(" 5' trimmed :%9d (min. trim: %d)\n", num_trimmed5, min_trimmed5); |
567 |
if (num_trimmed3) |
568 |
GMessage(" 3' trimmed :%9d (min. trim: %d)\n", num_trimmed3, min_trimmed3); |
569 |
GMessage("------------------------------------\n"); |
570 |
if (trash_s>0) |
571 |
GMessage(" Trashed by length:%9d\n", trash_s); |
572 |
if (trash_N>0) |
573 |
GMessage(" Trashed by N%%:%9d\n", trash_N); |
574 |
if (trash_Q>0) |
575 |
GMessage("Trashed by low quality:%9d\n", trash_Q); |
576 |
if (trash_A5>0) |
577 |
GMessage(" Trashed by 5' adapter:%9d\n", trash_A5); |
578 |
if (trash_A3>0) |
579 |
GMessage(" Trashed by 3' adapter:%9d\n", trash_A3); |
580 |
} |
581 |
if (trashReport) { |
582 |
FWCLOSE(freport); |
583 |
} |
584 |
FWCLOSE(f_out); |
585 |
FWCLOSE(f_out2); |
586 |
} //while each input file |
587 |
|
588 |
//getc(stdin); |
589 |
} |
590 |
|
591 |
class NData { |
592 |
public: |
593 |
int NPos[1024]; //there should be no reads longer than 1K ? |
594 |
int NCount; |
595 |
int end5; |
596 |
int end3; |
597 |
int seqlen; |
598 |
double perc_N; //percentage of Ns in end5..end3 range only! |
599 |
const char* seq; |
600 |
bool valid; |
601 |
NData() { |
602 |
NCount=0; |
603 |
end5=0; |
604 |
end3=0; |
605 |
seq=NULL; |
606 |
perc_N=0; |
607 |
valid=true; |
608 |
} |
609 |
void init(GStr& rseq) { |
610 |
NCount=0; |
611 |
perc_N=0; |
612 |
seqlen=rseq.length(); |
613 |
seq=rseq.chars(); |
614 |
end5=1; |
615 |
end3=seqlen; |
616 |
for (int i=0;i<seqlen;i++) |
617 |
if (seq[i]=='N') {// if (!ichrInStr(rseq[i], "ACGT") |
618 |
NPos[NCount]=i; |
619 |
NCount++; |
620 |
} |
621 |
perc_N=(NCount*100.0)/seqlen; |
622 |
valid=true; |
623 |
} |
624 |
void N_calc() { //only in the region after trimming |
625 |
int n=0; |
626 |
for (int i=end3-1;i<end5;i++) { |
627 |
if (seq[i]=='N') n++; |
628 |
} |
629 |
perc_N=(n*100.0)/(end5-end3+1); |
630 |
} |
631 |
}; |
632 |
|
633 |
static NData feat; |
634 |
int perc_lenN=12; // incremental distance from ends, in percentage of |
635 |
// sequence length, where N-trimming is done (default:12 %) (autolimited to 20) |
636 |
|
637 |
void N_analyze(int l5, int l3, int p5, int p3) { |
638 |
/* assumes feat was filled properly */ |
639 |
int old_dif, t5,t3,v; |
640 |
if (l3<l5+2 || p5>p3 ) { |
641 |
feat.end5=l5+1; |
642 |
feat.end3=l3+1; |
643 |
return; |
644 |
} |
645 |
|
646 |
t5=feat.NPos[p5]-l5; |
647 |
t3=l3-feat.NPos[p3]; |
648 |
old_dif=p3-p5; |
649 |
v=(int)((((double)(l3-l5))*perc_lenN)/100); |
650 |
if (v>20) v=20; /* enforce N-search limit for very long reads */ |
651 |
if (t5 < v ) { |
652 |
l5=feat.NPos[p5]+1; |
653 |
p5++; |
654 |
} |
655 |
if (t3 < v) { |
656 |
l3=feat.NPos[p3]-1; |
657 |
p3--; |
658 |
} |
659 |
/* restNs=p3-p5; number of Ns in the new CLR */ |
660 |
if (p3-p5==old_dif) { /* no change, return */ |
661 |
/* don't trim if this may shorten the read too much */ |
662 |
//if (l5-l3<min_read_len) return; |
663 |
feat.end5=l5+1; |
664 |
feat.end3=l3+1; |
665 |
return; |
666 |
} |
667 |
else |
668 |
N_analyze(l5,l3, p5,p3); |
669 |
} |
670 |
|
671 |
|
672 |
bool qtrim(GStr& qvs, int &l5, int &l3) { |
673 |
if (qvtrim_qmin==0 || qvs.is_empty()) return false; |
674 |
if (qv_phredtype==0) { |
675 |
//try to guess the Phred type |
676 |
int vmin=256, vmax=0; |
677 |
for (int i=0;i<qvs.length();i++) { |
678 |
if (vmin>qvs[i]) vmin=qvs[i]; |
679 |
if (vmax<qvs[i]) vmax=qvs[i]; |
680 |
} |
681 |
if (vmin<64) { qv_phredtype=33; qv_cvtadd=31; } |
682 |
if (vmax>95) { qv_phredtype=64; qv_cvtadd=-31; } |
683 |
if (qv_phredtype==0) { |
684 |
GError("Error: couldn't determine Phred type, please use the -p33 or -p64 !\n"); |
685 |
} |
686 |
if (verbose) |
687 |
GMessage("Input reads have Phred-%d quality values.\n", (qv_phredtype==33 ? 33 : 64)); |
688 |
} //guessing Phred type |
689 |
for (l3=qvs.length()-1;l3>2;l3--) { |
690 |
if (qvs[l3]-qv_phredtype>=qvtrim_qmin && qvs[l3-1]-qv_phredtype>=qvtrim_qmin) break; |
691 |
} |
692 |
//just in case, check also the 5' the end (?) |
693 |
for (l5=0;l5<qvs.length()-3;l5++) { |
694 |
if (qvs[l5]-qv_phredtype>=qvtrim_qmin && qvs[l5+1]-qv_phredtype>=qvtrim_qmin) break; |
695 |
} |
696 |
if (qvtrim_max>0) { |
697 |
if (qvs.length()-1-l3>qvtrim_max) l3=qvs.length()-1-qvtrim_max; |
698 |
if (l5>qvtrim_max) l5=qvtrim_max; |
699 |
} |
700 |
return (l5>0 || l3<qvs.length()-1); |
701 |
} |
702 |
|
703 |
bool ntrim(GStr& rseq, int &l5, int &l3) { |
704 |
//count Ns in the sequence, trim N-rich ends |
705 |
feat.init(rseq); |
706 |
l5=feat.end5-1; |
707 |
l3=feat.end3-1; |
708 |
N_analyze(feat.end5-1, feat.end3-1, 0, feat.NCount-1); |
709 |
if (l5==feat.end5-1 && l3==feat.end3-1) { |
710 |
if (feat.perc_N>max_perc_N) { |
711 |
feat.valid=false; |
712 |
l3=l5+1; |
713 |
return true; |
714 |
} |
715 |
else { |
716 |
return false; //nothing changed |
717 |
} |
718 |
} |
719 |
l5=feat.end5-1; |
720 |
l3=feat.end3-1; |
721 |
if (l3-l5+1<min_read_len) { |
722 |
feat.valid=false; |
723 |
return true; |
724 |
} |
725 |
feat.N_calc(); |
726 |
|
727 |
if (feat.perc_N>max_perc_N) { |
728 |
feat.valid=false; |
729 |
l3=l5+1; |
730 |
return true; |
731 |
} |
732 |
return true; |
733 |
} |
734 |
|
735 |
//--------------- dust functions ---------------- |
736 |
class DNADuster { |
737 |
public: |
738 |
int dustword; |
739 |
int dustwindow; |
740 |
int dustwindow2; |
741 |
int dustcutoff; |
742 |
int mv, iv, jv; |
743 |
int counts[32*32*32]; |
744 |
int iis[32*32*32]; |
745 |
DNADuster(int cutoff=16, int winsize=32, int wordsize=3) { |
746 |
dustword=wordsize; |
747 |
dustwindow=winsize; |
748 |
dustwindow2 = (winsize>>1); |
749 |
dustcutoff=cutoff; |
750 |
mv=0; |
751 |
iv=0; |
752 |
jv=0; |
753 |
} |
754 |
void setWindowSize(int value) { |
755 |
dustwindow = value; |
756 |
dustwindow2 = (dustwindow >> 1); |
757 |
} |
758 |
void setWordSize(int value) { |
759 |
dustword=value; |
760 |
} |
761 |
void wo1(int len, const char* s, int ivv) { |
762 |
int i, ii, j, v, t, n, n1, sum; |
763 |
int js, nis; |
764 |
n = 32 * 32 * 32; |
765 |
n1 = n - 1; |
766 |
nis = 0; |
767 |
i = 0; |
768 |
ii = 0; |
769 |
sum = 0; |
770 |
v = 0; |
771 |
for (j=0; j < len; j++, s++) { |
772 |
ii <<= 5; |
773 |
if (*s<=32) { |
774 |
i=0; |
775 |
continue; |
776 |
} |
777 |
ii |= *s - 'A'; //assume uppercase! |
778 |
ii &= n1; |
779 |
i++; |
780 |
if (i >= dustword) { |
781 |
for (js=0; js < nis && iis[js] != ii; js++) ; |
782 |
if (js == nis) { |
783 |
iis[nis] = ii; |
784 |
counts[ii] = 0; |
785 |
nis++; |
786 |
} |
787 |
if ((t = counts[ii]) > 0) { |
788 |
sum += t; |
789 |
v = 10 * sum / j; |
790 |
if (mv < v) { |
791 |
mv = v; |
792 |
iv = ivv; |
793 |
jv = j; |
794 |
} |
795 |
} |
796 |
counts[ii]++; |
797 |
} |
798 |
} |
799 |
} |
800 |
|
801 |
int wo(int len, const char* s, int* beg, int* end) { |
802 |
int i, l1; |
803 |
l1 = len - dustword + 1; |
804 |
if (l1 < 0) { |
805 |
*beg = 0; |
806 |
*end = len - 1; |
807 |
return 0; |
808 |
} |
809 |
mv = 0; |
810 |
iv = 0; |
811 |
jv = 0; |
812 |
for (i=0; i < l1; i++) { |
813 |
wo1(len-i, s+i, i); |
814 |
} |
815 |
*beg = iv; |
816 |
*end = iv + jv; |
817 |
return mv; |
818 |
} |
819 |
|
820 |
void dust(const char* seq, char* seqmsk, int seqlen, int cutoff=0) { //, maskFunc maskfn) { |
821 |
int i, j, l, a, b, v; |
822 |
if (cutoff==0) cutoff=dustcutoff; |
823 |
a=0;b=0; |
824 |
//GMessage("Dust cutoff=%d\n", cutoff); |
825 |
for (i=0; i < seqlen; i += dustwindow2) { |
826 |
l = (seqlen > i+dustwindow) ? dustwindow : seqlen-i; |
827 |
v = wo(l, seq+i, &a, &b); |
828 |
if (v > cutoff) { |
829 |
//for (j = a; j <= b && j < dustwindow2; j++) { |
830 |
for (j = a; j <= b; j++) { |
831 |
seqmsk[i+j]='N';//could be made lowercase instead |
832 |
} |
833 |
} |
834 |
} |
835 |
//return first; |
836 |
} |
837 |
}; |
838 |
|
839 |
static DNADuster duster; |
840 |
|
841 |
int dust(GStr& rseq) { |
842 |
char* seq=Gstrdup(rseq.chars()); |
843 |
duster.dust(rseq.chars(), seq, rseq.length(), dust_cutoff); |
844 |
//check the number of Ns: |
845 |
int ncount=0; |
846 |
for (int i=0;i<rseq.length();i++) { |
847 |
if (seq[i]=='N') ncount++; |
848 |
} |
849 |
GFREE(seq); |
850 |
return ncount; |
851 |
} |
852 |
|
853 |
|
854 |
// ------------------ adapter matching - simple k-mer seed & extend, no indels for now |
855 |
//when a k-mer match is found, simply try to extend the alignment using a drop-off scheme |
856 |
//check minimum score and |
857 |
//for 3' adapter trimming: |
858 |
// require that the right end of the alignment for either the adaptor OR the read must be |
859 |
// < 3 distance from its right end |
860 |
// for 5' adapter trimming: |
861 |
// require that the left end of the alignment for either the adaptor OR the read must |
862 |
// be at coordinate < 3 from start |
863 |
|
864 |
bool extendMatch(const char* a, int alen, int ai, |
865 |
const char* b, int blen, int bi, int mlen, int& l5, int& l3, CSegChain& segs, bool end5=false) { |
866 |
//so the alignment starts at ai in a, bi in b, with a perfect match of length mlen |
867 |
#ifdef DEBUG |
868 |
GStr dbg(b); |
869 |
#endif |
870 |
//if (debug) { |
871 |
// GMessage(">> in %s\n\textending hit: %s at position %d\n", a, (dbg.substr(bi, mlen)).chars(), ai); |
872 |
// } |
873 |
int a_l=ai; //alignment coordinates on a |
874 |
int a_r=ai+mlen-1; |
875 |
int b_l=bi; //alignment coordinates on b |
876 |
int b_r=bi+mlen-1; |
877 |
int ai_maxscore=ai; |
878 |
int bi_maxscore=bi; |
879 |
int score=mlen*a_m_score; |
880 |
int maxscore=score; |
881 |
int mism5score=a_mis_score; |
882 |
if (end5 && ai<(alen>>1)) mism5score-=2; // increase penalty for mismatches at 5' end |
883 |
//try to extend to the left first, if possible |
884 |
while (ai>0 && bi>0) { |
885 |
ai--; |
886 |
bi--; |
887 |
score+= (a[ai]==b[bi])? a_m_score : mism5score; |
888 |
if (score>maxscore) { |
889 |
ai_maxscore=ai; |
890 |
bi_maxscore=bi; |
891 |
maxscore=score; |
892 |
} |
893 |
else if (maxscore-score>a_dropoff_score) break; |
894 |
} |
895 |
a_l=ai_maxscore; |
896 |
b_l=bi_maxscore; |
897 |
//if (debug) GMessage(" after l-extend: %*s%s\t\t(score=%d)\n",a_l," ",dbg.substr(b_l,b_r-b_l+1).chars(),maxscore); |
898 |
//now extend to the right |
899 |
ai_maxscore=a_r; |
900 |
bi_maxscore=b_r; |
901 |
ai=a_r; |
902 |
bi=b_r; |
903 |
score=maxscore; |
904 |
//sometimes there are extra AAAAs at the end of the read, ignore those |
905 |
if (strcmp(&a[alen-4],"AAAA")==0) { |
906 |
alen-=3; |
907 |
while (a[alen-1]=='A' && alen>ai) alen--; |
908 |
} |
909 |
while (ai<alen-1 && bi<blen-1) { |
910 |
ai++; |
911 |
bi++; |
912 |
//score+= (a[ai]==b[bi])? a_m_score : a_mis_score; |
913 |
if (a[ai]==b[bi]) { //match |
914 |
score+=a_m_score; |
915 |
if (ai>=alen-2) { |
916 |
score+=a_m_score-(alen-ai-1); |
917 |
} |
918 |
} |
919 |
else { //mismatch |
920 |
score+=a_mis_score; |
921 |
} |
922 |
if (score>maxscore) { |
923 |
ai_maxscore=ai; |
924 |
bi_maxscore=bi; |
925 |
maxscore=score; |
926 |
} |
927 |
else if (maxscore-score>a_dropoff_score) break; |
928 |
} |
929 |
a_r=ai_maxscore; |
930 |
b_r=bi_maxscore; |
931 |
int a_ovh3=alen-a_r-1; |
932 |
int b_ovh3=blen-b_r-1; |
933 |
int mmovh3=(a_ovh3<b_ovh3)? a_ovh3 : b_ovh3; |
934 |
int mmovh5=(a_l<b_l)? a_l : b_l; |
935 |
//if (debug) GMessage(" after r-extend: %*s%s\t\t(score=%d)\n",a_l," ",dbg.substr(b_l,b_r-b_l+1).chars(),maxscore); |
936 |
#ifdef DEBUG |
937 |
if (debug) GMessage(" extended to: %*s\n",a_r+1,dbg.substr(b_l,b_r-b_l+1).chars()); |
938 |
#endif |
939 |
if (maxscore>=a_min_score && mmovh3<2 && mmovh5<2) { |
940 |
if (a_l<a_ovh3) { |
941 |
//adapter closer to the left end (typical for 5' adapter) |
942 |
l5=a_r+1; |
943 |
l3=alen-1; |
944 |
} |
945 |
else { |
946 |
//adapter matching at the right end (typical for 3' adapter) |
947 |
l5=0; |
948 |
l3=a_l-1; |
949 |
} |
950 |
return true; |
951 |
} |
952 |
else { //keep this segment pair for later (gapped alignment) |
953 |
segs.addSegPair(new CSegPair(a_l+1, a_r+1, b_l+1, b_r+1, maxscore)); |
954 |
//this will also update min & max coordinates in segs (segs.astart, .aend, .bstart, .bend) |
955 |
} |
956 |
//do not trim: |
957 |
l5=0; |
958 |
l3=alen-1; |
959 |
return false; |
960 |
} |
961 |
|
962 |
/* |
963 |
int getWordValue(const char* s, int wlen) { |
964 |
int r=0; |
965 |
while (wlen--) { r+=(((int)s[wlen])<<wlen) } |
966 |
return r; |
967 |
} |
968 |
*/ |
969 |
int get3mer_value(const char* s) { |
970 |
return (s[0]<<16)+(s[1]<<8)+s[2]; |
971 |
} |
972 |
|
973 |
int w3_match(int qv, const char* str, int slen, int start_index=0) { |
974 |
if (start_index>=slen || start_index<0) return -1; |
975 |
for (int i=start_index;i<slen-3;i++) { |
976 |
int rv=get3mer_value(str+i); |
977 |
if (rv==qv) return i; |
978 |
} |
979 |
return -1; |
980 |
} |
981 |
|
982 |
int w3_rmatch(int qv, const char* str, int slen, int end_index=-1) { |
983 |
if (end_index>=slen) return -1; |
984 |
if (end_index<0) end_index=slen-1; |
985 |
for (int i=end_index-2;i>=0;i--) { |
986 |
int rv=get3mer_value(str+i); |
987 |
if (rv==qv) return i; |
988 |
} |
989 |
return -1; |
990 |
} |
991 |
|
992 |
int fast4match(int32 qv, const char* str, int slen, int start_index=0) { |
993 |
if (start_index>=slen || start_index<0) return -1; |
994 |
for (int i=start_index;i<slen-4;i++) { |
995 |
int32* rv=(int32*)(str+i); |
996 |
if (*rv==qv) return i; |
997 |
} |
998 |
return -1; |
999 |
} |
1000 |
|
1001 |
int fast4rmatch(int32 qv, const char* str, int slen, int end_index=-1) { |
1002 |
if (end_index>=slen) return -1; |
1003 |
if (end_index<0) end_index=slen-1; |
1004 |
for (int i=end_index-3;i>=0;i--) { |
1005 |
int32* rv=(int32*)(str+i); |
1006 |
if (*rv==qv) return i; |
1007 |
} |
1008 |
return -1; |
1009 |
} |
1010 |
|
1011 |
#ifdef DEBUG |
1012 |
void dbgPrintChain(CSegChain& chain, const char* aseq) { |
1013 |
GStr s(aseq); |
1014 |
for (int i=0;i<chain.Count();i++) { |
1015 |
CSegPair& seg=*chain[i]; |
1016 |
GMessage(" dbg chain seg%d: %*s [%d-%d:%d-%d]\n",i,seg.a.start-1+seg.len(), |
1017 |
s.substr(seg.b.start-1, seg.len()).chars(), seg.b.start,seg.b.end,seg.a.start,seg.a.end); |
1018 |
} |
1019 |
} |
1020 |
#endif |
1021 |
|
1022 |
bool trim_adapter3(GStr& seq, int&l5, int &l3) { |
1023 |
int rlen=seq.length(); |
1024 |
l5=0; |
1025 |
l3=rlen-1; |
1026 |
//first try a full match, we might get lucky |
1027 |
int fi=-1; |
1028 |
if ((fi=seq.index(adapter3))>=0) { |
1029 |
if (fi<rlen-fi-a3len) {//match is closer to the right end |
1030 |
l5=fi+a3len; |
1031 |
l3=rlen-1; |
1032 |
} |
1033 |
else { |
1034 |
l5=0; |
1035 |
l3=fi-1; |
1036 |
} |
1037 |
return true; |
1038 |
} |
1039 |
#ifdef DEBUG |
1040 |
if (debug) GMessage(">TRIM3 >> Read: %s\n",seq.chars()); |
1041 |
#endif |
1042 |
|
1043 |
//also, for fast detection of other adapter-only reads that start past |
1044 |
// the beginning of the adapter sequence, try to see if the first a3len-4 |
1045 |
// bases of the read are a substring of the adapter |
1046 |
if (rlen>a3len-3) { |
1047 |
GStr rstart=seq.substr(1,a3len-4); |
1048 |
if ((fi=adapter3.index(rstart))>=0) { |
1049 |
l3=rlen-1; |
1050 |
l5=a3len-4; |
1051 |
while (fi+l5<a3len && l5<l3 && adapter3[fi+l5]==seq[l5]) l5++; |
1052 |
return true; |
1053 |
} |
1054 |
} |
1055 |
CSegChain a3segs; //no chains here, just an ordered collection of segment pairs |
1056 |
//check the easy cases - 11 bases exact match at the end |
1057 |
int fdlen=11; |
1058 |
if (a3len<16) { |
1059 |
fdlen=a3len>>1; |
1060 |
} |
1061 |
if (fdlen>4) { |
1062 |
//check if we're lucky enough to have the last 11 bases of the read a part of the adapter |
1063 |
GStr rstart=seq.substr(-fdlen-3,fdlen); |
1064 |
if ((fi=adapter3.index(rstart))>=0) { |
1065 |
#ifdef DEBUG |
1066 |
if (debug) GMessage(" W11match found: %*s\n", rlen-3, (adapter3.substr(fi,fdlen)).chars()); |
1067 |
#endif |
1068 |
if (extendMatch(seq.chars(), rlen, rlen-fdlen-3, |
1069 |
adapter3.chars(), a3len, fi, fdlen, l5,l3, a3segs)) |
1070 |
return true; |
1071 |
} |
1072 |
//another easy case: first 11 characters of the adaptor found as a substring of the read |
1073 |
GStr bstr=adapter3.substr(0, fdlen); |
1074 |
if ((fi=seq.rindex(bstr))>=0) { |
1075 |
#ifdef DEBUG |
1076 |
if (debug) GMessage(" A11match found: %*s\n", fi+fdlen, bstr.chars()); |
1077 |
#endif |
1078 |
if (extendMatch(seq.chars(), rlen, fi, |
1079 |
adapter3.chars(), a3len, 0, fdlen, l5,l3, a3segs)) |
1080 |
return true; |
1081 |
} |
1082 |
} //tried to match 11 bases first |
1083 |
|
1084 |
//no easy cases, so let's do the wmer hashing for the first 12 bases of the adaptor |
1085 |
//-- only extend if the match is in the 3' (ending) region of the read |
1086 |
int wordSize=3; |
1087 |
int hlen=12; |
1088 |
if (hlen>a3len-wordSize) hlen=a3len-wordSize; |
1089 |
int imin=rlen>>1; //last half of the read, left boundary for the wmer match |
1090 |
if (imin<a3len) { imin=GMIN(a3len, rlen-wordSize); } |
1091 |
imin=rlen-imin; |
1092 |
for (int iw=0;iw<hlen;iw++) { |
1093 |
//int32* qv=(int32*)(adapter3.chars()+iw); |
1094 |
int qv=get3mer_value(adapter3.chars()+iw); |
1095 |
fi=-1; |
1096 |
//while ((fi=fast4rmatch(*qv, seq.chars(), rlen, fi))>=0 && fi>=imin) { |
1097 |
while ((fi=w3_rmatch(qv, seq.chars(), rlen, fi))>=0 && fi>=imin) { |
1098 |
//GMessage(" ... fi=%d after w3_rmatch() (imin=%d)\n", fi, imin); |
1099 |
|
1100 |
#ifdef DEBUG |
1101 |
if (debug) GMessage(" Wmatch found: %*s\n", fi+wordSize, (adapter3.substr(iw,wordSize)).chars()); |
1102 |
#endif |
1103 |
if (extendMatch(seq.chars(), rlen, fi, adapter3.chars(), |
1104 |
a3len, iw, wordSize, l5,l3, a3segs)) return true; |
1105 |
fi--; |
1106 |
if (fi<imin) break; |
1107 |
} |
1108 |
} //for each wmer in the first hlen bases of the adaptor |
1109 |
/* |
1110 |
//couldn't find a good trimming extension, hash 12 more bases of the adapter to collect more segment pairs there |
1111 |
//but only do this if we already have segment pairs collected in the last 12 bases of the adapter |
1112 |
if (a3segs.bstart>3 || a3segs.bend<(uint)(hlen-wordSize)) return false; |
1113 |
int hlen2=a3len-wordSize; |
1114 |
//if (hlen2>a3len-4) hlen2=a3len-4; |
1115 |
if (hlen2>hlen) { |
1116 |
#ifdef DEBUG |
1117 |
if (debug && a3segs.Count()>0) { |
1118 |
GMessage(" >>>>>2nd. hash: %s\n",seq.chars()); |
1119 |
} |
1120 |
#endif |
1121 |
for (int iw=hlen;iw<hlen2;iw++) { |
1122 |
//int* qv=(int32 *)(adapter3.chars()+iw); |
1123 |
int qv=get3mer_value(adapter3.chars()+iw); |
1124 |
fi=-1; |
1125 |
//while ((fi=fast4rmatch(*qv, seq.chars(), rlen, fi))>=0 && fi>=imin) { |
1126 |
while ((fi=w3_rmatch(qv, seq.chars(), rlen, fi))>=0 && fi>=imin) { |
1127 |
extendMatch(seq.chars(), rlen, fi, adapter3.chars(), |
1128 |
a3len, iw, wordSize, l5,l3, a3segs); |
1129 |
fi--; |
1130 |
if (fi<imin) break; |
1131 |
} |
1132 |
} //for each wmer between hlen2 and hlen bases of the adaptor |
1133 |
} |
1134 |
//lastly, analyze collected a3segs for a possible gapped alignment: |
1135 |
GList<CSegChain> segchains(false,true,false); |
1136 |
#ifdef DEBUG |
1137 |
if (debug && a3segs.Count()>0) { |
1138 |
GMessage(">>>>>>>>> Read: %s\n",seq.chars()); |
1139 |
} |
1140 |
#endif |
1141 |
for (int i=0;i<a3segs.Count();i++) { |
1142 |
if (a3segs[i]->chain==NULL) { |
1143 |
if (a3segs[i]->b.start>3) continue; //don't start a hopeless chain |
1144 |
CSegChain* newchain=new CSegChain(); |
1145 |
newchain->setFreeItem(false); |
1146 |
newchain->addSegPair(a3segs[i]); |
1147 |
a3segs[i]->chain=newchain; |
1148 |
segchains.Add(newchain); //just to free them when done |
1149 |
} |
1150 |
for (int j=i+1;j<a3segs.Count();j++) { |
1151 |
CSegChain* chain=a3segs[i]->chain; |
1152 |
if (chain->extendChain(a3segs[j])) { |
1153 |
a3segs[j]->chain=chain; |
1154 |
#ifdef DEBUG |
1155 |
if (debug) dbgPrintChain(*chain, adapter3.chars()); |
1156 |
#endif |
1157 |
//save time by checking here if the extended chain is already acceptable for trimming |
1158 |
if (chain->aend>(uint)(rlen-4) && chain->bstart<4 && chain->score>a_min_chain_score) { |
1159 |
l5=0; |
1160 |
l3=chain->astart-2; |
1161 |
#ifdef DEBUG |
1162 |
if (debug && a3segs.Count()>0) { |
1163 |
GMessage(">>> >> trimmed-3: %*s\n",l3-l5+1,seq.substr(l5,l3-l5+1).chars()); |
1164 |
} |
1165 |
#endif |
1166 |
return true; |
1167 |
} |
1168 |
} //chain can be extended |
1169 |
} |
1170 |
} //collect segment alignments into chains |
1171 |
*/ |
1172 |
return false; //no adapter parts found |
1173 |
} |
1174 |
|
1175 |
bool trim_adapter5(GStr& seq, int&l5, int &l3) { |
1176 |
//if (debug) GMessage("trim_adapter5 on: %s\n", seq.chars()); |
1177 |
int rlen=seq.length(); |
1178 |
l5=0; |
1179 |
l3=rlen-1; |
1180 |
//try to see if adapter is fully included in the read |
1181 |
int fi=-1; |
1182 |
if ((fi=seq.index(adapter5))>=0) { |
1183 |
if (fi<rlen-fi-a5len) {//match is closer to the right end |
1184 |
l5=fi+a5len; |
1185 |
l3=rlen-1; |
1186 |
} |
1187 |
else { |
1188 |
l5=0; |
1189 |
l3=fi-1; |
1190 |
} |
1191 |
return true; |
1192 |
} |
1193 |
#ifdef DEBUG |
1194 |
if (debug) GMessage(">TRIM5 >> Read: %s\n",seq.chars()); |
1195 |
#endif |
1196 |
|
1197 |
CSegChain a5segs(true); //list of segment pairs to analyze later if no extendMatch succeeded |
1198 |
|
1199 |
//try the easy way out first - look for an exact match of 11 bases |
1200 |
int fdlen=11; |
1201 |
if (a5len<16) { |
1202 |
fdlen=a5len>>1; |
1203 |
} |
1204 |
if (fdlen>4) { |
1205 |
GStr rstart=seq.substr(1,fdlen); //skip the first base as it's sometimes bogus |
1206 |
if ((fi=adapter5.index(rstart))>=0) { |
1207 |
#ifdef DEBUG |
1208 |
if (debug) GMessage(" W11match found: %*s\n", 1+fdlen, (adapter3.substr(fi,fdlen)).chars()); |
1209 |
#endif |
1210 |
if (extendMatch(seq.chars(), rlen, 1, |
1211 |
adapter5.chars(), a5len, fi, fdlen, l5,l3, a5segs, true)) |
1212 |
return true; |
1213 |
} |
1214 |
//another easy case: last 11 characters of the adaptor found as a substring of the read |
1215 |
GStr bstr=adapter5.substr(-fdlen); |
1216 |
if ((fi=seq.index(bstr))>=0) { |
1217 |
#ifdef DEBUG |
1218 |
if (debug) GMessage(" A11match found: %*s\n", fi+fdlen, bstr.chars()); |
1219 |
#endif |
1220 |
if (extendMatch(seq.chars(), rlen, fi, |
1221 |
adapter5.chars(), a5len, a5len-fdlen, fdlen, l5,l3,a5segs,true)) |
1222 |
return true; |
1223 |
} |
1224 |
} //tried to matching at most 11 bases first |
1225 |
|
1226 |
//-- no easy cases, do the wmer hashing for the last 12 bases of the adaptor |
1227 |
//-- only extend a wmer if it matches in the 5' (beginning) region of the read |
1228 |
int wordSize=3; |
1229 |
int hlen=12; |
1230 |
if (hlen>a5len-wordSize) hlen=a5len-wordSize; |
1231 |
int imax=rlen>>1; //first half of the read, right boundary for the wmer match |
1232 |
if (imax<a5len) { imax=GMIN(a5len, rlen-wordSize); } |
1233 |
for (int iw=0;iw<=hlen;iw++) { |
1234 |
int apstart=a5len-iw-wordSize; |
1235 |
fi=0; |
1236 |
//int* qv=(int32 *)(adapter5.chars()+apstart); |
1237 |
int qv=get3mer_value(adapter5.chars()+apstart); |
1238 |
//while ((fi=fast4match(*qv, seq.chars(), rlen, fi))>=0 && fi<=imax) { |
1239 |
while ((fi=w3_match(qv, seq.chars(), rlen, fi))>=0 && fi<=imax) { |
1240 |
#ifdef DEBUG |
1241 |
if (debug) GMessage(" Wmatch found: %*s\n", fi+wordSize, (adapter5.substr(apstart,wordSize)).chars()); |
1242 |
#endif |
1243 |
if (extendMatch(seq.chars(), rlen, fi, adapter5.chars(), |
1244 |
a5len, apstart, wordSize, l5,l3, a5segs, true)) return true; |
1245 |
fi++; |
1246 |
if (fi>imax) break; |
1247 |
} |
1248 |
} //for each wmer in the last hlen bases of the adaptor |
1249 |
/* |
1250 |
|
1251 |
//couldn't find a good trimming extension, hash 12 more bases of the adapter to collect more segment pairs there |
1252 |
//but only do this if we already have segment pairs collected in the last 12 bases of the adapter |
1253 |
if (a5segs.bend<(uint)(a5len-3) || a5segs.bstart>(uint)(a5len-hlen+4)) return false; |
1254 |
int hlen2=a5len-wordSize; |
1255 |
//if (hlen2>a5len-wordSize) hlen2=a5len-wordSize; |
1256 |
#ifdef DEBUG |
1257 |
if (debug && a5segs.Count()>0) { |
1258 |
GMessage(" >>>>>2nd. hash: %s\n",seq.chars()); |
1259 |
} |
1260 |
#endif |
1261 |
if (hlen2>hlen) { |
1262 |
for (int iw=hlen+1;iw<=hlen2;iw++) { |
1263 |
int apstart=a5len-iw-wordSize; |
1264 |
fi=0; |
1265 |
//int* qv=(int32 *)(adapter5.chars()+apstart); |
1266 |
int qv=get3mer_value(adapter5.chars()+apstart); |
1267 |
//while ((fi=fast4match(*qv, seq.chars(), rlen, fi))>=0 && fi<=imax) { |
1268 |
while ((fi=w3_match(qv, seq.chars(), rlen, fi))>=0 && fi<=imax) { |
1269 |
extendMatch(seq.chars(), rlen, fi, adapter5.chars(), |
1270 |
a5len, apstart, wordSize, l5,l3, a5segs, true); |
1271 |
fi++; |
1272 |
if (fi>imax) break; |
1273 |
} |
1274 |
} //for each wmer between hlen2 and hlen bases of the adaptor |
1275 |
} |
1276 |
if (a5segs.bend<(uint)(a5len-3) || a5segs.bstart>(uint)(a5len-hlen+4)) return false; |
1277 |
// lastly, analyze collected a5segs for a possible gapped alignment: |
1278 |
GList<CSegChain> segchains(false,true,false); |
1279 |
#ifdef DEBUG |
1280 |
if (debug && a5segs.Count()>0) { |
1281 |
GMessage(">>>>>>>>> Read: %s\n",seq.chars()); |
1282 |
} |
1283 |
#endif |
1284 |
for (int i=0;i<a5segs.Count();i++) { |
1285 |
if (a5segs[i]->chain==NULL) { |
1286 |
if (a5segs[i]->b.end<(int)(a5len-4)) continue; //don't start a hopeless chain |
1287 |
CSegChain* newchain=new CSegChain(true); |
1288 |
newchain->setFreeItem(false); |
1289 |
newchain->addSegPair(a5segs[i]); |
1290 |
a5segs[i]->chain=newchain; |
1291 |
segchains.Add(newchain); //just to free them when done |
1292 |
} |
1293 |
for (int j=i+1;j<a5segs.Count();j++) { |
1294 |
CSegChain* chain=a5segs[i]->chain; |
1295 |
if (chain->extendChain(a5segs[j])) { |
1296 |
a5segs[j]->chain=chain; |
1297 |
#ifdef DEBUG |
1298 |
if (debug) dbgPrintChain(*chain, adapter5.chars()); |
1299 |
#endif |
1300 |
//save time by checking here if the extended chain is already acceptable for trimming |
1301 |
if (chain->bend>(uint)(a5len-3) && chain->astart<4 && chain->score>a_min_chain_score) { |
1302 |
l5=chain->aend; |
1303 |
l3=rlen-1; |
1304 |
return true; |
1305 |
} |
1306 |
} //chain can be extended |
1307 |
} |
1308 |
} //collect segment alignments into chains |
1309 |
*/ |
1310 |
return false; //no adapter parts found |
1311 |
} |
1312 |
|
1313 |
//convert qvs to/from phred64 from/to phread33 |
1314 |
void convertPhred(GStr& q) { |
1315 |
for (int i=0;i<q.length();i++) q[i]+=qv_cvtadd; |
1316 |
} |
1317 |
|
1318 |
void convertPhred(char* q, int len) { |
1319 |
for (int i=0;i<len;i++) q[i]+=qv_cvtadd; |
1320 |
} |
1321 |
|
1322 |
bool getFastxRec(GLineReader& fq, GStr& rseq, GStr& rqv, |
1323 |
GStr& rname, GStr& rinfo, GStr& infname) { |
1324 |
rseq=""; |
1325 |
rqv=""; |
1326 |
rname=""; |
1327 |
rinfo=""; |
1328 |
if (fq.eof()) return false; |
1329 |
char* l=fq.getLine(); |
1330 |
while (l!=NULL && (l[0]==0 || isspace(l[0]))) l=fq.getLine(); //ignore empty lines |
1331 |
if (l==NULL) return false; |
1332 |
/* if (rawFormat) { |
1333 |
//TODO: implement raw qseq parsing here |
1334 |
//if (raw type=N) then continue; //skip invalid/bad records |
1335 |
} //raw qseq format |
1336 |
else { // FASTQ or FASTA */ |
1337 |
isfasta=(l[0]=='>'); |
1338 |
if (!isfasta && l[0]!='@') GError("Error: fasta/fastq record marker not found(%s)\n%s\n", |
1339 |
infname.chars(), l); |
1340 |
GStr s(l); |
1341 |
rname=&(l[1]); |
1342 |
for (int i=0;i<rname.length();i++) |
1343 |
if (rname[i]<=' ') { |
1344 |
if (i<rname.length()-2) rinfo=rname.substr(i+1); |
1345 |
rname.cut(i); |
1346 |
break; |
1347 |
} |
1348 |
//now get the sequence |
1349 |
if ((l=fq.getLine())==NULL) |
1350 |
GError("Error: unexpected EOF after header for read %s (%s)\n", |
1351 |
rname.chars(), infname.chars()); |
1352 |
rseq=l; //this must be the DNA line |
1353 |
while ((l=fq.getLine())!=NULL) { |
1354 |
//seq can span multiple lines |
1355 |
if (l[0]=='>' || l[0]=='+') { |
1356 |
fq.pushBack(); |
1357 |
break; // |
1358 |
} |
1359 |
rseq+=l; |
1360 |
} //check for multi-line seq |
1361 |
if (!isfasta) { //reading fastq quality values, which can also be multi-line |
1362 |
if ((l=fq.getLine())==NULL) |
1363 |
GError("Error: unexpected EOF after sequence for %s\n", rname.chars()); |
1364 |
if (l[0]!='+') GError("Error: fastq qv header marker not detected!\n"); |
1365 |
if ((l=fq.getLine())==NULL) |
1366 |
GError("Error: unexpected EOF after qv header for %s\n", rname.chars()); |
1367 |
rqv=l; |
1368 |
//if (rqv.length()!=rseq.length()) |
1369 |
// GError("Error: qv len != seq len for %s\n", rname.chars()); |
1370 |
while (rqv.length()<rseq.length() && ((l=fq.getLine())!=NULL)) { |
1371 |
rqv+=l; //append to qv string |
1372 |
} |
1373 |
}// fastq |
1374 |
// } //<-- FASTA or FASTQ |
1375 |
rseq.upper(); //TODO: what if we care about masking? |
1376 |
return true; |
1377 |
} |
1378 |
|
1379 |
char process_read(GStr& rname, GStr& rseq, GStr& rqv, int &l5, int &l3) { |
1380 |
//returns 0 if the read was untouched, 1 if it was just trimmed |
1381 |
// and a trash code if it was trashed |
1382 |
l5=0; |
1383 |
l3=rseq.length()-1; |
1384 |
if (l3-l5+1<min_read_len) { |
1385 |
return 's'; |
1386 |
} |
1387 |
GStr wseq(rseq.chars()); |
1388 |
GStr wqv(rqv.chars()); |
1389 |
int w5=l5; |
1390 |
int w3=l3; |
1391 |
//first do the q-based trimming |
1392 |
if (qvtrim_qmin!=0 && !wqv.is_empty() && qtrim(wqv, w5, w3)) { // qv-threshold trimming |
1393 |
if (w3-w5+1<min_read_len) { |
1394 |
return 'Q'; //invalid read |
1395 |
} |
1396 |
//-- keep only the w5..w3 range |
1397 |
l5=w5; |
1398 |
l3=w3; |
1399 |
wseq=wseq.substr(w5, w3-w5+1); |
1400 |
if (!wqv.is_empty()) |
1401 |
wqv=wqv.substr(w5, w3-w5+1); |
1402 |
} //qv trimming |
1403 |
// N-trimming on the remaining read seq |
1404 |
if (ntrim(wseq, w5, w3)) { |
1405 |
//GMessage("before: %s\n",wseq.chars()); |
1406 |
//GMessage("after : %s (%d)\n",wseq.substr(w5,w3-w5+1).chars(),w3-w5+1); |
1407 |
l5+=w5; |
1408 |
l3-=(wseq.length()-1-w3); |
1409 |
if (w3-w5+1<min_read_len) { |
1410 |
return 'N'; //to be trashed |
1411 |
} |
1412 |
//-- keep only the w5..w3 range |
1413 |
wseq=wseq.substr(w5, w3-w5+1); |
1414 |
if (!wqv.is_empty()) |
1415 |
wqv=wqv.substr(w5, w3-w5+1); |
1416 |
w5=0; |
1417 |
w3=wseq.length()-1; |
1418 |
} |
1419 |
if (a3len>0) { |
1420 |
if (trim_adapter3(wseq, w5, w3)) { |
1421 |
int trimlen=wseq.length()-(w3-w5+1); |
1422 |
num_trimmed3++; |
1423 |
if (trimlen<min_trimmed3) |
1424 |
min_trimmed3=trimlen; |
1425 |
l5+=w5; |
1426 |
l3-=(wseq.length()-1-w3); |
1427 |
if (w3-w5+1<min_read_len) { |
1428 |
return '3'; |
1429 |
} |
1430 |
//-- keep only the w5..w3 range |
1431 |
wseq=wseq.substr(w5, w3-w5+1); |
1432 |
if (!wqv.is_empty()) |
1433 |
wqv=wqv.substr(w5, w3-w5+1); |
1434 |
}//some adapter was trimmed |
1435 |
} //adapter trimming |
1436 |
if (a5len>0) { |
1437 |
if (trim_adapter5(wseq, w5, w3)) { |
1438 |
int trimlen=wseq.length()-(w3-w5+1); |
1439 |
num_trimmed5++; |
1440 |
if (trimlen<min_trimmed5) |
1441 |
min_trimmed5=trimlen; |
1442 |
l5+=w5; |
1443 |
l3-=(wseq.length()-1-w3); |
1444 |
if (w3-w5+1<min_read_len) { |
1445 |
return '5'; |
1446 |
} |
1447 |
//-- keep only the w5..w3 range |
1448 |
wseq=wseq.substr(w5, w3-w5+1); |
1449 |
if (!wqv.is_empty()) |
1450 |
wqv=wqv.substr(w5, w3-w5+1); |
1451 |
}//some adapter was trimmed |
1452 |
} //adapter trimming |
1453 |
if (doCollapse) { |
1454 |
//keep read for later |
1455 |
FqDupRec* dr=dhash.Find(wseq.chars()); |
1456 |
if (dr==NULL) { //new entry |
1457 |
//if (prefix.is_empty()) |
1458 |
dhash.Add(wseq.chars(), |
1459 |
new FqDupRec(&wqv, rname.chars())); |
1460 |
//else dhash.Add(wseq.chars(), new FqDupRec(wqv.chars(),wqv.length())); |
1461 |
} |
1462 |
else |
1463 |
dr->add(wqv); |
1464 |
} //collapsing duplicates |
1465 |
else { //not collapsing duplicates |
1466 |
//apply the dust filter now |
1467 |
if (doDust) { |
1468 |
int dustbases=dust(wseq); |
1469 |
if (dustbases>(wseq.length()>>1)) { |
1470 |
return 'D'; |
1471 |
} |
1472 |
} |
1473 |
} //not collapsing duplicates |
1474 |
return (l5>0 || l3<rseq.length()-1) ? 1 : 0; |
1475 |
} |
1476 |
|
1477 |
void printHeader(FILE* f_out, char recmarker, GStr& rname, GStr& rinfo) { |
1478 |
//GMessage("printing Header..%c%s\n",recmarker, rname.chars()); |
1479 |
if (rinfo.is_empty()) fprintf(f_out, "%c%s\n",recmarker,rname.chars()); |
1480 |
else fprintf(f_out, "%c%s %s\n",recmarker, rname.chars(), rinfo.chars()); |
1481 |
} |
1482 |
|
1483 |
void writeRead(FILE* f_out, GStr& rname, GStr& rinfo, GStr& rseq, GStr& rqv, int& outcounter) { |
1484 |
outcounter++; |
1485 |
bool asFasta=(rqv.is_empty() || fastaOutput); |
1486 |
if (asFasta) { |
1487 |
if (prefix.is_empty()) { |
1488 |
printHeader(f_out, '>',rname,rinfo); |
1489 |
fprintf(f_out, "%s\n", rseq.chars()); //plain one-line fasta for now |
1490 |
} |
1491 |
else { |
1492 |
fprintf(f_out, ">%s%08d\n%s\n", prefix.chars(), outcounter, |
1493 |
rseq.chars()); |
1494 |
} |
1495 |
} |
1496 |
else { //fastq |
1497 |
if (convert_phred) convertPhred(rqv); |
1498 |
if (prefix.is_empty()) { |
1499 |
printHeader(f_out, '@', rname, rinfo); |
1500 |
fprintf(f_out, "%s\n+\n%s\n", rseq.chars(), rqv.chars()); |
1501 |
} |
1502 |
else |
1503 |
fprintf(f_out, "@%s_%08d\n%s\n+\n%s\n", prefix.chars(), outcounter, |
1504 |
rseq.chars(),rqv.chars() ); |
1505 |
} |
1506 |
} |
1507 |
|
1508 |
void trash_report(char trashcode, GStr& rname, FILE* freport) { |
1509 |
if (freport==NULL || trashcode<=' ') return; |
1510 |
if (trashcode=='3' || trashcode=='5') { |
1511 |
fprintf(freport, "%s\tA%c\n",rname.chars(),trashcode); |
1512 |
} |
1513 |
else { |
1514 |
fprintf(freport, "%s\t%c\n",rname.chars(),trashcode); |
1515 |
} |
1516 |
//tcounter++; |
1517 |
} |
1518 |
|
1519 |
GStr getFext(GStr& s, int* xpos=NULL) { |
1520 |
//s must be a filename without a path |
1521 |
GStr r(""); |
1522 |
if (xpos!=NULL) *xpos=0; |
1523 |
if (s.is_empty() || s=="-") return r; |
1524 |
int p=s.rindex('.'); |
1525 |
int d=s.rindex('/'); |
1526 |
if (p<=0 || p>s.length()-2 || p<s.length()-7 || p<d) return r; |
1527 |
r=s.substr(p+1); |
1528 |
if (xpos!=NULL) *xpos=p+1; |
1529 |
r.lower(); |
1530 |
return r; |
1531 |
} |
1532 |
|
1533 |
void baseFileName(GStr& fname) { |
1534 |
//remove all known extensions, like .txt,fq,fastq,fasta,fa)(.gz .gzip .bz2 .bzip2) . |
1535 |
int xpos=0; |
1536 |
GStr fext=getFext(fname, &xpos); |
1537 |
if (xpos<=1) return; |
1538 |
bool extdel=false; |
1539 |
GStr f2; |
1540 |
if (fext=="z" || fext=="zip") { |
1541 |
extdel=true; |
1542 |
} |
1543 |
else if (fext.length()>=2) { |
1544 |
f2=fext.substr(0,2); |
1545 |
extdel=(f2=="gz" || f2=="bz"); |
1546 |
} |
1547 |
if (extdel) { |
1548 |
fname.cut(xpos-1); |
1549 |
fext=getFext(fname, &xpos); |
1550 |
if (xpos<=1) return; |
1551 |
} |
1552 |
extdel=false; |
1553 |
if (fext=="f" || fext=="fq" || fext=="txt" || fext=="seq" || fext=="sequence") { |
1554 |
extdel=true; |
1555 |
} |
1556 |
else if (fext.length()>=2) { |
1557 |
extdel=(fext.substr(0,2)=="fa"); |
1558 |
} |
1559 |
if (extdel) fname.cut(xpos-1); |
1560 |
GStr fncp(fname); |
1561 |
fncp.lower(); |
1562 |
fncp.chomp("seq"); |
1563 |
fncp.chomp("sequence"); |
1564 |
fncp.trimR("_."); |
1565 |
if (fncp.length()<fname.length()) fname.cut(fncp.length()); |
1566 |
} |
1567 |
|
1568 |
FILE* prepOutFile(GStr& infname, GStr& pocmd) { |
1569 |
FILE* f_out=NULL; |
1570 |
GStr fname(getFileName(infname.chars())); |
1571 |
//eliminate known extensions |
1572 |
baseFileName(fname); |
1573 |
if (outsuffix.is_empty() || outsuffix=="-") { return stdout; } |
1574 |
else if (pocmd.is_empty()) { |
1575 |
GStr oname(fname); |
1576 |
oname.append('.'); |
1577 |
oname.append(outsuffix); |
1578 |
f_out=fopen(oname.chars(),"w"); |
1579 |
if (f_out==NULL) GError("Error: cannot create '%s'\n",oname.chars()); |
1580 |
} |
1581 |
else { |
1582 |
GStr oname(">"); |
1583 |
oname.append(fname); |
1584 |
oname.append('.'); |
1585 |
oname.append(outsuffix); |
1586 |
pocmd.append(oname); |
1587 |
f_out=popen(pocmd.chars(), "w"); |
1588 |
if (f_out==NULL) GError("Error: cannot popen '%s'\n",pocmd.chars()); |
1589 |
} |
1590 |
return f_out; |
1591 |
} |
1592 |
|
1593 |
void guess_unzip(GStr& fname, GStr& picmd) { |
1594 |
GStr fext=getFext(fname); |
1595 |
if (fext=="gz" || fext=="gzip" || fext=="z") { |
1596 |
picmd="gzip -cd "; |
1597 |
} |
1598 |
else if (fext=="bz2" || fext=="bzip2" || fext=="bz" || fext=="bzip") { |
1599 |
picmd="bzip2 -cd "; |
1600 |
} |
1601 |
} |
1602 |
|
1603 |
void setupFiles(FILE*& f_in, FILE*& f_in2, FILE*& f_out, FILE*& f_out2, |
1604 |
GStr& s, GStr& infname, GStr& infname2) { |
1605 |
// uses outsuffix to generate output file names and open file handles as needed |
1606 |
infname=""; |
1607 |
infname2=""; |
1608 |
f_in=NULL; |
1609 |
f_in2=NULL; |
1610 |
f_out=NULL; |
1611 |
f_out2=NULL; |
1612 |
//analyze outsuffix intent |
1613 |
GStr pocmd; |
1614 |
GStr ox=getFext(outsuffix); |
1615 |
if (ox.length()>2) ox=ox.substr(0,2); |
1616 |
if (ox=="gz") pocmd="gzip -9 -c "; |
1617 |
else if (ox=="bz") pocmd="bzip2 -9 -c "; |
1618 |
if (s=="-") { |
1619 |
f_in=stdin; |
1620 |
infname="stdin"; |
1621 |
f_out=prepOutFile(infname, pocmd); |
1622 |
return; |
1623 |
} // streaming from stdin |
1624 |
s.startTokenize(",:"); |
1625 |
s.nextToken(infname); |
1626 |
bool paired=s.nextToken(infname2); |
1627 |
if (fileExists(infname.chars())==0) |
1628 |
GError("Error: cannot find file %s!\n",infname.chars()); |
1629 |
GStr fname(getFileName(infname.chars())); |
1630 |
GStr picmd; |
1631 |
guess_unzip(fname, picmd); |
1632 |
if (picmd.is_empty()) { |
1633 |
f_in=fopen(infname.chars(), "r"); |
1634 |
if (f_in==NULL) GError("Error opening file '%s'!\n",infname.chars()); |
1635 |
} |
1636 |
else { |
1637 |
picmd.append(infname); |
1638 |
f_in=popen(picmd.chars(), "r"); |
1639 |
if (f_in==NULL) GError("Error at popen %s!\n", picmd.chars()); |
1640 |
} |
1641 |
f_out=prepOutFile(infname, pocmd); |
1642 |
if (!paired) return; |
1643 |
if (doCollapse) GError("Error: sorry, -C option cannot be used with paired reads!\n"); |
1644 |
// ---- paired reads:------------- |
1645 |
if (fileExists(infname2.chars())==0) |
1646 |
GError("Error: cannot find file %s!\n",infname2.chars()); |
1647 |
picmd=""; |
1648 |
GStr fname2(getFileName(infname2.chars())); |
1649 |
guess_unzip(fname2, picmd); |
1650 |
if (picmd.is_empty()) { |
1651 |
f_in2=fopen(infname2.chars(), "r"); |
1652 |
if (f_in2==NULL) GError("Error opening file '%s'!\n",infname2.chars()); |
1653 |
} |
1654 |
else { |
1655 |
picmd.append(infname2); |
1656 |
f_in2=popen(picmd.chars(), "r"); |
1657 |
if (f_in2==NULL) GError("Error at popen %s!\n", picmd.chars()); |
1658 |
} |
1659 |
f_out2=prepOutFile(infname2, pocmd); |
1660 |
} |