ViewVC Help
View File | Revision Log | Show Annotations | View Changeset | Root Listing
root/gclib/fqtrim/fqtrim.cpp
Revision: 187
Committed: Fri Feb 17 05:42:21 2012 UTC (7 years, 6 months ago) by gpertea
File size: 48278 byte(s)
Log Message:
fqtrim

Line File contents
1 #include "GArgs.h"
2 #include "GStr.h"
3 #include "GHash.hh"
4 #include "GList.hh"
5 #include <ctype.h>
6 #include "GAlnExtend.h"
7 #include <inttypes.h>
8
9 #define USAGE "Usage:\n\
10 fqtrim [{-5 <5adaptor> -3 <3adaptor>|-f <adaptors_file>}] [-a <min_matchlen>]\\\n\
11 [-R] [-q <minq> [-t <trim_max_len>]] [-p {64|33}] [-o <outsuffix>]\\\n\
12 [-l <minlen>] [-C] [-D] [-Q] [-n <rename_prefix>] [-r <discarded.lst>]\\\n\
13 <input.fq>[,<input_mates.fq>\n\
14 \n\
15 Trim low quality bases at the 3' end and can trim adaptor sequence(s), filter\n\
16 for low complexity and collapse duplicate reads.\n\
17 If read pairs should be trimmed and kept together (i.e. without discarding\n\
18 one read in a pair), the two file names should be given delimited by a comma\n\
19 or a colon character\n\
20 \n\
21 Options:\n\
22 -n rename all the reads using the <prefix> followed by a read counter;\n\
23 if -C option was given, the suffix \"_x<N>\" is appended, with <N> being\n\
24 the read duplication count\n\
25 -o unless this parameter is '-', write the trimmed/filtered reads to \n\
26 file(s) named <input>.<outsuffix> which will be created in the \n\
27 current (working) directory; (writes to stdout if -o- is given);\n\
28 a suffix ending with .gz, .gzip or .bz2 will enforce compression\n\
29 -f file with adaptor sequences to trim, each line having this format:\n\
30 <5'-adaptor-sequence> <3'-adaptor-sequence>\n\
31 -5 trim the given adaptor or primer sequence at the 5' end of each read\n\
32 (e.g. -5 CGACAGGTTCAGAGTTCTACAGTCCGACGATC)\n\
33 -3 trim the given adaptor sequence at the 3' end of each read\n\
34 (e.g. -3 TCGTATGCCGTCTTCTGCTTG)\n\
35 -A disable polyA/T trimming (enabled by default)\n\
36 -y minimum length of exact match to adaptor sequence at the proper end (6)\n\
37 -q trim bases with quality value lower than <minq> (starting at the 3' end)\n\
38 -t for -q option, maximum trimming at the 3' end is limited to <trim_max_len>\n\
39 -m maximum percentage of Ns allowed in a read after trimming (default 7)\n\
40 -l minimum \"clean\" length after trimming that a read must have\n\
41 in order to pass the filter (default: 16)\n\
42 -r write a simple \"trash report\" file listing the discarded reads with a\n\
43 one letter code for the reason why they were trashed\n\
44 -D apply a low-complexity (dust) filter and discard any read that has over\n\
45 50%% of its length detected as low complexity\n\
46 -C collapse duplicate reads and add a prefix with their count to the read\n\
47 name (e.g. for microRNAs)\n\
48 -p input is phred64/phred33 (use -p64 or -p33)\n\
49 -Q convert quality values to the other Phred qv type\n\
50 -V be verbose when done (print summary counts)\n\
51 "
52
53 //-z for -o option, the output stream(s) will be first piped into the given\n
54 // <zcmd> command, which must output to stdout (e.g. -z 'bzip2 -9 -c')\n
55
56
57 // example 3' adaptor for miRNAs: TCGTATGCCGTCTTCTGCTTG
58
59 //For paired reads sequencing:
60 //3' : ACACTCTTTCCCTACACGACGCTCTTCCGATCT
61 //5' : GATCGGAAGAGCGGTTCAGCAGGAATGCCGAG
62 //FILE* f_out=NULL; //stdout if not provided
63 //FILE* f_out2=NULL; //for paired reads
64 //FILE* f_in=NULL; //input fastq (stdin if not provided)
65 //FILE* f_in2=NULL; //for paired reads
66
67 FILE* freport=NULL;
68
69 bool debug=false;
70 bool verbose=false;
71 bool doCollapse=false;
72 bool doDust=false;
73 bool doPolyTrim=true;
74 bool fastaOutput=false;
75 bool trashReport=false;
76 bool revCompl=false; //also reverse complement adaptor sequences
77 int min_read_len=16;
78 double max_perc_N=7.0;
79 int dust_cutoff=16;
80 bool isfasta=false;
81 bool convert_phred=false;
82 GStr outsuffix; // -o
83 GStr prefix;
84 GStr zcmd;
85 char isACGT[256];
86
87 uint num_trimN=0; //reads trimmed by N%
88 uint num_trimQ=0; //reads trimmed by qv threshold
89 uint num_trimV=0; //reads trimmed by adapter match
90 uint num_trimA=0; //reads trimmed by polyA
91 uint num_trimT=0; //reads trimmed by polyT
92 uint num_trim5=0; //number of reads trimmed at 5' end
93 uint num_trim3=0; //number of reads trimmed at 3' end
94
95 uint64 b_totalIn=0; //total number of input bases
96 uint64 b_totalN=0; //total number of undetermined bases found in input bases
97
98 uint64 b_trimN=0; //total number of bases trimmed due to N-trim
99 uint64 b_trimQ=0; //number of bases trimmed due to qv threshold
100 uint64 b_trimV=0; //total number of bases trimmed due to adaptor matches
101 uint64 b_trimA=0; //number of bases trimmed due to poly-A tails
102 uint64 b_trimT=0; //number of bases trimmed due to poly-T tails
103 uint64 b_trim5=0; //total bases trimmed on the 5' side
104 uint64 b_trim3=0; //total bases trimmed on the 3' side
105 //int min_trimmed5=INT_MAX;
106 //int min_trimmed3=INT_MAX;
107
108 int qvtrim_qmin=0;
109 int qvtrim_max=0; //for -q, do not trim at 3'-end more than this number of bases
110 int qv_phredtype=0; // could be 64 or 33 (0 means undetermined yet)
111 int qv_cvtadd=0; //could be -31 or +31
112
113 // adaptor matching metrics -- for X-drop ungapped extension
114 //const int match_reward=2;
115 //const int mismatch_penalty=3;
116 const int match_reward=2;
117 const int mismatch_penalty=4;
118 const int Xdrop=10;
119
120 const int poly_m_score=2; //match score
121 const int poly_mis_score=-3; //mismatch
122 const int poly_dropoff_score=7;
123 int poly_minScore=12; //i.e. an exact match of 6 bases at the proper ends WILL be trimmed
124
125 const char *polyA_seed="AAAA";
126 const char *polyT_seed="TTTT";
127
128 struct CASeqData {
129 //positional data for every possible hexamer in an adaptor
130 GVec<uint16>* pz[4096]; //0-based coordinates of all possible hexamers in the adaptor sequence
131 GVec<uint16>* pzr[4096]; //0-based coordinates of all possible hexamers for the reverse complement of the adaptor sequence
132 GStr seq; //actual adaptor sequence data
133 GStr seqr; //reverse complement sequence
134 int amlen; //fraction of adaptor length matching that's
135 //enough to consider the alignment
136 GAlnTrimType trim_type;
137 bool use_reverse;
138 CASeqData(bool rev=false):seq(),seqr(),
139 amlen(0), use_reverse(rev) {
140 trim_type=galn_None; //should be updated later!
141 for (int i=0;i<4096;i++) {
142 pz[i]=NULL;
143 pzr[i]=NULL;
144 }
145 }
146
147 void update(const char* s) {
148 seq=s;
149 table6mers(seq.chars(), seq.length(), pz);
150 amlen=iround(double(seq.length())*0.8);
151 if (amlen<12)
152 amlen=12;
153 if (!use_reverse) return;
154 //reverse complement
155 seqr=s;
156 int slen=seq.length();
157 for (int i=0;i<slen;i++)
158 seqr[i]=ntComplement(seq[slen-i-1]);
159 table6mers(seqr.chars(), seqr.length(), pzr);
160 }
161
162 ~CASeqData() {
163 for (int i=0;i<4096;i++) {
164 delete pz[i];
165 delete pzr[i];
166 }
167 }
168 };
169
170 GVec<CASeqData> adaptors5;
171 GVec<CASeqData> adaptors3;
172
173 CGreedyAlignData* gxmem_l=NULL;
174 CGreedyAlignData* gxmem_r=NULL;
175
176 // element in dhash:
177 class FqDupRec {
178 public:
179 int count; //how many of these reads are the same
180 int len; //length of qv
181 char* firstname; //optional, only if we want to keep the original read names
182 char* qv;
183 FqDupRec(GStr* qstr=NULL, const char* rname=NULL) {
184 len=0;
185 qv=NULL;
186 firstname=NULL;
187 count=0;
188 if (qstr!=NULL) {
189 qv=Gstrdup(qstr->chars());
190 len=qstr->length();
191 count++;
192 }
193 if (rname!=NULL) firstname=Gstrdup(rname);
194 }
195 ~FqDupRec() {
196 GFREE(qv);
197 GFREE(firstname);
198 }
199 void add(GStr& d) { //collapse another record into this one
200 if (d.length()!=len)
201 GError("Error at FqDupRec::add(): cannot collapse reads with different length!\n");
202 count++;
203 for (int i=0;i<len;i++)
204 qv[i]=(qv[i]+d[i])/2;
205 }
206 };
207
208 void openfw(FILE* &f, GArgs& args, char opt) {
209 GStr s=args.getOpt(opt);
210 if (!s.is_empty()) {
211 if (s=='-') f=stdout;
212 else {
213 f=fopen(s.chars(),"w");
214 if (f==NULL) GError("Error creating file: %s\n", s.chars());
215 }
216 }
217 }
218
219 #define FWCLOSE(fh) if (fh!=NULL && fh!=stdout) fclose(fh)
220 #define FRCLOSE(fh) if (fh!=NULL && fh!=stdin) fclose(fh)
221
222 GHash<FqDupRec> dhash; //hash to keep track of duplicates
223
224 void addAdaptor(GVec<CASeqData>& adaptors, GStr& seq, GAlnTrimType trim_type);
225 int loadAdaptors(const char* fname);
226
227 void setupFiles(FILE*& f_in, FILE*& f_in2, FILE*& f_out, FILE*& f_out2,
228 GStr& s, GStr& infname, GStr& infname2);
229 // uses outsuffix to generate output file names and open file handles as needed
230
231 void writeRead(FILE* f_out, GStr& rname, GStr& rinfo, GStr& rseq, GStr& rqv, int& outcounter);
232 void trash_report(char trashcode, GStr& rname, FILE* freport);
233
234 bool getFastxRec(GLineReader& fq, GStr& rseq, GStr& rqv,
235 GStr& rname, GStr& rinfo, GStr& infname);
236
237 char process_read(GStr& rname, GStr& rseq, GStr& rqv, int &l5, int &l3);
238 //returns 0 if the read was untouched, 1 if it was trimmed and a trash code if it was trashed
239
240 bool ntrim(GStr& rseq, int &l5, int &l3); //returns true if any trimming occured
241 bool qtrim(GStr& qvs, int &l5, int &l3); //return true if any trimming occured
242 int dust(GStr& seq);
243 bool trim_poly5(GStr &seq, int &l5, int &l3, const char* poly_seed); //returns true if any trimming occured
244 bool trim_poly3(GStr &seq, int &l5, int &l3, const char* poly_seed);
245 bool trim_adaptor5(GStr& seq, int &l5, int &l3); //returns true if any trimming occured
246 bool trim_adaptor3(GStr& seq, int &l5, int &l3);
247
248 void convertPhred(char* q, int len);
249 void convertPhred(GStr& q);
250
251 int main(int argc, char * const argv[]) {
252 GArgs args(argc, argv, "YQDCRVAl:d:3:5:m:n:r:p:q:f:t:o:z:a:");
253 int e;
254 if ((e=args.isError())>0) {
255 GMessage("%s\nInvalid argument: %s\n", USAGE, argv[e]);
256 exit(224);
257 }
258 debug=(args.getOpt('Y')!=NULL);
259 verbose=(args.getOpt('V')!=NULL);
260 convert_phred=(args.getOpt('Q')!=NULL);
261 doCollapse=(args.getOpt('C')!=NULL);
262 doDust=(args.getOpt('D')!=NULL);
263 revCompl=(args.getOpt('R')!=NULL);
264 if (args.getOpt('A')) doPolyTrim=false;
265 /*
266 rawFormat=(args.getOpt('R')!=NULL);
267 if (rawFormat) {
268 GError("Sorry, raw qseq format parsing is not implemented yet!\n");
269 }
270 */
271 prefix=args.getOpt('n');
272 GStr s=args.getOpt('l');
273 if (!s.is_empty())
274 min_read_len=s.asInt();
275 s=args.getOpt('m');
276 if (!s.is_empty())
277 max_perc_N=s.asDouble();
278 s=args.getOpt('d');
279 if (!s.is_empty()) {
280 dust_cutoff=s.asInt();
281 doDust=true;
282 }
283 s=args.getOpt('q');
284 if (!s.is_empty()) {
285 qvtrim_qmin=s.asInt();
286 }
287 s=args.getOpt('t');
288 if (!s.is_empty()) {
289 qvtrim_max=s.asInt();
290 }
291 s=args.getOpt('p');
292 if (!s.is_empty()) {
293 int v=s.asInt();
294 if (v==33) {
295 qv_phredtype=33;
296 qv_cvtadd=31;
297 }
298 else if (v==64) {
299 qv_phredtype=64;
300 qv_cvtadd=-31;
301 }
302 else
303 GMessage("%s\nInvalid value for -p option (can only be 64 or 33)!\n",USAGE);
304 }
305 memset((void*)isACGT, 0, 256);
306 isACGT['A']=isACGT['a']=isACGT['C']=isACGT['c']=1;
307 isACGT['G']=isACGT['g']=isACGT['T']=isACGT['t']=1;
308 s=args.getOpt('f');
309 if (!s.is_empty()) {
310 loadAdaptors(s.chars());
311 }
312 bool fileAdaptors=adaptors5.Count()+adaptors3.Count();
313 s=args.getOpt('5');
314 if (!s.is_empty()) {
315 if (fileAdaptors)
316 GError("Error: options -5 and -f cannot be used together!\n");
317 s.upper();
318 addAdaptor(adaptors5, s, galn_TrimLeft);
319 }
320 s=args.getOpt('3');
321 if (!s.is_empty()) {
322 if (fileAdaptors)
323 GError("Error: options -3 and -f cannot be used together!\n");
324 s.upper();
325 addAdaptor(adaptors3, s, galn_TrimRight);
326 }
327 s=args.getOpt('y');
328 if (!s.is_empty()) {
329 int minmatch=s.asInt();
330 poly_minScore=minmatch*poly_m_score;
331 }
332
333 if (args.getOpt('o')!=NULL) outsuffix=args.getOpt('o');
334 else outsuffix="-";
335 trashReport= (args.getOpt('r')!=NULL);
336 int fcount=args.startNonOpt();
337 if (fcount==0) {
338 GMessage(USAGE);
339 exit(224);
340 }
341 if (fcount>1 && doCollapse) {
342 GError("%s Sorry, the -C option only works with a single input.\n", USAGE);
343 }
344 //openfw(f_out, args, 'o');
345 //if (f_out==NULL) f_out=stdout;
346 if (trashReport)
347 openfw(freport, args, 'r');
348 char* infile=NULL;
349
350 if (adaptors5.Count()>0)
351 //gxmem_l=new CGreedyAlignData(match_reward, mismatch_penalty, Xdrop-2);
352 gxmem_l=new CGreedyAlignData(match_reward, mismatch_penalty, Xdrop);
353 if (adaptors3.Count()>0)
354 gxmem_r=new CGreedyAlignData(match_reward, mismatch_penalty, Xdrop);
355
356 while ((infile=args.nextNonOpt())!=NULL) {
357 //for each input file
358 int incounter=0; //counter for input reads
359 int outcounter=0; //counter for output reads
360 int trash_s=0; //too short from the get go
361 int trash_Q=0;
362 int trash_N=0;
363 int trash_D=0;
364 int trash_poly=0;
365 int trash_V=0;
366 num_trimN=0;
367 num_trimQ=0;
368 num_trimV=0;
369 num_trimA=0;
370 num_trimT=0;
371 num_trim5=0;
372 num_trim3=0;
373
374 b_totalIn=0;
375 b_totalN=0;
376 b_trimN=0;
377 b_trimQ=0;
378 b_trimV=0;
379 b_trimA=0;
380 b_trimT=0;
381 b_trim5=0;
382 b_trim3=0;
383
384 s=infile;
385 GStr infname;
386 GStr infname2;
387 FILE* f_in=NULL;
388 FILE* f_in2=NULL;
389 FILE* f_out=NULL;
390 FILE* f_out2=NULL;
391 bool paired_reads=false;
392 setupFiles(f_in, f_in2, f_out, f_out2, s, infname, infname2);
393 GLineReader fq(f_in);
394 GLineReader* fq2=NULL;
395 if (f_in2!=NULL) {
396 fq2=new GLineReader(f_in2);
397 paired_reads=true;
398 }
399 GStr rseq, rqv, seqid, seqinfo;
400 GStr rseq2, rqv2, seqid2, seqinfo2;
401 while (getFastxRec(fq, rseq, rqv, seqid, seqinfo, infname)) {
402 incounter++;
403 int a5=0, a3=0, b5=0, b3=0;
404 char tcode=0, tcode2=0;
405 tcode=process_read(seqid, rseq, rqv, a5, a3);
406 if (a5>0) {
407 b_trim5+=a5;
408 num_trim5++;
409 }
410 if (a3<rseq.length()-1) {
411 b_trim3+=rseq.length()-a3-1;
412 num_trim3++;
413 }
414 if (fq2!=NULL) {
415 getFastxRec(*fq2, rseq2, rqv2, seqid2, seqinfo2, infname2);
416 if (seqid.substr(0,seqid.length()-1)!=seqid2.substr(0,seqid2.length()-1)) {
417 GError("Error: no paired match for read %s vs %s (%s,%s)\n",
418 seqid.chars(), seqid2.chars(), infname.chars(), infname2.chars());
419 }
420 tcode2=process_read(seqid2, rseq2, rqv2, b5, b3);
421 if (b5>0) {
422 b_trim5+=b5;
423 num_trim5++;
424 }
425 if (b3<rseq2.length()-1) {
426 b_trim3+=rseq2.length()-b3-1;
427 num_trim3++;
428 }
429 //decide what to do with this pair and print rseq2 if the pair makes it
430 if (tcode>1 && tcode2<=1) {
431 //"untrash" rseq
432 if (a3-a5+1<min_read_len) {
433 a5=1;
434 if (a3<min_read_len) { a3= GMIN(rseq.length()-1, min_read_len+1); }
435 }
436 tcode=1;
437 }
438 else if (tcode<=1 && tcode2>1) {
439 //"untrash" rseq2
440 if (b3-b5+1<min_read_len) {
441 b5=1;
442 if (b3<min_read_len) { b3= GMIN((rseq2.length()-1),(min_read_len+1)); }
443 }
444 tcode2=1;
445 }
446 if (tcode<=1) { //trimmed or left intact -- write it!
447 if (tcode>0) {
448 rseq2=rseq2.substr(b5,b3-b5+1);
449 if (!rqv2.is_empty()) rqv2=rqv2.substr(b5,b3-b5+1);
450 }
451 int nocounter=0;
452 writeRead(f_out2, seqid2, seqinfo2, rseq2, rqv2, nocounter);
453 }
454 } //pair read
455 if (tcode>1) { //trashed
456 #ifdef GDEBUG
457 GMessage(" !!!!TRASH code = %c\n",tcode);
458 #endif
459 if (tcode=='s') trash_s++;
460 else if (tcode=='A' || tcode=='T') trash_poly++;
461 else if (tcode=='Q') trash_Q++;
462 else if (tcode=='N') trash_N++;
463 else if (tcode=='D') trash_D++;
464 else if (tcode=='V') trash_V++;
465 //else if (tcode=='5') trash_A5++;
466 if (trashReport) trash_report(tcode, seqid, freport);
467 }
468 else if (!doCollapse) { //write it
469 if (tcode>0) {
470 rseq=rseq.substr(a5,a3-a5+1);
471 if (!rqv.is_empty()) rqv=rqv.substr(a5,a3-a5+1);
472 }
473 #ifdef GDEBUG
474 GMessage(" After trimming:\n");
475 GMessage("%s\n",rseq.chars());
476 #endif
477 writeRead(f_out, seqid, seqinfo, rseq, rqv, outcounter);
478 }
479 } //for each fastq record
480 delete fq2;
481 FRCLOSE(f_in);
482 FRCLOSE(f_in2);
483 if (doCollapse) {
484 outcounter=0;
485 int maxdup_count=1;
486 char* maxdup_seq=NULL;
487 dhash.startIterate();
488 FqDupRec* qd=NULL;
489 char* seq=NULL;
490 while ((qd=dhash.NextData(seq))!=NULL) {
491 GStr rseq(seq);
492 //do the dusting here
493 if (doDust) {
494 int dustbases=dust(rseq);
495 if (dustbases>(rseq.length()>>1)) {
496 if (trashReport && qd->firstname!=NULL) {
497 fprintf(freport, "%s_x%d\tD\n",qd->firstname, qd->count);
498 }
499 trash_D+=qd->count;
500 continue;
501 }
502 }
503 outcounter++;
504 if (qd->count>maxdup_count) {
505 maxdup_count=qd->count;
506 maxdup_seq=seq;
507 }
508 if (isfasta) {
509 if (prefix.is_empty()) {
510 fprintf(f_out, ">%s_x%d\n%s\n", qd->firstname, qd->count,
511 rseq.chars());
512 }
513 else { //use custom read name
514 fprintf(f_out, ">%s%08d_x%d\n%s\n", prefix.chars(), outcounter,
515 qd->count, rseq.chars());
516 }
517 }
518 else { //fastq format
519 if (convert_phred) convertPhred(qd->qv, qd->len);
520 if (prefix.is_empty()) {
521 fprintf(f_out, "@%s_x%d\n%s\n+\n%s\n", qd->firstname, qd->count,
522 rseq.chars(), qd->qv);
523 }
524 else { //use custom read name
525 fprintf(f_out, "@%s%08d_x%d\n%s\n+\n%s\n", prefix.chars(), outcounter,
526 qd->count, rseq.chars(), qd->qv);
527 }
528 }
529 }//for each element of dhash
530 if (maxdup_count>1) {
531 GMessage("Maximum read multiplicity: x %d (read: %s)\n",maxdup_count, maxdup_seq);
532 }
533 } //collapse entries
534 if (verbose) {
535 if (paired_reads) {
536 GMessage(">Input files : %s , %s\n", infname.chars(), infname2.chars());
537 GMessage("Number of input pairs :%9u\n", incounter);
538 GMessage(" Output pairs :%9u\t(%u trashed)\n", outcounter, incounter-outcounter);
539 }
540 else {
541 GMessage(">Input file : %s\n", infname.chars());
542 GMessage("Number of input reads :%9d\n", incounter);
543 GMessage(" Output reads :%9d (%u trashed)\n", outcounter, incounter-outcounter);
544 }
545 GMessage("--------------- Read trimming: ------------------\n");
546 if (num_trim5)
547 GMessage(" 5' trimmed :%9u\n", num_trim5);
548 if (num_trim3)
549 GMessage(" 3' trimmed :%9u\n", num_trim3);
550 if (num_trimQ)
551 GMessage(" q.v. trimmed :%9u\n", num_trimQ);
552 if (num_trimN)
553 GMessage(" N trimmed :%9u\n", num_trimN);
554 if (num_trimT)
555 GMessage(" poly-T trimmed :%9u\n", num_trimT);
556 if (num_trimA)
557 GMessage(" poly-A trimmed :%9u\n", num_trimA);
558 if (num_trimV)
559 GMessage(" Adapter trimmed :%9u\n", num_trimV);
560 GMessage("------------------------------------\n");
561 if (trash_s>0)
562 GMessage("Trashed by initial len:%9d\n", trash_s);
563 if (trash_N>0)
564 GMessage(" Trashed by N%%:%9d\n", trash_N);
565 if (trash_Q>0)
566 GMessage("Trashed by low quality:%9d\n", trash_Q);
567 if (trash_poly>0)
568 GMessage(" Trashed by poly-A/T:%9d\n", trash_poly);
569 if (trash_V>0)
570 GMessage(" Trashed by adaptor:%9d\n", trash_V);
571 GMessage("\n--------------- Base counts --------------------\n");
572 GMessage(" Input bases :%12llu\n", b_totalIn);
573 double percN=100.0* ((double)b_totalN/(double)b_totalIn);
574 GMessage(" N bases :%12llu (%4.2f%%)\n", b_totalN, percN);
575 GMessage(" trimmed from 5':%12llu\n", b_trim5);
576 GMessage(" trimmed from 3':%12llu\n", b_trim3);
577 GMessage("\n");
578 if (b_trimQ)
579 GMessage(" q.v. trimmed :%12llu\n", b_trimQ);
580 if (b_trimN)
581 GMessage(" N trimmed :%12llu\n", b_trimN);
582 if (b_trimT)
583 GMessage(" poly-T trimmed :%12llu\n", b_trimT);
584 if (b_trimA)
585 GMessage(" poly-A trimmed :%12llu\n", b_trimA);
586 if (b_trimV)
587 GMessage(" Adapter trimmed :%12llu\n", b_trimV);
588
589 }
590 if (trashReport) {
591 FWCLOSE(freport);
592 }
593 FWCLOSE(f_out);
594 FWCLOSE(f_out2);
595 } //while each input file
596 delete gxmem_l;
597 delete gxmem_r;
598 //getc(stdin);
599 }
600
601 class NData {
602 public:
603 int NPos[1024]; //there should be no reads longer than 1K ?
604 int NCount;
605 int end5;
606 int end3;
607 int seqlen;
608 double perc_N; //percentage of Ns in end5..end3 range only!
609 const char* seq;
610 bool valid;
611 NData() {
612 seqlen=0;
613 NCount=0;
614 end5=0;
615 end3=0;
616 seq=NULL;
617 perc_N=0;
618 valid=true;
619 }
620 void init(GStr& rseq) {
621 NCount=0;
622 perc_N=0;
623 seqlen=rseq.length();
624 seq=rseq.chars();
625 end5=1;
626 end3=seqlen;
627 for (int i=0;i<seqlen;i++)
628 if (seq[i]=='N') {// if (!ichrInStr(rseq[i], "ACGT")
629 NPos[NCount]=i;
630 NCount++;
631 }
632 perc_N=(NCount*100.0)/seqlen;
633 valid=true;
634 }
635 void N_calc() { //only in the region after trimming
636 int n=0;
637 for (int i=end3-1;i<end5;i++) {
638 if (seq[i]=='N') n++;
639 }
640 perc_N=(n*100.0)/(end5-end3+1);
641 }
642 };
643
644 static NData feat;
645 int perc_lenN=12; // incremental distance from ends, in percentage of
646 // sequence length, where N-trimming is done (default:12 %) (autolimited to 20)
647
648 void N_analyze(int l5, int l3, int p5, int p3) {
649 /* assumes feat was filled properly */
650 int old_dif, t5,t3,v;
651 if (l3<l5+2 || p5>p3 ) {
652 feat.end5=l5+1;
653 feat.end3=l3+1;
654 return;
655 }
656
657 t5=feat.NPos[p5]-l5;
658 t3=l3-feat.NPos[p3];
659 old_dif=p3-p5;
660 v=(int)((((double)(l3-l5))*perc_lenN)/100);
661 if (v>20) v=20; /* enforce N-search limit for very long reads */
662 if (t5 < v ) {
663 l5=feat.NPos[p5]+1;
664 p5++;
665 }
666 if (t3 < v) {
667 l3=feat.NPos[p3]-1;
668 p3--;
669 }
670 /* restNs=p3-p5; number of Ns in the new CLR */
671 if (p3-p5==old_dif) { /* no change, return */
672 /* don't trim if this may shorten the read too much */
673 //if (l5-l3<min_read_len) return;
674 feat.end5=l5+1;
675 feat.end3=l3+1;
676 return;
677 }
678 else
679 N_analyze(l5,l3, p5,p3);
680 }
681
682
683 bool qtrim(GStr& qvs, int &l5, int &l3) {
684 if (qvtrim_qmin==0 || qvs.is_empty()) return false;
685 if (qv_phredtype==0) {
686 //try to guess the Phred type
687 int vmin=256, vmax=0;
688 for (int i=0;i<qvs.length();i++) {
689 if (vmin>qvs[i]) vmin=qvs[i];
690 if (vmax<qvs[i]) vmax=qvs[i];
691 }
692 if (vmin<64) { qv_phredtype=33; qv_cvtadd=31; }
693 if (vmax>95) { qv_phredtype=64; qv_cvtadd=-31; }
694 if (qv_phredtype==0) {
695 GError("Error: couldn't determine Phred type, please use the -p33 or -p64 !\n");
696 }
697 if (verbose)
698 GMessage("Input reads have Phred-%d quality values.\n", (qv_phredtype==33 ? 33 : 64));
699 } //guessing Phred type
700 for (l3=qvs.length()-1;l3>2;l3--) {
701 if (qvs[l3]-qv_phredtype>=qvtrim_qmin && qvs[l3-1]-qv_phredtype>=qvtrim_qmin) break;
702 }
703 //just in case, check also the 5' the end (?)
704 for (l5=0;l5<qvs.length()-3;l5++) {
705 if (qvs[l5]-qv_phredtype>=qvtrim_qmin && qvs[l5+1]-qv_phredtype>=qvtrim_qmin) break;
706 }
707 if (qvtrim_max>0) {
708 if (qvs.length()-1-l3>qvtrim_max) l3=qvs.length()-1-qvtrim_max;
709 if (l5>qvtrim_max) l5=qvtrim_max;
710 }
711 return (l5>0 || l3<qvs.length()-1);
712 }
713
714 bool ntrim(GStr& rseq, int &l5, int &l3) {
715 //count Ns in the sequence, trim N-rich ends
716 feat.init(rseq);
717 l5=feat.end5-1;
718 l3=feat.end3-1;
719 N_analyze(feat.end5-1, feat.end3-1, 0, feat.NCount-1);
720 if (l5==feat.end5-1 && l3==feat.end3-1) {
721 if (feat.perc_N>max_perc_N) {
722 feat.valid=false;
723 l3=l5+1;
724 return true;
725 }
726 else {
727 return false; //nothing changed
728 }
729 }
730 l5=feat.end5-1;
731 l3=feat.end3-1;
732 if (l3-l5+1<min_read_len) {
733 feat.valid=false;
734 return true;
735 }
736 feat.N_calc();
737
738 if (feat.perc_N>max_perc_N) {
739 feat.valid=false;
740 l3=l5+1;
741 return true;
742 }
743 return true;
744 }
745
746 //--------------- dust functions ----------------
747 class DNADuster {
748 public:
749 int dustword;
750 int dustwindow;
751 int dustwindow2;
752 int dustcutoff;
753 int mv, iv, jv;
754 int counts[32*32*32];
755 int iis[32*32*32];
756 DNADuster(int cutoff=16, int winsize=32, int wordsize=3) {
757 dustword=wordsize;
758 dustwindow=winsize;
759 dustwindow2 = (winsize>>1);
760 dustcutoff=cutoff;
761 mv=0;
762 iv=0;
763 jv=0;
764 }
765 void setWindowSize(int value) {
766 dustwindow = value;
767 dustwindow2 = (dustwindow >> 1);
768 }
769 void setWordSize(int value) {
770 dustword=value;
771 }
772 void wo1(int len, const char* s, int ivv) {
773 int i, ii, j, v, t, n, n1, sum;
774 int js, nis;
775 n = 32 * 32 * 32;
776 n1 = n - 1;
777 nis = 0;
778 i = 0;
779 ii = 0;
780 sum = 0;
781 v = 0;
782 for (j=0; j < len; j++, s++) {
783 ii <<= 5;
784 if (*s<=32) {
785 i=0;
786 continue;
787 }
788 ii |= *s - 'A'; //assume uppercase!
789 ii &= n1;
790 i++;
791 if (i >= dustword) {
792 for (js=0; js < nis && iis[js] != ii; js++) ;
793 if (js == nis) {
794 iis[nis] = ii;
795 counts[ii] = 0;
796 nis++;
797 }
798 if ((t = counts[ii]) > 0) {
799 sum += t;
800 v = 10 * sum / j;
801 if (mv < v) {
802 mv = v;
803 iv = ivv;
804 jv = j;
805 }
806 }
807 counts[ii]++;
808 }
809 }
810 }
811
812 int wo(int len, const char* s, int* beg, int* end) {
813 int i, l1;
814 l1 = len - dustword + 1;
815 if (l1 < 0) {
816 *beg = 0;
817 *end = len - 1;
818 return 0;
819 }
820 mv = 0;
821 iv = 0;
822 jv = 0;
823 for (i=0; i < l1; i++) {
824 wo1(len-i, s+i, i);
825 }
826 *beg = iv;
827 *end = iv + jv;
828 return mv;
829 }
830
831 void dust(const char* seq, char* seqmsk, int seqlen, int cutoff=0) { //, maskFunc maskfn) {
832 int i, j, l, a, b, v;
833 if (cutoff==0) cutoff=dustcutoff;
834 a=0;b=0;
835 //GMessage("Dust cutoff=%d\n", cutoff);
836 for (i=0; i < seqlen; i += dustwindow2) {
837 l = (seqlen > i+dustwindow) ? dustwindow : seqlen-i;
838 v = wo(l, seq+i, &a, &b);
839 if (v > cutoff) {
840 //for (j = a; j <= b && j < dustwindow2; j++) {
841 for (j = a; j <= b; j++) {
842 seqmsk[i+j]='N';//could be made lowercase instead
843 }
844 }
845 }
846 //return first;
847 }
848 };
849
850 static DNADuster duster;
851
852 int dust(GStr& rseq) {
853 char* seq=Gstrdup(rseq.chars());
854 duster.dust(rseq.chars(), seq, rseq.length(), dust_cutoff);
855 //check the number of Ns:
856 int ncount=0;
857 for (int i=0;i<rseq.length();i++) {
858 if (seq[i]=='N') ncount++;
859 }
860 GFREE(seq);
861 return ncount;
862 }
863
864 struct SLocScore {
865 int pos;
866 int score;
867 SLocScore(int p=0,int s=0) {
868 pos=p;
869 score=s;
870 }
871 void set(int p, int s) {
872 pos=p;
873 score=s;
874 }
875 void add(int p, int add) {
876 pos=p;
877 score+=add;
878 }
879 };
880
881 bool trim_poly3(GStr &seq, int &l5, int &l3, const char* poly_seed) {
882 if (!doPolyTrim) return false;
883 int rlen=seq.length();
884 l5=0;
885 l3=rlen-1;
886 int32 seedVal=*(int32*)poly_seed;
887 char polyChar=poly_seed[0];
888 //assumes N trimming was already done
889 //so a poly match should be very close to the end of the read
890 // -- find the initial match (seed)
891 int lmin=GMAX((rlen-16), 0);
892 int li;
893 for (li=rlen-4;li>lmin;li--) {
894 if (seedVal==*(int*)&(seq[li])) {
895 break;
896 }
897 }
898 if (li<=lmin) return false;
899 //seed found, try to extend it both ways
900 //extend right
901 int ri=li+3;
902 SLocScore loc(ri, poly_m_score<<2);
903 SLocScore maxloc(loc);
904 //extend right
905 while (ri<rlen-1) {
906 ri++;
907 if (seq[ri]==polyChar) {
908 loc.add(ri,poly_m_score);
909 }
910 else if (seq[ri]=='N') {
911 loc.add(ri,0);
912 }
913 else { //mismatch
914 loc.add(ri,poly_mis_score);
915 if (maxloc.score-loc.score>poly_dropoff_score) break;
916 }
917 if (maxloc.score<=loc.score) {
918 maxloc=loc;
919 }
920 }
921 ri=maxloc.pos;
922 if (ri<rlen-6) return false; //no trimming wanted, too far from 3' end
923 //ri = right boundary for the poly match
924 //extend left
925 loc.set(li, maxloc.score);
926 maxloc.pos=li;
927 while (li>0) {
928 li--;
929 if (seq[li]==polyChar) {
930 loc.add(li,poly_m_score);
931 }
932 else if (seq[li]=='N') {
933 loc.add(li,0);
934 }
935 else { //mismatch
936 loc.add(li,poly_mis_score);
937 if (maxloc.score-loc.score>poly_dropoff_score) break;
938 }
939 if (maxloc.score<=loc.score) {
940 maxloc=loc;
941 }
942 }
943 li=maxloc.pos;
944 if ((maxloc.score==poly_minScore && ri==rlen-1) ||
945 (maxloc.score>poly_minScore && ri>=rlen-3) ||
946 (maxloc.score>(poly_minScore*3) && ri>=rlen-8)) {
947 //trimming this li-ri match at 3' end
948 l3=li-1;
949 if (l3<0) l3=0;
950 return true;
951 }
952 return false;
953 }
954
955 bool trim_poly5(GStr &seq, int &l5, int &l3, const char* poly_seed) {
956 if (!doPolyTrim) return false;
957 int rlen=seq.length();
958 l5=0;
959 l3=rlen-1;
960 int32 seedVal=*(int32*)poly_seed;
961 char polyChar=poly_seed[0];
962 //assumes N trimming was already done
963 //so a poly match should be very close to the end of the read
964 // -- find the initial match (seed)
965 int lmax=GMIN(12, rlen-4);//how far from 5' end to look for 4-mer seeds
966 int li;
967 for (li=0;li<=lmax;li++) {
968 if (seedVal==*(int*)&(seq[li])) {
969 break;
970 }
971 }
972 if (li>lmax) return false;
973 //seed found, try to extend it both ways
974 //extend left
975 int ri=li+3; //save rightmost base of the seed
976 SLocScore loc(li, poly_m_score<<2);
977 SLocScore maxloc(loc);
978 while (li>0) {
979 li--;
980 if (seq[li]==polyChar) {
981 loc.add(li,poly_m_score);
982 }
983 else if (seq[li]=='N') {
984 loc.add(li,0);
985 }
986 else { //mismatch
987 loc.add(li,poly_mis_score);
988 if (maxloc.score-loc.score>poly_dropoff_score) break;
989 }
990 if (maxloc.score<=loc.score) {
991 maxloc=loc;
992 }
993 }
994 li=maxloc.pos;
995 if (li>5) return false; //no trimming wanted, too far from 5' end
996 //li = right boundary for the poly match
997
998 //extend right
999 loc.set(ri, maxloc.score);
1000 maxloc.pos=ri;
1001 while (ri<rlen-1) {
1002 ri++;
1003 if (seq[ri]==polyChar) {
1004 loc.add(ri,poly_m_score);
1005 }
1006 else if (seq[ri]=='N') {
1007 loc.add(ri,0);
1008 }
1009 else { //mismatch
1010 loc.add(ri,poly_mis_score);
1011 if (maxloc.score-loc.score>poly_dropoff_score) break;
1012 }
1013 if (maxloc.score<=loc.score) {
1014 maxloc=loc;
1015 }
1016 }
1017 ri=maxloc.pos;
1018 if ((maxloc.score==poly_minScore && li==0) ||
1019 (maxloc.score>poly_minScore && li<2)
1020 || (maxloc.score>(poly_minScore*3) && li<8)) {
1021 //adjust l5 to reflect this trimming of 5' end
1022 l5=ri+1;
1023 if (l5>rlen-1) l5=rlen-1;
1024 return true;
1025 }
1026 return false;
1027 }
1028
1029 bool trim_adaptor3(GStr& seq, int&l5, int &l3) {
1030 if (adaptors3.Count()==0) return false;
1031 int rlen=seq.length();
1032 l5=0;
1033 l3=rlen-1;
1034 bool trimmed=false;
1035 GStr wseq(seq);
1036 int wlen=rlen;
1037 GXSeqData seqdata;
1038 int numruns=revCompl ? 2 : 1;
1039 GList<GXAlnInfo> bestalns(true, true, false);
1040 for (int ai=0;ai<adaptors3.Count();ai++) {
1041 for (int r=0;r<numruns;r++) {
1042 if (r) {
1043 seqdata.update(adaptors3[ai].seqr.chars(), adaptors3[ai].seqr.length(),
1044 adaptors3[ai].pzr, wseq.chars(), wlen, adaptors3[ai].amlen);
1045 }
1046 else {
1047 seqdata.update(adaptors3[ai].seq.chars(), adaptors3[ai].seq.length(),
1048 adaptors3[ai].pz, wseq.chars(), wlen, adaptors3[ai].amlen);
1049 }
1050 GXAlnInfo* aln=match_adaptor(seqdata, adaptors3[ai].trim_type, gxmem_r, 86);
1051 if (aln) {
1052 if (aln->strong) {
1053 trimmed=true;
1054 bestalns.Add(aln);
1055 break; //will check the rest next time
1056 }
1057 else bestalns.Add(aln);
1058 }
1059 }//forward and reverse adaptors
1060 if (trimmed) break; //will check the rest in the next cycle
1061 }//for each 3' adaptor
1062 if (bestalns.Count()>0) {
1063 GXAlnInfo* aln=bestalns[0];
1064 if (aln->sl-1 > wlen-aln->sr) {
1065 //keep left side
1066 l3-=(wlen-aln->sl+1);
1067 if (l3<0) l3=0;
1068 }
1069 else { //keep right side
1070 l5+=aln->sr;
1071 if (l5>=rlen) l5=rlen-1;
1072 }
1073 //delete aln;
1074 //if (l3-l5+1<min_read_len) return true;
1075 wseq=seq.substr(l5,l3-l5+1);
1076 wlen=wseq.length();
1077 return true; //break the loops here to report a good find
1078 }
1079 return false;
1080 }
1081
1082 bool trim_adaptor5(GStr& seq, int&l5, int &l3) {
1083 if (adaptors5.Count()==0) return false;
1084 int rlen=seq.length();
1085 l5=0;
1086 l3=rlen-1;
1087 bool trimmed=false;
1088 GStr wseq(seq);
1089 int wlen=rlen;
1090 GXSeqData seqdata;
1091 int numruns=revCompl ? 2 : 1;
1092 GList<GXAlnInfo> bestalns(true, true, false);
1093 for (int ai=0;ai<adaptors5.Count();ai++) {
1094 for (int r=0;r<numruns;r++) {
1095 if (r) {
1096 seqdata.update(adaptors5[ai].seqr.chars(), adaptors5[ai].seqr.length(),
1097 adaptors5[ai].pzr, wseq.chars(), wlen, adaptors5[ai].amlen);
1098 }
1099 else {
1100 seqdata.update(adaptors5[ai].seq.chars(), adaptors5[ai].seq.length(),
1101 adaptors5[ai].pz, wseq.chars(), wlen, adaptors5[ai].amlen);
1102 }
1103 GXAlnInfo* aln=match_adaptor(seqdata, adaptors5[ai].trim_type, gxmem_l, 90);
1104 if (aln) {
1105 if (aln->strong) {
1106 trimmed=true;
1107 bestalns.Add(aln);
1108 break; //will check the rest next time
1109 }
1110 else bestalns.Add(aln);
1111 }
1112 } //forward and reverse?
1113 if (trimmed) break; //will check the rest in the next cycle
1114 }//for each 5' adaptor
1115 if (bestalns.Count()>0) {
1116 GXAlnInfo* aln=bestalns[0];
1117 if (aln->sl-1 > wlen-aln->sr) {
1118 //keep left side
1119 l3-=(wlen-aln->sl+1);
1120 if (l3<0) l3=0;
1121 }
1122 else { //keep right side
1123 l5+=aln->sr;
1124 if (l5>=rlen) l5=rlen-1;
1125 }
1126 //delete aln;
1127 //if (l3-l5+1<min_read_len) return true;
1128 wseq=seq.substr(l5,l3-l5+1);
1129 wlen=wseq.length();
1130 return true; //break the loops here to report a good find
1131 }
1132 return false;
1133 }
1134
1135 //convert qvs to/from phred64 from/to phread33
1136 void convertPhred(GStr& q) {
1137 for (int i=0;i<q.length();i++) q[i]+=qv_cvtadd;
1138 }
1139
1140 void convertPhred(char* q, int len) {
1141 for (int i=0;i<len;i++) q[i]+=qv_cvtadd;
1142 }
1143
1144 bool getFastxRec(GLineReader& fq, GStr& rseq, GStr& rqv,
1145 GStr& rname, GStr& rinfo, GStr& infname) {
1146 rseq="";
1147 rqv="";
1148 rname="";
1149 rinfo="";
1150 if (fq.eof()) return false;
1151 char* l=fq.getLine();
1152 while (l!=NULL && (l[0]==0 || isspace(l[0]))) l=fq.getLine(); //ignore empty lines
1153 if (l==NULL) return false;
1154 /* if (rawFormat) {
1155 //TODO: implement raw qseq parsing here
1156 //if (raw type=N) then continue; //skip invalid/bad records
1157 } //raw qseq format
1158 else { // FASTQ or FASTA */
1159 isfasta=(l[0]=='>');
1160 if (!isfasta && l[0]!='@') GError("Error: fasta/fastq record marker not found(%s)\n%s\n",
1161 infname.chars(), l);
1162 GStr s(l);
1163 rname=&(l[1]);
1164 for (int i=0;i<rname.length();i++)
1165 if (rname[i]<=' ') {
1166 if (i<rname.length()-2) rinfo=rname.substr(i+1);
1167 rname.cut(i);
1168 break;
1169 }
1170 //now get the sequence
1171 if ((l=fq.getLine())==NULL)
1172 GError("Error: unexpected EOF after header for read %s (%s)\n",
1173 rname.chars(), infname.chars());
1174 rseq=l; //this must be the DNA line
1175 while ((l=fq.getLine())!=NULL) {
1176 //seq can span multiple lines
1177 if (l[0]=='>' || l[0]=='+') {
1178 fq.pushBack();
1179 break; //
1180 }
1181 rseq+=l;
1182 } //check for multi-line seq
1183 if (!isfasta) { //reading fastq quality values, which can also be multi-line
1184 if ((l=fq.getLine())==NULL)
1185 GError("Error: unexpected EOF after sequence for %s\n", rname.chars());
1186 if (l[0]!='+') GError("Error: fastq qv header marker not detected!\n");
1187 if ((l=fq.getLine())==NULL)
1188 GError("Error: unexpected EOF after qv header for %s\n", rname.chars());
1189 rqv=l;
1190 //if (rqv.length()!=rseq.length())
1191 // GError("Error: qv len != seq len for %s\n", rname.chars());
1192 while (rqv.length()<rseq.length() && ((l=fq.getLine())!=NULL)) {
1193 rqv+=l; //append to qv string
1194 }
1195 }// fastq
1196 // } //<-- FASTA or FASTQ
1197 rseq.upper();
1198 return true;
1199 }
1200
1201 #ifdef GDEBUG
1202 void showTrim(GStr& s, int l5, int l3) {
1203 if (l5>0 || l3==0) {
1204 color_bg(c_red);
1205 }
1206 for (int i=0;i<s.length()-1;i++) {
1207 if (i && i==l5) color_resetbg();
1208 fprintf(stderr, "%c", s[i]);
1209 if (i && i==l3) color_bg(c_red);
1210 }
1211 fprintf(stderr, "%c", s[s.length()-1]);
1212 color_reset();
1213 fprintf(stderr, "\n");
1214 }
1215 #endif
1216
1217 char process_read(GStr& rname, GStr& rseq, GStr& rqv, int &l5, int &l3) {
1218 //returns 0 if the read was untouched, 1 if it was just trimmed
1219 // and a trash code if it was trashed
1220 l5=0;
1221 l3=rseq.length()-1;
1222 #ifdef GDEBUG
1223 //rseq.reverse();
1224 GMessage(">%s\n", rname.chars());
1225 GMessage("%s\n",rseq.chars());
1226 #endif
1227 if (l3-l5+1<min_read_len) {
1228 return 's';
1229 }
1230 GStr wseq(rseq.chars());
1231 GStr wqv(rqv.chars());
1232 int w5=l5;
1233 int w3=l3;
1234 //count Ns
1235 b_totalIn+=rseq.length();
1236 for (int i=0;i<rseq.length();i++) {
1237 if (isACGT[(int)rseq[i]]==0) b_totalN++;
1238 }
1239
1240 //first do the q-based trimming
1241 if (qvtrim_qmin!=0 && !wqv.is_empty() && qtrim(wqv, w5, w3)) { // qv-threshold trimming
1242 b_trimQ+=(w5-l5)+(l3-w3);
1243 num_trimQ++;
1244 if (w3-w5+1<min_read_len) {
1245 return 'Q'; //invalid read
1246 }
1247 //-- keep only the w5..w3 range
1248 l5=w5;
1249 l3=w3;
1250 wseq=wseq.substr(w5, w3-w5+1);
1251 if (!wqv.is_empty())
1252 wqv=wqv.substr(w5, w3-w5+1);
1253 } //qv trimming
1254 // N-trimming on the remaining read seq
1255 if (ntrim(wseq, w5, w3)) {
1256 //Note: ntrim resets w5 and w3
1257 //GMessage("before: %s\n",wseq.chars());
1258 //GMessage("after : %s (%d)\n",wseq.substr(w5,w3-w5+1).chars(),w3-w5+1);
1259 int trim3=(wseq.length()-1-w3);
1260 b_trimN+=w5+trim3;
1261 num_trimN++;
1262 l5+=w5;
1263 l3-=trim3;
1264 if (w3-w5+1<min_read_len) {
1265 return 'N'; //to be trashed
1266 }
1267 //-- keep only the w5..w3 range
1268 wseq=wseq.substr(w5, w3-w5+1);
1269 if (!wqv.is_empty())
1270 wqv=wqv.substr(w5, w3-w5+1);
1271 w5=0;
1272 w3=wseq.length()-1;
1273 }
1274 char trim_code;
1275 //clean the more dirty end first - 3'
1276 bool trimmedA=false;
1277 bool trimmedT=false;
1278 bool trimmedV=false;
1279
1280 int prev_w5=0;
1281 int prev_w3=0;
1282 bool w3upd=true;
1283 bool w5upd=true;
1284 do {
1285 trim_code=0;
1286 if (w3upd && trim_poly3(wseq, w5, w3, polyA_seed)) {
1287 trim_code='A';
1288 b_trimA+=(w5+(wseq.length()-1-w3));
1289 if (!trimmedA) { num_trimA++; trimmedA=true; }
1290 }
1291 else if (w3upd && trim_poly3(wseq, w5, w3, polyT_seed)) {
1292 trim_code='T';
1293 b_trimT+=(w5+(wseq.length()-1-w3));
1294 if (!trimmedT) { num_trimT++; trimmedT=true; }
1295 }
1296 else if (w5upd && trim_poly5(wseq, w5, w3, polyA_seed)) {
1297 trim_code='A';
1298 b_trimA+=(w5+(wseq.length()-1-w3));
1299 if (!trimmedA) { num_trimA++; trimmedA=true; }
1300 }
1301 else if (w5upd && trim_poly5(wseq, w5, w3, polyT_seed)) {
1302 trim_code='T';
1303 b_trimT+=(w5+(wseq.length()-1-w3));
1304 if (!trimmedT) { num_trimT++; trimmedT=true; }
1305 }
1306 else if (trim_adaptor5(wseq, w5, w3)) {
1307 trim_code='V';
1308 b_trimV+=(w5+(wseq.length()-1-w3));
1309 if (!trimmedV) { num_trimV++; trimmedV=true; }
1310 }
1311 else if (trim_adaptor3(wseq, w5, w3)) {
1312 trim_code='V';
1313 b_trimV+=(w5+(wseq.length()-1-w3));
1314 if (!trimmedV) { num_trimV++; trimmedV=true; }
1315 }
1316 if (trim_code) {
1317 w3upd=(w3!=prev_w3);
1318 w5upd=(w5!=prev_w5);
1319 if (w3upd) prev_w3=w3;
1320 if (w5upd) prev_w5=w5;
1321 #ifdef GDEBUG
1322 GMessage("#### TRIM by '%c' code ( w5-w3 = %d-%d ):\n",trim_code, w5,w3);
1323 showTrim(wseq, w5, w3);
1324 #endif
1325 //int trimlen=wseq.length()-(w3-w5+1);
1326 l5+=w5;
1327 l3-=(wseq.length()-1-w3);
1328 if (w3-w5+1<min_read_len) {
1329 return trim_code;
1330 }
1331 //-- keep only the w5..w3 range
1332 wseq=wseq.substr(w5, w3-w5+1);
1333 if (!wqv.is_empty())
1334 wqv=wqv.substr(w5, w3-w5+1);
1335 }//trimming at 3' end
1336 } while (trim_code);
1337 if (doCollapse) {
1338 //keep read for later
1339 FqDupRec* dr=dhash.Find(wseq.chars());
1340 if (dr==NULL) { //new entry
1341 //if (prefix.is_empty())
1342 dhash.Add(wseq.chars(),
1343 new FqDupRec(&wqv, rname.chars()));
1344 //else dhash.Add(wseq.chars(), new FqDupRec(wqv.chars(),wqv.length()));
1345 }
1346 else
1347 dr->add(wqv);
1348 } //collapsing duplicates
1349 else { //not collapsing duplicates
1350 //apply the dust filter now
1351 if (doDust) {
1352 int dustbases=dust(wseq);
1353 if (dustbases>(wseq.length()>>1)) {
1354 return 'D';
1355 }
1356 }
1357 } //not collapsing duplicates
1358 return (l5>0 || l3<rseq.length()-1) ? 1 : 0;
1359 }
1360
1361 void printHeader(FILE* f_out, char recmarker, GStr& rname, GStr& rinfo) {
1362 //GMessage("printing Header..%c%s\n",recmarker, rname.chars());
1363 if (rinfo.is_empty()) fprintf(f_out, "%c%s\n",recmarker,rname.chars());
1364 else fprintf(f_out, "%c%s %s\n",recmarker, rname.chars(), rinfo.chars());
1365 }
1366
1367 void writeRead(FILE* f_out, GStr& rname, GStr& rinfo, GStr& rseq, GStr& rqv, int& outcounter) {
1368 outcounter++;
1369 bool asFasta=(rqv.is_empty() || fastaOutput);
1370 if (asFasta) {
1371 if (prefix.is_empty()) {
1372 printHeader(f_out, '>',rname,rinfo);
1373 fprintf(f_out, "%s\n", rseq.chars()); //plain one-line fasta for now
1374 }
1375 else {
1376 fprintf(f_out, ">%s%08d\n%s\n", prefix.chars(), outcounter,
1377 rseq.chars());
1378 }
1379 }
1380 else { //fastq
1381 if (convert_phred) convertPhred(rqv);
1382 if (prefix.is_empty()) {
1383 printHeader(f_out, '@', rname, rinfo);
1384 fprintf(f_out, "%s\n+\n%s\n", rseq.chars(), rqv.chars());
1385 }
1386 else
1387 fprintf(f_out, "@%s_%08d\n%s\n+\n%s\n", prefix.chars(), outcounter,
1388 rseq.chars(),rqv.chars() );
1389 }
1390 }
1391
1392 void trash_report(char trashcode, GStr& rname, FILE* freport) {
1393 if (freport==NULL || trashcode<=' ') return;
1394 if (trashcode=='3' || trashcode=='5') {
1395 fprintf(freport, "%s\ta%c\n",rname.chars(),trashcode);
1396 }
1397 else {
1398 fprintf(freport, "%s\t%c\n",rname.chars(),trashcode);
1399 }
1400 //tcounter++;
1401 }
1402
1403 GStr getFext(GStr& s, int* xpos=NULL) {
1404 //s must be a filename without a path
1405 GStr r("");
1406 if (xpos!=NULL) *xpos=0;
1407 if (s.is_empty() || s=="-") return r;
1408 int p=s.rindex('.');
1409 int d=s.rindex('/');
1410 if (p<=0 || p>s.length()-2 || p<s.length()-7 || p<d) return r;
1411 r=s.substr(p+1);
1412 if (xpos!=NULL) *xpos=p+1;
1413 r.lower();
1414 return r;
1415 }
1416
1417 void baseFileName(GStr& fname) {
1418 //remove all known extensions, like .txt,fq,fastq,fasta,fa)(.gz .gzip .bz2 .bzip2) .
1419 int xpos=0;
1420 GStr fext=getFext(fname, &xpos);
1421 if (xpos<=1) return;
1422 bool extdel=false;
1423 GStr f2;
1424 if (fext=="z" || fext=="zip") {
1425 extdel=true;
1426 }
1427 else if (fext.length()>=2) {
1428 f2=fext.substr(0,2);
1429 extdel=(f2=="gz" || f2=="bz");
1430 }
1431 if (extdel) {
1432 fname.cut(xpos-1);
1433 fext=getFext(fname, &xpos);
1434 if (xpos<=1) return;
1435 }
1436 extdel=false;
1437 if (fext=="f" || fext=="fq" || fext=="txt" || fext=="seq" || fext=="sequence") {
1438 extdel=true;
1439 }
1440 else if (fext.length()>=2) {
1441 extdel=(fext.substr(0,2)=="fa");
1442 }
1443 if (extdel) fname.cut(xpos-1);
1444 GStr fncp(fname);
1445 fncp.lower();
1446 fncp.chomp("seq");
1447 fncp.chomp("sequence");
1448 fncp.trimR("_.");
1449 if (fncp.length()<fname.length()) fname.cut(fncp.length());
1450 }
1451
1452 FILE* prepOutFile(GStr& infname, GStr& pocmd) {
1453 FILE* f_out=NULL;
1454 GStr fname(getFileName(infname.chars()));
1455 //eliminate known extensions
1456 baseFileName(fname);
1457 if (outsuffix.is_empty() || outsuffix=="-") { return stdout; }
1458 else if (pocmd.is_empty()) {
1459 GStr oname(fname);
1460 oname.append('.');
1461 oname.append(outsuffix);
1462 f_out=fopen(oname.chars(),"w");
1463 if (f_out==NULL) GError("Error: cannot create '%s'\n",oname.chars());
1464 }
1465 else {
1466 GStr oname(">");
1467 oname.append(fname);
1468 oname.append('.');
1469 oname.append(outsuffix);
1470 pocmd.append(oname);
1471 f_out=popen(pocmd.chars(), "w");
1472 if (f_out==NULL) GError("Error: cannot popen '%s'\n",pocmd.chars());
1473 }
1474 return f_out;
1475 }
1476
1477 void guess_unzip(GStr& fname, GStr& picmd) {
1478 GStr fext=getFext(fname);
1479 if (fext=="gz" || fext=="gzip" || fext=="z") {
1480 picmd="gzip -cd ";
1481 }
1482 else if (fext=="bz2" || fext=="bzip2" || fext=="bz" || fext=="bzip") {
1483 picmd="bzip2 -cd ";
1484 }
1485 }
1486
1487 void addAdaptor(GVec<CASeqData>& adaptors, GStr& seq, GAlnTrimType trim_type) {
1488 //TODO: prepare CASeqData here, and collect hexamers as well
1489 if (seq.is_empty() || seq=="-" ||
1490 seq=="N/A" || seq==".") return;
1491
1492 CASeqData adata(revCompl);
1493 int idx=adaptors.Add(adata);
1494 if (idx<0) GError("Error: failed to add adaptor!\n");
1495 adaptors[idx].trim_type=trim_type;
1496 adaptors[idx].update(seq.chars());
1497 }
1498
1499
1500 int loadAdaptors(const char* fname) {
1501 GLineReader lr(fname);
1502 char* l;
1503 while ((l=lr.nextLine())!=NULL) {
1504 if (lr.length()<=3 || l[0]=='#') continue;
1505 if ( l[0]==' ' || l[0]=='\t' || l[0]==',' ||
1506 l[0]==';'|| l[0]==':' ) {
1507 int i=1;
1508 while (l[i]!=0 && isspace(l[i])) {
1509 i++;
1510 }
1511 if (l[i]!=0) {
1512 GStr s(&(l[i]));
1513 #ifdef GDEBUG
1514 //s.reverse();
1515 #endif
1516 addAdaptor(adaptors3, s, galn_TrimRight);
1517 continue;
1518 }
1519 }
1520 else {
1521 GStr s(l);
1522 s.startTokenize("\t ;,:");
1523 GStr a5,a3;
1524 if (s.nextToken(a5))
1525 s.nextToken(a3);
1526 else continue; //no tokens on this line
1527 GAlnTrimType ttype5=galn_TrimLeft;
1528 a5.upper();
1529 a3.upper();
1530 if (a3.is_empty() || a3==a5 || a3=="=") {
1531 a3.clear();
1532 ttype5=galn_TrimEither;
1533 }
1534 #ifdef GDEBUG
1535 // a5.reverse();
1536 // a3.reverse();
1537 #endif
1538 addAdaptor(adaptors5, a5, ttype5);
1539 addAdaptor(adaptors3, a3, galn_TrimRight);
1540 }
1541 }
1542 return adaptors5.Count()+adaptors3.Count();
1543 }
1544
1545 void setupFiles(FILE*& f_in, FILE*& f_in2, FILE*& f_out, FILE*& f_out2,
1546 GStr& s, GStr& infname, GStr& infname2) {
1547 // uses outsuffix to generate output file names and open file handles as needed
1548 infname="";
1549 infname2="";
1550 f_in=NULL;
1551 f_in2=NULL;
1552 f_out=NULL;
1553 f_out2=NULL;
1554 //analyze outsuffix intent
1555 GStr pocmd;
1556 if (outsuffix=="-") {
1557 f_out=stdout;
1558 }
1559 else {
1560 GStr ox=getFext(outsuffix);
1561 if (ox.length()>2) ox=ox.substr(0,2);
1562 if (ox=="gz") pocmd="gzip -9 -c ";
1563 else if (ox=="bz") pocmd="bzip2 -9 -c ";
1564 }
1565 if (s=="-") {
1566 f_in=stdin;
1567 infname="stdin";
1568 f_out=prepOutFile(infname, pocmd);
1569 return;
1570 } // streaming from stdin
1571 s.startTokenize(",:");
1572 s.nextToken(infname);
1573 bool paired=s.nextToken(infname2);
1574 if (fileExists(infname.chars())==0)
1575 GError("Error: cannot find file %s!\n",infname.chars());
1576 GStr fname(getFileName(infname.chars()));
1577 GStr picmd;
1578 guess_unzip(fname, picmd);
1579 if (picmd.is_empty()) {
1580 f_in=fopen(infname.chars(), "r");
1581 if (f_in==NULL) GError("Error opening file '%s'!\n",infname.chars());
1582 }
1583 else {
1584 picmd.append(infname);
1585 f_in=popen(picmd.chars(), "r");
1586 if (f_in==NULL) GError("Error at popen %s!\n", picmd.chars());
1587 }
1588 if (f_out==stdout) {
1589 if (paired) GError("Error: output suffix required for paired reads\n");
1590 return;
1591 }
1592 f_out=prepOutFile(infname, pocmd);
1593 if (!paired) return;
1594 if (doCollapse) GError("Error: sorry, -C option cannot be used with paired reads!\n");
1595 // ---- paired reads:-------------
1596 if (fileExists(infname2.chars())==0)
1597 GError("Error: cannot find file %s!\n",infname2.chars());
1598 picmd="";
1599 GStr fname2(getFileName(infname2.chars()));
1600 guess_unzip(fname2, picmd);
1601 if (picmd.is_empty()) {
1602 f_in2=fopen(infname2.chars(), "r");
1603 if (f_in2==NULL) GError("Error opening file '%s'!\n",infname2.chars());
1604 }
1605 else {
1606 picmd.append(infname2);
1607 f_in2=popen(picmd.chars(), "r");
1608 if (f_in2==NULL) GError("Error at popen %s!\n", picmd.chars());
1609 }
1610 f_out2=prepOutFile(infname2, pocmd);
1611 }