ViewVC Help
View File | Revision Log | Show Annotations | View Changeset | Root Listing
root/gclib/fqtrim/fqtrim.cpp
Revision: 108
Committed: Tue Oct 11 19:49:29 2011 UTC (7 years, 10 months ago) by gpertea
File size: 41757 byte(s)
Log Message:
made more stringent on the 5' end, other tests and fixes

Line User Rev File contents
1 gpertea 4 #include "GArgs.h"
2     #include "GStr.h"
3     #include "GHash.hh"
4     #include "GList.hh"
5 gpertea 13 #include <ctype.h>
6 gpertea 94 #include "GAlnExtend.h"
7 gpertea 4
8     #define USAGE "Usage:\n\
9 gpertea 76 fqtrim [{-5 <5adapter> -3 <3adapter>|-f <adapters_file>}] [-a <min_matchlen>]\\\n\
10     [-q <minq> [-t <trim_max_len>]] [-p {64|33}] [-o <outsuffix>]\\\n\
11     [-l <minlen>] [-C] [-D] [-Q] [-n <rename_prefix>] [-r <discarded.lst>]\\\n\
12     <input.fq>[,<input_mates.fq>\n\
13 gpertea 4 \n\
14 gpertea 76 Trim low quality bases at the 3' end and can trim adapter sequence(s), filter\n\
15     for low complexity and collapse duplicate reads.\n\
16 gpertea 13 If read pairs should be trimmed and kept together (i.e. without discarding\n\
17     one read in a pair), the two file names should be given delimited by a comma\n\
18     or a colon character\n\
19 gpertea 4 \n\
20     Options:\n\
21     -n rename all the reads using the <prefix> followed by a read counter;\n\
22     if -C option was given, the suffix \"_x<N>\" is appended, with <N> being\n\
23     the read duplication count\n\
24 gpertea 76 -o unless this parameter is '-', write the trimmed/filtered reads to \n\
25     file(s) named <input>.<outsuffix> which will be created in the \n\
26     current (working) directory; (writes to stdout if -o- is given);\n\
27     a suffix ending with .gz, .gzip or .bz2 will enforce compression\n\
28     -f file with adapter sequences to trim, each line having this format:\n\
29     <5'-adapter-sequence> <3'-adapter-sequence>\n\
30 gpertea 4 -5 trim the given adapter or primer sequence at the 5' end of each read\n\
31     (e.g. -5 CGACAGGTTCAGAGTTCTACAGTCCGACGATC)\n\
32     -3 trim the given adapter sequence at the 3' end of each read\n\
33     (e.g. -3 TCGTATGCCGTCTTCTGCTTG)\n\
34 gpertea 103 -A disable polyA/T trimming (enabled by default)\n\
35 gpertea 94 -y minimum length of exact match to adaptor sequence at the proper end (6)\n\
36 gpertea 13 -q trim bases with quality value lower than <minq> (starting at the 3' end)\n\
37 gpertea 76 -t for -q option, maximum trimming at the 3' end is limited to <trim_max_len>\n\
38 gpertea 12 -m maximum percentage of Ns allowed in a read after trimming (default 7)\n\
39     -l minimum \"clean\" length after trimming that a read must have\n\
40     in order to pass the filter (default: 16)\n\
41 gpertea 4 -r write a simple \"trash report\" file listing the discarded reads with a\n\
42     one letter code for the reason why they were trashed\n\
43 gpertea 12 -D apply a low-complexity (dust) filter and discard any read that has over\n\
44     50% of its length detected as low complexity\n\
45 gpertea 4 -C collapse duplicate reads and add a prefix with their count to the read\n\
46 gpertea 12 name (e.g. for microRNAs)\n\
47     -p input is phred64/phred33 (use -p64 or -p33)\n\
48     -Q convert quality values to the other Phred qv type\n\
49     -V verbose processing\n\
50 gpertea 4 "
51 gpertea 13
52     //-z for -o option, the output stream(s) will be first piped into the given\n
53     // <zcmd> command, which must output to stdout (e.g. -z 'bzip2 -9 -c')\n
54    
55    
56 gpertea 4 // example 3' adapter for miRNAs: TCGTATGCCGTCTTCTGCTTG
57 gpertea 12
58 gpertea 76 //For paired reads sequencing:
59 gpertea 12 //3' : ACACTCTTTCCCTACACGACGCTCTTCCGATCT
60     //5' : GATCGGAAGAGCGGTTCAGCAGGAATGCCGAG
61 gpertea 13 //FILE* f_out=NULL; //stdout if not provided
62     //FILE* f_out2=NULL; //for paired reads
63     //FILE* f_in=NULL; //input fastq (stdin if not provided)
64     //FILE* f_in2=NULL; //for paired reads
65 gpertea 4 FILE* freport=NULL;
66 gpertea 13
67 gpertea 4 bool debug=false;
68 gpertea 12 bool verbose=false;
69 gpertea 4 bool doCollapse=false;
70     bool doDust=false;
71 gpertea 103 bool doPolyTrim=true;
72 gpertea 13 bool fastaOutput=false;
73 gpertea 4 bool trashReport=false;
74 gpertea 12 //bool rawFormat=false;
75 gpertea 4 int min_read_len=16;
76 gpertea 12 double max_perc_N=7.0;
77 gpertea 4 int dust_cutoff=16;
78     bool isfasta=false;
79 gpertea 12 bool convert_phred=false;
80 gpertea 13 GStr outsuffix; // -o
81     GStr prefix;
82     GStr zcmd;
83     int num_trimmed5=0;
84     int num_trimmed3=0;
85     int num_trimmedN=0;
86     int num_trimmedQ=0;
87     int min_trimmed5=INT_MAX;
88     int min_trimmed3=INT_MAX;
89 gpertea 12
90 gpertea 13 int qvtrim_qmin=0;
91     int qvtrim_max=0; //for -q, do not trim at 3'-end more than this number of bases
92 gpertea 12 int qv_phredtype=0; // could be 64 or 33 (0 means undetermined yet)
93     int qv_cvtadd=0; //could be -31 or +31
94    
95 gpertea 94 // adaptor matching metrics -- for X-drop ungapped extension
96 gpertea 103 const int match_reward=2;
97     const int mismatch_penalty=3;
98     const int Xdrop=8;
99    
100 gpertea 94 const int poly_m_score=2; //match score
101     const int poly_mis_score=-3; //mismatch
102 gpertea 88 const int poly_dropoff_score=7;
103 gpertea 94 int poly_minScore=12; //i.e. an exact match of 6 bases at the proper ends WILL be trimmed
104 gpertea 4
105 gpertea 94 const char *polyA_seed="AAAA";
106     const char *polyT_seed="TTTT";
107 gpertea 102 GVec<GStr> adapters5;
108     GVec<GStr> adapters3;
109 gpertea 12
110 gpertea 103 CGreedyAlignData* gxmem_l=NULL;
111     CGreedyAlignData* gxmem_r=NULL;
112    
113 gpertea 4 // element in dhash:
114     class FqDupRec {
115     public:
116     int count; //how many of these reads are the same
117     int len; //length of qv
118     char* firstname; //optional, only if we want to keep the original read names
119     char* qv;
120     FqDupRec(GStr* qstr=NULL, const char* rname=NULL) {
121     len=0;
122     qv=NULL;
123     firstname=NULL;
124     count=0;
125     if (qstr!=NULL) {
126     qv=Gstrdup(qstr->chars());
127     len=qstr->length();
128     count++;
129     }
130     if (rname!=NULL) firstname=Gstrdup(rname);
131     }
132     ~FqDupRec() {
133     GFREE(qv);
134     GFREE(firstname);
135     }
136     void add(GStr& d) { //collapse another record into this one
137     if (d.length()!=len)
138     GError("Error at FqDupRec::add(): cannot collapse reads with different length!\n");
139     count++;
140     for (int i=0;i<len;i++)
141     qv[i]=(qv[i]+d[i])/2;
142     }
143     };
144    
145     void openfw(FILE* &f, GArgs& args, char opt) {
146     GStr s=args.getOpt(opt);
147     if (!s.is_empty()) {
148     if (s=='-') f=stdout;
149     else {
150 gpertea 13 f=fopen(s.chars(),"w");
151 gpertea 4 if (f==NULL) GError("Error creating file: %s\n", s.chars());
152     }
153     }
154     }
155    
156     #define FWCLOSE(fh) if (fh!=NULL && fh!=stdout) fclose(fh)
157     #define FRCLOSE(fh) if (fh!=NULL && fh!=stdin) fclose(fh)
158    
159     GHash<FqDupRec> dhash; //hash to keep track of duplicates
160    
161 gpertea 94 int loadAdapters(const char* fname);
162    
163 gpertea 13 void setupFiles(FILE*& f_in, FILE*& f_in2, FILE*& f_out, FILE*& f_out2,
164     GStr& s, GStr& infname, GStr& infname2);
165     // uses outsuffix to generate output file names and open file handles as needed
166    
167     void writeRead(FILE* f_out, GStr& rname, GStr& rinfo, GStr& rseq, GStr& rqv, int& outcounter);
168     void trash_report(char trashcode, GStr& rname, FILE* freport);
169    
170     bool getFastxRec(GLineReader& fq, GStr& rseq, GStr& rqv,
171     GStr& rname, GStr& rinfo, GStr& infname);
172    
173     char process_read(GStr& rname, GStr& rseq, GStr& rqv, int &l5, int &l3);
174     //returns 0 if the read was untouched, 1 if it was trimmed and a trash code if it was trashed
175    
176 gpertea 12 bool ntrim(GStr& rseq, int &l5, int &l3); //returns true if any trimming occured
177     bool qtrim(GStr& qvs, int &l5, int &l3); //return true if any trimming occured
178 gpertea 4 int dust(GStr& seq);
179 gpertea 94 bool trim_poly5(GStr &seq, int &l5, int &l3, const char* poly_seed); //returns true if any trimming occured
180     bool trim_poly3(GStr &seq, int &l5, int &l3, const char* poly_seed);
181 gpertea 4 bool trim_adapter5(GStr& seq, int &l5, int &l3); //returns true if any trimming occured
182 gpertea 94 bool trim_adapter3(GStr& seq, int &l5, int &l3);
183 gpertea 4
184 gpertea 12 void convertPhred(char* q, int len);
185     void convertPhred(GStr& q);
186 gpertea 4
187     int main(int argc, char * const argv[]) {
188 gpertea 103 GArgs args(argc, argv, "YQDCVAl:d:3:5:m:n:r:p:q:f:t:o:z:a:");
189 gpertea 4 int e;
190     if ((e=args.isError())>0) {
191     GMessage("%s\nInvalid argument: %s\n", USAGE, argv[e]);
192     exit(224);
193     }
194 gpertea 9 debug=(args.getOpt('Y')!=NULL);
195 gpertea 13 verbose=(args.getOpt('V')!=NULL);
196 gpertea 12 convert_phred=(args.getOpt('Q')!=NULL);
197 gpertea 4 doCollapse=(args.getOpt('C')!=NULL);
198     doDust=(args.getOpt('D')!=NULL);
199 gpertea 103 if (args.getOpt('A')) doPolyTrim=false;
200 gpertea 12 /*
201 gpertea 4 rawFormat=(args.getOpt('R')!=NULL);
202     if (rawFormat) {
203     GError("Sorry, raw qseq format parsing is not implemented yet!\n");
204     }
205 gpertea 12 */
206 gpertea 4 prefix=args.getOpt('n');
207     GStr s=args.getOpt('l');
208     if (!s.is_empty())
209     min_read_len=s.asInt();
210 gpertea 12 s=args.getOpt('m');
211     if (!s.is_empty())
212     max_perc_N=s.asDouble();
213 gpertea 4 s=args.getOpt('d');
214     if (!s.is_empty()) {
215     dust_cutoff=s.asInt();
216     doDust=true;
217     }
218 gpertea 12 s=args.getOpt('q');
219     if (!s.is_empty()) {
220 gpertea 13 qvtrim_qmin=s.asInt();
221 gpertea 12 }
222 gpertea 13 s=args.getOpt('t');
223     if (!s.is_empty()) {
224     qvtrim_max=s.asInt();
225     }
226 gpertea 12 s=args.getOpt('p');
227     if (!s.is_empty()) {
228     int v=s.asInt();
229     if (v==33) {
230     qv_phredtype=33;
231     qv_cvtadd=31;
232     }
233     else if (v==64) {
234     qv_phredtype=64;
235     qv_cvtadd=-31;
236     }
237     else
238     GMessage("%s\nInvalid value for -p option (can only be 64 or 33)!\n",USAGE);
239     }
240 gpertea 94 s=args.getOpt('f');
241     if (!s.is_empty()) {
242     loadAdapters(s.chars());
243     }
244 gpertea 102 bool fileAdapters=adapters5.Count()+adapters3.Count();
245 gpertea 94 s=args.getOpt('5');
246     if (!s.is_empty()) {
247     if (fileAdapters)
248     GError("Error: options -5 and -f cannot be used together!\n");
249     s.upper();
250 gpertea 102 adapters5.Add(s);
251 gpertea 4 }
252 gpertea 94 s=args.getOpt('3');
253     if (!s.is_empty()) {
254     if (fileAdapters)
255     GError("Error: options -3 and -f cannot be used together!\n");
256 gpertea 102 s.upper();
257     adapters3.Add(s);
258 gpertea 4 }
259 gpertea 94 s=args.getOpt('y');
260 gpertea 13 if (!s.is_empty()) {
261 gpertea 94 int minmatch=s.asInt();
262     poly_minScore=minmatch*poly_m_score;
263 gpertea 13 }
264 gpertea 76
265 gpertea 13 if (args.getOpt('o')!=NULL) outsuffix=args.getOpt('o');
266 gpertea 76 else outsuffix="-";
267 gpertea 4 trashReport= (args.getOpt('r')!=NULL);
268 gpertea 13 int fcount=args.startNonOpt();
269     if (fcount==0) {
270 gpertea 4 GMessage(USAGE);
271     exit(224);
272     }
273 gpertea 13 if (fcount>1 && doCollapse) {
274     GError("%s Sorry, the -C option only works with a single input.\n", USAGE);
275     }
276     //openfw(f_out, args, 'o');
277     //if (f_out==NULL) f_out=stdout;
278 gpertea 4 if (trashReport)
279     openfw(freport, args, 'r');
280     char* infile=NULL;
281 gpertea 103
282     if (adapters5.Count()>0)
283     gxmem_l=new CGreedyAlignData(match_reward, mismatch_penalty, Xdrop-2);
284     if (adapters3.Count()>0)
285     gxmem_r=new CGreedyAlignData(match_reward, mismatch_penalty, Xdrop);
286    
287 gpertea 4 while ((infile=args.nextNonOpt())!=NULL) {
288 gpertea 103 //for each input file
289 gpertea 13 int incounter=0; //counter for input reads
290     int outcounter=0; //counter for output reads
291     int trash_s=0; //too short from the get go
292     int trash_Q=0;
293     int trash_N=0;
294     int trash_D=0;
295 gpertea 103 int trash_poly=0;
296 gpertea 13 int trash_A3=0;
297     int trash_A5=0;
298     s=infile;
299     GStr infname;
300     GStr infname2;
301     FILE* f_in=NULL;
302     FILE* f_in2=NULL;
303     FILE* f_out=NULL;
304     FILE* f_out2=NULL;
305     bool paired_reads=false;
306     setupFiles(f_in, f_in2, f_out, f_out2, s, infname, infname2);
307     GLineReader fq(f_in);
308     GLineReader* fq2=NULL;
309     if (f_in2!=NULL) {
310     fq2=new GLineReader(f_in2);
311     paired_reads=true;
312     }
313     GStr rseq, rqv, seqid, seqinfo;
314     GStr rseq2, rqv2, seqid2, seqinfo2;
315     while (getFastxRec(fq, rseq, rqv, seqid, seqinfo, infname)) {
316     incounter++;
317     int a5=0, a3=0, b5=0, b3=0;
318     char tcode=0, tcode2=0;
319     tcode=process_read(seqid, rseq, rqv, a5, a3);
320 gpertea 103 if (fq2!=NULL) {
321 gpertea 13 getFastxRec(*fq2, rseq2, rqv2, seqid2, seqinfo2, infname2);
322     if (seqid.substr(0,seqid.length()-1)!=seqid2.substr(0,seqid2.length()-1)) {
323     GError("Error: no paired match for read %s vs %s (%s,%s)\n",
324     seqid.chars(), seqid2.chars(), infname.chars(), infname2.chars());
325     }
326     tcode2=process_read(seqid2, rseq2, rqv2, b5, b3);
327     //decide what to do with this pair and print rseq2 if the pair makes it
328     if (tcode>1 && tcode2<=1) {
329     //"untrash" rseq
330     if (a3-a5+1<min_read_len) {
331     a5=1;
332     if (a3<min_read_len) { a3= GMIN(rseq.length()-1, min_read_len+1); }
333 gpertea 4 }
334 gpertea 13 tcode=1;
335     }
336     else if (tcode<=1 && tcode2>1) {
337     //"untrash" rseq2
338     if (b3-b5+1<min_read_len) {
339     b5=1;
340     if (b3<min_read_len) { b3= GMIN((rseq2.length()-1),(min_read_len+1)); }
341     }
342     tcode2=1;
343     }
344     if (tcode<=1) { //trimmed or left intact -- write it!
345     if (tcode>0) {
346     rseq2=rseq2.substr(b5,b3-b5+1);
347     if (!rqv2.is_empty()) rqv2=rqv2.substr(b5,b3-b5+1);
348 gpertea 4 }
349 gpertea 13 int nocounter=0;
350     writeRead(f_out2, seqid2, seqinfo2, rseq2, rqv2, nocounter);
351     }
352 gpertea 103 } //pair read
353 gpertea 13 if (tcode>1) { //trashed
354 gpertea 103 #ifdef GDEBUG
355     GMessage(" !!!!TRASH => 'N'\n");
356     #endif
357 gpertea 13 if (tcode=='s') trash_s++;
358 gpertea 103 else if (tcode=='A' || tcode=='T') trash_poly++;
359 gpertea 13 else if (tcode=='Q') trash_Q++;
360     else if (tcode=='N') trash_N++;
361     else if (tcode=='D') trash_D++;
362     else if (tcode=='3') trash_A3++;
363     else if (tcode=='5') trash_A5++;
364     if (trashReport) trash_report(tcode, seqid, freport);
365     }
366     else if (!doCollapse) { //write it
367     if (tcode>0) {
368     rseq=rseq.substr(a5,a3-a5+1);
369     if (!rqv.is_empty()) rqv=rqv.substr(a5,a3-a5+1);
370     }
371 gpertea 103 #ifdef GDEBUG
372     GMessage(" After trimming:\n");
373 gpertea 105 GMessage("%s\n",rseq.chars());
374 gpertea 103 #endif
375 gpertea 13 writeRead(f_out, seqid, seqinfo, rseq, rqv, outcounter);
376     }
377     } //for each fastq record
378     delete fq2;
379     FRCLOSE(f_in);
380     FRCLOSE(f_in2);
381     if (doCollapse) {
382     outcounter=0;
383     int maxdup_count=1;
384     char* maxdup_seq=NULL;
385     dhash.startIterate();
386     FqDupRec* qd=NULL;
387     char* seq=NULL;
388     while ((qd=dhash.NextData(seq))!=NULL) {
389     GStr rseq(seq);
390     //do the dusting here
391     if (doDust) {
392     int dustbases=dust(rseq);
393     if (dustbases>(rseq.length()>>1)) {
394     if (trashReport && qd->firstname!=NULL) {
395     fprintf(freport, "%s_x%d\tD\n",qd->firstname, qd->count);
396 gpertea 4 }
397 gpertea 13 trash_D+=qd->count;
398     continue;
399     }
400     }
401     outcounter++;
402     if (qd->count>maxdup_count) {
403     maxdup_count=qd->count;
404     maxdup_seq=seq;
405     }
406     if (isfasta) {
407     if (prefix.is_empty()) {
408     fprintf(f_out, ">%s_x%d\n%s\n", qd->firstname, qd->count,
409     rseq.chars());
410 gpertea 4 }
411 gpertea 13 else { //use custom read name
412     fprintf(f_out, ">%s%08d_x%d\n%s\n", prefix.chars(), outcounter,
413     qd->count, rseq.chars());
414 gpertea 4 }
415 gpertea 13 }
416     else { //fastq format
417     if (convert_phred) convertPhred(qd->qv, qd->len);
418     if (prefix.is_empty()) {
419     fprintf(f_out, "@%s_x%d\n%s\n+\n%s\n", qd->firstname, qd->count,
420     rseq.chars(), qd->qv);
421 gpertea 4 }
422 gpertea 13 else { //use custom read name
423     fprintf(f_out, "@%s%08d_x%d\n%s\n+\n%s\n", prefix.chars(), outcounter,
424     qd->count, rseq.chars(), qd->qv);
425     }
426     }
427     }//for each element of dhash
428     if (maxdup_count>1) {
429     GMessage("Maximum read multiplicity: x %d (read: %s)\n",maxdup_count, maxdup_seq);
430 gpertea 4 }
431 gpertea 13 } //collapse entries
432     if (verbose) {
433     if (paired_reads) {
434     GMessage(">Input files : %s , %s\n", infname.chars(), infname2.chars());
435     GMessage("Number of input pairs :%9d\n", incounter);
436     GMessage(" Output pairs :%9d\n", outcounter);
437     }
438     else {
439     GMessage(">Input file : %s\n", infname.chars());
440     GMessage("Number of input reads :%9d\n", incounter);
441     GMessage(" Output reads :%9d\n", outcounter);
442     }
443     GMessage("------------------------------------\n");
444     if (num_trimmed5)
445     GMessage(" 5' trimmed :%9d (min. trim: %d)\n", num_trimmed5, min_trimmed5);
446     if (num_trimmed3)
447     GMessage(" 3' trimmed :%9d (min. trim: %d)\n", num_trimmed3, min_trimmed3);
448     GMessage("------------------------------------\n");
449     if (trash_s>0)
450     GMessage(" Trashed by length:%9d\n", trash_s);
451     if (trash_N>0)
452     GMessage(" Trashed by N%%:%9d\n", trash_N);
453     if (trash_Q>0)
454     GMessage("Trashed by low quality:%9d\n", trash_Q);
455 gpertea 103 if (trash_poly>0)
456     GMessage(" Trashed by poly-A/T:%9d\n", trash_poly);
457 gpertea 13 if (trash_A5>0)
458     GMessage(" Trashed by 5' adapter:%9d\n", trash_A5);
459     if (trash_A3>0)
460     GMessage(" Trashed by 3' adapter:%9d\n", trash_A3);
461     }
462     if (trashReport) {
463     FWCLOSE(freport);
464 gpertea 4 }
465 gpertea 13 FWCLOSE(f_out);
466     FWCLOSE(f_out2);
467     } //while each input file
468 gpertea 103 delete gxmem_l;
469     delete gxmem_r;
470 gpertea 12 //getc(stdin);
471 gpertea 4 }
472    
473     class NData {
474     public:
475     int NPos[1024]; //there should be no reads longer than 1K ?
476     int NCount;
477     int end5;
478     int end3;
479     int seqlen;
480     double perc_N; //percentage of Ns in end5..end3 range only!
481     const char* seq;
482     bool valid;
483     NData() {
484     NCount=0;
485     end5=0;
486     end3=0;
487     seq=NULL;
488     perc_N=0;
489     valid=true;
490     }
491     void init(GStr& rseq) {
492     NCount=0;
493     perc_N=0;
494     seqlen=rseq.length();
495     seq=rseq.chars();
496     end5=1;
497     end3=seqlen;
498     for (int i=0;i<seqlen;i++)
499 gpertea 12 if (seq[i]=='N') {// if (!ichrInStr(rseq[i], "ACGT")
500 gpertea 4 NPos[NCount]=i;
501     NCount++;
502     }
503     perc_N=(NCount*100.0)/seqlen;
504     valid=true;
505     }
506     void N_calc() { //only in the region after trimming
507     int n=0;
508     for (int i=end3-1;i<end5;i++) {
509     if (seq[i]=='N') n++;
510     }
511     perc_N=(n*100.0)/(end5-end3+1);
512     }
513     };
514    
515     static NData feat;
516     int perc_lenN=12; // incremental distance from ends, in percentage of
517     // sequence length, where N-trimming is done (default:12 %) (autolimited to 20)
518    
519     void N_analyze(int l5, int l3, int p5, int p3) {
520     /* assumes feat was filled properly */
521     int old_dif, t5,t3,v;
522 gpertea 12 if (l3<l5+2 || p5>p3 ) {
523 gpertea 4 feat.end5=l5+1;
524     feat.end3=l3+1;
525     return;
526     }
527    
528     t5=feat.NPos[p5]-l5;
529     t3=l3-feat.NPos[p3];
530     old_dif=p3-p5;
531     v=(int)((((double)(l3-l5))*perc_lenN)/100);
532     if (v>20) v=20; /* enforce N-search limit for very long reads */
533     if (t5 < v ) {
534     l5=feat.NPos[p5]+1;
535     p5++;
536     }
537     if (t3 < v) {
538     l3=feat.NPos[p3]-1;
539     p3--;
540     }
541     /* restNs=p3-p5; number of Ns in the new CLR */
542     if (p3-p5==old_dif) { /* no change, return */
543     /* don't trim if this may shorten the read too much */
544     //if (l5-l3<min_read_len) return;
545     feat.end5=l5+1;
546     feat.end3=l3+1;
547     return;
548     }
549     else
550     N_analyze(l5,l3, p5,p3);
551     }
552    
553    
554 gpertea 12 bool qtrim(GStr& qvs, int &l5, int &l3) {
555 gpertea 13 if (qvtrim_qmin==0 || qvs.is_empty()) return false;
556 gpertea 12 if (qv_phredtype==0) {
557     //try to guess the Phred type
558     int vmin=256, vmax=0;
559     for (int i=0;i<qvs.length();i++) {
560     if (vmin>qvs[i]) vmin=qvs[i];
561     if (vmax<qvs[i]) vmax=qvs[i];
562     }
563     if (vmin<64) { qv_phredtype=33; qv_cvtadd=31; }
564     if (vmax>95) { qv_phredtype=64; qv_cvtadd=-31; }
565     if (qv_phredtype==0) {
566     GError("Error: couldn't determine Phred type, please use the -p33 or -p64 !\n");
567     }
568 gpertea 13 if (verbose)
569     GMessage("Input reads have Phred-%d quality values.\n", (qv_phredtype==33 ? 33 : 64));
570     } //guessing Phred type
571 gpertea 12 for (l3=qvs.length()-1;l3>2;l3--) {
572 gpertea 13 if (qvs[l3]-qv_phredtype>=qvtrim_qmin && qvs[l3-1]-qv_phredtype>=qvtrim_qmin) break;
573 gpertea 12 }
574     //just in case, check also the 5' the end (?)
575     for (l5=0;l5<qvs.length()-3;l5++) {
576 gpertea 13 if (qvs[l5]-qv_phredtype>=qvtrim_qmin && qvs[l5+1]-qv_phredtype>=qvtrim_qmin) break;
577 gpertea 12 }
578 gpertea 13 if (qvtrim_max>0) {
579     if (qvs.length()-1-l3>qvtrim_max) l3=qvs.length()-1-qvtrim_max;
580     if (l5>qvtrim_max) l5=qvtrim_max;
581     }
582 gpertea 12 return (l5>0 || l3<qvs.length()-1);
583     }
584    
585     bool ntrim(GStr& rseq, int &l5, int &l3) {
586     //count Ns in the sequence, trim N-rich ends
587 gpertea 4 feat.init(rseq);
588     l5=feat.end5-1;
589     l3=feat.end3-1;
590     N_analyze(feat.end5-1, feat.end3-1, 0, feat.NCount-1);
591 gpertea 12 if (l5==feat.end5-1 && l3==feat.end3-1) {
592     if (feat.perc_N>max_perc_N) {
593     feat.valid=false;
594     l3=l5+1;
595     return true;
596     }
597     else {
598     return false; //nothing changed
599     }
600     }
601 gpertea 4 l5=feat.end5-1;
602     l3=feat.end3-1;
603     if (l3-l5+1<min_read_len) {
604     feat.valid=false;
605     return true;
606     }
607     feat.N_calc();
608 gpertea 12
609     if (feat.perc_N>max_perc_N) {
610 gpertea 4 feat.valid=false;
611     l3=l5+1;
612     return true;
613     }
614     return true;
615     }
616    
617     //--------------- dust functions ----------------
618     class DNADuster {
619     public:
620     int dustword;
621     int dustwindow;
622     int dustwindow2;
623     int dustcutoff;
624     int mv, iv, jv;
625     int counts[32*32*32];
626     int iis[32*32*32];
627     DNADuster(int cutoff=16, int winsize=32, int wordsize=3) {
628     dustword=wordsize;
629     dustwindow=winsize;
630     dustwindow2 = (winsize>>1);
631     dustcutoff=cutoff;
632     mv=0;
633     iv=0;
634     jv=0;
635     }
636     void setWindowSize(int value) {
637     dustwindow = value;
638     dustwindow2 = (dustwindow >> 1);
639     }
640     void setWordSize(int value) {
641     dustword=value;
642     }
643     void wo1(int len, const char* s, int ivv) {
644     int i, ii, j, v, t, n, n1, sum;
645     int js, nis;
646     n = 32 * 32 * 32;
647     n1 = n - 1;
648     nis = 0;
649     i = 0;
650     ii = 0;
651     sum = 0;
652     v = 0;
653     for (j=0; j < len; j++, s++) {
654     ii <<= 5;
655     if (*s<=32) {
656     i=0;
657     continue;
658     }
659     ii |= *s - 'A'; //assume uppercase!
660     ii &= n1;
661     i++;
662     if (i >= dustword) {
663     for (js=0; js < nis && iis[js] != ii; js++) ;
664     if (js == nis) {
665     iis[nis] = ii;
666     counts[ii] = 0;
667     nis++;
668     }
669     if ((t = counts[ii]) > 0) {
670     sum += t;
671     v = 10 * sum / j;
672     if (mv < v) {
673     mv = v;
674     iv = ivv;
675     jv = j;
676     }
677     }
678     counts[ii]++;
679     }
680     }
681     }
682    
683     int wo(int len, const char* s, int* beg, int* end) {
684     int i, l1;
685     l1 = len - dustword + 1;
686     if (l1 < 0) {
687     *beg = 0;
688     *end = len - 1;
689     return 0;
690     }
691     mv = 0;
692     iv = 0;
693     jv = 0;
694     for (i=0; i < l1; i++) {
695     wo1(len-i, s+i, i);
696     }
697     *beg = iv;
698     *end = iv + jv;
699     return mv;
700     }
701    
702     void dust(const char* seq, char* seqmsk, int seqlen, int cutoff=0) { //, maskFunc maskfn) {
703     int i, j, l, a, b, v;
704     if (cutoff==0) cutoff=dustcutoff;
705     a=0;b=0;
706     //GMessage("Dust cutoff=%d\n", cutoff);
707     for (i=0; i < seqlen; i += dustwindow2) {
708     l = (seqlen > i+dustwindow) ? dustwindow : seqlen-i;
709     v = wo(l, seq+i, &a, &b);
710     if (v > cutoff) {
711     //for (j = a; j <= b && j < dustwindow2; j++) {
712     for (j = a; j <= b; j++) {
713     seqmsk[i+j]='N';//could be made lowercase instead
714     }
715     }
716     }
717     //return first;
718     }
719     };
720    
721     static DNADuster duster;
722    
723     int dust(GStr& rseq) {
724     char* seq=Gstrdup(rseq.chars());
725     duster.dust(rseq.chars(), seq, rseq.length(), dust_cutoff);
726     //check the number of Ns:
727     int ncount=0;
728     for (int i=0;i<rseq.length();i++) {
729     if (seq[i]=='N') ncount++;
730     }
731     GFREE(seq);
732     return ncount;
733     }
734    
735 gpertea 12 int get3mer_value(const char* s) {
736     return (s[0]<<16)+(s[1]<<8)+s[2];
737     }
738    
739     int w3_match(int qv, const char* str, int slen, int start_index=0) {
740     if (start_index>=slen || start_index<0) return -1;
741     for (int i=start_index;i<slen-3;i++) {
742     int rv=get3mer_value(str+i);
743     if (rv==qv) return i;
744     }
745     return -1;
746     }
747    
748     int w3_rmatch(int qv, const char* str, int slen, int end_index=-1) {
749     if (end_index>=slen) return -1;
750     if (end_index<0) end_index=slen-1;
751     for (int i=end_index-2;i>=0;i--) {
752     int rv=get3mer_value(str+i);
753     if (rv==qv) return i;
754     }
755     return -1;
756     }
757    
758 gpertea 94 struct SLocScore {
759     int pos;
760     int score;
761     SLocScore(int p=0,int s=0) {
762     pos=p;
763     score=s;
764     }
765     void set(int p, int s) {
766     pos=p;
767     score=s;
768     }
769     void add(int p, int add) {
770     pos=p;
771     score+=add;
772     }
773     };
774 gpertea 12
775 gpertea 94 bool trim_poly3(GStr &seq, int &l5, int &l3, const char* poly_seed) {
776 gpertea 103 if (!doPolyTrim) return false;
777 gpertea 94 int rlen=seq.length();
778     l5=0;
779     l3=rlen-1;
780     int32 seedVal=*(int32*)poly_seed;
781     char polyChar=poly_seed[0];
782     //assumes N trimming was already done
783     //so a poly match should be very close to the end of the read
784     // -- find the initial match (seed)
785 gpertea 108 int lmin=GMAX((rlen-16), 0);
786 gpertea 94 int li;
787     for (li=rlen-4;li>lmin;li--) {
788     if (seedVal==*(int*)&(seq[li])) {
789     break;
790     }
791 gpertea 12 }
792 gpertea 94 if (li<=lmin) return false;
793     //seed found, try to extend it both ways
794     //extend right
795     int ri=li+3;
796     SLocScore loc(ri, poly_m_score<<2);
797     SLocScore maxloc(loc);
798     //extend right
799     while (ri<rlen-2) {
800     ri++;
801     if (seq[ri]==polyChar) {
802     loc.add(ri,poly_m_score);
803     }
804     else if (seq[ri]=='N') {
805     loc.add(ri,0);
806     }
807     else { //mismatch
808     loc.add(ri,poly_mis_score);
809     if (maxloc.score-loc.score>poly_dropoff_score) break;
810     }
811     if (maxloc.score<=loc.score) {
812     maxloc=loc;
813     }
814 gpertea 12 }
815 gpertea 94 ri=maxloc.pos;
816 gpertea 103 if (ri<rlen-6) return false; //no trimming wanted, too far from 3' end
817 gpertea 94 //ri = right boundary for the poly match
818     //extend left
819     loc.set(li, maxloc.score);
820     maxloc.pos=li;
821     while (li>0) {
822     li--;
823     if (seq[li]==polyChar) {
824     loc.add(li,poly_m_score);
825     }
826     else if (seq[li]=='N') {
827     loc.add(li,0);
828     }
829     else { //mismatch
830     loc.add(li,poly_mis_score);
831     if (maxloc.score-loc.score>poly_dropoff_score) break;
832     }
833     if (maxloc.score<=loc.score) {
834     maxloc=loc;
835     }
836     }
837 gpertea 105 li=maxloc.pos;
838 gpertea 108 if ((maxloc.score==poly_minScore && ri==rlen-1) ||
839     (maxloc.score>poly_minScore && ri>=rlen-3) ||
840     (maxloc.score>(poly_minScore*3) && ri>=rlen-8)) {
841 gpertea 105 //trimming this li-ri match at 3' end
842     l3=li-1;
843     if (l3<0) l3=0;
844 gpertea 94 return true;
845     }
846     return false;
847 gpertea 12 }
848    
849 gpertea 94 bool trim_poly5(GStr &seq, int &l5, int &l3, const char* poly_seed) {
850 gpertea 103 if (!doPolyTrim) return false;
851 gpertea 88 int rlen=seq.length();
852     l5=0;
853     l3=rlen-1;
854 gpertea 94 int32 seedVal=*(int32*)poly_seed;
855     char polyChar=poly_seed[0];
856 gpertea 88 //assumes N trimming was already done
857     //so a poly match should be very close to the end of the read
858     // -- find the initial match (seed)
859 gpertea 108 int lmax=GMIN(12, rlen-4);//how far from 5' end to look for 4-mer seeds
860 gpertea 94 int li;
861     for (li=0;li<=lmax;li++) {
862     if (seedVal==*(int*)&(seq[li])) {
863 gpertea 88 break;
864     }
865     }
866 gpertea 94 if (li>lmax) return false;
867     //seed found, try to extend it both ways
868     //extend left
869     int ri=li+3; //save rightmost base of the seed
870     SLocScore loc(li, poly_m_score<<2);
871     SLocScore maxloc(loc);
872     while (li>0) {
873     li--;
874     if (seq[li]==polyChar) {
875     loc.add(li,poly_m_score);
876     }
877     else if (seq[li]=='N') {
878     loc.add(li,0);
879     }
880     else { //mismatch
881     loc.add(li,poly_mis_score);
882     if (maxloc.score-loc.score>poly_dropoff_score) break;
883     }
884     if (maxloc.score<=loc.score) {
885     maxloc=loc;
886     }
887 gpertea 88 }
888 gpertea 94 li=maxloc.pos;
889 gpertea 103 if (li>5) return false; //no trimming wanted, too far from 5' end
890 gpertea 94 //li = right boundary for the poly match
891    
892     //extend right
893     loc.set(ri, maxloc.score);
894     maxloc.pos=ri;
895     while (ri<rlen-2) {
896     ri++;
897     if (seq[ri]==polyChar) {
898     loc.add(ri,poly_m_score);
899     }
900     else if (seq[ri]=='N') {
901     loc.add(ri,0);
902     }
903     else { //mismatch
904     loc.add(ri,poly_mis_score);
905     if (maxloc.score-loc.score>poly_dropoff_score) break;
906     }
907     if (maxloc.score<=loc.score) {
908     maxloc=loc;
909     }
910     }
911 gpertea 105 ri=maxloc.pos;
912 gpertea 103 if ((maxloc.score==poly_minScore && li==0) ||
913     (maxloc.score>poly_minScore && li<2)
914 gpertea 108 || (maxloc.score>(poly_minScore*3) && li<8)) {
915 gpertea 105 //adjust l5 to reflect this trimming of 5' end
916     l5=ri+1;
917     if (l5>rlen-1) l5=rlen-1;
918 gpertea 94 return true;
919     }
920     return false;
921 gpertea 88 }
922    
923 gpertea 4 bool trim_adapter3(GStr& seq, int&l5, int &l3) {
924 gpertea 103 if (adapters3.Count()==0) return false;
925 gpertea 4 int rlen=seq.length();
926     l5=0;
927     l3=rlen-1;
928 gpertea 103 bool trimmed=false;
929     GStr wseq(seq.chars());
930     int wlen=rlen;
931     for (int ai=0;ai<adapters3.Count();ai++) {
932     if (adapters3[ai].is_empty()) continue;
933     int alen=adapters3[ai].length();
934     GStr& aseq=adapters3[ai];
935     GXAlnInfo* r_bestaln=match_RightEnd(aseq.chars(), alen, wseq.chars(), wlen, gxmem_r, 74);
936     if (r_bestaln) {
937     trimmed=true;
938     //keep unmatched region on the left, if any
939     l3-=(wlen-r_bestaln->sl+1);
940     delete r_bestaln;
941     if (l3<0) l3=0;
942     if (l3-l5+1<min_read_len) return true;
943     wseq=seq.substr(l5,l3-l5+1);
944     wlen=wseq.length();
945     }
946     }//for each 5' adapter
947     return trimmed;
948 gpertea 4 }
949    
950     bool trim_adapter5(GStr& seq, int&l5, int &l3) {
951 gpertea 103 if (adapters5.Count()==0) return false;
952 gpertea 4 int rlen=seq.length();
953     l5=0;
954     l3=rlen-1;
955 gpertea 102 bool trimmed=false;
956 gpertea 103 GStr wseq(seq.chars());
957     int wlen=rlen;
958 gpertea 102 for (int ai=0;ai<adapters5.Count();ai++) {
959     if (adapters5[ai].is_empty()) continue;
960 gpertea 103 int alen=adapters5[ai].length();
961     GStr& aseq=adapters5[ai];
962     GXAlnInfo* l_bestaln=match_LeftEnd(aseq.chars(), alen, wseq.chars(), wlen, gxmem_l, 84);
963     if (l_bestaln) {
964     trimmed=true;
965     l5+=l_bestaln->sr;
966     delete l_bestaln;
967     if (l5>=rlen) l5=rlen-1;
968     if (l3-l5+1<min_read_len) return true;
969     wseq=seq.substr(l5,l3-l5+1);
970     wlen=wseq.length();
971     }
972 gpertea 102 }//for each 5' adapter
973     return trimmed;
974 gpertea 98 }
975 gpertea 4
976 gpertea 12 //convert qvs to/from phred64 from/to phread33
977     void convertPhred(GStr& q) {
978     for (int i=0;i<q.length();i++) q[i]+=qv_cvtadd;
979 gpertea 4 }
980    
981 gpertea 12 void convertPhred(char* q, int len) {
982     for (int i=0;i<len;i++) q[i]+=qv_cvtadd;
983 gpertea 4 }
984    
985 gpertea 13 bool getFastxRec(GLineReader& fq, GStr& rseq, GStr& rqv,
986     GStr& rname, GStr& rinfo, GStr& infname) {
987     rseq="";
988     rqv="";
989     rname="";
990     rinfo="";
991     if (fq.eof()) return false;
992     char* l=fq.getLine();
993     while (l!=NULL && (l[0]==0 || isspace(l[0]))) l=fq.getLine(); //ignore empty lines
994     if (l==NULL) return false;
995     /* if (rawFormat) {
996     //TODO: implement raw qseq parsing here
997     //if (raw type=N) then continue; //skip invalid/bad records
998     } //raw qseq format
999     else { // FASTQ or FASTA */
1000     isfasta=(l[0]=='>');
1001     if (!isfasta && l[0]!='@') GError("Error: fasta/fastq record marker not found(%s)\n%s\n",
1002     infname.chars(), l);
1003     GStr s(l);
1004     rname=&(l[1]);
1005     for (int i=0;i<rname.length();i++)
1006     if (rname[i]<=' ') {
1007     if (i<rname.length()-2) rinfo=rname.substr(i+1);
1008     rname.cut(i);
1009     break;
1010     }
1011     //now get the sequence
1012     if ((l=fq.getLine())==NULL)
1013     GError("Error: unexpected EOF after header for read %s (%s)\n",
1014     rname.chars(), infname.chars());
1015     rseq=l; //this must be the DNA line
1016     while ((l=fq.getLine())!=NULL) {
1017     //seq can span multiple lines
1018     if (l[0]=='>' || l[0]=='+') {
1019     fq.pushBack();
1020     break; //
1021     }
1022     rseq+=l;
1023     } //check for multi-line seq
1024     if (!isfasta) { //reading fastq quality values, which can also be multi-line
1025     if ((l=fq.getLine())==NULL)
1026     GError("Error: unexpected EOF after sequence for %s\n", rname.chars());
1027     if (l[0]!='+') GError("Error: fastq qv header marker not detected!\n");
1028     if ((l=fq.getLine())==NULL)
1029     GError("Error: unexpected EOF after qv header for %s\n", rname.chars());
1030     rqv=l;
1031     //if (rqv.length()!=rseq.length())
1032     // GError("Error: qv len != seq len for %s\n", rname.chars());
1033     while (rqv.length()<rseq.length() && ((l=fq.getLine())!=NULL)) {
1034     rqv+=l; //append to qv string
1035     }
1036     }// fastq
1037     // } //<-- FASTA or FASTQ
1038 gpertea 94 rseq.upper();
1039 gpertea 13 return true;
1040     }
1041    
1042 gpertea 105 #ifdef GDEBUG
1043     void showTrim(GStr& s, int l5, int l3) {
1044     if (l5>0) {
1045     color_bg(c_red);
1046     }
1047     for (int i=0;i<s.length()-1;i++) {
1048     if (i && i==l5) color_resetbg();
1049     fprintf(stderr, "%c", s[i]);
1050     if (i==l3) color_bg(c_red);
1051     }
1052     fprintf(stderr, "%c", s[s.length()-1]);
1053     color_reset();
1054     fprintf(stderr, "\n");
1055     }
1056     #endif
1057    
1058 gpertea 13 char process_read(GStr& rname, GStr& rseq, GStr& rqv, int &l5, int &l3) {
1059     //returns 0 if the read was untouched, 1 if it was just trimmed
1060     // and a trash code if it was trashed
1061     l5=0;
1062     l3=rseq.length()-1;
1063 gpertea 103 #ifdef GDEBUG
1064 gpertea 108 //rseq.reverse();
1065 gpertea 103 GMessage(">%s\n", rname.chars());
1066 gpertea 105 GMessage("%s\n",rseq.chars());
1067 gpertea 103 #endif
1068 gpertea 13 if (l3-l5+1<min_read_len) {
1069     return 's';
1070     }
1071     GStr wseq(rseq.chars());
1072     GStr wqv(rqv.chars());
1073     int w5=l5;
1074     int w3=l3;
1075     //first do the q-based trimming
1076     if (qvtrim_qmin!=0 && !wqv.is_empty() && qtrim(wqv, w5, w3)) { // qv-threshold trimming
1077     if (w3-w5+1<min_read_len) {
1078     return 'Q'; //invalid read
1079     }
1080     //-- keep only the w5..w3 range
1081     l5=w5;
1082     l3=w3;
1083     wseq=wseq.substr(w5, w3-w5+1);
1084     if (!wqv.is_empty())
1085     wqv=wqv.substr(w5, w3-w5+1);
1086     } //qv trimming
1087     // N-trimming on the remaining read seq
1088     if (ntrim(wseq, w5, w3)) {
1089     //GMessage("before: %s\n",wseq.chars());
1090     //GMessage("after : %s (%d)\n",wseq.substr(w5,w3-w5+1).chars(),w3-w5+1);
1091     l5+=w5;
1092     l3-=(wseq.length()-1-w3);
1093     if (w3-w5+1<min_read_len) {
1094     return 'N'; //to be trashed
1095     }
1096     //-- keep only the w5..w3 range
1097     wseq=wseq.substr(w5, w3-w5+1);
1098     if (!wqv.is_empty())
1099     wqv=wqv.substr(w5, w3-w5+1);
1100     w5=0;
1101     w3=wseq.length()-1;
1102     }
1103 gpertea 103 char trim_code;
1104     do {
1105     trim_code=0;
1106     if (trim_poly5(wseq, w5, w3, polyA_seed)) {
1107     trim_code='A';
1108     }
1109     else if (trim_poly5(wseq, w5, w3, polyT_seed)) {
1110     trim_code='T';
1111     }
1112     else if (trim_adapter5(wseq, w5, w3)) {
1113     trim_code='5';
1114     }
1115     if (trim_code) {
1116 gpertea 105 #ifdef GDEBUG
1117     GMessage("#### TRIM by '%c' code ( w5-w3 = %d-%d ):\n",trim_code, w5,w3);
1118     showTrim(wseq, w5, w3);
1119     #endif
1120 gpertea 13 int trimlen=wseq.length()-(w3-w5+1);
1121 gpertea 103 num_trimmed5++;
1122     if (trimlen<min_trimmed5)
1123     min_trimmed5=trimlen;
1124 gpertea 13 l5+=w5;
1125     l3-=(wseq.length()-1-w3);
1126     if (w3-w5+1<min_read_len) {
1127 gpertea 103 return trim_code;
1128 gpertea 13 }
1129     //-- keep only the w5..w3 range
1130     wseq=wseq.substr(w5, w3-w5+1);
1131     if (!wqv.is_empty())
1132     wqv=wqv.substr(w5, w3-w5+1);
1133 gpertea 103 }// trimmed at 5' end
1134     } while (trim_code);
1135    
1136     do {
1137     trim_code=0;
1138     if (trim_poly3(wseq, w5, w3, polyA_seed)) {
1139     trim_code='A';
1140     }
1141     else if (trim_poly3(wseq, w5, w3, polyT_seed)) {
1142     trim_code='T';
1143     }
1144     else if (trim_adapter3(wseq, w5, w3)) {
1145     trim_code='3';
1146     }
1147     if (trim_code) {
1148 gpertea 105 #ifdef GDEBUG
1149     GMessage("#### TRIM by '%c' code ( w5-w3 = %d-%d ):\n",trim_code, w5,w3);
1150     showTrim(wseq, w5, w3);
1151     #endif
1152 gpertea 13 int trimlen=wseq.length()-(w3-w5+1);
1153 gpertea 103 num_trimmed3++;
1154     if (trimlen<min_trimmed3)
1155     min_trimmed3=trimlen;
1156 gpertea 13 l5+=w5;
1157     l3-=(wseq.length()-1-w3);
1158     if (w3-w5+1<min_read_len) {
1159 gpertea 103 return trim_code;
1160 gpertea 13 }
1161     //-- keep only the w5..w3 range
1162     wseq=wseq.substr(w5, w3-w5+1);
1163     if (!wqv.is_empty())
1164     wqv=wqv.substr(w5, w3-w5+1);
1165 gpertea 103 }//trimming at 3' end
1166     } while (trim_code);
1167    
1168    
1169 gpertea 13 if (doCollapse) {
1170     //keep read for later
1171     FqDupRec* dr=dhash.Find(wseq.chars());
1172     if (dr==NULL) { //new entry
1173     //if (prefix.is_empty())
1174     dhash.Add(wseq.chars(),
1175     new FqDupRec(&wqv, rname.chars()));
1176     //else dhash.Add(wseq.chars(), new FqDupRec(wqv.chars(),wqv.length()));
1177     }
1178     else
1179     dr->add(wqv);
1180     } //collapsing duplicates
1181     else { //not collapsing duplicates
1182     //apply the dust filter now
1183     if (doDust) {
1184     int dustbases=dust(wseq);
1185     if (dustbases>(wseq.length()>>1)) {
1186     return 'D';
1187     }
1188     }
1189     } //not collapsing duplicates
1190     return (l5>0 || l3<rseq.length()-1) ? 1 : 0;
1191     }
1192    
1193     void printHeader(FILE* f_out, char recmarker, GStr& rname, GStr& rinfo) {
1194     //GMessage("printing Header..%c%s\n",recmarker, rname.chars());
1195     if (rinfo.is_empty()) fprintf(f_out, "%c%s\n",recmarker,rname.chars());
1196     else fprintf(f_out, "%c%s %s\n",recmarker, rname.chars(), rinfo.chars());
1197     }
1198    
1199     void writeRead(FILE* f_out, GStr& rname, GStr& rinfo, GStr& rseq, GStr& rqv, int& outcounter) {
1200     outcounter++;
1201     bool asFasta=(rqv.is_empty() || fastaOutput);
1202     if (asFasta) {
1203     if (prefix.is_empty()) {
1204     printHeader(f_out, '>',rname,rinfo);
1205     fprintf(f_out, "%s\n", rseq.chars()); //plain one-line fasta for now
1206     }
1207     else {
1208     fprintf(f_out, ">%s%08d\n%s\n", prefix.chars(), outcounter,
1209     rseq.chars());
1210     }
1211     }
1212     else { //fastq
1213     if (convert_phred) convertPhred(rqv);
1214     if (prefix.is_empty()) {
1215     printHeader(f_out, '@', rname, rinfo);
1216     fprintf(f_out, "%s\n+\n%s\n", rseq.chars(), rqv.chars());
1217     }
1218     else
1219     fprintf(f_out, "@%s_%08d\n%s\n+\n%s\n", prefix.chars(), outcounter,
1220     rseq.chars(),rqv.chars() );
1221     }
1222     }
1223    
1224     void trash_report(char trashcode, GStr& rname, FILE* freport) {
1225     if (freport==NULL || trashcode<=' ') return;
1226     if (trashcode=='3' || trashcode=='5') {
1227 gpertea 103 fprintf(freport, "%s\ta%c\n",rname.chars(),trashcode);
1228 gpertea 13 }
1229     else {
1230     fprintf(freport, "%s\t%c\n",rname.chars(),trashcode);
1231     }
1232     //tcounter++;
1233     }
1234    
1235     GStr getFext(GStr& s, int* xpos=NULL) {
1236     //s must be a filename without a path
1237     GStr r("");
1238     if (xpos!=NULL) *xpos=0;
1239     if (s.is_empty() || s=="-") return r;
1240     int p=s.rindex('.');
1241     int d=s.rindex('/');
1242     if (p<=0 || p>s.length()-2 || p<s.length()-7 || p<d) return r;
1243     r=s.substr(p+1);
1244     if (xpos!=NULL) *xpos=p+1;
1245     r.lower();
1246     return r;
1247     }
1248    
1249     void baseFileName(GStr& fname) {
1250     //remove all known extensions, like .txt,fq,fastq,fasta,fa)(.gz .gzip .bz2 .bzip2) .
1251     int xpos=0;
1252     GStr fext=getFext(fname, &xpos);
1253     if (xpos<=1) return;
1254     bool extdel=false;
1255     GStr f2;
1256     if (fext=="z" || fext=="zip") {
1257     extdel=true;
1258     }
1259     else if (fext.length()>=2) {
1260     f2=fext.substr(0,2);
1261     extdel=(f2=="gz" || f2=="bz");
1262     }
1263     if (extdel) {
1264     fname.cut(xpos-1);
1265     fext=getFext(fname, &xpos);
1266     if (xpos<=1) return;
1267     }
1268     extdel=false;
1269     if (fext=="f" || fext=="fq" || fext=="txt" || fext=="seq" || fext=="sequence") {
1270     extdel=true;
1271     }
1272     else if (fext.length()>=2) {
1273     extdel=(fext.substr(0,2)=="fa");
1274     }
1275     if (extdel) fname.cut(xpos-1);
1276     GStr fncp(fname);
1277     fncp.lower();
1278     fncp.chomp("seq");
1279     fncp.chomp("sequence");
1280     fncp.trimR("_.");
1281     if (fncp.length()<fname.length()) fname.cut(fncp.length());
1282     }
1283    
1284     FILE* prepOutFile(GStr& infname, GStr& pocmd) {
1285     FILE* f_out=NULL;
1286     GStr fname(getFileName(infname.chars()));
1287     //eliminate known extensions
1288     baseFileName(fname);
1289     if (outsuffix.is_empty() || outsuffix=="-") { return stdout; }
1290     else if (pocmd.is_empty()) {
1291     GStr oname(fname);
1292     oname.append('.');
1293     oname.append(outsuffix);
1294     f_out=fopen(oname.chars(),"w");
1295     if (f_out==NULL) GError("Error: cannot create '%s'\n",oname.chars());
1296     }
1297     else {
1298     GStr oname(">");
1299     oname.append(fname);
1300     oname.append('.');
1301     oname.append(outsuffix);
1302     pocmd.append(oname);
1303     f_out=popen(pocmd.chars(), "w");
1304     if (f_out==NULL) GError("Error: cannot popen '%s'\n",pocmd.chars());
1305     }
1306     return f_out;
1307     }
1308    
1309     void guess_unzip(GStr& fname, GStr& picmd) {
1310     GStr fext=getFext(fname);
1311     if (fext=="gz" || fext=="gzip" || fext=="z") {
1312     picmd="gzip -cd ";
1313     }
1314     else if (fext=="bz2" || fext=="bzip2" || fext=="bz" || fext=="bzip") {
1315     picmd="bzip2 -cd ";
1316     }
1317     }
1318    
1319 gpertea 94
1320     int loadAdapters(const char* fname) {
1321     GLineReader lr(fname);
1322     char* l;
1323     while ((l=lr.nextLine())!=NULL) {
1324     if (lr.length()<=3 || l[0]=='#') continue;
1325 gpertea 98 if ( l[0]==' ' || l[0]=='\t' || l[0]==',' ||
1326     l[0]==';'|| l[0]==':' ) {
1327 gpertea 94 int i=1;
1328     while (l[i]!=0 && isspace(l[i])) {
1329     i++;
1330     }
1331     if (l[i]!=0) {
1332 gpertea 102 GStr s(&(l[i]));
1333 gpertea 108 #ifdef GDEBUG
1334     //s.reverse();
1335     #endif
1336 gpertea 102 adapters3.Add(s);
1337 gpertea 94 continue;
1338     }
1339     }
1340     else {
1341 gpertea 98 GStr s(l);
1342     s.startTokenize("\t ;,:");
1343     GStr a5,a3;
1344     if (s.nextToken(a5))
1345     s.nextToken(a3);
1346 gpertea 102 a5.upper();
1347     a3.upper();
1348 gpertea 108 #ifdef GDEBUG
1349     // a5.reverse();
1350     // a3.reverse();
1351     #endif
1352 gpertea 102 adapters5.Add(a5);
1353     adapters3.Add(a3);
1354 gpertea 94 }
1355     }
1356 gpertea 102 return adapters5.Count()+adapters3.Count();
1357 gpertea 94 }
1358    
1359 gpertea 13 void setupFiles(FILE*& f_in, FILE*& f_in2, FILE*& f_out, FILE*& f_out2,
1360     GStr& s, GStr& infname, GStr& infname2) {
1361     // uses outsuffix to generate output file names and open file handles as needed
1362     infname="";
1363     infname2="";
1364     f_in=NULL;
1365     f_in2=NULL;
1366     f_out=NULL;
1367     f_out2=NULL;
1368     //analyze outsuffix intent
1369     GStr pocmd;
1370 gpertea 76 if (outsuffix=="-") {
1371     f_out=stdout;
1372     }
1373     else {
1374     GStr ox=getFext(outsuffix);
1375     if (ox.length()>2) ox=ox.substr(0,2);
1376     if (ox=="gz") pocmd="gzip -9 -c ";
1377     else if (ox=="bz") pocmd="bzip2 -9 -c ";
1378     }
1379 gpertea 13 if (s=="-") {
1380     f_in=stdin;
1381     infname="stdin";
1382     f_out=prepOutFile(infname, pocmd);
1383     return;
1384     } // streaming from stdin
1385     s.startTokenize(",:");
1386     s.nextToken(infname);
1387     bool paired=s.nextToken(infname2);
1388     if (fileExists(infname.chars())==0)
1389     GError("Error: cannot find file %s!\n",infname.chars());
1390     GStr fname(getFileName(infname.chars()));
1391     GStr picmd;
1392     guess_unzip(fname, picmd);
1393     if (picmd.is_empty()) {
1394     f_in=fopen(infname.chars(), "r");
1395     if (f_in==NULL) GError("Error opening file '%s'!\n",infname.chars());
1396     }
1397     else {
1398     picmd.append(infname);
1399     f_in=popen(picmd.chars(), "r");
1400     if (f_in==NULL) GError("Error at popen %s!\n", picmd.chars());
1401     }
1402 gpertea 76 if (f_out==stdout) {
1403     if (paired) GError("Error: output suffix required for paired reads\n");
1404     return;
1405     }
1406 gpertea 13 f_out=prepOutFile(infname, pocmd);
1407     if (!paired) return;
1408     if (doCollapse) GError("Error: sorry, -C option cannot be used with paired reads!\n");
1409     // ---- paired reads:-------------
1410     if (fileExists(infname2.chars())==0)
1411     GError("Error: cannot find file %s!\n",infname2.chars());
1412     picmd="";
1413     GStr fname2(getFileName(infname2.chars()));
1414     guess_unzip(fname2, picmd);
1415     if (picmd.is_empty()) {
1416     f_in2=fopen(infname2.chars(), "r");
1417     if (f_in2==NULL) GError("Error opening file '%s'!\n",infname2.chars());
1418     }
1419     else {
1420     picmd.append(infname2);
1421     f_in2=popen(picmd.chars(), "r");
1422     if (f_in2==NULL) GError("Error at popen %s!\n", picmd.chars());
1423     }
1424     f_out2=prepOutFile(infname2, pocmd);
1425     }