ViewVC Help
View File | Revision Log | Show Annotations | View Changeset | Root Listing
root/gclib/fqtrim/fqtrim.cpp
Revision: 105
Committed: Mon Oct 10 21:44:55 2011 UTC (7 years, 10 months ago) by gpertea
File size: 41600 byte(s)
Log Message:
wip - works?

Line User Rev File contents
1 gpertea 4 #include "GArgs.h"
2     #include "GStr.h"
3     #include "GHash.hh"
4     #include "GList.hh"
5 gpertea 13 #include <ctype.h>
6 gpertea 94 #include "GAlnExtend.h"
7 gpertea 4
8     #define USAGE "Usage:\n\
9 gpertea 76 fqtrim [{-5 <5adapter> -3 <3adapter>|-f <adapters_file>}] [-a <min_matchlen>]\\\n\
10     [-q <minq> [-t <trim_max_len>]] [-p {64|33}] [-o <outsuffix>]\\\n\
11     [-l <minlen>] [-C] [-D] [-Q] [-n <rename_prefix>] [-r <discarded.lst>]\\\n\
12     <input.fq>[,<input_mates.fq>\n\
13 gpertea 4 \n\
14 gpertea 76 Trim low quality bases at the 3' end and can trim adapter sequence(s), filter\n\
15     for low complexity and collapse duplicate reads.\n\
16 gpertea 13 If read pairs should be trimmed and kept together (i.e. without discarding\n\
17     one read in a pair), the two file names should be given delimited by a comma\n\
18     or a colon character\n\
19 gpertea 4 \n\
20     Options:\n\
21     -n rename all the reads using the <prefix> followed by a read counter;\n\
22     if -C option was given, the suffix \"_x<N>\" is appended, with <N> being\n\
23     the read duplication count\n\
24 gpertea 76 -o unless this parameter is '-', write the trimmed/filtered reads to \n\
25     file(s) named <input>.<outsuffix> which will be created in the \n\
26     current (working) directory; (writes to stdout if -o- is given);\n\
27     a suffix ending with .gz, .gzip or .bz2 will enforce compression\n\
28     -f file with adapter sequences to trim, each line having this format:\n\
29     <5'-adapter-sequence> <3'-adapter-sequence>\n\
30 gpertea 4 -5 trim the given adapter or primer sequence at the 5' end of each read\n\
31     (e.g. -5 CGACAGGTTCAGAGTTCTACAGTCCGACGATC)\n\
32     -3 trim the given adapter sequence at the 3' end of each read\n\
33     (e.g. -3 TCGTATGCCGTCTTCTGCTTG)\n\
34 gpertea 103 -A disable polyA/T trimming (enabled by default)\n\
35 gpertea 94 -y minimum length of exact match to adaptor sequence at the proper end (6)\n\
36 gpertea 13 -q trim bases with quality value lower than <minq> (starting at the 3' end)\n\
37 gpertea 76 -t for -q option, maximum trimming at the 3' end is limited to <trim_max_len>\n\
38 gpertea 12 -m maximum percentage of Ns allowed in a read after trimming (default 7)\n\
39     -l minimum \"clean\" length after trimming that a read must have\n\
40     in order to pass the filter (default: 16)\n\
41 gpertea 4 -r write a simple \"trash report\" file listing the discarded reads with a\n\
42     one letter code for the reason why they were trashed\n\
43 gpertea 12 -D apply a low-complexity (dust) filter and discard any read that has over\n\
44     50% of its length detected as low complexity\n\
45 gpertea 4 -C collapse duplicate reads and add a prefix with their count to the read\n\
46 gpertea 12 name (e.g. for microRNAs)\n\
47     -p input is phred64/phred33 (use -p64 or -p33)\n\
48     -Q convert quality values to the other Phred qv type\n\
49     -V verbose processing\n\
50 gpertea 4 "
51 gpertea 13
52     //-z for -o option, the output stream(s) will be first piped into the given\n
53     // <zcmd> command, which must output to stdout (e.g. -z 'bzip2 -9 -c')\n
54    
55    
56 gpertea 4 // example 3' adapter for miRNAs: TCGTATGCCGTCTTCTGCTTG
57 gpertea 12
58 gpertea 76 //For paired reads sequencing:
59 gpertea 12 //3' : ACACTCTTTCCCTACACGACGCTCTTCCGATCT
60     //5' : GATCGGAAGAGCGGTTCAGCAGGAATGCCGAG
61 gpertea 13 //FILE* f_out=NULL; //stdout if not provided
62     //FILE* f_out2=NULL; //for paired reads
63     //FILE* f_in=NULL; //input fastq (stdin if not provided)
64     //FILE* f_in2=NULL; //for paired reads
65 gpertea 4 FILE* freport=NULL;
66 gpertea 13
67 gpertea 4 bool debug=false;
68 gpertea 12 bool verbose=false;
69 gpertea 4 bool doCollapse=false;
70     bool doDust=false;
71 gpertea 103 bool doPolyTrim=true;
72 gpertea 13 bool fastaOutput=false;
73 gpertea 4 bool trashReport=false;
74 gpertea 12 //bool rawFormat=false;
75 gpertea 4 int min_read_len=16;
76 gpertea 12 double max_perc_N=7.0;
77 gpertea 4 int dust_cutoff=16;
78     bool isfasta=false;
79 gpertea 12 bool convert_phred=false;
80 gpertea 13 GStr outsuffix; // -o
81     GStr prefix;
82     GStr zcmd;
83     int num_trimmed5=0;
84     int num_trimmed3=0;
85     int num_trimmedN=0;
86     int num_trimmedQ=0;
87     int min_trimmed5=INT_MAX;
88     int min_trimmed3=INT_MAX;
89 gpertea 12
90 gpertea 13 int qvtrim_qmin=0;
91     int qvtrim_max=0; //for -q, do not trim at 3'-end more than this number of bases
92 gpertea 12 int qv_phredtype=0; // could be 64 or 33 (0 means undetermined yet)
93     int qv_cvtadd=0; //could be -31 or +31
94    
95 gpertea 94 // adaptor matching metrics -- for X-drop ungapped extension
96 gpertea 103 const int match_reward=2;
97     const int mismatch_penalty=3;
98     const int Xdrop=8;
99    
100 gpertea 94 const int poly_m_score=2; //match score
101     const int poly_mis_score=-3; //mismatch
102 gpertea 88 const int poly_dropoff_score=7;
103 gpertea 94 int poly_minScore=12; //i.e. an exact match of 6 bases at the proper ends WILL be trimmed
104 gpertea 4
105 gpertea 94 const char *polyA_seed="AAAA";
106     const char *polyT_seed="TTTT";
107 gpertea 102 GVec<GStr> adapters5;
108     GVec<GStr> adapters3;
109 gpertea 12
110 gpertea 103 CGreedyAlignData* gxmem_l=NULL;
111     CGreedyAlignData* gxmem_r=NULL;
112    
113 gpertea 4 // element in dhash:
114     class FqDupRec {
115     public:
116     int count; //how many of these reads are the same
117     int len; //length of qv
118     char* firstname; //optional, only if we want to keep the original read names
119     char* qv;
120     FqDupRec(GStr* qstr=NULL, const char* rname=NULL) {
121     len=0;
122     qv=NULL;
123     firstname=NULL;
124     count=0;
125     if (qstr!=NULL) {
126     qv=Gstrdup(qstr->chars());
127     len=qstr->length();
128     count++;
129     }
130     if (rname!=NULL) firstname=Gstrdup(rname);
131     }
132     ~FqDupRec() {
133     GFREE(qv);
134     GFREE(firstname);
135     }
136     void add(GStr& d) { //collapse another record into this one
137     if (d.length()!=len)
138     GError("Error at FqDupRec::add(): cannot collapse reads with different length!\n");
139     count++;
140     for (int i=0;i<len;i++)
141     qv[i]=(qv[i]+d[i])/2;
142     }
143     };
144    
145     void openfw(FILE* &f, GArgs& args, char opt) {
146     GStr s=args.getOpt(opt);
147     if (!s.is_empty()) {
148     if (s=='-') f=stdout;
149     else {
150 gpertea 13 f=fopen(s.chars(),"w");
151 gpertea 4 if (f==NULL) GError("Error creating file: %s\n", s.chars());
152     }
153     }
154     }
155    
156     #define FWCLOSE(fh) if (fh!=NULL && fh!=stdout) fclose(fh)
157     #define FRCLOSE(fh) if (fh!=NULL && fh!=stdin) fclose(fh)
158    
159     GHash<FqDupRec> dhash; //hash to keep track of duplicates
160    
161 gpertea 94 int loadAdapters(const char* fname);
162    
163 gpertea 13 void setupFiles(FILE*& f_in, FILE*& f_in2, FILE*& f_out, FILE*& f_out2,
164     GStr& s, GStr& infname, GStr& infname2);
165     // uses outsuffix to generate output file names and open file handles as needed
166    
167     void writeRead(FILE* f_out, GStr& rname, GStr& rinfo, GStr& rseq, GStr& rqv, int& outcounter);
168     void trash_report(char trashcode, GStr& rname, FILE* freport);
169    
170     bool getFastxRec(GLineReader& fq, GStr& rseq, GStr& rqv,
171     GStr& rname, GStr& rinfo, GStr& infname);
172    
173     char process_read(GStr& rname, GStr& rseq, GStr& rqv, int &l5, int &l3);
174     //returns 0 if the read was untouched, 1 if it was trimmed and a trash code if it was trashed
175    
176 gpertea 12 bool ntrim(GStr& rseq, int &l5, int &l3); //returns true if any trimming occured
177     bool qtrim(GStr& qvs, int &l5, int &l3); //return true if any trimming occured
178 gpertea 4 int dust(GStr& seq);
179 gpertea 94 bool trim_poly5(GStr &seq, int &l5, int &l3, const char* poly_seed); //returns true if any trimming occured
180     bool trim_poly3(GStr &seq, int &l5, int &l3, const char* poly_seed);
181 gpertea 4 bool trim_adapter5(GStr& seq, int &l5, int &l3); //returns true if any trimming occured
182 gpertea 94 bool trim_adapter3(GStr& seq, int &l5, int &l3);
183 gpertea 4
184 gpertea 12 void convertPhred(char* q, int len);
185     void convertPhred(GStr& q);
186 gpertea 4
187     int main(int argc, char * const argv[]) {
188 gpertea 103 GArgs args(argc, argv, "YQDCVAl:d:3:5:m:n:r:p:q:f:t:o:z:a:");
189 gpertea 4 int e;
190     if ((e=args.isError())>0) {
191     GMessage("%s\nInvalid argument: %s\n", USAGE, argv[e]);
192     exit(224);
193     }
194 gpertea 9 debug=(args.getOpt('Y')!=NULL);
195 gpertea 13 verbose=(args.getOpt('V')!=NULL);
196 gpertea 12 convert_phred=(args.getOpt('Q')!=NULL);
197 gpertea 4 doCollapse=(args.getOpt('C')!=NULL);
198     doDust=(args.getOpt('D')!=NULL);
199 gpertea 103 if (args.getOpt('A')) doPolyTrim=false;
200 gpertea 12 /*
201 gpertea 4 rawFormat=(args.getOpt('R')!=NULL);
202     if (rawFormat) {
203     GError("Sorry, raw qseq format parsing is not implemented yet!\n");
204     }
205 gpertea 12 */
206 gpertea 4 prefix=args.getOpt('n');
207     GStr s=args.getOpt('l');
208     if (!s.is_empty())
209     min_read_len=s.asInt();
210 gpertea 12 s=args.getOpt('m');
211     if (!s.is_empty())
212     max_perc_N=s.asDouble();
213 gpertea 4 s=args.getOpt('d');
214     if (!s.is_empty()) {
215     dust_cutoff=s.asInt();
216     doDust=true;
217     }
218 gpertea 12 s=args.getOpt('q');
219     if (!s.is_empty()) {
220 gpertea 13 qvtrim_qmin=s.asInt();
221 gpertea 12 }
222 gpertea 13 s=args.getOpt('t');
223     if (!s.is_empty()) {
224     qvtrim_max=s.asInt();
225     }
226 gpertea 12 s=args.getOpt('p');
227     if (!s.is_empty()) {
228     int v=s.asInt();
229     if (v==33) {
230     qv_phredtype=33;
231     qv_cvtadd=31;
232     }
233     else if (v==64) {
234     qv_phredtype=64;
235     qv_cvtadd=-31;
236     }
237     else
238     GMessage("%s\nInvalid value for -p option (can only be 64 or 33)!\n",USAGE);
239     }
240 gpertea 94 s=args.getOpt('f');
241     if (!s.is_empty()) {
242     loadAdapters(s.chars());
243     }
244 gpertea 102 bool fileAdapters=adapters5.Count()+adapters3.Count();
245 gpertea 94 s=args.getOpt('5');
246     if (!s.is_empty()) {
247     if (fileAdapters)
248     GError("Error: options -5 and -f cannot be used together!\n");
249     s.upper();
250 gpertea 102 adapters5.Add(s);
251 gpertea 4 }
252 gpertea 94 s=args.getOpt('3');
253     if (!s.is_empty()) {
254     if (fileAdapters)
255     GError("Error: options -3 and -f cannot be used together!\n");
256 gpertea 102 s.upper();
257     adapters3.Add(s);
258 gpertea 4 }
259 gpertea 94 s=args.getOpt('y');
260 gpertea 13 if (!s.is_empty()) {
261 gpertea 94 int minmatch=s.asInt();
262     poly_minScore=minmatch*poly_m_score;
263 gpertea 13 }
264 gpertea 76
265 gpertea 13 if (args.getOpt('o')!=NULL) outsuffix=args.getOpt('o');
266 gpertea 76 else outsuffix="-";
267 gpertea 4 trashReport= (args.getOpt('r')!=NULL);
268 gpertea 13 int fcount=args.startNonOpt();
269     if (fcount==0) {
270 gpertea 4 GMessage(USAGE);
271     exit(224);
272     }
273 gpertea 13 if (fcount>1 && doCollapse) {
274     GError("%s Sorry, the -C option only works with a single input.\n", USAGE);
275     }
276     //openfw(f_out, args, 'o');
277     //if (f_out==NULL) f_out=stdout;
278 gpertea 4 if (trashReport)
279     openfw(freport, args, 'r');
280     char* infile=NULL;
281 gpertea 103
282     if (adapters5.Count()>0)
283     gxmem_l=new CGreedyAlignData(match_reward, mismatch_penalty, Xdrop-2);
284     if (adapters3.Count()>0)
285     gxmem_r=new CGreedyAlignData(match_reward, mismatch_penalty, Xdrop);
286    
287 gpertea 4 while ((infile=args.nextNonOpt())!=NULL) {
288 gpertea 103 //for each input file
289 gpertea 13 int incounter=0; //counter for input reads
290     int outcounter=0; //counter for output reads
291     int trash_s=0; //too short from the get go
292     int trash_Q=0;
293     int trash_N=0;
294     int trash_D=0;
295 gpertea 103 int trash_poly=0;
296 gpertea 13 int trash_A3=0;
297     int trash_A5=0;
298     s=infile;
299     GStr infname;
300     GStr infname2;
301     FILE* f_in=NULL;
302     FILE* f_in2=NULL;
303     FILE* f_out=NULL;
304     FILE* f_out2=NULL;
305     bool paired_reads=false;
306     setupFiles(f_in, f_in2, f_out, f_out2, s, infname, infname2);
307     GLineReader fq(f_in);
308     GLineReader* fq2=NULL;
309     if (f_in2!=NULL) {
310     fq2=new GLineReader(f_in2);
311     paired_reads=true;
312     }
313     GStr rseq, rqv, seqid, seqinfo;
314     GStr rseq2, rqv2, seqid2, seqinfo2;
315     while (getFastxRec(fq, rseq, rqv, seqid, seqinfo, infname)) {
316     incounter++;
317     int a5=0, a3=0, b5=0, b3=0;
318     char tcode=0, tcode2=0;
319     tcode=process_read(seqid, rseq, rqv, a5, a3);
320 gpertea 103 if (fq2!=NULL) {
321 gpertea 13 getFastxRec(*fq2, rseq2, rqv2, seqid2, seqinfo2, infname2);
322     if (seqid.substr(0,seqid.length()-1)!=seqid2.substr(0,seqid2.length()-1)) {
323     GError("Error: no paired match for read %s vs %s (%s,%s)\n",
324     seqid.chars(), seqid2.chars(), infname.chars(), infname2.chars());
325     }
326     tcode2=process_read(seqid2, rseq2, rqv2, b5, b3);
327     //decide what to do with this pair and print rseq2 if the pair makes it
328     if (tcode>1 && tcode2<=1) {
329     //"untrash" rseq
330     if (a3-a5+1<min_read_len) {
331     a5=1;
332     if (a3<min_read_len) { a3= GMIN(rseq.length()-1, min_read_len+1); }
333 gpertea 4 }
334 gpertea 13 tcode=1;
335     }
336     else if (tcode<=1 && tcode2>1) {
337     //"untrash" rseq2
338     if (b3-b5+1<min_read_len) {
339     b5=1;
340     if (b3<min_read_len) { b3= GMIN((rseq2.length()-1),(min_read_len+1)); }
341     }
342     tcode2=1;
343     }
344     if (tcode<=1) { //trimmed or left intact -- write it!
345     if (tcode>0) {
346     rseq2=rseq2.substr(b5,b3-b5+1);
347     if (!rqv2.is_empty()) rqv2=rqv2.substr(b5,b3-b5+1);
348 gpertea 4 }
349 gpertea 13 int nocounter=0;
350     writeRead(f_out2, seqid2, seqinfo2, rseq2, rqv2, nocounter);
351     }
352 gpertea 103 } //pair read
353 gpertea 13 if (tcode>1) { //trashed
354 gpertea 103 #ifdef GDEBUG
355     GMessage(" !!!!TRASH => 'N'\n");
356     #endif
357 gpertea 13 if (tcode=='s') trash_s++;
358 gpertea 103 else if (tcode=='A' || tcode=='T') trash_poly++;
359 gpertea 13 else if (tcode=='Q') trash_Q++;
360     else if (tcode=='N') trash_N++;
361     else if (tcode=='D') trash_D++;
362     else if (tcode=='3') trash_A3++;
363     else if (tcode=='5') trash_A5++;
364     if (trashReport) trash_report(tcode, seqid, freport);
365     }
366     else if (!doCollapse) { //write it
367     if (tcode>0) {
368     rseq=rseq.substr(a5,a3-a5+1);
369     if (!rqv.is_empty()) rqv=rqv.substr(a5,a3-a5+1);
370     }
371 gpertea 103 #ifdef GDEBUG
372     GMessage(" After trimming:\n");
373 gpertea 105 GMessage("%s\n",rseq.chars());
374 gpertea 103 #endif
375 gpertea 13 writeRead(f_out, seqid, seqinfo, rseq, rqv, outcounter);
376     }
377     } //for each fastq record
378     delete fq2;
379     FRCLOSE(f_in);
380     FRCLOSE(f_in2);
381     if (doCollapse) {
382     outcounter=0;
383     int maxdup_count=1;
384     char* maxdup_seq=NULL;
385     dhash.startIterate();
386     FqDupRec* qd=NULL;
387     char* seq=NULL;
388     while ((qd=dhash.NextData(seq))!=NULL) {
389     GStr rseq(seq);
390     //do the dusting here
391     if (doDust) {
392     int dustbases=dust(rseq);
393     if (dustbases>(rseq.length()>>1)) {
394     if (trashReport && qd->firstname!=NULL) {
395     fprintf(freport, "%s_x%d\tD\n",qd->firstname, qd->count);
396 gpertea 4 }
397 gpertea 13 trash_D+=qd->count;
398     continue;
399     }
400     }
401     outcounter++;
402     if (qd->count>maxdup_count) {
403     maxdup_count=qd->count;
404     maxdup_seq=seq;
405     }
406     if (isfasta) {
407     if (prefix.is_empty()) {
408     fprintf(f_out, ">%s_x%d\n%s\n", qd->firstname, qd->count,
409     rseq.chars());
410 gpertea 4 }
411 gpertea 13 else { //use custom read name
412     fprintf(f_out, ">%s%08d_x%d\n%s\n", prefix.chars(), outcounter,
413     qd->count, rseq.chars());
414 gpertea 4 }
415 gpertea 13 }
416     else { //fastq format
417     if (convert_phred) convertPhred(qd->qv, qd->len);
418     if (prefix.is_empty()) {
419     fprintf(f_out, "@%s_x%d\n%s\n+\n%s\n", qd->firstname, qd->count,
420     rseq.chars(), qd->qv);
421 gpertea 4 }
422 gpertea 13 else { //use custom read name
423     fprintf(f_out, "@%s%08d_x%d\n%s\n+\n%s\n", prefix.chars(), outcounter,
424     qd->count, rseq.chars(), qd->qv);
425     }
426     }
427     }//for each element of dhash
428     if (maxdup_count>1) {
429     GMessage("Maximum read multiplicity: x %d (read: %s)\n",maxdup_count, maxdup_seq);
430 gpertea 4 }
431 gpertea 13 } //collapse entries
432     if (verbose) {
433     if (paired_reads) {
434     GMessage(">Input files : %s , %s\n", infname.chars(), infname2.chars());
435     GMessage("Number of input pairs :%9d\n", incounter);
436     GMessage(" Output pairs :%9d\n", outcounter);
437     }
438     else {
439     GMessage(">Input file : %s\n", infname.chars());
440     GMessage("Number of input reads :%9d\n", incounter);
441     GMessage(" Output reads :%9d\n", outcounter);
442     }
443     GMessage("------------------------------------\n");
444     if (num_trimmed5)
445     GMessage(" 5' trimmed :%9d (min. trim: %d)\n", num_trimmed5, min_trimmed5);
446     if (num_trimmed3)
447     GMessage(" 3' trimmed :%9d (min. trim: %d)\n", num_trimmed3, min_trimmed3);
448     GMessage("------------------------------------\n");
449     if (trash_s>0)
450     GMessage(" Trashed by length:%9d\n", trash_s);
451     if (trash_N>0)
452     GMessage(" Trashed by N%%:%9d\n", trash_N);
453     if (trash_Q>0)
454     GMessage("Trashed by low quality:%9d\n", trash_Q);
455 gpertea 103 if (trash_poly>0)
456     GMessage(" Trashed by poly-A/T:%9d\n", trash_poly);
457 gpertea 13 if (trash_A5>0)
458     GMessage(" Trashed by 5' adapter:%9d\n", trash_A5);
459     if (trash_A3>0)
460     GMessage(" Trashed by 3' adapter:%9d\n", trash_A3);
461     }
462     if (trashReport) {
463     FWCLOSE(freport);
464 gpertea 4 }
465 gpertea 13 FWCLOSE(f_out);
466     FWCLOSE(f_out2);
467     } //while each input file
468 gpertea 103 delete gxmem_l;
469     delete gxmem_r;
470 gpertea 12 //getc(stdin);
471 gpertea 4 }
472    
473     class NData {
474     public:
475     int NPos[1024]; //there should be no reads longer than 1K ?
476     int NCount;
477     int end5;
478     int end3;
479     int seqlen;
480     double perc_N; //percentage of Ns in end5..end3 range only!
481     const char* seq;
482     bool valid;
483     NData() {
484     NCount=0;
485     end5=0;
486     end3=0;
487     seq=NULL;
488     perc_N=0;
489     valid=true;
490     }
491     void init(GStr& rseq) {
492     NCount=0;
493     perc_N=0;
494     seqlen=rseq.length();
495     seq=rseq.chars();
496     end5=1;
497     end3=seqlen;
498     for (int i=0;i<seqlen;i++)
499 gpertea 12 if (seq[i]=='N') {// if (!ichrInStr(rseq[i], "ACGT")
500 gpertea 4 NPos[NCount]=i;
501     NCount++;
502     }
503     perc_N=(NCount*100.0)/seqlen;
504     valid=true;
505     }
506     void N_calc() { //only in the region after trimming
507     int n=0;
508     for (int i=end3-1;i<end5;i++) {
509     if (seq[i]=='N') n++;
510     }
511     perc_N=(n*100.0)/(end5-end3+1);
512     }
513     };
514    
515     static NData feat;
516     int perc_lenN=12; // incremental distance from ends, in percentage of
517     // sequence length, where N-trimming is done (default:12 %) (autolimited to 20)
518    
519     void N_analyze(int l5, int l3, int p5, int p3) {
520     /* assumes feat was filled properly */
521     int old_dif, t5,t3,v;
522 gpertea 12 if (l3<l5+2 || p5>p3 ) {
523 gpertea 4 feat.end5=l5+1;
524     feat.end3=l3+1;
525     return;
526     }
527    
528     t5=feat.NPos[p5]-l5;
529     t3=l3-feat.NPos[p3];
530     old_dif=p3-p5;
531     v=(int)((((double)(l3-l5))*perc_lenN)/100);
532     if (v>20) v=20; /* enforce N-search limit for very long reads */
533     if (t5 < v ) {
534     l5=feat.NPos[p5]+1;
535     p5++;
536     }
537     if (t3 < v) {
538     l3=feat.NPos[p3]-1;
539     p3--;
540     }
541     /* restNs=p3-p5; number of Ns in the new CLR */
542     if (p3-p5==old_dif) { /* no change, return */
543     /* don't trim if this may shorten the read too much */
544     //if (l5-l3<min_read_len) return;
545     feat.end5=l5+1;
546     feat.end3=l3+1;
547     return;
548     }
549     else
550     N_analyze(l5,l3, p5,p3);
551     }
552    
553    
554 gpertea 12 bool qtrim(GStr& qvs, int &l5, int &l3) {
555 gpertea 13 if (qvtrim_qmin==0 || qvs.is_empty()) return false;
556 gpertea 12 if (qv_phredtype==0) {
557     //try to guess the Phred type
558     int vmin=256, vmax=0;
559     for (int i=0;i<qvs.length();i++) {
560     if (vmin>qvs[i]) vmin=qvs[i];
561     if (vmax<qvs[i]) vmax=qvs[i];
562     }
563     if (vmin<64) { qv_phredtype=33; qv_cvtadd=31; }
564     if (vmax>95) { qv_phredtype=64; qv_cvtadd=-31; }
565     if (qv_phredtype==0) {
566     GError("Error: couldn't determine Phred type, please use the -p33 or -p64 !\n");
567     }
568 gpertea 13 if (verbose)
569     GMessage("Input reads have Phred-%d quality values.\n", (qv_phredtype==33 ? 33 : 64));
570     } //guessing Phred type
571 gpertea 12 for (l3=qvs.length()-1;l3>2;l3--) {
572 gpertea 13 if (qvs[l3]-qv_phredtype>=qvtrim_qmin && qvs[l3-1]-qv_phredtype>=qvtrim_qmin) break;
573 gpertea 12 }
574     //just in case, check also the 5' the end (?)
575     for (l5=0;l5<qvs.length()-3;l5++) {
576 gpertea 13 if (qvs[l5]-qv_phredtype>=qvtrim_qmin && qvs[l5+1]-qv_phredtype>=qvtrim_qmin) break;
577 gpertea 12 }
578 gpertea 13 if (qvtrim_max>0) {
579     if (qvs.length()-1-l3>qvtrim_max) l3=qvs.length()-1-qvtrim_max;
580     if (l5>qvtrim_max) l5=qvtrim_max;
581     }
582 gpertea 12 return (l5>0 || l3<qvs.length()-1);
583     }
584    
585     bool ntrim(GStr& rseq, int &l5, int &l3) {
586     //count Ns in the sequence, trim N-rich ends
587 gpertea 4 feat.init(rseq);
588     l5=feat.end5-1;
589     l3=feat.end3-1;
590     N_analyze(feat.end5-1, feat.end3-1, 0, feat.NCount-1);
591 gpertea 12 if (l5==feat.end5-1 && l3==feat.end3-1) {
592     if (feat.perc_N>max_perc_N) {
593     feat.valid=false;
594     l3=l5+1;
595     return true;
596     }
597     else {
598     return false; //nothing changed
599     }
600     }
601 gpertea 4 l5=feat.end5-1;
602     l3=feat.end3-1;
603     if (l3-l5+1<min_read_len) {
604     feat.valid=false;
605     return true;
606     }
607     feat.N_calc();
608 gpertea 12
609     if (feat.perc_N>max_perc_N) {
610 gpertea 4 feat.valid=false;
611     l3=l5+1;
612     return true;
613     }
614     return true;
615     }
616    
617     //--------------- dust functions ----------------
618     class DNADuster {
619     public:
620     int dustword;
621     int dustwindow;
622     int dustwindow2;
623     int dustcutoff;
624     int mv, iv, jv;
625     int counts[32*32*32];
626     int iis[32*32*32];
627     DNADuster(int cutoff=16, int winsize=32, int wordsize=3) {
628     dustword=wordsize;
629     dustwindow=winsize;
630     dustwindow2 = (winsize>>1);
631     dustcutoff=cutoff;
632     mv=0;
633     iv=0;
634     jv=0;
635     }
636     void setWindowSize(int value) {
637     dustwindow = value;
638     dustwindow2 = (dustwindow >> 1);
639     }
640     void setWordSize(int value) {
641     dustword=value;
642     }
643     void wo1(int len, const char* s, int ivv) {
644     int i, ii, j, v, t, n, n1, sum;
645     int js, nis;
646     n = 32 * 32 * 32;
647     n1 = n - 1;
648     nis = 0;
649     i = 0;
650     ii = 0;
651     sum = 0;
652     v = 0;
653     for (j=0; j < len; j++, s++) {
654     ii <<= 5;
655     if (*s<=32) {
656     i=0;
657     continue;
658     }
659     ii |= *s - 'A'; //assume uppercase!
660     ii &= n1;
661     i++;
662     if (i >= dustword) {
663     for (js=0; js < nis && iis[js] != ii; js++) ;
664     if (js == nis) {
665     iis[nis] = ii;
666     counts[ii] = 0;
667     nis++;
668     }
669     if ((t = counts[ii]) > 0) {
670     sum += t;
671     v = 10 * sum / j;
672     if (mv < v) {
673     mv = v;
674     iv = ivv;
675     jv = j;
676     }
677     }
678     counts[ii]++;
679     }
680     }
681     }
682    
683     int wo(int len, const char* s, int* beg, int* end) {
684     int i, l1;
685     l1 = len - dustword + 1;
686     if (l1 < 0) {
687     *beg = 0;
688     *end = len - 1;
689     return 0;
690     }
691     mv = 0;
692     iv = 0;
693     jv = 0;
694     for (i=0; i < l1; i++) {
695     wo1(len-i, s+i, i);
696     }
697     *beg = iv;
698     *end = iv + jv;
699     return mv;
700     }
701    
702     void dust(const char* seq, char* seqmsk, int seqlen, int cutoff=0) { //, maskFunc maskfn) {
703     int i, j, l, a, b, v;
704     if (cutoff==0) cutoff=dustcutoff;
705     a=0;b=0;
706     //GMessage("Dust cutoff=%d\n", cutoff);
707     for (i=0; i < seqlen; i += dustwindow2) {
708     l = (seqlen > i+dustwindow) ? dustwindow : seqlen-i;
709     v = wo(l, seq+i, &a, &b);
710     if (v > cutoff) {
711     //for (j = a; j <= b && j < dustwindow2; j++) {
712     for (j = a; j <= b; j++) {
713     seqmsk[i+j]='N';//could be made lowercase instead
714     }
715     }
716     }
717     //return first;
718     }
719     };
720    
721     static DNADuster duster;
722    
723     int dust(GStr& rseq) {
724     char* seq=Gstrdup(rseq.chars());
725     duster.dust(rseq.chars(), seq, rseq.length(), dust_cutoff);
726     //check the number of Ns:
727     int ncount=0;
728     for (int i=0;i<rseq.length();i++) {
729     if (seq[i]=='N') ncount++;
730     }
731     GFREE(seq);
732     return ncount;
733     }
734    
735 gpertea 12 int get3mer_value(const char* s) {
736     return (s[0]<<16)+(s[1]<<8)+s[2];
737     }
738    
739     int w3_match(int qv, const char* str, int slen, int start_index=0) {
740     if (start_index>=slen || start_index<0) return -1;
741     for (int i=start_index;i<slen-3;i++) {
742     int rv=get3mer_value(str+i);
743     if (rv==qv) return i;
744     }
745     return -1;
746     }
747    
748     int w3_rmatch(int qv, const char* str, int slen, int end_index=-1) {
749     if (end_index>=slen) return -1;
750     if (end_index<0) end_index=slen-1;
751     for (int i=end_index-2;i>=0;i--) {
752     int rv=get3mer_value(str+i);
753     if (rv==qv) return i;
754     }
755     return -1;
756     }
757    
758 gpertea 94 struct SLocScore {
759     int pos;
760     int score;
761     SLocScore(int p=0,int s=0) {
762     pos=p;
763     score=s;
764     }
765     void set(int p, int s) {
766     pos=p;
767     score=s;
768     }
769     void add(int p, int add) {
770     pos=p;
771     score+=add;
772     }
773     };
774 gpertea 12
775 gpertea 94 bool trim_poly3(GStr &seq, int &l5, int &l3, const char* poly_seed) {
776 gpertea 103 if (!doPolyTrim) return false;
777 gpertea 94 int rlen=seq.length();
778     l5=0;
779     l3=rlen-1;
780     int32 seedVal=*(int32*)poly_seed;
781     char polyChar=poly_seed[0];
782     //assumes N trimming was already done
783     //so a poly match should be very close to the end of the read
784     // -- find the initial match (seed)
785     int lmin=GMAX((rlen-12), 0);
786     int li;
787     for (li=rlen-4;li>lmin;li--) {
788     if (seedVal==*(int*)&(seq[li])) {
789     break;
790     }
791 gpertea 12 }
792 gpertea 94 if (li<=lmin) return false;
793     //seed found, try to extend it both ways
794     //extend right
795     int ri=li+3;
796     SLocScore loc(ri, poly_m_score<<2);
797     SLocScore maxloc(loc);
798     //extend right
799     while (ri<rlen-2) {
800     ri++;
801     if (seq[ri]==polyChar) {
802     loc.add(ri,poly_m_score);
803     }
804     else if (seq[ri]=='N') {
805     loc.add(ri,0);
806     }
807     else { //mismatch
808     loc.add(ri,poly_mis_score);
809     if (maxloc.score-loc.score>poly_dropoff_score) break;
810     }
811     if (maxloc.score<=loc.score) {
812     maxloc=loc;
813     }
814 gpertea 12 }
815 gpertea 94 ri=maxloc.pos;
816 gpertea 103 if (ri<rlen-6) return false; //no trimming wanted, too far from 3' end
817 gpertea 94 //ri = right boundary for the poly match
818     //extend left
819     loc.set(li, maxloc.score);
820     maxloc.pos=li;
821     while (li>0) {
822     li--;
823     if (seq[li]==polyChar) {
824     loc.add(li,poly_m_score);
825     }
826     else if (seq[li]=='N') {
827     loc.add(li,0);
828     }
829     else { //mismatch
830     loc.add(li,poly_mis_score);
831     if (maxloc.score-loc.score>poly_dropoff_score) break;
832     }
833     if (maxloc.score<=loc.score) {
834     maxloc=loc;
835     }
836     }
837 gpertea 105 li=maxloc.pos;
838 gpertea 103 if ((maxloc.score>poly_minScore && ri>=rlen-3) ||
839     (maxloc.score==poly_minScore && ri==rlen-1) ||
840     (maxloc.score>(poly_minScore<<1) && ri>=rlen-6)) {
841 gpertea 105 //trimming this li-ri match at 3' end
842     l3=li-1;
843     if (l3<0) l3=0;
844 gpertea 94 return true;
845     }
846     return false;
847 gpertea 12 }
848    
849 gpertea 94 bool trim_poly5(GStr &seq, int &l5, int &l3, const char* poly_seed) {
850 gpertea 103 if (!doPolyTrim) return false;
851 gpertea 88 int rlen=seq.length();
852     l5=0;
853     l3=rlen-1;
854 gpertea 94 int32 seedVal=*(int32*)poly_seed;
855     char polyChar=poly_seed[0];
856 gpertea 88 //assumes N trimming was already done
857     //so a poly match should be very close to the end of the read
858     // -- find the initial match (seed)
859 gpertea 94 int lmax=GMIN(8, rlen-4);//how far from 5' end to look for 4-mer seeds
860     int li;
861     for (li=0;li<=lmax;li++) {
862     if (seedVal==*(int*)&(seq[li])) {
863 gpertea 88 break;
864     }
865     }
866 gpertea 94 if (li>lmax) return false;
867     //seed found, try to extend it both ways
868     //extend left
869     int ri=li+3; //save rightmost base of the seed
870     SLocScore loc(li, poly_m_score<<2);
871     SLocScore maxloc(loc);
872     while (li>0) {
873     li--;
874     if (seq[li]==polyChar) {
875     loc.add(li,poly_m_score);
876     }
877     else if (seq[li]=='N') {
878     loc.add(li,0);
879     }
880     else { //mismatch
881     loc.add(li,poly_mis_score);
882     if (maxloc.score-loc.score>poly_dropoff_score) break;
883     }
884     if (maxloc.score<=loc.score) {
885     maxloc=loc;
886     }
887 gpertea 88 }
888 gpertea 94 li=maxloc.pos;
889 gpertea 103 if (li>5) return false; //no trimming wanted, too far from 5' end
890 gpertea 94 //li = right boundary for the poly match
891    
892     //extend right
893     loc.set(ri, maxloc.score);
894     maxloc.pos=ri;
895     while (ri<rlen-2) {
896     ri++;
897     if (seq[ri]==polyChar) {
898     loc.add(ri,poly_m_score);
899     }
900     else if (seq[ri]=='N') {
901     loc.add(ri,0);
902     }
903     else { //mismatch
904     loc.add(ri,poly_mis_score);
905     if (maxloc.score-loc.score>poly_dropoff_score) break;
906     }
907     if (maxloc.score<=loc.score) {
908     maxloc=loc;
909     }
910     }
911 gpertea 105 ri=maxloc.pos;
912 gpertea 103 if ((maxloc.score==poly_minScore && li==0) ||
913     (maxloc.score>poly_minScore && li<2)
914     || (maxloc.score>(poly_minScore<<1) && li<6)) {
915 gpertea 105 //adjust l5 to reflect this trimming of 5' end
916     l5=ri+1;
917     if (l5>rlen-1) l5=rlen-1;
918 gpertea 94 return true;
919     }
920     return false;
921 gpertea 88 }
922    
923 gpertea 4 bool trim_adapter3(GStr& seq, int&l5, int &l3) {
924 gpertea 103 if (adapters3.Count()==0) return false;
925 gpertea 4 int rlen=seq.length();
926     l5=0;
927     l3=rlen-1;
928 gpertea 103 bool trimmed=false;
929     GStr wseq(seq.chars());
930     int wlen=rlen;
931     for (int ai=0;ai<adapters3.Count();ai++) {
932     if (adapters3[ai].is_empty()) continue;
933     int alen=adapters3[ai].length();
934     GStr& aseq=adapters3[ai];
935     GXAlnInfo* r_bestaln=match_RightEnd(aseq.chars(), alen, wseq.chars(), wlen, gxmem_r, 74);
936     if (r_bestaln) {
937     trimmed=true;
938     //keep unmatched region on the left, if any
939     l3-=(wlen-r_bestaln->sl+1);
940     delete r_bestaln;
941     if (l3<0) l3=0;
942     if (l3-l5+1<min_read_len) return true;
943     wseq=seq.substr(l5,l3-l5+1);
944     wlen=wseq.length();
945     }
946     }//for each 5' adapter
947     return trimmed;
948 gpertea 4 }
949    
950     bool trim_adapter5(GStr& seq, int&l5, int &l3) {
951 gpertea 103 if (adapters5.Count()==0) return false;
952 gpertea 4 int rlen=seq.length();
953     l5=0;
954     l3=rlen-1;
955 gpertea 102 bool trimmed=false;
956 gpertea 103 GStr wseq(seq.chars());
957     int wlen=rlen;
958 gpertea 102 for (int ai=0;ai<adapters5.Count();ai++) {
959     if (adapters5[ai].is_empty()) continue;
960 gpertea 103 int alen=adapters5[ai].length();
961     GStr& aseq=adapters5[ai];
962     GXAlnInfo* l_bestaln=match_LeftEnd(aseq.chars(), alen, wseq.chars(), wlen, gxmem_l, 84);
963     if (l_bestaln) {
964     trimmed=true;
965     l5+=l_bestaln->sr;
966     delete l_bestaln;
967     if (l5>=rlen) l5=rlen-1;
968     if (l3-l5+1<min_read_len) return true;
969     wseq=seq.substr(l5,l3-l5+1);
970     wlen=wseq.length();
971     }
972 gpertea 102 }//for each 5' adapter
973     return trimmed;
974 gpertea 98 }
975 gpertea 4
976 gpertea 12 //convert qvs to/from phred64 from/to phread33
977     void convertPhred(GStr& q) {
978     for (int i=0;i<q.length();i++) q[i]+=qv_cvtadd;
979 gpertea 4 }
980    
981 gpertea 12 void convertPhred(char* q, int len) {
982     for (int i=0;i<len;i++) q[i]+=qv_cvtadd;
983 gpertea 4 }
984    
985 gpertea 13 bool getFastxRec(GLineReader& fq, GStr& rseq, GStr& rqv,
986     GStr& rname, GStr& rinfo, GStr& infname) {
987     rseq="";
988     rqv="";
989     rname="";
990     rinfo="";
991     if (fq.eof()) return false;
992     char* l=fq.getLine();
993     while (l!=NULL && (l[0]==0 || isspace(l[0]))) l=fq.getLine(); //ignore empty lines
994     if (l==NULL) return false;
995     /* if (rawFormat) {
996     //TODO: implement raw qseq parsing here
997     //if (raw type=N) then continue; //skip invalid/bad records
998     } //raw qseq format
999     else { // FASTQ or FASTA */
1000     isfasta=(l[0]=='>');
1001     if (!isfasta && l[0]!='@') GError("Error: fasta/fastq record marker not found(%s)\n%s\n",
1002     infname.chars(), l);
1003     GStr s(l);
1004     rname=&(l[1]);
1005     for (int i=0;i<rname.length();i++)
1006     if (rname[i]<=' ') {
1007     if (i<rname.length()-2) rinfo=rname.substr(i+1);
1008     rname.cut(i);
1009     break;
1010     }
1011     //now get the sequence
1012     if ((l=fq.getLine())==NULL)
1013     GError("Error: unexpected EOF after header for read %s (%s)\n",
1014     rname.chars(), infname.chars());
1015     rseq=l; //this must be the DNA line
1016     while ((l=fq.getLine())!=NULL) {
1017     //seq can span multiple lines
1018     if (l[0]=='>' || l[0]=='+') {
1019     fq.pushBack();
1020     break; //
1021     }
1022     rseq+=l;
1023     } //check for multi-line seq
1024     if (!isfasta) { //reading fastq quality values, which can also be multi-line
1025     if ((l=fq.getLine())==NULL)
1026     GError("Error: unexpected EOF after sequence for %s\n", rname.chars());
1027     if (l[0]!='+') GError("Error: fastq qv header marker not detected!\n");
1028     if ((l=fq.getLine())==NULL)
1029     GError("Error: unexpected EOF after qv header for %s\n", rname.chars());
1030     rqv=l;
1031     //if (rqv.length()!=rseq.length())
1032     // GError("Error: qv len != seq len for %s\n", rname.chars());
1033     while (rqv.length()<rseq.length() && ((l=fq.getLine())!=NULL)) {
1034     rqv+=l; //append to qv string
1035     }
1036     }// fastq
1037     // } //<-- FASTA or FASTQ
1038 gpertea 94 rseq.upper();
1039 gpertea 13 return true;
1040     }
1041    
1042 gpertea 105 #ifdef GDEBUG
1043     void showTrim(GStr& s, int l5, int l3) {
1044     if (l5>0) {
1045     color_bg(c_red);
1046     }
1047     for (int i=0;i<s.length()-1;i++) {
1048     if (i && i==l5) color_resetbg();
1049     fprintf(stderr, "%c", s[i]);
1050     if (i==l3) color_bg(c_red);
1051     }
1052     fprintf(stderr, "%c", s[s.length()-1]);
1053     color_reset();
1054     fprintf(stderr, "\n");
1055     }
1056     #endif
1057    
1058 gpertea 13 char process_read(GStr& rname, GStr& rseq, GStr& rqv, int &l5, int &l3) {
1059     //returns 0 if the read was untouched, 1 if it was just trimmed
1060     // and a trash code if it was trashed
1061     l5=0;
1062     l3=rseq.length()-1;
1063 gpertea 103 #ifdef GDEBUG
1064     GMessage(">%s\n", rname.chars());
1065 gpertea 105 GMessage("%s\n",rseq.chars());
1066 gpertea 103 #endif
1067 gpertea 13 if (l3-l5+1<min_read_len) {
1068     return 's';
1069     }
1070     GStr wseq(rseq.chars());
1071     GStr wqv(rqv.chars());
1072     int w5=l5;
1073     int w3=l3;
1074     //first do the q-based trimming
1075     if (qvtrim_qmin!=0 && !wqv.is_empty() && qtrim(wqv, w5, w3)) { // qv-threshold trimming
1076     if (w3-w5+1<min_read_len) {
1077     return 'Q'; //invalid read
1078     }
1079     //-- keep only the w5..w3 range
1080     l5=w5;
1081     l3=w3;
1082     wseq=wseq.substr(w5, w3-w5+1);
1083     if (!wqv.is_empty())
1084     wqv=wqv.substr(w5, w3-w5+1);
1085     } //qv trimming
1086     // N-trimming on the remaining read seq
1087     if (ntrim(wseq, w5, w3)) {
1088     //GMessage("before: %s\n",wseq.chars());
1089     //GMessage("after : %s (%d)\n",wseq.substr(w5,w3-w5+1).chars(),w3-w5+1);
1090     l5+=w5;
1091     l3-=(wseq.length()-1-w3);
1092     if (w3-w5+1<min_read_len) {
1093     return 'N'; //to be trashed
1094     }
1095     //-- keep only the w5..w3 range
1096     wseq=wseq.substr(w5, w3-w5+1);
1097     if (!wqv.is_empty())
1098     wqv=wqv.substr(w5, w3-w5+1);
1099     w5=0;
1100     w3=wseq.length()-1;
1101     }
1102 gpertea 103 char trim_code;
1103     do {
1104     trim_code=0;
1105     if (trim_poly5(wseq, w5, w3, polyA_seed)) {
1106     trim_code='A';
1107     }
1108     else if (trim_poly5(wseq, w5, w3, polyT_seed)) {
1109     trim_code='T';
1110     }
1111     else if (trim_adapter5(wseq, w5, w3)) {
1112     trim_code='5';
1113     }
1114     if (trim_code) {
1115 gpertea 105 #ifdef GDEBUG
1116     GMessage("#### TRIM by '%c' code ( w5-w3 = %d-%d ):\n",trim_code, w5,w3);
1117     showTrim(wseq, w5, w3);
1118     #endif
1119 gpertea 13 int trimlen=wseq.length()-(w3-w5+1);
1120 gpertea 103 num_trimmed5++;
1121     if (trimlen<min_trimmed5)
1122     min_trimmed5=trimlen;
1123 gpertea 13 l5+=w5;
1124     l3-=(wseq.length()-1-w3);
1125     if (w3-w5+1<min_read_len) {
1126 gpertea 103 return trim_code;
1127 gpertea 13 }
1128     //-- keep only the w5..w3 range
1129     wseq=wseq.substr(w5, w3-w5+1);
1130     if (!wqv.is_empty())
1131     wqv=wqv.substr(w5, w3-w5+1);
1132 gpertea 103 }// trimmed at 5' end
1133     } while (trim_code);
1134    
1135     do {
1136     trim_code=0;
1137     if (trim_poly3(wseq, w5, w3, polyA_seed)) {
1138     trim_code='A';
1139     }
1140     else if (trim_poly3(wseq, w5, w3, polyT_seed)) {
1141     trim_code='T';
1142     }
1143     else if (trim_adapter3(wseq, w5, w3)) {
1144     trim_code='3';
1145     }
1146     if (trim_code) {
1147 gpertea 105 #ifdef GDEBUG
1148     GMessage("#### TRIM by '%c' code ( w5-w3 = %d-%d ):\n",trim_code, w5,w3);
1149     showTrim(wseq, w5, w3);
1150     #endif
1151 gpertea 13 int trimlen=wseq.length()-(w3-w5+1);
1152 gpertea 103 num_trimmed3++;
1153     if (trimlen<min_trimmed3)
1154     min_trimmed3=trimlen;
1155 gpertea 13 l5+=w5;
1156     l3-=(wseq.length()-1-w3);
1157     if (w3-w5+1<min_read_len) {
1158 gpertea 103 return trim_code;
1159 gpertea 13 }
1160     //-- keep only the w5..w3 range
1161     wseq=wseq.substr(w5, w3-w5+1);
1162     if (!wqv.is_empty())
1163     wqv=wqv.substr(w5, w3-w5+1);
1164 gpertea 103 }//trimming at 3' end
1165     } while (trim_code);
1166    
1167    
1168 gpertea 13 if (doCollapse) {
1169     //keep read for later
1170     FqDupRec* dr=dhash.Find(wseq.chars());
1171     if (dr==NULL) { //new entry
1172     //if (prefix.is_empty())
1173     dhash.Add(wseq.chars(),
1174     new FqDupRec(&wqv, rname.chars()));
1175     //else dhash.Add(wseq.chars(), new FqDupRec(wqv.chars(),wqv.length()));
1176     }
1177     else
1178     dr->add(wqv);
1179     } //collapsing duplicates
1180     else { //not collapsing duplicates
1181     //apply the dust filter now
1182     if (doDust) {
1183     int dustbases=dust(wseq);
1184     if (dustbases>(wseq.length()>>1)) {
1185     return 'D';
1186     }
1187     }
1188     } //not collapsing duplicates
1189     return (l5>0 || l3<rseq.length()-1) ? 1 : 0;
1190     }
1191    
1192     void printHeader(FILE* f_out, char recmarker, GStr& rname, GStr& rinfo) {
1193     //GMessage("printing Header..%c%s\n",recmarker, rname.chars());
1194     if (rinfo.is_empty()) fprintf(f_out, "%c%s\n",recmarker,rname.chars());
1195     else fprintf(f_out, "%c%s %s\n",recmarker, rname.chars(), rinfo.chars());
1196     }
1197    
1198     void writeRead(FILE* f_out, GStr& rname, GStr& rinfo, GStr& rseq, GStr& rqv, int& outcounter) {
1199     outcounter++;
1200     bool asFasta=(rqv.is_empty() || fastaOutput);
1201     if (asFasta) {
1202     if (prefix.is_empty()) {
1203     printHeader(f_out, '>',rname,rinfo);
1204     fprintf(f_out, "%s\n", rseq.chars()); //plain one-line fasta for now
1205     }
1206     else {
1207     fprintf(f_out, ">%s%08d\n%s\n", prefix.chars(), outcounter,
1208     rseq.chars());
1209     }
1210     }
1211     else { //fastq
1212     if (convert_phred) convertPhred(rqv);
1213     if (prefix.is_empty()) {
1214     printHeader(f_out, '@', rname, rinfo);
1215     fprintf(f_out, "%s\n+\n%s\n", rseq.chars(), rqv.chars());
1216     }
1217     else
1218     fprintf(f_out, "@%s_%08d\n%s\n+\n%s\n", prefix.chars(), outcounter,
1219     rseq.chars(),rqv.chars() );
1220     }
1221     }
1222    
1223     void trash_report(char trashcode, GStr& rname, FILE* freport) {
1224     if (freport==NULL || trashcode<=' ') return;
1225     if (trashcode=='3' || trashcode=='5') {
1226 gpertea 103 fprintf(freport, "%s\ta%c\n",rname.chars(),trashcode);
1227 gpertea 13 }
1228     else {
1229     fprintf(freport, "%s\t%c\n",rname.chars(),trashcode);
1230     }
1231     //tcounter++;
1232     }
1233    
1234     GStr getFext(GStr& s, int* xpos=NULL) {
1235     //s must be a filename without a path
1236     GStr r("");
1237     if (xpos!=NULL) *xpos=0;
1238     if (s.is_empty() || s=="-") return r;
1239     int p=s.rindex('.');
1240     int d=s.rindex('/');
1241     if (p<=0 || p>s.length()-2 || p<s.length()-7 || p<d) return r;
1242     r=s.substr(p+1);
1243     if (xpos!=NULL) *xpos=p+1;
1244     r.lower();
1245     return r;
1246     }
1247    
1248     void baseFileName(GStr& fname) {
1249     //remove all known extensions, like .txt,fq,fastq,fasta,fa)(.gz .gzip .bz2 .bzip2) .
1250     int xpos=0;
1251     GStr fext=getFext(fname, &xpos);
1252     if (xpos<=1) return;
1253     bool extdel=false;
1254     GStr f2;
1255     if (fext=="z" || fext=="zip") {
1256     extdel=true;
1257     }
1258     else if (fext.length()>=2) {
1259     f2=fext.substr(0,2);
1260     extdel=(f2=="gz" || f2=="bz");
1261     }
1262     if (extdel) {
1263     fname.cut(xpos-1);
1264     fext=getFext(fname, &xpos);
1265     if (xpos<=1) return;
1266     }
1267     extdel=false;
1268     if (fext=="f" || fext=="fq" || fext=="txt" || fext=="seq" || fext=="sequence") {
1269     extdel=true;
1270     }
1271     else if (fext.length()>=2) {
1272     extdel=(fext.substr(0,2)=="fa");
1273     }
1274     if (extdel) fname.cut(xpos-1);
1275     GStr fncp(fname);
1276     fncp.lower();
1277     fncp.chomp("seq");
1278     fncp.chomp("sequence");
1279     fncp.trimR("_.");
1280     if (fncp.length()<fname.length()) fname.cut(fncp.length());
1281     }
1282    
1283     FILE* prepOutFile(GStr& infname, GStr& pocmd) {
1284     FILE* f_out=NULL;
1285     GStr fname(getFileName(infname.chars()));
1286     //eliminate known extensions
1287     baseFileName(fname);
1288     if (outsuffix.is_empty() || outsuffix=="-") { return stdout; }
1289     else if (pocmd.is_empty()) {
1290     GStr oname(fname);
1291     oname.append('.');
1292     oname.append(outsuffix);
1293     f_out=fopen(oname.chars(),"w");
1294     if (f_out==NULL) GError("Error: cannot create '%s'\n",oname.chars());
1295     }
1296     else {
1297     GStr oname(">");
1298     oname.append(fname);
1299     oname.append('.');
1300     oname.append(outsuffix);
1301     pocmd.append(oname);
1302     f_out=popen(pocmd.chars(), "w");
1303     if (f_out==NULL) GError("Error: cannot popen '%s'\n",pocmd.chars());
1304     }
1305     return f_out;
1306     }
1307    
1308     void guess_unzip(GStr& fname, GStr& picmd) {
1309     GStr fext=getFext(fname);
1310     if (fext=="gz" || fext=="gzip" || fext=="z") {
1311     picmd="gzip -cd ";
1312     }
1313     else if (fext=="bz2" || fext=="bzip2" || fext=="bz" || fext=="bzip") {
1314     picmd="bzip2 -cd ";
1315     }
1316     }
1317    
1318 gpertea 94
1319     int loadAdapters(const char* fname) {
1320     GLineReader lr(fname);
1321     char* l;
1322     while ((l=lr.nextLine())!=NULL) {
1323     if (lr.length()<=3 || l[0]=='#') continue;
1324 gpertea 98 if ( l[0]==' ' || l[0]=='\t' || l[0]==',' ||
1325     l[0]==';'|| l[0]==':' ) {
1326 gpertea 94 int i=1;
1327     while (l[i]!=0 && isspace(l[i])) {
1328     i++;
1329     }
1330     if (l[i]!=0) {
1331 gpertea 102 GStr s(&(l[i]));
1332     adapters3.Add(s);
1333 gpertea 94 continue;
1334     }
1335     }
1336     else {
1337 gpertea 98 GStr s(l);
1338     s.startTokenize("\t ;,:");
1339     GStr a5,a3;
1340     if (s.nextToken(a5))
1341     s.nextToken(a3);
1342 gpertea 102 a5.upper();
1343     a3.upper();
1344     adapters5.Add(a5);
1345     adapters3.Add(a3);
1346 gpertea 94 }
1347     }
1348 gpertea 102 return adapters5.Count()+adapters3.Count();
1349 gpertea 94 }
1350    
1351 gpertea 13 void setupFiles(FILE*& f_in, FILE*& f_in2, FILE*& f_out, FILE*& f_out2,
1352     GStr& s, GStr& infname, GStr& infname2) {
1353     // uses outsuffix to generate output file names and open file handles as needed
1354     infname="";
1355     infname2="";
1356     f_in=NULL;
1357     f_in2=NULL;
1358     f_out=NULL;
1359     f_out2=NULL;
1360     //analyze outsuffix intent
1361     GStr pocmd;
1362 gpertea 76 if (outsuffix=="-") {
1363     f_out=stdout;
1364     }
1365     else {
1366     GStr ox=getFext(outsuffix);
1367     if (ox.length()>2) ox=ox.substr(0,2);
1368     if (ox=="gz") pocmd="gzip -9 -c ";
1369     else if (ox=="bz") pocmd="bzip2 -9 -c ";
1370     }
1371 gpertea 13 if (s=="-") {
1372     f_in=stdin;
1373     infname="stdin";
1374     f_out=prepOutFile(infname, pocmd);
1375     return;
1376     } // streaming from stdin
1377     s.startTokenize(",:");
1378     s.nextToken(infname);
1379     bool paired=s.nextToken(infname2);
1380     if (fileExists(infname.chars())==0)
1381     GError("Error: cannot find file %s!\n",infname.chars());
1382     GStr fname(getFileName(infname.chars()));
1383     GStr picmd;
1384     guess_unzip(fname, picmd);
1385     if (picmd.is_empty()) {
1386     f_in=fopen(infname.chars(), "r");
1387     if (f_in==NULL) GError("Error opening file '%s'!\n",infname.chars());
1388     }
1389     else {
1390     picmd.append(infname);
1391     f_in=popen(picmd.chars(), "r");
1392     if (f_in==NULL) GError("Error at popen %s!\n", picmd.chars());
1393     }
1394 gpertea 76 if (f_out==stdout) {
1395     if (paired) GError("Error: output suffix required for paired reads\n");
1396     return;
1397     }
1398 gpertea 13 f_out=prepOutFile(infname, pocmd);
1399     if (!paired) return;
1400     if (doCollapse) GError("Error: sorry, -C option cannot be used with paired reads!\n");
1401     // ---- paired reads:-------------
1402     if (fileExists(infname2.chars())==0)
1403     GError("Error: cannot find file %s!\n",infname2.chars());
1404     picmd="";
1405     GStr fname2(getFileName(infname2.chars()));
1406     guess_unzip(fname2, picmd);
1407     if (picmd.is_empty()) {
1408     f_in2=fopen(infname2.chars(), "r");
1409     if (f_in2==NULL) GError("Error opening file '%s'!\n",infname2.chars());
1410     }
1411     else {
1412     picmd.append(infname2);
1413     f_in2=popen(picmd.chars(), "r");
1414     if (f_in2==NULL) GError("Error at popen %s!\n", picmd.chars());
1415     }
1416     f_out2=prepOutFile(infname2, pocmd);
1417     }