ViewVC Help
View File | Revision Log | Show Annotations | View Changeset | Root Listing
root/gclib/fqtrim/fqtrim.cpp
Revision: 98
Committed: Thu Oct 6 20:49:00 2011 UTC (8 years, 2 months ago) by gpertea
File size: 46629 byte(s)
Log Message:
wip

Line User Rev File contents
1 gpertea 4 #include "GArgs.h"
2     #include "GStr.h"
3     #include "GHash.hh"
4     #include "GList.hh"
5 gpertea 13 #include <ctype.h>
6 gpertea 94 #include "GAlnExtend.h"
7 gpertea 4
8     #define USAGE "Usage:\n\
9 gpertea 76 fqtrim [{-5 <5adapter> -3 <3adapter>|-f <adapters_file>}] [-a <min_matchlen>]\\\n\
10     [-q <minq> [-t <trim_max_len>]] [-p {64|33}] [-o <outsuffix>]\\\n\
11     [-l <minlen>] [-C] [-D] [-Q] [-n <rename_prefix>] [-r <discarded.lst>]\\\n\
12     <input.fq>[,<input_mates.fq>\n\
13 gpertea 4 \n\
14 gpertea 76 Trim low quality bases at the 3' end and can trim adapter sequence(s), filter\n\
15     for low complexity and collapse duplicate reads.\n\
16 gpertea 13 If read pairs should be trimmed and kept together (i.e. without discarding\n\
17     one read in a pair), the two file names should be given delimited by a comma\n\
18     or a colon character\n\
19 gpertea 4 \n\
20     Options:\n\
21     -n rename all the reads using the <prefix> followed by a read counter;\n\
22     if -C option was given, the suffix \"_x<N>\" is appended, with <N> being\n\
23     the read duplication count\n\
24 gpertea 76 -o unless this parameter is '-', write the trimmed/filtered reads to \n\
25     file(s) named <input>.<outsuffix> which will be created in the \n\
26     current (working) directory; (writes to stdout if -o- is given);\n\
27     a suffix ending with .gz, .gzip or .bz2 will enforce compression\n\
28     -f file with adapter sequences to trim, each line having this format:\n\
29     <5'-adapter-sequence> <3'-adapter-sequence>\n\
30 gpertea 4 -5 trim the given adapter or primer sequence at the 5' end of each read\n\
31     (e.g. -5 CGACAGGTTCAGAGTTCTACAGTCCGACGATC)\n\
32     -3 trim the given adapter sequence at the 3' end of each read\n\
33     (e.g. -3 TCGTATGCCGTCTTCTGCTTG)\n\
34 gpertea 94 -A disable polyA trimming (enabled by default)\n\
35     -T enable polyT trimming (disabled by default)\n\
36     -y minimum length of exact match to adaptor sequence at the proper end (6)\n\
37 gpertea 13 -q trim bases with quality value lower than <minq> (starting at the 3' end)\n\
38 gpertea 76 -t for -q option, maximum trimming at the 3' end is limited to <trim_max_len>\n\
39 gpertea 12 -m maximum percentage of Ns allowed in a read after trimming (default 7)\n\
40     -l minimum \"clean\" length after trimming that a read must have\n\
41     in order to pass the filter (default: 16)\n\
42 gpertea 4 -r write a simple \"trash report\" file listing the discarded reads with a\n\
43     one letter code for the reason why they were trashed\n\
44 gpertea 12 -D apply a low-complexity (dust) filter and discard any read that has over\n\
45     50% of its length detected as low complexity\n\
46 gpertea 4 -C collapse duplicate reads and add a prefix with their count to the read\n\
47 gpertea 12 name (e.g. for microRNAs)\n\
48     -p input is phred64/phred33 (use -p64 or -p33)\n\
49     -Q convert quality values to the other Phred qv type\n\
50     -V verbose processing\n\
51 gpertea 4 "
52 gpertea 13
53     //-z for -o option, the output stream(s) will be first piped into the given\n
54     // <zcmd> command, which must output to stdout (e.g. -z 'bzip2 -9 -c')\n
55    
56    
57 gpertea 4 // example 3' adapter for miRNAs: TCGTATGCCGTCTTCTGCTTG
58 gpertea 12
59 gpertea 76 //For paired reads sequencing:
60 gpertea 12 //3' : ACACTCTTTCCCTACACGACGCTCTTCCGATCT
61     //5' : GATCGGAAGAGCGGTTCAGCAGGAATGCCGAG
62 gpertea 13 //FILE* f_out=NULL; //stdout if not provided
63     //FILE* f_out2=NULL; //for paired reads
64     //FILE* f_in=NULL; //input fastq (stdin if not provided)
65     //FILE* f_in2=NULL; //for paired reads
66 gpertea 4 FILE* freport=NULL;
67 gpertea 13
68 gpertea 4 bool debug=false;
69 gpertea 12 bool verbose=false;
70 gpertea 4 bool doCollapse=false;
71     bool doDust=false;
72 gpertea 13 bool fastaOutput=false;
73 gpertea 4 bool trashReport=false;
74 gpertea 12 //bool rawFormat=false;
75 gpertea 4 int min_read_len=16;
76 gpertea 12 double max_perc_N=7.0;
77 gpertea 4 int dust_cutoff=16;
78     bool isfasta=false;
79 gpertea 12 bool convert_phred=false;
80 gpertea 13 GStr outsuffix; // -o
81     GStr prefix;
82     GStr zcmd;
83     int num_trimmed5=0;
84     int num_trimmed3=0;
85     int num_trimmedN=0;
86     int num_trimmedQ=0;
87     int min_trimmed5=INT_MAX;
88     int min_trimmed3=INT_MAX;
89 gpertea 12
90 gpertea 13 int qvtrim_qmin=0;
91     int qvtrim_max=0; //for -q, do not trim at 3'-end more than this number of bases
92 gpertea 12 int qv_phredtype=0; // could be 64 or 33 (0 means undetermined yet)
93     int qv_cvtadd=0; //could be -31 or +31
94    
95 gpertea 94 // adaptor matching metrics -- for X-drop ungapped extension
96     const int poly_m_score=2; //match score
97     const int poly_mis_score=-3; //mismatch
98 gpertea 88 const int poly_dropoff_score=7;
99 gpertea 94 int poly_minScore=12; //i.e. an exact match of 6 bases at the proper ends WILL be trimmed
100 gpertea 4
101 gpertea 94 const char *polyA_seed="AAAA";
102     const char *polyT_seed="TTTT";
103 gpertea 12
104 gpertea 94 struct CAdapters {
105     GStr a5;
106     GStr a3;
107     CAdapters(const char* s5=NULL, const char* s3=NULL):a5(s5),a3(s3) {
108 gpertea 12 }
109 gpertea 94 };
110 gpertea 12
111 gpertea 94 GPVec<CAdapters> adapters;
112 gpertea 12
113 gpertea 4 // element in dhash:
114     class FqDupRec {
115     public:
116     int count; //how many of these reads are the same
117     int len; //length of qv
118     char* firstname; //optional, only if we want to keep the original read names
119     char* qv;
120     FqDupRec(GStr* qstr=NULL, const char* rname=NULL) {
121     len=0;
122     qv=NULL;
123     firstname=NULL;
124     count=0;
125     if (qstr!=NULL) {
126     qv=Gstrdup(qstr->chars());
127     len=qstr->length();
128     count++;
129     }
130     if (rname!=NULL) firstname=Gstrdup(rname);
131     }
132     ~FqDupRec() {
133     GFREE(qv);
134     GFREE(firstname);
135     }
136     void add(GStr& d) { //collapse another record into this one
137     if (d.length()!=len)
138     GError("Error at FqDupRec::add(): cannot collapse reads with different length!\n");
139     count++;
140     for (int i=0;i<len;i++)
141     qv[i]=(qv[i]+d[i])/2;
142     }
143     };
144    
145     void openfw(FILE* &f, GArgs& args, char opt) {
146     GStr s=args.getOpt(opt);
147     if (!s.is_empty()) {
148     if (s=='-') f=stdout;
149     else {
150 gpertea 13 f=fopen(s.chars(),"w");
151 gpertea 4 if (f==NULL) GError("Error creating file: %s\n", s.chars());
152     }
153     }
154     }
155    
156     #define FWCLOSE(fh) if (fh!=NULL && fh!=stdout) fclose(fh)
157     #define FRCLOSE(fh) if (fh!=NULL && fh!=stdin) fclose(fh)
158    
159     GHash<FqDupRec> dhash; //hash to keep track of duplicates
160    
161 gpertea 94 int loadAdapters(const char* fname);
162    
163 gpertea 13 void setupFiles(FILE*& f_in, FILE*& f_in2, FILE*& f_out, FILE*& f_out2,
164     GStr& s, GStr& infname, GStr& infname2);
165     // uses outsuffix to generate output file names and open file handles as needed
166    
167     void writeRead(FILE* f_out, GStr& rname, GStr& rinfo, GStr& rseq, GStr& rqv, int& outcounter);
168     void trash_report(char trashcode, GStr& rname, FILE* freport);
169    
170     bool getFastxRec(GLineReader& fq, GStr& rseq, GStr& rqv,
171     GStr& rname, GStr& rinfo, GStr& infname);
172    
173     char process_read(GStr& rname, GStr& rseq, GStr& rqv, int &l5, int &l3);
174     //returns 0 if the read was untouched, 1 if it was trimmed and a trash code if it was trashed
175    
176 gpertea 12 bool ntrim(GStr& rseq, int &l5, int &l3); //returns true if any trimming occured
177     bool qtrim(GStr& qvs, int &l5, int &l3); //return true if any trimming occured
178 gpertea 4 int dust(GStr& seq);
179 gpertea 94 bool trim_poly5(GStr &seq, int &l5, int &l3, const char* poly_seed); //returns true if any trimming occured
180     bool trim_poly3(GStr &seq, int &l5, int &l3, const char* poly_seed);
181 gpertea 4 bool trim_adapter5(GStr& seq, int &l5, int &l3); //returns true if any trimming occured
182 gpertea 94 bool trim_adapter3(GStr& seq, int &l5, int &l3);
183 gpertea 4
184 gpertea 12 void convertPhred(char* q, int len);
185     void convertPhred(GStr& q);
186 gpertea 4
187     int main(int argc, char * const argv[]) {
188 gpertea 76 GArgs args(argc, argv, "YQDCVl:d:3:5:m:n:r:p:q:f:t:o:z:a:");
189 gpertea 4 int e;
190     if ((e=args.isError())>0) {
191     GMessage("%s\nInvalid argument: %s\n", USAGE, argv[e]);
192     exit(224);
193     }
194 gpertea 9 debug=(args.getOpt('Y')!=NULL);
195 gpertea 13 verbose=(args.getOpt('V')!=NULL);
196 gpertea 12 convert_phred=(args.getOpt('Q')!=NULL);
197 gpertea 4 doCollapse=(args.getOpt('C')!=NULL);
198     doDust=(args.getOpt('D')!=NULL);
199 gpertea 12 /*
200 gpertea 4 rawFormat=(args.getOpt('R')!=NULL);
201     if (rawFormat) {
202     GError("Sorry, raw qseq format parsing is not implemented yet!\n");
203     }
204 gpertea 12 */
205 gpertea 4 prefix=args.getOpt('n');
206     GStr s=args.getOpt('l');
207     if (!s.is_empty())
208     min_read_len=s.asInt();
209 gpertea 12 s=args.getOpt('m');
210     if (!s.is_empty())
211     max_perc_N=s.asDouble();
212 gpertea 4 s=args.getOpt('d');
213     if (!s.is_empty()) {
214     dust_cutoff=s.asInt();
215     doDust=true;
216     }
217 gpertea 12 s=args.getOpt('q');
218     if (!s.is_empty()) {
219 gpertea 13 qvtrim_qmin=s.asInt();
220 gpertea 12 }
221 gpertea 13 s=args.getOpt('t');
222     if (!s.is_empty()) {
223     qvtrim_max=s.asInt();
224     }
225 gpertea 12 s=args.getOpt('p');
226     if (!s.is_empty()) {
227     int v=s.asInt();
228     if (v==33) {
229     qv_phredtype=33;
230     qv_cvtadd=31;
231     }
232     else if (v==64) {
233     qv_phredtype=64;
234     qv_cvtadd=-31;
235     }
236     else
237     GMessage("%s\nInvalid value for -p option (can only be 64 or 33)!\n",USAGE);
238     }
239 gpertea 94 s=args.getOpt('f');
240     if (!s.is_empty()) {
241     loadAdapters(s.chars());
242     }
243     bool fileAdapters=adapters.Count();
244     s=args.getOpt('5');
245     if (!s.is_empty()) {
246     if (fileAdapters)
247     GError("Error: options -5 and -f cannot be used together!\n");
248     s.upper();
249     adapters.Add(new CAdapters(s.chars()));
250 gpertea 4 }
251 gpertea 94 s=args.getOpt('3');
252     if (!s.is_empty()) {
253     if (fileAdapters)
254     GError("Error: options -3 and -f cannot be used together!\n");
255     s.upper();
256     if (adapters.Count()>0)
257     adapters[0]->a3=s.chars();
258     else adapters.Add(NULL, new CAdapters(s.chars()));
259 gpertea 4 }
260 gpertea 94 s=args.getOpt('y');
261 gpertea 13 if (!s.is_empty()) {
262 gpertea 94 int minmatch=s.asInt();
263     poly_minScore=minmatch*poly_m_score;
264 gpertea 13 }
265 gpertea 76
266 gpertea 13 if (args.getOpt('o')!=NULL) outsuffix=args.getOpt('o');
267 gpertea 76 else outsuffix="-";
268 gpertea 4 trashReport= (args.getOpt('r')!=NULL);
269 gpertea 13 int fcount=args.startNonOpt();
270     if (fcount==0) {
271 gpertea 4 GMessage(USAGE);
272     exit(224);
273     }
274 gpertea 13 if (fcount>1 && doCollapse) {
275     GError("%s Sorry, the -C option only works with a single input.\n", USAGE);
276     }
277     //openfw(f_out, args, 'o');
278     //if (f_out==NULL) f_out=stdout;
279 gpertea 4 if (trashReport)
280     openfw(freport, args, 'r');
281     char* infile=NULL;
282     while ((infile=args.nextNonOpt())!=NULL) {
283 gpertea 13 int incounter=0; //counter for input reads
284     int outcounter=0; //counter for output reads
285     int trash_s=0; //too short from the get go
286     int trash_Q=0;
287     int trash_N=0;
288     int trash_D=0;
289     int trash_A3=0;
290     int trash_A5=0;
291     s=infile;
292     GStr infname;
293     GStr infname2;
294     FILE* f_in=NULL;
295     FILE* f_in2=NULL;
296     FILE* f_out=NULL;
297     FILE* f_out2=NULL;
298     bool paired_reads=false;
299     setupFiles(f_in, f_in2, f_out, f_out2, s, infname, infname2);
300     GLineReader fq(f_in);
301     GLineReader* fq2=NULL;
302     if (f_in2!=NULL) {
303     fq2=new GLineReader(f_in2);
304     paired_reads=true;
305     }
306     GStr rseq, rqv, seqid, seqinfo;
307     GStr rseq2, rqv2, seqid2, seqinfo2;
308     while (getFastxRec(fq, rseq, rqv, seqid, seqinfo, infname)) {
309     incounter++;
310     int a5=0, a3=0, b5=0, b3=0;
311     char tcode=0, tcode2=0;
312     tcode=process_read(seqid, rseq, rqv, a5, a3);
313     //if (!doCollapse) {
314     if (fq2!=NULL) {
315     getFastxRec(*fq2, rseq2, rqv2, seqid2, seqinfo2, infname2);
316     if (seqid.substr(0,seqid.length()-1)!=seqid2.substr(0,seqid2.length()-1)) {
317     GError("Error: no paired match for read %s vs %s (%s,%s)\n",
318     seqid.chars(), seqid2.chars(), infname.chars(), infname2.chars());
319     }
320     tcode2=process_read(seqid2, rseq2, rqv2, b5, b3);
321     //decide what to do with this pair and print rseq2 if the pair makes it
322     if (tcode>1 && tcode2<=1) {
323     //"untrash" rseq
324     if (a3-a5+1<min_read_len) {
325     a5=1;
326     if (a3<min_read_len) { a3= GMIN(rseq.length()-1, min_read_len+1); }
327 gpertea 4 }
328 gpertea 13 tcode=1;
329     }
330     else if (tcode<=1 && tcode2>1) {
331     //"untrash" rseq2
332     if (b3-b5+1<min_read_len) {
333     b5=1;
334     if (b3<min_read_len) { b3= GMIN((rseq2.length()-1),(min_read_len+1)); }
335     }
336     tcode2=1;
337     }
338     if (tcode<=1) { //trimmed or left intact -- write it!
339     if (tcode>0) {
340     rseq2=rseq2.substr(b5,b3-b5+1);
341     if (!rqv2.is_empty()) rqv2=rqv2.substr(b5,b3-b5+1);
342 gpertea 4 }
343 gpertea 13 int nocounter=0;
344     writeRead(f_out2, seqid2, seqinfo2, rseq2, rqv2, nocounter);
345     }
346     } //paired read
347     // }
348     if (tcode>1) { //trashed
349     if (tcode=='s') trash_s++;
350     else if (tcode=='Q') trash_Q++;
351     else if (tcode=='N') trash_N++;
352     else if (tcode=='D') trash_D++;
353     else if (tcode=='3') trash_A3++;
354     else if (tcode=='5') trash_A5++;
355     if (trashReport) trash_report(tcode, seqid, freport);
356     }
357     else if (!doCollapse) { //write it
358     if (tcode>0) {
359     rseq=rseq.substr(a5,a3-a5+1);
360     if (!rqv.is_empty()) rqv=rqv.substr(a5,a3-a5+1);
361     }
362     writeRead(f_out, seqid, seqinfo, rseq, rqv, outcounter);
363     }
364     } //for each fastq record
365     delete fq2;
366     FRCLOSE(f_in);
367     FRCLOSE(f_in2);
368     if (doCollapse) {
369     outcounter=0;
370     int maxdup_count=1;
371     char* maxdup_seq=NULL;
372     dhash.startIterate();
373     FqDupRec* qd=NULL;
374     char* seq=NULL;
375     while ((qd=dhash.NextData(seq))!=NULL) {
376     GStr rseq(seq);
377     //do the dusting here
378     if (doDust) {
379     int dustbases=dust(rseq);
380     if (dustbases>(rseq.length()>>1)) {
381     if (trashReport && qd->firstname!=NULL) {
382     fprintf(freport, "%s_x%d\tD\n",qd->firstname, qd->count);
383 gpertea 4 }
384 gpertea 13 trash_D+=qd->count;
385     continue;
386     }
387     }
388     outcounter++;
389     if (qd->count>maxdup_count) {
390     maxdup_count=qd->count;
391     maxdup_seq=seq;
392     }
393     if (isfasta) {
394     if (prefix.is_empty()) {
395     fprintf(f_out, ">%s_x%d\n%s\n", qd->firstname, qd->count,
396     rseq.chars());
397 gpertea 4 }
398 gpertea 13 else { //use custom read name
399     fprintf(f_out, ">%s%08d_x%d\n%s\n", prefix.chars(), outcounter,
400     qd->count, rseq.chars());
401 gpertea 4 }
402 gpertea 13 }
403     else { //fastq format
404     if (convert_phred) convertPhred(qd->qv, qd->len);
405     if (prefix.is_empty()) {
406     fprintf(f_out, "@%s_x%d\n%s\n+\n%s\n", qd->firstname, qd->count,
407     rseq.chars(), qd->qv);
408 gpertea 4 }
409 gpertea 13 else { //use custom read name
410     fprintf(f_out, "@%s%08d_x%d\n%s\n+\n%s\n", prefix.chars(), outcounter,
411     qd->count, rseq.chars(), qd->qv);
412     }
413     }
414     }//for each element of dhash
415     if (maxdup_count>1) {
416     GMessage("Maximum read multiplicity: x %d (read: %s)\n",maxdup_count, maxdup_seq);
417 gpertea 4 }
418 gpertea 13 } //collapse entries
419     if (verbose) {
420     if (paired_reads) {
421     GMessage(">Input files : %s , %s\n", infname.chars(), infname2.chars());
422     GMessage("Number of input pairs :%9d\n", incounter);
423     GMessage(" Output pairs :%9d\n", outcounter);
424     }
425     else {
426     GMessage(">Input file : %s\n", infname.chars());
427     GMessage("Number of input reads :%9d\n", incounter);
428     GMessage(" Output reads :%9d\n", outcounter);
429     }
430     GMessage("------------------------------------\n");
431     if (num_trimmed5)
432     GMessage(" 5' trimmed :%9d (min. trim: %d)\n", num_trimmed5, min_trimmed5);
433     if (num_trimmed3)
434     GMessage(" 3' trimmed :%9d (min. trim: %d)\n", num_trimmed3, min_trimmed3);
435     GMessage("------------------------------------\n");
436     if (trash_s>0)
437     GMessage(" Trashed by length:%9d\n", trash_s);
438     if (trash_N>0)
439     GMessage(" Trashed by N%%:%9d\n", trash_N);
440     if (trash_Q>0)
441     GMessage("Trashed by low quality:%9d\n", trash_Q);
442     if (trash_A5>0)
443     GMessage(" Trashed by 5' adapter:%9d\n", trash_A5);
444     if (trash_A3>0)
445     GMessage(" Trashed by 3' adapter:%9d\n", trash_A3);
446     }
447     if (trashReport) {
448     FWCLOSE(freport);
449 gpertea 4 }
450 gpertea 13 FWCLOSE(f_out);
451     FWCLOSE(f_out2);
452     } //while each input file
453 gpertea 4
454 gpertea 12 //getc(stdin);
455 gpertea 4 }
456    
457     class NData {
458     public:
459     int NPos[1024]; //there should be no reads longer than 1K ?
460     int NCount;
461     int end5;
462     int end3;
463     int seqlen;
464     double perc_N; //percentage of Ns in end5..end3 range only!
465     const char* seq;
466     bool valid;
467     NData() {
468     NCount=0;
469     end5=0;
470     end3=0;
471     seq=NULL;
472     perc_N=0;
473     valid=true;
474     }
475     void init(GStr& rseq) {
476     NCount=0;
477     perc_N=0;
478     seqlen=rseq.length();
479     seq=rseq.chars();
480     end5=1;
481     end3=seqlen;
482     for (int i=0;i<seqlen;i++)
483 gpertea 12 if (seq[i]=='N') {// if (!ichrInStr(rseq[i], "ACGT")
484 gpertea 4 NPos[NCount]=i;
485     NCount++;
486     }
487     perc_N=(NCount*100.0)/seqlen;
488     valid=true;
489     }
490     void N_calc() { //only in the region after trimming
491     int n=0;
492     for (int i=end3-1;i<end5;i++) {
493     if (seq[i]=='N') n++;
494     }
495     perc_N=(n*100.0)/(end5-end3+1);
496     }
497     };
498    
499     static NData feat;
500     int perc_lenN=12; // incremental distance from ends, in percentage of
501     // sequence length, where N-trimming is done (default:12 %) (autolimited to 20)
502    
503     void N_analyze(int l5, int l3, int p5, int p3) {
504     /* assumes feat was filled properly */
505     int old_dif, t5,t3,v;
506 gpertea 12 if (l3<l5+2 || p5>p3 ) {
507 gpertea 4 feat.end5=l5+1;
508     feat.end3=l3+1;
509     return;
510     }
511    
512     t5=feat.NPos[p5]-l5;
513     t3=l3-feat.NPos[p3];
514     old_dif=p3-p5;
515     v=(int)((((double)(l3-l5))*perc_lenN)/100);
516     if (v>20) v=20; /* enforce N-search limit for very long reads */
517     if (t5 < v ) {
518     l5=feat.NPos[p5]+1;
519     p5++;
520     }
521     if (t3 < v) {
522     l3=feat.NPos[p3]-1;
523     p3--;
524     }
525     /* restNs=p3-p5; number of Ns in the new CLR */
526     if (p3-p5==old_dif) { /* no change, return */
527     /* don't trim if this may shorten the read too much */
528     //if (l5-l3<min_read_len) return;
529     feat.end5=l5+1;
530     feat.end3=l3+1;
531     return;
532     }
533     else
534     N_analyze(l5,l3, p5,p3);
535     }
536    
537    
538 gpertea 12 bool qtrim(GStr& qvs, int &l5, int &l3) {
539 gpertea 13 if (qvtrim_qmin==0 || qvs.is_empty()) return false;
540 gpertea 12 if (qv_phredtype==0) {
541     //try to guess the Phred type
542     int vmin=256, vmax=0;
543     for (int i=0;i<qvs.length();i++) {
544     if (vmin>qvs[i]) vmin=qvs[i];
545     if (vmax<qvs[i]) vmax=qvs[i];
546     }
547     if (vmin<64) { qv_phredtype=33; qv_cvtadd=31; }
548     if (vmax>95) { qv_phredtype=64; qv_cvtadd=-31; }
549     if (qv_phredtype==0) {
550     GError("Error: couldn't determine Phred type, please use the -p33 or -p64 !\n");
551     }
552 gpertea 13 if (verbose)
553     GMessage("Input reads have Phred-%d quality values.\n", (qv_phredtype==33 ? 33 : 64));
554     } //guessing Phred type
555 gpertea 12 for (l3=qvs.length()-1;l3>2;l3--) {
556 gpertea 13 if (qvs[l3]-qv_phredtype>=qvtrim_qmin && qvs[l3-1]-qv_phredtype>=qvtrim_qmin) break;
557 gpertea 12 }
558     //just in case, check also the 5' the end (?)
559     for (l5=0;l5<qvs.length()-3;l5++) {
560 gpertea 13 if (qvs[l5]-qv_phredtype>=qvtrim_qmin && qvs[l5+1]-qv_phredtype>=qvtrim_qmin) break;
561 gpertea 12 }
562 gpertea 13 if (qvtrim_max>0) {
563     if (qvs.length()-1-l3>qvtrim_max) l3=qvs.length()-1-qvtrim_max;
564     if (l5>qvtrim_max) l5=qvtrim_max;
565     }
566 gpertea 12 return (l5>0 || l3<qvs.length()-1);
567     }
568    
569     bool ntrim(GStr& rseq, int &l5, int &l3) {
570     //count Ns in the sequence, trim N-rich ends
571 gpertea 4 feat.init(rseq);
572     l5=feat.end5-1;
573     l3=feat.end3-1;
574     N_analyze(feat.end5-1, feat.end3-1, 0, feat.NCount-1);
575 gpertea 12 if (l5==feat.end5-1 && l3==feat.end3-1) {
576     if (feat.perc_N>max_perc_N) {
577     feat.valid=false;
578     l3=l5+1;
579     return true;
580     }
581     else {
582     return false; //nothing changed
583     }
584     }
585 gpertea 4 l5=feat.end5-1;
586     l3=feat.end3-1;
587     if (l3-l5+1<min_read_len) {
588     feat.valid=false;
589     return true;
590     }
591     feat.N_calc();
592 gpertea 12
593     if (feat.perc_N>max_perc_N) {
594 gpertea 4 feat.valid=false;
595     l3=l5+1;
596     return true;
597     }
598     return true;
599     }
600    
601     //--------------- dust functions ----------------
602     class DNADuster {
603     public:
604     int dustword;
605     int dustwindow;
606     int dustwindow2;
607     int dustcutoff;
608     int mv, iv, jv;
609     int counts[32*32*32];
610     int iis[32*32*32];
611     DNADuster(int cutoff=16, int winsize=32, int wordsize=3) {
612     dustword=wordsize;
613     dustwindow=winsize;
614     dustwindow2 = (winsize>>1);
615     dustcutoff=cutoff;
616     mv=0;
617     iv=0;
618     jv=0;
619     }
620     void setWindowSize(int value) {
621     dustwindow = value;
622     dustwindow2 = (dustwindow >> 1);
623     }
624     void setWordSize(int value) {
625     dustword=value;
626     }
627     void wo1(int len, const char* s, int ivv) {
628     int i, ii, j, v, t, n, n1, sum;
629     int js, nis;
630     n = 32 * 32 * 32;
631     n1 = n - 1;
632     nis = 0;
633     i = 0;
634     ii = 0;
635     sum = 0;
636     v = 0;
637     for (j=0; j < len; j++, s++) {
638     ii <<= 5;
639     if (*s<=32) {
640     i=0;
641     continue;
642     }
643     ii |= *s - 'A'; //assume uppercase!
644     ii &= n1;
645     i++;
646     if (i >= dustword) {
647     for (js=0; js < nis && iis[js] != ii; js++) ;
648     if (js == nis) {
649     iis[nis] = ii;
650     counts[ii] = 0;
651     nis++;
652     }
653     if ((t = counts[ii]) > 0) {
654     sum += t;
655     v = 10 * sum / j;
656     if (mv < v) {
657     mv = v;
658     iv = ivv;
659     jv = j;
660     }
661     }
662     counts[ii]++;
663     }
664     }
665     }
666    
667     int wo(int len, const char* s, int* beg, int* end) {
668     int i, l1;
669     l1 = len - dustword + 1;
670     if (l1 < 0) {
671     *beg = 0;
672     *end = len - 1;
673     return 0;
674     }
675     mv = 0;
676     iv = 0;
677     jv = 0;
678     for (i=0; i < l1; i++) {
679     wo1(len-i, s+i, i);
680     }
681     *beg = iv;
682     *end = iv + jv;
683     return mv;
684     }
685    
686     void dust(const char* seq, char* seqmsk, int seqlen, int cutoff=0) { //, maskFunc maskfn) {
687     int i, j, l, a, b, v;
688     if (cutoff==0) cutoff=dustcutoff;
689     a=0;b=0;
690     //GMessage("Dust cutoff=%d\n", cutoff);
691     for (i=0; i < seqlen; i += dustwindow2) {
692     l = (seqlen > i+dustwindow) ? dustwindow : seqlen-i;
693     v = wo(l, seq+i, &a, &b);
694     if (v > cutoff) {
695     //for (j = a; j <= b && j < dustwindow2; j++) {
696     for (j = a; j <= b; j++) {
697     seqmsk[i+j]='N';//could be made lowercase instead
698     }
699     }
700     }
701     //return first;
702     }
703     };
704    
705     static DNADuster duster;
706    
707     int dust(GStr& rseq) {
708     char* seq=Gstrdup(rseq.chars());
709     duster.dust(rseq.chars(), seq, rseq.length(), dust_cutoff);
710     //check the number of Ns:
711     int ncount=0;
712     for (int i=0;i<rseq.length();i++) {
713     if (seq[i]=='N') ncount++;
714     }
715     GFREE(seq);
716     return ncount;
717     }
718    
719 gpertea 12 int get3mer_value(const char* s) {
720     return (s[0]<<16)+(s[1]<<8)+s[2];
721     }
722    
723     int w3_match(int qv, const char* str, int slen, int start_index=0) {
724     if (start_index>=slen || start_index<0) return -1;
725     for (int i=start_index;i<slen-3;i++) {
726     int rv=get3mer_value(str+i);
727     if (rv==qv) return i;
728     }
729     return -1;
730     }
731    
732     int w3_rmatch(int qv, const char* str, int slen, int end_index=-1) {
733     if (end_index>=slen) return -1;
734     if (end_index<0) end_index=slen-1;
735     for (int i=end_index-2;i>=0;i--) {
736     int rv=get3mer_value(str+i);
737     if (rv==qv) return i;
738     }
739     return -1;
740     }
741    
742 gpertea 94 struct SLocScore {
743     int pos;
744     int score;
745     SLocScore(int p=0,int s=0) {
746     pos=p;
747     score=s;
748     }
749     void set(int p, int s) {
750     pos=p;
751     score=s;
752     }
753     void add(int p, int add) {
754     pos=p;
755     score+=add;
756     }
757     };
758 gpertea 12
759 gpertea 94 bool trim_poly3(GStr &seq, int &l5, int &l3, const char* poly_seed) {
760     int rlen=seq.length();
761     l5=0;
762     l3=rlen-1;
763     int32 seedVal=*(int32*)poly_seed;
764     char polyChar=poly_seed[0];
765     //assumes N trimming was already done
766     //so a poly match should be very close to the end of the read
767     // -- find the initial match (seed)
768     int lmin=GMAX((rlen-12), 0);
769     int li;
770     for (li=rlen-4;li>lmin;li--) {
771     if (seedVal==*(int*)&(seq[li])) {
772     break;
773     }
774 gpertea 12 }
775 gpertea 94 if (li<=lmin) return false;
776     //seed found, try to extend it both ways
777     //extend right
778     int ri=li+3;
779     SLocScore loc(ri, poly_m_score<<2);
780     SLocScore maxloc(loc);
781     //extend right
782     while (ri<rlen-2) {
783     ri++;
784     if (seq[ri]==polyChar) {
785     loc.add(ri,poly_m_score);
786     }
787     else if (seq[ri]=='N') {
788     loc.add(ri,0);
789     }
790     else { //mismatch
791     loc.add(ri,poly_mis_score);
792     if (maxloc.score-loc.score>poly_dropoff_score) break;
793     }
794     if (maxloc.score<=loc.score) {
795     maxloc=loc;
796     }
797 gpertea 12 }
798 gpertea 94 ri=maxloc.pos;
799     if (ri<rlen-3) return false; //no trimming wanted, too far from 3' end
800     //ri = right boundary for the poly match
801     //extend left
802     loc.set(li, maxloc.score);
803     maxloc.pos=li;
804     while (li>0) {
805     li--;
806     if (seq[li]==polyChar) {
807     loc.add(li,poly_m_score);
808     }
809     else if (seq[li]=='N') {
810     loc.add(li,0);
811     }
812     else { //mismatch
813     loc.add(li,poly_mis_score);
814     if (maxloc.score-loc.score>poly_dropoff_score) break;
815     }
816     if (maxloc.score<=loc.score) {
817     maxloc=loc;
818     }
819     }
820     if (maxloc.score>poly_minScore && ri>=rlen-3) {
821     l5=li;
822     l3=ri;
823     return true;
824     }
825     return false;
826 gpertea 12 }
827    
828 gpertea 94
829     bool trim_poly5(GStr &seq, int &l5, int &l3, const char* poly_seed) {
830 gpertea 88 int rlen=seq.length();
831     l5=0;
832     l3=rlen-1;
833 gpertea 94 int32 seedVal=*(int32*)poly_seed;
834     char polyChar=poly_seed[0];
835 gpertea 88 //assumes N trimming was already done
836     //so a poly match should be very close to the end of the read
837     // -- find the initial match (seed)
838 gpertea 94 int lmax=GMIN(8, rlen-4);//how far from 5' end to look for 4-mer seeds
839     int li;
840     for (li=0;li<=lmax;li++) {
841     if (seedVal==*(int*)&(seq[li])) {
842 gpertea 88 break;
843     }
844     }
845 gpertea 94 if (li>lmax) return false;
846     //seed found, try to extend it both ways
847     //extend left
848     int ri=li+3; //save rightmost base of the seed
849     SLocScore loc(li, poly_m_score<<2);
850     SLocScore maxloc(loc);
851     while (li>0) {
852     li--;
853     if (seq[li]==polyChar) {
854     loc.add(li,poly_m_score);
855     }
856     else if (seq[li]=='N') {
857     loc.add(li,0);
858     }
859     else { //mismatch
860     loc.add(li,poly_mis_score);
861     if (maxloc.score-loc.score>poly_dropoff_score) break;
862     }
863     if (maxloc.score<=loc.score) {
864     maxloc=loc;
865     }
866 gpertea 88 }
867 gpertea 94 li=maxloc.pos;
868     if (li>3) return false; //no trimming wanted, too far from 5' end
869     //li = right boundary for the poly match
870    
871     //extend right
872     loc.set(ri, maxloc.score);
873     maxloc.pos=ri;
874     while (ri<rlen-2) {
875     ri++;
876     if (seq[ri]==polyChar) {
877     loc.add(ri,poly_m_score);
878     }
879     else if (seq[ri]=='N') {
880     loc.add(ri,0);
881     }
882     else { //mismatch
883     loc.add(ri,poly_mis_score);
884     if (maxloc.score-loc.score>poly_dropoff_score) break;
885     }
886     if (maxloc.score<=loc.score) {
887     maxloc=loc;
888     }
889     }
890    
891     if (maxloc.score>poly_minScore && li<=3) {
892     l5=li;
893     l3=ri;
894     return true;
895     }
896     return false;
897 gpertea 88 }
898    
899 gpertea 4 bool trim_adapter3(GStr& seq, int&l5, int &l3) {
900     int rlen=seq.length();
901     l5=0;
902     l3=rlen-1;
903     //first try a full match, we might get lucky
904     int fi=-1;
905     if ((fi=seq.index(adapter3))>=0) {
906     if (fi<rlen-fi-a3len) {//match is closer to the right end
907     l5=fi+a3len;
908     l3=rlen-1;
909     }
910     else {
911     l5=0;
912 gpertea 12 l3=fi-1;
913 gpertea 4 }
914     return true;
915     }
916 gpertea 12 #ifdef DEBUG
917     if (debug) GMessage(">TRIM3 >> Read: %s\n",seq.chars());
918     #endif
919    
920 gpertea 4 //also, for fast detection of other adapter-only reads that start past
921     // the beginning of the adapter sequence, try to see if the first a3len-4
922 gpertea 12 // bases of the read are a substring of the adapter
923     if (rlen>a3len-3) {
924     GStr rstart=seq.substr(1,a3len-4);
925     if ((fi=adapter3.index(rstart))>=0) {
926     l3=rlen-1;
927     l5=a3len-4;
928     while (fi+l5<a3len && l5<l3 && adapter3[fi+l5]==seq[l5]) l5++;
929     return true;
930     }
931     }
932     CSegChain a3segs; //no chains here, just an ordered collection of segment pairs
933     //check the easy cases - 11 bases exact match at the end
934     int fdlen=11;
935     if (a3len<16) {
936     fdlen=a3len>>1;
937 gpertea 4 }
938 gpertea 12 if (fdlen>4) {
939     //check if we're lucky enough to have the last 11 bases of the read a part of the adapter
940     GStr rstart=seq.substr(-fdlen-3,fdlen);
941     if ((fi=adapter3.index(rstart))>=0) {
942     #ifdef DEBUG
943     if (debug) GMessage(" W11match found: %*s\n", rlen-3, (adapter3.substr(fi,fdlen)).chars());
944     #endif
945     if (extendMatch(seq.chars(), rlen, rlen-fdlen-3,
946     adapter3.chars(), a3len, fi, fdlen, l5,l3, a3segs))
947     return true;
948     }
949     //another easy case: first 11 characters of the adaptor found as a substring of the read
950     GStr bstr=adapter3.substr(0, fdlen);
951     if ((fi=seq.rindex(bstr))>=0) {
952     #ifdef DEBUG
953     if (debug) GMessage(" A11match found: %*s\n", fi+fdlen, bstr.chars());
954     #endif
955     if (extendMatch(seq.chars(), rlen, fi,
956     adapter3.chars(), a3len, 0, fdlen, l5,l3, a3segs))
957     return true;
958     }
959     } //tried to match 11 bases first
960    
961     //no easy cases, so let's do the wmer hashing for the first 12 bases of the adaptor
962     //-- only extend if the match is in the 3' (ending) region of the read
963     int wordSize=3;
964     int hlen=12;
965     if (hlen>a3len-wordSize) hlen=a3len-wordSize;
966     int imin=rlen>>1; //last half of the read, left boundary for the wmer match
967     if (imin<a3len) { imin=GMIN(a3len, rlen-wordSize); }
968     imin=rlen-imin;
969     for (int iw=0;iw<hlen;iw++) {
970     //int32* qv=(int32*)(adapter3.chars()+iw);
971     int qv=get3mer_value(adapter3.chars()+iw);
972     fi=-1;
973     //while ((fi=fast4rmatch(*qv, seq.chars(), rlen, fi))>=0 && fi>=imin) {
974     while ((fi=w3_rmatch(qv, seq.chars(), rlen, fi))>=0 && fi>=imin) {
975     //GMessage(" ... fi=%d after w3_rmatch() (imin=%d)\n", fi, imin);
976    
977     #ifdef DEBUG
978     if (debug) GMessage(" Wmatch found: %*s\n", fi+wordSize, (adapter3.substr(iw,wordSize)).chars());
979     #endif
980     if (extendMatch(seq.chars(), rlen, fi, adapter3.chars(),
981     a3len, iw, wordSize, l5,l3, a3segs)) return true;
982     fi--;
983     if (fi<imin) break;
984     }
985     } //for each wmer in the first hlen bases of the adaptor
986     /*
987     //couldn't find a good trimming extension, hash 12 more bases of the adapter to collect more segment pairs there
988     //but only do this if we already have segment pairs collected in the last 12 bases of the adapter
989     if (a3segs.bstart>3 || a3segs.bend<(uint)(hlen-wordSize)) return false;
990     int hlen2=a3len-wordSize;
991     //if (hlen2>a3len-4) hlen2=a3len-4;
992     if (hlen2>hlen) {
993     #ifdef DEBUG
994     if (debug && a3segs.Count()>0) {
995     GMessage(" >>>>>2nd. hash: %s\n",seq.chars());
996     }
997     #endif
998     for (int iw=hlen;iw<hlen2;iw++) {
999     //int* qv=(int32 *)(adapter3.chars()+iw);
1000     int qv=get3mer_value(adapter3.chars()+iw);
1001     fi=-1;
1002     //while ((fi=fast4rmatch(*qv, seq.chars(), rlen, fi))>=0 && fi>=imin) {
1003     while ((fi=w3_rmatch(qv, seq.chars(), rlen, fi))>=0 && fi>=imin) {
1004     extendMatch(seq.chars(), rlen, fi, adapter3.chars(),
1005     a3len, iw, wordSize, l5,l3, a3segs);
1006     fi--;
1007     if (fi<imin) break;
1008     }
1009     } //for each wmer between hlen2 and hlen bases of the adaptor
1010     }
1011     //lastly, analyze collected a3segs for a possible gapped alignment:
1012     GList<CSegChain> segchains(false,true,false);
1013     #ifdef DEBUG
1014     if (debug && a3segs.Count()>0) {
1015     GMessage(">>>>>>>>> Read: %s\n",seq.chars());
1016 gpertea 4 }
1017 gpertea 12 #endif
1018     for (int i=0;i<a3segs.Count();i++) {
1019     if (a3segs[i]->chain==NULL) {
1020     if (a3segs[i]->b.start>3) continue; //don't start a hopeless chain
1021     CSegChain* newchain=new CSegChain();
1022     newchain->setFreeItem(false);
1023     newchain->addSegPair(a3segs[i]);
1024     a3segs[i]->chain=newchain;
1025     segchains.Add(newchain); //just to free them when done
1026     }
1027     for (int j=i+1;j<a3segs.Count();j++) {
1028     CSegChain* chain=a3segs[i]->chain;
1029     if (chain->extendChain(a3segs[j])) {
1030     a3segs[j]->chain=chain;
1031     #ifdef DEBUG
1032     if (debug) dbgPrintChain(*chain, adapter3.chars());
1033     #endif
1034     //save time by checking here if the extended chain is already acceptable for trimming
1035     if (chain->aend>(uint)(rlen-4) && chain->bstart<4 && chain->score>a_min_chain_score) {
1036     l5=0;
1037     l3=chain->astart-2;
1038     #ifdef DEBUG
1039     if (debug && a3segs.Count()>0) {
1040     GMessage(">>> >> trimmed-3: %*s\n",l3-l5+1,seq.substr(l5,l3-l5+1).chars());
1041     }
1042     #endif
1043     return true;
1044     }
1045     } //chain can be extended
1046 gpertea 4 }
1047 gpertea 12 } //collect segment alignments into chains
1048     */
1049 gpertea 4 return false; //no adapter parts found
1050     }
1051    
1052     bool trim_adapter5(GStr& seq, int&l5, int &l3) {
1053 gpertea 9 //if (debug) GMessage("trim_adapter5 on: %s\n", seq.chars());
1054 gpertea 4 int rlen=seq.length();
1055     l5=0;
1056     l3=rlen-1;
1057     //try to see if adapter is fully included in the read
1058     int fi=-1;
1059 gpertea 98 for (int ai=0;ai<adapters.Count();ai++) {
1060     if (adapters[ai]->a5.is_empty()) continue;
1061     int a5len=adapters[ai]->a5.length();
1062     GStr& adapter5=adapters[ai]->a5;
1063     if ((fi=seq.index(adapter5))>=0) {
1064     if (fi<rlen-fi-a5len) {//match is closer to the right end
1065     l5=fi+a5len;
1066     l3=rlen-1;
1067 gpertea 12 }
1068 gpertea 98 else {
1069     l5=0;
1070     l3=fi-1;
1071 gpertea 12 }
1072 gpertea 98 return true;
1073     }
1074     #ifdef DEBUG
1075     if (debug) GMessage(">TRIM5 >> Read: %s\n",seq.chars());
1076     #endif
1077 gpertea 12
1078 gpertea 98 //try the easy way out first - look for an exact match of 11 bases
1079     int fdlen=11;
1080     if (a5len<16) {
1081     fdlen=a5len>>1;
1082     }
1083     if (fdlen>4) {
1084     GStr rstart=seq.substr(1,fdlen); //skip the first base as it's sometimes bogus
1085     if ((fi=adapter5.index(rstart))>=0) {
1086     #ifdef DEBUG
1087     if (debug) GMessage(" W11match found: %*s\n", 1+fdlen, (adapter3.substr(fi,fdlen)).chars());
1088     #endif
1089     if (extendMatch(seq.chars(), rlen, 1,
1090     adapter5.chars(), a5len, fi, fdlen, l5,l3, a5segs, true))
1091 gpertea 12 return true;
1092 gpertea 98 }
1093     //another easy case: last 11 characters of the adaptor found as a substring of the read
1094     GStr bstr=adapter5.substr(-fdlen);
1095     if ((fi=seq.index(bstr))>=0) {
1096     #ifdef DEBUG
1097     if (debug) GMessage(" A11match found: %*s\n", fi+fdlen, bstr.chars());
1098     #endif
1099     if (extendMatch(seq.chars(), rlen, fi,
1100     adapter5.chars(), a5len, a5len-fdlen, fdlen, l5,l3,a5segs,true))
1101     return true;
1102     }
1103     } //tried to matching at most 11 bases first
1104    
1105     //-- no easy cases, do the wmer hashing for the last 12 bases of the adaptor
1106     //-- only extend a wmer if it matches in the 5' (beginning) region of the read
1107     int wordSize=3;
1108     int hlen=12;
1109     if (hlen>a5len-wordSize) hlen=a5len-wordSize;
1110     int imax=rlen>>1; //first half of the read, right boundary for the wmer match
1111     if (imax<a5len) { imax=GMIN(a5len, rlen-wordSize); }
1112     for (int iw=0;iw<=hlen;iw++) {
1113     int apstart=a5len-iw-wordSize;
1114     fi=0;
1115     //int* qv=(int32 *)(adapter5.chars()+apstart);
1116     int qv=get3mer_value(adapter5.chars()+apstart);
1117     //while ((fi=fast4match(*qv, seq.chars(), rlen, fi))>=0 && fi<=imax) {
1118     while ((fi=w3_match(qv, seq.chars(), rlen, fi))>=0 && fi<=imax) {
1119     #ifdef DEBUG
1120     if (debug) GMessage(" Wmatch found: %*s\n", fi+wordSize, (adapter5.substr(apstart,wordSize)).chars());
1121     #endif
1122     if (extendMatch(seq.chars(), rlen, fi, adapter5.chars(),
1123     a5len, apstart, wordSize, l5,l3, a5segs, true)) return true;
1124     fi++;
1125     if (fi>imax) break;
1126 gpertea 4 }
1127 gpertea 98 } //for each wmer in the last hlen bases of the adaptor
1128     //if we're here we couldn't find a good extension
1129     return false; //no adapter parts found
1130     }//for each 5' adapter
1131     }
1132 gpertea 4
1133 gpertea 12 //convert qvs to/from phred64 from/to phread33
1134     void convertPhred(GStr& q) {
1135     for (int i=0;i<q.length();i++) q[i]+=qv_cvtadd;
1136 gpertea 4 }
1137    
1138 gpertea 12 void convertPhred(char* q, int len) {
1139     for (int i=0;i<len;i++) q[i]+=qv_cvtadd;
1140 gpertea 4 }
1141    
1142 gpertea 13 bool getFastxRec(GLineReader& fq, GStr& rseq, GStr& rqv,
1143     GStr& rname, GStr& rinfo, GStr& infname) {
1144     rseq="";
1145     rqv="";
1146     rname="";
1147     rinfo="";
1148     if (fq.eof()) return false;
1149     char* l=fq.getLine();
1150     while (l!=NULL && (l[0]==0 || isspace(l[0]))) l=fq.getLine(); //ignore empty lines
1151     if (l==NULL) return false;
1152     /* if (rawFormat) {
1153     //TODO: implement raw qseq parsing here
1154     //if (raw type=N) then continue; //skip invalid/bad records
1155     } //raw qseq format
1156     else { // FASTQ or FASTA */
1157     isfasta=(l[0]=='>');
1158     if (!isfasta && l[0]!='@') GError("Error: fasta/fastq record marker not found(%s)\n%s\n",
1159     infname.chars(), l);
1160     GStr s(l);
1161     rname=&(l[1]);
1162     for (int i=0;i<rname.length();i++)
1163     if (rname[i]<=' ') {
1164     if (i<rname.length()-2) rinfo=rname.substr(i+1);
1165     rname.cut(i);
1166     break;
1167     }
1168     //now get the sequence
1169     if ((l=fq.getLine())==NULL)
1170     GError("Error: unexpected EOF after header for read %s (%s)\n",
1171     rname.chars(), infname.chars());
1172     rseq=l; //this must be the DNA line
1173     while ((l=fq.getLine())!=NULL) {
1174     //seq can span multiple lines
1175     if (l[0]=='>' || l[0]=='+') {
1176     fq.pushBack();
1177     break; //
1178     }
1179     rseq+=l;
1180     } //check for multi-line seq
1181     if (!isfasta) { //reading fastq quality values, which can also be multi-line
1182     if ((l=fq.getLine())==NULL)
1183     GError("Error: unexpected EOF after sequence for %s\n", rname.chars());
1184     if (l[0]!='+') GError("Error: fastq qv header marker not detected!\n");
1185     if ((l=fq.getLine())==NULL)
1186     GError("Error: unexpected EOF after qv header for %s\n", rname.chars());
1187     rqv=l;
1188     //if (rqv.length()!=rseq.length())
1189     // GError("Error: qv len != seq len for %s\n", rname.chars());
1190     while (rqv.length()<rseq.length() && ((l=fq.getLine())!=NULL)) {
1191     rqv+=l; //append to qv string
1192     }
1193     }// fastq
1194     // } //<-- FASTA or FASTQ
1195 gpertea 94 rseq.upper();
1196 gpertea 13 return true;
1197     }
1198    
1199     char process_read(GStr& rname, GStr& rseq, GStr& rqv, int &l5, int &l3) {
1200     //returns 0 if the read was untouched, 1 if it was just trimmed
1201     // and a trash code if it was trashed
1202     l5=0;
1203     l3=rseq.length()-1;
1204     if (l3-l5+1<min_read_len) {
1205     return 's';
1206     }
1207     GStr wseq(rseq.chars());
1208     GStr wqv(rqv.chars());
1209     int w5=l5;
1210     int w3=l3;
1211     //first do the q-based trimming
1212     if (qvtrim_qmin!=0 && !wqv.is_empty() && qtrim(wqv, w5, w3)) { // qv-threshold trimming
1213     if (w3-w5+1<min_read_len) {
1214     return 'Q'; //invalid read
1215     }
1216     //-- keep only the w5..w3 range
1217     l5=w5;
1218     l3=w3;
1219     wseq=wseq.substr(w5, w3-w5+1);
1220     if (!wqv.is_empty())
1221     wqv=wqv.substr(w5, w3-w5+1);
1222     } //qv trimming
1223     // N-trimming on the remaining read seq
1224     if (ntrim(wseq, w5, w3)) {
1225     //GMessage("before: %s\n",wseq.chars());
1226     //GMessage("after : %s (%d)\n",wseq.substr(w5,w3-w5+1).chars(),w3-w5+1);
1227     l5+=w5;
1228     l3-=(wseq.length()-1-w3);
1229     if (w3-w5+1<min_read_len) {
1230     return 'N'; //to be trashed
1231     }
1232     //-- keep only the w5..w3 range
1233     wseq=wseq.substr(w5, w3-w5+1);
1234     if (!wqv.is_empty())
1235     wqv=wqv.substr(w5, w3-w5+1);
1236     w5=0;
1237     w3=wseq.length()-1;
1238     }
1239     if (a3len>0) {
1240     if (trim_adapter3(wseq, w5, w3)) {
1241     int trimlen=wseq.length()-(w3-w5+1);
1242     num_trimmed3++;
1243     if (trimlen<min_trimmed3)
1244     min_trimmed3=trimlen;
1245     l5+=w5;
1246     l3-=(wseq.length()-1-w3);
1247     if (w3-w5+1<min_read_len) {
1248     return '3';
1249     }
1250     //-- keep only the w5..w3 range
1251     wseq=wseq.substr(w5, w3-w5+1);
1252     if (!wqv.is_empty())
1253     wqv=wqv.substr(w5, w3-w5+1);
1254     }//some adapter was trimmed
1255     } //adapter trimming
1256     if (a5len>0) {
1257     if (trim_adapter5(wseq, w5, w3)) {
1258     int trimlen=wseq.length()-(w3-w5+1);
1259     num_trimmed5++;
1260     if (trimlen<min_trimmed5)
1261     min_trimmed5=trimlen;
1262     l5+=w5;
1263     l3-=(wseq.length()-1-w3);
1264     if (w3-w5+1<min_read_len) {
1265     return '5';
1266     }
1267     //-- keep only the w5..w3 range
1268     wseq=wseq.substr(w5, w3-w5+1);
1269     if (!wqv.is_empty())
1270     wqv=wqv.substr(w5, w3-w5+1);
1271     }//some adapter was trimmed
1272     } //adapter trimming
1273     if (doCollapse) {
1274     //keep read for later
1275     FqDupRec* dr=dhash.Find(wseq.chars());
1276     if (dr==NULL) { //new entry
1277     //if (prefix.is_empty())
1278     dhash.Add(wseq.chars(),
1279     new FqDupRec(&wqv, rname.chars()));
1280     //else dhash.Add(wseq.chars(), new FqDupRec(wqv.chars(),wqv.length()));
1281     }
1282     else
1283     dr->add(wqv);
1284     } //collapsing duplicates
1285     else { //not collapsing duplicates
1286     //apply the dust filter now
1287     if (doDust) {
1288     int dustbases=dust(wseq);
1289     if (dustbases>(wseq.length()>>1)) {
1290     return 'D';
1291     }
1292     }
1293     } //not collapsing duplicates
1294     return (l5>0 || l3<rseq.length()-1) ? 1 : 0;
1295     }
1296    
1297     void printHeader(FILE* f_out, char recmarker, GStr& rname, GStr& rinfo) {
1298     //GMessage("printing Header..%c%s\n",recmarker, rname.chars());
1299     if (rinfo.is_empty()) fprintf(f_out, "%c%s\n",recmarker,rname.chars());
1300     else fprintf(f_out, "%c%s %s\n",recmarker, rname.chars(), rinfo.chars());
1301     }
1302    
1303     void writeRead(FILE* f_out, GStr& rname, GStr& rinfo, GStr& rseq, GStr& rqv, int& outcounter) {
1304     outcounter++;
1305     bool asFasta=(rqv.is_empty() || fastaOutput);
1306     if (asFasta) {
1307     if (prefix.is_empty()) {
1308     printHeader(f_out, '>',rname,rinfo);
1309     fprintf(f_out, "%s\n", rseq.chars()); //plain one-line fasta for now
1310     }
1311     else {
1312     fprintf(f_out, ">%s%08d\n%s\n", prefix.chars(), outcounter,
1313     rseq.chars());
1314     }
1315     }
1316     else { //fastq
1317     if (convert_phred) convertPhred(rqv);
1318     if (prefix.is_empty()) {
1319     printHeader(f_out, '@', rname, rinfo);
1320     fprintf(f_out, "%s\n+\n%s\n", rseq.chars(), rqv.chars());
1321     }
1322     else
1323     fprintf(f_out, "@%s_%08d\n%s\n+\n%s\n", prefix.chars(), outcounter,
1324     rseq.chars(),rqv.chars() );
1325     }
1326     }
1327    
1328     void trash_report(char trashcode, GStr& rname, FILE* freport) {
1329     if (freport==NULL || trashcode<=' ') return;
1330     if (trashcode=='3' || trashcode=='5') {
1331     fprintf(freport, "%s\tA%c\n",rname.chars(),trashcode);
1332     }
1333     else {
1334     fprintf(freport, "%s\t%c\n",rname.chars(),trashcode);
1335     }
1336     //tcounter++;
1337     }
1338    
1339     GStr getFext(GStr& s, int* xpos=NULL) {
1340     //s must be a filename without a path
1341     GStr r("");
1342     if (xpos!=NULL) *xpos=0;
1343     if (s.is_empty() || s=="-") return r;
1344     int p=s.rindex('.');
1345     int d=s.rindex('/');
1346     if (p<=0 || p>s.length()-2 || p<s.length()-7 || p<d) return r;
1347     r=s.substr(p+1);
1348     if (xpos!=NULL) *xpos=p+1;
1349     r.lower();
1350     return r;
1351     }
1352    
1353     void baseFileName(GStr& fname) {
1354     //remove all known extensions, like .txt,fq,fastq,fasta,fa)(.gz .gzip .bz2 .bzip2) .
1355     int xpos=0;
1356     GStr fext=getFext(fname, &xpos);
1357     if (xpos<=1) return;
1358     bool extdel=false;
1359     GStr f2;
1360     if (fext=="z" || fext=="zip") {
1361     extdel=true;
1362     }
1363     else if (fext.length()>=2) {
1364     f2=fext.substr(0,2);
1365     extdel=(f2=="gz" || f2=="bz");
1366     }
1367     if (extdel) {
1368     fname.cut(xpos-1);
1369     fext=getFext(fname, &xpos);
1370     if (xpos<=1) return;
1371     }
1372     extdel=false;
1373     if (fext=="f" || fext=="fq" || fext=="txt" || fext=="seq" || fext=="sequence") {
1374     extdel=true;
1375     }
1376     else if (fext.length()>=2) {
1377     extdel=(fext.substr(0,2)=="fa");
1378     }
1379     if (extdel) fname.cut(xpos-1);
1380     GStr fncp(fname);
1381     fncp.lower();
1382     fncp.chomp("seq");
1383     fncp.chomp("sequence");
1384     fncp.trimR("_.");
1385     if (fncp.length()<fname.length()) fname.cut(fncp.length());
1386     }
1387    
1388     FILE* prepOutFile(GStr& infname, GStr& pocmd) {
1389     FILE* f_out=NULL;
1390     GStr fname(getFileName(infname.chars()));
1391     //eliminate known extensions
1392     baseFileName(fname);
1393     if (outsuffix.is_empty() || outsuffix=="-") { return stdout; }
1394     else if (pocmd.is_empty()) {
1395     GStr oname(fname);
1396     oname.append('.');
1397     oname.append(outsuffix);
1398     f_out=fopen(oname.chars(),"w");
1399     if (f_out==NULL) GError("Error: cannot create '%s'\n",oname.chars());
1400     }
1401     else {
1402     GStr oname(">");
1403     oname.append(fname);
1404     oname.append('.');
1405     oname.append(outsuffix);
1406     pocmd.append(oname);
1407     f_out=popen(pocmd.chars(), "w");
1408     if (f_out==NULL) GError("Error: cannot popen '%s'\n",pocmd.chars());
1409     }
1410     return f_out;
1411     }
1412    
1413     void guess_unzip(GStr& fname, GStr& picmd) {
1414     GStr fext=getFext(fname);
1415     if (fext=="gz" || fext=="gzip" || fext=="z") {
1416     picmd="gzip -cd ";
1417     }
1418     else if (fext=="bz2" || fext=="bzip2" || fext=="bz" || fext=="bzip") {
1419     picmd="bzip2 -cd ";
1420     }
1421     }
1422    
1423 gpertea 94
1424     int loadAdapters(const char* fname) {
1425     GLineReader lr(fname);
1426     char* l;
1427     while ((l=lr.nextLine())!=NULL) {
1428     if (lr.length()<=3 || l[0]=='#') continue;
1429 gpertea 98 if ( l[0]==' ' || l[0]=='\t' || l[0]==',' ||
1430     l[0]==';'|| l[0]==':' ) {
1431 gpertea 94 int i=1;
1432     while (l[i]!=0 && isspace(l[i])) {
1433     i++;
1434     }
1435     if (l[i]!=0) {
1436     adapters.Add(new CAdapters(NULL, &(l[i])));
1437     continue;
1438     }
1439     }
1440     else {
1441 gpertea 98 GStr s(l);
1442     s.startTokenize("\t ;,:");
1443     GStr a5,a3;
1444     if (s.nextToken(a5))
1445     s.nextToken(a3);
1446     adapters.Add(new CAdapters(a5.is_empty()?NULL:a5.chars(),
1447     a3.is_empty()?NULL:a3.chars()));
1448 gpertea 94 }
1449     }
1450 gpertea 98 return adapters.Count();
1451 gpertea 94 }
1452    
1453 gpertea 13 void setupFiles(FILE*& f_in, FILE*& f_in2, FILE*& f_out, FILE*& f_out2,
1454     GStr& s, GStr& infname, GStr& infname2) {
1455     // uses outsuffix to generate output file names and open file handles as needed
1456     infname="";
1457     infname2="";
1458     f_in=NULL;
1459     f_in2=NULL;
1460     f_out=NULL;
1461     f_out2=NULL;
1462     //analyze outsuffix intent
1463     GStr pocmd;
1464 gpertea 76 if (outsuffix=="-") {
1465     f_out=stdout;
1466     }
1467     else {
1468     GStr ox=getFext(outsuffix);
1469     if (ox.length()>2) ox=ox.substr(0,2);
1470     if (ox=="gz") pocmd="gzip -9 -c ";
1471     else if (ox=="bz") pocmd="bzip2 -9 -c ";
1472     }
1473 gpertea 13 if (s=="-") {
1474     f_in=stdin;
1475     infname="stdin";
1476     f_out=prepOutFile(infname, pocmd);
1477     return;
1478     } // streaming from stdin
1479     s.startTokenize(",:");
1480     s.nextToken(infname);
1481     bool paired=s.nextToken(infname2);
1482     if (fileExists(infname.chars())==0)
1483     GError("Error: cannot find file %s!\n",infname.chars());
1484     GStr fname(getFileName(infname.chars()));
1485     GStr picmd;
1486     guess_unzip(fname, picmd);
1487     if (picmd.is_empty()) {
1488     f_in=fopen(infname.chars(), "r");
1489     if (f_in==NULL) GError("Error opening file '%s'!\n",infname.chars());
1490     }
1491     else {
1492     picmd.append(infname);
1493     f_in=popen(picmd.chars(), "r");
1494     if (f_in==NULL) GError("Error at popen %s!\n", picmd.chars());
1495     }
1496 gpertea 76 if (f_out==stdout) {
1497     if (paired) GError("Error: output suffix required for paired reads\n");
1498     return;
1499     }
1500 gpertea 13 f_out=prepOutFile(infname, pocmd);
1501     if (!paired) return;
1502     if (doCollapse) GError("Error: sorry, -C option cannot be used with paired reads!\n");
1503     // ---- paired reads:-------------
1504     if (fileExists(infname2.chars())==0)
1505     GError("Error: cannot find file %s!\n",infname2.chars());
1506     picmd="";
1507     GStr fname2(getFileName(infname2.chars()));
1508     guess_unzip(fname2, picmd);
1509     if (picmd.is_empty()) {
1510     f_in2=fopen(infname2.chars(), "r");
1511     if (f_in2==NULL) GError("Error opening file '%s'!\n",infname2.chars());
1512     }
1513     else {
1514     picmd.append(infname2);
1515     f_in2=popen(picmd.chars(), "r");
1516     if (f_in2==NULL) GError("Error at popen %s!\n", picmd.chars());
1517     }
1518     f_out2=prepOutFile(infname2, pocmd);
1519     }