ViewVC Help
View File | Revision Log | Show Annotations | View Changeset | Root Listing
root/gclib/fqtrim/fqtrim.cpp
Revision: 76
Committed: Tue Sep 27 12:37:02 2011 UTC (7 years, 11 months ago) by gpertea
File size: 53160 byte(s)
Log Message:
u

Line User Rev File contents
1 gpertea 4 #include "GArgs.h"
2     #include "GStr.h"
3     #include "GHash.hh"
4     #include "GList.hh"
5 gpertea 13 #include <ctype.h>
6 gpertea 4
7     #define USAGE "Usage:\n\
8 gpertea 76 fqtrim [{-5 <5adapter> -3 <3adapter>|-f <adapters_file>}] [-a <min_matchlen>]\\\n\
9     [-q <minq> [-t <trim_max_len>]] [-p {64|33}] [-o <outsuffix>]\\\n\
10     [-l <minlen>] [-C] [-D] [-Q] [-n <rename_prefix>] [-r <discarded.lst>]\\\n\
11     <input.fq>[,<input_mates.fq>\n\
12 gpertea 4 \n\
13 gpertea 76 Trim low quality bases at the 3' end and can trim adapter sequence(s), filter\n\
14     for low complexity and collapse duplicate reads.\n\
15 gpertea 13 If read pairs should be trimmed and kept together (i.e. without discarding\n\
16     one read in a pair), the two file names should be given delimited by a comma\n\
17     or a colon character\n\
18 gpertea 4 \n\
19     Options:\n\
20     -n rename all the reads using the <prefix> followed by a read counter;\n\
21     if -C option was given, the suffix \"_x<N>\" is appended, with <N> being\n\
22     the read duplication count\n\
23 gpertea 76 -o unless this parameter is '-', write the trimmed/filtered reads to \n\
24     file(s) named <input>.<outsuffix> which will be created in the \n\
25     current (working) directory; (writes to stdout if -o- is given);\n\
26     a suffix ending with .gz, .gzip or .bz2 will enforce compression\n\
27     -f file with adapter sequences to trim, each line having this format:\n\
28     <5'-adapter-sequence> <3'-adapter-sequence>\n\
29 gpertea 4 -5 trim the given adapter or primer sequence at the 5' end of each read\n\
30     (e.g. -5 CGACAGGTTCAGAGTTCTACAGTCCGACGATC)\n\
31     -3 trim the given adapter sequence at the 3' end of each read\n\
32     (e.g. -3 TCGTATGCCGTCTTCTGCTTG)\n\
33 gpertea 76 -a minimum length of exact match to adaptor sequence at the proper end (6)\n\
34 gpertea 13 -q trim bases with quality value lower than <minq> (starting at the 3' end)\n\
35 gpertea 76 -t for -q option, maximum trimming at the 3' end is limited to <trim_max_len>\n\
36 gpertea 12 -m maximum percentage of Ns allowed in a read after trimming (default 7)\n\
37     -l minimum \"clean\" length after trimming that a read must have\n\
38     in order to pass the filter (default: 16)\n\
39 gpertea 4 -r write a simple \"trash report\" file listing the discarded reads with a\n\
40     one letter code for the reason why they were trashed\n\
41 gpertea 12 -D apply a low-complexity (dust) filter and discard any read that has over\n\
42     50% of its length detected as low complexity\n\
43 gpertea 4 -C collapse duplicate reads and add a prefix with their count to the read\n\
44 gpertea 12 name (e.g. for microRNAs)\n\
45     -p input is phred64/phred33 (use -p64 or -p33)\n\
46     -Q convert quality values to the other Phred qv type\n\
47     -V verbose processing\n\
48 gpertea 4 "
49 gpertea 13
50     //-z for -o option, the output stream(s) will be first piped into the given\n
51     // <zcmd> command, which must output to stdout (e.g. -z 'bzip2 -9 -c')\n
52    
53    
54 gpertea 4 // example 3' adapter for miRNAs: TCGTATGCCGTCTTCTGCTTG
55 gpertea 12
56 gpertea 76 //For paired reads sequencing:
57 gpertea 12 //3' : ACACTCTTTCCCTACACGACGCTCTTCCGATCT
58     //5' : GATCGGAAGAGCGGTTCAGCAGGAATGCCGAG
59 gpertea 13 //FILE* f_out=NULL; //stdout if not provided
60     //FILE* f_out2=NULL; //for paired reads
61     //FILE* f_in=NULL; //input fastq (stdin if not provided)
62     //FILE* f_in2=NULL; //for paired reads
63 gpertea 4 FILE* freport=NULL;
64 gpertea 13
65 gpertea 4 bool debug=false;
66 gpertea 12 bool verbose=false;
67 gpertea 4 bool doCollapse=false;
68     bool doDust=false;
69 gpertea 13 bool fastaOutput=false;
70 gpertea 4 bool trashReport=false;
71 gpertea 12 //bool rawFormat=false;
72 gpertea 4 int min_read_len=16;
73 gpertea 12 double max_perc_N=7.0;
74 gpertea 4 int dust_cutoff=16;
75     bool isfasta=false;
76 gpertea 12 bool convert_phred=false;
77 gpertea 13 GStr outsuffix; // -o
78 gpertea 76 //GStr adapter3;
79     //GStr adapter5;
80 gpertea 13 GStr prefix;
81     GStr zcmd;
82     int num_trimmed5=0;
83     int num_trimmed3=0;
84     int num_trimmedN=0;
85     int num_trimmedQ=0;
86     int min_trimmed5=INT_MAX;
87     int min_trimmed3=INT_MAX;
88 gpertea 12
89 gpertea 13 int qvtrim_qmin=0;
90     int qvtrim_max=0; //for -q, do not trim at 3'-end more than this number of bases
91 gpertea 12 int qv_phredtype=0; // could be 64 or 33 (0 means undetermined yet)
92     int qv_cvtadd=0; //could be -31 or +31
93    
94 gpertea 4 int a3len=0;
95     int a5len=0;
96     // adaptor matching metrics -- for extendMatch() function
97 gpertea 12 const int a_m_score=2; //match score
98     const int a_mis_score=-3; //mismatch
99     const int a_dropoff_score=7;
100 gpertea 76 int a_min_score=12; //an exact match of 6 bases at the proper ends WILL be trimmed
101 gpertea 12 const int a_min_chain_score=15; //for gapped alignments
102 gpertea 4
103 gpertea 12 class CSegChain;
104    
105     class CSegPair {
106     public:
107     GSeg a;
108     GSeg b; //the adapter segment
109     int score;
110     int flags;
111     CSegChain* chain;
112     CSegPair(int astart=0, int aend=0, int bstart=0, int bend=0, int mscore=0):a(astart,aend),b(bstart, bend) {
113     score=mscore;
114     if (score==0) score=a.len()*a_m_score;
115     flags=0;
116     chain=NULL;
117     }
118     int len() { return a.len(); }
119     bool operator==(CSegPair& d){
120     //return (a.start==d.a.start && a.end==d.a.end && b.start==d.b.start && b.end==d.b.end);
121     //make equal even segments that are included into one another:
122     return (d.a.start>=a.start && d.a.end<=a.end && d.b.start>=b.start && d.b.end<=b.end);
123     }
124     bool operator>(CSegPair& d){ //ordering based on b (adaptor) start coord and score
125     if (b.start==d.b.start) {
126     if (score==d.score) {
127     //just try to be consistent:
128     if (b.end==d.b.end) {
129     return (a.start==d.a.start)?(a.end<d.a.end):(a.start<d.a.start);
130     }
131     return (b.end>d.b.end);
132     }
133     else return (score<d.score);
134     }
135     return (b.start>d.b.start);
136     }
137     bool operator<(CSegPair& d){ //ordering based on b (adaptor) coord
138     /*if (b.start==d.b.start && b.end==d.b.end) {
139     return (a.start==d.a.start)?(a.end<d.a.end):(a.start<d.a.start);
140     }
141     return (b.start==d.b.start)?(b.end<d.b.end):(b.start<d.b.start);*/
142     if (b.start==d.b.start) {
143     if (score==d.score) {
144     //just try to be consistent:
145     if (b.end==d.b.end) {
146     return (a.start==d.a.start)?(a.end>d.a.end):(a.start>d.a.start);
147     }
148     return (b.end<d.b.end);
149     }
150     else return (score>d.score);
151     }
152     return (b.start<d.b.start);
153     }
154     };
155    
156     int cmpSegEnds(pointer sa, pointer sb) { //sort by adaptor seg ends AND score
157     CSegPair& x = *(CSegPair *)sa;
158     CSegPair& y = *(CSegPair *)sb;
159     /*
160     if (x.b.end==y.b.end) {
161     if (x.b.start==y.b.start) {
162     if (x.a.end==y.a.end) {
163     if (x.a.start==y.a.start) return 0;
164     return ((x.a.start>y.a.start) ? -1 : 1);
165     }
166     else {
167     return ((x.a.end>y.a.end) ? -1 : 1);
168     }
169     }
170     else {
171     return ((x.b.start>y.b.start) ? -1 : 1);
172     }
173     }
174     else {
175     return ((x.b.end>y.b.end) ? -1 : 1);
176     }
177     */
178     if (x.b.end==y.b.end) {
179     if (x.score==y.score) {
180     if (x.b.start==y.b.start) {
181     if (x.a.end==y.a.end) {
182     if (x.a.start==y.a.start) return 0;
183     return ((x.a.start<y.a.start) ? -1 : 1);
184     }
185     else {
186     return ((x.a.end<y.a.end) ? -1 : 1);
187     }
188     }
189     else {
190     return ((x.b.start<y.b.start) ? -1 : 1);
191     }
192     } else return ((x.score>y.score) ? -1 : 1);
193     }
194     else {
195     return ((x.b.end>y.b.end) ? -1 : 1);
196     }
197    
198     }
199    
200     class CSegChain:public GList<CSegPair> {
201     public:
202     uint astart;
203     uint aend;
204     uint bstart;
205     uint bend;
206     int score;
207     bool endSort;
208     CSegChain(bool aln5=false):GList<CSegPair>(true,true,true) {//sorted, free elements, unique
209     //as SegPairs are inserted, they will be sorted by a.start coordinate
210     score=0;
211     astart=MAX_UINT;
212     aend=0;
213     bstart=MAX_UINT;
214     bend=0;
215     endSort=aln5;
216     if (aln5) { setSorted(cmpSegEnds); }
217     }
218     bool operator==(CSegChain& d) {
219     //return (score==d.score);
220     return (astart==d.astart && aend==d.aend && bstart==d.bstart && bend==d.bend);
221     }
222     bool operator>(CSegChain& d) { // order based on b (adaptor) coordinate
223     //return (score<d.score);
224     if (bstart==d.bstart && bend==d.bend) {
225     return (astart==d.astart)?(aend>d.aend):(astart>d.astart);
226     }
227     return (bstart==d.bstart)?(bend>d.bend):(bstart>d.bstart);
228     }
229     bool operator<(CSegChain& d) {
230     //return (score>d.score);
231     if (bstart==d.bstart && bend==d.bend) {
232     return (astart==d.astart)?(aend<d.aend):(astart<d.astart);
233     }
234     return (bstart==d.bstart)?(bend<d.bend):(bstart<d.bstart);
235     }
236     void addSegPair(CSegPair* segp) {
237     if (AddIfNew(segp)!=segp) return;
238     score+=segp->score;
239     if (astart>segp->a.start) astart=segp->a.start;
240     if (aend<segp->a.end) aend=segp->a.end;
241     if (bstart>segp->b.start) bstart=segp->b.start;
242     if (bend<segp->b.end) bend=segp->b.end;
243     }
244     //for building actual chains:
245     bool extendChain(CSegPair* segp) { //segp expected to be "Greater Than" current chain
246     int bgap=0;
247     int agap=0;
248     //if (endSort) {
249     if (bstart>segp->b.start) {
250     bgap = (int)(bstart-segp->b.end);
251     if (abs(bgap)>2) return false;
252     agap = (int)(astart-segp->a.end);
253     if (abs(agap)>2) return false;
254     }
255     else {
256     bgap = (int) (segp->b.start-bend);
257     if (abs(bgap)>2) return false;
258     agap = (int)(segp->a.start-aend);
259     if (abs(agap)>2) return false;
260     }
261     if (agap*bgap<0) return false;
262     addSegPair(segp);
263     score-=abs(agap)+abs(bgap);
264     return true;
265     }
266     };
267    
268 gpertea 4 // element in dhash:
269     class FqDupRec {
270     public:
271     int count; //how many of these reads are the same
272     int len; //length of qv
273     char* firstname; //optional, only if we want to keep the original read names
274     char* qv;
275     FqDupRec(GStr* qstr=NULL, const char* rname=NULL) {
276     len=0;
277     qv=NULL;
278     firstname=NULL;
279     count=0;
280     if (qstr!=NULL) {
281     qv=Gstrdup(qstr->chars());
282     len=qstr->length();
283     count++;
284     }
285     if (rname!=NULL) firstname=Gstrdup(rname);
286     }
287     ~FqDupRec() {
288     GFREE(qv);
289     GFREE(firstname);
290     }
291     void add(GStr& d) { //collapse another record into this one
292     if (d.length()!=len)
293     GError("Error at FqDupRec::add(): cannot collapse reads with different length!\n");
294     count++;
295     for (int i=0;i<len;i++)
296     qv[i]=(qv[i]+d[i])/2;
297     }
298     };
299    
300     void openfw(FILE* &f, GArgs& args, char opt) {
301     GStr s=args.getOpt(opt);
302     if (!s.is_empty()) {
303     if (s=='-') f=stdout;
304     else {
305 gpertea 13 f=fopen(s.chars(),"w");
306 gpertea 4 if (f==NULL) GError("Error creating file: %s\n", s.chars());
307     }
308     }
309     }
310    
311     #define FWCLOSE(fh) if (fh!=NULL && fh!=stdout) fclose(fh)
312     #define FRCLOSE(fh) if (fh!=NULL && fh!=stdin) fclose(fh)
313    
314     GHash<FqDupRec> dhash; //hash to keep track of duplicates
315    
316 gpertea 13 void setupFiles(FILE*& f_in, FILE*& f_in2, FILE*& f_out, FILE*& f_out2,
317     GStr& s, GStr& infname, GStr& infname2);
318     // uses outsuffix to generate output file names and open file handles as needed
319    
320     void writeRead(FILE* f_out, GStr& rname, GStr& rinfo, GStr& rseq, GStr& rqv, int& outcounter);
321     void trash_report(char trashcode, GStr& rname, FILE* freport);
322    
323     bool getFastxRec(GLineReader& fq, GStr& rseq, GStr& rqv,
324     GStr& rname, GStr& rinfo, GStr& infname);
325    
326     char process_read(GStr& rname, GStr& rseq, GStr& rqv, int &l5, int &l3);
327     //returns 0 if the read was untouched, 1 if it was trimmed and a trash code if it was trashed
328    
329 gpertea 12 bool ntrim(GStr& rseq, int &l5, int &l3); //returns true if any trimming occured
330     bool qtrim(GStr& qvs, int &l5, int &l3); //return true if any trimming occured
331 gpertea 4 int dust(GStr& seq);
332     bool trim_adapter3(GStr& seq, int &l5, int &l3); //returns true if any trimming occured
333     bool trim_adapter5(GStr& seq, int &l5, int &l3); //returns true if any trimming occured
334    
335 gpertea 12 void convertPhred(char* q, int len);
336     void convertPhred(GStr& q);
337 gpertea 4
338     int main(int argc, char * const argv[]) {
339 gpertea 76 GArgs args(argc, argv, "YQDCVl:d:3:5:m:n:r:p:q:f:t:o:z:a:");
340 gpertea 4 int e;
341     if ((e=args.isError())>0) {
342     GMessage("%s\nInvalid argument: %s\n", USAGE, argv[e]);
343     exit(224);
344     }
345 gpertea 9 debug=(args.getOpt('Y')!=NULL);
346 gpertea 13 verbose=(args.getOpt('V')!=NULL);
347 gpertea 12 convert_phred=(args.getOpt('Q')!=NULL);
348 gpertea 4 doCollapse=(args.getOpt('C')!=NULL);
349     doDust=(args.getOpt('D')!=NULL);
350 gpertea 12 /*
351 gpertea 4 rawFormat=(args.getOpt('R')!=NULL);
352     if (rawFormat) {
353     GError("Sorry, raw qseq format parsing is not implemented yet!\n");
354     }
355 gpertea 12 */
356 gpertea 4 prefix=args.getOpt('n');
357     GStr s=args.getOpt('l');
358     if (!s.is_empty())
359     min_read_len=s.asInt();
360 gpertea 12 s=args.getOpt('m');
361     if (!s.is_empty())
362     max_perc_N=s.asDouble();
363 gpertea 4 s=args.getOpt('d');
364     if (!s.is_empty()) {
365     dust_cutoff=s.asInt();
366     doDust=true;
367     }
368 gpertea 12 s=args.getOpt('q');
369     if (!s.is_empty()) {
370 gpertea 13 qvtrim_qmin=s.asInt();
371 gpertea 12 }
372 gpertea 13 s=args.getOpt('t');
373     if (!s.is_empty()) {
374     qvtrim_max=s.asInt();
375     }
376 gpertea 12 s=args.getOpt('p');
377     if (!s.is_empty()) {
378     int v=s.asInt();
379     if (v==33) {
380     qv_phredtype=33;
381     qv_cvtadd=31;
382     }
383     else if (v==64) {
384     qv_phredtype=64;
385     qv_cvtadd=-31;
386     }
387     else
388     GMessage("%s\nInvalid value for -p option (can only be 64 or 33)!\n",USAGE);
389     }
390 gpertea 4 if (args.getOpt('3')!=NULL) {
391     adapter3=args.getOpt('3');
392     adapter3.upper();
393     a3len=adapter3.length();
394     }
395     if (args.getOpt('5')!=NULL) {
396     adapter5=args.getOpt('5');
397     adapter5.upper();
398     a5len=adapter5.length();
399     }
400 gpertea 13 s=args.getOpt('a');
401     if (!s.is_empty()) {
402     int a_minmatch=s.asInt();
403     a_min_score=a_minmatch<<1;
404     }
405 gpertea 76
406 gpertea 13 if (args.getOpt('o')!=NULL) outsuffix=args.getOpt('o');
407 gpertea 76 else outsuffix="-";
408 gpertea 4 trashReport= (args.getOpt('r')!=NULL);
409 gpertea 13 int fcount=args.startNonOpt();
410     if (fcount==0) {
411 gpertea 4 GMessage(USAGE);
412     exit(224);
413     }
414 gpertea 13 if (fcount>1 && doCollapse) {
415     GError("%s Sorry, the -C option only works with a single input.\n", USAGE);
416     }
417     //openfw(f_out, args, 'o');
418     //if (f_out==NULL) f_out=stdout;
419 gpertea 4 if (trashReport)
420     openfw(freport, args, 'r');
421     char* infile=NULL;
422     while ((infile=args.nextNonOpt())!=NULL) {
423 gpertea 13 int incounter=0; //counter for input reads
424     int outcounter=0; //counter for output reads
425     int trash_s=0; //too short from the get go
426     int trash_Q=0;
427     int trash_N=0;
428     int trash_D=0;
429     int trash_A3=0;
430     int trash_A5=0;
431     s=infile;
432     GStr infname;
433     GStr infname2;
434     FILE* f_in=NULL;
435     FILE* f_in2=NULL;
436     FILE* f_out=NULL;
437     FILE* f_out2=NULL;
438     bool paired_reads=false;
439     setupFiles(f_in, f_in2, f_out, f_out2, s, infname, infname2);
440     GLineReader fq(f_in);
441     GLineReader* fq2=NULL;
442     if (f_in2!=NULL) {
443     fq2=new GLineReader(f_in2);
444     paired_reads=true;
445     }
446     GStr rseq, rqv, seqid, seqinfo;
447     GStr rseq2, rqv2, seqid2, seqinfo2;
448     while (getFastxRec(fq, rseq, rqv, seqid, seqinfo, infname)) {
449     incounter++;
450     int a5=0, a3=0, b5=0, b3=0;
451     char tcode=0, tcode2=0;
452     tcode=process_read(seqid, rseq, rqv, a5, a3);
453     //if (!doCollapse) {
454     if (fq2!=NULL) {
455     getFastxRec(*fq2, rseq2, rqv2, seqid2, seqinfo2, infname2);
456     if (seqid.substr(0,seqid.length()-1)!=seqid2.substr(0,seqid2.length()-1)) {
457     GError("Error: no paired match for read %s vs %s (%s,%s)\n",
458     seqid.chars(), seqid2.chars(), infname.chars(), infname2.chars());
459     }
460     tcode2=process_read(seqid2, rseq2, rqv2, b5, b3);
461     //decide what to do with this pair and print rseq2 if the pair makes it
462     if (tcode>1 && tcode2<=1) {
463     //"untrash" rseq
464     if (a3-a5+1<min_read_len) {
465     a5=1;
466     if (a3<min_read_len) { a3= GMIN(rseq.length()-1, min_read_len+1); }
467 gpertea 4 }
468 gpertea 13 tcode=1;
469     }
470     else if (tcode<=1 && tcode2>1) {
471     //"untrash" rseq2
472     if (b3-b5+1<min_read_len) {
473     b5=1;
474     if (b3<min_read_len) { b3= GMIN((rseq2.length()-1),(min_read_len+1)); }
475     }
476     tcode2=1;
477     }
478     if (tcode<=1) { //trimmed or left intact -- write it!
479     if (tcode>0) {
480     rseq2=rseq2.substr(b5,b3-b5+1);
481     if (!rqv2.is_empty()) rqv2=rqv2.substr(b5,b3-b5+1);
482 gpertea 4 }
483 gpertea 13 int nocounter=0;
484     writeRead(f_out2, seqid2, seqinfo2, rseq2, rqv2, nocounter);
485     }
486     } //paired read
487     // }
488     if (tcode>1) { //trashed
489     if (tcode=='s') trash_s++;
490     else if (tcode=='Q') trash_Q++;
491     else if (tcode=='N') trash_N++;
492     else if (tcode=='D') trash_D++;
493     else if (tcode=='3') trash_A3++;
494     else if (tcode=='5') trash_A5++;
495     if (trashReport) trash_report(tcode, seqid, freport);
496     }
497     else if (!doCollapse) { //write it
498     if (tcode>0) {
499     rseq=rseq.substr(a5,a3-a5+1);
500     if (!rqv.is_empty()) rqv=rqv.substr(a5,a3-a5+1);
501     }
502     writeRead(f_out, seqid, seqinfo, rseq, rqv, outcounter);
503     }
504     } //for each fastq record
505     delete fq2;
506     FRCLOSE(f_in);
507     FRCLOSE(f_in2);
508     if (doCollapse) {
509     outcounter=0;
510     int maxdup_count=1;
511     char* maxdup_seq=NULL;
512     dhash.startIterate();
513     FqDupRec* qd=NULL;
514     char* seq=NULL;
515     while ((qd=dhash.NextData(seq))!=NULL) {
516     GStr rseq(seq);
517     //do the dusting here
518     if (doDust) {
519     int dustbases=dust(rseq);
520     if (dustbases>(rseq.length()>>1)) {
521     if (trashReport && qd->firstname!=NULL) {
522     fprintf(freport, "%s_x%d\tD\n",qd->firstname, qd->count);
523 gpertea 4 }
524 gpertea 13 trash_D+=qd->count;
525     continue;
526     }
527     }
528     outcounter++;
529     if (qd->count>maxdup_count) {
530     maxdup_count=qd->count;
531     maxdup_seq=seq;
532     }
533     if (isfasta) {
534     if (prefix.is_empty()) {
535     fprintf(f_out, ">%s_x%d\n%s\n", qd->firstname, qd->count,
536     rseq.chars());
537 gpertea 4 }
538 gpertea 13 else { //use custom read name
539     fprintf(f_out, ">%s%08d_x%d\n%s\n", prefix.chars(), outcounter,
540     qd->count, rseq.chars());
541 gpertea 4 }
542 gpertea 13 }
543     else { //fastq format
544     if (convert_phred) convertPhred(qd->qv, qd->len);
545     if (prefix.is_empty()) {
546     fprintf(f_out, "@%s_x%d\n%s\n+\n%s\n", qd->firstname, qd->count,
547     rseq.chars(), qd->qv);
548 gpertea 4 }
549 gpertea 13 else { //use custom read name
550     fprintf(f_out, "@%s%08d_x%d\n%s\n+\n%s\n", prefix.chars(), outcounter,
551     qd->count, rseq.chars(), qd->qv);
552     }
553     }
554     }//for each element of dhash
555     if (maxdup_count>1) {
556     GMessage("Maximum read multiplicity: x %d (read: %s)\n",maxdup_count, maxdup_seq);
557 gpertea 4 }
558 gpertea 13 } //collapse entries
559     if (verbose) {
560     if (paired_reads) {
561     GMessage(">Input files : %s , %s\n", infname.chars(), infname2.chars());
562     GMessage("Number of input pairs :%9d\n", incounter);
563     GMessage(" Output pairs :%9d\n", outcounter);
564     }
565     else {
566     GMessage(">Input file : %s\n", infname.chars());
567     GMessage("Number of input reads :%9d\n", incounter);
568     GMessage(" Output reads :%9d\n", outcounter);
569     }
570     GMessage("------------------------------------\n");
571     if (num_trimmed5)
572     GMessage(" 5' trimmed :%9d (min. trim: %d)\n", num_trimmed5, min_trimmed5);
573     if (num_trimmed3)
574     GMessage(" 3' trimmed :%9d (min. trim: %d)\n", num_trimmed3, min_trimmed3);
575     GMessage("------------------------------------\n");
576     if (trash_s>0)
577     GMessage(" Trashed by length:%9d\n", trash_s);
578     if (trash_N>0)
579     GMessage(" Trashed by N%%:%9d\n", trash_N);
580     if (trash_Q>0)
581     GMessage("Trashed by low quality:%9d\n", trash_Q);
582     if (trash_A5>0)
583     GMessage(" Trashed by 5' adapter:%9d\n", trash_A5);
584     if (trash_A3>0)
585     GMessage(" Trashed by 3' adapter:%9d\n", trash_A3);
586     }
587     if (trashReport) {
588     FWCLOSE(freport);
589 gpertea 4 }
590 gpertea 13 FWCLOSE(f_out);
591     FWCLOSE(f_out2);
592     } //while each input file
593 gpertea 4
594 gpertea 12 //getc(stdin);
595 gpertea 4 }
596    
597     class NData {
598     public:
599     int NPos[1024]; //there should be no reads longer than 1K ?
600     int NCount;
601     int end5;
602     int end3;
603     int seqlen;
604     double perc_N; //percentage of Ns in end5..end3 range only!
605     const char* seq;
606     bool valid;
607     NData() {
608     NCount=0;
609     end5=0;
610     end3=0;
611     seq=NULL;
612     perc_N=0;
613     valid=true;
614     }
615     void init(GStr& rseq) {
616     NCount=0;
617     perc_N=0;
618     seqlen=rseq.length();
619     seq=rseq.chars();
620     end5=1;
621     end3=seqlen;
622     for (int i=0;i<seqlen;i++)
623 gpertea 12 if (seq[i]=='N') {// if (!ichrInStr(rseq[i], "ACGT")
624 gpertea 4 NPos[NCount]=i;
625     NCount++;
626     }
627     perc_N=(NCount*100.0)/seqlen;
628     valid=true;
629     }
630     void N_calc() { //only in the region after trimming
631     int n=0;
632     for (int i=end3-1;i<end5;i++) {
633     if (seq[i]=='N') n++;
634     }
635     perc_N=(n*100.0)/(end5-end3+1);
636     }
637     };
638    
639     static NData feat;
640     int perc_lenN=12; // incremental distance from ends, in percentage of
641     // sequence length, where N-trimming is done (default:12 %) (autolimited to 20)
642    
643     void N_analyze(int l5, int l3, int p5, int p3) {
644     /* assumes feat was filled properly */
645     int old_dif, t5,t3,v;
646 gpertea 12 if (l3<l5+2 || p5>p3 ) {
647 gpertea 4 feat.end5=l5+1;
648     feat.end3=l3+1;
649     return;
650     }
651    
652     t5=feat.NPos[p5]-l5;
653     t3=l3-feat.NPos[p3];
654     old_dif=p3-p5;
655     v=(int)((((double)(l3-l5))*perc_lenN)/100);
656     if (v>20) v=20; /* enforce N-search limit for very long reads */
657     if (t5 < v ) {
658     l5=feat.NPos[p5]+1;
659     p5++;
660     }
661     if (t3 < v) {
662     l3=feat.NPos[p3]-1;
663     p3--;
664     }
665     /* restNs=p3-p5; number of Ns in the new CLR */
666     if (p3-p5==old_dif) { /* no change, return */
667     /* don't trim if this may shorten the read too much */
668     //if (l5-l3<min_read_len) return;
669     feat.end5=l5+1;
670     feat.end3=l3+1;
671     return;
672     }
673     else
674     N_analyze(l5,l3, p5,p3);
675     }
676    
677    
678 gpertea 12 bool qtrim(GStr& qvs, int &l5, int &l3) {
679 gpertea 13 if (qvtrim_qmin==0 || qvs.is_empty()) return false;
680 gpertea 12 if (qv_phredtype==0) {
681     //try to guess the Phred type
682     int vmin=256, vmax=0;
683     for (int i=0;i<qvs.length();i++) {
684     if (vmin>qvs[i]) vmin=qvs[i];
685     if (vmax<qvs[i]) vmax=qvs[i];
686     }
687     if (vmin<64) { qv_phredtype=33; qv_cvtadd=31; }
688     if (vmax>95) { qv_phredtype=64; qv_cvtadd=-31; }
689     if (qv_phredtype==0) {
690     GError("Error: couldn't determine Phred type, please use the -p33 or -p64 !\n");
691     }
692 gpertea 13 if (verbose)
693     GMessage("Input reads have Phred-%d quality values.\n", (qv_phredtype==33 ? 33 : 64));
694     } //guessing Phred type
695 gpertea 12 for (l3=qvs.length()-1;l3>2;l3--) {
696 gpertea 13 if (qvs[l3]-qv_phredtype>=qvtrim_qmin && qvs[l3-1]-qv_phredtype>=qvtrim_qmin) break;
697 gpertea 12 }
698     //just in case, check also the 5' the end (?)
699     for (l5=0;l5<qvs.length()-3;l5++) {
700 gpertea 13 if (qvs[l5]-qv_phredtype>=qvtrim_qmin && qvs[l5+1]-qv_phredtype>=qvtrim_qmin) break;
701 gpertea 12 }
702 gpertea 13 if (qvtrim_max>0) {
703     if (qvs.length()-1-l3>qvtrim_max) l3=qvs.length()-1-qvtrim_max;
704     if (l5>qvtrim_max) l5=qvtrim_max;
705     }
706 gpertea 12 return (l5>0 || l3<qvs.length()-1);
707     }
708    
709     bool ntrim(GStr& rseq, int &l5, int &l3) {
710     //count Ns in the sequence, trim N-rich ends
711 gpertea 4 feat.init(rseq);
712     l5=feat.end5-1;
713     l3=feat.end3-1;
714     N_analyze(feat.end5-1, feat.end3-1, 0, feat.NCount-1);
715 gpertea 12 if (l5==feat.end5-1 && l3==feat.end3-1) {
716     if (feat.perc_N>max_perc_N) {
717     feat.valid=false;
718     l3=l5+1;
719     return true;
720     }
721     else {
722     return false; //nothing changed
723     }
724     }
725 gpertea 4 l5=feat.end5-1;
726     l3=feat.end3-1;
727     if (l3-l5+1<min_read_len) {
728     feat.valid=false;
729     return true;
730     }
731     feat.N_calc();
732 gpertea 12
733     if (feat.perc_N>max_perc_N) {
734 gpertea 4 feat.valid=false;
735     l3=l5+1;
736     return true;
737     }
738     return true;
739     }
740    
741     //--------------- dust functions ----------------
742     class DNADuster {
743     public:
744     int dustword;
745     int dustwindow;
746     int dustwindow2;
747     int dustcutoff;
748     int mv, iv, jv;
749     int counts[32*32*32];
750     int iis[32*32*32];
751     DNADuster(int cutoff=16, int winsize=32, int wordsize=3) {
752     dustword=wordsize;
753     dustwindow=winsize;
754     dustwindow2 = (winsize>>1);
755     dustcutoff=cutoff;
756     mv=0;
757     iv=0;
758     jv=0;
759     }
760     void setWindowSize(int value) {
761     dustwindow = value;
762     dustwindow2 = (dustwindow >> 1);
763     }
764     void setWordSize(int value) {
765     dustword=value;
766     }
767     void wo1(int len, const char* s, int ivv) {
768     int i, ii, j, v, t, n, n1, sum;
769     int js, nis;
770     n = 32 * 32 * 32;
771     n1 = n - 1;
772     nis = 0;
773     i = 0;
774     ii = 0;
775     sum = 0;
776     v = 0;
777     for (j=0; j < len; j++, s++) {
778     ii <<= 5;
779     if (*s<=32) {
780     i=0;
781     continue;
782     }
783     ii |= *s - 'A'; //assume uppercase!
784     ii &= n1;
785     i++;
786     if (i >= dustword) {
787     for (js=0; js < nis && iis[js] != ii; js++) ;
788     if (js == nis) {
789     iis[nis] = ii;
790     counts[ii] = 0;
791     nis++;
792     }
793     if ((t = counts[ii]) > 0) {
794     sum += t;
795     v = 10 * sum / j;
796     if (mv < v) {
797     mv = v;
798     iv = ivv;
799     jv = j;
800     }
801     }
802     counts[ii]++;
803     }
804     }
805     }
806    
807     int wo(int len, const char* s, int* beg, int* end) {
808     int i, l1;
809     l1 = len - dustword + 1;
810     if (l1 < 0) {
811     *beg = 0;
812     *end = len - 1;
813     return 0;
814     }
815     mv = 0;
816     iv = 0;
817     jv = 0;
818     for (i=0; i < l1; i++) {
819     wo1(len-i, s+i, i);
820     }
821     *beg = iv;
822     *end = iv + jv;
823     return mv;
824     }
825    
826     void dust(const char* seq, char* seqmsk, int seqlen, int cutoff=0) { //, maskFunc maskfn) {
827     int i, j, l, a, b, v;
828     if (cutoff==0) cutoff=dustcutoff;
829     a=0;b=0;
830     //GMessage("Dust cutoff=%d\n", cutoff);
831     for (i=0; i < seqlen; i += dustwindow2) {
832     l = (seqlen > i+dustwindow) ? dustwindow : seqlen-i;
833     v = wo(l, seq+i, &a, &b);
834     if (v > cutoff) {
835     //for (j = a; j <= b && j < dustwindow2; j++) {
836     for (j = a; j <= b; j++) {
837     seqmsk[i+j]='N';//could be made lowercase instead
838     }
839     }
840     }
841     //return first;
842     }
843     };
844    
845     static DNADuster duster;
846    
847     int dust(GStr& rseq) {
848     char* seq=Gstrdup(rseq.chars());
849     duster.dust(rseq.chars(), seq, rseq.length(), dust_cutoff);
850     //check the number of Ns:
851     int ncount=0;
852     for (int i=0;i<rseq.length();i++) {
853     if (seq[i]=='N') ncount++;
854     }
855     GFREE(seq);
856     return ncount;
857     }
858    
859 gpertea 12
860     // ------------------ adapter matching - simple k-mer seed & extend, no indels for now
861     //when a k-mer match is found, simply try to extend the alignment using a drop-off scheme
862     //check minimum score and
863     //for 3' adapter trimming:
864 gpertea 4 // require that the right end of the alignment for either the adaptor OR the read must be
865     // < 3 distance from its right end
866     // for 5' adapter trimming:
867     // require that the left end of the alignment for either the adaptor OR the read must
868 gpertea 12 // be at coordinate < 3 from start
869 gpertea 4
870     bool extendMatch(const char* a, int alen, int ai,
871 gpertea 12 const char* b, int blen, int bi, int mlen, int& l5, int& l3, CSegChain& segs, bool end5=false) {
872 gpertea 4 //so the alignment starts at ai in a, bi in b, with a perfect match of length mlen
873 gpertea 12 #ifdef DEBUG
874     GStr dbg(b);
875     #endif
876     //if (debug) {
877     // GMessage(">> in %s\n\textending hit: %s at position %d\n", a, (dbg.substr(bi, mlen)).chars(), ai);
878     // }
879 gpertea 4 int a_l=ai; //alignment coordinates on a
880     int a_r=ai+mlen-1;
881     int b_l=bi; //alignment coordinates on b
882     int b_r=bi+mlen-1;
883     int ai_maxscore=ai;
884     int bi_maxscore=bi;
885     int score=mlen*a_m_score;
886     int maxscore=score;
887 gpertea 9 int mism5score=a_mis_score;
888     if (end5 && ai<(alen>>1)) mism5score-=2; // increase penalty for mismatches at 5' end
889 gpertea 4 //try to extend to the left first, if possible
890     while (ai>0 && bi>0) {
891     ai--;
892     bi--;
893     score+= (a[ai]==b[bi])? a_m_score : mism5score;
894     if (score>maxscore) {
895     ai_maxscore=ai;
896     bi_maxscore=bi;
897     maxscore=score;
898     }
899     else if (maxscore-score>a_dropoff_score) break;
900     }
901     a_l=ai_maxscore;
902     b_l=bi_maxscore;
903 gpertea 9 //if (debug) GMessage(" after l-extend: %*s%s\t\t(score=%d)\n",a_l," ",dbg.substr(b_l,b_r-b_l+1).chars(),maxscore);
904 gpertea 4 //now extend to the right
905     ai_maxscore=a_r;
906     bi_maxscore=b_r;
907     ai=a_r;
908     bi=b_r;
909     score=maxscore;
910     //sometimes there are extra AAAAs at the end of the read, ignore those
911     if (strcmp(&a[alen-4],"AAAA")==0) {
912     alen-=3;
913     while (a[alen-1]=='A' && alen>ai) alen--;
914     }
915     while (ai<alen-1 && bi<blen-1) {
916     ai++;
917     bi++;
918     //score+= (a[ai]==b[bi])? a_m_score : a_mis_score;
919     if (a[ai]==b[bi]) { //match
920     score+=a_m_score;
921     if (ai>=alen-2) {
922     score+=a_m_score-(alen-ai-1);
923     }
924     }
925     else { //mismatch
926     score+=a_mis_score;
927     }
928     if (score>maxscore) {
929     ai_maxscore=ai;
930     bi_maxscore=bi;
931     maxscore=score;
932     }
933     else if (maxscore-score>a_dropoff_score) break;
934     }
935     a_r=ai_maxscore;
936     b_r=bi_maxscore;
937 gpertea 9 int a_ovh3=alen-a_r-1;
938     int b_ovh3=blen-b_r-1;
939     int mmovh3=(a_ovh3<b_ovh3)? a_ovh3 : b_ovh3;
940     int mmovh5=(a_l<b_l)? a_l : b_l;
941     //if (debug) GMessage(" after r-extend: %*s%s\t\t(score=%d)\n",a_l," ",dbg.substr(b_l,b_r-b_l+1).chars(),maxscore);
942 gpertea 12 #ifdef DEBUG
943     if (debug) GMessage(" extended to: %*s\n",a_r+1,dbg.substr(b_l,b_r-b_l+1).chars());
944     #endif
945 gpertea 9 if (maxscore>=a_min_score && mmovh3<2 && mmovh5<2) {
946     if (a_l<a_ovh3) {
947 gpertea 4 //adapter closer to the left end (typical for 5' adapter)
948     l5=a_r+1;
949     l3=alen-1;
950     }
951     else {
952     //adapter matching at the right end (typical for 3' adapter)
953     l5=0;
954     l3=a_l-1;
955     }
956     return true;
957     }
958 gpertea 12 else { //keep this segment pair for later (gapped alignment)
959     segs.addSegPair(new CSegPair(a_l+1, a_r+1, b_l+1, b_r+1, maxscore));
960     //this will also update min & max coordinates in segs (segs.astart, .aend, .bstart, .bend)
961     }
962 gpertea 11 //do not trim:
963     l5=0;
964     l3=alen-1;
965 gpertea 4 return false;
966     }
967    
968 gpertea 12 /*
969     int getWordValue(const char* s, int wlen) {
970     int r=0;
971     while (wlen--) { r+=(((int)s[wlen])<<wlen) }
972     return r;
973     }
974     */
975     int get3mer_value(const char* s) {
976     return (s[0]<<16)+(s[1]<<8)+s[2];
977     }
978    
979     int w3_match(int qv, const char* str, int slen, int start_index=0) {
980     if (start_index>=slen || start_index<0) return -1;
981     for (int i=start_index;i<slen-3;i++) {
982     int rv=get3mer_value(str+i);
983     if (rv==qv) return i;
984     }
985     return -1;
986     }
987    
988     int w3_rmatch(int qv, const char* str, int slen, int end_index=-1) {
989     if (end_index>=slen) return -1;
990     if (end_index<0) end_index=slen-1;
991     for (int i=end_index-2;i>=0;i--) {
992     int rv=get3mer_value(str+i);
993     if (rv==qv) return i;
994     }
995     return -1;
996     }
997    
998     int fast4match(int32 qv, const char* str, int slen, int start_index=0) {
999     if (start_index>=slen || start_index<0) return -1;
1000     for (int i=start_index;i<slen-4;i++) {
1001     int32* rv=(int32*)(str+i);
1002     if (*rv==qv) return i;
1003     }
1004     return -1;
1005     }
1006    
1007     int fast4rmatch(int32 qv, const char* str, int slen, int end_index=-1) {
1008     if (end_index>=slen) return -1;
1009     if (end_index<0) end_index=slen-1;
1010     for (int i=end_index-3;i>=0;i--) {
1011     int32* rv=(int32*)(str+i);
1012     if (*rv==qv) return i;
1013     }
1014     return -1;
1015     }
1016    
1017     #ifdef DEBUG
1018     void dbgPrintChain(CSegChain& chain, const char* aseq) {
1019     GStr s(aseq);
1020     for (int i=0;i<chain.Count();i++) {
1021     CSegPair& seg=*chain[i];
1022     GMessage(" dbg chain seg%d: %*s [%d-%d:%d-%d]\n",i,seg.a.start-1+seg.len(),
1023     s.substr(seg.b.start-1, seg.len()).chars(), seg.b.start,seg.b.end,seg.a.start,seg.a.end);
1024     }
1025     }
1026     #endif
1027    
1028 gpertea 4 bool trim_adapter3(GStr& seq, int&l5, int &l3) {
1029     int rlen=seq.length();
1030     l5=0;
1031     l3=rlen-1;
1032     //first try a full match, we might get lucky
1033     int fi=-1;
1034     if ((fi=seq.index(adapter3))>=0) {
1035     if (fi<rlen-fi-a3len) {//match is closer to the right end
1036     l5=fi+a3len;
1037     l3=rlen-1;
1038     }
1039     else {
1040     l5=0;
1041 gpertea 12 l3=fi-1;
1042 gpertea 4 }
1043     return true;
1044     }
1045 gpertea 12 #ifdef DEBUG
1046     if (debug) GMessage(">TRIM3 >> Read: %s\n",seq.chars());
1047     #endif
1048    
1049 gpertea 4 //also, for fast detection of other adapter-only reads that start past
1050     // the beginning of the adapter sequence, try to see if the first a3len-4
1051 gpertea 12 // bases of the read are a substring of the adapter
1052     if (rlen>a3len-3) {
1053     GStr rstart=seq.substr(1,a3len-4);
1054     if ((fi=adapter3.index(rstart))>=0) {
1055     l3=rlen-1;
1056     l5=a3len-4;
1057     while (fi+l5<a3len && l5<l3 && adapter3[fi+l5]==seq[l5]) l5++;
1058     return true;
1059     }
1060     }
1061     CSegChain a3segs; //no chains here, just an ordered collection of segment pairs
1062     //check the easy cases - 11 bases exact match at the end
1063     int fdlen=11;
1064     if (a3len<16) {
1065     fdlen=a3len>>1;
1066 gpertea 4 }
1067 gpertea 12 if (fdlen>4) {
1068     //check if we're lucky enough to have the last 11 bases of the read a part of the adapter
1069     GStr rstart=seq.substr(-fdlen-3,fdlen);
1070     if ((fi=adapter3.index(rstart))>=0) {
1071     #ifdef DEBUG
1072     if (debug) GMessage(" W11match found: %*s\n", rlen-3, (adapter3.substr(fi,fdlen)).chars());
1073     #endif
1074     if (extendMatch(seq.chars(), rlen, rlen-fdlen-3,
1075     adapter3.chars(), a3len, fi, fdlen, l5,l3, a3segs))
1076     return true;
1077     }
1078     //another easy case: first 11 characters of the adaptor found as a substring of the read
1079     GStr bstr=adapter3.substr(0, fdlen);
1080     if ((fi=seq.rindex(bstr))>=0) {
1081     #ifdef DEBUG
1082     if (debug) GMessage(" A11match found: %*s\n", fi+fdlen, bstr.chars());
1083     #endif
1084     if (extendMatch(seq.chars(), rlen, fi,
1085     adapter3.chars(), a3len, 0, fdlen, l5,l3, a3segs))
1086     return true;
1087     }
1088     } //tried to match 11 bases first
1089    
1090     //no easy cases, so let's do the wmer hashing for the first 12 bases of the adaptor
1091     //-- only extend if the match is in the 3' (ending) region of the read
1092     int wordSize=3;
1093     int hlen=12;
1094     if (hlen>a3len-wordSize) hlen=a3len-wordSize;
1095     int imin=rlen>>1; //last half of the read, left boundary for the wmer match
1096     if (imin<a3len) { imin=GMIN(a3len, rlen-wordSize); }
1097     imin=rlen-imin;
1098     for (int iw=0;iw<hlen;iw++) {
1099     //int32* qv=(int32*)(adapter3.chars()+iw);
1100     int qv=get3mer_value(adapter3.chars()+iw);
1101     fi=-1;
1102     //while ((fi=fast4rmatch(*qv, seq.chars(), rlen, fi))>=0 && fi>=imin) {
1103     while ((fi=w3_rmatch(qv, seq.chars(), rlen, fi))>=0 && fi>=imin) {
1104     //GMessage(" ... fi=%d after w3_rmatch() (imin=%d)\n", fi, imin);
1105    
1106     #ifdef DEBUG
1107     if (debug) GMessage(" Wmatch found: %*s\n", fi+wordSize, (adapter3.substr(iw,wordSize)).chars());
1108     #endif
1109     if (extendMatch(seq.chars(), rlen, fi, adapter3.chars(),
1110     a3len, iw, wordSize, l5,l3, a3segs)) return true;
1111     fi--;
1112     if (fi<imin) break;
1113     }
1114     } //for each wmer in the first hlen bases of the adaptor
1115     /*
1116     //couldn't find a good trimming extension, hash 12 more bases of the adapter to collect more segment pairs there
1117     //but only do this if we already have segment pairs collected in the last 12 bases of the adapter
1118     if (a3segs.bstart>3 || a3segs.bend<(uint)(hlen-wordSize)) return false;
1119     int hlen2=a3len-wordSize;
1120     //if (hlen2>a3len-4) hlen2=a3len-4;
1121     if (hlen2>hlen) {
1122     #ifdef DEBUG
1123     if (debug && a3segs.Count()>0) {
1124     GMessage(" >>>>>2nd. hash: %s\n",seq.chars());
1125     }
1126     #endif
1127     for (int iw=hlen;iw<hlen2;iw++) {
1128     //int* qv=(int32 *)(adapter3.chars()+iw);
1129     int qv=get3mer_value(adapter3.chars()+iw);
1130     fi=-1;
1131     //while ((fi=fast4rmatch(*qv, seq.chars(), rlen, fi))>=0 && fi>=imin) {
1132     while ((fi=w3_rmatch(qv, seq.chars(), rlen, fi))>=0 && fi>=imin) {
1133     extendMatch(seq.chars(), rlen, fi, adapter3.chars(),
1134     a3len, iw, wordSize, l5,l3, a3segs);
1135     fi--;
1136     if (fi<imin) break;
1137     }
1138     } //for each wmer between hlen2 and hlen bases of the adaptor
1139     }
1140     //lastly, analyze collected a3segs for a possible gapped alignment:
1141     GList<CSegChain> segchains(false,true,false);
1142     #ifdef DEBUG
1143     if (debug && a3segs.Count()>0) {
1144     GMessage(">>>>>>>>> Read: %s\n",seq.chars());
1145 gpertea 4 }
1146 gpertea 12 #endif
1147     for (int i=0;i<a3segs.Count();i++) {
1148     if (a3segs[i]->chain==NULL) {
1149     if (a3segs[i]->b.start>3) continue; //don't start a hopeless chain
1150     CSegChain* newchain=new CSegChain();
1151     newchain->setFreeItem(false);
1152     newchain->addSegPair(a3segs[i]);
1153     a3segs[i]->chain=newchain;
1154     segchains.Add(newchain); //just to free them when done
1155     }
1156     for (int j=i+1;j<a3segs.Count();j++) {
1157     CSegChain* chain=a3segs[i]->chain;
1158     if (chain->extendChain(a3segs[j])) {
1159     a3segs[j]->chain=chain;
1160     #ifdef DEBUG
1161     if (debug) dbgPrintChain(*chain, adapter3.chars());
1162     #endif
1163     //save time by checking here if the extended chain is already acceptable for trimming
1164     if (chain->aend>(uint)(rlen-4) && chain->bstart<4 && chain->score>a_min_chain_score) {
1165     l5=0;
1166     l3=chain->astart-2;
1167     #ifdef DEBUG
1168     if (debug && a3segs.Count()>0) {
1169     GMessage(">>> >> trimmed-3: %*s\n",l3-l5+1,seq.substr(l5,l3-l5+1).chars());
1170     }
1171     #endif
1172     return true;
1173     }
1174     } //chain can be extended
1175 gpertea 4 }
1176 gpertea 12 } //collect segment alignments into chains
1177     */
1178 gpertea 4 return false; //no adapter parts found
1179     }
1180    
1181     bool trim_adapter5(GStr& seq, int&l5, int &l3) {
1182 gpertea 9 //if (debug) GMessage("trim_adapter5 on: %s\n", seq.chars());
1183 gpertea 4 int rlen=seq.length();
1184     l5=0;
1185     l3=rlen-1;
1186     //try to see if adapter is fully included in the read
1187     int fi=-1;
1188     if ((fi=seq.index(adapter5))>=0) {
1189     if (fi<rlen-fi-a5len) {//match is closer to the right end
1190     l5=fi+a5len;
1191     l3=rlen-1;
1192     }
1193     else {
1194     l5=0;
1195     l3=fi-1;
1196     }
1197     return true;
1198     }
1199 gpertea 12 #ifdef DEBUG
1200     if (debug) GMessage(">TRIM5 >> Read: %s\n",seq.chars());
1201     #endif
1202    
1203     CSegChain a5segs(true); //list of segment pairs to analyze later if no extendMatch succeeded
1204    
1205     //try the easy way out first - look for an exact match of 11 bases
1206     int fdlen=11;
1207     if (a5len<16) {
1208     fdlen=a5len>>1;
1209 gpertea 4 }
1210 gpertea 12 if (fdlen>4) {
1211     GStr rstart=seq.substr(1,fdlen); //skip the first base as it's sometimes bogus
1212     if ((fi=adapter5.index(rstart))>=0) {
1213     #ifdef DEBUG
1214     if (debug) GMessage(" W11match found: %*s\n", 1+fdlen, (adapter3.substr(fi,fdlen)).chars());
1215     #endif
1216     if (extendMatch(seq.chars(), rlen, 1,
1217     adapter5.chars(), a5len, fi, fdlen, l5,l3, a5segs, true))
1218     return true;
1219     }
1220     //another easy case: last 11 characters of the adaptor found as a substring of the read
1221     GStr bstr=adapter5.substr(-fdlen);
1222     if ((fi=seq.index(bstr))>=0) {
1223     #ifdef DEBUG
1224     if (debug) GMessage(" A11match found: %*s\n", fi+fdlen, bstr.chars());
1225     #endif
1226     if (extendMatch(seq.chars(), rlen, fi,
1227     adapter5.chars(), a5len, a5len-fdlen, fdlen, l5,l3,a5segs,true))
1228     return true;
1229     }
1230     } //tried to matching at most 11 bases first
1231    
1232     //-- no easy cases, do the wmer hashing for the last 12 bases of the adaptor
1233     //-- only extend a wmer if it matches in the 5' (beginning) region of the read
1234     int wordSize=3;
1235     int hlen=12;
1236     if (hlen>a5len-wordSize) hlen=a5len-wordSize;
1237     int imax=rlen>>1; //first half of the read, right boundary for the wmer match
1238     if (imax<a5len) { imax=GMIN(a5len, rlen-wordSize); }
1239     for (int iw=0;iw<=hlen;iw++) {
1240     int apstart=a5len-iw-wordSize;
1241     fi=0;
1242     //int* qv=(int32 *)(adapter5.chars()+apstart);
1243     int qv=get3mer_value(adapter5.chars()+apstart);
1244     //while ((fi=fast4match(*qv, seq.chars(), rlen, fi))>=0 && fi<=imax) {
1245     while ((fi=w3_match(qv, seq.chars(), rlen, fi))>=0 && fi<=imax) {
1246     #ifdef DEBUG
1247     if (debug) GMessage(" Wmatch found: %*s\n", fi+wordSize, (adapter5.substr(apstart,wordSize)).chars());
1248     #endif
1249     if (extendMatch(seq.chars(), rlen, fi, adapter5.chars(),
1250     a5len, apstart, wordSize, l5,l3, a5segs, true)) return true;
1251     fi++;
1252     if (fi>imax) break;
1253     }
1254     } //for each wmer in the last hlen bases of the adaptor
1255     /*
1256    
1257     //couldn't find a good trimming extension, hash 12 more bases of the adapter to collect more segment pairs there
1258     //but only do this if we already have segment pairs collected in the last 12 bases of the adapter
1259     if (a5segs.bend<(uint)(a5len-3) || a5segs.bstart>(uint)(a5len-hlen+4)) return false;
1260     int hlen2=a5len-wordSize;
1261     //if (hlen2>a5len-wordSize) hlen2=a5len-wordSize;
1262     #ifdef DEBUG
1263     if (debug && a5segs.Count()>0) {
1264     GMessage(" >>>>>2nd. hash: %s\n",seq.chars());
1265     }
1266     #endif
1267     if (hlen2>hlen) {
1268     for (int iw=hlen+1;iw<=hlen2;iw++) {
1269     int apstart=a5len-iw-wordSize;
1270     fi=0;
1271     //int* qv=(int32 *)(adapter5.chars()+apstart);
1272     int qv=get3mer_value(adapter5.chars()+apstart);
1273     //while ((fi=fast4match(*qv, seq.chars(), rlen, fi))>=0 && fi<=imax) {
1274     while ((fi=w3_match(qv, seq.chars(), rlen, fi))>=0 && fi<=imax) {
1275     extendMatch(seq.chars(), rlen, fi, adapter5.chars(),
1276     a5len, apstart, wordSize, l5,l3, a5segs, true);
1277     fi++;
1278     if (fi>imax) break;
1279     }
1280     } //for each wmer between hlen2 and hlen bases of the adaptor
1281     }
1282     if (a5segs.bend<(uint)(a5len-3) || a5segs.bstart>(uint)(a5len-hlen+4)) return false;
1283     // lastly, analyze collected a5segs for a possible gapped alignment:
1284     GList<CSegChain> segchains(false,true,false);
1285     #ifdef DEBUG
1286     if (debug && a5segs.Count()>0) {
1287     GMessage(">>>>>>>>> Read: %s\n",seq.chars());
1288 gpertea 4 }
1289 gpertea 12 #endif
1290     for (int i=0;i<a5segs.Count();i++) {
1291     if (a5segs[i]->chain==NULL) {
1292     if (a5segs[i]->b.end<(int)(a5len-4)) continue; //don't start a hopeless chain
1293     CSegChain* newchain=new CSegChain(true);
1294     newchain->setFreeItem(false);
1295     newchain->addSegPair(a5segs[i]);
1296     a5segs[i]->chain=newchain;
1297     segchains.Add(newchain); //just to free them when done
1298     }
1299     for (int j=i+1;j<a5segs.Count();j++) {
1300     CSegChain* chain=a5segs[i]->chain;
1301     if (chain->extendChain(a5segs[j])) {
1302     a5segs[j]->chain=chain;
1303     #ifdef DEBUG
1304     if (debug) dbgPrintChain(*chain, adapter5.chars());
1305     #endif
1306     //save time by checking here if the extended chain is already acceptable for trimming
1307     if (chain->bend>(uint)(a5len-3) && chain->astart<4 && chain->score>a_min_chain_score) {
1308     l5=chain->aend;
1309     l3=rlen-1;
1310     return true;
1311     }
1312     } //chain can be extended
1313 gpertea 4 }
1314 gpertea 12 } //collect segment alignments into chains
1315     */
1316 gpertea 4 return false; //no adapter parts found
1317     }
1318    
1319 gpertea 12 //convert qvs to/from phred64 from/to phread33
1320     void convertPhred(GStr& q) {
1321     for (int i=0;i<q.length();i++) q[i]+=qv_cvtadd;
1322 gpertea 4 }
1323    
1324 gpertea 12 void convertPhred(char* q, int len) {
1325     for (int i=0;i<len;i++) q[i]+=qv_cvtadd;
1326 gpertea 4 }
1327    
1328 gpertea 13 bool getFastxRec(GLineReader& fq, GStr& rseq, GStr& rqv,
1329     GStr& rname, GStr& rinfo, GStr& infname) {
1330     rseq="";
1331     rqv="";
1332     rname="";
1333     rinfo="";
1334     if (fq.eof()) return false;
1335     char* l=fq.getLine();
1336     while (l!=NULL && (l[0]==0 || isspace(l[0]))) l=fq.getLine(); //ignore empty lines
1337     if (l==NULL) return false;
1338     /* if (rawFormat) {
1339     //TODO: implement raw qseq parsing here
1340     //if (raw type=N) then continue; //skip invalid/bad records
1341     } //raw qseq format
1342     else { // FASTQ or FASTA */
1343     isfasta=(l[0]=='>');
1344     if (!isfasta && l[0]!='@') GError("Error: fasta/fastq record marker not found(%s)\n%s\n",
1345     infname.chars(), l);
1346     GStr s(l);
1347     rname=&(l[1]);
1348     for (int i=0;i<rname.length();i++)
1349     if (rname[i]<=' ') {
1350     if (i<rname.length()-2) rinfo=rname.substr(i+1);
1351     rname.cut(i);
1352     break;
1353     }
1354     //now get the sequence
1355     if ((l=fq.getLine())==NULL)
1356     GError("Error: unexpected EOF after header for read %s (%s)\n",
1357     rname.chars(), infname.chars());
1358     rseq=l; //this must be the DNA line
1359     while ((l=fq.getLine())!=NULL) {
1360     //seq can span multiple lines
1361     if (l[0]=='>' || l[0]=='+') {
1362     fq.pushBack();
1363     break; //
1364     }
1365     rseq+=l;
1366     } //check for multi-line seq
1367     if (!isfasta) { //reading fastq quality values, which can also be multi-line
1368     if ((l=fq.getLine())==NULL)
1369     GError("Error: unexpected EOF after sequence for %s\n", rname.chars());
1370     if (l[0]!='+') GError("Error: fastq qv header marker not detected!\n");
1371     if ((l=fq.getLine())==NULL)
1372     GError("Error: unexpected EOF after qv header for %s\n", rname.chars());
1373     rqv=l;
1374     //if (rqv.length()!=rseq.length())
1375     // GError("Error: qv len != seq len for %s\n", rname.chars());
1376     while (rqv.length()<rseq.length() && ((l=fq.getLine())!=NULL)) {
1377     rqv+=l; //append to qv string
1378     }
1379     }// fastq
1380     // } //<-- FASTA or FASTQ
1381     rseq.upper(); //TODO: what if we care about masking?
1382     return true;
1383     }
1384    
1385     char process_read(GStr& rname, GStr& rseq, GStr& rqv, int &l5, int &l3) {
1386     //returns 0 if the read was untouched, 1 if it was just trimmed
1387     // and a trash code if it was trashed
1388     l5=0;
1389     l3=rseq.length()-1;
1390     if (l3-l5+1<min_read_len) {
1391     return 's';
1392     }
1393     GStr wseq(rseq.chars());
1394     GStr wqv(rqv.chars());
1395     int w5=l5;
1396     int w3=l3;
1397     //first do the q-based trimming
1398     if (qvtrim_qmin!=0 && !wqv.is_empty() && qtrim(wqv, w5, w3)) { // qv-threshold trimming
1399     if (w3-w5+1<min_read_len) {
1400     return 'Q'; //invalid read
1401     }
1402     //-- keep only the w5..w3 range
1403     l5=w5;
1404     l3=w3;
1405     wseq=wseq.substr(w5, w3-w5+1);
1406     if (!wqv.is_empty())
1407     wqv=wqv.substr(w5, w3-w5+1);
1408     } //qv trimming
1409     // N-trimming on the remaining read seq
1410     if (ntrim(wseq, w5, w3)) {
1411     //GMessage("before: %s\n",wseq.chars());
1412     //GMessage("after : %s (%d)\n",wseq.substr(w5,w3-w5+1).chars(),w3-w5+1);
1413     l5+=w5;
1414     l3-=(wseq.length()-1-w3);
1415     if (w3-w5+1<min_read_len) {
1416     return 'N'; //to be trashed
1417     }
1418     //-- keep only the w5..w3 range
1419     wseq=wseq.substr(w5, w3-w5+1);
1420     if (!wqv.is_empty())
1421     wqv=wqv.substr(w5, w3-w5+1);
1422     w5=0;
1423     w3=wseq.length()-1;
1424     }
1425     if (a3len>0) {
1426     if (trim_adapter3(wseq, w5, w3)) {
1427     int trimlen=wseq.length()-(w3-w5+1);
1428     num_trimmed3++;
1429     if (trimlen<min_trimmed3)
1430     min_trimmed3=trimlen;
1431     l5+=w5;
1432     l3-=(wseq.length()-1-w3);
1433     if (w3-w5+1<min_read_len) {
1434     return '3';
1435     }
1436     //-- keep only the w5..w3 range
1437     wseq=wseq.substr(w5, w3-w5+1);
1438     if (!wqv.is_empty())
1439     wqv=wqv.substr(w5, w3-w5+1);
1440     }//some adapter was trimmed
1441     } //adapter trimming
1442     if (a5len>0) {
1443     if (trim_adapter5(wseq, w5, w3)) {
1444     int trimlen=wseq.length()-(w3-w5+1);
1445     num_trimmed5++;
1446     if (trimlen<min_trimmed5)
1447     min_trimmed5=trimlen;
1448     l5+=w5;
1449     l3-=(wseq.length()-1-w3);
1450     if (w3-w5+1<min_read_len) {
1451     return '5';
1452     }
1453     //-- keep only the w5..w3 range
1454     wseq=wseq.substr(w5, w3-w5+1);
1455     if (!wqv.is_empty())
1456     wqv=wqv.substr(w5, w3-w5+1);
1457     }//some adapter was trimmed
1458     } //adapter trimming
1459     if (doCollapse) {
1460     //keep read for later
1461     FqDupRec* dr=dhash.Find(wseq.chars());
1462     if (dr==NULL) { //new entry
1463     //if (prefix.is_empty())
1464     dhash.Add(wseq.chars(),
1465     new FqDupRec(&wqv, rname.chars()));
1466     //else dhash.Add(wseq.chars(), new FqDupRec(wqv.chars(),wqv.length()));
1467     }
1468     else
1469     dr->add(wqv);
1470     } //collapsing duplicates
1471     else { //not collapsing duplicates
1472     //apply the dust filter now
1473     if (doDust) {
1474     int dustbases=dust(wseq);
1475     if (dustbases>(wseq.length()>>1)) {
1476     return 'D';
1477     }
1478     }
1479     } //not collapsing duplicates
1480     return (l5>0 || l3<rseq.length()-1) ? 1 : 0;
1481     }
1482    
1483     void printHeader(FILE* f_out, char recmarker, GStr& rname, GStr& rinfo) {
1484     //GMessage("printing Header..%c%s\n",recmarker, rname.chars());
1485     if (rinfo.is_empty()) fprintf(f_out, "%c%s\n",recmarker,rname.chars());
1486     else fprintf(f_out, "%c%s %s\n",recmarker, rname.chars(), rinfo.chars());
1487     }
1488    
1489     void writeRead(FILE* f_out, GStr& rname, GStr& rinfo, GStr& rseq, GStr& rqv, int& outcounter) {
1490     outcounter++;
1491     bool asFasta=(rqv.is_empty() || fastaOutput);
1492     if (asFasta) {
1493     if (prefix.is_empty()) {
1494     printHeader(f_out, '>',rname,rinfo);
1495     fprintf(f_out, "%s\n", rseq.chars()); //plain one-line fasta for now
1496     }
1497     else {
1498     fprintf(f_out, ">%s%08d\n%s\n", prefix.chars(), outcounter,
1499     rseq.chars());
1500     }
1501     }
1502     else { //fastq
1503     if (convert_phred) convertPhred(rqv);
1504     if (prefix.is_empty()) {
1505     printHeader(f_out, '@', rname, rinfo);
1506     fprintf(f_out, "%s\n+\n%s\n", rseq.chars(), rqv.chars());
1507     }
1508     else
1509     fprintf(f_out, "@%s_%08d\n%s\n+\n%s\n", prefix.chars(), outcounter,
1510     rseq.chars(),rqv.chars() );
1511     }
1512     }
1513    
1514     void trash_report(char trashcode, GStr& rname, FILE* freport) {
1515     if (freport==NULL || trashcode<=' ') return;
1516     if (trashcode=='3' || trashcode=='5') {
1517     fprintf(freport, "%s\tA%c\n",rname.chars(),trashcode);
1518     }
1519     else {
1520     fprintf(freport, "%s\t%c\n",rname.chars(),trashcode);
1521     }
1522     //tcounter++;
1523     }
1524    
1525     GStr getFext(GStr& s, int* xpos=NULL) {
1526     //s must be a filename without a path
1527     GStr r("");
1528     if (xpos!=NULL) *xpos=0;
1529     if (s.is_empty() || s=="-") return r;
1530     int p=s.rindex('.');
1531     int d=s.rindex('/');
1532     if (p<=0 || p>s.length()-2 || p<s.length()-7 || p<d) return r;
1533     r=s.substr(p+1);
1534     if (xpos!=NULL) *xpos=p+1;
1535     r.lower();
1536     return r;
1537     }
1538    
1539     void baseFileName(GStr& fname) {
1540     //remove all known extensions, like .txt,fq,fastq,fasta,fa)(.gz .gzip .bz2 .bzip2) .
1541     int xpos=0;
1542     GStr fext=getFext(fname, &xpos);
1543     if (xpos<=1) return;
1544     bool extdel=false;
1545     GStr f2;
1546     if (fext=="z" || fext=="zip") {
1547     extdel=true;
1548     }
1549     else if (fext.length()>=2) {
1550     f2=fext.substr(0,2);
1551     extdel=(f2=="gz" || f2=="bz");
1552     }
1553     if (extdel) {
1554     fname.cut(xpos-1);
1555     fext=getFext(fname, &xpos);
1556     if (xpos<=1) return;
1557     }
1558     extdel=false;
1559     if (fext=="f" || fext=="fq" || fext=="txt" || fext=="seq" || fext=="sequence") {
1560     extdel=true;
1561     }
1562     else if (fext.length()>=2) {
1563     extdel=(fext.substr(0,2)=="fa");
1564     }
1565     if (extdel) fname.cut(xpos-1);
1566     GStr fncp(fname);
1567     fncp.lower();
1568     fncp.chomp("seq");
1569     fncp.chomp("sequence");
1570     fncp.trimR("_.");
1571     if (fncp.length()<fname.length()) fname.cut(fncp.length());
1572     }
1573    
1574     FILE* prepOutFile(GStr& infname, GStr& pocmd) {
1575     FILE* f_out=NULL;
1576     GStr fname(getFileName(infname.chars()));
1577     //eliminate known extensions
1578     baseFileName(fname);
1579     if (outsuffix.is_empty() || outsuffix=="-") { return stdout; }
1580     else if (pocmd.is_empty()) {
1581     GStr oname(fname);
1582     oname.append('.');
1583     oname.append(outsuffix);
1584     f_out=fopen(oname.chars(),"w");
1585     if (f_out==NULL) GError("Error: cannot create '%s'\n",oname.chars());
1586     }
1587     else {
1588     GStr oname(">");
1589     oname.append(fname);
1590     oname.append('.');
1591     oname.append(outsuffix);
1592     pocmd.append(oname);
1593     f_out=popen(pocmd.chars(), "w");
1594     if (f_out==NULL) GError("Error: cannot popen '%s'\n",pocmd.chars());
1595     }
1596     return f_out;
1597     }
1598    
1599     void guess_unzip(GStr& fname, GStr& picmd) {
1600     GStr fext=getFext(fname);
1601     if (fext=="gz" || fext=="gzip" || fext=="z") {
1602     picmd="gzip -cd ";
1603     }
1604     else if (fext=="bz2" || fext=="bzip2" || fext=="bz" || fext=="bzip") {
1605     picmd="bzip2 -cd ";
1606     }
1607     }
1608    
1609     void setupFiles(FILE*& f_in, FILE*& f_in2, FILE*& f_out, FILE*& f_out2,
1610     GStr& s, GStr& infname, GStr& infname2) {
1611     // uses outsuffix to generate output file names and open file handles as needed
1612     infname="";
1613     infname2="";
1614     f_in=NULL;
1615     f_in2=NULL;
1616     f_out=NULL;
1617     f_out2=NULL;
1618     //analyze outsuffix intent
1619     GStr pocmd;
1620 gpertea 76 if (outsuffix=="-") {
1621     f_out=stdout;
1622     }
1623     else {
1624     GStr ox=getFext(outsuffix);
1625     if (ox.length()>2) ox=ox.substr(0,2);
1626     if (ox=="gz") pocmd="gzip -9 -c ";
1627     else if (ox=="bz") pocmd="bzip2 -9 -c ";
1628     }
1629 gpertea 13 if (s=="-") {
1630     f_in=stdin;
1631     infname="stdin";
1632     f_out=prepOutFile(infname, pocmd);
1633     return;
1634     } // streaming from stdin
1635     s.startTokenize(",:");
1636     s.nextToken(infname);
1637     bool paired=s.nextToken(infname2);
1638     if (fileExists(infname.chars())==0)
1639     GError("Error: cannot find file %s!\n",infname.chars());
1640     GStr fname(getFileName(infname.chars()));
1641     GStr picmd;
1642     guess_unzip(fname, picmd);
1643     if (picmd.is_empty()) {
1644     f_in=fopen(infname.chars(), "r");
1645     if (f_in==NULL) GError("Error opening file '%s'!\n",infname.chars());
1646     }
1647     else {
1648     picmd.append(infname);
1649     f_in=popen(picmd.chars(), "r");
1650     if (f_in==NULL) GError("Error at popen %s!\n", picmd.chars());
1651     }
1652 gpertea 76 if (f_out==stdout) {
1653     if (paired) GError("Error: output suffix required for paired reads\n");
1654     return;
1655     }
1656 gpertea 13 f_out=prepOutFile(infname, pocmd);
1657     if (!paired) return;
1658     if (doCollapse) GError("Error: sorry, -C option cannot be used with paired reads!\n");
1659     // ---- paired reads:-------------
1660     if (fileExists(infname2.chars())==0)
1661     GError("Error: cannot find file %s!\n",infname2.chars());
1662     picmd="";
1663     GStr fname2(getFileName(infname2.chars()));
1664     guess_unzip(fname2, picmd);
1665     if (picmd.is_empty()) {
1666     f_in2=fopen(infname2.chars(), "r");
1667     if (f_in2==NULL) GError("Error opening file '%s'!\n",infname2.chars());
1668     }
1669     else {
1670     picmd.append(infname2);
1671     f_in2=popen(picmd.chars(), "r");
1672     if (f_in2==NULL) GError("Error at popen %s!\n", picmd.chars());
1673     }
1674     f_out2=prepOutFile(infname2, pocmd);
1675     }