ViewVC Help
View File | Revision Log | Show Annotations | View Changeset | Root Listing
root/gclib/fqtrim/fqtrim.cpp
Revision: 88
Committed: Mon Oct 3 16:34:03 2011 UTC (7 years, 11 months ago) by gpertea
File size: 54110 byte(s)
Log Message:
starting to add polyA/T trimming

Line User Rev File contents
1 gpertea 4 #include "GArgs.h"
2     #include "GStr.h"
3     #include "GHash.hh"
4     #include "GList.hh"
5 gpertea 13 #include <ctype.h>
6 gpertea 4
7     #define USAGE "Usage:\n\
8 gpertea 76 fqtrim [{-5 <5adapter> -3 <3adapter>|-f <adapters_file>}] [-a <min_matchlen>]\\\n\
9     [-q <minq> [-t <trim_max_len>]] [-p {64|33}] [-o <outsuffix>]\\\n\
10     [-l <minlen>] [-C] [-D] [-Q] [-n <rename_prefix>] [-r <discarded.lst>]\\\n\
11     <input.fq>[,<input_mates.fq>\n\
12 gpertea 4 \n\
13 gpertea 76 Trim low quality bases at the 3' end and can trim adapter sequence(s), filter\n\
14     for low complexity and collapse duplicate reads.\n\
15 gpertea 13 If read pairs should be trimmed and kept together (i.e. without discarding\n\
16     one read in a pair), the two file names should be given delimited by a comma\n\
17     or a colon character\n\
18 gpertea 4 \n\
19     Options:\n\
20     -n rename all the reads using the <prefix> followed by a read counter;\n\
21     if -C option was given, the suffix \"_x<N>\" is appended, with <N> being\n\
22     the read duplication count\n\
23 gpertea 76 -o unless this parameter is '-', write the trimmed/filtered reads to \n\
24     file(s) named <input>.<outsuffix> which will be created in the \n\
25     current (working) directory; (writes to stdout if -o- is given);\n\
26     a suffix ending with .gz, .gzip or .bz2 will enforce compression\n\
27     -f file with adapter sequences to trim, each line having this format:\n\
28     <5'-adapter-sequence> <3'-adapter-sequence>\n\
29 gpertea 4 -5 trim the given adapter or primer sequence at the 5' end of each read\n\
30     (e.g. -5 CGACAGGTTCAGAGTTCTACAGTCCGACGATC)\n\
31     -3 trim the given adapter sequence at the 3' end of each read\n\
32     (e.g. -3 TCGTATGCCGTCTTCTGCTTG)\n\
33 gpertea 76 -a minimum length of exact match to adaptor sequence at the proper end (6)\n\
34 gpertea 13 -q trim bases with quality value lower than <minq> (starting at the 3' end)\n\
35 gpertea 76 -t for -q option, maximum trimming at the 3' end is limited to <trim_max_len>\n\
36 gpertea 12 -m maximum percentage of Ns allowed in a read after trimming (default 7)\n\
37     -l minimum \"clean\" length after trimming that a read must have\n\
38     in order to pass the filter (default: 16)\n\
39 gpertea 4 -r write a simple \"trash report\" file listing the discarded reads with a\n\
40     one letter code for the reason why they were trashed\n\
41 gpertea 12 -D apply a low-complexity (dust) filter and discard any read that has over\n\
42     50% of its length detected as low complexity\n\
43 gpertea 4 -C collapse duplicate reads and add a prefix with their count to the read\n\
44 gpertea 12 name (e.g. for microRNAs)\n\
45     -p input is phred64/phred33 (use -p64 or -p33)\n\
46     -Q convert quality values to the other Phred qv type\n\
47     -V verbose processing\n\
48 gpertea 4 "
49 gpertea 13
50     //-z for -o option, the output stream(s) will be first piped into the given\n
51     // <zcmd> command, which must output to stdout (e.g. -z 'bzip2 -9 -c')\n
52    
53    
54 gpertea 4 // example 3' adapter for miRNAs: TCGTATGCCGTCTTCTGCTTG
55 gpertea 12
56 gpertea 76 //For paired reads sequencing:
57 gpertea 12 //3' : ACACTCTTTCCCTACACGACGCTCTTCCGATCT
58     //5' : GATCGGAAGAGCGGTTCAGCAGGAATGCCGAG
59 gpertea 13 //FILE* f_out=NULL; //stdout if not provided
60     //FILE* f_out2=NULL; //for paired reads
61     //FILE* f_in=NULL; //input fastq (stdin if not provided)
62     //FILE* f_in2=NULL; //for paired reads
63 gpertea 4 FILE* freport=NULL;
64 gpertea 13
65 gpertea 4 bool debug=false;
66 gpertea 12 bool verbose=false;
67 gpertea 4 bool doCollapse=false;
68     bool doDust=false;
69 gpertea 13 bool fastaOutput=false;
70 gpertea 4 bool trashReport=false;
71 gpertea 12 //bool rawFormat=false;
72 gpertea 4 int min_read_len=16;
73 gpertea 12 double max_perc_N=7.0;
74 gpertea 4 int dust_cutoff=16;
75     bool isfasta=false;
76 gpertea 12 bool convert_phred=false;
77 gpertea 13 GStr outsuffix; // -o
78 gpertea 76 //GStr adapter3;
79     //GStr adapter5;
80 gpertea 13 GStr prefix;
81     GStr zcmd;
82     int num_trimmed5=0;
83     int num_trimmed3=0;
84     int num_trimmedN=0;
85     int num_trimmedQ=0;
86     int min_trimmed5=INT_MAX;
87     int min_trimmed3=INT_MAX;
88 gpertea 12
89 gpertea 13 int qvtrim_qmin=0;
90     int qvtrim_max=0; //for -q, do not trim at 3'-end more than this number of bases
91 gpertea 12 int qv_phredtype=0; // could be 64 or 33 (0 means undetermined yet)
92     int qv_cvtadd=0; //could be -31 or +31
93    
94 gpertea 4 int a3len=0;
95     int a5len=0;
96     // adaptor matching metrics -- for extendMatch() function
97 gpertea 88 const int match_score=2; //match score
98     const int mismatch_score=-3; //mismatch
99    
100 gpertea 12 const int a_m_score=2; //match score
101     const int a_mis_score=-3; //mismatch
102     const int a_dropoff_score=7;
103 gpertea 88 const int poly_dropoff_score=7;
104 gpertea 76 int a_min_score=12; //an exact match of 6 bases at the proper ends WILL be trimmed
105 gpertea 12 const int a_min_chain_score=15; //for gapped alignments
106 gpertea 4
107 gpertea 12 class CSegChain;
108    
109     class CSegPair {
110     public:
111     GSeg a;
112     GSeg b; //the adapter segment
113     int score;
114     int flags;
115     CSegChain* chain;
116     CSegPair(int astart=0, int aend=0, int bstart=0, int bend=0, int mscore=0):a(astart,aend),b(bstart, bend) {
117     score=mscore;
118     if (score==0) score=a.len()*a_m_score;
119     flags=0;
120     chain=NULL;
121     }
122     int len() { return a.len(); }
123     bool operator==(CSegPair& d){
124     //return (a.start==d.a.start && a.end==d.a.end && b.start==d.b.start && b.end==d.b.end);
125     //make equal even segments that are included into one another:
126     return (d.a.start>=a.start && d.a.end<=a.end && d.b.start>=b.start && d.b.end<=b.end);
127     }
128     bool operator>(CSegPair& d){ //ordering based on b (adaptor) start coord and score
129     if (b.start==d.b.start) {
130     if (score==d.score) {
131     //just try to be consistent:
132     if (b.end==d.b.end) {
133     return (a.start==d.a.start)?(a.end<d.a.end):(a.start<d.a.start);
134     }
135     return (b.end>d.b.end);
136     }
137     else return (score<d.score);
138     }
139     return (b.start>d.b.start);
140     }
141     bool operator<(CSegPair& d){ //ordering based on b (adaptor) coord
142     /*if (b.start==d.b.start && b.end==d.b.end) {
143     return (a.start==d.a.start)?(a.end<d.a.end):(a.start<d.a.start);
144     }
145     return (b.start==d.b.start)?(b.end<d.b.end):(b.start<d.b.start);*/
146     if (b.start==d.b.start) {
147     if (score==d.score) {
148     //just try to be consistent:
149     if (b.end==d.b.end) {
150     return (a.start==d.a.start)?(a.end>d.a.end):(a.start>d.a.start);
151     }
152     return (b.end<d.b.end);
153     }
154     else return (score>d.score);
155     }
156     return (b.start<d.b.start);
157     }
158     };
159    
160     int cmpSegEnds(pointer sa, pointer sb) { //sort by adaptor seg ends AND score
161     CSegPair& x = *(CSegPair *)sa;
162     CSegPair& y = *(CSegPair *)sb;
163     /*
164     if (x.b.end==y.b.end) {
165     if (x.b.start==y.b.start) {
166     if (x.a.end==y.a.end) {
167     if (x.a.start==y.a.start) return 0;
168     return ((x.a.start>y.a.start) ? -1 : 1);
169     }
170     else {
171     return ((x.a.end>y.a.end) ? -1 : 1);
172     }
173     }
174     else {
175     return ((x.b.start>y.b.start) ? -1 : 1);
176     }
177     }
178     else {
179     return ((x.b.end>y.b.end) ? -1 : 1);
180     }
181     */
182     if (x.b.end==y.b.end) {
183     if (x.score==y.score) {
184     if (x.b.start==y.b.start) {
185     if (x.a.end==y.a.end) {
186     if (x.a.start==y.a.start) return 0;
187     return ((x.a.start<y.a.start) ? -1 : 1);
188     }
189     else {
190     return ((x.a.end<y.a.end) ? -1 : 1);
191     }
192     }
193     else {
194     return ((x.b.start<y.b.start) ? -1 : 1);
195     }
196     } else return ((x.score>y.score) ? -1 : 1);
197     }
198     else {
199     return ((x.b.end>y.b.end) ? -1 : 1);
200     }
201    
202     }
203    
204     class CSegChain:public GList<CSegPair> {
205     public:
206     uint astart;
207     uint aend;
208     uint bstart;
209     uint bend;
210     int score;
211     bool endSort;
212     CSegChain(bool aln5=false):GList<CSegPair>(true,true,true) {//sorted, free elements, unique
213     //as SegPairs are inserted, they will be sorted by a.start coordinate
214     score=0;
215     astart=MAX_UINT;
216     aend=0;
217     bstart=MAX_UINT;
218     bend=0;
219     endSort=aln5;
220     if (aln5) { setSorted(cmpSegEnds); }
221     }
222     bool operator==(CSegChain& d) {
223     //return (score==d.score);
224     return (astart==d.astart && aend==d.aend && bstart==d.bstart && bend==d.bend);
225     }
226     bool operator>(CSegChain& d) { // order based on b (adaptor) coordinate
227     //return (score<d.score);
228     if (bstart==d.bstart && bend==d.bend) {
229     return (astart==d.astart)?(aend>d.aend):(astart>d.astart);
230     }
231     return (bstart==d.bstart)?(bend>d.bend):(bstart>d.bstart);
232     }
233     bool operator<(CSegChain& d) {
234     //return (score>d.score);
235     if (bstart==d.bstart && bend==d.bend) {
236     return (astart==d.astart)?(aend<d.aend):(astart<d.astart);
237     }
238     return (bstart==d.bstart)?(bend<d.bend):(bstart<d.bstart);
239     }
240     void addSegPair(CSegPair* segp) {
241     if (AddIfNew(segp)!=segp) return;
242     score+=segp->score;
243     if (astart>segp->a.start) astart=segp->a.start;
244     if (aend<segp->a.end) aend=segp->a.end;
245     if (bstart>segp->b.start) bstart=segp->b.start;
246     if (bend<segp->b.end) bend=segp->b.end;
247     }
248     //for building actual chains:
249     bool extendChain(CSegPair* segp) { //segp expected to be "Greater Than" current chain
250     int bgap=0;
251     int agap=0;
252     //if (endSort) {
253     if (bstart>segp->b.start) {
254     bgap = (int)(bstart-segp->b.end);
255     if (abs(bgap)>2) return false;
256     agap = (int)(astart-segp->a.end);
257     if (abs(agap)>2) return false;
258     }
259     else {
260     bgap = (int) (segp->b.start-bend);
261     if (abs(bgap)>2) return false;
262     agap = (int)(segp->a.start-aend);
263     if (abs(agap)>2) return false;
264     }
265     if (agap*bgap<0) return false;
266     addSegPair(segp);
267     score-=abs(agap)+abs(bgap);
268     return true;
269     }
270     };
271    
272 gpertea 4 // element in dhash:
273     class FqDupRec {
274     public:
275     int count; //how many of these reads are the same
276     int len; //length of qv
277     char* firstname; //optional, only if we want to keep the original read names
278     char* qv;
279     FqDupRec(GStr* qstr=NULL, const char* rname=NULL) {
280     len=0;
281     qv=NULL;
282     firstname=NULL;
283     count=0;
284     if (qstr!=NULL) {
285     qv=Gstrdup(qstr->chars());
286     len=qstr->length();
287     count++;
288     }
289     if (rname!=NULL) firstname=Gstrdup(rname);
290     }
291     ~FqDupRec() {
292     GFREE(qv);
293     GFREE(firstname);
294     }
295     void add(GStr& d) { //collapse another record into this one
296     if (d.length()!=len)
297     GError("Error at FqDupRec::add(): cannot collapse reads with different length!\n");
298     count++;
299     for (int i=0;i<len;i++)
300     qv[i]=(qv[i]+d[i])/2;
301     }
302     };
303    
304     void openfw(FILE* &f, GArgs& args, char opt) {
305     GStr s=args.getOpt(opt);
306     if (!s.is_empty()) {
307     if (s=='-') f=stdout;
308     else {
309 gpertea 13 f=fopen(s.chars(),"w");
310 gpertea 4 if (f==NULL) GError("Error creating file: %s\n", s.chars());
311     }
312     }
313     }
314    
315     #define FWCLOSE(fh) if (fh!=NULL && fh!=stdout) fclose(fh)
316     #define FRCLOSE(fh) if (fh!=NULL && fh!=stdin) fclose(fh)
317    
318     GHash<FqDupRec> dhash; //hash to keep track of duplicates
319    
320 gpertea 13 void setupFiles(FILE*& f_in, FILE*& f_in2, FILE*& f_out, FILE*& f_out2,
321     GStr& s, GStr& infname, GStr& infname2);
322     // uses outsuffix to generate output file names and open file handles as needed
323    
324     void writeRead(FILE* f_out, GStr& rname, GStr& rinfo, GStr& rseq, GStr& rqv, int& outcounter);
325     void trash_report(char trashcode, GStr& rname, FILE* freport);
326    
327     bool getFastxRec(GLineReader& fq, GStr& rseq, GStr& rqv,
328     GStr& rname, GStr& rinfo, GStr& infname);
329    
330     char process_read(GStr& rname, GStr& rseq, GStr& rqv, int &l5, int &l3);
331     //returns 0 if the read was untouched, 1 if it was trimmed and a trash code if it was trashed
332    
333 gpertea 12 bool ntrim(GStr& rseq, int &l5, int &l3); //returns true if any trimming occured
334 gpertea 88 bool polytrim(GStr& rseq, int &l5, int &l3); //returns true if any trimming occured
335 gpertea 12 bool qtrim(GStr& qvs, int &l5, int &l3); //return true if any trimming occured
336 gpertea 4 int dust(GStr& seq);
337     bool trim_adapter3(GStr& seq, int &l5, int &l3); //returns true if any trimming occured
338     bool trim_adapter5(GStr& seq, int &l5, int &l3); //returns true if any trimming occured
339    
340 gpertea 12 void convertPhred(char* q, int len);
341     void convertPhred(GStr& q);
342 gpertea 4
343     int main(int argc, char * const argv[]) {
344 gpertea 76 GArgs args(argc, argv, "YQDCVl:d:3:5:m:n:r:p:q:f:t:o:z:a:");
345 gpertea 4 int e;
346     if ((e=args.isError())>0) {
347     GMessage("%s\nInvalid argument: %s\n", USAGE, argv[e]);
348     exit(224);
349     }
350 gpertea 9 debug=(args.getOpt('Y')!=NULL);
351 gpertea 13 verbose=(args.getOpt('V')!=NULL);
352 gpertea 12 convert_phred=(args.getOpt('Q')!=NULL);
353 gpertea 4 doCollapse=(args.getOpt('C')!=NULL);
354     doDust=(args.getOpt('D')!=NULL);
355 gpertea 12 /*
356 gpertea 4 rawFormat=(args.getOpt('R')!=NULL);
357     if (rawFormat) {
358     GError("Sorry, raw qseq format parsing is not implemented yet!\n");
359     }
360 gpertea 12 */
361 gpertea 4 prefix=args.getOpt('n');
362     GStr s=args.getOpt('l');
363     if (!s.is_empty())
364     min_read_len=s.asInt();
365 gpertea 12 s=args.getOpt('m');
366     if (!s.is_empty())
367     max_perc_N=s.asDouble();
368 gpertea 4 s=args.getOpt('d');
369     if (!s.is_empty()) {
370     dust_cutoff=s.asInt();
371     doDust=true;
372     }
373 gpertea 12 s=args.getOpt('q');
374     if (!s.is_empty()) {
375 gpertea 13 qvtrim_qmin=s.asInt();
376 gpertea 12 }
377 gpertea 13 s=args.getOpt('t');
378     if (!s.is_empty()) {
379     qvtrim_max=s.asInt();
380     }
381 gpertea 12 s=args.getOpt('p');
382     if (!s.is_empty()) {
383     int v=s.asInt();
384     if (v==33) {
385     qv_phredtype=33;
386     qv_cvtadd=31;
387     }
388     else if (v==64) {
389     qv_phredtype=64;
390     qv_cvtadd=-31;
391     }
392     else
393     GMessage("%s\nInvalid value for -p option (can only be 64 or 33)!\n",USAGE);
394     }
395 gpertea 4 if (args.getOpt('3')!=NULL) {
396     adapter3=args.getOpt('3');
397     adapter3.upper();
398     a3len=adapter3.length();
399     }
400     if (args.getOpt('5')!=NULL) {
401     adapter5=args.getOpt('5');
402     adapter5.upper();
403     a5len=adapter5.length();
404     }
405 gpertea 13 s=args.getOpt('a');
406     if (!s.is_empty()) {
407     int a_minmatch=s.asInt();
408     a_min_score=a_minmatch<<1;
409     }
410 gpertea 76
411 gpertea 13 if (args.getOpt('o')!=NULL) outsuffix=args.getOpt('o');
412 gpertea 76 else outsuffix="-";
413 gpertea 4 trashReport= (args.getOpt('r')!=NULL);
414 gpertea 13 int fcount=args.startNonOpt();
415     if (fcount==0) {
416 gpertea 4 GMessage(USAGE);
417     exit(224);
418     }
419 gpertea 13 if (fcount>1 && doCollapse) {
420     GError("%s Sorry, the -C option only works with a single input.\n", USAGE);
421     }
422     //openfw(f_out, args, 'o');
423     //if (f_out==NULL) f_out=stdout;
424 gpertea 4 if (trashReport)
425     openfw(freport, args, 'r');
426     char* infile=NULL;
427     while ((infile=args.nextNonOpt())!=NULL) {
428 gpertea 13 int incounter=0; //counter for input reads
429     int outcounter=0; //counter for output reads
430     int trash_s=0; //too short from the get go
431     int trash_Q=0;
432     int trash_N=0;
433     int trash_D=0;
434     int trash_A3=0;
435     int trash_A5=0;
436     s=infile;
437     GStr infname;
438     GStr infname2;
439     FILE* f_in=NULL;
440     FILE* f_in2=NULL;
441     FILE* f_out=NULL;
442     FILE* f_out2=NULL;
443     bool paired_reads=false;
444     setupFiles(f_in, f_in2, f_out, f_out2, s, infname, infname2);
445     GLineReader fq(f_in);
446     GLineReader* fq2=NULL;
447     if (f_in2!=NULL) {
448     fq2=new GLineReader(f_in2);
449     paired_reads=true;
450     }
451     GStr rseq, rqv, seqid, seqinfo;
452     GStr rseq2, rqv2, seqid2, seqinfo2;
453     while (getFastxRec(fq, rseq, rqv, seqid, seqinfo, infname)) {
454     incounter++;
455     int a5=0, a3=0, b5=0, b3=0;
456     char tcode=0, tcode2=0;
457     tcode=process_read(seqid, rseq, rqv, a5, a3);
458     //if (!doCollapse) {
459     if (fq2!=NULL) {
460     getFastxRec(*fq2, rseq2, rqv2, seqid2, seqinfo2, infname2);
461     if (seqid.substr(0,seqid.length()-1)!=seqid2.substr(0,seqid2.length()-1)) {
462     GError("Error: no paired match for read %s vs %s (%s,%s)\n",
463     seqid.chars(), seqid2.chars(), infname.chars(), infname2.chars());
464     }
465     tcode2=process_read(seqid2, rseq2, rqv2, b5, b3);
466     //decide what to do with this pair and print rseq2 if the pair makes it
467     if (tcode>1 && tcode2<=1) {
468     //"untrash" rseq
469     if (a3-a5+1<min_read_len) {
470     a5=1;
471     if (a3<min_read_len) { a3= GMIN(rseq.length()-1, min_read_len+1); }
472 gpertea 4 }
473 gpertea 13 tcode=1;
474     }
475     else if (tcode<=1 && tcode2>1) {
476     //"untrash" rseq2
477     if (b3-b5+1<min_read_len) {
478     b5=1;
479     if (b3<min_read_len) { b3= GMIN((rseq2.length()-1),(min_read_len+1)); }
480     }
481     tcode2=1;
482     }
483     if (tcode<=1) { //trimmed or left intact -- write it!
484     if (tcode>0) {
485     rseq2=rseq2.substr(b5,b3-b5+1);
486     if (!rqv2.is_empty()) rqv2=rqv2.substr(b5,b3-b5+1);
487 gpertea 4 }
488 gpertea 13 int nocounter=0;
489     writeRead(f_out2, seqid2, seqinfo2, rseq2, rqv2, nocounter);
490     }
491     } //paired read
492     // }
493     if (tcode>1) { //trashed
494     if (tcode=='s') trash_s++;
495     else if (tcode=='Q') trash_Q++;
496     else if (tcode=='N') trash_N++;
497     else if (tcode=='D') trash_D++;
498     else if (tcode=='3') trash_A3++;
499     else if (tcode=='5') trash_A5++;
500     if (trashReport) trash_report(tcode, seqid, freport);
501     }
502     else if (!doCollapse) { //write it
503     if (tcode>0) {
504     rseq=rseq.substr(a5,a3-a5+1);
505     if (!rqv.is_empty()) rqv=rqv.substr(a5,a3-a5+1);
506     }
507     writeRead(f_out, seqid, seqinfo, rseq, rqv, outcounter);
508     }
509     } //for each fastq record
510     delete fq2;
511     FRCLOSE(f_in);
512     FRCLOSE(f_in2);
513     if (doCollapse) {
514     outcounter=0;
515     int maxdup_count=1;
516     char* maxdup_seq=NULL;
517     dhash.startIterate();
518     FqDupRec* qd=NULL;
519     char* seq=NULL;
520     while ((qd=dhash.NextData(seq))!=NULL) {
521     GStr rseq(seq);
522     //do the dusting here
523     if (doDust) {
524     int dustbases=dust(rseq);
525     if (dustbases>(rseq.length()>>1)) {
526     if (trashReport && qd->firstname!=NULL) {
527     fprintf(freport, "%s_x%d\tD\n",qd->firstname, qd->count);
528 gpertea 4 }
529 gpertea 13 trash_D+=qd->count;
530     continue;
531     }
532     }
533     outcounter++;
534     if (qd->count>maxdup_count) {
535     maxdup_count=qd->count;
536     maxdup_seq=seq;
537     }
538     if (isfasta) {
539     if (prefix.is_empty()) {
540     fprintf(f_out, ">%s_x%d\n%s\n", qd->firstname, qd->count,
541     rseq.chars());
542 gpertea 4 }
543 gpertea 13 else { //use custom read name
544     fprintf(f_out, ">%s%08d_x%d\n%s\n", prefix.chars(), outcounter,
545     qd->count, rseq.chars());
546 gpertea 4 }
547 gpertea 13 }
548     else { //fastq format
549     if (convert_phred) convertPhred(qd->qv, qd->len);
550     if (prefix.is_empty()) {
551     fprintf(f_out, "@%s_x%d\n%s\n+\n%s\n", qd->firstname, qd->count,
552     rseq.chars(), qd->qv);
553 gpertea 4 }
554 gpertea 13 else { //use custom read name
555     fprintf(f_out, "@%s%08d_x%d\n%s\n+\n%s\n", prefix.chars(), outcounter,
556     qd->count, rseq.chars(), qd->qv);
557     }
558     }
559     }//for each element of dhash
560     if (maxdup_count>1) {
561     GMessage("Maximum read multiplicity: x %d (read: %s)\n",maxdup_count, maxdup_seq);
562 gpertea 4 }
563 gpertea 13 } //collapse entries
564     if (verbose) {
565     if (paired_reads) {
566     GMessage(">Input files : %s , %s\n", infname.chars(), infname2.chars());
567     GMessage("Number of input pairs :%9d\n", incounter);
568     GMessage(" Output pairs :%9d\n", outcounter);
569     }
570     else {
571     GMessage(">Input file : %s\n", infname.chars());
572     GMessage("Number of input reads :%9d\n", incounter);
573     GMessage(" Output reads :%9d\n", outcounter);
574     }
575     GMessage("------------------------------------\n");
576     if (num_trimmed5)
577     GMessage(" 5' trimmed :%9d (min. trim: %d)\n", num_trimmed5, min_trimmed5);
578     if (num_trimmed3)
579     GMessage(" 3' trimmed :%9d (min. trim: %d)\n", num_trimmed3, min_trimmed3);
580     GMessage("------------------------------------\n");
581     if (trash_s>0)
582     GMessage(" Trashed by length:%9d\n", trash_s);
583     if (trash_N>0)
584     GMessage(" Trashed by N%%:%9d\n", trash_N);
585     if (trash_Q>0)
586     GMessage("Trashed by low quality:%9d\n", trash_Q);
587     if (trash_A5>0)
588     GMessage(" Trashed by 5' adapter:%9d\n", trash_A5);
589     if (trash_A3>0)
590     GMessage(" Trashed by 3' adapter:%9d\n", trash_A3);
591     }
592     if (trashReport) {
593     FWCLOSE(freport);
594 gpertea 4 }
595 gpertea 13 FWCLOSE(f_out);
596     FWCLOSE(f_out2);
597     } //while each input file
598 gpertea 4
599 gpertea 12 //getc(stdin);
600 gpertea 4 }
601    
602     class NData {
603     public:
604     int NPos[1024]; //there should be no reads longer than 1K ?
605     int NCount;
606     int end5;
607     int end3;
608     int seqlen;
609     double perc_N; //percentage of Ns in end5..end3 range only!
610     const char* seq;
611     bool valid;
612     NData() {
613     NCount=0;
614     end5=0;
615     end3=0;
616     seq=NULL;
617     perc_N=0;
618     valid=true;
619     }
620     void init(GStr& rseq) {
621     NCount=0;
622     perc_N=0;
623     seqlen=rseq.length();
624     seq=rseq.chars();
625     end5=1;
626     end3=seqlen;
627     for (int i=0;i<seqlen;i++)
628 gpertea 12 if (seq[i]=='N') {// if (!ichrInStr(rseq[i], "ACGT")
629 gpertea 4 NPos[NCount]=i;
630     NCount++;
631     }
632     perc_N=(NCount*100.0)/seqlen;
633     valid=true;
634     }
635     void N_calc() { //only in the region after trimming
636     int n=0;
637     for (int i=end3-1;i<end5;i++) {
638     if (seq[i]=='N') n++;
639     }
640     perc_N=(n*100.0)/(end5-end3+1);
641     }
642     };
643    
644     static NData feat;
645     int perc_lenN=12; // incremental distance from ends, in percentage of
646     // sequence length, where N-trimming is done (default:12 %) (autolimited to 20)
647    
648     void N_analyze(int l5, int l3, int p5, int p3) {
649     /* assumes feat was filled properly */
650     int old_dif, t5,t3,v;
651 gpertea 12 if (l3<l5+2 || p5>p3 ) {
652 gpertea 4 feat.end5=l5+1;
653     feat.end3=l3+1;
654     return;
655     }
656    
657     t5=feat.NPos[p5]-l5;
658     t3=l3-feat.NPos[p3];
659     old_dif=p3-p5;
660     v=(int)((((double)(l3-l5))*perc_lenN)/100);
661     if (v>20) v=20; /* enforce N-search limit for very long reads */
662     if (t5 < v ) {
663     l5=feat.NPos[p5]+1;
664     p5++;
665     }
666     if (t3 < v) {
667     l3=feat.NPos[p3]-1;
668     p3--;
669     }
670     /* restNs=p3-p5; number of Ns in the new CLR */
671     if (p3-p5==old_dif) { /* no change, return */
672     /* don't trim if this may shorten the read too much */
673     //if (l5-l3<min_read_len) return;
674     feat.end5=l5+1;
675     feat.end3=l3+1;
676     return;
677     }
678     else
679     N_analyze(l5,l3, p5,p3);
680     }
681    
682    
683 gpertea 12 bool qtrim(GStr& qvs, int &l5, int &l3) {
684 gpertea 13 if (qvtrim_qmin==0 || qvs.is_empty()) return false;
685 gpertea 12 if (qv_phredtype==0) {
686     //try to guess the Phred type
687     int vmin=256, vmax=0;
688     for (int i=0;i<qvs.length();i++) {
689     if (vmin>qvs[i]) vmin=qvs[i];
690     if (vmax<qvs[i]) vmax=qvs[i];
691     }
692     if (vmin<64) { qv_phredtype=33; qv_cvtadd=31; }
693     if (vmax>95) { qv_phredtype=64; qv_cvtadd=-31; }
694     if (qv_phredtype==0) {
695     GError("Error: couldn't determine Phred type, please use the -p33 or -p64 !\n");
696     }
697 gpertea 13 if (verbose)
698     GMessage("Input reads have Phred-%d quality values.\n", (qv_phredtype==33 ? 33 : 64));
699     } //guessing Phred type
700 gpertea 12 for (l3=qvs.length()-1;l3>2;l3--) {
701 gpertea 13 if (qvs[l3]-qv_phredtype>=qvtrim_qmin && qvs[l3-1]-qv_phredtype>=qvtrim_qmin) break;
702 gpertea 12 }
703     //just in case, check also the 5' the end (?)
704     for (l5=0;l5<qvs.length()-3;l5++) {
705 gpertea 13 if (qvs[l5]-qv_phredtype>=qvtrim_qmin && qvs[l5+1]-qv_phredtype>=qvtrim_qmin) break;
706 gpertea 12 }
707 gpertea 13 if (qvtrim_max>0) {
708     if (qvs.length()-1-l3>qvtrim_max) l3=qvs.length()-1-qvtrim_max;
709     if (l5>qvtrim_max) l5=qvtrim_max;
710     }
711 gpertea 12 return (l5>0 || l3<qvs.length()-1);
712     }
713    
714     bool ntrim(GStr& rseq, int &l5, int &l3) {
715     //count Ns in the sequence, trim N-rich ends
716 gpertea 4 feat.init(rseq);
717     l5=feat.end5-1;
718     l3=feat.end3-1;
719     N_analyze(feat.end5-1, feat.end3-1, 0, feat.NCount-1);
720 gpertea 12 if (l5==feat.end5-1 && l3==feat.end3-1) {
721     if (feat.perc_N>max_perc_N) {
722     feat.valid=false;
723     l3=l5+1;
724     return true;
725     }
726     else {
727     return false; //nothing changed
728     }
729     }
730 gpertea 4 l5=feat.end5-1;
731     l3=feat.end3-1;
732     if (l3-l5+1<min_read_len) {
733     feat.valid=false;
734     return true;
735     }
736     feat.N_calc();
737 gpertea 12
738     if (feat.perc_N>max_perc_N) {
739 gpertea 4 feat.valid=false;
740     l3=l5+1;
741     return true;
742     }
743     return true;
744     }
745    
746     //--------------- dust functions ----------------
747     class DNADuster {
748     public:
749     int dustword;
750     int dustwindow;
751     int dustwindow2;
752     int dustcutoff;
753     int mv, iv, jv;
754     int counts[32*32*32];
755     int iis[32*32*32];
756     DNADuster(int cutoff=16, int winsize=32, int wordsize=3) {
757     dustword=wordsize;
758     dustwindow=winsize;
759     dustwindow2 = (winsize>>1);
760     dustcutoff=cutoff;
761     mv=0;
762     iv=0;
763     jv=0;
764     }
765     void setWindowSize(int value) {
766     dustwindow = value;
767     dustwindow2 = (dustwindow >> 1);
768     }
769     void setWordSize(int value) {
770     dustword=value;
771     }
772     void wo1(int len, const char* s, int ivv) {
773     int i, ii, j, v, t, n, n1, sum;
774     int js, nis;
775     n = 32 * 32 * 32;
776     n1 = n - 1;
777     nis = 0;
778     i = 0;
779     ii = 0;
780     sum = 0;
781     v = 0;
782     for (j=0; j < len; j++, s++) {
783     ii <<= 5;
784     if (*s<=32) {
785     i=0;
786     continue;
787     }
788     ii |= *s - 'A'; //assume uppercase!
789     ii &= n1;
790     i++;
791     if (i >= dustword) {
792     for (js=0; js < nis && iis[js] != ii; js++) ;
793     if (js == nis) {
794     iis[nis] = ii;
795     counts[ii] = 0;
796     nis++;
797     }
798     if ((t = counts[ii]) > 0) {
799     sum += t;
800     v = 10 * sum / j;
801     if (mv < v) {
802     mv = v;
803     iv = ivv;
804     jv = j;
805     }
806     }
807     counts[ii]++;
808     }
809     }
810     }
811    
812     int wo(int len, const char* s, int* beg, int* end) {
813     int i, l1;
814     l1 = len - dustword + 1;
815     if (l1 < 0) {
816     *beg = 0;
817     *end = len - 1;
818     return 0;
819     }
820     mv = 0;
821     iv = 0;
822     jv = 0;
823     for (i=0; i < l1; i++) {
824     wo1(len-i, s+i, i);
825     }
826     *beg = iv;
827     *end = iv + jv;
828     return mv;
829     }
830    
831     void dust(const char* seq, char* seqmsk, int seqlen, int cutoff=0) { //, maskFunc maskfn) {
832     int i, j, l, a, b, v;
833     if (cutoff==0) cutoff=dustcutoff;
834     a=0;b=0;
835     //GMessage("Dust cutoff=%d\n", cutoff);
836     for (i=0; i < seqlen; i += dustwindow2) {
837     l = (seqlen > i+dustwindow) ? dustwindow : seqlen-i;
838     v = wo(l, seq+i, &a, &b);
839     if (v > cutoff) {
840     //for (j = a; j <= b && j < dustwindow2; j++) {
841     for (j = a; j <= b; j++) {
842     seqmsk[i+j]='N';//could be made lowercase instead
843     }
844     }
845     }
846     //return first;
847     }
848     };
849    
850     static DNADuster duster;
851    
852     int dust(GStr& rseq) {
853     char* seq=Gstrdup(rseq.chars());
854     duster.dust(rseq.chars(), seq, rseq.length(), dust_cutoff);
855     //check the number of Ns:
856     int ncount=0;
857     for (int i=0;i<rseq.length();i++) {
858     if (seq[i]=='N') ncount++;
859     }
860     GFREE(seq);
861     return ncount;
862     }
863    
864 gpertea 12
865     // ------------------ adapter matching - simple k-mer seed & extend, no indels for now
866     //when a k-mer match is found, simply try to extend the alignment using a drop-off scheme
867     //check minimum score and
868     //for 3' adapter trimming:
869 gpertea 4 // require that the right end of the alignment for either the adaptor OR the read must be
870     // < 3 distance from its right end
871     // for 5' adapter trimming:
872     // require that the left end of the alignment for either the adaptor OR the read must
873 gpertea 12 // be at coordinate < 3 from start
874 gpertea 4
875     bool extendMatch(const char* a, int alen, int ai,
876 gpertea 12 const char* b, int blen, int bi, int mlen, int& l5, int& l3, CSegChain& segs, bool end5=false) {
877 gpertea 4 //so the alignment starts at ai in a, bi in b, with a perfect match of length mlen
878 gpertea 12 #ifdef DEBUG
879     GStr dbg(b);
880     #endif
881     //if (debug) {
882     // GMessage(">> in %s\n\textending hit: %s at position %d\n", a, (dbg.substr(bi, mlen)).chars(), ai);
883     // }
884 gpertea 4 int a_l=ai; //alignment coordinates on a
885     int a_r=ai+mlen-1;
886     int b_l=bi; //alignment coordinates on b
887     int b_r=bi+mlen-1;
888     int ai_maxscore=ai;
889     int bi_maxscore=bi;
890     int score=mlen*a_m_score;
891     int maxscore=score;
892 gpertea 9 int mism5score=a_mis_score;
893     if (end5 && ai<(alen>>1)) mism5score-=2; // increase penalty for mismatches at 5' end
894 gpertea 4 //try to extend to the left first, if possible
895     while (ai>0 && bi>0) {
896     ai--;
897     bi--;
898     score+= (a[ai]==b[bi])? a_m_score : mism5score;
899     if (score>maxscore) {
900     ai_maxscore=ai;
901     bi_maxscore=bi;
902     maxscore=score;
903     }
904     else if (maxscore-score>a_dropoff_score) break;
905     }
906     a_l=ai_maxscore;
907     b_l=bi_maxscore;
908 gpertea 9 //if (debug) GMessage(" after l-extend: %*s%s\t\t(score=%d)\n",a_l," ",dbg.substr(b_l,b_r-b_l+1).chars(),maxscore);
909 gpertea 4 //now extend to the right
910     ai_maxscore=a_r;
911     bi_maxscore=b_r;
912     ai=a_r;
913     bi=b_r;
914     score=maxscore;
915     //sometimes there are extra AAAAs at the end of the read, ignore those
916     if (strcmp(&a[alen-4],"AAAA")==0) {
917     alen-=3;
918     while (a[alen-1]=='A' && alen>ai) alen--;
919     }
920     while (ai<alen-1 && bi<blen-1) {
921     ai++;
922     bi++;
923     //score+= (a[ai]==b[bi])? a_m_score : a_mis_score;
924     if (a[ai]==b[bi]) { //match
925     score+=a_m_score;
926     if (ai>=alen-2) {
927     score+=a_m_score-(alen-ai-1);
928     }
929     }
930     else { //mismatch
931     score+=a_mis_score;
932     }
933     if (score>maxscore) {
934     ai_maxscore=ai;
935     bi_maxscore=bi;
936     maxscore=score;
937     }
938     else if (maxscore-score>a_dropoff_score) break;
939     }
940     a_r=ai_maxscore;
941     b_r=bi_maxscore;
942 gpertea 9 int a_ovh3=alen-a_r-1;
943     int b_ovh3=blen-b_r-1;
944     int mmovh3=(a_ovh3<b_ovh3)? a_ovh3 : b_ovh3;
945     int mmovh5=(a_l<b_l)? a_l : b_l;
946     //if (debug) GMessage(" after r-extend: %*s%s\t\t(score=%d)\n",a_l," ",dbg.substr(b_l,b_r-b_l+1).chars(),maxscore);
947 gpertea 12 #ifdef DEBUG
948     if (debug) GMessage(" extended to: %*s\n",a_r+1,dbg.substr(b_l,b_r-b_l+1).chars());
949     #endif
950 gpertea 9 if (maxscore>=a_min_score && mmovh3<2 && mmovh5<2) {
951     if (a_l<a_ovh3) {
952 gpertea 4 //adapter closer to the left end (typical for 5' adapter)
953     l5=a_r+1;
954     l3=alen-1;
955     }
956     else {
957     //adapter matching at the right end (typical for 3' adapter)
958     l5=0;
959     l3=a_l-1;
960     }
961     return true;
962     }
963 gpertea 12 else { //keep this segment pair for later (gapped alignment)
964     segs.addSegPair(new CSegPair(a_l+1, a_r+1, b_l+1, b_r+1, maxscore));
965     //this will also update min & max coordinates in segs (segs.astart, .aend, .bstart, .bend)
966     }
967 gpertea 11 //do not trim:
968     l5=0;
969     l3=alen-1;
970 gpertea 4 return false;
971     }
972    
973 gpertea 12 /*
974     int getWordValue(const char* s, int wlen) {
975     int r=0;
976     while (wlen--) { r+=(((int)s[wlen])<<wlen) }
977     return r;
978     }
979     */
980     int get3mer_value(const char* s) {
981     return (s[0]<<16)+(s[1]<<8)+s[2];
982     }
983    
984     int w3_match(int qv, const char* str, int slen, int start_index=0) {
985     if (start_index>=slen || start_index<0) return -1;
986     for (int i=start_index;i<slen-3;i++) {
987     int rv=get3mer_value(str+i);
988     if (rv==qv) return i;
989     }
990     return -1;
991     }
992    
993     int w3_rmatch(int qv, const char* str, int slen, int end_index=-1) {
994     if (end_index>=slen) return -1;
995     if (end_index<0) end_index=slen-1;
996     for (int i=end_index-2;i>=0;i--) {
997     int rv=get3mer_value(str+i);
998     if (rv==qv) return i;
999     }
1000     return -1;
1001     }
1002    
1003     int fast4match(int32 qv, const char* str, int slen, int start_index=0) {
1004     if (start_index>=slen || start_index<0) return -1;
1005     for (int i=start_index;i<slen-4;i++) {
1006     int32* rv=(int32*)(str+i);
1007     if (*rv==qv) return i;
1008     }
1009     return -1;
1010     }
1011    
1012     int fast4rmatch(int32 qv, const char* str, int slen, int end_index=-1) {
1013     if (end_index>=slen) return -1;
1014     if (end_index<0) end_index=slen-1;
1015     for (int i=end_index-3;i>=0;i--) {
1016     int32* rv=(int32*)(str+i);
1017     if (*rv==qv) return i;
1018     }
1019     return -1;
1020     }
1021    
1022     #ifdef DEBUG
1023     void dbgPrintChain(CSegChain& chain, const char* aseq) {
1024     GStr s(aseq);
1025     for (int i=0;i<chain.Count();i++) {
1026     CSegPair& seg=*chain[i];
1027     GMessage(" dbg chain seg%d: %*s [%d-%d:%d-%d]\n",i,seg.a.start-1+seg.len(),
1028     s.substr(seg.b.start-1, seg.len()).chars(), seg.b.start,seg.b.end,seg.a.start,seg.a.end);
1029     }
1030     }
1031     #endif
1032 gpertea 88 const char *pA_seed="AAAA";
1033     const char *pT_seed="TTTT";
1034     const int polyA_seed=*(int*)pA_seed;
1035     const int polyT_seed=*(int*)pT_seed;
1036 gpertea 12
1037 gpertea 88 bool trim_poly3(GStr &seq, int &l5, int &l3) {
1038     int rlen=seq.length();
1039     l5=0;
1040     l3=rlen-1;
1041     //assumes N trimming was already done
1042     //so a poly match should be very close to the end of the read
1043     //TODO: should we attempt polyA and polyT at the same time, same end?
1044     // -- find the initial match (seed)
1045     int rlo=GMAX((rlen-10), 0);
1046     int si;
1047     for (si=rlen-5;si>rlo;si--) {
1048     if (polyA_seed==*(int*)&(seq.chars()+si)) {
1049     break;
1050     }
1051     }
1052     if (si<=rlo) return false;
1053     //seed found, try to extend it to left and right
1054     int score=4*match_score;
1055     max_rlo=rlo;
1056     rlo--;
1057     while (rlo>=0) {
1058     rlo--;
1059     score+=
1060    
1061     }
1062     }
1063    
1064 gpertea 4 bool trim_adapter3(GStr& seq, int&l5, int &l3) {
1065     int rlen=seq.length();
1066     l5=0;
1067     l3=rlen-1;
1068     //first try a full match, we might get lucky
1069     int fi=-1;
1070     if ((fi=seq.index(adapter3))>=0) {
1071     if (fi<rlen-fi-a3len) {//match is closer to the right end
1072     l5=fi+a3len;
1073     l3=rlen-1;
1074     }
1075     else {
1076     l5=0;
1077 gpertea 12 l3=fi-1;
1078 gpertea 4 }
1079     return true;
1080     }
1081 gpertea 12 #ifdef DEBUG
1082     if (debug) GMessage(">TRIM3 >> Read: %s\n",seq.chars());
1083     #endif
1084    
1085 gpertea 4 //also, for fast detection of other adapter-only reads that start past
1086     // the beginning of the adapter sequence, try to see if the first a3len-4
1087 gpertea 12 // bases of the read are a substring of the adapter
1088     if (rlen>a3len-3) {
1089     GStr rstart=seq.substr(1,a3len-4);
1090     if ((fi=adapter3.index(rstart))>=0) {
1091     l3=rlen-1;
1092     l5=a3len-4;
1093     while (fi+l5<a3len && l5<l3 && adapter3[fi+l5]==seq[l5]) l5++;
1094     return true;
1095     }
1096     }
1097     CSegChain a3segs; //no chains here, just an ordered collection of segment pairs
1098     //check the easy cases - 11 bases exact match at the end
1099     int fdlen=11;
1100     if (a3len<16) {
1101     fdlen=a3len>>1;
1102 gpertea 4 }
1103 gpertea 12 if (fdlen>4) {
1104     //check if we're lucky enough to have the last 11 bases of the read a part of the adapter
1105     GStr rstart=seq.substr(-fdlen-3,fdlen);
1106     if ((fi=adapter3.index(rstart))>=0) {
1107     #ifdef DEBUG
1108     if (debug) GMessage(" W11match found: %*s\n", rlen-3, (adapter3.substr(fi,fdlen)).chars());
1109     #endif
1110     if (extendMatch(seq.chars(), rlen, rlen-fdlen-3,
1111     adapter3.chars(), a3len, fi, fdlen, l5,l3, a3segs))
1112     return true;
1113     }
1114     //another easy case: first 11 characters of the adaptor found as a substring of the read
1115     GStr bstr=adapter3.substr(0, fdlen);
1116     if ((fi=seq.rindex(bstr))>=0) {
1117     #ifdef DEBUG
1118     if (debug) GMessage(" A11match found: %*s\n", fi+fdlen, bstr.chars());
1119     #endif
1120     if (extendMatch(seq.chars(), rlen, fi,
1121     adapter3.chars(), a3len, 0, fdlen, l5,l3, a3segs))
1122     return true;
1123     }
1124     } //tried to match 11 bases first
1125    
1126     //no easy cases, so let's do the wmer hashing for the first 12 bases of the adaptor
1127     //-- only extend if the match is in the 3' (ending) region of the read
1128     int wordSize=3;
1129     int hlen=12;
1130     if (hlen>a3len-wordSize) hlen=a3len-wordSize;
1131     int imin=rlen>>1; //last half of the read, left boundary for the wmer match
1132     if (imin<a3len) { imin=GMIN(a3len, rlen-wordSize); }
1133     imin=rlen-imin;
1134     for (int iw=0;iw<hlen;iw++) {
1135     //int32* qv=(int32*)(adapter3.chars()+iw);
1136     int qv=get3mer_value(adapter3.chars()+iw);
1137     fi=-1;
1138     //while ((fi=fast4rmatch(*qv, seq.chars(), rlen, fi))>=0 && fi>=imin) {
1139     while ((fi=w3_rmatch(qv, seq.chars(), rlen, fi))>=0 && fi>=imin) {
1140     //GMessage(" ... fi=%d after w3_rmatch() (imin=%d)\n", fi, imin);
1141    
1142     #ifdef DEBUG
1143     if (debug) GMessage(" Wmatch found: %*s\n", fi+wordSize, (adapter3.substr(iw,wordSize)).chars());
1144     #endif
1145     if (extendMatch(seq.chars(), rlen, fi, adapter3.chars(),
1146     a3len, iw, wordSize, l5,l3, a3segs)) return true;
1147     fi--;
1148     if (fi<imin) break;
1149     }
1150     } //for each wmer in the first hlen bases of the adaptor
1151     /*
1152     //couldn't find a good trimming extension, hash 12 more bases of the adapter to collect more segment pairs there
1153     //but only do this if we already have segment pairs collected in the last 12 bases of the adapter
1154     if (a3segs.bstart>3 || a3segs.bend<(uint)(hlen-wordSize)) return false;
1155     int hlen2=a3len-wordSize;
1156     //if (hlen2>a3len-4) hlen2=a3len-4;
1157     if (hlen2>hlen) {
1158     #ifdef DEBUG
1159     if (debug && a3segs.Count()>0) {
1160     GMessage(" >>>>>2nd. hash: %s\n",seq.chars());
1161     }
1162     #endif
1163     for (int iw=hlen;iw<hlen2;iw++) {
1164     //int* qv=(int32 *)(adapter3.chars()+iw);
1165     int qv=get3mer_value(adapter3.chars()+iw);
1166     fi=-1;
1167     //while ((fi=fast4rmatch(*qv, seq.chars(), rlen, fi))>=0 && fi>=imin) {
1168     while ((fi=w3_rmatch(qv, seq.chars(), rlen, fi))>=0 && fi>=imin) {
1169     extendMatch(seq.chars(), rlen, fi, adapter3.chars(),
1170     a3len, iw, wordSize, l5,l3, a3segs);
1171     fi--;
1172     if (fi<imin) break;
1173     }
1174     } //for each wmer between hlen2 and hlen bases of the adaptor
1175     }
1176     //lastly, analyze collected a3segs for a possible gapped alignment:
1177     GList<CSegChain> segchains(false,true,false);
1178     #ifdef DEBUG
1179     if (debug && a3segs.Count()>0) {
1180     GMessage(">>>>>>>>> Read: %s\n",seq.chars());
1181 gpertea 4 }
1182 gpertea 12 #endif
1183     for (int i=0;i<a3segs.Count();i++) {
1184     if (a3segs[i]->chain==NULL) {
1185     if (a3segs[i]->b.start>3) continue; //don't start a hopeless chain
1186     CSegChain* newchain=new CSegChain();
1187     newchain->setFreeItem(false);
1188     newchain->addSegPair(a3segs[i]);
1189     a3segs[i]->chain=newchain;
1190     segchains.Add(newchain); //just to free them when done
1191     }
1192     for (int j=i+1;j<a3segs.Count();j++) {
1193     CSegChain* chain=a3segs[i]->chain;
1194     if (chain->extendChain(a3segs[j])) {
1195     a3segs[j]->chain=chain;
1196     #ifdef DEBUG
1197     if (debug) dbgPrintChain(*chain, adapter3.chars());
1198     #endif
1199     //save time by checking here if the extended chain is already acceptable for trimming
1200     if (chain->aend>(uint)(rlen-4) && chain->bstart<4 && chain->score>a_min_chain_score) {
1201     l5=0;
1202     l3=chain->astart-2;
1203     #ifdef DEBUG
1204     if (debug && a3segs.Count()>0) {
1205     GMessage(">>> >> trimmed-3: %*s\n",l3-l5+1,seq.substr(l5,l3-l5+1).chars());
1206     }
1207     #endif
1208     return true;
1209     }
1210     } //chain can be extended
1211 gpertea 4 }
1212 gpertea 12 } //collect segment alignments into chains
1213     */
1214 gpertea 4 return false; //no adapter parts found
1215     }
1216    
1217     bool trim_adapter5(GStr& seq, int&l5, int &l3) {
1218 gpertea 9 //if (debug) GMessage("trim_adapter5 on: %s\n", seq.chars());
1219 gpertea 4 int rlen=seq.length();
1220     l5=0;
1221     l3=rlen-1;
1222     //try to see if adapter is fully included in the read
1223     int fi=-1;
1224     if ((fi=seq.index(adapter5))>=0) {
1225     if (fi<rlen-fi-a5len) {//match is closer to the right end
1226     l5=fi+a5len;
1227     l3=rlen-1;
1228     }
1229     else {
1230     l5=0;
1231     l3=fi-1;
1232     }
1233     return true;
1234     }
1235 gpertea 12 #ifdef DEBUG
1236     if (debug) GMessage(">TRIM5 >> Read: %s\n",seq.chars());
1237     #endif
1238    
1239     CSegChain a5segs(true); //list of segment pairs to analyze later if no extendMatch succeeded
1240    
1241     //try the easy way out first - look for an exact match of 11 bases
1242     int fdlen=11;
1243     if (a5len<16) {
1244     fdlen=a5len>>1;
1245 gpertea 4 }
1246 gpertea 12 if (fdlen>4) {
1247     GStr rstart=seq.substr(1,fdlen); //skip the first base as it's sometimes bogus
1248     if ((fi=adapter5.index(rstart))>=0) {
1249     #ifdef DEBUG
1250     if (debug) GMessage(" W11match found: %*s\n", 1+fdlen, (adapter3.substr(fi,fdlen)).chars());
1251     #endif
1252     if (extendMatch(seq.chars(), rlen, 1,
1253     adapter5.chars(), a5len, fi, fdlen, l5,l3, a5segs, true))
1254     return true;
1255     }
1256     //another easy case: last 11 characters of the adaptor found as a substring of the read
1257     GStr bstr=adapter5.substr(-fdlen);
1258     if ((fi=seq.index(bstr))>=0) {
1259     #ifdef DEBUG
1260     if (debug) GMessage(" A11match found: %*s\n", fi+fdlen, bstr.chars());
1261     #endif
1262     if (extendMatch(seq.chars(), rlen, fi,
1263     adapter5.chars(), a5len, a5len-fdlen, fdlen, l5,l3,a5segs,true))
1264     return true;
1265     }
1266     } //tried to matching at most 11 bases first
1267    
1268     //-- no easy cases, do the wmer hashing for the last 12 bases of the adaptor
1269     //-- only extend a wmer if it matches in the 5' (beginning) region of the read
1270     int wordSize=3;
1271     int hlen=12;
1272     if (hlen>a5len-wordSize) hlen=a5len-wordSize;
1273     int imax=rlen>>1; //first half of the read, right boundary for the wmer match
1274     if (imax<a5len) { imax=GMIN(a5len, rlen-wordSize); }
1275     for (int iw=0;iw<=hlen;iw++) {
1276     int apstart=a5len-iw-wordSize;
1277     fi=0;
1278     //int* qv=(int32 *)(adapter5.chars()+apstart);
1279     int qv=get3mer_value(adapter5.chars()+apstart);
1280     //while ((fi=fast4match(*qv, seq.chars(), rlen, fi))>=0 && fi<=imax) {
1281     while ((fi=w3_match(qv, seq.chars(), rlen, fi))>=0 && fi<=imax) {
1282     #ifdef DEBUG
1283     if (debug) GMessage(" Wmatch found: %*s\n", fi+wordSize, (adapter5.substr(apstart,wordSize)).chars());
1284     #endif
1285     if (extendMatch(seq.chars(), rlen, fi, adapter5.chars(),
1286     a5len, apstart, wordSize, l5,l3, a5segs, true)) return true;
1287     fi++;
1288     if (fi>imax) break;
1289     }
1290     } //for each wmer in the last hlen bases of the adaptor
1291     /*
1292    
1293     //couldn't find a good trimming extension, hash 12 more bases of the adapter to collect more segment pairs there
1294     //but only do this if we already have segment pairs collected in the last 12 bases of the adapter
1295     if (a5segs.bend<(uint)(a5len-3) || a5segs.bstart>(uint)(a5len-hlen+4)) return false;
1296     int hlen2=a5len-wordSize;
1297     //if (hlen2>a5len-wordSize) hlen2=a5len-wordSize;
1298     #ifdef DEBUG
1299     if (debug && a5segs.Count()>0) {
1300     GMessage(" >>>>>2nd. hash: %s\n",seq.chars());
1301     }
1302     #endif
1303     if (hlen2>hlen) {
1304     for (int iw=hlen+1;iw<=hlen2;iw++) {
1305     int apstart=a5len-iw-wordSize;
1306     fi=0;
1307     //int* qv=(int32 *)(adapter5.chars()+apstart);
1308     int qv=get3mer_value(adapter5.chars()+apstart);
1309     //while ((fi=fast4match(*qv, seq.chars(), rlen, fi))>=0 && fi<=imax) {
1310     while ((fi=w3_match(qv, seq.chars(), rlen, fi))>=0 && fi<=imax) {
1311     extendMatch(seq.chars(), rlen, fi, adapter5.chars(),
1312     a5len, apstart, wordSize, l5,l3, a5segs, true);
1313     fi++;
1314     if (fi>imax) break;
1315     }
1316     } //for each wmer between hlen2 and hlen bases of the adaptor
1317     }
1318     if (a5segs.bend<(uint)(a5len-3) || a5segs.bstart>(uint)(a5len-hlen+4)) return false;
1319     // lastly, analyze collected a5segs for a possible gapped alignment:
1320     GList<CSegChain> segchains(false,true,false);
1321     #ifdef DEBUG
1322     if (debug && a5segs.Count()>0) {
1323     GMessage(">>>>>>>>> Read: %s\n",seq.chars());
1324 gpertea 4 }
1325 gpertea 12 #endif
1326     for (int i=0;i<a5segs.Count();i++) {
1327     if (a5segs[i]->chain==NULL) {
1328     if (a5segs[i]->b.end<(int)(a5len-4)) continue; //don't start a hopeless chain
1329     CSegChain* newchain=new CSegChain(true);
1330     newchain->setFreeItem(false);
1331     newchain->addSegPair(a5segs[i]);
1332     a5segs[i]->chain=newchain;
1333     segchains.Add(newchain); //just to free them when done
1334     }
1335     for (int j=i+1;j<a5segs.Count();j++) {
1336     CSegChain* chain=a5segs[i]->chain;
1337     if (chain->extendChain(a5segs[j])) {
1338     a5segs[j]->chain=chain;
1339     #ifdef DEBUG
1340     if (debug) dbgPrintChain(*chain, adapter5.chars());
1341     #endif
1342     //save time by checking here if the extended chain is already acceptable for trimming
1343     if (chain->bend>(uint)(a5len-3) && chain->astart<4 && chain->score>a_min_chain_score) {
1344     l5=chain->aend;
1345     l3=rlen-1;
1346     return true;
1347     }
1348     } //chain can be extended
1349 gpertea 4 }
1350 gpertea 12 } //collect segment alignments into chains
1351     */
1352 gpertea 4 return false; //no adapter parts found
1353     }
1354    
1355 gpertea 12 //convert qvs to/from phred64 from/to phread33
1356     void convertPhred(GStr& q) {
1357     for (int i=0;i<q.length();i++) q[i]+=qv_cvtadd;
1358 gpertea 4 }
1359    
1360 gpertea 12 void convertPhred(char* q, int len) {
1361     for (int i=0;i<len;i++) q[i]+=qv_cvtadd;
1362 gpertea 4 }
1363    
1364 gpertea 13 bool getFastxRec(GLineReader& fq, GStr& rseq, GStr& rqv,
1365     GStr& rname, GStr& rinfo, GStr& infname) {
1366     rseq="";
1367     rqv="";
1368     rname="";
1369     rinfo="";
1370     if (fq.eof()) return false;
1371     char* l=fq.getLine();
1372     while (l!=NULL && (l[0]==0 || isspace(l[0]))) l=fq.getLine(); //ignore empty lines
1373     if (l==NULL) return false;
1374     /* if (rawFormat) {
1375     //TODO: implement raw qseq parsing here
1376     //if (raw type=N) then continue; //skip invalid/bad records
1377     } //raw qseq format
1378     else { // FASTQ or FASTA */
1379     isfasta=(l[0]=='>');
1380     if (!isfasta && l[0]!='@') GError("Error: fasta/fastq record marker not found(%s)\n%s\n",
1381     infname.chars(), l);
1382     GStr s(l);
1383     rname=&(l[1]);
1384     for (int i=0;i<rname.length();i++)
1385     if (rname[i]<=' ') {
1386     if (i<rname.length()-2) rinfo=rname.substr(i+1);
1387     rname.cut(i);
1388     break;
1389     }
1390     //now get the sequence
1391     if ((l=fq.getLine())==NULL)
1392     GError("Error: unexpected EOF after header for read %s (%s)\n",
1393     rname.chars(), infname.chars());
1394     rseq=l; //this must be the DNA line
1395     while ((l=fq.getLine())!=NULL) {
1396     //seq can span multiple lines
1397     if (l[0]=='>' || l[0]=='+') {
1398     fq.pushBack();
1399     break; //
1400     }
1401     rseq+=l;
1402     } //check for multi-line seq
1403     if (!isfasta) { //reading fastq quality values, which can also be multi-line
1404     if ((l=fq.getLine())==NULL)
1405     GError("Error: unexpected EOF after sequence for %s\n", rname.chars());
1406     if (l[0]!='+') GError("Error: fastq qv header marker not detected!\n");
1407     if ((l=fq.getLine())==NULL)
1408     GError("Error: unexpected EOF after qv header for %s\n", rname.chars());
1409     rqv=l;
1410     //if (rqv.length()!=rseq.length())
1411     // GError("Error: qv len != seq len for %s\n", rname.chars());
1412     while (rqv.length()<rseq.length() && ((l=fq.getLine())!=NULL)) {
1413     rqv+=l; //append to qv string
1414     }
1415     }// fastq
1416     // } //<-- FASTA or FASTQ
1417     rseq.upper(); //TODO: what if we care about masking?
1418     return true;
1419     }
1420    
1421     char process_read(GStr& rname, GStr& rseq, GStr& rqv, int &l5, int &l3) {
1422     //returns 0 if the read was untouched, 1 if it was just trimmed
1423     // and a trash code if it was trashed
1424     l5=0;
1425     l3=rseq.length()-1;
1426     if (l3-l5+1<min_read_len) {
1427     return 's';
1428     }
1429     GStr wseq(rseq.chars());
1430     GStr wqv(rqv.chars());
1431     int w5=l5;
1432     int w3=l3;
1433     //first do the q-based trimming
1434     if (qvtrim_qmin!=0 && !wqv.is_empty() && qtrim(wqv, w5, w3)) { // qv-threshold trimming
1435     if (w3-w5+1<min_read_len) {
1436     return 'Q'; //invalid read
1437     }
1438     //-- keep only the w5..w3 range
1439     l5=w5;
1440     l3=w3;
1441     wseq=wseq.substr(w5, w3-w5+1);
1442     if (!wqv.is_empty())
1443     wqv=wqv.substr(w5, w3-w5+1);
1444     } //qv trimming
1445     // N-trimming on the remaining read seq
1446     if (ntrim(wseq, w5, w3)) {
1447     //GMessage("before: %s\n",wseq.chars());
1448     //GMessage("after : %s (%d)\n",wseq.substr(w5,w3-w5+1).chars(),w3-w5+1);
1449     l5+=w5;
1450     l3-=(wseq.length()-1-w3);
1451     if (w3-w5+1<min_read_len) {
1452     return 'N'; //to be trashed
1453     }
1454     //-- keep only the w5..w3 range
1455     wseq=wseq.substr(w5, w3-w5+1);
1456     if (!wqv.is_empty())
1457     wqv=wqv.substr(w5, w3-w5+1);
1458     w5=0;
1459     w3=wseq.length()-1;
1460     }
1461     if (a3len>0) {
1462     if (trim_adapter3(wseq, w5, w3)) {
1463     int trimlen=wseq.length()-(w3-w5+1);
1464     num_trimmed3++;
1465     if (trimlen<min_trimmed3)
1466     min_trimmed3=trimlen;
1467     l5+=w5;
1468     l3-=(wseq.length()-1-w3);
1469     if (w3-w5+1<min_read_len) {
1470     return '3';
1471     }
1472     //-- keep only the w5..w3 range
1473     wseq=wseq.substr(w5, w3-w5+1);
1474     if (!wqv.is_empty())
1475     wqv=wqv.substr(w5, w3-w5+1);
1476     }//some adapter was trimmed
1477     } //adapter trimming
1478     if (a5len>0) {
1479     if (trim_adapter5(wseq, w5, w3)) {
1480     int trimlen=wseq.length()-(w3-w5+1);
1481     num_trimmed5++;
1482     if (trimlen<min_trimmed5)
1483     min_trimmed5=trimlen;
1484     l5+=w5;
1485     l3-=(wseq.length()-1-w3);
1486     if (w3-w5+1<min_read_len) {
1487     return '5';
1488     }
1489     //-- keep only the w5..w3 range
1490     wseq=wseq.substr(w5, w3-w5+1);
1491     if (!wqv.is_empty())
1492     wqv=wqv.substr(w5, w3-w5+1);
1493     }//some adapter was trimmed
1494     } //adapter trimming
1495     if (doCollapse) {
1496     //keep read for later
1497     FqDupRec* dr=dhash.Find(wseq.chars());
1498     if (dr==NULL) { //new entry
1499     //if (prefix.is_empty())
1500     dhash.Add(wseq.chars(),
1501     new FqDupRec(&wqv, rname.chars()));
1502     //else dhash.Add(wseq.chars(), new FqDupRec(wqv.chars(),wqv.length()));
1503     }
1504     else
1505     dr->add(wqv);
1506     } //collapsing duplicates
1507     else { //not collapsing duplicates
1508     //apply the dust filter now
1509     if (doDust) {
1510     int dustbases=dust(wseq);
1511     if (dustbases>(wseq.length()>>1)) {
1512     return 'D';
1513     }
1514     }
1515     } //not collapsing duplicates
1516     return (l5>0 || l3<rseq.length()-1) ? 1 : 0;
1517     }
1518    
1519     void printHeader(FILE* f_out, char recmarker, GStr& rname, GStr& rinfo) {
1520     //GMessage("printing Header..%c%s\n",recmarker, rname.chars());
1521     if (rinfo.is_empty()) fprintf(f_out, "%c%s\n",recmarker,rname.chars());
1522     else fprintf(f_out, "%c%s %s\n",recmarker, rname.chars(), rinfo.chars());
1523     }
1524    
1525     void writeRead(FILE* f_out, GStr& rname, GStr& rinfo, GStr& rseq, GStr& rqv, int& outcounter) {
1526     outcounter++;
1527     bool asFasta=(rqv.is_empty() || fastaOutput);
1528     if (asFasta) {
1529     if (prefix.is_empty()) {
1530     printHeader(f_out, '>',rname,rinfo);
1531     fprintf(f_out, "%s\n", rseq.chars()); //plain one-line fasta for now
1532     }
1533     else {
1534     fprintf(f_out, ">%s%08d\n%s\n", prefix.chars(), outcounter,
1535     rseq.chars());
1536     }
1537     }
1538     else { //fastq
1539     if (convert_phred) convertPhred(rqv);
1540     if (prefix.is_empty()) {
1541     printHeader(f_out, '@', rname, rinfo);
1542     fprintf(f_out, "%s\n+\n%s\n", rseq.chars(), rqv.chars());
1543     }
1544     else
1545     fprintf(f_out, "@%s_%08d\n%s\n+\n%s\n", prefix.chars(), outcounter,
1546     rseq.chars(),rqv.chars() );
1547     }
1548     }
1549    
1550     void trash_report(char trashcode, GStr& rname, FILE* freport) {
1551     if (freport==NULL || trashcode<=' ') return;
1552     if (trashcode=='3' || trashcode=='5') {
1553     fprintf(freport, "%s\tA%c\n",rname.chars(),trashcode);
1554     }
1555     else {
1556     fprintf(freport, "%s\t%c\n",rname.chars(),trashcode);
1557     }
1558     //tcounter++;
1559     }
1560    
1561     GStr getFext(GStr& s, int* xpos=NULL) {
1562     //s must be a filename without a path
1563     GStr r("");
1564     if (xpos!=NULL) *xpos=0;
1565     if (s.is_empty() || s=="-") return r;
1566     int p=s.rindex('.');
1567     int d=s.rindex('/');
1568     if (p<=0 || p>s.length()-2 || p<s.length()-7 || p<d) return r;
1569     r=s.substr(p+1);
1570     if (xpos!=NULL) *xpos=p+1;
1571     r.lower();
1572     return r;
1573     }
1574    
1575     void baseFileName(GStr& fname) {
1576     //remove all known extensions, like .txt,fq,fastq,fasta,fa)(.gz .gzip .bz2 .bzip2) .
1577     int xpos=0;
1578     GStr fext=getFext(fname, &xpos);
1579     if (xpos<=1) return;
1580     bool extdel=false;
1581     GStr f2;
1582     if (fext=="z" || fext=="zip") {
1583     extdel=true;
1584     }
1585     else if (fext.length()>=2) {
1586     f2=fext.substr(0,2);
1587     extdel=(f2=="gz" || f2=="bz");
1588     }
1589     if (extdel) {
1590     fname.cut(xpos-1);
1591     fext=getFext(fname, &xpos);
1592     if (xpos<=1) return;
1593     }
1594     extdel=false;
1595     if (fext=="f" || fext=="fq" || fext=="txt" || fext=="seq" || fext=="sequence") {
1596     extdel=true;
1597     }
1598     else if (fext.length()>=2) {
1599     extdel=(fext.substr(0,2)=="fa");
1600     }
1601     if (extdel) fname.cut(xpos-1);
1602     GStr fncp(fname);
1603     fncp.lower();
1604     fncp.chomp("seq");
1605     fncp.chomp("sequence");
1606     fncp.trimR("_.");
1607     if (fncp.length()<fname.length()) fname.cut(fncp.length());
1608     }
1609    
1610     FILE* prepOutFile(GStr& infname, GStr& pocmd) {
1611     FILE* f_out=NULL;
1612     GStr fname(getFileName(infname.chars()));
1613     //eliminate known extensions
1614     baseFileName(fname);
1615     if (outsuffix.is_empty() || outsuffix=="-") { return stdout; }
1616     else if (pocmd.is_empty()) {
1617     GStr oname(fname);
1618     oname.append('.');
1619     oname.append(outsuffix);
1620     f_out=fopen(oname.chars(),"w");
1621     if (f_out==NULL) GError("Error: cannot create '%s'\n",oname.chars());
1622     }
1623     else {
1624     GStr oname(">");
1625     oname.append(fname);
1626     oname.append('.');
1627     oname.append(outsuffix);
1628     pocmd.append(oname);
1629     f_out=popen(pocmd.chars(), "w");
1630     if (f_out==NULL) GError("Error: cannot popen '%s'\n",pocmd.chars());
1631     }
1632     return f_out;
1633     }
1634    
1635     void guess_unzip(GStr& fname, GStr& picmd) {
1636     GStr fext=getFext(fname);
1637     if (fext=="gz" || fext=="gzip" || fext=="z") {
1638     picmd="gzip -cd ";
1639     }
1640     else if (fext=="bz2" || fext=="bzip2" || fext=="bz" || fext=="bzip") {
1641     picmd="bzip2 -cd ";
1642     }
1643     }
1644    
1645     void setupFiles(FILE*& f_in, FILE*& f_in2, FILE*& f_out, FILE*& f_out2,
1646     GStr& s, GStr& infname, GStr& infname2) {
1647     // uses outsuffix to generate output file names and open file handles as needed
1648     infname="";
1649     infname2="";
1650     f_in=NULL;
1651     f_in2=NULL;
1652     f_out=NULL;
1653     f_out2=NULL;
1654     //analyze outsuffix intent
1655     GStr pocmd;
1656 gpertea 76 if (outsuffix=="-") {
1657     f_out=stdout;
1658     }
1659     else {
1660     GStr ox=getFext(outsuffix);
1661     if (ox.length()>2) ox=ox.substr(0,2);
1662     if (ox=="gz") pocmd="gzip -9 -c ";
1663     else if (ox=="bz") pocmd="bzip2 -9 -c ";
1664     }
1665 gpertea 13 if (s=="-") {
1666     f_in=stdin;
1667     infname="stdin";
1668     f_out=prepOutFile(infname, pocmd);
1669     return;
1670     } // streaming from stdin
1671     s.startTokenize(",:");
1672     s.nextToken(infname);
1673     bool paired=s.nextToken(infname2);
1674     if (fileExists(infname.chars())==0)
1675     GError("Error: cannot find file %s!\n",infname.chars());
1676     GStr fname(getFileName(infname.chars()));
1677     GStr picmd;
1678     guess_unzip(fname, picmd);
1679     if (picmd.is_empty()) {
1680     f_in=fopen(infname.chars(), "r");
1681     if (f_in==NULL) GError("Error opening file '%s'!\n",infname.chars());
1682     }
1683     else {
1684     picmd.append(infname);
1685     f_in=popen(picmd.chars(), "r");
1686     if (f_in==NULL) GError("Error at popen %s!\n", picmd.chars());
1687     }
1688 gpertea 76 if (f_out==stdout) {
1689     if (paired) GError("Error: output suffix required for paired reads\n");
1690     return;
1691     }
1692 gpertea 13 f_out=prepOutFile(infname, pocmd);
1693     if (!paired) return;
1694     if (doCollapse) GError("Error: sorry, -C option cannot be used with paired reads!\n");
1695     // ---- paired reads:-------------
1696     if (fileExists(infname2.chars())==0)
1697     GError("Error: cannot find file %s!\n",infname2.chars());
1698     picmd="";
1699     GStr fname2(getFileName(infname2.chars()));
1700     guess_unzip(fname2, picmd);
1701     if (picmd.is_empty()) {
1702     f_in2=fopen(infname2.chars(), "r");
1703     if (f_in2==NULL) GError("Error opening file '%s'!\n",infname2.chars());
1704     }
1705     else {
1706     picmd.append(infname2);
1707     f_in2=popen(picmd.chars(), "r");
1708     if (f_in2==NULL) GError("Error at popen %s!\n", picmd.chars());
1709     }
1710     f_out2=prepOutFile(infname2, pocmd);
1711     }