ViewVC Help
View File | Revision Log | Show Annotations | View Changeset | Root Listing
root/gclib/fqtrim/fqtrim.cpp
Revision: 12
Committed: Wed Jan 26 19:05:05 2011 UTC (8 years, 6 months ago) by gpertea
File size: 41915 byte(s)
Log Message:
added qv-threshold trimming

Line User Rev File contents
1 gpertea 4 #include "GArgs.h"
2     #include "GStr.h"
3     #include "GHash.hh"
4     #include "GList.hh"
5    
6     #define USAGE "Usage:\n\
7 gpertea 12 fqtrim [-5 <5'adapter>] [-3 <3'adapter>] [-l <minlen>] [-q <minqv>] [-C] [-D]\\\n\
8     [-p {64|33}] [-n <rename_prefix>] [-o <trimmed.fq>] [-r <discarded.lst>]\\\n\
9     [-Q] <input.fq>\n\
10 gpertea 4 \n\
11 gpertea 12 Trim low quality bases at 3' end, optionally trim adapter sequence, filter\n\
12     for low complexity and collapse duplicate reads\n\
13 gpertea 4 \n\
14     Options:\n\
15     -n rename all the reads using the <prefix> followed by a read counter;\n\
16     if -C option was given, the suffix \"_x<N>\" is appended, with <N> being\n\
17     the read duplication count\n\
18     -o write the trimmed/filtered fastq into <trimmed.fq>(instead of stdout)\n\
19     -5 trim the given adapter or primer sequence at the 5' end of each read\n\
20     (e.g. -5 CGACAGGTTCAGAGTTCTACAGTCCGACGATC)\n\
21     -3 trim the given adapter sequence at the 3' end of each read\n\
22     (e.g. -3 TCGTATGCCGTCTTCTGCTTG)\n\
23 gpertea 12 -q trim bases with quality value lower than <minq> (starting the 3' end)\n\
24     -m maximum percentage of Ns allowed in a read after trimming (default 7)\n\
25     -l minimum \"clean\" length after trimming that a read must have\n\
26     in order to pass the filter (default: 16)\n\
27 gpertea 4 -r write a simple \"trash report\" file listing the discarded reads with a\n\
28     one letter code for the reason why they were trashed\n\
29 gpertea 12 -D apply a low-complexity (dust) filter and discard any read that has over\n\
30     50% of its length detected as low complexity\n\
31 gpertea 4 -C collapse duplicate reads and add a prefix with their count to the read\n\
32 gpertea 12 name (e.g. for microRNAs)\n\
33     -p input is phred64/phred33 (use -p64 or -p33)\n\
34     -Q convert quality values to the other Phred qv type\n\
35     -V verbose processing\n\
36 gpertea 4 "
37     // example 3' adapter for miRNAs: TCGTATGCCGTCTTCTGCTTG
38 gpertea 12
39     //For pair ends sequencing:
40     //3' : ACACTCTTTCCCTACACGACGCTCTTCCGATCT
41     //5' : GATCGGAAGAGCGGTTCAGCAGGAATGCCGAG
42 gpertea 4 FILE* f_out=NULL; //stdout if not provided
43     FILE* f_in=NULL; //input fastq (stdin if not provided)
44     FILE* freport=NULL;
45     bool debug=false;
46 gpertea 12 bool verbose=false;
47 gpertea 4 bool doCollapse=false;
48     bool doDust=false;
49     bool trashReport=false;
50 gpertea 12 //bool rawFormat=false;
51 gpertea 4 int min_read_len=16;
52 gpertea 12 double max_perc_N=7.0;
53 gpertea 4 int dust_cutoff=16;
54     bool isfasta=false;
55 gpertea 12 bool convert_phred=false;
56 gpertea 4 GStr prefix;
57     GStr adapter3;
58     GStr adapter5;
59 gpertea 12
60     int qvtrim_min=0;
61    
62     int qv_phredtype=0; // could be 64 or 33 (0 means undetermined yet)
63     int qv_cvtadd=0; //could be -31 or +31
64    
65 gpertea 4 int a3len=0;
66     int a5len=0;
67     // adaptor matching metrics -- for extendMatch() function
68 gpertea 12 const int a_m_score=2; //match score
69     const int a_mis_score=-3; //mismatch
70     const int a_dropoff_score=7;
71     const int a_min_score=8; //an exact match of 4 bases at the proper ends WILL be trimmed
72     const int a_min_chain_score=15; //for gapped alignments
73 gpertea 4
74 gpertea 12 class CSegChain;
75    
76     class CSegPair {
77     public:
78     GSeg a;
79     GSeg b; //the adapter segment
80     int score;
81     int flags;
82     CSegChain* chain;
83     CSegPair(int astart=0, int aend=0, int bstart=0, int bend=0, int mscore=0):a(astart,aend),b(bstart, bend) {
84     score=mscore;
85     if (score==0) score=a.len()*a_m_score;
86     flags=0;
87     chain=NULL;
88     }
89     int len() { return a.len(); }
90     bool operator==(CSegPair& d){
91     //return (a.start==d.a.start && a.end==d.a.end && b.start==d.b.start && b.end==d.b.end);
92     //make equal even segments that are included into one another:
93     return (d.a.start>=a.start && d.a.end<=a.end && d.b.start>=b.start && d.b.end<=b.end);
94     }
95     bool operator>(CSegPair& d){ //ordering based on b (adaptor) start coord and score
96     if (b.start==d.b.start) {
97     if (score==d.score) {
98     //just try to be consistent:
99     if (b.end==d.b.end) {
100     return (a.start==d.a.start)?(a.end<d.a.end):(a.start<d.a.start);
101     }
102     return (b.end>d.b.end);
103     }
104     else return (score<d.score);
105     }
106     return (b.start>d.b.start);
107     }
108     bool operator<(CSegPair& d){ //ordering based on b (adaptor) coord
109     /*if (b.start==d.b.start && b.end==d.b.end) {
110     return (a.start==d.a.start)?(a.end<d.a.end):(a.start<d.a.start);
111     }
112     return (b.start==d.b.start)?(b.end<d.b.end):(b.start<d.b.start);*/
113     if (b.start==d.b.start) {
114     if (score==d.score) {
115     //just try to be consistent:
116     if (b.end==d.b.end) {
117     return (a.start==d.a.start)?(a.end>d.a.end):(a.start>d.a.start);
118     }
119     return (b.end<d.b.end);
120     }
121     else return (score>d.score);
122     }
123     return (b.start<d.b.start);
124     }
125     };
126    
127     int cmpSegEnds(pointer sa, pointer sb) { //sort by adaptor seg ends AND score
128     CSegPair& x = *(CSegPair *)sa;
129     CSegPair& y = *(CSegPair *)sb;
130     /*
131     if (x.b.end==y.b.end) {
132     if (x.b.start==y.b.start) {
133     if (x.a.end==y.a.end) {
134     if (x.a.start==y.a.start) return 0;
135     return ((x.a.start>y.a.start) ? -1 : 1);
136     }
137     else {
138     return ((x.a.end>y.a.end) ? -1 : 1);
139     }
140     }
141     else {
142     return ((x.b.start>y.b.start) ? -1 : 1);
143     }
144     }
145     else {
146     return ((x.b.end>y.b.end) ? -1 : 1);
147     }
148     */
149     if (x.b.end==y.b.end) {
150     if (x.score==y.score) {
151     if (x.b.start==y.b.start) {
152     if (x.a.end==y.a.end) {
153     if (x.a.start==y.a.start) return 0;
154     return ((x.a.start<y.a.start) ? -1 : 1);
155     }
156     else {
157     return ((x.a.end<y.a.end) ? -1 : 1);
158     }
159     }
160     else {
161     return ((x.b.start<y.b.start) ? -1 : 1);
162     }
163     } else return ((x.score>y.score) ? -1 : 1);
164     }
165     else {
166     return ((x.b.end>y.b.end) ? -1 : 1);
167     }
168    
169     }
170    
171     class CSegChain:public GList<CSegPair> {
172     public:
173     uint astart;
174     uint aend;
175     uint bstart;
176     uint bend;
177     int score;
178     bool endSort;
179     CSegChain(bool aln5=false):GList<CSegPair>(true,true,true) {//sorted, free elements, unique
180     //as SegPairs are inserted, they will be sorted by a.start coordinate
181     score=0;
182     astart=MAX_UINT;
183     aend=0;
184     bstart=MAX_UINT;
185     bend=0;
186     endSort=aln5;
187     if (aln5) { setSorted(cmpSegEnds); }
188     }
189     bool operator==(CSegChain& d) {
190     //return (score==d.score);
191     return (astart==d.astart && aend==d.aend && bstart==d.bstart && bend==d.bend);
192     }
193     bool operator>(CSegChain& d) { // order based on b (adaptor) coordinate
194     //return (score<d.score);
195     if (bstart==d.bstart && bend==d.bend) {
196     return (astart==d.astart)?(aend>d.aend):(astart>d.astart);
197     }
198     return (bstart==d.bstart)?(bend>d.bend):(bstart>d.bstart);
199     }
200     bool operator<(CSegChain& d) {
201     //return (score>d.score);
202     if (bstart==d.bstart && bend==d.bend) {
203     return (astart==d.astart)?(aend<d.aend):(astart<d.astart);
204     }
205     return (bstart==d.bstart)?(bend<d.bend):(bstart<d.bstart);
206     }
207     void addSegPair(CSegPair* segp) {
208     if (AddIfNew(segp)!=segp) return;
209     score+=segp->score;
210     if (astart>segp->a.start) astart=segp->a.start;
211     if (aend<segp->a.end) aend=segp->a.end;
212     if (bstart>segp->b.start) bstart=segp->b.start;
213     if (bend<segp->b.end) bend=segp->b.end;
214     }
215     //for building actual chains:
216     bool extendChain(CSegPair* segp) { //segp expected to be "Greater Than" current chain
217     int bgap=0;
218     int agap=0;
219     //if (endSort) {
220     if (bstart>segp->b.start) {
221     bgap = (int)(bstart-segp->b.end);
222     if (abs(bgap)>2) return false;
223     agap = (int)(astart-segp->a.end);
224     if (abs(agap)>2) return false;
225     }
226     else {
227     bgap = (int) (segp->b.start-bend);
228     if (abs(bgap)>2) return false;
229     agap = (int)(segp->a.start-aend);
230     if (abs(agap)>2) return false;
231     }
232     if (agap*bgap<0) return false;
233     addSegPair(segp);
234     score-=abs(agap)+abs(bgap);
235     return true;
236     }
237     };
238    
239 gpertea 4 // element in dhash:
240     class FqDupRec {
241     public:
242     int count; //how many of these reads are the same
243     int len; //length of qv
244     char* firstname; //optional, only if we want to keep the original read names
245     char* qv;
246     FqDupRec(GStr* qstr=NULL, const char* rname=NULL) {
247     len=0;
248     qv=NULL;
249     firstname=NULL;
250     count=0;
251     if (qstr!=NULL) {
252     qv=Gstrdup(qstr->chars());
253     len=qstr->length();
254     count++;
255     }
256     if (rname!=NULL) firstname=Gstrdup(rname);
257     }
258     ~FqDupRec() {
259     GFREE(qv);
260     GFREE(firstname);
261     }
262     void add(GStr& d) { //collapse another record into this one
263     if (d.length()!=len)
264     GError("Error at FqDupRec::add(): cannot collapse reads with different length!\n");
265     count++;
266     for (int i=0;i<len;i++)
267     qv[i]=(qv[i]+d[i])/2;
268     }
269     };
270    
271     void openfw(FILE* &f, GArgs& args, char opt) {
272     GStr s=args.getOpt(opt);
273     if (!s.is_empty()) {
274     if (s=='-') f=stdout;
275     else {
276     f=fopen(s,"w");
277     if (f==NULL) GError("Error creating file: %s\n", s.chars());
278     }
279     }
280     }
281    
282     #define FWCLOSE(fh) if (fh!=NULL && fh!=stdout) fclose(fh)
283     #define FRCLOSE(fh) if (fh!=NULL && fh!=stdin) fclose(fh)
284    
285     GHash<FqDupRec> dhash; //hash to keep track of duplicates
286    
287 gpertea 12 bool ntrim(GStr& rseq, int &l5, int &l3); //returns true if any trimming occured
288     bool qtrim(GStr& qvs, int &l5, int &l3); //return true if any trimming occured
289 gpertea 4 int dust(GStr& seq);
290     bool trim_adapter3(GStr& seq, int &l5, int &l3); //returns true if any trimming occured
291     bool trim_adapter5(GStr& seq, int &l5, int &l3); //returns true if any trimming occured
292    
293 gpertea 12 void convertPhred(char* q, int len);
294     void convertPhred(GStr& q);
295 gpertea 4
296     int main(int argc, char * const argv[]) {
297 gpertea 12 GArgs args(argc, argv, "YQDCVl:d:3:5:m:n:r:p:q:o:");
298 gpertea 4 int e;
299     int icounter=0; //counter for input reads
300     int outcounter=0; //counter for output reads
301     if ((e=args.isError())>0) {
302     GMessage("%s\nInvalid argument: %s\n", USAGE, argv[e]);
303     exit(224);
304     }
305 gpertea 9 debug=(args.getOpt('Y')!=NULL);
306 gpertea 12 debug=(args.getOpt('V')!=NULL);
307     convert_phred=(args.getOpt('Q')!=NULL);
308 gpertea 4 doCollapse=(args.getOpt('C')!=NULL);
309     doDust=(args.getOpt('D')!=NULL);
310 gpertea 12 /*
311 gpertea 4 rawFormat=(args.getOpt('R')!=NULL);
312     if (rawFormat) {
313     GError("Sorry, raw qseq format parsing is not implemented yet!\n");
314     }
315 gpertea 12 */
316 gpertea 4 prefix=args.getOpt('n');
317     GStr s=args.getOpt('l');
318     if (!s.is_empty())
319     min_read_len=s.asInt();
320 gpertea 12 s=args.getOpt('m');
321     if (!s.is_empty())
322     max_perc_N=s.asDouble();
323 gpertea 4 s=args.getOpt('d');
324     if (!s.is_empty()) {
325     dust_cutoff=s.asInt();
326     doDust=true;
327     }
328 gpertea 12 s=args.getOpt('q');
329     if (!s.is_empty()) {
330     qvtrim_min=s.asInt();
331     }
332     s=args.getOpt('p');
333     if (!s.is_empty()) {
334     int v=s.asInt();
335     if (v==33) {
336     qv_phredtype=33;
337     qv_cvtadd=31;
338     }
339     else if (v==64) {
340     qv_phredtype=64;
341     qv_cvtadd=-31;
342     }
343     else
344     GMessage("%s\nInvalid value for -p option (can only be 64 or 33)!\n",USAGE);
345     }
346 gpertea 4 if (args.getOpt('3')!=NULL) {
347     adapter3=args.getOpt('3');
348     adapter3.upper();
349     a3len=adapter3.length();
350     }
351     if (args.getOpt('5')!=NULL) {
352     adapter5=args.getOpt('5');
353     adapter5.upper();
354     a5len=adapter5.length();
355     }
356     trashReport= (args.getOpt('r')!=NULL);
357     if (args.startNonOpt()==0) {
358     GMessage(USAGE);
359     exit(224);
360     }
361    
362     openfw(f_out, args, 'o');
363 gpertea 10 if (f_out==NULL) f_out=stdout;
364 gpertea 4 if (trashReport)
365     openfw(freport, args, 'r');
366     char* infile=NULL;
367     while ((infile=args.nextNonOpt())!=NULL) {
368     GStr infname(infile);
369     if (strcmp(infile,"-")==0) {
370     f_in=stdin; infname="stdin"; }
371     else {
372     f_in=fopen(infile,"r");
373     if (f_in==NULL)
374     GError("Cannot open input file %s!\n",infile);
375     }
376     GLineReader fq(f_in);
377     char* l=NULL;
378     while ((l=fq.getLine())!=NULL) {
379     GStr rname; //current read name
380     GStr rseq; //current read sequence
381     GStr rqv; //current read quality values
382     GStr s;
383 gpertea 12 /* if (rawFormat) {
384 gpertea 4 //TODO: implement qseq parsing here
385     //if (raw type=N) then continue; //skip invalid/bad records
386    
387     } //raw qseq format
388 gpertea 12 else { // FASTQ or FASTA */
389 gpertea 4 isfasta=(l[0]=='>');
390     if (!isfasta && l[0]!='@') GError("Error: fastq record marker not detected!\n");
391     s=l;
392     rname=&(l[1]);
393     icounter++;
394     for (int i=0;i<rname.length();i++)
395     if (rname[i]<=' ') { rname.cut(i); break; }
396     //now get the sequence
397     if ((l=fq.getLine())==NULL)
398     GError("Error: unexpected EOF after header for %s\n",rname.chars());
399     rseq=l; //this must be the DNA line
400 gpertea 12 while ((l=fq.getLine())!=NULL) {
401     //seq can span multiple lines
402     if (l[0]=='>' || l[0]=='+') {
403 gpertea 4 fq.pushBack();
404     break; //
405     }
406     rseq+=l;
407 gpertea 12 } //check for multi-line seq
408     if (!isfasta) { //reading fastq quality values, which can also be multi-line
409     if ((l=fq.getLine())==NULL)
410 gpertea 4 GError("Error: unexpected EOF after sequence for %s\n", rname.chars());
411     if (l[0]!='+') GError("Error: fastq qv header marker not detected!\n");
412     if ((l=fq.getLine())==NULL)
413     GError("Error: unexpected EOF after qv header for %s\n", rname.chars());
414     rqv=l;
415 gpertea 12 //if (rqv.length()!=rseq.length())
416     // GError("Error: qv len != seq len for %s\n", rname.chars());
417     while (rqv.length()<rseq.length() && ((l=fq.getLine())!=NULL)) {
418     rqv+=l; //append to qv string
419     }
420     }// fastq
421     // } //<-- FASTA or FASTQ
422 gpertea 4 rseq.upper();
423     int l5=0;
424     int l3=rseq.length()-1;
425     if (l3-l5+1<min_read_len) {
426     if (trashReport) {
427     fprintf(freport, "%s\ts\t%s\n",rname.chars(), rseq.chars());
428     }
429     continue;
430     }
431 gpertea 12 if (ntrim(rseq, l5, l3)) { // N-trimming
432     //GMessage("before: %s\n",rseq.chars());
433     //GMessage("after : %s (%d)\n",rseq.substr(l5,l3-l5+1).chars(),l3-l5+1);
434 gpertea 4 if (l3-l5+1<min_read_len) {
435     if (trashReport) {
436     fprintf(freport, "%s\tN\t%s\n", rname.chars(), rseq.chars());
437     }
438     continue; //invalid read
439     }
440     //-- keep only the l5..l3 range
441     rseq=rseq.substr(l5, l3-l5+1);
442     if (!rqv.is_empty())
443     rqv=rqv.substr(l5, l3-l5+1);
444 gpertea 12 l5=0;
445     l3=rseq.length()-1;
446 gpertea 4 }
447 gpertea 12 if (qvtrim_min!=0 && !rqv.is_empty() && qtrim(rqv, l5, l3)) { // qv-threshold trimming
448     if (l3-l5+1<min_read_len) {
449     if (trashReport) {
450     fprintf(freport, "%s\tQ\t%s\n", rname.chars(), rseq.chars());
451     }
452     continue; //invalid read
453     }
454     //-- keep only the l5..l3 range
455     rseq=rseq.substr(l5, l3-l5+1);
456     if (!rqv.is_empty())
457     rqv=rqv.substr(l5, l3-l5+1);
458     } //qv trimming
459 gpertea 4 if (a3len>0) {
460     if (trim_adapter3(rseq, l5, l3)) {
461     if (l3-l5+1<min_read_len) {
462     if (trashReport) {
463     fprintf(freport, "%s\tA3\t%s\n",rname.chars(), rseq.chars());
464     }
465     continue;
466     }
467     //-- keep only the l5..l3 range
468     rseq=rseq.substr(l5, l3-l5+1);
469     if (!rqv.is_empty())
470     rqv=rqv.substr(l5, l3-l5+1);
471     }//some adapter was trimmed
472     } //adapter trimming
473     if (a5len>0) {
474     if (trim_adapter5(rseq, l5, l3)) {
475     if (l3-l5+1<min_read_len) {
476     if (trashReport) {
477     fprintf(freport, "%s\tA5\t%s\n",rname.chars(), rseq.chars());
478     }
479     continue;
480     }
481     //-- keep only the l5..l3 range
482     rseq=rseq.substr(l5, l3-l5+1);
483     if (!rqv.is_empty())
484     rqv=rqv.substr(l5, l3-l5+1);
485     }//some adapter was trimmed
486     } //adapter trimming
487     if (doCollapse) {
488     //keep read for later
489     FqDupRec* dr=dhash.Find(rseq.chars());
490     if (dr==NULL) { //new entry
491     //if (prefix.is_empty())
492     dhash.Add(rseq.chars(),
493     new FqDupRec(&rqv, rname.chars()));
494     //else dhash.Add(rseq.chars(), new FqDupRec(rqv.chars(),rqv.length()));
495     }
496     else
497     dr->add(rqv);
498     } //collapsing duplicates
499     else { //not collapsing duplicates
500     //do the dust filter now
501     if (doDust) {
502     int dustbases=dust(rseq);
503     if (dustbases>(rseq.length()>>1)) {
504     if (trashReport) {
505     fprintf(freport, "%s\tD\t%s\n",rname.chars(),rseq.chars());
506     }
507     continue;
508     }
509     }
510     //print this record here
511     outcounter++;
512     if (isfasta) {
513     if (prefix.is_empty())
514     fprintf(f_out, ">%s\n%s\n", rname.chars(), rseq.chars());
515     else
516     fprintf(f_out, ">%s%08d\n%s\n", prefix.chars(), outcounter,
517     rseq.chars());
518     }
519     else { //fastq
520 gpertea 12 if (convert_phred) convertPhred(rqv);
521 gpertea 4 if (prefix.is_empty())
522     fprintf(f_out, "@%s\n%s\n+\n%s\n", rname.chars(), rseq.chars(),rqv.chars());
523     else
524     fprintf(f_out, "@%s_%08d\n%s\n+\n%s\n", prefix.chars(), outcounter,
525     rseq.chars(),rqv.chars() );
526     }
527     } //not collapsing duplicates
528     } //for each fastq record
529     } //while each input file
530     FRCLOSE(f_in);
531     if (doCollapse) {
532     outcounter=0;
533     int maxdup_count=1;
534     char* maxdup_seq=NULL;
535     dhash.startIterate();
536     FqDupRec* qd=NULL;
537     char* seq=NULL;
538     while ((qd=dhash.NextData(seq))!=NULL) {
539     GStr rseq(seq);
540     //do the dusting here
541     if (doDust) {
542     int dustbases=dust(rseq);
543     if (dustbases>(rseq.length()>>1)) {
544     if (trashReport && qd->firstname!=NULL) {
545     fprintf(freport, "%s_x%d\tD\t%s\n",qd->firstname, qd->count,seq);
546     }
547     continue;
548     }
549     }
550     outcounter++;
551     if (qd->count>maxdup_count) {
552     maxdup_count=qd->count;
553     maxdup_seq=seq;
554     }
555     if (isfasta) {
556     if (prefix.is_empty()) {
557 gpertea 10 fprintf(f_out, ">%s_x%d\n%s\n", qd->firstname, qd->count,
558 gpertea 4 rseq.chars());
559     }
560     else { //use custom read name
561 gpertea 10 fprintf(f_out, ">%s%08d_x%d\n%s\n", prefix.chars(), outcounter,
562 gpertea 4 qd->count, rseq.chars());
563     }
564     }
565     else { //fastq format
566 gpertea 12 if (convert_phred) convertPhred(qd->qv, qd->len);
567 gpertea 4 if (prefix.is_empty()) {
568     fprintf(f_out, "@%s_x%d\n%s\n+\n%s\n", qd->firstname, qd->count,
569     rseq.chars(), qd->qv);
570     }
571     else { //use custom read name
572     fprintf(f_out, "@%s%08d_x%d\n%s\n+\n%s\n", prefix.chars(), outcounter,
573     qd->count, rseq.chars(), qd->qv);
574     }
575     }
576     }//for each element of dhash
577     if (maxdup_count>1) {
578     GMessage("Maximum read multiplicity: x %d (read: %s)\n",maxdup_count, maxdup_seq);
579     }
580     } //report collapsed dhash entries
581     GMessage("Number of input reads: %9d\n", icounter);
582     GMessage(" Output records: %9d\n", outcounter);
583     if (trashReport) {
584     FWCLOSE(freport);
585     }
586    
587     FWCLOSE(f_out);
588 gpertea 12 //getc(stdin);
589 gpertea 4 }
590    
591     class NData {
592     public:
593     int NPos[1024]; //there should be no reads longer than 1K ?
594     int NCount;
595     int end5;
596     int end3;
597     int seqlen;
598     double perc_N; //percentage of Ns in end5..end3 range only!
599     const char* seq;
600     bool valid;
601     NData() {
602     NCount=0;
603     end5=0;
604     end3=0;
605     seq=NULL;
606     perc_N=0;
607     valid=true;
608     }
609     void init(GStr& rseq) {
610     NCount=0;
611     perc_N=0;
612     seqlen=rseq.length();
613     seq=rseq.chars();
614     end5=1;
615     end3=seqlen;
616     for (int i=0;i<seqlen;i++)
617 gpertea 12 if (seq[i]=='N') {// if (!ichrInStr(rseq[i], "ACGT")
618 gpertea 4 NPos[NCount]=i;
619     NCount++;
620     }
621     perc_N=(NCount*100.0)/seqlen;
622     valid=true;
623     }
624     void N_calc() { //only in the region after trimming
625     int n=0;
626     for (int i=end3-1;i<end5;i++) {
627     if (seq[i]=='N') n++;
628     }
629     perc_N=(n*100.0)/(end5-end3+1);
630     }
631     };
632    
633     static NData feat;
634     int perc_lenN=12; // incremental distance from ends, in percentage of
635     // sequence length, where N-trimming is done (default:12 %) (autolimited to 20)
636    
637     void N_analyze(int l5, int l3, int p5, int p3) {
638     /* assumes feat was filled properly */
639     int old_dif, t5,t3,v;
640 gpertea 12 if (l3<l5+2 || p5>p3 ) {
641 gpertea 4 feat.end5=l5+1;
642     feat.end3=l3+1;
643     return;
644     }
645    
646     t5=feat.NPos[p5]-l5;
647     t3=l3-feat.NPos[p3];
648     old_dif=p3-p5;
649     v=(int)((((double)(l3-l5))*perc_lenN)/100);
650     if (v>20) v=20; /* enforce N-search limit for very long reads */
651     if (t5 < v ) {
652     l5=feat.NPos[p5]+1;
653     p5++;
654     }
655     if (t3 < v) {
656     l3=feat.NPos[p3]-1;
657     p3--;
658     }
659     /* restNs=p3-p5; number of Ns in the new CLR */
660     if (p3-p5==old_dif) { /* no change, return */
661     /* don't trim if this may shorten the read too much */
662     //if (l5-l3<min_read_len) return;
663     feat.end5=l5+1;
664     feat.end3=l3+1;
665     return;
666     }
667     else
668     N_analyze(l5,l3, p5,p3);
669     }
670    
671    
672 gpertea 12 bool qtrim(GStr& qvs, int &l5, int &l3) {
673     if (qvtrim_min==0 || qvs.is_empty()) return false;
674     if (qv_phredtype==0) {
675     //try to guess the Phred type
676     int vmin=256, vmax=0;
677     for (int i=0;i<qvs.length();i++) {
678     if (vmin>qvs[i]) vmin=qvs[i];
679     if (vmax<qvs[i]) vmax=qvs[i];
680     }
681     if (vmin<64) { qv_phredtype=33; qv_cvtadd=31; }
682     if (vmax>95) { qv_phredtype=64; qv_cvtadd=-31; }
683     if (qv_phredtype==0) {
684     GError("Error: couldn't determine Phred type, please use the -p33 or -p64 !\n");
685     }
686     } //guessing the Phred type
687     for (l3=qvs.length()-1;l3>2;l3--) {
688     if (qvs[l3]-qv_phredtype>=qvtrim_min && qvs[l3-1]-qv_phredtype>=qvtrim_min) break;
689     }
690     //just in case, check also the 5' the end (?)
691     for (l5=0;l5<qvs.length()-3;l5++) {
692     if (qvs[l5]-qv_phredtype>=qvtrim_min && qvs[l5+1]-qv_phredtype>=qvtrim_min) break;
693     }
694     return (l5>0 || l3<qvs.length()-1);
695     }
696    
697     bool ntrim(GStr& rseq, int &l5, int &l3) {
698     //count Ns in the sequence, trim N-rich ends
699 gpertea 4 feat.init(rseq);
700     l5=feat.end5-1;
701     l3=feat.end3-1;
702     N_analyze(feat.end5-1, feat.end3-1, 0, feat.NCount-1);
703 gpertea 12 if (l5==feat.end5-1 && l3==feat.end3-1) {
704     if (feat.perc_N>max_perc_N) {
705     feat.valid=false;
706     l3=l5+1;
707     return true;
708     }
709     else {
710     return false; //nothing changed
711     }
712     }
713 gpertea 4 l5=feat.end5-1;
714     l3=feat.end3-1;
715     if (l3-l5+1<min_read_len) {
716     feat.valid=false;
717     return true;
718     }
719     feat.N_calc();
720 gpertea 12
721     if (feat.perc_N>max_perc_N) {
722 gpertea 4 feat.valid=false;
723     l3=l5+1;
724     return true;
725     }
726     return true;
727     }
728    
729     //--------------- dust functions ----------------
730     class DNADuster {
731     public:
732     int dustword;
733     int dustwindow;
734     int dustwindow2;
735     int dustcutoff;
736     int mv, iv, jv;
737     int counts[32*32*32];
738     int iis[32*32*32];
739     DNADuster(int cutoff=16, int winsize=32, int wordsize=3) {
740     dustword=wordsize;
741     dustwindow=winsize;
742     dustwindow2 = (winsize>>1);
743     dustcutoff=cutoff;
744     mv=0;
745     iv=0;
746     jv=0;
747     }
748     void setWindowSize(int value) {
749     dustwindow = value;
750     dustwindow2 = (dustwindow >> 1);
751     }
752     void setWordSize(int value) {
753     dustword=value;
754     }
755     void wo1(int len, const char* s, int ivv) {
756     int i, ii, j, v, t, n, n1, sum;
757     int js, nis;
758     n = 32 * 32 * 32;
759     n1 = n - 1;
760     nis = 0;
761     i = 0;
762     ii = 0;
763     sum = 0;
764     v = 0;
765     for (j=0; j < len; j++, s++) {
766     ii <<= 5;
767     if (*s<=32) {
768     i=0;
769     continue;
770     }
771     ii |= *s - 'A'; //assume uppercase!
772     ii &= n1;
773     i++;
774     if (i >= dustword) {
775     for (js=0; js < nis && iis[js] != ii; js++) ;
776     if (js == nis) {
777     iis[nis] = ii;
778     counts[ii] = 0;
779     nis++;
780     }
781     if ((t = counts[ii]) > 0) {
782     sum += t;
783     v = 10 * sum / j;
784     if (mv < v) {
785     mv = v;
786     iv = ivv;
787     jv = j;
788     }
789     }
790     counts[ii]++;
791     }
792     }
793     }
794    
795     int wo(int len, const char* s, int* beg, int* end) {
796     int i, l1;
797     l1 = len - dustword + 1;
798     if (l1 < 0) {
799     *beg = 0;
800     *end = len - 1;
801     return 0;
802     }
803     mv = 0;
804     iv = 0;
805     jv = 0;
806     for (i=0; i < l1; i++) {
807     wo1(len-i, s+i, i);
808     }
809     *beg = iv;
810     *end = iv + jv;
811     return mv;
812     }
813    
814     void dust(const char* seq, char* seqmsk, int seqlen, int cutoff=0) { //, maskFunc maskfn) {
815     int i, j, l, a, b, v;
816     if (cutoff==0) cutoff=dustcutoff;
817     a=0;b=0;
818     //GMessage("Dust cutoff=%d\n", cutoff);
819     for (i=0; i < seqlen; i += dustwindow2) {
820     l = (seqlen > i+dustwindow) ? dustwindow : seqlen-i;
821     v = wo(l, seq+i, &a, &b);
822     if (v > cutoff) {
823     //for (j = a; j <= b && j < dustwindow2; j++) {
824     for (j = a; j <= b; j++) {
825     seqmsk[i+j]='N';//could be made lowercase instead
826     }
827     }
828     }
829     //return first;
830     }
831     };
832    
833     static DNADuster duster;
834    
835     int dust(GStr& rseq) {
836     char* seq=Gstrdup(rseq.chars());
837     duster.dust(rseq.chars(), seq, rseq.length(), dust_cutoff);
838     //check the number of Ns:
839     int ncount=0;
840     for (int i=0;i<rseq.length();i++) {
841     if (seq[i]=='N') ncount++;
842     }
843     GFREE(seq);
844     return ncount;
845     }
846    
847 gpertea 12
848     // ------------------ adapter matching - simple k-mer seed & extend, no indels for now
849     //when a k-mer match is found, simply try to extend the alignment using a drop-off scheme
850     //check minimum score and
851     //for 3' adapter trimming:
852 gpertea 4 // require that the right end of the alignment for either the adaptor OR the read must be
853     // < 3 distance from its right end
854     // for 5' adapter trimming:
855     // require that the left end of the alignment for either the adaptor OR the read must
856 gpertea 12 // be at coordinate < 3 from start
857 gpertea 4
858     bool extendMatch(const char* a, int alen, int ai,
859 gpertea 12 const char* b, int blen, int bi, int mlen, int& l5, int& l3, CSegChain& segs, bool end5=false) {
860 gpertea 4 //so the alignment starts at ai in a, bi in b, with a perfect match of length mlen
861 gpertea 12 #ifdef DEBUG
862     GStr dbg(b);
863     #endif
864     //if (debug) {
865     // GMessage(">> in %s\n\textending hit: %s at position %d\n", a, (dbg.substr(bi, mlen)).chars(), ai);
866     // }
867 gpertea 4 int a_l=ai; //alignment coordinates on a
868     int a_r=ai+mlen-1;
869     int b_l=bi; //alignment coordinates on b
870     int b_r=bi+mlen-1;
871     int ai_maxscore=ai;
872     int bi_maxscore=bi;
873     int score=mlen*a_m_score;
874     int maxscore=score;
875 gpertea 9 int mism5score=a_mis_score;
876     if (end5 && ai<(alen>>1)) mism5score-=2; // increase penalty for mismatches at 5' end
877 gpertea 4 //try to extend to the left first, if possible
878     while (ai>0 && bi>0) {
879     ai--;
880     bi--;
881     score+= (a[ai]==b[bi])? a_m_score : mism5score;
882     if (score>maxscore) {
883     ai_maxscore=ai;
884     bi_maxscore=bi;
885     maxscore=score;
886     }
887     else if (maxscore-score>a_dropoff_score) break;
888     }
889     a_l=ai_maxscore;
890     b_l=bi_maxscore;
891 gpertea 9 //if (debug) GMessage(" after l-extend: %*s%s\t\t(score=%d)\n",a_l," ",dbg.substr(b_l,b_r-b_l+1).chars(),maxscore);
892 gpertea 4 //now extend to the right
893     ai_maxscore=a_r;
894     bi_maxscore=b_r;
895     ai=a_r;
896     bi=b_r;
897     score=maxscore;
898     //sometimes there are extra AAAAs at the end of the read, ignore those
899     if (strcmp(&a[alen-4],"AAAA")==0) {
900     alen-=3;
901     while (a[alen-1]=='A' && alen>ai) alen--;
902     }
903     while (ai<alen-1 && bi<blen-1) {
904     ai++;
905     bi++;
906     //score+= (a[ai]==b[bi])? a_m_score : a_mis_score;
907     if (a[ai]==b[bi]) { //match
908     score+=a_m_score;
909     if (ai>=alen-2) {
910     score+=a_m_score-(alen-ai-1);
911     }
912     }
913     else { //mismatch
914     score+=a_mis_score;
915     }
916     if (score>maxscore) {
917     ai_maxscore=ai;
918     bi_maxscore=bi;
919     maxscore=score;
920     }
921     else if (maxscore-score>a_dropoff_score) break;
922     }
923     a_r=ai_maxscore;
924     b_r=bi_maxscore;
925 gpertea 9 int a_ovh3=alen-a_r-1;
926     int b_ovh3=blen-b_r-1;
927     int mmovh3=(a_ovh3<b_ovh3)? a_ovh3 : b_ovh3;
928     int mmovh5=(a_l<b_l)? a_l : b_l;
929     //if (debug) GMessage(" after r-extend: %*s%s\t\t(score=%d)\n",a_l," ",dbg.substr(b_l,b_r-b_l+1).chars(),maxscore);
930 gpertea 12 #ifdef DEBUG
931     if (debug) GMessage(" extended to: %*s\n",a_r+1,dbg.substr(b_l,b_r-b_l+1).chars());
932     #endif
933 gpertea 9 if (maxscore>=a_min_score && mmovh3<2 && mmovh5<2) {
934     if (a_l<a_ovh3) {
935 gpertea 4 //adapter closer to the left end (typical for 5' adapter)
936     l5=a_r+1;
937     l3=alen-1;
938     }
939     else {
940     //adapter matching at the right end (typical for 3' adapter)
941     l5=0;
942     l3=a_l-1;
943     }
944     return true;
945     }
946 gpertea 12 else { //keep this segment pair for later (gapped alignment)
947     segs.addSegPair(new CSegPair(a_l+1, a_r+1, b_l+1, b_r+1, maxscore));
948     //this will also update min & max coordinates in segs (segs.astart, .aend, .bstart, .bend)
949     }
950 gpertea 11 //do not trim:
951     l5=0;
952     l3=alen-1;
953 gpertea 4 return false;
954     }
955    
956 gpertea 12 /*
957     int getWordValue(const char* s, int wlen) {
958     int r=0;
959     while (wlen--) { r+=(((int)s[wlen])<<wlen) }
960     return r;
961     }
962     */
963     int get3mer_value(const char* s) {
964     return (s[0]<<16)+(s[1]<<8)+s[2];
965     }
966    
967     int w3_match(int qv, const char* str, int slen, int start_index=0) {
968     if (start_index>=slen || start_index<0) return -1;
969     for (int i=start_index;i<slen-3;i++) {
970     int rv=get3mer_value(str+i);
971     if (rv==qv) return i;
972     }
973     return -1;
974     }
975    
976     int w3_rmatch(int qv, const char* str, int slen, int end_index=-1) {
977     if (end_index>=slen) return -1;
978     if (end_index<0) end_index=slen-1;
979     for (int i=end_index-2;i>=0;i--) {
980     int rv=get3mer_value(str+i);
981     if (rv==qv) return i;
982     }
983     return -1;
984     }
985    
986     int fast4match(int32 qv, const char* str, int slen, int start_index=0) {
987     if (start_index>=slen || start_index<0) return -1;
988     for (int i=start_index;i<slen-4;i++) {
989     int32* rv=(int32*)(str+i);
990     if (*rv==qv) return i;
991     }
992     return -1;
993     }
994    
995     int fast4rmatch(int32 qv, const char* str, int slen, int end_index=-1) {
996     if (end_index>=slen) return -1;
997     if (end_index<0) end_index=slen-1;
998     for (int i=end_index-3;i>=0;i--) {
999     int32* rv=(int32*)(str+i);
1000     if (*rv==qv) return i;
1001     }
1002     return -1;
1003     }
1004    
1005     #ifdef DEBUG
1006     void dbgPrintChain(CSegChain& chain, const char* aseq) {
1007     GStr s(aseq);
1008     for (int i=0;i<chain.Count();i++) {
1009     CSegPair& seg=*chain[i];
1010     GMessage(" dbg chain seg%d: %*s [%d-%d:%d-%d]\n",i,seg.a.start-1+seg.len(),
1011     s.substr(seg.b.start-1, seg.len()).chars(), seg.b.start,seg.b.end,seg.a.start,seg.a.end);
1012     }
1013     }
1014     #endif
1015    
1016 gpertea 4 bool trim_adapter3(GStr& seq, int&l5, int &l3) {
1017     int rlen=seq.length();
1018     l5=0;
1019     l3=rlen-1;
1020     //first try a full match, we might get lucky
1021     int fi=-1;
1022     if ((fi=seq.index(adapter3))>=0) {
1023     if (fi<rlen-fi-a3len) {//match is closer to the right end
1024     l5=fi+a3len;
1025     l3=rlen-1;
1026     }
1027     else {
1028     l5=0;
1029 gpertea 12 l3=fi-1;
1030 gpertea 4 }
1031     return true;
1032     }
1033 gpertea 12 #ifdef DEBUG
1034     if (debug) GMessage(">TRIM3 >> Read: %s\n",seq.chars());
1035     #endif
1036    
1037 gpertea 4 //also, for fast detection of other adapter-only reads that start past
1038     // the beginning of the adapter sequence, try to see if the first a3len-4
1039 gpertea 12 // bases of the read are a substring of the adapter
1040     if (rlen>a3len-3) {
1041     GStr rstart=seq.substr(1,a3len-4);
1042     if ((fi=adapter3.index(rstart))>=0) {
1043     l3=rlen-1;
1044     l5=a3len-4;
1045     while (fi+l5<a3len && l5<l3 && adapter3[fi+l5]==seq[l5]) l5++;
1046     return true;
1047     }
1048     }
1049     CSegChain a3segs; //no chains here, just an ordered collection of segment pairs
1050     //check the easy cases - 11 bases exact match at the end
1051     int fdlen=11;
1052     if (a3len<16) {
1053     fdlen=a3len>>1;
1054 gpertea 4 }
1055 gpertea 12 if (fdlen>4) {
1056     //check if we're lucky enough to have the last 11 bases of the read a part of the adapter
1057     GStr rstart=seq.substr(-fdlen-3,fdlen);
1058     if ((fi=adapter3.index(rstart))>=0) {
1059     #ifdef DEBUG
1060     if (debug) GMessage(" W11match found: %*s\n", rlen-3, (adapter3.substr(fi,fdlen)).chars());
1061     #endif
1062     if (extendMatch(seq.chars(), rlen, rlen-fdlen-3,
1063     adapter3.chars(), a3len, fi, fdlen, l5,l3, a3segs))
1064     return true;
1065     }
1066     //another easy case: first 11 characters of the adaptor found as a substring of the read
1067     GStr bstr=adapter3.substr(0, fdlen);
1068     if ((fi=seq.rindex(bstr))>=0) {
1069     #ifdef DEBUG
1070     if (debug) GMessage(" A11match found: %*s\n", fi+fdlen, bstr.chars());
1071     #endif
1072     if (extendMatch(seq.chars(), rlen, fi,
1073     adapter3.chars(), a3len, 0, fdlen, l5,l3, a3segs))
1074     return true;
1075     }
1076     } //tried to match 11 bases first
1077    
1078     //no easy cases, so let's do the wmer hashing for the first 12 bases of the adaptor
1079     //-- only extend if the match is in the 3' (ending) region of the read
1080     int wordSize=3;
1081     int hlen=12;
1082     if (hlen>a3len-wordSize) hlen=a3len-wordSize;
1083     int imin=rlen>>1; //last half of the read, left boundary for the wmer match
1084     if (imin<a3len) { imin=GMIN(a3len, rlen-wordSize); }
1085     imin=rlen-imin;
1086     for (int iw=0;iw<hlen;iw++) {
1087     //int32* qv=(int32*)(adapter3.chars()+iw);
1088     int qv=get3mer_value(adapter3.chars()+iw);
1089     fi=-1;
1090     //while ((fi=fast4rmatch(*qv, seq.chars(), rlen, fi))>=0 && fi>=imin) {
1091     while ((fi=w3_rmatch(qv, seq.chars(), rlen, fi))>=0 && fi>=imin) {
1092     //GMessage(" ... fi=%d after w3_rmatch() (imin=%d)\n", fi, imin);
1093    
1094     #ifdef DEBUG
1095     if (debug) GMessage(" Wmatch found: %*s\n", fi+wordSize, (adapter3.substr(iw,wordSize)).chars());
1096     #endif
1097     if (extendMatch(seq.chars(), rlen, fi, adapter3.chars(),
1098     a3len, iw, wordSize, l5,l3, a3segs)) return true;
1099     fi--;
1100     if (fi<imin) break;
1101     }
1102     } //for each wmer in the first hlen bases of the adaptor
1103     /*
1104     //couldn't find a good trimming extension, hash 12 more bases of the adapter to collect more segment pairs there
1105     //but only do this if we already have segment pairs collected in the last 12 bases of the adapter
1106     if (a3segs.bstart>3 || a3segs.bend<(uint)(hlen-wordSize)) return false;
1107     int hlen2=a3len-wordSize;
1108     //if (hlen2>a3len-4) hlen2=a3len-4;
1109     if (hlen2>hlen) {
1110     #ifdef DEBUG
1111     if (debug && a3segs.Count()>0) {
1112     GMessage(" >>>>>2nd. hash: %s\n",seq.chars());
1113     }
1114     #endif
1115     for (int iw=hlen;iw<hlen2;iw++) {
1116     //int* qv=(int32 *)(adapter3.chars()+iw);
1117     int qv=get3mer_value(adapter3.chars()+iw);
1118     fi=-1;
1119     //while ((fi=fast4rmatch(*qv, seq.chars(), rlen, fi))>=0 && fi>=imin) {
1120     while ((fi=w3_rmatch(qv, seq.chars(), rlen, fi))>=0 && fi>=imin) {
1121     extendMatch(seq.chars(), rlen, fi, adapter3.chars(),
1122     a3len, iw, wordSize, l5,l3, a3segs);
1123     fi--;
1124     if (fi<imin) break;
1125     }
1126     } //for each wmer between hlen2 and hlen bases of the adaptor
1127     }
1128     //lastly, analyze collected a3segs for a possible gapped alignment:
1129     GList<CSegChain> segchains(false,true,false);
1130     #ifdef DEBUG
1131     if (debug && a3segs.Count()>0) {
1132     GMessage(">>>>>>>>> Read: %s\n",seq.chars());
1133 gpertea 4 }
1134 gpertea 12 #endif
1135     for (int i=0;i<a3segs.Count();i++) {
1136     if (a3segs[i]->chain==NULL) {
1137     if (a3segs[i]->b.start>3) continue; //don't start a hopeless chain
1138     CSegChain* newchain=new CSegChain();
1139     newchain->setFreeItem(false);
1140     newchain->addSegPair(a3segs[i]);
1141     a3segs[i]->chain=newchain;
1142     segchains.Add(newchain); //just to free them when done
1143     }
1144     for (int j=i+1;j<a3segs.Count();j++) {
1145     CSegChain* chain=a3segs[i]->chain;
1146     if (chain->extendChain(a3segs[j])) {
1147     a3segs[j]->chain=chain;
1148     #ifdef DEBUG
1149     if (debug) dbgPrintChain(*chain, adapter3.chars());
1150     #endif
1151     //save time by checking here if the extended chain is already acceptable for trimming
1152     if (chain->aend>(uint)(rlen-4) && chain->bstart<4 && chain->score>a_min_chain_score) {
1153     l5=0;
1154     l3=chain->astart-2;
1155     #ifdef DEBUG
1156     if (debug && a3segs.Count()>0) {
1157     GMessage(">>> >> trimmed-3: %*s\n",l3-l5+1,seq.substr(l5,l3-l5+1).chars());
1158     }
1159     #endif
1160     return true;
1161     }
1162     } //chain can be extended
1163 gpertea 4 }
1164 gpertea 12 } //collect segment alignments into chains
1165     */
1166 gpertea 4 return false; //no adapter parts found
1167     }
1168    
1169     bool trim_adapter5(GStr& seq, int&l5, int &l3) {
1170 gpertea 9 //if (debug) GMessage("trim_adapter5 on: %s\n", seq.chars());
1171 gpertea 4 int rlen=seq.length();
1172     l5=0;
1173     l3=rlen-1;
1174     //try to see if adapter is fully included in the read
1175     int fi=-1;
1176     if ((fi=seq.index(adapter5))>=0) {
1177     if (fi<rlen-fi-a5len) {//match is closer to the right end
1178     l5=fi+a5len;
1179     l3=rlen-1;
1180     }
1181     else {
1182     l5=0;
1183     l3=fi-1;
1184     }
1185     return true;
1186     }
1187 gpertea 12 #ifdef DEBUG
1188     if (debug) GMessage(">TRIM5 >> Read: %s\n",seq.chars());
1189     #endif
1190    
1191     CSegChain a5segs(true); //list of segment pairs to analyze later if no extendMatch succeeded
1192    
1193     //try the easy way out first - look for an exact match of 11 bases
1194     int fdlen=11;
1195     if (a5len<16) {
1196     fdlen=a5len>>1;
1197 gpertea 4 }
1198 gpertea 12 if (fdlen>4) {
1199     GStr rstart=seq.substr(1,fdlen); //skip the first base as it's sometimes bogus
1200     if ((fi=adapter5.index(rstart))>=0) {
1201     #ifdef DEBUG
1202     if (debug) GMessage(" W11match found: %*s\n", 1+fdlen, (adapter3.substr(fi,fdlen)).chars());
1203     #endif
1204     if (extendMatch(seq.chars(), rlen, 1,
1205     adapter5.chars(), a5len, fi, fdlen, l5,l3, a5segs, true))
1206     return true;
1207     }
1208     //another easy case: last 11 characters of the adaptor found as a substring of the read
1209     GStr bstr=adapter5.substr(-fdlen);
1210     if ((fi=seq.index(bstr))>=0) {
1211     #ifdef DEBUG
1212     if (debug) GMessage(" A11match found: %*s\n", fi+fdlen, bstr.chars());
1213     #endif
1214     if (extendMatch(seq.chars(), rlen, fi,
1215     adapter5.chars(), a5len, a5len-fdlen, fdlen, l5,l3,a5segs,true))
1216     return true;
1217     }
1218     } //tried to matching at most 11 bases first
1219    
1220     //-- no easy cases, do the wmer hashing for the last 12 bases of the adaptor
1221     //-- only extend a wmer if it matches in the 5' (beginning) region of the read
1222     int wordSize=3;
1223     int hlen=12;
1224     if (hlen>a5len-wordSize) hlen=a5len-wordSize;
1225     int imax=rlen>>1; //first half of the read, right boundary for the wmer match
1226     if (imax<a5len) { imax=GMIN(a5len, rlen-wordSize); }
1227     for (int iw=0;iw<=hlen;iw++) {
1228     int apstart=a5len-iw-wordSize;
1229     fi=0;
1230     //int* qv=(int32 *)(adapter5.chars()+apstart);
1231     int qv=get3mer_value(adapter5.chars()+apstart);
1232     //while ((fi=fast4match(*qv, seq.chars(), rlen, fi))>=0 && fi<=imax) {
1233     while ((fi=w3_match(qv, seq.chars(), rlen, fi))>=0 && fi<=imax) {
1234     #ifdef DEBUG
1235     if (debug) GMessage(" Wmatch found: %*s\n", fi+wordSize, (adapter5.substr(apstart,wordSize)).chars());
1236     #endif
1237     if (extendMatch(seq.chars(), rlen, fi, adapter5.chars(),
1238     a5len, apstart, wordSize, l5,l3, a5segs, true)) return true;
1239     fi++;
1240     if (fi>imax) break;
1241     }
1242     } //for each wmer in the last hlen bases of the adaptor
1243     /*
1244    
1245     //couldn't find a good trimming extension, hash 12 more bases of the adapter to collect more segment pairs there
1246     //but only do this if we already have segment pairs collected in the last 12 bases of the adapter
1247     if (a5segs.bend<(uint)(a5len-3) || a5segs.bstart>(uint)(a5len-hlen+4)) return false;
1248     int hlen2=a5len-wordSize;
1249     //if (hlen2>a5len-wordSize) hlen2=a5len-wordSize;
1250     #ifdef DEBUG
1251     if (debug && a5segs.Count()>0) {
1252     GMessage(" >>>>>2nd. hash: %s\n",seq.chars());
1253     }
1254     #endif
1255     if (hlen2>hlen) {
1256     for (int iw=hlen+1;iw<=hlen2;iw++) {
1257     int apstart=a5len-iw-wordSize;
1258     fi=0;
1259     //int* qv=(int32 *)(adapter5.chars()+apstart);
1260     int qv=get3mer_value(adapter5.chars()+apstart);
1261     //while ((fi=fast4match(*qv, seq.chars(), rlen, fi))>=0 && fi<=imax) {
1262     while ((fi=w3_match(qv, seq.chars(), rlen, fi))>=0 && fi<=imax) {
1263     extendMatch(seq.chars(), rlen, fi, adapter5.chars(),
1264     a5len, apstart, wordSize, l5,l3, a5segs, true);
1265     fi++;
1266     if (fi>imax) break;
1267     }
1268     } //for each wmer between hlen2 and hlen bases of the adaptor
1269     }
1270     if (a5segs.bend<(uint)(a5len-3) || a5segs.bstart>(uint)(a5len-hlen+4)) return false;
1271     // lastly, analyze collected a5segs for a possible gapped alignment:
1272     GList<CSegChain> segchains(false,true,false);
1273     #ifdef DEBUG
1274     if (debug && a5segs.Count()>0) {
1275     GMessage(">>>>>>>>> Read: %s\n",seq.chars());
1276 gpertea 4 }
1277 gpertea 12 #endif
1278     for (int i=0;i<a5segs.Count();i++) {
1279     if (a5segs[i]->chain==NULL) {
1280     if (a5segs[i]->b.end<(int)(a5len-4)) continue; //don't start a hopeless chain
1281     CSegChain* newchain=new CSegChain(true);
1282     newchain->setFreeItem(false);
1283     newchain->addSegPair(a5segs[i]);
1284     a5segs[i]->chain=newchain;
1285     segchains.Add(newchain); //just to free them when done
1286     }
1287     for (int j=i+1;j<a5segs.Count();j++) {
1288     CSegChain* chain=a5segs[i]->chain;
1289     if (chain->extendChain(a5segs[j])) {
1290     a5segs[j]->chain=chain;
1291     #ifdef DEBUG
1292     if (debug) dbgPrintChain(*chain, adapter5.chars());
1293     #endif
1294     //save time by checking here if the extended chain is already acceptable for trimming
1295     if (chain->bend>(uint)(a5len-3) && chain->astart<4 && chain->score>a_min_chain_score) {
1296     l5=chain->aend;
1297     l3=rlen-1;
1298     return true;
1299     }
1300     } //chain can be extended
1301 gpertea 4 }
1302 gpertea 12 } //collect segment alignments into chains
1303     */
1304 gpertea 4 return false; //no adapter parts found
1305     }
1306    
1307 gpertea 12 //convert qvs to/from phred64 from/to phread33
1308     void convertPhred(GStr& q) {
1309     for (int i=0;i<q.length();i++) q[i]+=qv_cvtadd;
1310 gpertea 4 }
1311    
1312 gpertea 12 void convertPhred(char* q, int len) {
1313     for (int i=0;i<len;i++) q[i]+=qv_cvtadd;
1314 gpertea 4 }