ViewVC Help
View File | Revision Log | Show Annotations | View Changeset | Root Listing
root/gclib/fqtrim/fqtrim.cpp
(Generate patch)
# Line 4 | Line 4
4   #include "GList.hh"
5  
6   #define USAGE "Usage:\n\
7 < fqtrim [-5 <5'adapter>] [-3 <3'adapter>] [-l <minlen>] [-C] [-D] [-Q] \\\n\
8 <    [-n <rename_prefix>] [-o <trimmed.fq>] [-r <discarded.lst>] <input.fq>\n\
7 > fqtrim [-5 <5'adapter>] [-3 <3'adapter>] [-l <minlen>] [-q <minqv>] [-C] [-D]\\\n\
8 >   [-p {64|33}] [-n <rename_prefix>] [-o <trimmed.fq>] [-r <discarded.lst>]\\\n\
9 >   [-Q] <input.fq>\n\
10   \n\
11 < Trim low quality bases at 3' end, optional adapter, filter for low complexity and\n\
12 < optionally collapse duplicates in the input fastq data.\n\
11 > Trim low quality bases at 3' end, optionally trim adapter sequence, filter\n\
12 > for low complexity and collapse duplicate reads\n\
13   \n\
14   Options:\n\
15   -n  rename all the reads using the <prefix> followed by a read counter;\n\
# Line 19 | Line 20
20      (e.g. -5 CGACAGGTTCAGAGTTCTACAGTCCGACGATC)\n\
21   -3  trim the given adapter sequence at the 3' end of each read\n\
22      (e.g. -3 TCGTATGCCGTCTTCTGCTTG)\n\
23 < -l  minimum \"clean\" length at the high quality 5'end that a read must\n\
24 <    have in order to pass the filter (default: 16)\n\
23 > -q  trim bases with quality value lower than <minq> (starting the 3' end)\n\
24 > -m  maximum percentage of Ns allowed in a read after trimming (default 7)\n\
25 > -l  minimum \"clean\" length after trimming that a read must have\n\
26 >    in order to pass the filter (default: 16)\n\
27   -r  write a simple \"trash report\" file listing the discarded reads with a\n\
28      one letter code for the reason why they were trashed\n\
29 + -D  apply a low-complexity (dust) filter and discard any read that has over\n\
30 +    50% of its length detected as low complexity\n\
31   -C  collapse duplicate reads and add a prefix with their count to the read\n\
32 <    name\n\
33 < -D  apply a low-complexity (dust) filter and discard any read that has at\n\
34 <    least half of it masked by this\n\
35 < -Q  quality values in the input data are interpreted as phred64, convert\n\
31 <    them to phred33\n\
32 >    name (e.g. for microRNAs)\n\
33 > -p  input is phred64/phred33 (use -p64 or -p33)\n\
34 > -Q  convert quality values to the other Phred qv type\n\
35 > -V  verbose processing\n\
36   "
37   // example 3' adapter for miRNAs: TCGTATGCCGTCTTCTGCTTG
38 +
39 + //For pair ends sequencing:
40 + //3' : ACACTCTTTCCCTACACGACGCTCTTCCGATCT
41 + //5' : GATCGGAAGAGCGGTTCAGCAGGAATGCCGAG
42   FILE* f_out=NULL; //stdout if not provided
43   FILE* f_in=NULL; //input fastq (stdin if not provided)
44   FILE* freport=NULL;
45   bool debug=false;
46 + bool verbose=false;
47   bool doCollapse=false;
48   bool doDust=false;
49   bool trashReport=false;
50 < bool rawFormat=false;
50 > //bool rawFormat=false;
51   int min_read_len=16;
52 + double max_perc_N=7.0;
53   int dust_cutoff=16;
54   bool isfasta=false;
55 < bool phred64=false;
55 > bool convert_phred=false;
56   GStr prefix;
57   GStr adapter3;
58   GStr adapter5;
59 +
60 + int qvtrim_min=0;
61 +
62 + int qv_phredtype=0; // could be 64 or 33 (0 means undetermined yet)
63 + int qv_cvtadd=0; //could be -31 or +31
64 +
65   int a3len=0;
66   int a5len=0;
67   // adaptor matching metrics -- for extendMatch() function
68 < static const int a_m_score=2; //match score
69 < static const int a_mis_score=-3; //mismatch
70 < static const int a_dropoff_score=7;
71 < static const int a_min_score=7;
68 > const int a_m_score=2; //match score
69 > const int a_mis_score=-3; //mismatch
70 > const int a_dropoff_score=7;
71 > const int a_min_score=8; //an exact match of 4 bases at the proper ends WILL be trimmed
72 > const int a_min_chain_score=15; //for gapped alignments
73 >
74 > class CSegChain;
75 >
76 > class CSegPair {
77 >  public:
78 >   GSeg a;
79 >   GSeg b; //the adapter segment
80 >   int score;
81 >   int flags;
82 >   CSegChain* chain;
83 >   CSegPair(int astart=0, int aend=0, int bstart=0, int bend=0, int mscore=0):a(astart,aend),b(bstart, bend) {
84 >      score=mscore;
85 >      if (score==0) score=a.len()*a_m_score;
86 >      flags=0;
87 >      chain=NULL;
88 >      }
89 >   int len() { return  a.len(); }
90 >   bool operator==(CSegPair& d){
91 >      //return (a.start==d.a.start && a.end==d.a.end && b.start==d.b.start && b.end==d.b.end);
92 >      //make equal even segments that are included into one another:
93 >      return (d.a.start>=a.start && d.a.end<=a.end && d.b.start>=b.start && d.b.end<=b.end);
94 >      }
95 >   bool operator>(CSegPair& d){ //ordering based on b (adaptor) start coord and score
96 >     if (b.start==d.b.start) {
97 >        if (score==d.score) {
98 >           //just try to be consistent:
99 >           if (b.end==d.b.end) {
100 >             return (a.start==d.a.start)?(a.end<d.a.end):(a.start<d.a.start);
101 >             }
102 >           return (b.end>d.b.end);
103 >           }
104 >         else return (score<d.score);
105 >        }
106 >     return (b.start>d.b.start);
107 >     }
108 >   bool operator<(CSegPair& d){ //ordering based on b (adaptor) coord
109 >     /*if (b.start==d.b.start && b.end==d.b.end) {
110 >          return (a.start==d.a.start)?(a.end<d.a.end):(a.start<d.a.start);
111 >          }
112 >     return (b.start==d.b.start)?(b.end<d.b.end):(b.start<d.b.start);*/
113 >     if (b.start==d.b.start) {
114 >        if (score==d.score) {
115 >           //just try to be consistent:
116 >           if (b.end==d.b.end) {
117 >             return (a.start==d.a.start)?(a.end>d.a.end):(a.start>d.a.start);
118 >             }
119 >           return (b.end<d.b.end);
120 >           }
121 >         else return (score>d.score);
122 >        }
123 >     return (b.start<d.b.start);
124 >     }
125 > };
126 >
127 > int cmpSegEnds(pointer sa, pointer sb) { //sort by adaptor seg ends AND score
128 > CSegPair& x = *(CSegPair *)sa;
129 > CSegPair& y = *(CSegPair *)sb;
130 > /*
131 > if (x.b.end==y.b.end) {
132 >     if (x.b.start==y.b.start) {
133 >         if (x.a.end==y.a.end) {
134 >            if (x.a.start==y.a.start) return 0;
135 >            return ((x.a.start>y.a.start) ? -1 : 1);
136 >            }
137 >          else {
138 >            return ((x.a.end>y.a.end) ? -1 : 1);
139 >            }
140 >          }
141 >      else {
142 >       return ((x.b.start>y.b.start) ? -1 : 1);
143 >       }
144 >     }
145 >    else {
146 >     return ((x.b.end>y.b.end) ? -1 : 1);
147 >     }
148 > */
149 >  if (x.b.end==y.b.end) {
150 >     if (x.score==y.score) {
151 >     if (x.b.start==y.b.start) {
152 >         if (x.a.end==y.a.end) {
153 >            if (x.a.start==y.a.start) return 0;
154 >            return ((x.a.start<y.a.start) ? -1 : 1);
155 >            }
156 >          else {
157 >            return ((x.a.end<y.a.end) ? -1 : 1);
158 >            }
159 >          }
160 >      else {
161 >       return ((x.b.start<y.b.start) ? -1 : 1);
162 >       }
163 >      } else return ((x.score>y.score) ? -1 : 1);
164 >     }
165 >    else {
166 >     return ((x.b.end>y.b.end) ? -1 : 1);
167 >     }
168 >
169 > }
170 >
171 > class CSegChain:public GList<CSegPair> {
172 > public:
173 >   uint astart;
174 >   uint aend;
175 >   uint bstart;
176 >   uint bend;
177 >   int score;
178 >   bool endSort;
179 >  CSegChain(bool aln5=false):GList<CSegPair>(true,true,true) {//sorted, free elements, unique
180 >   //as SegPairs are inserted, they will be sorted by a.start coordinate
181 >   score=0;
182 >   astart=MAX_UINT;
183 >   aend=0;
184 >   bstart=MAX_UINT;
185 >   bend=0;
186 >   endSort=aln5;
187 >   if (aln5) { setSorted(cmpSegEnds); }
188 >   }
189 > bool operator==(CSegChain& d) {
190 >   //return (score==d.score);
191 >    return (astart==d.astart && aend==d.aend && bstart==d.bstart && bend==d.bend);
192 >   }
193 > bool operator>(CSegChain& d) { // order based on b (adaptor) coordinate
194 >   //return (score<d.score);
195 >   if (bstart==d.bstart && bend==d.bend) {
196 >          return (astart==d.astart)?(aend>d.aend):(astart>d.astart);
197 >          }
198 >     return (bstart==d.bstart)?(bend>d.bend):(bstart>d.bstart);
199 >   }
200 > bool operator<(CSegChain& d) {
201 >   //return (score>d.score);
202 >   if (bstart==d.bstart && bend==d.bend) {
203 >          return (astart==d.astart)?(aend<d.aend):(astart<d.astart);
204 >          }
205 >     return (bstart==d.bstart)?(bend<d.bend):(bstart<d.bstart);
206 >   }
207 > void addSegPair(CSegPair* segp) {
208 >   if (AddIfNew(segp)!=segp) return;
209 >   score+=segp->score;
210 >   if (astart>segp->a.start) astart=segp->a.start;
211 >   if (aend<segp->a.end) aend=segp->a.end;
212 >   if (bstart>segp->b.start) bstart=segp->b.start;
213 >   if (bend<segp->b.end) bend=segp->b.end;
214 >   }
215 > //for building actual chains:
216 > bool extendChain(CSegPair* segp) { //segp expected to be "Greater Than" current chain
217 >   int bgap=0;
218 >   int agap=0;
219 >   //if (endSort) {
220 >   if (bstart>segp->b.start) {
221 >      bgap = (int)(bstart-segp->b.end);
222 >      if (abs(bgap)>2) return false;
223 >      agap = (int)(astart-segp->a.end);
224 >      if (abs(agap)>2) return false;
225 >      }
226 >     else {
227 >      bgap = (int) (segp->b.start-bend);
228 >      if (abs(bgap)>2) return false;
229 >      agap = (int)(segp->a.start-aend);
230 >      if (abs(agap)>2) return false;
231 >      }
232 >   if (agap*bgap<0) return false;
233 >   addSegPair(segp);
234 >   score-=abs(agap)+abs(bgap);
235 >   return true;
236 >   }
237 > };
238  
239   // element in dhash:
240   class FqDupRec {
# Line 102 | Line 284
284  
285   GHash<FqDupRec> dhash; //hash to keep track of duplicates
286  
287 < bool ntrim(GStr& rseq, GStr &rqv, int &l5, int &l3); //returns true if any trimming occured
288 <
287 > bool ntrim(GStr& rseq, int &l5, int &l3); //returns true if any trimming occured
288 > bool qtrim(GStr& qvs, int &l5, int &l3); //return true if any trimming occured
289   int dust(GStr& seq);
290   bool trim_adapter3(GStr& seq, int &l5, int &l3); //returns true if any trimming occured
291   bool trim_adapter5(GStr& seq, int &l5, int &l3); //returns true if any trimming occured
292  
293 < void convertQ64(char* q, int len);
294 < void convertQ64(GStr& q);
293 > void convertPhred(char* q, int len);
294 > void convertPhred(GStr& q);
295  
296   int main(int argc, char * const argv[]) {
297 <  GArgs args(argc, argv, "YQRDCl:d:3:5:n:r:o:");
297 >  GArgs args(argc, argv, "YQDCVl:d:3:5:m:n:r:p:q:o:");
298    int e;
299    int icounter=0; //counter for input reads
300    int outcounter=0; //counter for output reads
# Line 121 | Line 303
303        exit(224);
304        }
305    debug=(args.getOpt('Y')!=NULL);
306 <  phred64=(args.getOpt('Q')!=NULL);
306 >  debug=(args.getOpt('V')!=NULL);
307 >  convert_phred=(args.getOpt('Q')!=NULL);
308    doCollapse=(args.getOpt('C')!=NULL);
309    doDust=(args.getOpt('D')!=NULL);
310 +  /*
311    rawFormat=(args.getOpt('R')!=NULL);
312    if (rawFormat) {
313      GError("Sorry, raw qseq format parsing is not implemented yet!\n");
314      }
315 +  */
316    prefix=args.getOpt('n');
317    GStr s=args.getOpt('l');
318    if (!s.is_empty())
319       min_read_len=s.asInt();
320 +  s=args.getOpt('m');
321 +  if (!s.is_empty())
322 +     max_perc_N=s.asDouble();
323    s=args.getOpt('d');
324    if (!s.is_empty()) {
325       dust_cutoff=s.asInt();
326       doDust=true;
327       }
328 <    
328 >  s=args.getOpt('q');
329 >  if (!s.is_empty()) {
330 >     qvtrim_min=s.asInt();
331 >     }
332 >  s=args.getOpt('p');
333 >  if (!s.is_empty()) {
334 >     int v=s.asInt();
335 >     if (v==33) {
336 >        qv_phredtype=33;
337 >        qv_cvtadd=31;
338 >        }
339 >      else if (v==64) {
340 >        qv_phredtype=64;
341 >        qv_cvtadd=-31;
342 >        }
343 >       else
344 >         GMessage("%s\nInvalid value for -p option (can only be 64 or 33)!\n",USAGE);
345 >     }
346    if (args.getOpt('3')!=NULL) {
347      adapter3=args.getOpt('3');
348      adapter3.upper();
# Line 175 | Line 380
380          GStr rseq;  //current read sequence
381          GStr rqv;   //current read quality values
382          GStr s;
383 <        if (rawFormat) {
383 >        /* if (rawFormat) {
384            //TODO: implement qseq parsing here
385            //if (raw type=N) then continue; //skip invalid/bad records
386            
387            } //raw qseq format
388 <         else { // FASTQ or FASTA
388 >         else { // FASTQ or FASTA */
389            isfasta=(l[0]=='>');
390            if (!isfasta && l[0]!='@') GError("Error: fastq record marker not detected!\n");
391            s=l;
392            rname=&(l[1]);
393            icounter++;
189          //GMessage("readname=%s\n",rname.chars());
394            for (int i=0;i<rname.length();i++)
395              if (rname[i]<=' ') { rname.cut(i); break; }
396            //now get the sequence
397            if ((l=fq.getLine())==NULL)
398                GError("Error: unexpected EOF after header for %s\n",rname.chars());
399            rseq=l; //this must be the DNA line
400 <          if (isfasta) {
401 <            while ((l=fq.getLine())!=NULL) {
402 <              //fasta can have multiple sequence lines
199 <              if (l[0]=='>') {
400 >          while ((l=fq.getLine())!=NULL) {
401 >              //seq can span multiple lines
402 >              if (l[0]=='>' || l[0]=='+') {
403                     fq.pushBack();
404                     break; //
405                     }
406                rseq+=l;
407 <              }
408 <            } //multi-line fasta file
409 <          if (!isfasta) {
207 <            if ((l=fq.getLine())==NULL)
407 >              } //check for multi-line seq
408 >          if (!isfasta) { //reading fastq quality values, which can also be multi-line
409 >            if ((l=fq.getLine())==NULL)
410                  GError("Error: unexpected EOF after sequence for %s\n", rname.chars());
411              if (l[0]!='+') GError("Error: fastq qv header marker not detected!\n");
412              if ((l=fq.getLine())==NULL)
413                  GError("Error: unexpected EOF after qv header for %s\n", rname.chars());
414              rqv=l;
415 <            if (rqv.length()!=rseq.length())
416 <                GError("Error: qv len != seq len for %s\n", rname.chars());
417 <            }
418 <        } //<-- FASTA or FASTQ
415 >            //if (rqv.length()!=rseq.length())
416 >            //  GError("Error: qv len != seq len for %s\n", rname.chars());
417 >            while (rqv.length()<rseq.length() && ((l=fq.getLine())!=NULL)) {
418 >              rqv+=l; //append to qv string
419 >              }
420 >            }// fastq
421 >        // } //<-- FASTA or FASTQ
422          rseq.upper();
423          int l5=0;
424          int l3=rseq.length()-1;
# Line 223 | Line 428
428                    }
429             continue;
430             }
431 <        if (ntrim(rseq, rqv, l5, l3)) {
431 >        if (ntrim(rseq, l5, l3)) { // N-trimming
432 >           //GMessage("before: %s\n",rseq.chars());
433 >           //GMessage("after : %s (%d)\n",rseq.substr(l5,l3-l5+1).chars(),l3-l5+1);
434             if (l3-l5+1<min_read_len) {
435               if (trashReport) {
436                      fprintf(freport, "%s\tN\t%s\n", rname.chars(), rseq.chars());
# Line 234 | Line 441
441             rseq=rseq.substr(l5, l3-l5+1);
442             if (!rqv.is_empty())
443                rqv=rqv.substr(l5, l3-l5+1);
444 +           l5=0;
445 +           l3=rseq.length()-1;
446             }
447 <          
447 >        if (qvtrim_min!=0 && !rqv.is_empty() && qtrim(rqv, l5, l3)) { // qv-threshold trimming
448 >           if (l3-l5+1<min_read_len) {
449 >             if (trashReport) {
450 >                    fprintf(freport, "%s\tQ\t%s\n", rname.chars(), rseq.chars());
451 >                    }
452 >             continue; //invalid read
453 >             }
454 >            //-- keep only the l5..l3 range
455 >           rseq=rseq.substr(l5, l3-l5+1);
456 >           if (!rqv.is_empty())
457 >              rqv=rqv.substr(l5, l3-l5+1);
458 >           } //qv trimming
459          if (a3len>0) {
460            if (trim_adapter3(rseq, l5, l3)) {
461               if (l3-l5+1<min_read_len) {
# Line 297 | Line 517
517                                    rseq.chars());
518               }
519             else {  //fastq
520 <            if (phred64) convertQ64(rqv);
520 >            if (convert_phred) convertPhred(rqv);
521              if (prefix.is_empty())
522                 fprintf(f_out, "@%s\n%s\n+\n%s\n", rname.chars(), rseq.chars(),rqv.chars());
523                else
# Line 343 | Line 563
563            }
564          }
565        else { //fastq format
566 <       if (phred64) convertQ64(qd->qv, qd->len);
566 >       if (convert_phred) convertPhred(qd->qv, qd->len);
567         if (prefix.is_empty()) {
568           fprintf(f_out, "@%s_x%d\n%s\n+\n%s\n", qd->firstname, qd->count,
569                          rseq.chars(), qd->qv);
# Line 365 | Line 585
585      }
586  
587   FWCLOSE(f_out);
588 + //getc(stdin);
589   }
590  
591   class NData {
# Line 393 | Line 614
614       end5=1;
615       end3=seqlen;
616       for (int i=0;i<seqlen;i++)
617 <        if (rseq[i]=='N') {// if (!ichrInStr(rseq[i], "ACGT")
617 >        if (seq[i]=='N') {// if (!ichrInStr(rseq[i], "ACGT")
618             NPos[NCount]=i;
619             NCount++;
620             }
# Line 416 | Line 637
637   void N_analyze(int l5, int l3, int p5, int p3) {
638   /* assumes feat was filled properly */
639   int old_dif, t5,t3,v;
640 < if (l3-l5<min_read_len || p5>p3 ) {
640 > if (l3<l5+2 || p5>p3 ) {
641     feat.end5=l5+1;
642     feat.end3=l3+1;
643     return;
# Line 448 | Line 669
669   }
670  
671  
672 < bool ntrim(GStr& rseq, GStr &rqv, int &l5, int &l3) {
673 < //count Ns in the sequence
672 > bool qtrim(GStr& qvs, int &l5, int &l3) {
673 > if (qvtrim_min==0 || qvs.is_empty()) return false;
674 > if (qv_phredtype==0) {
675 >  //try to guess the Phred type
676 >  int vmin=256, vmax=0;
677 >  for (int i=0;i<qvs.length();i++) {
678 >     if (vmin>qvs[i]) vmin=qvs[i];
679 >     if (vmax<qvs[i]) vmax=qvs[i];
680 >     }
681 >  if (vmin<64) { qv_phredtype=33; qv_cvtadd=31; }
682 >  if (vmax>95) { qv_phredtype=64; qv_cvtadd=-31; }
683 >  if (qv_phredtype==0) {
684 >    GError("Error: couldn't determine Phred type, please use the -p33 or -p64 !\n");
685 >    }
686 >  } //guessing the Phred type
687 > for (l3=qvs.length()-1;l3>2;l3--) {
688 >  if (qvs[l3]-qv_phredtype>=qvtrim_min && qvs[l3-1]-qv_phredtype>=qvtrim_min) break;
689 >  }
690 > //just in case, check also the 5' the end (?)
691 > for (l5=0;l5<qvs.length()-3;l5++) {
692 >  if (qvs[l5]-qv_phredtype>=qvtrim_min && qvs[l5+1]-qv_phredtype>=qvtrim_min) break;
693 >  }
694 > return (l5>0 || l3<qvs.length()-1);
695 > }
696 >
697 > bool ntrim(GStr& rseq, int &l5, int &l3) {
698 > //count Ns in the sequence, trim N-rich ends
699   feat.init(rseq);
700   l5=feat.end5-1;
701   l3=feat.end3-1;
702   N_analyze(feat.end5-1, feat.end3-1, 0, feat.NCount-1);
703 < if (l5==feat.end5-1 && l3==feat.end3-1)
704 <    return false; //nothing changed
703 > if (l5==feat.end5-1 && l3==feat.end3-1) {
704 >    if (feat.perc_N>max_perc_N) {
705 >           feat.valid=false;
706 >           l3=l5+1;
707 >           return true;
708 >           }
709 >      else {
710 >       return false; //nothing changed
711 >       }
712 >    }
713   l5=feat.end5-1;
714   l3=feat.end3-1;
715   if (l3-l5+1<min_read_len) {
# Line 463 | Line 717
717     return true;
718     }
719   feat.N_calc();
720 < if (feat.perc_N>6.2) { //not valid if more than 1 N per 16 bases
720 >
721 > if (feat.perc_N>max_perc_N) {
722        feat.valid=false;
723        l3=l5+1;
724        return true;
# Line 589 | Line 844
844   return ncount;
845   }
846  
847 < // ------------------ adapter matching - triplet matching
848 < //look for matching triplets amongst the first 9 nucleotides of the 3' adaptor
849 < // or the last 9 nucleotides for the 5' adapter
850 < //when a triplet match is found, simply try to extend the alignment using a drop-off scheme
851 < // check minimum score and
597 < // for 3' adapter trimming:
847 >
848 > // ------------------ adapter matching - simple k-mer seed & extend, no indels for now
849 > //when a k-mer match is found, simply try to extend the alignment using a drop-off scheme
850 > //check minimum score and
851 > //for 3' adapter trimming:
852   //     require that the right end of the alignment for either the adaptor OR the read must be
853   //     < 3 distance from its right end
854   // for 5' adapter trimming:
855   //     require that the left end of the alignment for either the adaptor OR the read must
856 < //     be at coordinate 0
856 > //     be at coordinate < 3 from start
857  
858   bool extendMatch(const char* a, int alen, int ai,
859 <                 const char* b, int blen, int bi, int mlen, int& l5, int& l3, bool end5=false) {
859 >                 const char* b, int blen, int bi, int mlen, int& l5, int& l3, CSegChain& segs, bool end5=false) {
860   //so the alignment starts at ai in a, bi in b, with a perfect match of length mlen
861 < //if (debug)
862 < //  GStr dbg(b);
863 < //GMessage(">> in %s\n\textending hit: %s at position %d\n", a, (dbg.substr(ai, mlen)).chars(), ai);
861 > #ifdef DEBUG
862 > GStr dbg(b);
863 > #endif
864 > //if (debug) {
865 > //  GMessage(">> in %s\n\textending hit: %s at position %d\n", a, (dbg.substr(bi, mlen)).chars(), ai);
866 > //  }
867   int a_l=ai; //alignment coordinates on a
868   int a_r=ai+mlen-1;
869   int b_l=bi; //alignment coordinates on b
# Line 670 | Line 927
927    int mmovh3=(a_ovh3<b_ovh3)? a_ovh3 : b_ovh3;
928    int mmovh5=(a_l<b_l)? a_l : b_l;
929    //if (debug) GMessage("  after r-extend: %*s%s\t\t(score=%d)\n",a_l," ",dbg.substr(b_l,b_r-b_l+1).chars(),maxscore);
930 + #ifdef DEBUG
931 +  if (debug) GMessage("     extended to: %*s\n",a_r+1,dbg.substr(b_l,b_r-b_l+1).chars());
932 + #endif
933    if (maxscore>=a_min_score && mmovh3<2 && mmovh5<2) {
934       if (a_l<a_ovh3) {
935          //adapter closer to the left end (typical for 5' adapter)
# Line 683 | Line 943
943          }
944       return true;
945       }
946 + else { //keep this segment pair for later (gapped alignment)
947 +   segs.addSegPair(new CSegPair(a_l+1, a_r+1, b_l+1, b_r+1, maxscore));
948 +   //this will also update min & max coordinates in segs (segs.astart, .aend, .bstart, .bend)
949 +   }
950 +  //do not trim:
951 +  l5=0;
952 +  l3=alen-1;
953    return false;
954   }
955  
956 + /*
957 + int getWordValue(const char* s, int wlen) {
958 + int r=0;
959 + while (wlen--) { r+=(((int)s[wlen])<<wlen) }
960 + return r;
961 + }
962 + */
963 + int get3mer_value(const char* s) {
964 + return (s[0]<<16)+(s[1]<<8)+s[2];
965 + }
966 +
967 + int w3_match(int qv, const char* str, int slen, int start_index=0) {
968 + if (start_index>=slen || start_index<0) return -1;
969 + for (int i=start_index;i<slen-3;i++) {
970 +   int rv=get3mer_value(str+i);
971 +   if (rv==qv) return i;
972 +   }
973 + return -1;
974 + }
975 +
976 + int w3_rmatch(int qv, const char* str, int slen, int end_index=-1) {
977 + if (end_index>=slen) return -1;
978 + if (end_index<0) end_index=slen-1;
979 + for (int i=end_index-2;i>=0;i--) {
980 +   int rv=get3mer_value(str+i);
981 +   if (rv==qv) return i;
982 +   }
983 + return -1;
984 + }
985 +
986 + int fast4match(int32 qv, const char* str, int slen, int start_index=0) {
987 + if (start_index>=slen || start_index<0) return -1;
988 + for (int i=start_index;i<slen-4;i++) {
989 +   int32* rv=(int32*)(str+i);
990 +   if (*rv==qv) return i;
991 +   }
992 + return -1;
993 + }
994 +
995 + int fast4rmatch(int32 qv, const char* str, int slen, int end_index=-1) {
996 + if (end_index>=slen) return -1;
997 + if (end_index<0) end_index=slen-1;
998 + for (int i=end_index-3;i>=0;i--) {
999 +   int32* rv=(int32*)(str+i);
1000 +   if (*rv==qv) return i;
1001 +   }
1002 + return -1;
1003 + }
1004 +
1005 + #ifdef DEBUG
1006 + void dbgPrintChain(CSegChain& chain, const char* aseq) {
1007 +  GStr s(aseq);
1008 +  for (int i=0;i<chain.Count();i++) {
1009 +   CSegPair& seg=*chain[i];
1010 +   GMessage("  dbg chain seg%d: %*s [%d-%d:%d-%d]\n",i,seg.a.start-1+seg.len(),
1011 +            s.substr(seg.b.start-1, seg.len()).chars(), seg.b.start,seg.b.end,seg.a.start,seg.a.end);
1012 +   }
1013 + }
1014 + #endif
1015 +
1016   bool trim_adapter3(GStr& seq, int&l5, int &l3) {
1017   int rlen=seq.length();
1018   l5=0;
# Line 699 | Line 1026
1026        }
1027      else {
1028        l5=0;
1029 <      l3=fi-1;
1029 >      l3=fi-1;
1030        }
1031     return true;
1032     }
1033 + #ifdef DEBUG
1034 + if (debug) GMessage(">TRIM3 >>   Read: %s\n",seq.chars());
1035 + #endif
1036 +
1037   //also, for fast detection of other adapter-only reads that start past
1038   // the beginning of the adapter sequence, try to see if the first a3len-4
1039 < // characters of the read are a substring of the adapter
1040 < GStr rstart=seq.substr(0,a3len-4);
1041 < if ((fi=adapter3.index(rstart))>=0) {
1042 <   l3=rlen-1;
1043 <   l5=a3len-4;
1044 <   while (fi+l5<a3len && l5<l3 && adapter3[fi+l5]==seq[l5]) l5++;
1045 <   return true;
1039 > // bases of the read are a substring of the adapter
1040 > if (rlen>a3len-3) {
1041 >   GStr rstart=seq.substr(1,a3len-4);
1042 >   if ((fi=adapter3.index(rstart))>=0) {
1043 >     l3=rlen-1;
1044 >     l5=a3len-4;
1045 >     while (fi+l5<a3len && l5<l3 && adapter3[fi+l5]==seq[l5]) l5++;
1046 >     return true;
1047 >     }
1048 >  }
1049 > CSegChain a3segs; //no chains here, just an ordered collection of segment pairs
1050 >  //check the easy cases - 11 bases exact match at the end
1051 > int fdlen=11;
1052 >  if (a3len<16) {
1053 >   fdlen=a3len>>1;
1054     }
1055 <  
1056 < //another easy case: first half of the adaptor matches
1057 < int a3hlen=a3len>>1;
1058 < GStr ahalf=adapter3.substr(0, a3hlen);
1059 < if ((fi=seq.index(ahalf))>=0) {
1060 <   extendMatch(seq.chars(), rlen, fi,
1061 <                 adapter3.chars(), a3len, 0,  a3hlen, l5,l3);
1062 <   return true;
1055 > if (fdlen>4) {
1056 >     //check if we're lucky enough to have the last 11 bases of the read a part of the adapter
1057 >     GStr rstart=seq.substr(-fdlen-3,fdlen);
1058 >     if ((fi=adapter3.index(rstart))>=0) {
1059 > #ifdef DEBUG
1060 >       if (debug) GMessage("  W11match found: %*s\n", rlen-3, (adapter3.substr(fi,fdlen)).chars());
1061 > #endif
1062 >       if (extendMatch(seq.chars(), rlen, rlen-fdlen-3,
1063 >                     adapter3.chars(), a3len, fi,  fdlen, l5,l3, a3segs))
1064 >            return true;
1065 >       }
1066 >     //another easy case: first 11 characters of the adaptor found as a substring of the read
1067 >     GStr bstr=adapter3.substr(0, fdlen);
1068 >     if ((fi=seq.rindex(bstr))>=0) {
1069 > #ifdef DEBUG
1070 >       if (debug) GMessage("  A11match found: %*s\n", fi+fdlen, bstr.chars());
1071 > #endif
1072 >       if (extendMatch(seq.chars(), rlen, fi,
1073 >                     adapter3.chars(), a3len, 0,  fdlen, l5,l3, a3segs))
1074 >            return true;
1075 >       }
1076 >     } //tried to match 11 bases first
1077 >    
1078 > //no easy cases, so let's do the wmer hashing for the first 12 bases of the adaptor
1079 > //-- only extend if the match is in the 3' (ending) region of the read
1080 > int wordSize=3;
1081 > int hlen=12;
1082 > if (hlen>a3len-wordSize) hlen=a3len-wordSize;
1083 > int imin=rlen>>1; //last half of the read, left boundary for the wmer match
1084 > if (imin<a3len) { imin=GMIN(a3len, rlen-wordSize); }
1085 > imin=rlen-imin;
1086 > for (int iw=0;iw<hlen;iw++) {
1087 >   //int32* qv=(int32*)(adapter3.chars()+iw);
1088 >   int qv=get3mer_value(adapter3.chars()+iw);
1089 >   fi=-1;
1090 >   //while ((fi=fast4rmatch(*qv, seq.chars(), rlen, fi))>=0 && fi>=imin) {
1091 >   while ((fi=w3_rmatch(qv, seq.chars(), rlen, fi))>=0 && fi>=imin) {
1092 >     //GMessage(" ... fi=%d after w3_rmatch() (imin=%d)\n", fi, imin);
1093 >
1094 > #ifdef DEBUG
1095 >     if (debug) GMessage("    Wmatch found: %*s\n", fi+wordSize, (adapter3.substr(iw,wordSize)).chars());
1096 > #endif
1097 >     if (extendMatch(seq.chars(), rlen, fi, adapter3.chars(),
1098 >                   a3len, iw, wordSize, l5,l3, a3segs)) return true;
1099 >     fi--;
1100 >     if (fi<imin) break;
1101 >     }
1102 >   } //for each wmer in the first hlen bases of the adaptor
1103 > /*
1104 > //couldn't find a good trimming extension, hash 12 more bases of the adapter to collect more segment pairs there
1105 > //but only do this if we already have segment pairs collected in the last 12 bases of the adapter
1106 > if (a3segs.bstart>3 || a3segs.bend<(uint)(hlen-wordSize)) return false;
1107 > int hlen2=a3len-wordSize;
1108 > //if (hlen2>a3len-4) hlen2=a3len-4;
1109 > if (hlen2>hlen) {
1110 > #ifdef DEBUG
1111 >     if (debug && a3segs.Count()>0) {
1112 >        GMessage("  >>>>>2nd. hash: %s\n",seq.chars());
1113 >        }
1114 > #endif
1115 >     for (int iw=hlen;iw<hlen2;iw++) {
1116 >         //int* qv=(int32 *)(adapter3.chars()+iw);
1117 >         int qv=get3mer_value(adapter3.chars()+iw);
1118 >         fi=-1;
1119 >         //while ((fi=fast4rmatch(*qv, seq.chars(), rlen, fi))>=0 && fi>=imin) {
1120 >         while ((fi=w3_rmatch(qv, seq.chars(), rlen, fi))>=0 && fi>=imin) {
1121 >           extendMatch(seq.chars(), rlen, fi, adapter3.chars(),
1122 >                         a3len, iw, wordSize, l5,l3, a3segs);
1123 >           fi--;
1124 >           if (fi<imin) break;
1125 >           }
1126 >         } //for each wmer between hlen2 and hlen bases of the adaptor
1127 >     }
1128 > //lastly, analyze collected a3segs for a possible gapped alignment:
1129 > GList<CSegChain> segchains(false,true,false);
1130 > #ifdef DEBUG
1131 > if (debug && a3segs.Count()>0) {
1132 >   GMessage(">>>>>>>>>   Read: %s\n",seq.chars());
1133     }
1134 < //no easy cases, so let's do the word hashing
1135 < for (int iw=0;iw<6;iw++) {
1136 <   GStr aword=adapter3.substr(iw,3);
1137 <   if ((fi=seq.index(aword))>=0  && rlen-fi>3) {
1138 <      if (extendMatch(seq.chars(), rlen, fi, adapter3.chars(),
1139 <                   a3len, iw, 3, l5,l3)) return true;
1134 > #endif
1135 > for (int i=0;i<a3segs.Count();i++) {
1136 >   if (a3segs[i]->chain==NULL) {
1137 >       if (a3segs[i]->b.start>3) continue; //don't start a hopeless chain
1138 >       CSegChain* newchain=new CSegChain();
1139 >       newchain->setFreeItem(false);
1140 >       newchain->addSegPair(a3segs[i]);
1141 >       a3segs[i]->chain=newchain;
1142 >       segchains.Add(newchain); //just to free them when done
1143 >       }
1144 >   for (int j=i+1;j<a3segs.Count();j++) {
1145 >      CSegChain* chain=a3segs[i]->chain;
1146 >      if (chain->extendChain(a3segs[j])) {
1147 >          a3segs[j]->chain=chain;
1148 > #ifdef DEBUG
1149 >          if (debug) dbgPrintChain(*chain, adapter3.chars());
1150 > #endif      
1151 >          //save time by checking here if the extended chain is already acceptable for trimming
1152 >          if (chain->aend>(uint)(rlen-4) && chain->bstart<4 && chain->score>a_min_chain_score) {
1153 >            l5=0;
1154 >            l3=chain->astart-2;
1155 > #ifdef DEBUG
1156 >          if (debug && a3segs.Count()>0) {
1157 >            GMessage(">>> >> trimmed-3: %*s\n",l3-l5+1,seq.substr(l5,l3-l5+1).chars());
1158 >            }
1159 > #endif
1160 >            return true;
1161 >            }
1162 >          } //chain can be extended
1163        }
1164 <   }
1164 >   } //collect segment alignments into chains
1165 > */  
1166   return false; //no adapter parts found
1167   }
1168  
# Line 751 | Line 1184
1184        }
1185     return true;
1186     }
1187 < //for fast detection of adapter-rich reads, check if the first 12
1188 < //characters of the read are a substring of the adapter
1189 < GStr rstart=seq.substr(1,12);
1190 < if ((fi=adapter5.index(rstart))>=0) {
1191 <   //l3=rlen-1;
1192 <   //l5=a5len-4;
1193 <   //while (fi+l5<a5len && l5<l3 && adapter5[fi+l5]==seq[l5]) l5++;
1194 <   //return true;
1195 <   //if (debug) GMessage(" first 12char of the read match adaptor!\n");
1196 <   extendMatch(seq.chars(), rlen, 1,
764 <                 adapter5.chars(), a5len, fi,  12, l5,l3, true);
765 <   return true;
1187 > #ifdef DEBUG
1188 > if (debug) GMessage(">TRIM5 >>   Read: %s\n",seq.chars());
1189 > #endif
1190 >
1191 > CSegChain a5segs(true); //list of segment pairs to analyze later if no extendMatch succeeded
1192 >
1193 > //try the easy way out first - look for an exact match of 11 bases
1194 > int fdlen=11;
1195 >  if (a5len<16) {
1196 >   fdlen=a5len>>1;
1197     }
1198 <  
1199 < //another easy case: last 12 characters of the adaptor found as a substring of the read
1200 < int aplen=12;
1201 < int apstart=a5len-aplen-2;
1202 < if (apstart<0) { apstart=0; aplen=a5len-1; }
1203 < GStr bstr=adapter5.substr(apstart, aplen);
1204 < if ((fi=seq.index(bstr))>=0) {
1205 <   //if (debug) GMessage(" last 12char of adaptor match the read!\n");
1206 <   extendMatch(seq.chars(), rlen, fi,
1207 <                 adapter5.chars(), a5len, apstart,  aplen, l5,l3,true);
1208 <   return true;
1198 > if (fdlen>4) {
1199 >     GStr rstart=seq.substr(1,fdlen); //skip the first base as it's sometimes bogus
1200 >     if ((fi=adapter5.index(rstart))>=0) {
1201 > #ifdef DEBUG
1202 >       if (debug) GMessage("  W11match found: %*s\n", 1+fdlen, (adapter3.substr(fi,fdlen)).chars());
1203 > #endif
1204 >       if (extendMatch(seq.chars(), rlen, 1,
1205 >                     adapter5.chars(), a5len, fi,  fdlen, l5,l3, a5segs, true))
1206 >           return true;
1207 >       }
1208 >     //another easy case: last 11 characters of the adaptor found as a substring of the read
1209 >     GStr bstr=adapter5.substr(-fdlen);
1210 >     if ((fi=seq.index(bstr))>=0) {
1211 > #ifdef DEBUG
1212 >       if (debug) GMessage("  A11match found: %*s\n", fi+fdlen, bstr.chars());
1213 > #endif
1214 >       if (extendMatch(seq.chars(), rlen, fi,
1215 >                     adapter5.chars(), a5len, a5len-fdlen,  fdlen, l5,l3,a5segs,true))
1216 >          return true;
1217 >       }
1218 >     } //tried to matching at most 11 bases first
1219 >    
1220 > //-- no easy cases, do the wmer hashing for the last 12 bases of the adaptor
1221 > //-- only extend a wmer if it matches in the 5' (beginning) region of the read
1222 > int wordSize=3;
1223 > int hlen=12;
1224 > if (hlen>a5len-wordSize) hlen=a5len-wordSize;
1225 > int imax=rlen>>1; //first half of the read, right boundary for the wmer match
1226 > if (imax<a5len) { imax=GMIN(a5len, rlen-wordSize); }
1227 > for (int iw=0;iw<=hlen;iw++) {
1228 >   int apstart=a5len-iw-wordSize;
1229 >   fi=0;
1230 >   //int* qv=(int32 *)(adapter5.chars()+apstart);
1231 >   int qv=get3mer_value(adapter5.chars()+apstart);
1232 >   //while ((fi=fast4match(*qv, seq.chars(), rlen, fi))>=0 && fi<=imax) {
1233 >   while ((fi=w3_match(qv, seq.chars(), rlen, fi))>=0 && fi<=imax) {
1234 > #ifdef DEBUG
1235 >     if (debug) GMessage("    Wmatch found: %*s\n", fi+wordSize, (adapter5.substr(apstart,wordSize)).chars());
1236 > #endif
1237 >     if (extendMatch(seq.chars(), rlen, fi, adapter5.chars(),
1238 >                a5len, apstart, wordSize, l5,l3, a5segs, true)) return true;
1239 >     fi++;
1240 >     if (fi>imax) break;
1241 >     }
1242 >   } //for each wmer in the last hlen bases of the adaptor
1243 > /*
1244 >
1245 > //couldn't find a good trimming extension, hash 12 more bases of the adapter to collect more segment pairs there
1246 > //but only do this if we already have segment pairs collected in the last 12 bases of the adapter
1247 > if (a5segs.bend<(uint)(a5len-3) || a5segs.bstart>(uint)(a5len-hlen+4)) return false;
1248 > int hlen2=a5len-wordSize;
1249 > //if (hlen2>a5len-wordSize) hlen2=a5len-wordSize;
1250 > #ifdef DEBUG
1251 >      if (debug && a5segs.Count()>0) {
1252 >        GMessage("  >>>>>2nd. hash: %s\n",seq.chars());
1253 >        }
1254 > #endif
1255 > if (hlen2>hlen) {
1256 >     for (int iw=hlen+1;iw<=hlen2;iw++) {
1257 >         int apstart=a5len-iw-wordSize;
1258 >         fi=0;
1259 >         //int* qv=(int32 *)(adapter5.chars()+apstart);
1260 >         int qv=get3mer_value(adapter5.chars()+apstart);
1261 >         //while ((fi=fast4match(*qv, seq.chars(), rlen, fi))>=0 && fi<=imax) {
1262 >         while ((fi=w3_match(qv, seq.chars(), rlen, fi))>=0 && fi<=imax) {
1263 >           extendMatch(seq.chars(), rlen, fi, adapter5.chars(),
1264 >                      a5len, apstart, wordSize, l5,l3, a5segs, true);
1265 >           fi++;
1266 >           if (fi>imax) break;
1267 >           }
1268 >         } //for each wmer between hlen2 and hlen bases of the adaptor
1269 >     }
1270 > if (a5segs.bend<(uint)(a5len-3) || a5segs.bstart>(uint)(a5len-hlen+4)) return false;
1271 > // lastly, analyze collected a5segs for a possible gapped alignment:
1272 > GList<CSegChain> segchains(false,true,false);
1273 > #ifdef DEBUG
1274 > if (debug && a5segs.Count()>0) {
1275 >   GMessage(">>>>>>>>>   Read: %s\n",seq.chars());
1276     }
1277 < //no easy cases, find a triplet match as a seed for alignment extension
1278 < //find triplets at the right end of the adapter
1279 < for (int iw=0;iw<6;iw++) {
1280 <   apstart=a5len-iw-4;
1281 <   GStr aword=adapter5.substr(apstart,3);
1282 <   if ((fi=seq.index(aword))>=0) {
1283 <      //if (debug) GMessage("extending wmatch :%*s\n", fi+3, aword.chars());
1284 <      if (extendMatch(seq.chars(), rlen, fi, adapter5.chars(),
1285 <                   a5len, apstart, 3, l5,l3,true)) {
1286 <                     return true;
1287 <                     }
1277 > #endif
1278 > for (int i=0;i<a5segs.Count();i++) {
1279 >   if (a5segs[i]->chain==NULL) {
1280 >       if (a5segs[i]->b.end<(int)(a5len-4)) continue; //don't start a hopeless chain
1281 >       CSegChain* newchain=new CSegChain(true);
1282 >       newchain->setFreeItem(false);
1283 >       newchain->addSegPair(a5segs[i]);
1284 >       a5segs[i]->chain=newchain;
1285 >       segchains.Add(newchain); //just to free them when done
1286 >       }
1287 >   for (int j=i+1;j<a5segs.Count();j++) {
1288 >      CSegChain* chain=a5segs[i]->chain;
1289 >      if (chain->extendChain(a5segs[j])) {
1290 >         a5segs[j]->chain=chain;
1291 > #ifdef DEBUG
1292 >         if (debug) dbgPrintChain(*chain, adapter5.chars());
1293 > #endif      
1294 >      //save time by checking here if the extended chain is already acceptable for trimming
1295 >         if (chain->bend>(uint)(a5len-3) && chain->astart<4 && chain->score>a_min_chain_score) {
1296 >            l5=chain->aend;
1297 >            l3=rlen-1;
1298 >            return true;
1299 >            }
1300 >         } //chain can be extended
1301        }
1302 <   }
1302 >   } //collect segment alignments into chains
1303 > */
1304   return false; //no adapter parts found
1305   }
1306  
1307 < //conversion of phred64 to phread33
1308 < void convertQ64(GStr& q) {
1309 < for (int i=0;i<q.length();i++) q[i]-=31;
1307 > //convert qvs to/from phred64 from/to phread33
1308 > void convertPhred(GStr& q) {
1309 > for (int i=0;i<q.length();i++) q[i]+=qv_cvtadd;
1310   }
1311  
1312 < void convertQ64(char* q, int len) {
1313 < for (int i=0;i<len;i++) q[i]-=31;
1312 > void convertPhred(char* q, int len) {
1313 > for (int i=0;i<len;i++) q[i]+=qv_cvtadd;
1314   }
1315  

Diff Legend

Removed lines
+ Added lines
< Changed lines
> Changed lines