ViewVC Help
View File | Revision Log | Show Annotations | View Changeset | Root Listing
root/gclib/fqtrim/fqtrim.cpp
(Generate patch)
# Line 3 | Line 3
3   #include "GHash.hh"
4   #include "GList.hh"
5   #include <ctype.h>
6 + #include "GAlnExtend.h"
7  
8   #define USAGE "Usage:\n\
9   fqtrim [{-5 <5adapter> -3 <3adapter>|-f <adapters_file>}] [-a <min_matchlen>]\\\n\
# Line 30 | Line 31
31      (e.g. -5 CGACAGGTTCAGAGTTCTACAGTCCGACGATC)\n\
32   -3  trim the given adapter sequence at the 3' end of each read\n\
33      (e.g. -3 TCGTATGCCGTCTTCTGCTTG)\n\
34 < -a  minimum length of exact match to adaptor sequence at the proper end (6)\n\
34 > -A  disable polyA trimming (enabled by default)\n\
35 > -T  enable polyT trimming (disabled by default)\n\
36 > -y  minimum length of exact match to adaptor sequence at the proper end (6)\n\
37   -q  trim bases with quality value lower than <minq> (starting at the 3' end)\n\
38   -t  for -q option, maximum trimming at the 3' end is limited to <trim_max_len>\n\
39   -m  maximum percentage of Ns allowed in a read after trimming (default 7)\n\
# Line 75 | Line 78
78   bool isfasta=false;
79   bool convert_phred=false;
80   GStr outsuffix; // -o
78 //GStr adapter3;
79 //GStr adapter5;
81   GStr prefix;
82   GStr zcmd;
83   int num_trimmed5=0;
# Line 91 | Line 92
92   int qv_phredtype=0; // could be 64 or 33 (0 means undetermined yet)
93   int qv_cvtadd=0; //could be -31 or +31
94  
95 < int a3len=0;
96 < int a5len=0;
97 < // adaptor matching metrics -- for extendMatch() function
97 < const int match_score=2; //match score
98 < const int mismatch_score=-3; //mismatch
99 <
100 < const int a_m_score=2; //match score
101 < const int a_mis_score=-3; //mismatch
102 < const int a_dropoff_score=7;
95 > // adaptor matching metrics -- for X-drop ungapped extension
96 > const int poly_m_score=2; //match score
97 > const int poly_mis_score=-3; //mismatch
98   const int poly_dropoff_score=7;
99 < int a_min_score=12; //an exact match of 6 bases at the proper ends WILL be trimmed
105 < const int a_min_chain_score=15; //for gapped alignments
99 > int poly_minScore=12; //i.e. an exact match of 6 bases at the proper ends WILL be trimmed
100  
101 < class CSegChain;
102 <
103 < class CSegPair {
104 <  public:
105 <   GSeg a;
106 <   GSeg b; //the adapter segment
107 <   int score;
114 <   int flags;
115 <   CSegChain* chain;
116 <   CSegPair(int astart=0, int aend=0, int bstart=0, int bend=0, int mscore=0):a(astart,aend),b(bstart, bend) {
117 <      score=mscore;
118 <      if (score==0) score=a.len()*a_m_score;
119 <      flags=0;
120 <      chain=NULL;
121 <      }
122 <   int len() { return  a.len(); }
123 <   bool operator==(CSegPair& d){
124 <      //return (a.start==d.a.start && a.end==d.a.end && b.start==d.b.start && b.end==d.b.end);
125 <      //make equal even segments that are included into one another:
126 <      return (d.a.start>=a.start && d.a.end<=a.end && d.b.start>=b.start && d.b.end<=b.end);
127 <      }
128 <   bool operator>(CSegPair& d){ //ordering based on b (adaptor) start coord and score
129 <     if (b.start==d.b.start) {
130 <        if (score==d.score) {
131 <           //just try to be consistent:
132 <           if (b.end==d.b.end) {
133 <             return (a.start==d.a.start)?(a.end<d.a.end):(a.start<d.a.start);
134 <             }
135 <           return (b.end>d.b.end);
136 <           }
137 <         else return (score<d.score);
138 <        }
139 <     return (b.start>d.b.start);
140 <     }
141 <   bool operator<(CSegPair& d){ //ordering based on b (adaptor) coord
142 <     /*if (b.start==d.b.start && b.end==d.b.end) {
143 <          return (a.start==d.a.start)?(a.end<d.a.end):(a.start<d.a.start);
144 <          }
145 <     return (b.start==d.b.start)?(b.end<d.b.end):(b.start<d.b.start);*/
146 <     if (b.start==d.b.start) {
147 <        if (score==d.score) {
148 <           //just try to be consistent:
149 <           if (b.end==d.b.end) {
150 <             return (a.start==d.a.start)?(a.end>d.a.end):(a.start>d.a.start);
151 <             }
152 <           return (b.end<d.b.end);
153 <           }
154 <         else return (score>d.score);
155 <        }
156 <     return (b.start<d.b.start);
157 <     }
158 < };
159 <
160 < int cmpSegEnds(pointer sa, pointer sb) { //sort by adaptor seg ends AND score
161 < CSegPair& x = *(CSegPair *)sa;
162 < CSegPair& y = *(CSegPair *)sb;
163 < /*
164 < if (x.b.end==y.b.end) {
165 <     if (x.b.start==y.b.start) {
166 <         if (x.a.end==y.a.end) {
167 <            if (x.a.start==y.a.start) return 0;
168 <            return ((x.a.start>y.a.start) ? -1 : 1);
169 <            }
170 <          else {
171 <            return ((x.a.end>y.a.end) ? -1 : 1);
172 <            }
173 <          }
174 <      else {
175 <       return ((x.b.start>y.b.start) ? -1 : 1);
176 <       }
177 <     }
178 <    else {
179 <     return ((x.b.end>y.b.end) ? -1 : 1);
180 <     }
181 < */
182 <  if (x.b.end==y.b.end) {
183 <     if (x.score==y.score) {
184 <     if (x.b.start==y.b.start) {
185 <         if (x.a.end==y.a.end) {
186 <            if (x.a.start==y.a.start) return 0;
187 <            return ((x.a.start<y.a.start) ? -1 : 1);
188 <            }
189 <          else {
190 <            return ((x.a.end<y.a.end) ? -1 : 1);
191 <            }
192 <          }
193 <      else {
194 <       return ((x.b.start<y.b.start) ? -1 : 1);
195 <       }
196 <      } else return ((x.score>y.score) ? -1 : 1);
197 <     }
198 <    else {
199 <     return ((x.b.end>y.b.end) ? -1 : 1);
200 <     }
201 <
202 < }
203 <
204 < class CSegChain:public GList<CSegPair> {
205 < public:
206 <   uint astart;
207 <   uint aend;
208 <   uint bstart;
209 <   uint bend;
210 <   int score;
211 <   bool endSort;
212 <  CSegChain(bool aln5=false):GList<CSegPair>(true,true,true) {//sorted, free elements, unique
213 <   //as SegPairs are inserted, they will be sorted by a.start coordinate
214 <   score=0;
215 <   astart=MAX_UINT;
216 <   aend=0;
217 <   bstart=MAX_UINT;
218 <   bend=0;
219 <   endSort=aln5;
220 <   if (aln5) { setSorted(cmpSegEnds); }
221 <   }
222 < bool operator==(CSegChain& d) {
223 <   //return (score==d.score);
224 <    return (astart==d.astart && aend==d.aend && bstart==d.bstart && bend==d.bend);
225 <   }
226 < bool operator>(CSegChain& d) { // order based on b (adaptor) coordinate
227 <   //return (score<d.score);
228 <   if (bstart==d.bstart && bend==d.bend) {
229 <          return (astart==d.astart)?(aend>d.aend):(astart>d.astart);
230 <          }
231 <     return (bstart==d.bstart)?(bend>d.bend):(bstart>d.bstart);
232 <   }
233 < bool operator<(CSegChain& d) {
234 <   //return (score>d.score);
235 <   if (bstart==d.bstart && bend==d.bend) {
236 <          return (astart==d.astart)?(aend<d.aend):(astart<d.astart);
237 <          }
238 <     return (bstart==d.bstart)?(bend<d.bend):(bstart<d.bstart);
239 <   }
240 < void addSegPair(CSegPair* segp) {
241 <   if (AddIfNew(segp)!=segp) return;
242 <   score+=segp->score;
243 <   if (astart>segp->a.start) astart=segp->a.start;
244 <   if (aend<segp->a.end) aend=segp->a.end;
245 <   if (bstart>segp->b.start) bstart=segp->b.start;
246 <   if (bend<segp->b.end) bend=segp->b.end;
247 <   }
248 < //for building actual chains:
249 < bool extendChain(CSegPair* segp) { //segp expected to be "Greater Than" current chain
250 <   int bgap=0;
251 <   int agap=0;
252 <   //if (endSort) {
253 <   if (bstart>segp->b.start) {
254 <      bgap = (int)(bstart-segp->b.end);
255 <      if (abs(bgap)>2) return false;
256 <      agap = (int)(astart-segp->a.end);
257 <      if (abs(agap)>2) return false;
258 <      }
259 <     else {
260 <      bgap = (int) (segp->b.start-bend);
261 <      if (abs(bgap)>2) return false;
262 <      agap = (int)(segp->a.start-aend);
263 <      if (abs(agap)>2) return false;
101 > const char *polyA_seed="AAAA";
102 > const char *polyT_seed="TTTT";
103 > /*
104 > struct CAdapters {
105 >    GStr a5;
106 >    GStr a3;
107 >    CAdapters(const char* s5=NULL, const char* s3=NULL):a5(s5),a3(s3) {
108        }
109 <   if (agap*bgap<0) return false;
110 <   addSegPair(segp);
111 <   score-=abs(agap)+abs(bgap);
112 <   return true;
269 <   }
270 < };
109 > };
110 > */
111 > GVec<GStr> adapters5;
112 > GVec<GStr> adapters3;
113  
114   // element in dhash:
115   class FqDupRec {
# Line 317 | Line 159
159  
160   GHash<FqDupRec> dhash; //hash to keep track of duplicates
161  
162 + int loadAdapters(const char* fname);
163 +
164   void setupFiles(FILE*& f_in, FILE*& f_in2, FILE*& f_out, FILE*& f_out2,
165                         GStr& s, GStr& infname, GStr& infname2);
166   // uses outsuffix to generate output file names and open file handles as needed
# Line 331 | Line 175
175   //returns 0 if the read was untouched, 1 if it was trimmed and a trash code if it was trashed
176  
177   bool ntrim(GStr& rseq, int &l5, int &l3); //returns true if any trimming occured
334 bool polytrim(GStr& rseq, int &l5, int &l3); //returns true if any trimming occured
178   bool qtrim(GStr& qvs, int &l5, int &l3); //return true if any trimming occured
179   int dust(GStr& seq);
180 < bool trim_adapter3(GStr& seq, int &l5, int &l3); //returns true if any trimming occured
180 > bool trim_poly5(GStr &seq, int &l5, int &l3, const char* poly_seed); //returns true if any trimming occured
181 > bool trim_poly3(GStr &seq, int &l5, int &l3, const char* poly_seed);
182   bool trim_adapter5(GStr& seq, int &l5, int &l3); //returns true if any trimming occured
183 + bool trim_adapter3(GStr& seq, int &l5, int &l3);
184  
185   void convertPhred(char* q, int len);
186   void convertPhred(GStr& q);
# Line 392 | Line 237
237         else
238           GMessage("%s\nInvalid value for -p option (can only be 64 or 33)!\n",USAGE);
239       }
240 <  if (args.getOpt('3')!=NULL) {
241 <    adapter3=args.getOpt('3');
242 <    adapter3.upper();
243 <    a3len=adapter3.length();
244 <    }
245 <  if (args.getOpt('5')!=NULL) {
246 <    adapter5=args.getOpt('5');
247 <    adapter5.upper();
248 <    a5len=adapter5.length();
240 >  s=args.getOpt('f');
241 >  if (!s.is_empty()) {
242 >   loadAdapters(s.chars());
243 >   }
244 >  bool fileAdapters=adapters5.Count()+adapters3.Count();
245 >  s=args.getOpt('5');
246 >  if (!s.is_empty()) {
247 >    if (fileAdapters)
248 >      GError("Error: options -5 and -f cannot be used together!\n");
249 >    s.upper();
250 >    adapters5.Add(s);
251      }
252 <  s=args.getOpt('a');
252 >  s=args.getOpt('3');
253    if (!s.is_empty()) {
254 <     int a_minmatch=s.asInt();
255 <     a_min_score=a_minmatch<<1;
254 >    if (fileAdapters)
255 >      GError("Error: options -3 and -f cannot be used together!\n");
256 >      s.upper();
257 >      adapters3.Add(s);
258 >    }
259 >  s=args.getOpt('y');
260 >  if (!s.is_empty()) {
261 >     int minmatch=s.asInt();
262 >     poly_minScore=minmatch*poly_m_score;
263       }
264    
265    if (args.getOpt('o')!=NULL) outsuffix=args.getOpt('o');
# Line 861 | Line 715
715   return ncount;
716   }
717  
864
865 // ------------------ adapter matching - simple k-mer seed & extend, no indels for now
866 //when a k-mer match is found, simply try to extend the alignment using a drop-off scheme
867 //check minimum score and
868 //for 3' adapter trimming:
869 //     require that the right end of the alignment for either the adaptor OR the read must be
870 //     < 3 distance from its right end
871 // for 5' adapter trimming:
872 //     require that the left end of the alignment for either the adaptor OR the read must
873 //     be at coordinate < 3 from start
874
875 bool extendMatch(const char* a, int alen, int ai,
876                 const char* b, int blen, int bi, int mlen, int& l5, int& l3, CSegChain& segs, bool end5=false) {
877 //so the alignment starts at ai in a, bi in b, with a perfect match of length mlen
878 #ifdef DEBUG
879 GStr dbg(b);
880 #endif
881 //if (debug) {
882 //  GMessage(">> in %s\n\textending hit: %s at position %d\n", a, (dbg.substr(bi, mlen)).chars(), ai);
883 //  }
884 int a_l=ai; //alignment coordinates on a
885 int a_r=ai+mlen-1;
886 int b_l=bi; //alignment coordinates on b
887 int b_r=bi+mlen-1;
888 int ai_maxscore=ai;
889 int bi_maxscore=bi;
890 int score=mlen*a_m_score;
891 int maxscore=score;
892 int mism5score=a_mis_score;
893 if (end5 && ai<(alen>>1)) mism5score-=2; // increase penalty for mismatches at 5' end
894 //try to extend to the left first, if possible
895 while (ai>0 && bi>0) {
896   ai--;
897   bi--;
898   score+= (a[ai]==b[bi])? a_m_score : mism5score;
899   if (score>maxscore) {
900       ai_maxscore=ai;
901       bi_maxscore=bi;
902       maxscore=score;
903       }
904     else if (maxscore-score>a_dropoff_score) break;
905   }
906 a_l=ai_maxscore;
907 b_l=bi_maxscore;
908 //if (debug) GMessage("  after l-extend: %*s%s\t\t(score=%d)\n",a_l," ",dbg.substr(b_l,b_r-b_l+1).chars(),maxscore);
909 //now extend to the right
910 ai_maxscore=a_r;
911 bi_maxscore=b_r;
912 ai=a_r;
913 bi=b_r;
914 score=maxscore;
915 //sometimes there are extra AAAAs at the end of the read, ignore those
916 if (strcmp(&a[alen-4],"AAAA")==0) {
917    alen-=3;
918    while (a[alen-1]=='A' && alen>ai) alen--;
919    }
920 while (ai<alen-1 && bi<blen-1) {
921   ai++;
922   bi++;
923   //score+= (a[ai]==b[bi])? a_m_score : a_mis_score;
924   if (a[ai]==b[bi]) { //match
925      score+=a_m_score;
926      if (ai>=alen-2) {
927           score+=a_m_score-(alen-ai-1);
928           }
929      }
930    else { //mismatch
931      score+=a_mis_score;
932      }  
933   if (score>maxscore) {
934       ai_maxscore=ai;
935       bi_maxscore=bi;
936       maxscore=score;
937       }
938     else if (maxscore-score>a_dropoff_score) break;
939   }
940  a_r=ai_maxscore;
941  b_r=bi_maxscore;
942  int a_ovh3=alen-a_r-1;
943  int b_ovh3=blen-b_r-1;
944  int mmovh3=(a_ovh3<b_ovh3)? a_ovh3 : b_ovh3;
945  int mmovh5=(a_l<b_l)? a_l : b_l;
946  //if (debug) GMessage("  after r-extend: %*s%s\t\t(score=%d)\n",a_l," ",dbg.substr(b_l,b_r-b_l+1).chars(),maxscore);
947 #ifdef DEBUG
948  if (debug) GMessage("     extended to: %*s\n",a_r+1,dbg.substr(b_l,b_r-b_l+1).chars());
949 #endif
950  if (maxscore>=a_min_score && mmovh3<2 && mmovh5<2) {
951     if (a_l<a_ovh3) {
952        //adapter closer to the left end (typical for 5' adapter)
953        l5=a_r+1;
954        l3=alen-1;
955        }
956      else {
957        //adapter matching at the right end (typical for 3' adapter)
958        l5=0;
959        l3=a_l-1;
960        }
961     return true;
962     }
963 else { //keep this segment pair for later (gapped alignment)
964   segs.addSegPair(new CSegPair(a_l+1, a_r+1, b_l+1, b_r+1, maxscore));
965   //this will also update min & max coordinates in segs (segs.astart, .aend, .bstart, .bend)
966   }
967  //do not trim:
968  l5=0;
969  l3=alen-1;
970  return false;
971 }
972
973 /*
974 int getWordValue(const char* s, int wlen) {
975 int r=0;
976 while (wlen--) { r+=(((int)s[wlen])<<wlen) }
977 return r;
978 }
979 */
718   int get3mer_value(const char* s) {
719   return (s[0]<<16)+(s[1]<<8)+s[2];
720   }
# Line 1000 | Line 738
738   return -1;
739   }
740  
741 < int fast4match(int32 qv, const char* str, int slen, int start_index=0) {
742 < if (start_index>=slen || start_index<0) return -1;
743 < for (int i=start_index;i<slen-4;i++) {
744 <   int32* rv=(int32*)(str+i);
745 <   if (*rv==qv) return i;
746 <   }
747 < return -1;
748 < }
749 <
750 < int fast4rmatch(int32 qv, const char* str, int slen, int end_index=-1) {
751 < if (end_index>=slen) return -1;
752 < if (end_index<0) end_index=slen-1;
753 < for (int i=end_index-3;i>=0;i--) {
754 <   int32* rv=(int32*)(str+i);
755 <   if (*rv==qv) return i;
756 <   }
1019 < return -1;
1020 < }
1021 <
1022 < #ifdef DEBUG
1023 < void dbgPrintChain(CSegChain& chain, const char* aseq) {
1024 <  GStr s(aseq);
1025 <  for (int i=0;i<chain.Count();i++) {
1026 <   CSegPair& seg=*chain[i];
1027 <   GMessage("  dbg chain seg%d: %*s [%d-%d:%d-%d]\n",i,seg.a.start-1+seg.len(),
1028 <            s.substr(seg.b.start-1, seg.len()).chars(), seg.b.start,seg.b.end,seg.a.start,seg.a.end);
1029 <   }
1030 < }
1031 < #endif
1032 < const char *pA_seed="AAAA";
1033 < const char *pT_seed="TTTT";
1034 < const int polyA_seed=*(int*)pA_seed;
1035 < const int polyT_seed=*(int*)pT_seed;
741 > struct SLocScore {
742 >  int pos;
743 >  int score;
744 >  SLocScore(int p=0,int s=0) {
745 >    pos=p;
746 >    score=s;
747 >    }
748 >  void set(int p, int s) {
749 >    pos=p;
750 >    score=s;
751 >    }
752 >  void add(int p, int add) {
753 >    pos=p;
754 >    score+=add;
755 >    }
756 > };
757  
758 < bool trim_poly3(GStr &seq, int &l5, int &l3) {
758 > bool trim_poly3(GStr &seq, int &l5, int &l3, const char* poly_seed) {
759   int rlen=seq.length();
760   l5=0;
761   l3=rlen-1;
762 + int32 seedVal=*(int32*)poly_seed;
763 + char polyChar=poly_seed[0];
764   //assumes N trimming was already done
765   //so a poly match should be very close to the end of the read
1043 //TODO: should we attempt polyA and polyT at the same time, same end?
766   // -- find the initial match (seed)
767 < int rlo=GMAX((rlen-10), 0);
768 < int si;
769 < for (si=rlen-5;si>rlo;si--) {
770 <   if (polyA_seed==*(int*)&(seq.chars()+si)) {
767 > int lmin=GMAX((rlen-12), 0);
768 > int li;
769 > for (li=rlen-4;li>lmin;li--) {
770 >   if (seedVal==*(int*)&(seq[li])) {
771        break;
772        }
773     }
774 < if (si<=rlo) return false;
775 < //seed found, try to extend it to left and right
776 < int score=4*match_score;
777 < max_rlo=rlo;
778 < rlo--;
779 < while (rlo>=0) {
780 <    rlo--;
781 <    score+=
782 <    
774 > if (li<=lmin) return false;
775 > //seed found, try to extend it both ways
776 > //extend right
777 > int ri=li+3;
778 > SLocScore loc(ri, poly_m_score<<2);
779 > SLocScore maxloc(loc);
780 > //extend right
781 > while (ri<rlen-2) {
782 >   ri++;
783 >   if (seq[ri]==polyChar) {
784 >                loc.add(ri,poly_m_score);
785 >                }
786 >   else if (seq[ri]=='N') {
787 >                loc.add(ri,0);
788 >                }
789 >   else { //mismatch
790 >        loc.add(ri,poly_mis_score);
791 >        if (maxloc.score-loc.score>poly_dropoff_score) break;
792 >        }
793 >   if (maxloc.score<=loc.score) {
794 >      maxloc=loc;
795 >      }
796 >   }
797 > ri=maxloc.pos;
798 > if (ri<rlen-3) return false; //no trimming wanted, too far from 3' end
799 > //ri = right boundary for the poly match
800 > //extend left
801 > loc.set(li, maxloc.score);
802 > maxloc.pos=li;
803 > while (li>0) {
804 >    li--;
805 >    if (seq[li]==polyChar) {
806 >                 loc.add(li,poly_m_score);
807 >                 }
808 >    else if (seq[li]=='N') {
809 >                 loc.add(li,0);
810 >                 }
811 >    else { //mismatch
812 >         loc.add(li,poly_mis_score);
813 >         if (maxloc.score-loc.score>poly_dropoff_score) break;
814 >         }
815 >    if (maxloc.score<=loc.score) {
816 >       maxloc=loc;
817 >       }
818 >    }
819 > if (maxloc.score>poly_minScore && ri>=rlen-3) {
820 >    l5=li;
821 >    l3=ri;
822 >    return true;
823      }
824 + return false;
825   }
826  
827 < bool trim_adapter3(GStr& seq, int&l5, int &l3) {
827 >
828 > bool trim_poly5(GStr &seq, int &l5, int &l3, const char* poly_seed) {
829   int rlen=seq.length();
830   l5=0;
831   l3=rlen-1;
832 < //first try a full match, we might get lucky
833 < int fi=-1;
834 < if ((fi=seq.index(adapter3))>=0) {
835 <   if (fi<rlen-fi-a3len) {//match is closer to the right end
836 <      l5=fi+a3len;
837 <      l3=rlen-1;
838 <      }
839 <    else {
840 <      l5=0;
841 <      l3=fi-1;
832 > int32 seedVal=*(int32*)poly_seed;
833 > char polyChar=poly_seed[0];
834 > //assumes N trimming was already done
835 > //so a poly match should be very close to the end of the read
836 > // -- find the initial match (seed)
837 > int lmax=GMIN(8, rlen-4);//how far from 5' end to look for 4-mer seeds
838 > int li;
839 > for (li=0;li<=lmax;li++) {
840 >   if (seedVal==*(int*)&(seq[li])) {
841 >      break;
842        }
1079   return true;
843     }
844 < #ifdef DEBUG
845 < if (debug) GMessage(">TRIM3 >>   Read: %s\n",seq.chars());
846 < #endif
847 <
848 < //also, for fast detection of other adapter-only reads that start past
849 < // the beginning of the adapter sequence, try to see if the first a3len-4
850 < // bases of the read are a substring of the adapter
851 < if (rlen>a3len-3) {
852 <   GStr rstart=seq.substr(1,a3len-4);
853 <   if ((fi=adapter3.index(rstart))>=0) {
854 <     l3=rlen-1;
855 <     l5=a3len-4;
856 <     while (fi+l5<a3len && l5<l3 && adapter3[fi+l5]==seq[l5]) l5++;
857 <     return true;
858 <     }
859 <  }
860 < CSegChain a3segs; //no chains here, just an ordered collection of segment pairs
861 <  //check the easy cases - 11 bases exact match at the end
862 < int fdlen=11;
863 <  if (a3len<16) {
1101 <   fdlen=a3len>>1;
1102 <   }
1103 < if (fdlen>4) {
1104 <     //check if we're lucky enough to have the last 11 bases of the read a part of the adapter
1105 <     GStr rstart=seq.substr(-fdlen-3,fdlen);
1106 <     if ((fi=adapter3.index(rstart))>=0) {
1107 < #ifdef DEBUG
1108 <       if (debug) GMessage("  W11match found: %*s\n", rlen-3, (adapter3.substr(fi,fdlen)).chars());
1109 < #endif
1110 <       if (extendMatch(seq.chars(), rlen, rlen-fdlen-3,
1111 <                     adapter3.chars(), a3len, fi,  fdlen, l5,l3, a3segs))
1112 <            return true;
1113 <       }
1114 <     //another easy case: first 11 characters of the adaptor found as a substring of the read
1115 <     GStr bstr=adapter3.substr(0, fdlen);
1116 <     if ((fi=seq.rindex(bstr))>=0) {
1117 < #ifdef DEBUG
1118 <       if (debug) GMessage("  A11match found: %*s\n", fi+fdlen, bstr.chars());
1119 < #endif
1120 <       if (extendMatch(seq.chars(), rlen, fi,
1121 <                     adapter3.chars(), a3len, 0,  fdlen, l5,l3, a3segs))
1122 <            return true;
844 > if (li>lmax) return false;
845 > //seed found, try to extend it both ways
846 > //extend left
847 > int ri=li+3; //save rightmost base of the seed
848 > SLocScore loc(li, poly_m_score<<2);
849 > SLocScore maxloc(loc);
850 > while (li>0) {
851 >    li--;
852 >    if (seq[li]==polyChar) {
853 >                 loc.add(li,poly_m_score);
854 >                 }
855 >    else if (seq[li]=='N') {
856 >                 loc.add(li,0);
857 >                 }
858 >    else { //mismatch
859 >         loc.add(li,poly_mis_score);
860 >         if (maxloc.score-loc.score>poly_dropoff_score) break;
861 >         }
862 >    if (maxloc.score<=loc.score) {
863 >       maxloc=loc;
864         }
865 <     } //tried to match 11 bases first
866 <    
867 < //no easy cases, so let's do the wmer hashing for the first 12 bases of the adaptor
868 < //-- only extend if the match is in the 3' (ending) region of the read
869 < int wordSize=3;
870 < int hlen=12;
871 < if (hlen>a3len-wordSize) hlen=a3len-wordSize;
872 < int imin=rlen>>1; //last half of the read, left boundary for the wmer match
873 < if (imin<a3len) { imin=GMIN(a3len, rlen-wordSize); }
874 < imin=rlen-imin;
875 < for (int iw=0;iw<hlen;iw++) {
876 <   //int32* qv=(int32*)(adapter3.chars()+iw);
877 <   int qv=get3mer_value(adapter3.chars()+iw);
878 <   fi=-1;
879 <   //while ((fi=fast4rmatch(*qv, seq.chars(), rlen, fi))>=0 && fi>=imin) {
880 <   while ((fi=w3_rmatch(qv, seq.chars(), rlen, fi))>=0 && fi>=imin) {
881 <     //GMessage(" ... fi=%d after w3_rmatch() (imin=%d)\n", fi, imin);
882 <
883 < #ifdef DEBUG
1143 <     if (debug) GMessage("    Wmatch found: %*s\n", fi+wordSize, (adapter3.substr(iw,wordSize)).chars());
1144 < #endif
1145 <     if (extendMatch(seq.chars(), rlen, fi, adapter3.chars(),
1146 <                   a3len, iw, wordSize, l5,l3, a3segs)) return true;
1147 <     fi--;
1148 <     if (fi<imin) break;
1149 <     }
1150 <   } //for each wmer in the first hlen bases of the adaptor
1151 < /*
1152 < //couldn't find a good trimming extension, hash 12 more bases of the adapter to collect more segment pairs there
1153 < //but only do this if we already have segment pairs collected in the last 12 bases of the adapter
1154 < if (a3segs.bstart>3 || a3segs.bend<(uint)(hlen-wordSize)) return false;
1155 < int hlen2=a3len-wordSize;
1156 < //if (hlen2>a3len-4) hlen2=a3len-4;
1157 < if (hlen2>hlen) {
1158 < #ifdef DEBUG
1159 <     if (debug && a3segs.Count()>0) {
1160 <        GMessage("  >>>>>2nd. hash: %s\n",seq.chars());
865 >    }
866 > li=maxloc.pos;
867 > if (li>3) return false; //no trimming wanted, too far from 5' end
868 > //li = right boundary for the poly match
869 >
870 > //extend right
871 > loc.set(ri, maxloc.score);
872 > maxloc.pos=ri;
873 > while (ri<rlen-2) {
874 >   ri++;
875 >   if (seq[ri]==polyChar) {
876 >                loc.add(ri,poly_m_score);
877 >                }
878 >   else if (seq[ri]=='N') {
879 >                loc.add(ri,0);
880 >                }
881 >   else { //mismatch
882 >        loc.add(ri,poly_mis_score);
883 >        if (maxloc.score-loc.score>poly_dropoff_score) break;
884          }
885 < #endif
886 <     for (int iw=hlen;iw<hlen2;iw++) {
1164 <         //int* qv=(int32 *)(adapter3.chars()+iw);
1165 <         int qv=get3mer_value(adapter3.chars()+iw);
1166 <         fi=-1;
1167 <         //while ((fi=fast4rmatch(*qv, seq.chars(), rlen, fi))>=0 && fi>=imin) {
1168 <         while ((fi=w3_rmatch(qv, seq.chars(), rlen, fi))>=0 && fi>=imin) {
1169 <           extendMatch(seq.chars(), rlen, fi, adapter3.chars(),
1170 <                         a3len, iw, wordSize, l5,l3, a3segs);
1171 <           fi--;
1172 <           if (fi<imin) break;
1173 <           }
1174 <         } //for each wmer between hlen2 and hlen bases of the adaptor
1175 <     }
1176 < //lastly, analyze collected a3segs for a possible gapped alignment:
1177 < GList<CSegChain> segchains(false,true,false);
1178 < #ifdef DEBUG
1179 < if (debug && a3segs.Count()>0) {
1180 <   GMessage(">>>>>>>>>   Read: %s\n",seq.chars());
1181 <   }
1182 < #endif
1183 < for (int i=0;i<a3segs.Count();i++) {
1184 <   if (a3segs[i]->chain==NULL) {
1185 <       if (a3segs[i]->b.start>3) continue; //don't start a hopeless chain
1186 <       CSegChain* newchain=new CSegChain();
1187 <       newchain->setFreeItem(false);
1188 <       newchain->addSegPair(a3segs[i]);
1189 <       a3segs[i]->chain=newchain;
1190 <       segchains.Add(newchain); //just to free them when done
1191 <       }
1192 <   for (int j=i+1;j<a3segs.Count();j++) {
1193 <      CSegChain* chain=a3segs[i]->chain;
1194 <      if (chain->extendChain(a3segs[j])) {
1195 <          a3segs[j]->chain=chain;
1196 < #ifdef DEBUG
1197 <          if (debug) dbgPrintChain(*chain, adapter3.chars());
1198 < #endif      
1199 <          //save time by checking here if the extended chain is already acceptable for trimming
1200 <          if (chain->aend>(uint)(rlen-4) && chain->bstart<4 && chain->score>a_min_chain_score) {
1201 <            l5=0;
1202 <            l3=chain->astart-2;
1203 < #ifdef DEBUG
1204 <          if (debug && a3segs.Count()>0) {
1205 <            GMessage(">>> >> trimmed-3: %*s\n",l3-l5+1,seq.substr(l5,l3-l5+1).chars());
1206 <            }
1207 < #endif
1208 <            return true;
1209 <            }
1210 <          } //chain can be extended
885 >   if (maxloc.score<=loc.score) {
886 >      maxloc=loc;
887        }
888 <   } //collect segment alignments into chains
889 < */  
888 >   }
889 >
890 > if (maxloc.score>poly_minScore && li<=3) {
891 >    l5=li;
892 >    l3=ri;
893 >    return true;
894 >    }
895 > return false;
896 > }
897 >
898 > bool trim_adapter3(GStr& seq, int&l5, int &l3) {
899 > int rlen=seq.length();
900 > l5=0;
901 > l3=rlen-1;
902 >
903   return false; //no adapter parts found
904   }
905  
906   bool trim_adapter5(GStr& seq, int&l5, int &l3) {
1218 //if (debug) GMessage("trim_adapter5 on: %s\n", seq.chars());
907   int rlen=seq.length();
908   l5=0;
909   l3=rlen-1;
910 < //try to see if adapter is fully included in the read
911 < int fi=-1;
912 < if ((fi=seq.index(adapter5))>=0) {
913 <   if (fi<rlen-fi-a5len) {//match is closer to the right end
914 <      l5=fi+a5len;
1227 <      l3=rlen-1;
1228 <      }
1229 <    else {
1230 <      l5=0;
1231 <      l3=fi-1;
1232 <      }
1233 <   return true;
1234 <   }
1235 < #ifdef DEBUG
1236 < if (debug) GMessage(">TRIM5 >>   Read: %s\n",seq.chars());
1237 < #endif
1238 <
1239 < CSegChain a5segs(true); //list of segment pairs to analyze later if no extendMatch succeeded
1240 <
1241 < //try the easy way out first - look for an exact match of 11 bases
1242 < int fdlen=11;
1243 <  if (a5len<16) {
1244 <   fdlen=a5len>>1;
1245 <   }
1246 < if (fdlen>4) {
1247 <     GStr rstart=seq.substr(1,fdlen); //skip the first base as it's sometimes bogus
1248 <     if ((fi=adapter5.index(rstart))>=0) {
1249 < #ifdef DEBUG
1250 <       if (debug) GMessage("  W11match found: %*s\n", 1+fdlen, (adapter3.substr(fi,fdlen)).chars());
1251 < #endif
1252 <       if (extendMatch(seq.chars(), rlen, 1,
1253 <                     adapter5.chars(), a5len, fi,  fdlen, l5,l3, a5segs, true))
1254 <           return true;
1255 <       }
1256 <     //another easy case: last 11 characters of the adaptor found as a substring of the read
1257 <     GStr bstr=adapter5.substr(-fdlen);
1258 <     if ((fi=seq.index(bstr))>=0) {
1259 < #ifdef DEBUG
1260 <       if (debug) GMessage("  A11match found: %*s\n", fi+fdlen, bstr.chars());
1261 < #endif
1262 <       if (extendMatch(seq.chars(), rlen, fi,
1263 <                     adapter5.chars(), a5len, a5len-fdlen,  fdlen, l5,l3,a5segs,true))
1264 <          return true;
1265 <       }
1266 <     } //tried to matching at most 11 bases first
1267 <    
1268 < //-- no easy cases, do the wmer hashing for the last 12 bases of the adaptor
1269 < //-- only extend a wmer if it matches in the 5' (beginning) region of the read
1270 < int wordSize=3;
1271 < int hlen=12;
1272 < if (hlen>a5len-wordSize) hlen=a5len-wordSize;
1273 < int imax=rlen>>1; //first half of the read, right boundary for the wmer match
1274 < if (imax<a5len) { imax=GMIN(a5len, rlen-wordSize); }
1275 < for (int iw=0;iw<=hlen;iw++) {
1276 <   int apstart=a5len-iw-wordSize;
1277 <   fi=0;
1278 <   //int* qv=(int32 *)(adapter5.chars()+apstart);
1279 <   int qv=get3mer_value(adapter5.chars()+apstart);
1280 <   //while ((fi=fast4match(*qv, seq.chars(), rlen, fi))>=0 && fi<=imax) {
1281 <   while ((fi=w3_match(qv, seq.chars(), rlen, fi))>=0 && fi<=imax) {
1282 < #ifdef DEBUG
1283 <     if (debug) GMessage("    Wmatch found: %*s\n", fi+wordSize, (adapter5.substr(apstart,wordSize)).chars());
1284 < #endif
1285 <     if (extendMatch(seq.chars(), rlen, fi, adapter5.chars(),
1286 <                a5len, apstart, wordSize, l5,l3, a5segs, true)) return true;
1287 <     fi++;
1288 <     if (fi>imax) break;
1289 <     }
1290 <   } //for each wmer in the last hlen bases of the adaptor
1291 < /*
910 > bool trimmed=false;
911 > for (int ai=0;ai<adapters5.Count();ai++) {
912 >  if (adapters5[ai].is_empty()) continue;
913 >  int a5len=adapters5[ai].length();
914 >  GStr& adapter5=adapters5[ai];
915  
916 < //couldn't find a good trimming extension, hash 12 more bases of the adapter to collect more segment pairs there
917 < //but only do this if we already have segment pairs collected in the last 12 bases of the adapter
918 < if (a5segs.bend<(uint)(a5len-3) || a5segs.bstart>(uint)(a5len-hlen+4)) return false;
1296 < int hlen2=a5len-wordSize;
1297 < //if (hlen2>a5len-wordSize) hlen2=a5len-wordSize;
1298 < #ifdef DEBUG
1299 <      if (debug && a5segs.Count()>0) {
1300 <        GMessage("  >>>>>2nd. hash: %s\n",seq.chars());
1301 <        }
1302 < #endif
1303 < if (hlen2>hlen) {
1304 <     for (int iw=hlen+1;iw<=hlen2;iw++) {
1305 <         int apstart=a5len-iw-wordSize;
1306 <         fi=0;
1307 <         //int* qv=(int32 *)(adapter5.chars()+apstart);
1308 <         int qv=get3mer_value(adapter5.chars()+apstart);
1309 <         //while ((fi=fast4match(*qv, seq.chars(), rlen, fi))>=0 && fi<=imax) {
1310 <         while ((fi=w3_match(qv, seq.chars(), rlen, fi))>=0 && fi<=imax) {
1311 <           extendMatch(seq.chars(), rlen, fi, adapter5.chars(),
1312 <                      a5len, apstart, wordSize, l5,l3, a5segs, true);
1313 <           fi++;
1314 <           if (fi>imax) break;
1315 <           }
1316 <         } //for each wmer between hlen2 and hlen bases of the adaptor
1317 <     }
1318 < if (a5segs.bend<(uint)(a5len-3) || a5segs.bstart>(uint)(a5len-hlen+4)) return false;
1319 < // lastly, analyze collected a5segs for a possible gapped alignment:
1320 < GList<CSegChain> segchains(false,true,false);
1321 < #ifdef DEBUG
1322 < if (debug && a5segs.Count()>0) {
1323 <   GMessage(">>>>>>>>>   Read: %s\n",seq.chars());
1324 <   }
1325 < #endif
1326 < for (int i=0;i<a5segs.Count();i++) {
1327 <   if (a5segs[i]->chain==NULL) {
1328 <       if (a5segs[i]->b.end<(int)(a5len-4)) continue; //don't start a hopeless chain
1329 <       CSegChain* newchain=new CSegChain(true);
1330 <       newchain->setFreeItem(false);
1331 <       newchain->addSegPair(a5segs[i]);
1332 <       a5segs[i]->chain=newchain;
1333 <       segchains.Add(newchain); //just to free them when done
1334 <       }
1335 <   for (int j=i+1;j<a5segs.Count();j++) {
1336 <      CSegChain* chain=a5segs[i]->chain;
1337 <      if (chain->extendChain(a5segs[j])) {
1338 <         a5segs[j]->chain=chain;
1339 < #ifdef DEBUG
1340 <         if (debug) dbgPrintChain(*chain, adapter5.chars());
1341 < #endif      
1342 <      //save time by checking here if the extended chain is already acceptable for trimming
1343 <         if (chain->bend>(uint)(a5len-3) && chain->astart<4 && chain->score>a_min_chain_score) {
1344 <            l5=chain->aend;
1345 <            l3=rlen-1;
1346 <            return true;
1347 <            }
1348 <         } //chain can be extended
1349 <      }
1350 <   } //collect segment alignments into chains
1351 < */
1352 < return false; //no adapter parts found
1353 < }
916 >  }//for each 5' adapter
917 >  return trimmed;
918 > }
919  
920   //convert qvs to/from phred64 from/to phread33
921   void convertPhred(GStr& q) {
# Line 1414 | Line 979
979        }
980      }// fastq
981   // } //<-- FASTA or FASTQ
982 < rseq.upper(); //TODO: what if we care about masking?
982 > rseq.upper();
983   return true;
984   }
985  
# Line 1458 | Line 1023
1023     w5=0;
1024     w3=wseq.length()-1;
1025     }
1026 < if (a3len>0) {
1026 > if (adapters3.Count()>0) {
1027    if (trim_adapter3(wseq, w5, w3)) {
1028       int trimlen=wseq.length()-(w3-w5+1);
1029       num_trimmed3++;
# Line 1475 | Line 1040
1040           wqv=wqv.substr(w5, w3-w5+1);
1041        }//some adapter was trimmed
1042     } //adapter trimming
1043 < if (a5len>0) {
1043 > if (adapters5.Count()>0) {
1044    if (trim_adapter5(wseq, w5, w3)) {
1045       int trimlen=wseq.length()-(w3-w5+1);
1046       num_trimmed5++;
# Line 1642 | Line 1207
1207      }
1208   }
1209  
1210 +
1211 + int loadAdapters(const char* fname) {
1212 +  GLineReader lr(fname);
1213 +  char* l;
1214 +  while ((l=lr.nextLine())!=NULL) {
1215 +   if (lr.length()<=3 || l[0]=='#') continue;
1216 +   if ( l[0]==' ' || l[0]=='\t' || l[0]==',' ||
1217 +        l[0]==';'|| l[0]==':' ) {
1218 +      int i=1;
1219 +      while (l[i]!=0 && isspace(l[i])) {
1220 +        i++;
1221 +        }
1222 +      if (l[i]!=0) {
1223 +        GStr s(&(l[i]));
1224 +        adapters3.Add(s);
1225 +        continue;
1226 +        }
1227 +      }
1228 +    else {
1229 +      GStr s(l);
1230 +      s.startTokenize("\t ;,:");
1231 +      GStr a5,a3;
1232 +      if (s.nextToken(a5))
1233 +         s.nextToken(a3);
1234 +      a5.upper();
1235 +      a3.upper();
1236 +      adapters5.Add(a5);
1237 +      adapters3.Add(a3);
1238 +      }
1239 +   }
1240 +   return adapters5.Count()+adapters3.Count();
1241 + }
1242 +
1243   void setupFiles(FILE*& f_in, FILE*& f_in2, FILE*& f_out, FILE*& f_out2,
1244                         GStr& s, GStr& infname, GStr& infname2) {
1245   // uses outsuffix to generate output file names and open file handles as needed

Diff Legend

Removed lines
+ Added lines
< Changed lines
> Changed lines