ViewVC Help
View File | Revision Log | Show Annotations | View Changeset | Root Listing
root/gclib/fqtrim/fqtrim.cpp
(Generate patch)
# Line 3 | Line 3
3   #include "GHash.hh"
4   #include "GList.hh"
5   #include <ctype.h>
6 + #include "GAlnExtend.h"
7  
8   #define USAGE "Usage:\n\
9 < fqtrim [-5 <5adapter>] [-3 <3adapter>] [-a <min_matchlen>] [-p {64|33}] [-q <minq> [-t <trim_max>]]\\\n\
10 <   [-n <rename_prefix>] [-o <outsuffix>] [-z <zcmd>] [-r <discarded.lst>]\\\n\
11 <   [-l <minlen>] [-C] [-D] [-Q] <input.fq>[,<input_mates.fq>\n\
9 > fqtrim [{-5 <5adapter> -3 <3adapter>|-f <adapters_file>}] [-a <min_matchlen>]\\\n\
10 >   [-q <minq> [-t <trim_max_len>]] [-p {64|33}] [-o <outsuffix>]\\\n\
11 >   [-l <minlen>] [-C] [-D] [-Q] [-n <rename_prefix>] [-r <discarded.lst>]\\\n\
12 >    <input.fq>[,<input_mates.fq>\n\
13   \n\
14 < Trim low quality bases at the 3' end, optionally trim adapter sequence, filter\n\
15 < for low complexity and collapse duplicate reads\n\
14 > Trim low quality bases at the 3' end and can trim adapter sequence(s), filter\n\
15 > for low complexity and collapse duplicate reads.\n\
16   If read pairs should be trimmed and kept together (i.e. without discarding\n\
17   one read in a pair), the two file names should be given delimited by a comma\n\
18   or a colon character\n\
# Line 19 | Line 21
21   -n  rename all the reads using the <prefix> followed by a read counter;\n\
22      if -C option was given, the suffix \"_x<N>\" is appended, with <N> being\n\
23      the read duplication count\n\
24 < -o  write the trimmed/filtered reads to file(s) named <input>.<outsuffix>\n\
25 <    which will be created in the current (working) directory\n\
24 > -o  unless this parameter is '-', write the trimmed/filtered reads to \n\
25 >    file(s) named <input>.<outsuffix> which will be created in the \n\
26 >    current (working) directory; (writes to stdout if -o- is given);\n\
27 >    a suffix ending with .gz, .gzip or .bz2 will enforce compression\n\
28 > -f  file with adapter sequences to trim, each line having this format:\n\
29 >    <5'-adapter-sequence> <3'-adapter-sequence>\n\
30   -5  trim the given adapter or primer sequence at the 5' end of each read\n\
31      (e.g. -5 CGACAGGTTCAGAGTTCTACAGTCCGACGATC)\n\
32   -3  trim the given adapter sequence at the 3' end of each read\n\
33      (e.g. -3 TCGTATGCCGTCTTCTGCTTG)\n\
34 < -a  minimum bases to match to adaptor sequence (default 5)\n\
34 > -A  disable polyA trimming (enabled by default)\n\
35 > -T  enable polyT trimming (disabled by default)\n\
36 > -y  minimum length of exact match to adaptor sequence at the proper end (6)\n\
37   -q  trim bases with quality value lower than <minq> (starting at the 3' end)\n\
38 < -t  for -q option, maximum trimming at the 3' end is limited to <trim_max>\n\
38 > -t  for -q option, maximum trimming at the 3' end is limited to <trim_max_len>\n\
39   -m  maximum percentage of Ns allowed in a read after trimming (default 7)\n\
40   -l  minimum \"clean\" length after trimming that a read must have\n\
41      in order to pass the filter (default: 16)\n\
# Line 48 | Line 56
56  
57   // example 3' adapter for miRNAs: TCGTATGCCGTCTTCTGCTTG
58  
59 < //For pair ends sequencing:
59 > //For paired reads sequencing:
60   //3' : ACACTCTTTCCCTACACGACGCTCTTCCGATCT
61   //5' : GATCGGAAGAGCGGTTCAGCAGGAATGCCGAG
62   //FILE* f_out=NULL; //stdout if not provided
# Line 70 | Line 78
78   bool isfasta=false;
79   bool convert_phred=false;
80   GStr outsuffix; // -o
73 GStr adapter3;
74 GStr adapter5;
81   GStr prefix;
82   GStr zcmd;
83   int num_trimmed5=0;
# Line 86 | Line 92
92   int qv_phredtype=0; // could be 64 or 33 (0 means undetermined yet)
93   int qv_cvtadd=0; //could be -31 or +31
94  
95 < int a3len=0;
96 < int a5len=0;
97 < // adaptor matching metrics -- for extendMatch() function
98 < const int a_m_score=2; //match score
99 < const int a_mis_score=-3; //mismatch
100 < const int a_dropoff_score=7;
101 < int a_min_score=10; //an exact match of 5 bases at the proper ends WILL be trimmed
102 < const int a_min_chain_score=15; //for gapped alignments
103 <
104 < class CSegChain;
105 <
106 < class CSegPair {
107 <  public:
102 <   GSeg a;
103 <   GSeg b; //the adapter segment
104 <   int score;
105 <   int flags;
106 <   CSegChain* chain;
107 <   CSegPair(int astart=0, int aend=0, int bstart=0, int bend=0, int mscore=0):a(astart,aend),b(bstart, bend) {
108 <      score=mscore;
109 <      if (score==0) score=a.len()*a_m_score;
110 <      flags=0;
111 <      chain=NULL;
95 > // adaptor matching metrics -- for X-drop ungapped extension
96 > const int poly_m_score=2; //match score
97 > const int poly_mis_score=-3; //mismatch
98 > const int poly_dropoff_score=7;
99 > int poly_minScore=12; //i.e. an exact match of 6 bases at the proper ends WILL be trimmed
100 >
101 > const char *polyA_seed="AAAA";
102 > const char *polyT_seed="TTTT";
103 >
104 > struct CAdapters {
105 >    GStr a5;
106 >    GStr a3;
107 >    CAdapters(const char* s5=NULL, const char* s3=NULL):a5(s5),a3(s3) {
108        }
109 <   int len() { return  a.len(); }
114 <   bool operator==(CSegPair& d){
115 <      //return (a.start==d.a.start && a.end==d.a.end && b.start==d.b.start && b.end==d.b.end);
116 <      //make equal even segments that are included into one another:
117 <      return (d.a.start>=a.start && d.a.end<=a.end && d.b.start>=b.start && d.b.end<=b.end);
118 <      }
119 <   bool operator>(CSegPair& d){ //ordering based on b (adaptor) start coord and score
120 <     if (b.start==d.b.start) {
121 <        if (score==d.score) {
122 <           //just try to be consistent:
123 <           if (b.end==d.b.end) {
124 <             return (a.start==d.a.start)?(a.end<d.a.end):(a.start<d.a.start);
125 <             }
126 <           return (b.end>d.b.end);
127 <           }
128 <         else return (score<d.score);
129 <        }
130 <     return (b.start>d.b.start);
131 <     }
132 <   bool operator<(CSegPair& d){ //ordering based on b (adaptor) coord
133 <     /*if (b.start==d.b.start && b.end==d.b.end) {
134 <          return (a.start==d.a.start)?(a.end<d.a.end):(a.start<d.a.start);
135 <          }
136 <     return (b.start==d.b.start)?(b.end<d.b.end):(b.start<d.b.start);*/
137 <     if (b.start==d.b.start) {
138 <        if (score==d.score) {
139 <           //just try to be consistent:
140 <           if (b.end==d.b.end) {
141 <             return (a.start==d.a.start)?(a.end>d.a.end):(a.start>d.a.start);
142 <             }
143 <           return (b.end<d.b.end);
144 <           }
145 <         else return (score>d.score);
146 <        }
147 <     return (b.start<d.b.start);
148 <     }
149 < };
150 <
151 < int cmpSegEnds(pointer sa, pointer sb) { //sort by adaptor seg ends AND score
152 < CSegPair& x = *(CSegPair *)sa;
153 < CSegPair& y = *(CSegPair *)sb;
154 < /*
155 < if (x.b.end==y.b.end) {
156 <     if (x.b.start==y.b.start) {
157 <         if (x.a.end==y.a.end) {
158 <            if (x.a.start==y.a.start) return 0;
159 <            return ((x.a.start>y.a.start) ? -1 : 1);
160 <            }
161 <          else {
162 <            return ((x.a.end>y.a.end) ? -1 : 1);
163 <            }
164 <          }
165 <      else {
166 <       return ((x.b.start>y.b.start) ? -1 : 1);
167 <       }
168 <     }
169 <    else {
170 <     return ((x.b.end>y.b.end) ? -1 : 1);
171 <     }
172 < */
173 <  if (x.b.end==y.b.end) {
174 <     if (x.score==y.score) {
175 <     if (x.b.start==y.b.start) {
176 <         if (x.a.end==y.a.end) {
177 <            if (x.a.start==y.a.start) return 0;
178 <            return ((x.a.start<y.a.start) ? -1 : 1);
179 <            }
180 <          else {
181 <            return ((x.a.end<y.a.end) ? -1 : 1);
182 <            }
183 <          }
184 <      else {
185 <       return ((x.b.start<y.b.start) ? -1 : 1);
186 <       }
187 <      } else return ((x.score>y.score) ? -1 : 1);
188 <     }
189 <    else {
190 <     return ((x.b.end>y.b.end) ? -1 : 1);
191 <     }
192 <
193 < }
109 > };
110  
111 < class CSegChain:public GList<CSegPair> {
196 < public:
197 <   uint astart;
198 <   uint aend;
199 <   uint bstart;
200 <   uint bend;
201 <   int score;
202 <   bool endSort;
203 <  CSegChain(bool aln5=false):GList<CSegPair>(true,true,true) {//sorted, free elements, unique
204 <   //as SegPairs are inserted, they will be sorted by a.start coordinate
205 <   score=0;
206 <   astart=MAX_UINT;
207 <   aend=0;
208 <   bstart=MAX_UINT;
209 <   bend=0;
210 <   endSort=aln5;
211 <   if (aln5) { setSorted(cmpSegEnds); }
212 <   }
213 < bool operator==(CSegChain& d) {
214 <   //return (score==d.score);
215 <    return (astart==d.astart && aend==d.aend && bstart==d.bstart && bend==d.bend);
216 <   }
217 < bool operator>(CSegChain& d) { // order based on b (adaptor) coordinate
218 <   //return (score<d.score);
219 <   if (bstart==d.bstart && bend==d.bend) {
220 <          return (astart==d.astart)?(aend>d.aend):(astart>d.astart);
221 <          }
222 <     return (bstart==d.bstart)?(bend>d.bend):(bstart>d.bstart);
223 <   }
224 < bool operator<(CSegChain& d) {
225 <   //return (score>d.score);
226 <   if (bstart==d.bstart && bend==d.bend) {
227 <          return (astart==d.astart)?(aend<d.aend):(astart<d.astart);
228 <          }
229 <     return (bstart==d.bstart)?(bend<d.bend):(bstart<d.bstart);
230 <   }
231 < void addSegPair(CSegPair* segp) {
232 <   if (AddIfNew(segp)!=segp) return;
233 <   score+=segp->score;
234 <   if (astart>segp->a.start) astart=segp->a.start;
235 <   if (aend<segp->a.end) aend=segp->a.end;
236 <   if (bstart>segp->b.start) bstart=segp->b.start;
237 <   if (bend<segp->b.end) bend=segp->b.end;
238 <   }
239 < //for building actual chains:
240 < bool extendChain(CSegPair* segp) { //segp expected to be "Greater Than" current chain
241 <   int bgap=0;
242 <   int agap=0;
243 <   //if (endSort) {
244 <   if (bstart>segp->b.start) {
245 <      bgap = (int)(bstart-segp->b.end);
246 <      if (abs(bgap)>2) return false;
247 <      agap = (int)(astart-segp->a.end);
248 <      if (abs(agap)>2) return false;
249 <      }
250 <     else {
251 <      bgap = (int) (segp->b.start-bend);
252 <      if (abs(bgap)>2) return false;
253 <      agap = (int)(segp->a.start-aend);
254 <      if (abs(agap)>2) return false;
255 <      }
256 <   if (agap*bgap<0) return false;
257 <   addSegPair(segp);
258 <   score-=abs(agap)+abs(bgap);
259 <   return true;
260 <   }
261 < };
111 > GPVec<CAdapters> adapters;
112  
113   // element in dhash:
114   class FqDupRec {
# Line 308 | Line 158
158  
159   GHash<FqDupRec> dhash; //hash to keep track of duplicates
160  
161 + int loadAdapters(const char* fname);
162 +
163   void setupFiles(FILE*& f_in, FILE*& f_in2, FILE*& f_out, FILE*& f_out2,
164                         GStr& s, GStr& infname, GStr& infname2);
165   // uses outsuffix to generate output file names and open file handles as needed
# Line 324 | Line 176
176   bool ntrim(GStr& rseq, int &l5, int &l3); //returns true if any trimming occured
177   bool qtrim(GStr& qvs, int &l5, int &l3); //return true if any trimming occured
178   int dust(GStr& seq);
179 < bool trim_adapter3(GStr& seq, int &l5, int &l3); //returns true if any trimming occured
179 > bool trim_poly5(GStr &seq, int &l5, int &l3, const char* poly_seed); //returns true if any trimming occured
180 > bool trim_poly3(GStr &seq, int &l5, int &l3, const char* poly_seed);
181   bool trim_adapter5(GStr& seq, int &l5, int &l3); //returns true if any trimming occured
182 + bool trim_adapter3(GStr& seq, int &l5, int &l3);
183  
184   void convertPhred(char* q, int len);
185   void convertPhred(GStr& q);
186  
187   int main(int argc, char * const argv[]) {
188 <  GArgs args(argc, argv, "YQDCVl:d:3:5:m:n:r:p:q:t:o:z:a:");
188 >  GArgs args(argc, argv, "YQDCVl:d:3:5:m:n:r:p:q:f:t:o:z:a:");
189    int e;
190    if ((e=args.isError())>0) {
191        GMessage("%s\nInvalid argument: %s\n", USAGE, argv[e]);
# Line 382 | Line 236
236         else
237           GMessage("%s\nInvalid value for -p option (can only be 64 or 33)!\n",USAGE);
238       }
239 <  if (args.getOpt('3')!=NULL) {
240 <    adapter3=args.getOpt('3');
241 <    adapter3.upper();
242 <    a3len=adapter3.length();
243 <    }
244 <  if (args.getOpt('5')!=NULL) {
245 <    adapter5=args.getOpt('5');
246 <    adapter5.upper();
247 <    a5len=adapter5.length();
239 >  s=args.getOpt('f');
240 >  if (!s.is_empty()) {
241 >   loadAdapters(s.chars());
242 >   }
243 >  bool fileAdapters=adapters.Count();
244 >  s=args.getOpt('5');
245 >  if (!s.is_empty()) {
246 >    if (fileAdapters)
247 >      GError("Error: options -5 and -f cannot be used together!\n");
248 >    s.upper();
249 >    adapters.Add(new CAdapters(s.chars()));
250 >    }
251 >  s=args.getOpt('3');
252 >  if (!s.is_empty()) {
253 >    if (fileAdapters)
254 >      GError("Error: options -3 and -f cannot be used together!\n");
255 >    s.upper();
256 >    if (adapters.Count()>0)
257 >          adapters[0]->a3=s.chars();
258 >     else adapters.Add(NULL, new CAdapters(s.chars()));
259      }
260 <  s=args.getOpt('a');
260 >  s=args.getOpt('y');
261    if (!s.is_empty()) {
262 <     int a_minmatch=s.asInt();
263 <     a_min_score=a_minmatch<<1;
262 >     int minmatch=s.asInt();
263 >     poly_minScore=minmatch*poly_m_score;
264       }
265 <
265 >  
266    if (args.getOpt('o')!=NULL) outsuffix=args.getOpt('o');
267 +                         else outsuffix="-";
268    trashReport=  (args.getOpt('r')!=NULL);
269    int fcount=args.startNonOpt();
270    if (fcount==0) {
# Line 850 | Line 716
716   return ncount;
717   }
718  
853
854 // ------------------ adapter matching - simple k-mer seed & extend, no indels for now
855 //when a k-mer match is found, simply try to extend the alignment using a drop-off scheme
856 //check minimum score and
857 //for 3' adapter trimming:
858 //     require that the right end of the alignment for either the adaptor OR the read must be
859 //     < 3 distance from its right end
860 // for 5' adapter trimming:
861 //     require that the left end of the alignment for either the adaptor OR the read must
862 //     be at coordinate < 3 from start
863
864 bool extendMatch(const char* a, int alen, int ai,
865                 const char* b, int blen, int bi, int mlen, int& l5, int& l3, CSegChain& segs, bool end5=false) {
866 //so the alignment starts at ai in a, bi in b, with a perfect match of length mlen
867 #ifdef DEBUG
868 GStr dbg(b);
869 #endif
870 //if (debug) {
871 //  GMessage(">> in %s\n\textending hit: %s at position %d\n", a, (dbg.substr(bi, mlen)).chars(), ai);
872 //  }
873 int a_l=ai; //alignment coordinates on a
874 int a_r=ai+mlen-1;
875 int b_l=bi; //alignment coordinates on b
876 int b_r=bi+mlen-1;
877 int ai_maxscore=ai;
878 int bi_maxscore=bi;
879 int score=mlen*a_m_score;
880 int maxscore=score;
881 int mism5score=a_mis_score;
882 if (end5 && ai<(alen>>1)) mism5score-=2; // increase penalty for mismatches at 5' end
883 //try to extend to the left first, if possible
884 while (ai>0 && bi>0) {
885   ai--;
886   bi--;
887   score+= (a[ai]==b[bi])? a_m_score : mism5score;
888   if (score>maxscore) {
889       ai_maxscore=ai;
890       bi_maxscore=bi;
891       maxscore=score;
892       }
893     else if (maxscore-score>a_dropoff_score) break;
894   }
895 a_l=ai_maxscore;
896 b_l=bi_maxscore;
897 //if (debug) GMessage("  after l-extend: %*s%s\t\t(score=%d)\n",a_l," ",dbg.substr(b_l,b_r-b_l+1).chars(),maxscore);
898 //now extend to the right
899 ai_maxscore=a_r;
900 bi_maxscore=b_r;
901 ai=a_r;
902 bi=b_r;
903 score=maxscore;
904 //sometimes there are extra AAAAs at the end of the read, ignore those
905 if (strcmp(&a[alen-4],"AAAA")==0) {
906    alen-=3;
907    while (a[alen-1]=='A' && alen>ai) alen--;
908    }
909 while (ai<alen-1 && bi<blen-1) {
910   ai++;
911   bi++;
912   //score+= (a[ai]==b[bi])? a_m_score : a_mis_score;
913   if (a[ai]==b[bi]) { //match
914      score+=a_m_score;
915      if (ai>=alen-2) {
916           score+=a_m_score-(alen-ai-1);
917           }
918      }
919    else { //mismatch
920      score+=a_mis_score;
921      }  
922   if (score>maxscore) {
923       ai_maxscore=ai;
924       bi_maxscore=bi;
925       maxscore=score;
926       }
927     else if (maxscore-score>a_dropoff_score) break;
928   }
929  a_r=ai_maxscore;
930  b_r=bi_maxscore;
931  int a_ovh3=alen-a_r-1;
932  int b_ovh3=blen-b_r-1;
933  int mmovh3=(a_ovh3<b_ovh3)? a_ovh3 : b_ovh3;
934  int mmovh5=(a_l<b_l)? a_l : b_l;
935  //if (debug) GMessage("  after r-extend: %*s%s\t\t(score=%d)\n",a_l," ",dbg.substr(b_l,b_r-b_l+1).chars(),maxscore);
936 #ifdef DEBUG
937  if (debug) GMessage("     extended to: %*s\n",a_r+1,dbg.substr(b_l,b_r-b_l+1).chars());
938 #endif
939  if (maxscore>=a_min_score && mmovh3<2 && mmovh5<2) {
940     if (a_l<a_ovh3) {
941        //adapter closer to the left end (typical for 5' adapter)
942        l5=a_r+1;
943        l3=alen-1;
944        }
945      else {
946        //adapter matching at the right end (typical for 3' adapter)
947        l5=0;
948        l3=a_l-1;
949        }
950     return true;
951     }
952 else { //keep this segment pair for later (gapped alignment)
953   segs.addSegPair(new CSegPair(a_l+1, a_r+1, b_l+1, b_r+1, maxscore));
954   //this will also update min & max coordinates in segs (segs.astart, .aend, .bstart, .bend)
955   }
956  //do not trim:
957  l5=0;
958  l3=alen-1;
959  return false;
960 }
961
962 /*
963 int getWordValue(const char* s, int wlen) {
964 int r=0;
965 while (wlen--) { r+=(((int)s[wlen])<<wlen) }
966 return r;
967 }
968 */
719   int get3mer_value(const char* s) {
720   return (s[0]<<16)+(s[1]<<8)+s[2];
721   }
# Line 989 | Line 739
739   return -1;
740   }
741  
742 < int fast4match(int32 qv, const char* str, int slen, int start_index=0) {
743 < if (start_index>=slen || start_index<0) return -1;
744 < for (int i=start_index;i<slen-4;i++) {
745 <   int32* rv=(int32*)(str+i);
746 <   if (*rv==qv) return i;
747 <   }
748 < return -1;
749 < }
742 > struct SLocScore {
743 >  int pos;
744 >  int score;
745 >  SLocScore(int p=0,int s=0) {
746 >    pos=p;
747 >    score=s;
748 >    }
749 >  void set(int p, int s) {
750 >    pos=p;
751 >    score=s;
752 >    }
753 >  void add(int p, int add) {
754 >    pos=p;
755 >    score+=add;
756 >    }
757 > };
758  
759 < int fast4rmatch(int32 qv, const char* str, int slen, int end_index=-1) {
760 < if (end_index>=slen) return -1;
761 < if (end_index<0) end_index=slen-1;
762 < for (int i=end_index-3;i>=0;i--) {
763 <   int32* rv=(int32*)(str+i);
764 <   if (*rv==qv) return i;
759 > bool trim_poly3(GStr &seq, int &l5, int &l3, const char* poly_seed) {
760 > int rlen=seq.length();
761 > l5=0;
762 > l3=rlen-1;
763 > int32 seedVal=*(int32*)poly_seed;
764 > char polyChar=poly_seed[0];
765 > //assumes N trimming was already done
766 > //so a poly match should be very close to the end of the read
767 > // -- find the initial match (seed)
768 > int lmin=GMAX((rlen-12), 0);
769 > int li;
770 > for (li=rlen-4;li>lmin;li--) {
771 >   if (seedVal==*(int*)&(seq[li])) {
772 >      break;
773 >      }
774     }
775 < return -1;
776 < }
775 > if (li<=lmin) return false;
776 > //seed found, try to extend it both ways
777 > //extend right
778 > int ri=li+3;
779 > SLocScore loc(ri, poly_m_score<<2);
780 > SLocScore maxloc(loc);
781 > //extend right
782 > while (ri<rlen-2) {
783 >   ri++;
784 >   if (seq[ri]==polyChar) {
785 >                loc.add(ri,poly_m_score);
786 >                }
787 >   else if (seq[ri]=='N') {
788 >                loc.add(ri,0);
789 >                }
790 >   else { //mismatch
791 >        loc.add(ri,poly_mis_score);
792 >        if (maxloc.score-loc.score>poly_dropoff_score) break;
793 >        }
794 >   if (maxloc.score<=loc.score) {
795 >      maxloc=loc;
796 >      }
797 >   }
798 > ri=maxloc.pos;
799 > if (ri<rlen-3) return false; //no trimming wanted, too far from 3' end
800 > //ri = right boundary for the poly match
801 > //extend left
802 > loc.set(li, maxloc.score);
803 > maxloc.pos=li;
804 > while (li>0) {
805 >    li--;
806 >    if (seq[li]==polyChar) {
807 >                 loc.add(li,poly_m_score);
808 >                 }
809 >    else if (seq[li]=='N') {
810 >                 loc.add(li,0);
811 >                 }
812 >    else { //mismatch
813 >         loc.add(li,poly_mis_score);
814 >         if (maxloc.score-loc.score>poly_dropoff_score) break;
815 >         }
816 >    if (maxloc.score<=loc.score) {
817 >       maxloc=loc;
818 >       }
819 >    }
820 > if (maxloc.score>poly_minScore && ri>=rlen-3) {
821 >    l5=li;
822 >    l3=ri;
823 >    return true;
824 >    }
825 > return false;
826 > }
827  
828 < #ifdef DEBUG
829 < void dbgPrintChain(CSegChain& chain, const char* aseq) {
830 <  GStr s(aseq);
831 <  for (int i=0;i<chain.Count();i++) {
832 <   CSegPair& seg=*chain[i];
833 <   GMessage("  dbg chain seg%d: %*s [%d-%d:%d-%d]\n",i,seg.a.start-1+seg.len(),
834 <            s.substr(seg.b.start-1, seg.len()).chars(), seg.b.start,seg.b.end,seg.a.start,seg.a.end);
828 >
829 > bool trim_poly5(GStr &seq, int &l5, int &l3, const char* poly_seed) {
830 > int rlen=seq.length();
831 > l5=0;
832 > l3=rlen-1;
833 > int32 seedVal=*(int32*)poly_seed;
834 > char polyChar=poly_seed[0];
835 > //assumes N trimming was already done
836 > //so a poly match should be very close to the end of the read
837 > // -- find the initial match (seed)
838 > int lmax=GMIN(8, rlen-4);//how far from 5' end to look for 4-mer seeds
839 > int li;
840 > for (li=0;li<=lmax;li++) {
841 >   if (seedVal==*(int*)&(seq[li])) {
842 >      break;
843 >      }
844     }
845 + if (li>lmax) return false;
846 + //seed found, try to extend it both ways
847 + //extend left
848 + int ri=li+3; //save rightmost base of the seed
849 + SLocScore loc(li, poly_m_score<<2);
850 + SLocScore maxloc(loc);
851 + while (li>0) {
852 +    li--;
853 +    if (seq[li]==polyChar) {
854 +                 loc.add(li,poly_m_score);
855 +                 }
856 +    else if (seq[li]=='N') {
857 +                 loc.add(li,0);
858 +                 }
859 +    else { //mismatch
860 +         loc.add(li,poly_mis_score);
861 +         if (maxloc.score-loc.score>poly_dropoff_score) break;
862 +         }
863 +    if (maxloc.score<=loc.score) {
864 +       maxloc=loc;
865 +       }
866 +    }
867 + li=maxloc.pos;
868 + if (li>3) return false; //no trimming wanted, too far from 5' end
869 + //li = right boundary for the poly match
870 +
871 + //extend right
872 + loc.set(ri, maxloc.score);
873 + maxloc.pos=ri;
874 + while (ri<rlen-2) {
875 +   ri++;
876 +   if (seq[ri]==polyChar) {
877 +                loc.add(ri,poly_m_score);
878 +                }
879 +   else if (seq[ri]=='N') {
880 +                loc.add(ri,0);
881 +                }
882 +   else { //mismatch
883 +        loc.add(ri,poly_mis_score);
884 +        if (maxloc.score-loc.score>poly_dropoff_score) break;
885 +        }
886 +   if (maxloc.score<=loc.score) {
887 +      maxloc=loc;
888 +      }
889 +   }
890 +
891 + if (maxloc.score>poly_minScore && li<=3) {
892 +    l5=li;
893 +    l3=ri;
894 +    return true;
895 +    }
896 + return false;
897   }
1020 #endif
898  
899   bool trim_adapter3(GStr& seq, int&l5, int &l3) {
900   int rlen=seq.length();
# Line 1179 | Line 1056
1056   l3=rlen-1;
1057   //try to see if adapter is fully included in the read
1058   int fi=-1;
1059 < if ((fi=seq.index(adapter5))>=0) {
1060 <   if (fi<rlen-fi-a5len) {//match is closer to the right end
1061 <      l5=fi+a5len;
1062 <      l3=rlen-1;
1063 <      }
1064 <    else {
1065 <      l5=0;
1066 <      l3=fi-1;
1190 <      }
1191 <   return true;
1192 <   }
1193 < #ifdef DEBUG
1194 < if (debug) GMessage(">TRIM5 >>   Read: %s\n",seq.chars());
1195 < #endif
1196 <
1197 < CSegChain a5segs(true); //list of segment pairs to analyze later if no extendMatch succeeded
1198 <
1199 < //try the easy way out first - look for an exact match of 11 bases
1200 < int fdlen=11;
1201 <  if (a5len<16) {
1202 <   fdlen=a5len>>1;
1203 <   }
1204 < if (fdlen>4) {
1205 <     GStr rstart=seq.substr(1,fdlen); //skip the first base as it's sometimes bogus
1206 <     if ((fi=adapter5.index(rstart))>=0) {
1207 < #ifdef DEBUG
1208 <       if (debug) GMessage("  W11match found: %*s\n", 1+fdlen, (adapter3.substr(fi,fdlen)).chars());
1209 < #endif
1210 <       if (extendMatch(seq.chars(), rlen, 1,
1211 <                     adapter5.chars(), a5len, fi,  fdlen, l5,l3, a5segs, true))
1212 <           return true;
1059 > for (int ai=0;ai<adapters.Count();ai++) {
1060 >  if (adapters[ai]->a5.is_empty()) continue;
1061 >  int a5len=adapters[ai]->a5.length();
1062 >  GStr& adapter5=adapters[ai]->a5;
1063 >  if ((fi=seq.index(adapter5))>=0) {
1064 >    if (fi<rlen-fi-a5len) {//match is closer to the right end
1065 >       l5=fi+a5len;
1066 >       l3=rlen-1;
1067         }
1068 <     //another easy case: last 11 characters of the adaptor found as a substring of the read
1069 <     GStr bstr=adapter5.substr(-fdlen);
1070 <     if ((fi=seq.index(bstr))>=0) {
1217 < #ifdef DEBUG
1218 <       if (debug) GMessage("  A11match found: %*s\n", fi+fdlen, bstr.chars());
1219 < #endif
1220 <       if (extendMatch(seq.chars(), rlen, fi,
1221 <                     adapter5.chars(), a5len, a5len-fdlen,  fdlen, l5,l3,a5segs,true))
1222 <          return true;
1068 >     else {
1069 >       l5=0;
1070 >       l3=fi-1;
1071         }
1072 <     } //tried to matching at most 11 bases first
1073 <    
1074 < //-- no easy cases, do the wmer hashing for the last 12 bases of the adaptor
1075 < //-- only extend a wmer if it matches in the 5' (beginning) region of the read
1076 < int wordSize=3;
1229 < int hlen=12;
1230 < if (hlen>a5len-wordSize) hlen=a5len-wordSize;
1231 < int imax=rlen>>1; //first half of the read, right boundary for the wmer match
1232 < if (imax<a5len) { imax=GMIN(a5len, rlen-wordSize); }
1233 < for (int iw=0;iw<=hlen;iw++) {
1234 <   int apstart=a5len-iw-wordSize;
1235 <   fi=0;
1236 <   //int* qv=(int32 *)(adapter5.chars()+apstart);
1237 <   int qv=get3mer_value(adapter5.chars()+apstart);
1238 <   //while ((fi=fast4match(*qv, seq.chars(), rlen, fi))>=0 && fi<=imax) {
1239 <   while ((fi=w3_match(qv, seq.chars(), rlen, fi))>=0 && fi<=imax) {
1240 < #ifdef DEBUG
1241 <     if (debug) GMessage("    Wmatch found: %*s\n", fi+wordSize, (adapter5.substr(apstart,wordSize)).chars());
1242 < #endif
1243 <     if (extendMatch(seq.chars(), rlen, fi, adapter5.chars(),
1244 <                a5len, apstart, wordSize, l5,l3, a5segs, true)) return true;
1245 <     fi++;
1246 <     if (fi>imax) break;
1247 <     }
1248 <   } //for each wmer in the last hlen bases of the adaptor
1249 < /*
1072 >    return true;
1073 >    }
1074 > #ifdef DEBUG
1075 >  if (debug) GMessage(">TRIM5 >>   Read: %s\n",seq.chars());
1076 > #endif
1077  
1078 < //couldn't find a good trimming extension, hash 12 more bases of the adapter to collect more segment pairs there
1079 < //but only do this if we already have segment pairs collected in the last 12 bases of the adapter
1080 < if (a5segs.bend<(uint)(a5len-3) || a5segs.bstart>(uint)(a5len-hlen+4)) return false;
1081 < int hlen2=a5len-wordSize;
1082 < //if (hlen2>a5len-wordSize) hlen2=a5len-wordSize;
1083 < #ifdef DEBUG
1084 <      if (debug && a5segs.Count()>0) {
1085 <        GMessage("  >>>>>2nd. hash: %s\n",seq.chars());
1086 <        }
1087 < #endif
1088 < if (hlen2>hlen) {
1089 <     for (int iw=hlen+1;iw<=hlen2;iw++) {
1090 <         int apstart=a5len-iw-wordSize;
1264 <         fi=0;
1265 <         //int* qv=(int32 *)(adapter5.chars()+apstart);
1266 <         int qv=get3mer_value(adapter5.chars()+apstart);
1267 <         //while ((fi=fast4match(*qv, seq.chars(), rlen, fi))>=0 && fi<=imax) {
1268 <         while ((fi=w3_match(qv, seq.chars(), rlen, fi))>=0 && fi<=imax) {
1269 <           extendMatch(seq.chars(), rlen, fi, adapter5.chars(),
1270 <                      a5len, apstart, wordSize, l5,l3, a5segs, true);
1271 <           fi++;
1272 <           if (fi>imax) break;
1273 <           }
1274 <         } //for each wmer between hlen2 and hlen bases of the adaptor
1275 <     }
1276 < if (a5segs.bend<(uint)(a5len-3) || a5segs.bstart>(uint)(a5len-hlen+4)) return false;
1277 < // lastly, analyze collected a5segs for a possible gapped alignment:
1278 < GList<CSegChain> segchains(false,true,false);
1279 < #ifdef DEBUG
1280 < if (debug && a5segs.Count()>0) {
1281 <   GMessage(">>>>>>>>>   Read: %s\n",seq.chars());
1282 <   }
1283 < #endif
1284 < for (int i=0;i<a5segs.Count();i++) {
1285 <   if (a5segs[i]->chain==NULL) {
1286 <       if (a5segs[i]->b.end<(int)(a5len-4)) continue; //don't start a hopeless chain
1287 <       CSegChain* newchain=new CSegChain(true);
1288 <       newchain->setFreeItem(false);
1289 <       newchain->addSegPair(a5segs[i]);
1290 <       a5segs[i]->chain=newchain;
1291 <       segchains.Add(newchain); //just to free them when done
1292 <       }
1293 <   for (int j=i+1;j<a5segs.Count();j++) {
1294 <      CSegChain* chain=a5segs[i]->chain;
1295 <      if (chain->extendChain(a5segs[j])) {
1296 <         a5segs[j]->chain=chain;
1297 < #ifdef DEBUG
1298 <         if (debug) dbgPrintChain(*chain, adapter5.chars());
1299 < #endif      
1300 <      //save time by checking here if the extended chain is already acceptable for trimming
1301 <         if (chain->bend>(uint)(a5len-3) && chain->astart<4 && chain->score>a_min_chain_score) {
1302 <            l5=chain->aend;
1303 <            l3=rlen-1;
1078 >  //try the easy way out first - look for an exact match of 11 bases
1079 >  int fdlen=11;
1080 >   if (a5len<16) {
1081 >    fdlen=a5len>>1;
1082 >    }
1083 >  if (fdlen>4) {
1084 >      GStr rstart=seq.substr(1,fdlen); //skip the first base as it's sometimes bogus
1085 >      if ((fi=adapter5.index(rstart))>=0) {
1086 > #ifdef DEBUG
1087 >        if (debug) GMessage("  W11match found: %*s\n", 1+fdlen, (adapter3.substr(fi,fdlen)).chars());
1088 > #endif
1089 >        if (extendMatch(seq.chars(), rlen, 1,
1090 >                      adapter5.chars(), a5len, fi,  fdlen, l5,l3, a5segs, true))
1091              return true;
1092 <            }
1093 <         } //chain can be extended
1092 >        }
1093 >      //another easy case: last 11 characters of the adaptor found as a substring of the read
1094 >      GStr bstr=adapter5.substr(-fdlen);
1095 >      if ((fi=seq.index(bstr))>=0) {
1096 > #ifdef DEBUG
1097 >        if (debug) GMessage("  A11match found: %*s\n", fi+fdlen, bstr.chars());
1098 > #endif
1099 >        if (extendMatch(seq.chars(), rlen, fi,
1100 >                      adapter5.chars(), a5len, a5len-fdlen,  fdlen, l5,l3,a5segs,true))
1101 >           return true;
1102 >        }
1103 >      } //tried to matching at most 11 bases first
1104 >
1105 >  //-- no easy cases, do the wmer hashing for the last 12 bases of the adaptor
1106 >  //-- only extend a wmer if it matches in the 5' (beginning) region of the read
1107 >  int wordSize=3;
1108 >  int hlen=12;
1109 >  if (hlen>a5len-wordSize) hlen=a5len-wordSize;
1110 >  int imax=rlen>>1; //first half of the read, right boundary for the wmer match
1111 >  if (imax<a5len) { imax=GMIN(a5len, rlen-wordSize); }
1112 >  for (int iw=0;iw<=hlen;iw++) {
1113 >    int apstart=a5len-iw-wordSize;
1114 >    fi=0;
1115 >    //int* qv=(int32 *)(adapter5.chars()+apstart);
1116 >    int qv=get3mer_value(adapter5.chars()+apstart);
1117 >    //while ((fi=fast4match(*qv, seq.chars(), rlen, fi))>=0 && fi<=imax) {
1118 >    while ((fi=w3_match(qv, seq.chars(), rlen, fi))>=0 && fi<=imax) {
1119 > #ifdef DEBUG
1120 >      if (debug) GMessage("    Wmatch found: %*s\n", fi+wordSize, (adapter5.substr(apstart,wordSize)).chars());
1121 > #endif
1122 >      if (extendMatch(seq.chars(), rlen, fi, adapter5.chars(),
1123 >                 a5len, apstart, wordSize, l5,l3, a5segs, true)) return true;
1124 >      fi++;
1125 >      if (fi>imax) break;
1126        }
1127 <   } //collect segment alignments into chains
1128 < */
1129 < return false; //no adapter parts found
1130 < }
1127 >    } //for each wmer in the last hlen bases of the adaptor
1128 > //if we're here we couldn't find a good extension
1129 >  return false; //no adapter parts found
1130 > }//for each 5' adapter
1131 > }
1132  
1133   //convert qvs to/from phred64 from/to phread33
1134   void convertPhred(GStr& q) {
# Line 1372 | Line 1192
1192        }
1193      }// fastq
1194   // } //<-- FASTA or FASTQ
1195 < rseq.upper(); //TODO: what if we care about masking?
1195 > rseq.upper();
1196   return true;
1197   }
1198  
# Line 1600 | Line 1420
1420      }
1421   }
1422  
1423 +
1424 + int loadAdapters(const char* fname) {
1425 +  GLineReader lr(fname);
1426 +  char* l;
1427 +  while ((l=lr.nextLine())!=NULL) {
1428 +   if (lr.length()<=3 || l[0]=='#') continue;
1429 +   if ( l[0]==' ' || l[0]=='\t' || l[0]==',' ||
1430 +        l[0]==';'|| l[0]==':' ) {
1431 +      int i=1;
1432 +      while (l[i]!=0 && isspace(l[i])) {
1433 +        i++;
1434 +        }
1435 +      if (l[i]!=0) {
1436 +        adapters.Add(new CAdapters(NULL, &(l[i])));
1437 +        continue;
1438 +        }
1439 +      }
1440 +    else {
1441 +      GStr s(l);
1442 +      s.startTokenize("\t ;,:");
1443 +      GStr a5,a3;
1444 +      if (s.nextToken(a5))
1445 +         s.nextToken(a3);
1446 +      adapters.Add(new CAdapters(a5.is_empty()?NULL:a5.chars(),
1447 +                                a3.is_empty()?NULL:a3.chars()));
1448 +      }
1449 +   }
1450 +   return adapters.Count();
1451 + }
1452 +
1453   void setupFiles(FILE*& f_in, FILE*& f_in2, FILE*& f_out, FILE*& f_out2,
1454                         GStr& s, GStr& infname, GStr& infname2) {
1455   // uses outsuffix to generate output file names and open file handles as needed
# Line 1611 | Line 1461
1461   f_out2=NULL;
1462   //analyze outsuffix intent
1463   GStr pocmd;
1464 < GStr ox=getFext(outsuffix);
1465 < if (ox.length()>2) ox=ox.substr(0,2);
1466 < if (ox=="gz") pocmd="gzip -9 -c ";
1467 <   else if (ox=="bz") pocmd="bzip2 -9 -c ";
1464 > if (outsuffix=="-") {
1465 >    f_out=stdout;
1466 >    }
1467 >   else {
1468 >    GStr ox=getFext(outsuffix);
1469 >    if (ox.length()>2) ox=ox.substr(0,2);
1470 >    if (ox=="gz") pocmd="gzip -9 -c ";
1471 >        else if (ox=="bz") pocmd="bzip2 -9 -c ";
1472 >    }
1473   if (s=="-") {
1474      f_in=stdin;
1475      infname="stdin";
# Line 1638 | Line 1493
1493     f_in=popen(picmd.chars(), "r");
1494     if (f_in==NULL) GError("Error at popen %s!\n", picmd.chars());
1495     }
1496 + if (f_out==stdout) {
1497 +   if (paired) GError("Error: output suffix required for paired reads\n");
1498 +   return;
1499 +   }
1500   f_out=prepOutFile(infname, pocmd);
1501   if (!paired) return;
1502   if (doCollapse) GError("Error: sorry, -C option cannot be used with paired reads!\n");

Diff Legend

Removed lines
+ Added lines
< Changed lines
> Changed lines