ViewVC Help
View File | Revision Log | Show Annotations | View Changeset | Root Listing
root/gclib/fqtrim/fqtrim.cpp
(Generate patch)
# Line 3 | Line 3
3   #include "GHash.hh"
4   #include "GList.hh"
5   #include <ctype.h>
6 + #include "GAlnExtend.h"
7  
8   #define USAGE "Usage:\n\
9   fqtrim [{-5 <5adapter> -3 <3adapter>|-f <adapters_file>}] [-a <min_matchlen>]\\\n\
10 <   [-q <minq> [-t <trim_max_len>]] [-p {64|33}] [-o <outsuffix>]\\\n\
10 >   [-R] [-q <minq> [-t <trim_max_len>]] [-p {64|33}] [-o <outsuffix>]\\\n\
11     [-l <minlen>] [-C] [-D] [-Q] [-n <rename_prefix>] [-r <discarded.lst>]\\\n\
12      <input.fq>[,<input_mates.fq>\n\
13   \n\
# Line 30 | Line 31
31      (e.g. -5 CGACAGGTTCAGAGTTCTACAGTCCGACGATC)\n\
32   -3  trim the given adapter sequence at the 3' end of each read\n\
33      (e.g. -3 TCGTATGCCGTCTTCTGCTTG)\n\
34 < -a  minimum length of exact match to adaptor sequence at the proper end (6)\n\
34 > -A  disable polyA/T trimming (enabled by default)\n\
35 > -y  minimum length of exact match to adaptor sequence at the proper end (6)\n\
36   -q  trim bases with quality value lower than <minq> (starting at the 3' end)\n\
37   -t  for -q option, maximum trimming at the 3' end is limited to <trim_max_len>\n\
38   -m  maximum percentage of Ns allowed in a read after trimming (default 7)\n\
# Line 46 | Line 48
48   -Q  convert quality values to the other Phred qv type\n\
49   -V  verbose processing\n\
50   "
51 <
51 >
52   //-z  for -o option, the output stream(s) will be first piped into the given\n
53   //   <zcmd> command, which must output to stdout (e.g. -z 'bzip2 -9 -c')\n
54  
55 <
55 >
56   // example 3' adapter for miRNAs: TCGTATGCCGTCTTCTGCTTG
57  
58   //For paired reads sequencing:
# Line 60 | Line 62
62   //FILE* f_out2=NULL; //for paired reads
63   //FILE* f_in=NULL; //input fastq (stdin if not provided)
64   //FILE* f_in2=NULL; //for paired reads
65 +
66   FILE* freport=NULL;
67  
68   bool debug=false;
69   bool verbose=false;
70   bool doCollapse=false;
71   bool doDust=false;
72 + bool doPolyTrim=true;
73   bool fastaOutput=false;
74   bool trashReport=false;
75 < //bool rawFormat=false;
75 > bool revCompl=false; //also reverse complement adaptor sequences
76   int min_read_len=16;
77   double max_perc_N=7.0;
78   int dust_cutoff=16;
79   bool isfasta=false;
80   bool convert_phred=false;
81 < GStr outsuffix; // -o
78 < //GStr adapter3;
79 < //GStr adapter5;
81 > GStr outsuffix; // -o
82   GStr prefix;
83   GStr zcmd;
84   int num_trimmed5=0;
# Line 91 | Line 93
93   int qv_phredtype=0; // could be 64 or 33 (0 means undetermined yet)
94   int qv_cvtadd=0; //could be -31 or +31
95  
96 < int a3len=0;
97 < int a5len=0;
98 < // adaptor matching metrics -- for extendMatch() function
99 < const int match_score=2; //match score
100 < const int mismatch_score=-3; //mismatch
101 <
102 < const int a_m_score=2; //match score
101 < const int a_mis_score=-3; //mismatch
102 < const int a_dropoff_score=7;
96 > // adaptor matching metrics -- for X-drop ungapped extension
97 > const int match_reward=2;
98 > const int mismatch_penalty=3;
99 > const int Xdrop=8;
100 >
101 > const int poly_m_score=2; //match score
102 > const int poly_mis_score=-3; //mismatch
103   const int poly_dropoff_score=7;
104 < int a_min_score=12; //an exact match of 6 bases at the proper ends WILL be trimmed
105 < const int a_min_chain_score=15; //for gapped alignments
104 > int poly_minScore=12; //i.e. an exact match of 6 bases at the proper ends WILL be trimmed
105  
106 < class CSegChain;
106 > const char *polyA_seed="AAAA";
107 > const char *polyT_seed="TTTT";
108  
109 < class CSegPair {
110 <  public:
111 <   GSeg a;
112 <   GSeg b; //the adapter segment
113 <   int score;
114 <   int flags;
115 <   CSegChain* chain;
116 <   CSegPair(int astart=0, int aend=0, int bstart=0, int bend=0, int mscore=0):a(astart,aend),b(bstart, bend) {
117 <      score=mscore;
118 <      if (score==0) score=a.len()*a_m_score;
119 <      flags=0;
120 <      chain=NULL;
121 <      }
122 <   int len() { return  a.len(); }
123 <   bool operator==(CSegPair& d){
124 <      //return (a.start==d.a.start && a.end==d.a.end && b.start==d.b.start && b.end==d.b.end);
125 <      //make equal even segments that are included into one another:
126 <      return (d.a.start>=a.start && d.a.end<=a.end && d.b.start>=b.start && d.b.end<=b.end);
127 <      }
128 <   bool operator>(CSegPair& d){ //ordering based on b (adaptor) start coord and score
129 <     if (b.start==d.b.start) {
130 <        if (score==d.score) {
131 <           //just try to be consistent:
132 <           if (b.end==d.b.end) {
133 <             return (a.start==d.a.start)?(a.end<d.a.end):(a.start<d.a.start);
134 <             }
135 <           return (b.end>d.b.end);
136 <           }
137 <         else return (score<d.score);
138 <        }
139 <     return (b.start>d.b.start);
140 <     }
141 <   bool operator<(CSegPair& d){ //ordering based on b (adaptor) coord
142 <     /*if (b.start==d.b.start && b.end==d.b.end) {
143 <          return (a.start==d.a.start)?(a.end<d.a.end):(a.start<d.a.start);
144 <          }
145 <     return (b.start==d.b.start)?(b.end<d.b.end):(b.start<d.b.start);*/
146 <     if (b.start==d.b.start) {
147 <        if (score==d.score) {
148 <           //just try to be consistent:
149 <           if (b.end==d.b.end) {
150 <             return (a.start==d.a.start)?(a.end>d.a.end):(a.start>d.a.start);
151 <             }
152 <           return (b.end<d.b.end);
153 <           }
154 <         else return (score>d.score);
109 > struct CASeqData {
110 >   //positional data for every possible hexamer in an adapter
111 >   GVec<uint16>* pz[4096]; //0-based coordinates of all possible hexamers in the adapter sequence
112 >   GVec<uint16>* pzr[4096]; //0-based coordinates of all possible hexamers for the reverse complement of the adapter sequence
113 >   GStr seq; //actual adapter sequence data
114 >   GStr seqr; //reverse complement sequence
115 >   int amlen; //fraction of adapter length matching that's
116 >              //enough to consider the alignment
117 >   bool use_reverse;
118 >   CASeqData(bool rev=false):seq(),seqr(),
119 >             amlen(0), use_reverse(rev) {
120 >     for (int i=0;i<4096;i++) {
121 >        pz[i]=NULL;
122 >        pzr[i]=NULL;
123          }
156     return (b.start<d.b.start);
124       }
158 };
125  
126 < int cmpSegEnds(pointer sa, pointer sb) { //sort by adaptor seg ends AND score
127 < CSegPair& x = *(CSegPair *)sa;
128 < CSegPair& y = *(CSegPair *)sb;
129 < /*
130 < if (x.b.end==y.b.end) {
131 <     if (x.b.start==y.b.start) {
132 <         if (x.a.end==y.a.end) {
133 <            if (x.a.start==y.a.start) return 0;
134 <            return ((x.a.start>y.a.start) ? -1 : 1);
135 <            }
136 <          else {
137 <            return ((x.a.end>y.a.end) ? -1 : 1);
138 <            }
139 <          }
140 <      else {
141 <       return ((x.b.start>y.b.start) ? -1 : 1);
142 <       }
143 <     }
144 <    else {
179 <     return ((x.b.end>y.b.end) ? -1 : 1);
180 <     }
181 < */
182 <  if (x.b.end==y.b.end) {
183 <     if (x.score==y.score) {
184 <     if (x.b.start==y.b.start) {
185 <         if (x.a.end==y.a.end) {
186 <            if (x.a.start==y.a.start) return 0;
187 <            return ((x.a.start<y.a.start) ? -1 : 1);
188 <            }
189 <          else {
190 <            return ((x.a.end<y.a.end) ? -1 : 1);
191 <            }
192 <          }
193 <      else {
194 <       return ((x.b.start<y.b.start) ? -1 : 1);
195 <       }
196 <      } else return ((x.score>y.score) ? -1 : 1);
197 <     }
198 <    else {
199 <     return ((x.b.end>y.b.end) ? -1 : 1);
126 >   void update(const char* s) {
127 >         seq=s;
128 >         table6mers(seq.chars(), seq.length(), pz);
129 >         amlen=iround(double(seq.length())*0.8);
130 >         if (amlen<12)
131 >                amlen=12;
132 >         if (!use_reverse) return;
133 >         //reverse complement
134 >         seqr=s;
135 >         int slen=seq.length();
136 >         for (int i=0;i<slen;i++)
137 >           seqr[i]=ntComplement(seq[slen-i-1]);
138 >         table6mers(seqr.chars(), seqr.length(), pzr);
139 >   }
140 >
141 >   ~CASeqData() {
142 >     for (int i=0;i<4096;i++) {
143 >       delete pz[i];
144 >       delete pzr[i];
145       }
146 +   }
147 + };
148  
149 < }
149 > GVec<CASeqData> adapters5;
150 > GVec<CASeqData> adapters3;
151  
152 < class CSegChain:public GList<CSegPair> {
153 < public:
206 <   uint astart;
207 <   uint aend;
208 <   uint bstart;
209 <   uint bend;
210 <   int score;
211 <   bool endSort;
212 <  CSegChain(bool aln5=false):GList<CSegPair>(true,true,true) {//sorted, free elements, unique
213 <   //as SegPairs are inserted, they will be sorted by a.start coordinate
214 <   score=0;
215 <   astart=MAX_UINT;
216 <   aend=0;
217 <   bstart=MAX_UINT;
218 <   bend=0;
219 <   endSort=aln5;
220 <   if (aln5) { setSorted(cmpSegEnds); }
221 <   }
222 < bool operator==(CSegChain& d) {
223 <   //return (score==d.score);
224 <    return (astart==d.astart && aend==d.aend && bstart==d.bstart && bend==d.bend);
225 <   }
226 < bool operator>(CSegChain& d) { // order based on b (adaptor) coordinate
227 <   //return (score<d.score);
228 <   if (bstart==d.bstart && bend==d.bend) {
229 <          return (astart==d.astart)?(aend>d.aend):(astart>d.astart);
230 <          }
231 <     return (bstart==d.bstart)?(bend>d.bend):(bstart>d.bstart);
232 <   }
233 < bool operator<(CSegChain& d) {
234 <   //return (score>d.score);
235 <   if (bstart==d.bstart && bend==d.bend) {
236 <          return (astart==d.astart)?(aend<d.aend):(astart<d.astart);
237 <          }
238 <     return (bstart==d.bstart)?(bend<d.bend):(bstart<d.bstart);
239 <   }
240 < void addSegPair(CSegPair* segp) {
241 <   if (AddIfNew(segp)!=segp) return;
242 <   score+=segp->score;
243 <   if (astart>segp->a.start) astart=segp->a.start;
244 <   if (aend<segp->a.end) aend=segp->a.end;
245 <   if (bstart>segp->b.start) bstart=segp->b.start;
246 <   if (bend<segp->b.end) bend=segp->b.end;
247 <   }
248 < //for building actual chains:
249 < bool extendChain(CSegPair* segp) { //segp expected to be "Greater Than" current chain
250 <   int bgap=0;
251 <   int agap=0;
252 <   //if (endSort) {
253 <   if (bstart>segp->b.start) {
254 <      bgap = (int)(bstart-segp->b.end);
255 <      if (abs(bgap)>2) return false;
256 <      agap = (int)(astart-segp->a.end);
257 <      if (abs(agap)>2) return false;
258 <      }
259 <     else {
260 <      bgap = (int) (segp->b.start-bend);
261 <      if (abs(bgap)>2) return false;
262 <      agap = (int)(segp->a.start-aend);
263 <      if (abs(agap)>2) return false;
264 <      }
265 <   if (agap*bgap<0) return false;
266 <   addSegPair(segp);
267 <   score-=abs(agap)+abs(bgap);
268 <   return true;
269 <   }
270 < };
152 > CGreedyAlignData* gxmem_l=NULL;
153 > CGreedyAlignData* gxmem_r=NULL;
154  
155   // element in dhash:
156   class FqDupRec {
# Line 317 | Line 200
200  
201   GHash<FqDupRec> dhash; //hash to keep track of duplicates
202  
203 < void setupFiles(FILE*& f_in, FILE*& f_in2, FILE*& f_out, FILE*& f_out2,
204 <                       GStr& s, GStr& infname, GStr& infname2);
203 > void addAdapter(GVec<CASeqData>& adapters, GStr& seq);
204 > int loadAdapters(const char* fname);
205 >
206 > void setupFiles(FILE*& f_in, FILE*& f_in2, FILE*& f_out, FILE*& f_out2,
207 >                       GStr& s, GStr& infname, GStr& infname2);
208   // uses outsuffix to generate output file names and open file handles as needed
209 <
209 >
210   void writeRead(FILE* f_out, GStr& rname, GStr& rinfo, GStr& rseq, GStr& rqv, int& outcounter);
211   void trash_report(char trashcode, GStr& rname, FILE* freport);
212  
213 < bool getFastxRec(GLineReader& fq, GStr& rseq, GStr& rqv,
213 > bool getFastxRec(GLineReader& fq, GStr& rseq, GStr& rqv,
214            GStr& rname, GStr& rinfo, GStr& infname);
215  
216 < char process_read(GStr& rname, GStr& rseq, GStr& rqv, int &l5, int &l3);
216 > char process_read(GStr& rname, GStr& rseq, GStr& rqv, int &l5, int &l3);
217   //returns 0 if the read was untouched, 1 if it was trimmed and a trash code if it was trashed
218  
219   bool ntrim(GStr& rseq, int &l5, int &l3); //returns true if any trimming occured
334 bool polytrim(GStr& rseq, int &l5, int &l3); //returns true if any trimming occured
220   bool qtrim(GStr& qvs, int &l5, int &l3); //return true if any trimming occured
221   int dust(GStr& seq);
222 < bool trim_adapter3(GStr& seq, int &l5, int &l3); //returns true if any trimming occured
222 > bool trim_poly5(GStr &seq, int &l5, int &l3, const char* poly_seed); //returns true if any trimming occured
223 > bool trim_poly3(GStr &seq, int &l5, int &l3, const char* poly_seed);
224   bool trim_adapter5(GStr& seq, int &l5, int &l3); //returns true if any trimming occured
225 + bool trim_adapter3(GStr& seq, int &l5, int &l3);
226  
227   void convertPhred(char* q, int len);
228   void convertPhred(GStr& q);
229  
230   int main(int argc, char * const argv[]) {
231 <  GArgs args(argc, argv, "YQDCVl:d:3:5:m:n:r:p:q:f:t:o:z:a:");
231 >  GArgs args(argc, argv, "YQDCRVAl:d:3:5:m:n:r:p:q:f:t:o:z:a:");
232    int e;
233    if ((e=args.isError())>0) {
234        GMessage("%s\nInvalid argument: %s\n", USAGE, argv[e]);
# Line 352 | Line 239
239    convert_phred=(args.getOpt('Q')!=NULL);
240    doCollapse=(args.getOpt('C')!=NULL);
241    doDust=(args.getOpt('D')!=NULL);
242 +  revCompl=(args.getOpt('R')!=NULL);
243 +  if (args.getOpt('A')) doPolyTrim=false;
244    /*
245    rawFormat=(args.getOpt('R')!=NULL);
246    if (rawFormat) {
# Line 360 | Line 249
249    */
250    prefix=args.getOpt('n');
251    GStr s=args.getOpt('l');
252 <  if (!s.is_empty())
252 >  if (!s.is_empty())
253       min_read_len=s.asInt();
254    s=args.getOpt('m');
255 <  if (!s.is_empty())
255 >  if (!s.is_empty())
256       max_perc_N=s.asDouble();
257    s=args.getOpt('d');
258    if (!s.is_empty()) {
# Line 389 | Line 278
278          qv_phredtype=64;
279          qv_cvtadd=-31;
280          }
281 <       else
281 >       else
282           GMessage("%s\nInvalid value for -p option (can only be 64 or 33)!\n",USAGE);
283       }
284 <  if (args.getOpt('3')!=NULL) {
285 <    adapter3=args.getOpt('3');
286 <    adapter3.upper();
287 <    a3len=adapter3.length();
288 <    }
289 <  if (args.getOpt('5')!=NULL) {
290 <    adapter5=args.getOpt('5');
291 <    adapter5.upper();
292 <    a5len=adapter5.length();
284 >  s=args.getOpt('f');
285 >  if (!s.is_empty()) {
286 >   loadAdapters(s.chars());
287 >   }
288 >  bool fileAdapters=adapters5.Count()+adapters3.Count();
289 >  s=args.getOpt('5');
290 >  if (!s.is_empty()) {
291 >    if (fileAdapters)
292 >      GError("Error: options -5 and -f cannot be used together!\n");
293 >    s.upper();
294 >    addAdapter(adapters5, s);
295 >    }
296 >  s=args.getOpt('3');
297 >  if (!s.is_empty()) {
298 >    if (fileAdapters)
299 >      GError("Error: options -3 and -f cannot be used together!\n");
300 >      s.upper();
301 >      addAdapter(adapters3, s);
302      }
303 <  s=args.getOpt('a');
303 >  s=args.getOpt('y');
304    if (!s.is_empty()) {
305 <     int a_minmatch=s.asInt();
306 <     a_min_score=a_minmatch<<1;
305 >     int minmatch=s.asInt();
306 >     poly_minScore=minmatch*poly_m_score;
307       }
308 <  
308 >
309    if (args.getOpt('o')!=NULL) outsuffix=args.getOpt('o');
310                           else outsuffix="-";
311    trashReport=  (args.getOpt('r')!=NULL);
# Line 424 | Line 322
322    if (trashReport)
323      openfw(freport, args, 'r');
324    char* infile=NULL;
325 +
326 +  if (adapters5.Count()>0)
327 +    gxmem_l=new CGreedyAlignData(match_reward, mismatch_penalty, Xdrop-2);
328 +  if (adapters3.Count()>0)
329 +    gxmem_r=new CGreedyAlignData(match_reward, mismatch_penalty, Xdrop);
330 +
331    while ((infile=args.nextNonOpt())!=NULL) {
332 +    //for each input file
333      int incounter=0; //counter for input reads
334      int outcounter=0; //counter for output reads
335      int trash_s=0; //too short from the get go
336      int trash_Q=0;
337      int trash_N=0;
338      int trash_D=0;
339 +    int trash_poly=0;
340      int trash_A3=0;
341      int trash_A5=0;
342      s=infile;
# Line 455 | Line 361
361         int a5=0, a3=0, b5=0, b3=0;
362         char tcode=0, tcode2=0;
363         tcode=process_read(seqid, rseq, rqv, a5, a3);
364 <       //if (!doCollapse) {
459 <         if (fq2!=NULL) {
364 >       if (fq2!=NULL) {
365              getFastxRec(*fq2, rseq2, rqv2, seqid2, seqinfo2, infname2);
366              if (seqid.substr(0,seqid.length()-1)!=seqid2.substr(0,seqid2.length()-1)) {
367                 GError("Error: no paired match for read %s vs %s (%s,%s)\n",
# Line 488 | Line 393
393                 int nocounter=0;
394                 writeRead(f_out2, seqid2, seqinfo2, rseq2, rqv2, nocounter);
395                 }
396 <            } //paired read
492 <       // }
396 >            } //pair read
397         if (tcode>1) { //trashed
398 +         #ifdef GDEBUG
399 +         GMessage(" !!!!TRASH code = %c\n",tcode);
400 +         #endif
401            if (tcode=='s') trash_s++;
402 +          else if (tcode=='A' || tcode=='T') trash_poly++;
403              else if (tcode=='Q') trash_Q++;
404                else if (tcode=='N') trash_N++;
405                 else if (tcode=='D') trash_D++;
# Line 504 | Line 412
412              rseq=rseq.substr(a5,a3-a5+1);
413              if (!rqv.is_empty()) rqv=rqv.substr(a5,a3-a5+1);
414              }
415 +         #ifdef GDEBUG
416 +            GMessage("  After trimming:\n");
417 +            GMessage("%s\n",rseq.chars());
418 +         #endif
419            writeRead(f_out, seqid, seqinfo, rseq, rqv, outcounter);
420            }
421         } //for each fastq record
# Line 531 | Line 443
443                 }
444              }
445           outcounter++;
446 <         if (qd->count>maxdup_count) {
446 >         if (qd->count>maxdup_count) {
447              maxdup_count=qd->count;
448              maxdup_seq=seq;
449              }
450           if (isfasta) {
451             if (prefix.is_empty()) {
452 <             fprintf(f_out, ">%s_x%d\n%s\n", qd->firstname, qd->count,
452 >             fprintf(f_out, ">%s_x%d\n%s\n", qd->firstname, qd->count,
453                             rseq.chars());
454               }
455             else { //use custom read name
# Line 548 | Line 460
460           else { //fastq format
461            if (convert_phred) convertPhred(qd->qv, qd->len);
462            if (prefix.is_empty()) {
463 <            fprintf(f_out, "@%s_x%d\n%s\n+\n%s\n", qd->firstname, qd->count,
463 >            fprintf(f_out, "@%s_x%d\n%s\n+\n%s\n", qd->firstname, qd->count,
464                             rseq.chars(), qd->qv);
465              }
466            else { //use custom read name
# Line 584 | Line 496
496           GMessage("         Trashed by N%%:%9d\n", trash_N);
497         if (trash_Q>0)
498           GMessage("Trashed by low quality:%9d\n", trash_Q);
499 +       if (trash_poly>0)
500 +         GMessage("   Trashed by poly-A/T:%9d\n", trash_poly);
501         if (trash_A5>0)
502           GMessage(" Trashed by 5' adapter:%9d\n", trash_A5);
503         if (trash_A3>0)
# Line 595 | Line 509
509      FWCLOSE(f_out);
510      FWCLOSE(f_out2);
511     } //while each input file
512 <
512 > delete gxmem_l;
513 > delete gxmem_r;
514   //getc(stdin);
515   }
516  
# Line 610 | Line 525
525     const char* seq;
526     bool valid;
527     NData() {
528 +    seqlen=0;
529      NCount=0;
530      end5=0;
531      end3=0;
# Line 640 | Line 556
556       perc_N=(n*100.0)/(end5-end3+1);
557       }
558   };
559 <
559 >
560   static NData feat;
561   int perc_lenN=12; // incremental distance from ends, in percentage of
562            // sequence length, where N-trimming is done (default:12 %) (autolimited to 20)
563 <          
563 >
564   void N_analyze(int l5, int l3, int p5, int p3) {
565   /* assumes feat was filled properly */
566   int old_dif, t5,t3,v;
567   if (l3<l5+2 || p5>p3 ) {
568     feat.end5=l5+1;
569     feat.end3=l3+1;
570 <   return;
570 >   return;
571     }
572  
573   t5=feat.NPos[p5]-l5;
574   t3=l3-feat.NPos[p3];
575   old_dif=p3-p5;
576   v=(int)((((double)(l3-l5))*perc_lenN)/100);
577 < if (v>20) v=20; /* enforce N-search limit for very long reads */
577 > if (v>20) v=20; /* enforce N-search limit for very long reads */
578   if (t5 < v ) {
579     l5=feat.NPos[p5]+1;
580     p5++;
# Line 675 | Line 591
591             feat.end3=l3+1;
592             return;
593             }
594 <    else
594 >    else
595        N_analyze(l5,l3, p5,p3);
596   }
597  
# Line 716 | Line 632
632   feat.init(rseq);
633   l5=feat.end5-1;
634   l3=feat.end3-1;
635 < N_analyze(feat.end5-1, feat.end3-1, 0, feat.NCount-1);
635 > N_analyze(feat.end5-1, feat.end3-1, 0, feat.NCount-1);
636   if (l5==feat.end5-1 && l3==feat.end3-1) {
637      if (feat.perc_N>max_perc_N) {
638             feat.valid=false;
# Line 734 | Line 650
650     return true;
651     }
652   feat.N_calc();
653 <
653 >
654   if (feat.perc_N>max_perc_N) {
655        feat.valid=false;
656        l3=l5+1;
# Line 746 | Line 662
662   //--------------- dust functions ----------------
663   class DNADuster {
664   public:
665 <  int dustword;
666 <  int dustwindow;
667 <  int dustwindow2;
665 >  int dustword;
666 >  int dustwindow;
667 >  int dustwindow2;
668    int dustcutoff;
669    int mv, iv, jv;
670    int counts[32*32*32];
# Line 843 | Line 759
759                      }
760             }
761           }
762 < //return first;
762 > //return first;
763   }
764   };
765  
# Line 861 | Line 777
777   return ncount;
778   }
779  
780 + struct SLocScore {
781 +  int pos;
782 +  int score;
783 +  SLocScore(int p=0,int s=0) {
784 +    pos=p;
785 +    score=s;
786 +    }
787 +  void set(int p, int s) {
788 +    pos=p;
789 +    score=s;
790 +    }
791 +  void add(int p, int add) {
792 +    pos=p;
793 +    score+=add;
794 +    }
795 + };
796  
797 < // ------------------ adapter matching - simple k-mer seed & extend, no indels for now
798 < //when a k-mer match is found, simply try to extend the alignment using a drop-off scheme
799 < //check minimum score and
800 < //for 3' adapter trimming:
801 < //     require that the right end of the alignment for either the adaptor OR the read must be
802 < //     < 3 distance from its right end
803 < // for 5' adapter trimming:
804 < //     require that the left end of the alignment for either the adaptor OR the read must
805 < //     be at coordinate < 3 from start
806 <
807 < bool extendMatch(const char* a, int alen, int ai,
808 <                 const char* b, int blen, int bi, int mlen, int& l5, int& l3, CSegChain& segs, bool end5=false) {
809 < //so the alignment starts at ai in a, bi in b, with a perfect match of length mlen
810 < #ifdef DEBUG
811 < GStr dbg(b);
880 < #endif
881 < //if (debug) {
882 < //  GMessage(">> in %s\n\textending hit: %s at position %d\n", a, (dbg.substr(bi, mlen)).chars(), ai);
883 < //  }
884 < int a_l=ai; //alignment coordinates on a
885 < int a_r=ai+mlen-1;
886 < int b_l=bi; //alignment coordinates on b
887 < int b_r=bi+mlen-1;
888 < int ai_maxscore=ai;
889 < int bi_maxscore=bi;
890 < int score=mlen*a_m_score;
891 < int maxscore=score;
892 < int mism5score=a_mis_score;
893 < if (end5 && ai<(alen>>1)) mism5score-=2; // increase penalty for mismatches at 5' end
894 < //try to extend to the left first, if possible
895 < while (ai>0 && bi>0) {
896 <   ai--;
897 <   bi--;
898 <   score+= (a[ai]==b[bi])? a_m_score : mism5score;
899 <   if (score>maxscore) {
900 <       ai_maxscore=ai;
901 <       bi_maxscore=bi;
902 <       maxscore=score;
903 <       }
904 <     else if (maxscore-score>a_dropoff_score) break;
905 <   }
906 < a_l=ai_maxscore;
907 < b_l=bi_maxscore;
908 < //if (debug) GMessage("  after l-extend: %*s%s\t\t(score=%d)\n",a_l," ",dbg.substr(b_l,b_r-b_l+1).chars(),maxscore);
909 < //now extend to the right
910 < ai_maxscore=a_r;
911 < bi_maxscore=b_r;
912 < ai=a_r;
913 < bi=b_r;
914 < score=maxscore;
915 < //sometimes there are extra AAAAs at the end of the read, ignore those
916 < if (strcmp(&a[alen-4],"AAAA")==0) {
917 <    alen-=3;
918 <    while (a[alen-1]=='A' && alen>ai) alen--;
919 <    }
920 < while (ai<alen-1 && bi<blen-1) {
921 <   ai++;
922 <   bi++;
923 <   //score+= (a[ai]==b[bi])? a_m_score : a_mis_score;
924 <   if (a[ai]==b[bi]) { //match
925 <      score+=a_m_score;
926 <      if (ai>=alen-2) {
927 <           score+=a_m_score-(alen-ai-1);
928 <           }
797 > bool trim_poly3(GStr &seq, int &l5, int &l3, const char* poly_seed) {
798 > if (!doPolyTrim) return false;
799 > int rlen=seq.length();
800 > l5=0;
801 > l3=rlen-1;
802 > int32 seedVal=*(int32*)poly_seed;
803 > char polyChar=poly_seed[0];
804 > //assumes N trimming was already done
805 > //so a poly match should be very close to the end of the read
806 > // -- find the initial match (seed)
807 > int lmin=GMAX((rlen-16), 0);
808 > int li;
809 > for (li=rlen-4;li>lmin;li--) {
810 >   if (seedVal==*(int*)&(seq[li])) {
811 >      break;
812        }
930    else { //mismatch
931      score+=a_mis_score;
932      }  
933   if (score>maxscore) {
934       ai_maxscore=ai;
935       bi_maxscore=bi;
936       maxscore=score;
937       }
938     else if (maxscore-score>a_dropoff_score) break;
813     }
814 <  a_r=ai_maxscore;
815 <  b_r=bi_maxscore;
816 <  int a_ovh3=alen-a_r-1;
817 <  int b_ovh3=blen-b_r-1;
818 <  int mmovh3=(a_ovh3<b_ovh3)? a_ovh3 : b_ovh3;
819 <  int mmovh5=(a_l<b_l)? a_l : b_l;
820 <  //if (debug) GMessage("  after r-extend: %*s%s\t\t(score=%d)\n",a_l," ",dbg.substr(b_l,b_r-b_l+1).chars(),maxscore);
821 < #ifdef DEBUG
822 <  if (debug) GMessage("     extended to: %*s\n",a_r+1,dbg.substr(b_l,b_r-b_l+1).chars());
823 < #endif
824 <  if (maxscore>=a_min_score && mmovh3<2 && mmovh5<2) {
825 <     if (a_l<a_ovh3) {
826 <        //adapter closer to the left end (typical for 5' adapter)
827 <        l5=a_r+1;
828 <        l3=alen-1;
814 > if (li<=lmin) return false;
815 > //seed found, try to extend it both ways
816 > //extend right
817 > int ri=li+3;
818 > SLocScore loc(ri, poly_m_score<<2);
819 > SLocScore maxloc(loc);
820 > //extend right
821 > while (ri<rlen-1) {
822 >   ri++;
823 >   if (seq[ri]==polyChar) {
824 >                loc.add(ri,poly_m_score);
825 >                }
826 >   else if (seq[ri]=='N') {
827 >                loc.add(ri,0);
828 >                }
829 >   else { //mismatch
830 >        loc.add(ri,poly_mis_score);
831 >        if (maxloc.score-loc.score>poly_dropoff_score) break;
832          }
833 <      else {
834 <        //adapter matching at the right end (typical for 3' adapter)
835 <        l5=0;
959 <        l3=a_l-1;
960 <        }
961 <     return true;
962 <     }
963 < else { //keep this segment pair for later (gapped alignment)
964 <   segs.addSegPair(new CSegPair(a_l+1, a_r+1, b_l+1, b_r+1, maxscore));
965 <   //this will also update min & max coordinates in segs (segs.astart, .aend, .bstart, .bend)
966 <   }
967 <  //do not trim:
968 <  l5=0;
969 <  l3=alen-1;
970 <  return false;
971 < }
972 <
973 < /*
974 < int getWordValue(const char* s, int wlen) {
975 < int r=0;
976 < while (wlen--) { r+=(((int)s[wlen])<<wlen) }
977 < return r;
978 < }
979 < */
980 < int get3mer_value(const char* s) {
981 < return (s[0]<<16)+(s[1]<<8)+s[2];
982 < }
983 <
984 < int w3_match(int qv, const char* str, int slen, int start_index=0) {
985 < if (start_index>=slen || start_index<0) return -1;
986 < for (int i=start_index;i<slen-3;i++) {
987 <   int rv=get3mer_value(str+i);
988 <   if (rv==qv) return i;
989 <   }
990 < return -1;
991 < }
992 <
993 < int w3_rmatch(int qv, const char* str, int slen, int end_index=-1) {
994 < if (end_index>=slen) return -1;
995 < if (end_index<0) end_index=slen-1;
996 < for (int i=end_index-2;i>=0;i--) {
997 <   int rv=get3mer_value(str+i);
998 <   if (rv==qv) return i;
999 <   }
1000 < return -1;
1001 < }
1002 <
1003 < int fast4match(int32 qv, const char* str, int slen, int start_index=0) {
1004 < if (start_index>=slen || start_index<0) return -1;
1005 < for (int i=start_index;i<slen-4;i++) {
1006 <   int32* rv=(int32*)(str+i);
1007 <   if (*rv==qv) return i;
1008 <   }
1009 < return -1;
1010 < }
1011 <
1012 < int fast4rmatch(int32 qv, const char* str, int slen, int end_index=-1) {
1013 < if (end_index>=slen) return -1;
1014 < if (end_index<0) end_index=slen-1;
1015 < for (int i=end_index-3;i>=0;i--) {
1016 <   int32* rv=(int32*)(str+i);
1017 <   if (*rv==qv) return i;
1018 <   }
1019 < return -1;
1020 < }
1021 <
1022 < #ifdef DEBUG
1023 < void dbgPrintChain(CSegChain& chain, const char* aseq) {
1024 <  GStr s(aseq);
1025 <  for (int i=0;i<chain.Count();i++) {
1026 <   CSegPair& seg=*chain[i];
1027 <   GMessage("  dbg chain seg%d: %*s [%d-%d:%d-%d]\n",i,seg.a.start-1+seg.len(),
1028 <            s.substr(seg.b.start-1, seg.len()).chars(), seg.b.start,seg.b.end,seg.a.start,seg.a.end);
833 >   if (maxloc.score<=loc.score) {
834 >      maxloc=loc;
835 >      }
836     }
837 + ri=maxloc.pos;
838 + if (ri<rlen-6) return false; //no trimming wanted, too far from 3' end
839 + //ri = right boundary for the poly match
840 + //extend left
841 + loc.set(li, maxloc.score);
842 + maxloc.pos=li;
843 + while (li>0) {
844 +    li--;
845 +    if (seq[li]==polyChar) {
846 +                 loc.add(li,poly_m_score);
847 +                 }
848 +    else if (seq[li]=='N') {
849 +                 loc.add(li,0);
850 +                 }
851 +    else { //mismatch
852 +         loc.add(li,poly_mis_score);
853 +         if (maxloc.score-loc.score>poly_dropoff_score) break;
854 +         }
855 +    if (maxloc.score<=loc.score) {
856 +       maxloc=loc;
857 +       }
858 +    }
859 + li=maxloc.pos;
860 + if ((maxloc.score==poly_minScore && ri==rlen-1) ||
861 +    (maxloc.score>poly_minScore && ri>=rlen-3) ||
862 +    (maxloc.score>(poly_minScore*3) && ri>=rlen-8)) {
863 +  //trimming this li-ri match at 3' end
864 +    l3=li-1;
865 +    if (l3<0) l3=0;
866 +    return true;
867 +    }
868 + return false;
869   }
1031 #endif
1032 const char *pA_seed="AAAA";
1033 const char *pT_seed="TTTT";
1034 const int polyA_seed=*(int*)pA_seed;
1035 const int polyT_seed=*(int*)pT_seed;
870  
871 < bool trim_poly3(GStr &seq, int &l5, int &l3) {
871 > bool trim_poly5(GStr &seq, int &l5, int &l3, const char* poly_seed) {
872 > if (!doPolyTrim) return false;
873   int rlen=seq.length();
874   l5=0;
875   l3=rlen-1;
876 + int32 seedVal=*(int32*)poly_seed;
877 + char polyChar=poly_seed[0];
878   //assumes N trimming was already done
879   //so a poly match should be very close to the end of the read
1043 //TODO: should we attempt polyA and polyT at the same time, same end?
880   // -- find the initial match (seed)
881 < int rlo=GMAX((rlen-10), 0);
882 < int si;
883 < for (si=rlen-5;si>rlo;si--) {
884 <   if (polyA_seed==*(int*)&(seq.chars()+si)) {
881 > int lmax=GMIN(12, rlen-4);//how far from 5' end to look for 4-mer seeds
882 > int li;
883 > for (li=0;li<=lmax;li++) {
884 >   if (seedVal==*(int*)&(seq[li])) {
885        break;
886        }
887     }
888 < if (si<=rlo) return false;
889 < //seed found, try to extend it to left and right
890 < int score=4*match_score;
891 < max_rlo=rlo;
892 < rlo--;
893 < while (rlo>=0) {
894 <    rlo--;
895 <    score+=
896 <    
888 > if (li>lmax) return false;
889 > //seed found, try to extend it both ways
890 > //extend left
891 > int ri=li+3; //save rightmost base of the seed
892 > SLocScore loc(li, poly_m_score<<2);
893 > SLocScore maxloc(loc);
894 > while (li>0) {
895 >    li--;
896 >    if (seq[li]==polyChar) {
897 >                 loc.add(li,poly_m_score);
898 >                 }
899 >    else if (seq[li]=='N') {
900 >                 loc.add(li,0);
901 >                 }
902 >    else { //mismatch
903 >         loc.add(li,poly_mis_score);
904 >         if (maxloc.score-loc.score>poly_dropoff_score) break;
905 >         }
906 >    if (maxloc.score<=loc.score) {
907 >       maxloc=loc;
908 >       }
909      }
910 + li=maxloc.pos;
911 + if (li>5) return false; //no trimming wanted, too far from 5' end
912 + //li = right boundary for the poly match
913 +
914 + //extend right
915 + loc.set(ri, maxloc.score);
916 + maxloc.pos=ri;
917 + while (ri<rlen-1) {
918 +   ri++;
919 +   if (seq[ri]==polyChar) {
920 +                loc.add(ri,poly_m_score);
921 +                }
922 +   else if (seq[ri]=='N') {
923 +                loc.add(ri,0);
924 +                }
925 +   else { //mismatch
926 +        loc.add(ri,poly_mis_score);
927 +        if (maxloc.score-loc.score>poly_dropoff_score) break;
928 +        }
929 +   if (maxloc.score<=loc.score) {
930 +      maxloc=loc;
931 +      }
932 +   }
933 + ri=maxloc.pos;
934 + if ((maxloc.score==poly_minScore && li==0) ||
935 +     (maxloc.score>poly_minScore && li<2)
936 +     || (maxloc.score>(poly_minScore*3) && li<8)) {
937 +    //adjust l5 to reflect this trimming of 5' end
938 +    l5=ri+1;
939 +    if (l5>rlen-1) l5=rlen-1;
940 +    return true;
941 +    }
942 + return false;
943   }
944  
945   bool trim_adapter3(GStr& seq, int&l5, int &l3) {
946 + if (adapters3.Count()==0) return false;
947   int rlen=seq.length();
948   l5=0;
949   l3=rlen-1;
950 < //first try a full match, we might get lucky
951 < int fi=-1;
952 < if ((fi=seq.index(adapter3))>=0) {
953 <   if (fi<rlen-fi-a3len) {//match is closer to the right end
954 <      l5=fi+a3len;
955 <      l3=rlen-1;
956 <      }
957 <    else {
958 <      l5=0;
959 <      l3=fi-1;
960 <      }
961 <   return true;
962 <   }
963 < #ifdef DEBUG
964 < if (debug) GMessage(">TRIM3 >>   Read: %s\n",seq.chars());
965 < #endif
966 <
967 < //also, for fast detection of other adapter-only reads that start past
968 < // the beginning of the adapter sequence, try to see if the first a3len-4
969 < // bases of the read are a substring of the adapter
970 < if (rlen>a3len-3) {
971 <   GStr rstart=seq.substr(1,a3len-4);
972 <   if ((fi=adapter3.index(rstart))>=0) {
973 <     l3=rlen-1;
1092 <     l5=a3len-4;
1093 <     while (fi+l5<a3len && l5<l3 && adapter3[fi+l5]==seq[l5]) l5++;
1094 <     return true;
1095 <     }
1096 <  }
1097 < CSegChain a3segs; //no chains here, just an ordered collection of segment pairs
1098 <  //check the easy cases - 11 bases exact match at the end
1099 < int fdlen=11;
1100 <  if (a3len<16) {
1101 <   fdlen=a3len>>1;
1102 <   }
1103 < if (fdlen>4) {
1104 <     //check if we're lucky enough to have the last 11 bases of the read a part of the adapter
1105 <     GStr rstart=seq.substr(-fdlen-3,fdlen);
1106 <     if ((fi=adapter3.index(rstart))>=0) {
1107 < #ifdef DEBUG
1108 <       if (debug) GMessage("  W11match found: %*s\n", rlen-3, (adapter3.substr(fi,fdlen)).chars());
1109 < #endif
1110 <       if (extendMatch(seq.chars(), rlen, rlen-fdlen-3,
1111 <                     adapter3.chars(), a3len, fi,  fdlen, l5,l3, a3segs))
1112 <            return true;
1113 <       }
1114 <     //another easy case: first 11 characters of the adaptor found as a substring of the read
1115 <     GStr bstr=adapter3.substr(0, fdlen);
1116 <     if ((fi=seq.rindex(bstr))>=0) {
1117 < #ifdef DEBUG
1118 <       if (debug) GMessage("  A11match found: %*s\n", fi+fdlen, bstr.chars());
1119 < #endif
1120 <       if (extendMatch(seq.chars(), rlen, fi,
1121 <                     adapter3.chars(), a3len, 0,  fdlen, l5,l3, a3segs))
1122 <            return true;
1123 <       }
1124 <     } //tried to match 11 bases first
1125 <    
1126 < //no easy cases, so let's do the wmer hashing for the first 12 bases of the adaptor
1127 < //-- only extend if the match is in the 3' (ending) region of the read
1128 < int wordSize=3;
1129 < int hlen=12;
1130 < if (hlen>a3len-wordSize) hlen=a3len-wordSize;
1131 < int imin=rlen>>1; //last half of the read, left boundary for the wmer match
1132 < if (imin<a3len) { imin=GMIN(a3len, rlen-wordSize); }
1133 < imin=rlen-imin;
1134 < for (int iw=0;iw<hlen;iw++) {
1135 <   //int32* qv=(int32*)(adapter3.chars()+iw);
1136 <   int qv=get3mer_value(adapter3.chars()+iw);
1137 <   fi=-1;
1138 <   //while ((fi=fast4rmatch(*qv, seq.chars(), rlen, fi))>=0 && fi>=imin) {
1139 <   while ((fi=w3_rmatch(qv, seq.chars(), rlen, fi))>=0 && fi>=imin) {
1140 <     //GMessage(" ... fi=%d after w3_rmatch() (imin=%d)\n", fi, imin);
1141 <
1142 < #ifdef DEBUG
1143 <     if (debug) GMessage("    Wmatch found: %*s\n", fi+wordSize, (adapter3.substr(iw,wordSize)).chars());
1144 < #endif
1145 <     if (extendMatch(seq.chars(), rlen, fi, adapter3.chars(),
1146 <                   a3len, iw, wordSize, l5,l3, a3segs)) return true;
1147 <     fi--;
1148 <     if (fi<imin) break;
1149 <     }
1150 <   } //for each wmer in the first hlen bases of the adaptor
1151 < /*
1152 < //couldn't find a good trimming extension, hash 12 more bases of the adapter to collect more segment pairs there
1153 < //but only do this if we already have segment pairs collected in the last 12 bases of the adapter
1154 < if (a3segs.bstart>3 || a3segs.bend<(uint)(hlen-wordSize)) return false;
1155 < int hlen2=a3len-wordSize;
1156 < //if (hlen2>a3len-4) hlen2=a3len-4;
1157 < if (hlen2>hlen) {
1158 < #ifdef DEBUG
1159 <     if (debug && a3segs.Count()>0) {
1160 <        GMessage("  >>>>>2nd. hash: %s\n",seq.chars());
1161 <        }
1162 < #endif
1163 <     for (int iw=hlen;iw<hlen2;iw++) {
1164 <         //int* qv=(int32 *)(adapter3.chars()+iw);
1165 <         int qv=get3mer_value(adapter3.chars()+iw);
1166 <         fi=-1;
1167 <         //while ((fi=fast4rmatch(*qv, seq.chars(), rlen, fi))>=0 && fi>=imin) {
1168 <         while ((fi=w3_rmatch(qv, seq.chars(), rlen, fi))>=0 && fi>=imin) {
1169 <           extendMatch(seq.chars(), rlen, fi, adapter3.chars(),
1170 <                         a3len, iw, wordSize, l5,l3, a3segs);
1171 <           fi--;
1172 <           if (fi<imin) break;
1173 <           }
1174 <         } //for each wmer between hlen2 and hlen bases of the adaptor
950 > bool trimmed=false;
951 > GStr wseq(seq);
952 > int wlen=rlen;
953 > GXSeqData seqdata;
954 > for (int ai=0;ai<adapters3.Count();ai++) {
955 >  seqdata.update(adapters3[ai].seq.chars(), adapters3[ai].seq.length(),
956 >       adapters3[ai].pz, wseq.chars(), wlen, adapters3[ai].amlen);
957 >  GXAlnInfo* bestaln=match_RightEnd(seqdata, gxmem_r, 86);
958 >  if (bestaln) {
959 >     trimmed=true;
960 >     //keep unmatched region on the left OR right (the longer one)
961 >     if (bestaln->sl > wlen-bestaln->sr) {
962 >         //keep left side
963 >         l3-=(wlen-bestaln->sl+1);
964 >         if (l3<0) l3=0;
965 >         }
966 >     else { //keep right side
967 >         l5+=bestaln->sr;
968 >         if (l5>=rlen) l5=rlen-1;
969 >         }
970 >     delete bestaln;
971 >     if (l3-l5+1<min_read_len) return true;
972 >     wseq=seq.substr(l5,l3-l5+1);
973 >     wlen=wseq.length();
974       }
975 < //lastly, analyze collected a3segs for a possible gapped alignment:
976 < GList<CSegChain> segchains(false,true,false);
1178 < #ifdef DEBUG
1179 < if (debug && a3segs.Count()>0) {
1180 <   GMessage(">>>>>>>>>   Read: %s\n",seq.chars());
1181 <   }
1182 < #endif
1183 < for (int i=0;i<a3segs.Count();i++) {
1184 <   if (a3segs[i]->chain==NULL) {
1185 <       if (a3segs[i]->b.start>3) continue; //don't start a hopeless chain
1186 <       CSegChain* newchain=new CSegChain();
1187 <       newchain->setFreeItem(false);
1188 <       newchain->addSegPair(a3segs[i]);
1189 <       a3segs[i]->chain=newchain;
1190 <       segchains.Add(newchain); //just to free them when done
1191 <       }
1192 <   for (int j=i+1;j<a3segs.Count();j++) {
1193 <      CSegChain* chain=a3segs[i]->chain;
1194 <      if (chain->extendChain(a3segs[j])) {
1195 <          a3segs[j]->chain=chain;
1196 < #ifdef DEBUG
1197 <          if (debug) dbgPrintChain(*chain, adapter3.chars());
1198 < #endif      
1199 <          //save time by checking here if the extended chain is already acceptable for trimming
1200 <          if (chain->aend>(uint)(rlen-4) && chain->bstart<4 && chain->score>a_min_chain_score) {
1201 <            l5=0;
1202 <            l3=chain->astart-2;
1203 < #ifdef DEBUG
1204 <          if (debug && a3segs.Count()>0) {
1205 <            GMessage(">>> >> trimmed-3: %*s\n",l3-l5+1,seq.substr(l5,l3-l5+1).chars());
1206 <            }
1207 < #endif
1208 <            return true;
1209 <            }
1210 <          } //chain can be extended
1211 <      }
1212 <   } //collect segment alignments into chains
1213 < */  
1214 < return false; //no adapter parts found
975 >  }//for each 5' adapter
976 >  return trimmed;
977   }
978  
979   bool trim_adapter5(GStr& seq, int&l5, int &l3) {
980 < //if (debug) GMessage("trim_adapter5 on: %s\n", seq.chars());
980 > if (adapters5.Count()==0) return false;
981   int rlen=seq.length();
982   l5=0;
983   l3=rlen-1;
984 < //try to see if adapter is fully included in the read
985 < int fi=-1;
986 < if ((fi=seq.index(adapter5))>=0) {
987 <   if (fi<rlen-fi-a5len) {//match is closer to the right end
988 <      l5=fi+a5len;
989 <      l3=rlen-1;
990 <      }
991 <    else {
992 <      l5=0;
993 <      l3=fi-1;
994 <      }
995 <   return true;
996 <   }
997 < #ifdef DEBUG
998 < if (debug) GMessage(">TRIM5 >>   Read: %s\n",seq.chars());
999 < #endif
1000 <
1001 < CSegChain a5segs(true); //list of segment pairs to analyze later if no extendMatch succeeded
1002 <
1003 < //try the easy way out first - look for an exact match of 11 bases
1004 < int fdlen=11;
1005 <  if (a5len<16) {
1006 <   fdlen=a5len>>1;
1245 <   }
1246 < if (fdlen>4) {
1247 <     GStr rstart=seq.substr(1,fdlen); //skip the first base as it's sometimes bogus
1248 <     if ((fi=adapter5.index(rstart))>=0) {
1249 < #ifdef DEBUG
1250 <       if (debug) GMessage("  W11match found: %*s\n", 1+fdlen, (adapter3.substr(fi,fdlen)).chars());
1251 < #endif
1252 <       if (extendMatch(seq.chars(), rlen, 1,
1253 <                     adapter5.chars(), a5len, fi,  fdlen, l5,l3, a5segs, true))
1254 <           return true;
1255 <       }
1256 <     //another easy case: last 11 characters of the adaptor found as a substring of the read
1257 <     GStr bstr=adapter5.substr(-fdlen);
1258 <     if ((fi=seq.index(bstr))>=0) {
1259 < #ifdef DEBUG
1260 <       if (debug) GMessage("  A11match found: %*s\n", fi+fdlen, bstr.chars());
1261 < #endif
1262 <       if (extendMatch(seq.chars(), rlen, fi,
1263 <                     adapter5.chars(), a5len, a5len-fdlen,  fdlen, l5,l3,a5segs,true))
1264 <          return true;
1265 <       }
1266 <     } //tried to matching at most 11 bases first
1267 <    
1268 < //-- no easy cases, do the wmer hashing for the last 12 bases of the adaptor
1269 < //-- only extend a wmer if it matches in the 5' (beginning) region of the read
1270 < int wordSize=3;
1271 < int hlen=12;
1272 < if (hlen>a5len-wordSize) hlen=a5len-wordSize;
1273 < int imax=rlen>>1; //first half of the read, right boundary for the wmer match
1274 < if (imax<a5len) { imax=GMIN(a5len, rlen-wordSize); }
1275 < for (int iw=0;iw<=hlen;iw++) {
1276 <   int apstart=a5len-iw-wordSize;
1277 <   fi=0;
1278 <   //int* qv=(int32 *)(adapter5.chars()+apstart);
1279 <   int qv=get3mer_value(adapter5.chars()+apstart);
1280 <   //while ((fi=fast4match(*qv, seq.chars(), rlen, fi))>=0 && fi<=imax) {
1281 <   while ((fi=w3_match(qv, seq.chars(), rlen, fi))>=0 && fi<=imax) {
1282 < #ifdef DEBUG
1283 <     if (debug) GMessage("    Wmatch found: %*s\n", fi+wordSize, (adapter5.substr(apstart,wordSize)).chars());
1284 < #endif
1285 <     if (extendMatch(seq.chars(), rlen, fi, adapter5.chars(),
1286 <                a5len, apstart, wordSize, l5,l3, a5segs, true)) return true;
1287 <     fi++;
1288 <     if (fi>imax) break;
1289 <     }
1290 <   } //for each wmer in the last hlen bases of the adaptor
1291 < /*
1292 <
1293 < //couldn't find a good trimming extension, hash 12 more bases of the adapter to collect more segment pairs there
1294 < //but only do this if we already have segment pairs collected in the last 12 bases of the adapter
1295 < if (a5segs.bend<(uint)(a5len-3) || a5segs.bstart>(uint)(a5len-hlen+4)) return false;
1296 < int hlen2=a5len-wordSize;
1297 < //if (hlen2>a5len-wordSize) hlen2=a5len-wordSize;
1298 < #ifdef DEBUG
1299 <      if (debug && a5segs.Count()>0) {
1300 <        GMessage("  >>>>>2nd. hash: %s\n",seq.chars());
1301 <        }
1302 < #endif
1303 < if (hlen2>hlen) {
1304 <     for (int iw=hlen+1;iw<=hlen2;iw++) {
1305 <         int apstart=a5len-iw-wordSize;
1306 <         fi=0;
1307 <         //int* qv=(int32 *)(adapter5.chars()+apstart);
1308 <         int qv=get3mer_value(adapter5.chars()+apstart);
1309 <         //while ((fi=fast4match(*qv, seq.chars(), rlen, fi))>=0 && fi<=imax) {
1310 <         while ((fi=w3_match(qv, seq.chars(), rlen, fi))>=0 && fi<=imax) {
1311 <           extendMatch(seq.chars(), rlen, fi, adapter5.chars(),
1312 <                      a5len, apstart, wordSize, l5,l3, a5segs, true);
1313 <           fi++;
1314 <           if (fi>imax) break;
1315 <           }
1316 <         } //for each wmer between hlen2 and hlen bases of the adaptor
984 > bool trimmed=false;
985 > GStr wseq(seq);
986 > int wlen=rlen;
987 > GXSeqData seqdata;
988 > for (int ai=0;ai<adapters5.Count();ai++) {
989 >   seqdata.update(adapters5[ai].seq.chars(), adapters5[ai].seq.length(),
990 >       adapters5[ai].pz, wseq.chars(), wlen, adapters5[ai].amlen);
991 >  GXAlnInfo* bestaln=match_LeftEnd(seqdata, gxmem_l, 90);
992 >  if (bestaln) {
993 >     trimmed=true;
994 >     if (bestaln->sl > wlen-bestaln->sr) {
995 >         //keep left side
996 >         l3-=(wlen-bestaln->sl+1);
997 >         if (l3<0) l3=0;
998 >         }
999 >     else { //keep right side
1000 >         l5+=bestaln->sr;
1001 >         if (l5>=rlen) l5=rlen-1;
1002 >         }
1003 >     delete bestaln;
1004 >     if (l3-l5+1<min_read_len) return true;
1005 >     wseq=seq.substr(l5,l3-l5+1);
1006 >     wlen=wseq.length();
1007       }
1008 < if (a5segs.bend<(uint)(a5len-3) || a5segs.bstart>(uint)(a5len-hlen+4)) return false;
1009 < // lastly, analyze collected a5segs for a possible gapped alignment:
1010 < GList<CSegChain> segchains(false,true,false);
1321 < #ifdef DEBUG
1322 < if (debug && a5segs.Count()>0) {
1323 <   GMessage(">>>>>>>>>   Read: %s\n",seq.chars());
1324 <   }
1325 < #endif
1326 < for (int i=0;i<a5segs.Count();i++) {
1327 <   if (a5segs[i]->chain==NULL) {
1328 <       if (a5segs[i]->b.end<(int)(a5len-4)) continue; //don't start a hopeless chain
1329 <       CSegChain* newchain=new CSegChain(true);
1330 <       newchain->setFreeItem(false);
1331 <       newchain->addSegPair(a5segs[i]);
1332 <       a5segs[i]->chain=newchain;
1333 <       segchains.Add(newchain); //just to free them when done
1334 <       }
1335 <   for (int j=i+1;j<a5segs.Count();j++) {
1336 <      CSegChain* chain=a5segs[i]->chain;
1337 <      if (chain->extendChain(a5segs[j])) {
1338 <         a5segs[j]->chain=chain;
1339 < #ifdef DEBUG
1340 <         if (debug) dbgPrintChain(*chain, adapter5.chars());
1341 < #endif      
1342 <      //save time by checking here if the extended chain is already acceptable for trimming
1343 <         if (chain->bend>(uint)(a5len-3) && chain->astart<4 && chain->score>a_min_chain_score) {
1344 <            l5=chain->aend;
1345 <            l3=rlen-1;
1346 <            return true;
1347 <            }
1348 <         } //chain can be extended
1349 <      }
1350 <   } //collect segment alignments into chains
1351 < */
1352 < return false; //no adapter parts found
1353 < }
1008 >  }//for each 5' adapter
1009 >  return trimmed;
1010 > }
1011  
1012 < //convert qvs to/from phred64 from/to phread33
1012 > //convert qvs to/from phred64 from/to phread33
1013   void convertPhred(GStr& q) {
1014   for (int i=0;i<q.length();i++) q[i]+=qv_cvtadd;
1015   }
# Line 1361 | Line 1018
1018   for (int i=0;i<len;i++) q[i]+=qv_cvtadd;
1019   }
1020  
1021 < bool getFastxRec(GLineReader& fq, GStr& rseq, GStr& rqv,
1021 > bool getFastxRec(GLineReader& fq, GStr& rseq, GStr& rqv,
1022            GStr& rname, GStr& rinfo, GStr& infname) {
1023   rseq="";
1024   rqv="";
# Line 1377 | Line 1034
1034        } //raw qseq format
1035   else { // FASTQ or FASTA */
1036   isfasta=(l[0]=='>');
1037 < if (!isfasta && l[0]!='@') GError("Error: fasta/fastq record marker not found(%s)\n%s\n",
1037 > if (!isfasta && l[0]!='@') GError("Error: fasta/fastq record marker not found(%s)\n%s\n",
1038        infname.chars(), l);
1039   GStr s(l);
1040   rname=&(l[1]);
1041   for (int i=0;i<rname.length();i++)
1042 <    if (rname[i]<=' ') {
1043 <       if (i<rname.length()-2) rinfo=rname.substr(i+1);
1044 <       rname.cut(i);
1045 <       break;
1042 >    if (rname[i]<=' ') {
1043 >       if (i<rname.length()-2) rinfo=rname.substr(i+1);
1044 >       rname.cut(i);
1045 >       break;
1046         }
1047    //now get the sequence
1048 < if ((l=fq.getLine())==NULL)
1048 > if ((l=fq.getLine())==NULL)
1049        GError("Error: unexpected EOF after header for read %s (%s)\n",
1050                     rname.chars(), infname.chars());
1051   rseq=l; //this must be the DNA line
1052   while ((l=fq.getLine())!=NULL) {
1053        //seq can span multiple lines
1054        if (l[0]=='>' || l[0]=='+') {
1055 <           fq.pushBack();
1055 >           fq.pushBack();
1056             break; //
1057             }
1058        rseq+=l;
1059 <      } //check for multi-line seq
1059 >      } //check for multi-line seq
1060   if (!isfasta) { //reading fastq quality values, which can also be multi-line
1061      if ((l=fq.getLine())==NULL)
1062          GError("Error: unexpected EOF after sequence for %s\n", rname.chars());
1063      if (l[0]!='+') GError("Error: fastq qv header marker not detected!\n");
1064 <    if ((l=fq.getLine())==NULL)
1064 >    if ((l=fq.getLine())==NULL)
1065          GError("Error: unexpected EOF after qv header for %s\n", rname.chars());
1066      rqv=l;
1067 <    //if (rqv.length()!=rseq.length())
1067 >    //if (rqv.length()!=rseq.length())
1068      //  GError("Error: qv len != seq len for %s\n", rname.chars());
1069      while (rqv.length()<rseq.length() && ((l=fq.getLine())!=NULL)) {
1070        rqv+=l; //append to qv string
1071        }
1072      }// fastq
1073   // } //<-- FASTA or FASTQ
1074 < rseq.upper(); //TODO: what if we care about masking?
1074 > rseq.upper();
1075   return true;
1076   }
1077  
1078 + #ifdef GDEBUG
1079 + void showTrim(GStr& s, int l5, int l3) {
1080 +  if (l5>0) {
1081 +    color_bg(c_red);
1082 +    }
1083 +  for (int i=0;i<s.length()-1;i++) {
1084 +    if (i && i==l5) color_resetbg();
1085 +    fprintf(stderr, "%c", s[i]);
1086 +    if (i==l3) color_bg(c_red);
1087 +   }
1088 +  fprintf(stderr, "%c", s[s.length()-1]);
1089 +  color_reset();
1090 +  fprintf(stderr, "\n");
1091 + }
1092 + #endif
1093 +
1094   char process_read(GStr& rname, GStr& rseq, GStr& rqv, int &l5, int &l3) {
1095 < //returns 0 if the read was untouched, 1 if it was just trimmed
1095 > //returns 0 if the read was untouched, 1 if it was just trimmed
1096   // and a trash code if it was trashed
1097   l5=0;
1098   l3=rseq.length()-1;
1099 + #ifdef GDEBUG
1100 +   //rseq.reverse();
1101 +   GMessage(">%s\n", rname.chars());
1102 +   GMessage("%s\n",rseq.chars());
1103 + #endif
1104   if (l3-l5+1<min_read_len) {
1105     return 's';
1106     }
# Line 1458 | Line 1136
1136     w5=0;
1137     w3=wseq.length()-1;
1138     }
1139 < if (a3len>0) {
1140 <  if (trim_adapter3(wseq, w5, w3)) {
1139 > char trim_code;
1140 > do {
1141 >  trim_code=0;
1142 >  if (trim_poly5(wseq, w5, w3, polyA_seed)) {
1143 >      trim_code='A';
1144 >      }
1145 >  else if (trim_poly5(wseq, w5, w3, polyT_seed)) {
1146 >      trim_code='T';
1147 >      }
1148 >  else if (trim_adapter5(wseq, w5, w3)) {
1149 >      trim_code='5';
1150 >      }
1151 >  if (trim_code) {
1152 >     #ifdef GDEBUG
1153 >      GMessage("#### TRIM by '%c' code ( w5-w3 = %d-%d ):\n",trim_code, w5,w3);
1154 >      showTrim(wseq, w5, w3);
1155 >     #endif
1156       int trimlen=wseq.length()-(w3-w5+1);
1157 <     num_trimmed3++;
1158 <     if (trimlen<min_trimmed3)
1159 <         min_trimmed3=trimlen;
1157 >     num_trimmed5++;
1158 >     if (trimlen<min_trimmed5)
1159 >         min_trimmed5=trimlen;
1160       l5+=w5;
1161       l3-=(wseq.length()-1-w3);
1162       if (w3-w5+1<min_read_len) {
1163 <         return '3';
1163 >         return trim_code;
1164           }
1165        //-- keep only the w5..w3 range
1166        wseq=wseq.substr(w5, w3-w5+1);
1167        if (!wqv.is_empty())
1168           wqv=wqv.substr(w5, w3-w5+1);
1169 <      }//some adapter was trimmed
1170 <   } //adapter trimming
1171 < if (a5len>0) {
1172 <  if (trim_adapter5(wseq, w5, w3)) {
1169 >      }// trimmed at 5' end
1170 > } while (trim_code);
1171 >
1172 > do {
1173 >  trim_code=0;
1174 >  if (trim_poly3(wseq, w5, w3, polyA_seed)) {
1175 >      trim_code='A';
1176 >      }
1177 >  else if (trim_poly3(wseq, w5, w3, polyT_seed)) {
1178 >      trim_code='T';
1179 >      }
1180 >  else if (trim_adapter3(wseq, w5, w3)) {
1181 >      trim_code='3';
1182 >      }
1183 >  if (trim_code) {
1184 >     #ifdef GDEBUG
1185 >     GMessage("#### TRIM by '%c' code ( w5-w3 = %d-%d ):\n",trim_code, w5,w3);
1186 >     showTrim(wseq, w5, w3);
1187 >     #endif
1188       int trimlen=wseq.length()-(w3-w5+1);
1189 <     num_trimmed5++;
1190 <     if (trimlen<min_trimmed5)
1191 <         min_trimmed5=trimlen;
1189 >     num_trimmed3++;
1190 >     if (trimlen<min_trimmed3)
1191 >         min_trimmed3=trimlen;
1192       l5+=w5;
1193       l3-=(wseq.length()-1-w3);
1194       if (w3-w5+1<min_read_len) {
1195 <         return '5';
1195 >         return trim_code;
1196           }
1197        //-- keep only the w5..w3 range
1198        wseq=wseq.substr(w5, w3-w5+1);
1199        if (!wqv.is_empty())
1200           wqv=wqv.substr(w5, w3-w5+1);
1201 <      }//some adapter was trimmed
1202 <   } //adapter trimming
1201 >      }//trimming at 3' end
1202 > } while (trim_code);
1203 >
1204 >
1205   if (doCollapse) {
1206     //keep read for later
1207     FqDupRec* dr=dhash.Find(wseq.chars());
1208     if (dr==NULL) { //new entry
1209 <          //if (prefix.is_empty())
1210 <             dhash.Add(wseq.chars(),
1209 >          //if (prefix.is_empty())
1210 >             dhash.Add(wseq.chars(),
1211                    new FqDupRec(&wqv, rname.chars()));
1212            //else dhash.Add(wseq.chars(), new FqDupRec(wqv.chars(),wqv.length()));
1213           }
# Line 1531 | Line 1241
1241         fprintf(f_out, "%s\n", rseq.chars()); //plain one-line fasta for now
1242         }
1243        else {
1244 <       fprintf(f_out, ">%s%08d\n%s\n", prefix.chars(), outcounter,
1244 >       fprintf(f_out, ">%s%08d\n%s\n", prefix.chars(), outcounter,
1245                            rseq.chars());
1246         }
1247       }
# Line 1542 | Line 1252
1252         fprintf(f_out, "%s\n+\n%s\n", rseq.chars(), rqv.chars());
1253         }
1254        else
1255 <       fprintf(f_out, "@%s_%08d\n%s\n+\n%s\n", prefix.chars(), outcounter,
1255 >       fprintf(f_out, "@%s_%08d\n%s\n+\n%s\n", prefix.chars(), outcounter,
1256                            rseq.chars(),rqv.chars() );
1257       }
1258   }
# Line 1550 | Line 1260
1260   void trash_report(char trashcode, GStr& rname, FILE* freport) {
1261   if (freport==NULL || trashcode<=' ') return;
1262   if (trashcode=='3' || trashcode=='5') {
1263 <   fprintf(freport, "%s\tA%c\n",rname.chars(),trashcode);
1263 >   fprintf(freport, "%s\ta%c\n",rname.chars(),trashcode);
1264     }
1265   else {
1266     fprintf(freport, "%s\t%c\n",rname.chars(),trashcode);
# Line 1642 | Line 1352
1352      }
1353   }
1354  
1355 < void setupFiles(FILE*& f_in, FILE*& f_in2, FILE*& f_out, FILE*& f_out2,
1356 <                       GStr& s, GStr& infname, GStr& infname2) {
1355 > void addAdapter(GVec<CASeqData>& adapters, GStr& seq) {
1356 > //TODO: prepare CASeqData here, and collect hexamers as well
1357 > CASeqData adata(revCompl);
1358 > int idx=adapters.Add(adata);
1359 > if (idx<0) GError("Error: failed to add adaptor!\n");
1360 > adapters[idx].update(seq.chars());
1361 > }
1362 >
1363 >
1364 > int loadAdapters(const char* fname) {
1365 >  GLineReader lr(fname);
1366 >  char* l;
1367 >  while ((l=lr.nextLine())!=NULL) {
1368 >   if (lr.length()<=3 || l[0]=='#') continue;
1369 >   if ( l[0]==' ' || l[0]=='\t' || l[0]==',' ||
1370 >        l[0]==';'|| l[0]==':' ) {
1371 >      int i=1;
1372 >      while (l[i]!=0 && isspace(l[i])) {
1373 >        i++;
1374 >        }
1375 >      if (l[i]!=0) {
1376 >        GStr s(&(l[i]));
1377 >      #ifdef GDEBUG
1378 >          //s.reverse();
1379 >      #endif
1380 >        addAdapter(adapters3, s);
1381 >        continue;
1382 >        }
1383 >      }
1384 >    else {
1385 >      GStr s(l);
1386 >      s.startTokenize("\t ;,:");
1387 >      GStr a5,a3;
1388 >      if (s.nextToken(a5))
1389 >         s.nextToken(a3);
1390 >      a5.upper();
1391 >      a3.upper();
1392 >     #ifdef GDEBUG
1393 >     //   a5.reverse();
1394 >     //   a3.reverse();
1395 >     #endif
1396 >      addAdapter(adapters5, a5);
1397 >      addAdapter(adapters3, a3);
1398 >      }
1399 >   }
1400 >   return adapters5.Count()+adapters3.Count();
1401 > }
1402 >
1403 > void setupFiles(FILE*& f_in, FILE*& f_in2, FILE*& f_out, FILE*& f_out2,
1404 >                       GStr& s, GStr& infname, GStr& infname2) {
1405   // uses outsuffix to generate output file names and open file handles as needed
1406   infname="";
1407   infname2="";
# Line 1671 | Line 1429
1429   s.startTokenize(",:");
1430   s.nextToken(infname);
1431   bool paired=s.nextToken(infname2);
1432 < if (fileExists(infname.chars())==0)
1432 > if (fileExists(infname.chars())==0)
1433      GError("Error: cannot find file %s!\n",infname.chars());
1434   GStr fname(getFileName(infname.chars()));
1435   GStr picmd;
# Line 1693 | Line 1451
1451   if (!paired) return;
1452   if (doCollapse) GError("Error: sorry, -C option cannot be used with paired reads!\n");
1453   // ---- paired reads:-------------
1454 < if (fileExists(infname2.chars())==0)
1454 > if (fileExists(infname2.chars())==0)
1455       GError("Error: cannot find file %s!\n",infname2.chars());
1456   picmd="";
1457   GStr fname2(getFileName(infname2.chars()));

Diff Legend

Removed lines
+ Added lines
< Changed lines
> Changed lines