ViewVC Help
View File | Revision Log | Show Annotations | View Changeset | Root Listing
root/gclib/fqtrim/fqtrim.cpp
(Generate patch)
# Line 3 | Line 3
3   #include "GHash.hh"
4   #include "GList.hh"
5   #include <ctype.h>
6 + #include "GAlnExtend.h"
7  
8   #define USAGE "Usage:\n\
9   fqtrim [{-5 <5adapter> -3 <3adapter>|-f <adapters_file>}] [-a <min_matchlen>]\\\n\
10 <   [-q <minq> [-t <trim_max_len>]] [-p {64|33}] [-o <outsuffix>]\\\n\
10 >   [-R] [-q <minq> [-t <trim_max_len>]] [-p {64|33}] [-o <outsuffix>]\\\n\
11     [-l <minlen>] [-C] [-D] [-Q] [-n <rename_prefix>] [-r <discarded.lst>]\\\n\
12      <input.fq>[,<input_mates.fq>\n\
13   \n\
# Line 30 | Line 31
31      (e.g. -5 CGACAGGTTCAGAGTTCTACAGTCCGACGATC)\n\
32   -3  trim the given adapter sequence at the 3' end of each read\n\
33      (e.g. -3 TCGTATGCCGTCTTCTGCTTG)\n\
34 < -a  minimum length of exact match to adaptor sequence at the proper end (6)\n\
34 > -A  disable polyA/T trimming (enabled by default)\n\
35 > -y  minimum length of exact match to adaptor sequence at the proper end (6)\n\
36   -q  trim bases with quality value lower than <minq> (starting at the 3' end)\n\
37   -t  for -q option, maximum trimming at the 3' end is limited to <trim_max_len>\n\
38   -m  maximum percentage of Ns allowed in a read after trimming (default 7)\n\
# Line 46 | Line 48
48   -Q  convert quality values to the other Phred qv type\n\
49   -V  verbose processing\n\
50   "
51 <
51 >
52   //-z  for -o option, the output stream(s) will be first piped into the given\n
53   //   <zcmd> command, which must output to stdout (e.g. -z 'bzip2 -9 -c')\n
54  
55 <
55 >
56   // example 3' adapter for miRNAs: TCGTATGCCGTCTTCTGCTTG
57  
58   //For paired reads sequencing:
# Line 60 | Line 62
62   //FILE* f_out2=NULL; //for paired reads
63   //FILE* f_in=NULL; //input fastq (stdin if not provided)
64   //FILE* f_in2=NULL; //for paired reads
65 +
66   FILE* freport=NULL;
67  
68   bool debug=false;
69   bool verbose=false;
70   bool doCollapse=false;
71   bool doDust=false;
72 + bool doPolyTrim=true;
73   bool fastaOutput=false;
74   bool trashReport=false;
75 < //bool rawFormat=false;
75 > bool revCompl=false; //also reverse complement adaptor sequences
76   int min_read_len=16;
77   double max_perc_N=7.0;
78   int dust_cutoff=16;
79   bool isfasta=false;
80   bool convert_phred=false;
81 < GStr outsuffix; // -o
78 < //GStr adapter3;
79 < //GStr adapter5;
81 > GStr outsuffix; // -o
82   GStr prefix;
83   GStr zcmd;
84   int num_trimmed5=0;
# Line 91 | Line 93
93   int qv_phredtype=0; // could be 64 or 33 (0 means undetermined yet)
94   int qv_cvtadd=0; //could be -31 or +31
95  
96 < int a3len=0;
97 < int a5len=0;
98 < // adaptor matching metrics -- for extendMatch() function
99 < const int a_m_score=2; //match score
100 < const int a_mis_score=-3; //mismatch
101 < const int a_dropoff_score=7;
102 < int a_min_score=12; //an exact match of 6 bases at the proper ends WILL be trimmed
103 < const int a_min_chain_score=15; //for gapped alignments
104 <
105 < class CSegChain;
106 <
107 < class CSegPair {
108 <  public:
109 <   GSeg a;
110 <   GSeg b; //the adapter segment
111 <   int score;
112 <   int flags;
113 <   CSegChain* chain;
114 <   CSegPair(int astart=0, int aend=0, int bstart=0, int bend=0, int mscore=0):a(astart,aend),b(bstart, bend) {
115 <      score=mscore;
116 <      if (score==0) score=a.len()*a_m_score;
117 <      flags=0;
118 <      chain=NULL;
119 <      }
120 <   int len() { return  a.len(); }
119 <   bool operator==(CSegPair& d){
120 <      //return (a.start==d.a.start && a.end==d.a.end && b.start==d.b.start && b.end==d.b.end);
121 <      //make equal even segments that are included into one another:
122 <      return (d.a.start>=a.start && d.a.end<=a.end && d.b.start>=b.start && d.b.end<=b.end);
123 <      }
124 <   bool operator>(CSegPair& d){ //ordering based on b (adaptor) start coord and score
125 <     if (b.start==d.b.start) {
126 <        if (score==d.score) {
127 <           //just try to be consistent:
128 <           if (b.end==d.b.end) {
129 <             return (a.start==d.a.start)?(a.end<d.a.end):(a.start<d.a.start);
130 <             }
131 <           return (b.end>d.b.end);
132 <           }
133 <         else return (score<d.score);
96 > // adaptor matching metrics -- for X-drop ungapped extension
97 > const int match_reward=2;
98 > const int mismatch_penalty=3;
99 > const int Xdrop=8;
100 >
101 > const int poly_m_score=2; //match score
102 > const int poly_mis_score=-3; //mismatch
103 > const int poly_dropoff_score=7;
104 > int poly_minScore=12; //i.e. an exact match of 6 bases at the proper ends WILL be trimmed
105 >
106 > const char *polyA_seed="AAAA";
107 > const char *polyT_seed="TTTT";
108 >
109 > struct CASeqData {
110 >   //positional data for every possible hexamer in an adapter
111 >   GVec<uint16>* pz[4096]; //0-based coordinates of all possible hexamers in the adapter sequence
112 >   GVec<uint16>* pzr[4096]; //0-based coordinates of all possible hexamers for the reverse complement of the adapter sequence
113 >   GStr seq; //actual adapter sequence data
114 >   GStr seqr; //reverse complement sequence
115 >   bool use_reverse;
116 >   CASeqData(bool rev=false):seq(),seqr() {
117 >     use_reverse=rev;
118 >     for (int i=0;i<4096;i++) {
119 >        pz[i]=NULL;
120 >        pzr[i]=NULL;
121          }
135     return (b.start>d.b.start);
122       }
137   bool operator<(CSegPair& d){ //ordering based on b (adaptor) coord
138     /*if (b.start==d.b.start && b.end==d.b.end) {
139          return (a.start==d.a.start)?(a.end<d.a.end):(a.start<d.a.start);
140          }
141     return (b.start==d.b.start)?(b.end<d.b.end):(b.start<d.b.start);*/
142     if (b.start==d.b.start) {
143        if (score==d.score) {
144           //just try to be consistent:
145           if (b.end==d.b.end) {
146             return (a.start==d.a.start)?(a.end>d.a.end):(a.start>d.a.start);
147             }
148           return (b.end<d.b.end);
149           }
150         else return (score>d.score);
151        }
152     return (b.start<d.b.start);
153     }
154 };
123  
124 < int cmpSegEnds(pointer sa, pointer sb) { //sort by adaptor seg ends AND score
125 < CSegPair& x = *(CSegPair *)sa;
126 < CSegPair& y = *(CSegPair *)sb;
127 < /*
128 < if (x.b.end==y.b.end) {
129 <     if (x.b.start==y.b.start) {
130 <         if (x.a.end==y.a.end) {
131 <            if (x.a.start==y.a.start) return 0;
132 <            return ((x.a.start>y.a.start) ? -1 : 1);
133 <            }
134 <          else {
135 <            return ((x.a.end>y.a.end) ? -1 : 1);
136 <            }
137 <          }
138 <      else {
139 <       return ((x.b.start>y.b.start) ? -1 : 1);
172 <       }
173 <     }
174 <    else {
175 <     return ((x.b.end>y.b.end) ? -1 : 1);
176 <     }
177 < */
178 <  if (x.b.end==y.b.end) {
179 <     if (x.score==y.score) {
180 <     if (x.b.start==y.b.start) {
181 <         if (x.a.end==y.a.end) {
182 <            if (x.a.start==y.a.start) return 0;
183 <            return ((x.a.start<y.a.start) ? -1 : 1);
184 <            }
185 <          else {
186 <            return ((x.a.end<y.a.end) ? -1 : 1);
187 <            }
188 <          }
189 <      else {
190 <       return ((x.b.start<y.b.start) ? -1 : 1);
191 <       }
192 <      } else return ((x.score>y.score) ? -1 : 1);
193 <     }
194 <    else {
195 <     return ((x.b.end>y.b.end) ? -1 : 1);
124 >   void update(const char* s) {
125 >   seq=s;
126 >   table6mers(seq.chars(), seq.length(), pz);
127 >   if (!use_reverse) return;
128 >   //reverse complement
129 >   seqr=s;
130 >   int slen=seq.length();
131 >   for (int i=0;i<slen;i++)
132 >     seqr[i]=ntComplement(seq[slen-i-1]);
133 >   table6mers(seqr.chars(), seqr.length(), pzr);
134 >   }
135 >
136 >   ~CASeqData() {
137 >     for (int i=0;i<4096;i++) {
138 >       delete pz[i];
139 >       delete pzr[i];
140       }
141 +   }
142 + };
143  
144 < }
144 > GVec<CASeqData> adapters5;
145 > GVec<CASeqData> adapters3;
146  
147 < class CSegChain:public GList<CSegPair> {
148 < public:
202 <   uint astart;
203 <   uint aend;
204 <   uint bstart;
205 <   uint bend;
206 <   int score;
207 <   bool endSort;
208 <  CSegChain(bool aln5=false):GList<CSegPair>(true,true,true) {//sorted, free elements, unique
209 <   //as SegPairs are inserted, they will be sorted by a.start coordinate
210 <   score=0;
211 <   astart=MAX_UINT;
212 <   aend=0;
213 <   bstart=MAX_UINT;
214 <   bend=0;
215 <   endSort=aln5;
216 <   if (aln5) { setSorted(cmpSegEnds); }
217 <   }
218 < bool operator==(CSegChain& d) {
219 <   //return (score==d.score);
220 <    return (astart==d.astart && aend==d.aend && bstart==d.bstart && bend==d.bend);
221 <   }
222 < bool operator>(CSegChain& d) { // order based on b (adaptor) coordinate
223 <   //return (score<d.score);
224 <   if (bstart==d.bstart && bend==d.bend) {
225 <          return (astart==d.astart)?(aend>d.aend):(astart>d.astart);
226 <          }
227 <     return (bstart==d.bstart)?(bend>d.bend):(bstart>d.bstart);
228 <   }
229 < bool operator<(CSegChain& d) {
230 <   //return (score>d.score);
231 <   if (bstart==d.bstart && bend==d.bend) {
232 <          return (astart==d.astart)?(aend<d.aend):(astart<d.astart);
233 <          }
234 <     return (bstart==d.bstart)?(bend<d.bend):(bstart<d.bstart);
235 <   }
236 < void addSegPair(CSegPair* segp) {
237 <   if (AddIfNew(segp)!=segp) return;
238 <   score+=segp->score;
239 <   if (astart>segp->a.start) astart=segp->a.start;
240 <   if (aend<segp->a.end) aend=segp->a.end;
241 <   if (bstart>segp->b.start) bstart=segp->b.start;
242 <   if (bend<segp->b.end) bend=segp->b.end;
243 <   }
244 < //for building actual chains:
245 < bool extendChain(CSegPair* segp) { //segp expected to be "Greater Than" current chain
246 <   int bgap=0;
247 <   int agap=0;
248 <   //if (endSort) {
249 <   if (bstart>segp->b.start) {
250 <      bgap = (int)(bstart-segp->b.end);
251 <      if (abs(bgap)>2) return false;
252 <      agap = (int)(astart-segp->a.end);
253 <      if (abs(agap)>2) return false;
254 <      }
255 <     else {
256 <      bgap = (int) (segp->b.start-bend);
257 <      if (abs(bgap)>2) return false;
258 <      agap = (int)(segp->a.start-aend);
259 <      if (abs(agap)>2) return false;
260 <      }
261 <   if (agap*bgap<0) return false;
262 <   addSegPair(segp);
263 <   score-=abs(agap)+abs(bgap);
264 <   return true;
265 <   }
266 < };
147 > CGreedyAlignData* gxmem_l=NULL;
148 > CGreedyAlignData* gxmem_r=NULL;
149  
150   // element in dhash:
151   class FqDupRec {
# Line 313 | Line 195
195  
196   GHash<FqDupRec> dhash; //hash to keep track of duplicates
197  
198 < void setupFiles(FILE*& f_in, FILE*& f_in2, FILE*& f_out, FILE*& f_out2,
199 <                       GStr& s, GStr& infname, GStr& infname2);
198 > void addAdapter(GVec<CASeqData>& adapters, GStr& seq);
199 > int loadAdapters(const char* fname);
200 >
201 > void setupFiles(FILE*& f_in, FILE*& f_in2, FILE*& f_out, FILE*& f_out2,
202 >                       GStr& s, GStr& infname, GStr& infname2);
203   // uses outsuffix to generate output file names and open file handles as needed
204 <
204 >
205   void writeRead(FILE* f_out, GStr& rname, GStr& rinfo, GStr& rseq, GStr& rqv, int& outcounter);
206   void trash_report(char trashcode, GStr& rname, FILE* freport);
207  
208 < bool getFastxRec(GLineReader& fq, GStr& rseq, GStr& rqv,
208 > bool getFastxRec(GLineReader& fq, GStr& rseq, GStr& rqv,
209            GStr& rname, GStr& rinfo, GStr& infname);
210  
211 < char process_read(GStr& rname, GStr& rseq, GStr& rqv, int &l5, int &l3);
211 > char process_read(GStr& rname, GStr& rseq, GStr& rqv, int &l5, int &l3);
212   //returns 0 if the read was untouched, 1 if it was trimmed and a trash code if it was trashed
213  
214   bool ntrim(GStr& rseq, int &l5, int &l3); //returns true if any trimming occured
215   bool qtrim(GStr& qvs, int &l5, int &l3); //return true if any trimming occured
216   int dust(GStr& seq);
217 < bool trim_adapter3(GStr& seq, int &l5, int &l3); //returns true if any trimming occured
217 > bool trim_poly5(GStr &seq, int &l5, int &l3, const char* poly_seed); //returns true if any trimming occured
218 > bool trim_poly3(GStr &seq, int &l5, int &l3, const char* poly_seed);
219   bool trim_adapter5(GStr& seq, int &l5, int &l3); //returns true if any trimming occured
220 + bool trim_adapter3(GStr& seq, int &l5, int &l3);
221  
222   void convertPhred(char* q, int len);
223   void convertPhred(GStr& q);
224  
225   int main(int argc, char * const argv[]) {
226 <  GArgs args(argc, argv, "YQDCVl:d:3:5:m:n:r:p:q:f:t:o:z:a:");
226 >  GArgs args(argc, argv, "YQDCRVAl:d:3:5:m:n:r:p:q:f:t:o:z:a:");
227    int e;
228    if ((e=args.isError())>0) {
229        GMessage("%s\nInvalid argument: %s\n", USAGE, argv[e]);
# Line 347 | Line 234
234    convert_phred=(args.getOpt('Q')!=NULL);
235    doCollapse=(args.getOpt('C')!=NULL);
236    doDust=(args.getOpt('D')!=NULL);
237 +  revCompl=(args.getOpt('R')!=NULL);
238 +  if (args.getOpt('A')) doPolyTrim=false;
239    /*
240    rawFormat=(args.getOpt('R')!=NULL);
241    if (rawFormat) {
# Line 355 | Line 244
244    */
245    prefix=args.getOpt('n');
246    GStr s=args.getOpt('l');
247 <  if (!s.is_empty())
247 >  if (!s.is_empty())
248       min_read_len=s.asInt();
249    s=args.getOpt('m');
250 <  if (!s.is_empty())
250 >  if (!s.is_empty())
251       max_perc_N=s.asDouble();
252    s=args.getOpt('d');
253    if (!s.is_empty()) {
# Line 384 | Line 273
273          qv_phredtype=64;
274          qv_cvtadd=-31;
275          }
276 <       else
276 >       else
277           GMessage("%s\nInvalid value for -p option (can only be 64 or 33)!\n",USAGE);
278       }
279 <  if (args.getOpt('3')!=NULL) {
280 <    adapter3=args.getOpt('3');
281 <    adapter3.upper();
282 <    a3len=adapter3.length();
283 <    }
284 <  if (args.getOpt('5')!=NULL) {
285 <    adapter5=args.getOpt('5');
286 <    adapter5.upper();
287 <    a5len=adapter5.length();
279 >  s=args.getOpt('f');
280 >  if (!s.is_empty()) {
281 >   loadAdapters(s.chars());
282 >   }
283 >  bool fileAdapters=adapters5.Count()+adapters3.Count();
284 >  s=args.getOpt('5');
285 >  if (!s.is_empty()) {
286 >    if (fileAdapters)
287 >      GError("Error: options -5 and -f cannot be used together!\n");
288 >    s.upper();
289 >    addAdapter(adapters5, s);
290      }
291 <  s=args.getOpt('a');
291 >  s=args.getOpt('3');
292    if (!s.is_empty()) {
293 <     int a_minmatch=s.asInt();
294 <     a_min_score=a_minmatch<<1;
293 >    if (fileAdapters)
294 >      GError("Error: options -3 and -f cannot be used together!\n");
295 >      s.upper();
296 >      addAdapter(adapters3, s);
297 >    }
298 >  s=args.getOpt('y');
299 >  if (!s.is_empty()) {
300 >     int minmatch=s.asInt();
301 >     poly_minScore=minmatch*poly_m_score;
302       }
303 <  
303 >
304    if (args.getOpt('o')!=NULL) outsuffix=args.getOpt('o');
305                           else outsuffix="-";
306    trashReport=  (args.getOpt('r')!=NULL);
# Line 419 | Line 317
317    if (trashReport)
318      openfw(freport, args, 'r');
319    char* infile=NULL;
320 +
321 +  if (adapters5.Count()>0)
322 +    gxmem_l=new CGreedyAlignData(match_reward, mismatch_penalty, Xdrop-2);
323 +  if (adapters3.Count()>0)
324 +    gxmem_r=new CGreedyAlignData(match_reward, mismatch_penalty, Xdrop);
325 +
326    while ((infile=args.nextNonOpt())!=NULL) {
327 +    //for each input file
328      int incounter=0; //counter for input reads
329      int outcounter=0; //counter for output reads
330      int trash_s=0; //too short from the get go
331      int trash_Q=0;
332      int trash_N=0;
333      int trash_D=0;
334 +    int trash_poly=0;
335      int trash_A3=0;
336      int trash_A5=0;
337      s=infile;
# Line 450 | Line 356
356         int a5=0, a3=0, b5=0, b3=0;
357         char tcode=0, tcode2=0;
358         tcode=process_read(seqid, rseq, rqv, a5, a3);
359 <       //if (!doCollapse) {
454 <         if (fq2!=NULL) {
359 >       if (fq2!=NULL) {
360              getFastxRec(*fq2, rseq2, rqv2, seqid2, seqinfo2, infname2);
361              if (seqid.substr(0,seqid.length()-1)!=seqid2.substr(0,seqid2.length()-1)) {
362                 GError("Error: no paired match for read %s vs %s (%s,%s)\n",
# Line 483 | Line 388
388                 int nocounter=0;
389                 writeRead(f_out2, seqid2, seqinfo2, rseq2, rqv2, nocounter);
390                 }
391 <            } //paired read
487 <       // }
391 >            } //pair read
392         if (tcode>1) { //trashed
393 +         #ifdef GDEBUG
394 +         GMessage(" !!!!TRASH => 'N'\n");
395 +         #endif
396            if (tcode=='s') trash_s++;
397 +          else if (tcode=='A' || tcode=='T') trash_poly++;
398              else if (tcode=='Q') trash_Q++;
399                else if (tcode=='N') trash_N++;
400                 else if (tcode=='D') trash_D++;
# Line 499 | Line 407
407              rseq=rseq.substr(a5,a3-a5+1);
408              if (!rqv.is_empty()) rqv=rqv.substr(a5,a3-a5+1);
409              }
410 +         #ifdef GDEBUG
411 +            GMessage("  After trimming:\n");
412 +            GMessage("%s\n",rseq.chars());
413 +         #endif
414            writeRead(f_out, seqid, seqinfo, rseq, rqv, outcounter);
415            }
416         } //for each fastq record
# Line 526 | Line 438
438                 }
439              }
440           outcounter++;
441 <         if (qd->count>maxdup_count) {
441 >         if (qd->count>maxdup_count) {
442              maxdup_count=qd->count;
443              maxdup_seq=seq;
444              }
445           if (isfasta) {
446             if (prefix.is_empty()) {
447 <             fprintf(f_out, ">%s_x%d\n%s\n", qd->firstname, qd->count,
447 >             fprintf(f_out, ">%s_x%d\n%s\n", qd->firstname, qd->count,
448                             rseq.chars());
449               }
450             else { //use custom read name
# Line 543 | Line 455
455           else { //fastq format
456            if (convert_phred) convertPhred(qd->qv, qd->len);
457            if (prefix.is_empty()) {
458 <            fprintf(f_out, "@%s_x%d\n%s\n+\n%s\n", qd->firstname, qd->count,
458 >            fprintf(f_out, "@%s_x%d\n%s\n+\n%s\n", qd->firstname, qd->count,
459                             rseq.chars(), qd->qv);
460              }
461            else { //use custom read name
# Line 579 | Line 491
491           GMessage("         Trashed by N%%:%9d\n", trash_N);
492         if (trash_Q>0)
493           GMessage("Trashed by low quality:%9d\n", trash_Q);
494 +       if (trash_poly>0)
495 +         GMessage("   Trashed by poly-A/T:%9d\n", trash_poly);
496         if (trash_A5>0)
497           GMessage(" Trashed by 5' adapter:%9d\n", trash_A5);
498         if (trash_A3>0)
# Line 590 | Line 504
504      FWCLOSE(f_out);
505      FWCLOSE(f_out2);
506     } //while each input file
507 <
507 > delete gxmem_l;
508 > delete gxmem_r;
509   //getc(stdin);
510   }
511  
# Line 605 | Line 520
520     const char* seq;
521     bool valid;
522     NData() {
523 +    seqlen=0;
524      NCount=0;
525      end5=0;
526      end3=0;
# Line 635 | Line 551
551       perc_N=(n*100.0)/(end5-end3+1);
552       }
553   };
554 <
554 >
555   static NData feat;
556   int perc_lenN=12; // incremental distance from ends, in percentage of
557            // sequence length, where N-trimming is done (default:12 %) (autolimited to 20)
558 <          
558 >
559   void N_analyze(int l5, int l3, int p5, int p3) {
560   /* assumes feat was filled properly */
561   int old_dif, t5,t3,v;
562   if (l3<l5+2 || p5>p3 ) {
563     feat.end5=l5+1;
564     feat.end3=l3+1;
565 <   return;
565 >   return;
566     }
567  
568   t5=feat.NPos[p5]-l5;
569   t3=l3-feat.NPos[p3];
570   old_dif=p3-p5;
571   v=(int)((((double)(l3-l5))*perc_lenN)/100);
572 < if (v>20) v=20; /* enforce N-search limit for very long reads */
572 > if (v>20) v=20; /* enforce N-search limit for very long reads */
573   if (t5 < v ) {
574     l5=feat.NPos[p5]+1;
575     p5++;
# Line 670 | Line 586
586             feat.end3=l3+1;
587             return;
588             }
589 <    else
589 >    else
590        N_analyze(l5,l3, p5,p3);
591   }
592  
# Line 711 | Line 627
627   feat.init(rseq);
628   l5=feat.end5-1;
629   l3=feat.end3-1;
630 < N_analyze(feat.end5-1, feat.end3-1, 0, feat.NCount-1);
630 > N_analyze(feat.end5-1, feat.end3-1, 0, feat.NCount-1);
631   if (l5==feat.end5-1 && l3==feat.end3-1) {
632      if (feat.perc_N>max_perc_N) {
633             feat.valid=false;
# Line 729 | Line 645
645     return true;
646     }
647   feat.N_calc();
648 <
648 >
649   if (feat.perc_N>max_perc_N) {
650        feat.valid=false;
651        l3=l5+1;
# Line 741 | Line 657
657   //--------------- dust functions ----------------
658   class DNADuster {
659   public:
660 <  int dustword;
661 <  int dustwindow;
662 <  int dustwindow2;
660 >  int dustword;
661 >  int dustwindow;
662 >  int dustwindow2;
663    int dustcutoff;
664    int mv, iv, jv;
665    int counts[32*32*32];
# Line 838 | Line 754
754                      }
755             }
756           }
757 < //return first;
757 > //return first;
758   }
759   };
760  
# Line 856 | Line 772
772   return ncount;
773   }
774  
775 + struct SLocScore {
776 +  int pos;
777 +  int score;
778 +  SLocScore(int p=0,int s=0) {
779 +    pos=p;
780 +    score=s;
781 +    }
782 +  void set(int p, int s) {
783 +    pos=p;
784 +    score=s;
785 +    }
786 +  void add(int p, int add) {
787 +    pos=p;
788 +    score+=add;
789 +    }
790 + };
791  
792 < // ------------------ adapter matching - simple k-mer seed & extend, no indels for now
793 < //when a k-mer match is found, simply try to extend the alignment using a drop-off scheme
794 < //check minimum score and
795 < //for 3' adapter trimming:
796 < //     require that the right end of the alignment for either the adaptor OR the read must be
797 < //     < 3 distance from its right end
798 < // for 5' adapter trimming:
799 < //     require that the left end of the alignment for either the adaptor OR the read must
800 < //     be at coordinate < 3 from start
801 <
802 < bool extendMatch(const char* a, int alen, int ai,
803 <                 const char* b, int blen, int bi, int mlen, int& l5, int& l3, CSegChain& segs, bool end5=false) {
804 < //so the alignment starts at ai in a, bi in b, with a perfect match of length mlen
805 < #ifdef DEBUG
806 < GStr dbg(b);
875 < #endif
876 < //if (debug) {
877 < //  GMessage(">> in %s\n\textending hit: %s at position %d\n", a, (dbg.substr(bi, mlen)).chars(), ai);
878 < //  }
879 < int a_l=ai; //alignment coordinates on a
880 < int a_r=ai+mlen-1;
881 < int b_l=bi; //alignment coordinates on b
882 < int b_r=bi+mlen-1;
883 < int ai_maxscore=ai;
884 < int bi_maxscore=bi;
885 < int score=mlen*a_m_score;
886 < int maxscore=score;
887 < int mism5score=a_mis_score;
888 < if (end5 && ai<(alen>>1)) mism5score-=2; // increase penalty for mismatches at 5' end
889 < //try to extend to the left first, if possible
890 < while (ai>0 && bi>0) {
891 <   ai--;
892 <   bi--;
893 <   score+= (a[ai]==b[bi])? a_m_score : mism5score;
894 <   if (score>maxscore) {
895 <       ai_maxscore=ai;
896 <       bi_maxscore=bi;
897 <       maxscore=score;
898 <       }
899 <     else if (maxscore-score>a_dropoff_score) break;
900 <   }
901 < a_l=ai_maxscore;
902 < b_l=bi_maxscore;
903 < //if (debug) GMessage("  after l-extend: %*s%s\t\t(score=%d)\n",a_l," ",dbg.substr(b_l,b_r-b_l+1).chars(),maxscore);
904 < //now extend to the right
905 < ai_maxscore=a_r;
906 < bi_maxscore=b_r;
907 < ai=a_r;
908 < bi=b_r;
909 < score=maxscore;
910 < //sometimes there are extra AAAAs at the end of the read, ignore those
911 < if (strcmp(&a[alen-4],"AAAA")==0) {
912 <    alen-=3;
913 <    while (a[alen-1]=='A' && alen>ai) alen--;
914 <    }
915 < while (ai<alen-1 && bi<blen-1) {
916 <   ai++;
917 <   bi++;
918 <   //score+= (a[ai]==b[bi])? a_m_score : a_mis_score;
919 <   if (a[ai]==b[bi]) { //match
920 <      score+=a_m_score;
921 <      if (ai>=alen-2) {
922 <           score+=a_m_score-(alen-ai-1);
923 <           }
792 > bool trim_poly3(GStr &seq, int &l5, int &l3, const char* poly_seed) {
793 > if (!doPolyTrim) return false;
794 > int rlen=seq.length();
795 > l5=0;
796 > l3=rlen-1;
797 > int32 seedVal=*(int32*)poly_seed;
798 > char polyChar=poly_seed[0];
799 > //assumes N trimming was already done
800 > //so a poly match should be very close to the end of the read
801 > // -- find the initial match (seed)
802 > int lmin=GMAX((rlen-16), 0);
803 > int li;
804 > for (li=rlen-4;li>lmin;li--) {
805 >   if (seedVal==*(int*)&(seq[li])) {
806 >      break;
807        }
925    else { //mismatch
926      score+=a_mis_score;
927      }  
928   if (score>maxscore) {
929       ai_maxscore=ai;
930       bi_maxscore=bi;
931       maxscore=score;
932       }
933     else if (maxscore-score>a_dropoff_score) break;
808     }
809 <  a_r=ai_maxscore;
810 <  b_r=bi_maxscore;
811 <  int a_ovh3=alen-a_r-1;
812 <  int b_ovh3=blen-b_r-1;
813 <  int mmovh3=(a_ovh3<b_ovh3)? a_ovh3 : b_ovh3;
814 <  int mmovh5=(a_l<b_l)? a_l : b_l;
815 <  //if (debug) GMessage("  after r-extend: %*s%s\t\t(score=%d)\n",a_l," ",dbg.substr(b_l,b_r-b_l+1).chars(),maxscore);
816 < #ifdef DEBUG
817 <  if (debug) GMessage("     extended to: %*s\n",a_r+1,dbg.substr(b_l,b_r-b_l+1).chars());
818 < #endif
819 <  if (maxscore>=a_min_score && mmovh3<2 && mmovh5<2) {
820 <     if (a_l<a_ovh3) {
821 <        //adapter closer to the left end (typical for 5' adapter)
822 <        l5=a_r+1;
823 <        l3=alen-1;
824 <        }
825 <      else {
826 <        //adapter matching at the right end (typical for 3' adapter)
953 <        l5=0;
954 <        l3=a_l-1;
809 > if (li<=lmin) return false;
810 > //seed found, try to extend it both ways
811 > //extend right
812 > int ri=li+3;
813 > SLocScore loc(ri, poly_m_score<<2);
814 > SLocScore maxloc(loc);
815 > //extend right
816 > while (ri<rlen-1) {
817 >   ri++;
818 >   if (seq[ri]==polyChar) {
819 >                loc.add(ri,poly_m_score);
820 >                }
821 >   else if (seq[ri]=='N') {
822 >                loc.add(ri,0);
823 >                }
824 >   else { //mismatch
825 >        loc.add(ri,poly_mis_score);
826 >        if (maxloc.score-loc.score>poly_dropoff_score) break;
827          }
828 <     return true;
829 <     }
830 < else { //keep this segment pair for later (gapped alignment)
959 <   segs.addSegPair(new CSegPair(a_l+1, a_r+1, b_l+1, b_r+1, maxscore));
960 <   //this will also update min & max coordinates in segs (segs.astart, .aend, .bstart, .bend)
961 <   }
962 <  //do not trim:
963 <  l5=0;
964 <  l3=alen-1;
965 <  return false;
966 < }
967 <
968 < /*
969 < int getWordValue(const char* s, int wlen) {
970 < int r=0;
971 < while (wlen--) { r+=(((int)s[wlen])<<wlen) }
972 < return r;
973 < }
974 < */
975 < int get3mer_value(const char* s) {
976 < return (s[0]<<16)+(s[1]<<8)+s[2];
977 < }
978 <
979 < int w3_match(int qv, const char* str, int slen, int start_index=0) {
980 < if (start_index>=slen || start_index<0) return -1;
981 < for (int i=start_index;i<slen-3;i++) {
982 <   int rv=get3mer_value(str+i);
983 <   if (rv==qv) return i;
984 <   }
985 < return -1;
986 < }
987 <
988 < int w3_rmatch(int qv, const char* str, int slen, int end_index=-1) {
989 < if (end_index>=slen) return -1;
990 < if (end_index<0) end_index=slen-1;
991 < for (int i=end_index-2;i>=0;i--) {
992 <   int rv=get3mer_value(str+i);
993 <   if (rv==qv) return i;
994 <   }
995 < return -1;
996 < }
997 <
998 < int fast4match(int32 qv, const char* str, int slen, int start_index=0) {
999 < if (start_index>=slen || start_index<0) return -1;
1000 < for (int i=start_index;i<slen-4;i++) {
1001 <   int32* rv=(int32*)(str+i);
1002 <   if (*rv==qv) return i;
1003 <   }
1004 < return -1;
1005 < }
1006 <
1007 < int fast4rmatch(int32 qv, const char* str, int slen, int end_index=-1) {
1008 < if (end_index>=slen) return -1;
1009 < if (end_index<0) end_index=slen-1;
1010 < for (int i=end_index-3;i>=0;i--) {
1011 <   int32* rv=(int32*)(str+i);
1012 <   if (*rv==qv) return i;
1013 <   }
1014 < return -1;
1015 < }
1016 <
1017 < #ifdef DEBUG
1018 < void dbgPrintChain(CSegChain& chain, const char* aseq) {
1019 <  GStr s(aseq);
1020 <  for (int i=0;i<chain.Count();i++) {
1021 <   CSegPair& seg=*chain[i];
1022 <   GMessage("  dbg chain seg%d: %*s [%d-%d:%d-%d]\n",i,seg.a.start-1+seg.len(),
1023 <            s.substr(seg.b.start-1, seg.len()).chars(), seg.b.start,seg.b.end,seg.a.start,seg.a.end);
828 >   if (maxloc.score<=loc.score) {
829 >      maxloc=loc;
830 >      }
831     }
832 + ri=maxloc.pos;
833 + if (ri<rlen-6) return false; //no trimming wanted, too far from 3' end
834 + //ri = right boundary for the poly match
835 + //extend left
836 + loc.set(li, maxloc.score);
837 + maxloc.pos=li;
838 + while (li>0) {
839 +    li--;
840 +    if (seq[li]==polyChar) {
841 +                 loc.add(li,poly_m_score);
842 +                 }
843 +    else if (seq[li]=='N') {
844 +                 loc.add(li,0);
845 +                 }
846 +    else { //mismatch
847 +         loc.add(li,poly_mis_score);
848 +         if (maxloc.score-loc.score>poly_dropoff_score) break;
849 +         }
850 +    if (maxloc.score<=loc.score) {
851 +       maxloc=loc;
852 +       }
853 +    }
854 + li=maxloc.pos;
855 + if ((maxloc.score==poly_minScore && ri==rlen-1) ||
856 +    (maxloc.score>poly_minScore && ri>=rlen-3) ||
857 +    (maxloc.score>(poly_minScore*3) && ri>=rlen-8)) {
858 +  //trimming this li-ri match at 3' end
859 +    l3=li-1;
860 +    if (l3<0) l3=0;
861 +    return true;
862 +    }
863 + return false;
864   }
1026 #endif
865  
866 < bool trim_adapter3(GStr& seq, int&l5, int &l3) {
866 > bool trim_poly5(GStr &seq, int &l5, int &l3, const char* poly_seed) {
867 > if (!doPolyTrim) return false;
868   int rlen=seq.length();
869   l5=0;
870   l3=rlen-1;
871 < //first try a full match, we might get lucky
872 < int fi=-1;
873 < if ((fi=seq.index(adapter3))>=0) {
874 <   if (fi<rlen-fi-a3len) {//match is closer to the right end
875 <      l5=fi+a3len;
876 <      l3=rlen-1;
877 <      }
878 <    else {
879 <      l5=0;
880 <      l3=fi-1;
871 > int32 seedVal=*(int32*)poly_seed;
872 > char polyChar=poly_seed[0];
873 > //assumes N trimming was already done
874 > //so a poly match should be very close to the end of the read
875 > // -- find the initial match (seed)
876 > int lmax=GMIN(12, rlen-4);//how far from 5' end to look for 4-mer seeds
877 > int li;
878 > for (li=0;li<=lmax;li++) {
879 >   if (seedVal==*(int*)&(seq[li])) {
880 >      break;
881        }
1043   return true;
882     }
883 < #ifdef DEBUG
884 < if (debug) GMessage(">TRIM3 >>   Read: %s\n",seq.chars());
885 < #endif
886 <
887 < //also, for fast detection of other adapter-only reads that start past
888 < // the beginning of the adapter sequence, try to see if the first a3len-4
889 < // bases of the read are a substring of the adapter
890 < if (rlen>a3len-3) {
891 <   GStr rstart=seq.substr(1,a3len-4);
892 <   if ((fi=adapter3.index(rstart))>=0) {
893 <     l3=rlen-1;
894 <     l5=a3len-4;
895 <     while (fi+l5<a3len && l5<l3 && adapter3[fi+l5]==seq[l5]) l5++;
896 <     return true;
897 <     }
898 <  }
899 < CSegChain a3segs; //no chains here, just an ordered collection of segment pairs
900 <  //check the easy cases - 11 bases exact match at the end
901 < int fdlen=11;
902 <  if (a3len<16) {
1065 <   fdlen=a3len>>1;
1066 <   }
1067 < if (fdlen>4) {
1068 <     //check if we're lucky enough to have the last 11 bases of the read a part of the adapter
1069 <     GStr rstart=seq.substr(-fdlen-3,fdlen);
1070 <     if ((fi=adapter3.index(rstart))>=0) {
1071 < #ifdef DEBUG
1072 <       if (debug) GMessage("  W11match found: %*s\n", rlen-3, (adapter3.substr(fi,fdlen)).chars());
1073 < #endif
1074 <       if (extendMatch(seq.chars(), rlen, rlen-fdlen-3,
1075 <                     adapter3.chars(), a3len, fi,  fdlen, l5,l3, a3segs))
1076 <            return true;
1077 <       }
1078 <     //another easy case: first 11 characters of the adaptor found as a substring of the read
1079 <     GStr bstr=adapter3.substr(0, fdlen);
1080 <     if ((fi=seq.rindex(bstr))>=0) {
1081 < #ifdef DEBUG
1082 <       if (debug) GMessage("  A11match found: %*s\n", fi+fdlen, bstr.chars());
1083 < #endif
1084 <       if (extendMatch(seq.chars(), rlen, fi,
1085 <                     adapter3.chars(), a3len, 0,  fdlen, l5,l3, a3segs))
1086 <            return true;
883 > if (li>lmax) return false;
884 > //seed found, try to extend it both ways
885 > //extend left
886 > int ri=li+3; //save rightmost base of the seed
887 > SLocScore loc(li, poly_m_score<<2);
888 > SLocScore maxloc(loc);
889 > while (li>0) {
890 >    li--;
891 >    if (seq[li]==polyChar) {
892 >                 loc.add(li,poly_m_score);
893 >                 }
894 >    else if (seq[li]=='N') {
895 >                 loc.add(li,0);
896 >                 }
897 >    else { //mismatch
898 >         loc.add(li,poly_mis_score);
899 >         if (maxloc.score-loc.score>poly_dropoff_score) break;
900 >         }
901 >    if (maxloc.score<=loc.score) {
902 >       maxloc=loc;
903         }
904 <     } //tried to match 11 bases first
905 <    
906 < //no easy cases, so let's do the wmer hashing for the first 12 bases of the adaptor
907 < //-- only extend if the match is in the 3' (ending) region of the read
908 < int wordSize=3;
909 < int hlen=12;
910 < if (hlen>a3len-wordSize) hlen=a3len-wordSize;
911 < int imin=rlen>>1; //last half of the read, left boundary for the wmer match
912 < if (imin<a3len) { imin=GMIN(a3len, rlen-wordSize); }
913 < imin=rlen-imin;
914 < for (int iw=0;iw<hlen;iw++) {
915 <   //int32* qv=(int32*)(adapter3.chars()+iw);
916 <   int qv=get3mer_value(adapter3.chars()+iw);
917 <   fi=-1;
918 <   //while ((fi=fast4rmatch(*qv, seq.chars(), rlen, fi))>=0 && fi>=imin) {
919 <   while ((fi=w3_rmatch(qv, seq.chars(), rlen, fi))>=0 && fi>=imin) {
920 <     //GMessage(" ... fi=%d after w3_rmatch() (imin=%d)\n", fi, imin);
921 <
922 < #ifdef DEBUG
1107 <     if (debug) GMessage("    Wmatch found: %*s\n", fi+wordSize, (adapter3.substr(iw,wordSize)).chars());
1108 < #endif
1109 <     if (extendMatch(seq.chars(), rlen, fi, adapter3.chars(),
1110 <                   a3len, iw, wordSize, l5,l3, a3segs)) return true;
1111 <     fi--;
1112 <     if (fi<imin) break;
1113 <     }
1114 <   } //for each wmer in the first hlen bases of the adaptor
1115 < /*
1116 < //couldn't find a good trimming extension, hash 12 more bases of the adapter to collect more segment pairs there
1117 < //but only do this if we already have segment pairs collected in the last 12 bases of the adapter
1118 < if (a3segs.bstart>3 || a3segs.bend<(uint)(hlen-wordSize)) return false;
1119 < int hlen2=a3len-wordSize;
1120 < //if (hlen2>a3len-4) hlen2=a3len-4;
1121 < if (hlen2>hlen) {
1122 < #ifdef DEBUG
1123 <     if (debug && a3segs.Count()>0) {
1124 <        GMessage("  >>>>>2nd. hash: %s\n",seq.chars());
904 >    }
905 > li=maxloc.pos;
906 > if (li>5) return false; //no trimming wanted, too far from 5' end
907 > //li = right boundary for the poly match
908 >
909 > //extend right
910 > loc.set(ri, maxloc.score);
911 > maxloc.pos=ri;
912 > while (ri<rlen-1) {
913 >   ri++;
914 >   if (seq[ri]==polyChar) {
915 >                loc.add(ri,poly_m_score);
916 >                }
917 >   else if (seq[ri]=='N') {
918 >                loc.add(ri,0);
919 >                }
920 >   else { //mismatch
921 >        loc.add(ri,poly_mis_score);
922 >        if (maxloc.score-loc.score>poly_dropoff_score) break;
923          }
924 < #endif
925 <     for (int iw=hlen;iw<hlen2;iw++) {
1128 <         //int* qv=(int32 *)(adapter3.chars()+iw);
1129 <         int qv=get3mer_value(adapter3.chars()+iw);
1130 <         fi=-1;
1131 <         //while ((fi=fast4rmatch(*qv, seq.chars(), rlen, fi))>=0 && fi>=imin) {
1132 <         while ((fi=w3_rmatch(qv, seq.chars(), rlen, fi))>=0 && fi>=imin) {
1133 <           extendMatch(seq.chars(), rlen, fi, adapter3.chars(),
1134 <                         a3len, iw, wordSize, l5,l3, a3segs);
1135 <           fi--;
1136 <           if (fi<imin) break;
1137 <           }
1138 <         } //for each wmer between hlen2 and hlen bases of the adaptor
1139 <     }
1140 < //lastly, analyze collected a3segs for a possible gapped alignment:
1141 < GList<CSegChain> segchains(false,true,false);
1142 < #ifdef DEBUG
1143 < if (debug && a3segs.Count()>0) {
1144 <   GMessage(">>>>>>>>>   Read: %s\n",seq.chars());
1145 <   }
1146 < #endif
1147 < for (int i=0;i<a3segs.Count();i++) {
1148 <   if (a3segs[i]->chain==NULL) {
1149 <       if (a3segs[i]->b.start>3) continue; //don't start a hopeless chain
1150 <       CSegChain* newchain=new CSegChain();
1151 <       newchain->setFreeItem(false);
1152 <       newchain->addSegPair(a3segs[i]);
1153 <       a3segs[i]->chain=newchain;
1154 <       segchains.Add(newchain); //just to free them when done
1155 <       }
1156 <   for (int j=i+1;j<a3segs.Count();j++) {
1157 <      CSegChain* chain=a3segs[i]->chain;
1158 <      if (chain->extendChain(a3segs[j])) {
1159 <          a3segs[j]->chain=chain;
1160 < #ifdef DEBUG
1161 <          if (debug) dbgPrintChain(*chain, adapter3.chars());
1162 < #endif      
1163 <          //save time by checking here if the extended chain is already acceptable for trimming
1164 <          if (chain->aend>(uint)(rlen-4) && chain->bstart<4 && chain->score>a_min_chain_score) {
1165 <            l5=0;
1166 <            l3=chain->astart-2;
1167 < #ifdef DEBUG
1168 <          if (debug && a3segs.Count()>0) {
1169 <            GMessage(">>> >> trimmed-3: %*s\n",l3-l5+1,seq.substr(l5,l3-l5+1).chars());
1170 <            }
1171 < #endif
1172 <            return true;
1173 <            }
1174 <          } //chain can be extended
924 >   if (maxloc.score<=loc.score) {
925 >      maxloc=loc;
926        }
927 <   } //collect segment alignments into chains
928 < */  
929 < return false; //no adapter parts found
927 >   }
928 > ri=maxloc.pos;
929 > if ((maxloc.score==poly_minScore && li==0) ||
930 >     (maxloc.score>poly_minScore && li<2)
931 >     || (maxloc.score>(poly_minScore*3) && li<8)) {
932 >    //adjust l5 to reflect this trimming of 5' end
933 >    l5=ri+1;
934 >    if (l5>rlen-1) l5=rlen-1;
935 >    return true;
936 >    }
937 > return false;
938 > }
939 >
940 > bool trim_adapter3(GStr& seq, int&l5, int &l3) {
941 > if (adapters3.Count()==0) return false;
942 > int rlen=seq.length();
943 > l5=0;
944 > l3=rlen-1;
945 > bool trimmed=false;
946 > GStr wseq(seq.chars());
947 > int wlen=rlen;
948 > for (int ai=0;ai<adapters3.Count();ai++) {
949 >  //if (adapters3[ai].is_empty()) continue;
950 >  int alen=adapters3[ai].seq.length();
951 >  GStr& aseq=adapters3[ai].seq;
952 >  GXAlnInfo* r_bestaln=match_RightEnd(aseq.chars(), alen, adapters3[ai].pz,
953 >                            wseq.chars(), wlen, gxmem_r, 74);
954 >  if (r_bestaln) {
955 >     trimmed=true;
956 >     //keep unmatched region on the left, if any
957 >     l3-=(wlen-r_bestaln->sl+1);
958 >     delete r_bestaln;
959 >     if (l3<0) l3=0;
960 >     if (l3-l5+1<min_read_len) return true;
961 >     wseq=seq.substr(l5,l3-l5+1);
962 >     wlen=wseq.length();
963 >     }
964 >  }//for each 5' adapter
965 >  return trimmed;
966   }
967  
968   bool trim_adapter5(GStr& seq, int&l5, int &l3) {
969 < //if (debug) GMessage("trim_adapter5 on: %s\n", seq.chars());
969 > if (adapters5.Count()==0) return false;
970   int rlen=seq.length();
971   l5=0;
972   l3=rlen-1;
973 < //try to see if adapter is fully included in the read
974 < int fi=-1;
975 < if ((fi=seq.index(adapter5))>=0) {
976 <   if (fi<rlen-fi-a5len) {//match is closer to the right end
977 <      l5=fi+a5len;
978 <      l3=rlen-1;
979 <      }
980 <    else {
981 <      l5=0;
982 <      l3=fi-1;
983 <      }
984 <   return true;
985 <   }
986 < #ifdef DEBUG
987 < if (debug) GMessage(">TRIM5 >>   Read: %s\n",seq.chars());
988 < #endif
989 <
1203 < CSegChain a5segs(true); //list of segment pairs to analyze later if no extendMatch succeeded
1204 <
1205 < //try the easy way out first - look for an exact match of 11 bases
1206 < int fdlen=11;
1207 <  if (a5len<16) {
1208 <   fdlen=a5len>>1;
1209 <   }
1210 < if (fdlen>4) {
1211 <     GStr rstart=seq.substr(1,fdlen); //skip the first base as it's sometimes bogus
1212 <     if ((fi=adapter5.index(rstart))>=0) {
1213 < #ifdef DEBUG
1214 <       if (debug) GMessage("  W11match found: %*s\n", 1+fdlen, (adapter3.substr(fi,fdlen)).chars());
1215 < #endif
1216 <       if (extendMatch(seq.chars(), rlen, 1,
1217 <                     adapter5.chars(), a5len, fi,  fdlen, l5,l3, a5segs, true))
1218 <           return true;
1219 <       }
1220 <     //another easy case: last 11 characters of the adaptor found as a substring of the read
1221 <     GStr bstr=adapter5.substr(-fdlen);
1222 <     if ((fi=seq.index(bstr))>=0) {
1223 < #ifdef DEBUG
1224 <       if (debug) GMessage("  A11match found: %*s\n", fi+fdlen, bstr.chars());
1225 < #endif
1226 <       if (extendMatch(seq.chars(), rlen, fi,
1227 <                     adapter5.chars(), a5len, a5len-fdlen,  fdlen, l5,l3,a5segs,true))
1228 <          return true;
1229 <       }
1230 <     } //tried to matching at most 11 bases first
1231 <    
1232 < //-- no easy cases, do the wmer hashing for the last 12 bases of the adaptor
1233 < //-- only extend a wmer if it matches in the 5' (beginning) region of the read
1234 < int wordSize=3;
1235 < int hlen=12;
1236 < if (hlen>a5len-wordSize) hlen=a5len-wordSize;
1237 < int imax=rlen>>1; //first half of the read, right boundary for the wmer match
1238 < if (imax<a5len) { imax=GMIN(a5len, rlen-wordSize); }
1239 < for (int iw=0;iw<=hlen;iw++) {
1240 <   int apstart=a5len-iw-wordSize;
1241 <   fi=0;
1242 <   //int* qv=(int32 *)(adapter5.chars()+apstart);
1243 <   int qv=get3mer_value(adapter5.chars()+apstart);
1244 <   //while ((fi=fast4match(*qv, seq.chars(), rlen, fi))>=0 && fi<=imax) {
1245 <   while ((fi=w3_match(qv, seq.chars(), rlen, fi))>=0 && fi<=imax) {
1246 < #ifdef DEBUG
1247 <     if (debug) GMessage("    Wmatch found: %*s\n", fi+wordSize, (adapter5.substr(apstart,wordSize)).chars());
1248 < #endif
1249 <     if (extendMatch(seq.chars(), rlen, fi, adapter5.chars(),
1250 <                a5len, apstart, wordSize, l5,l3, a5segs, true)) return true;
1251 <     fi++;
1252 <     if (fi>imax) break;
1253 <     }
1254 <   } //for each wmer in the last hlen bases of the adaptor
1255 < /*
1256 <
1257 < //couldn't find a good trimming extension, hash 12 more bases of the adapter to collect more segment pairs there
1258 < //but only do this if we already have segment pairs collected in the last 12 bases of the adapter
1259 < if (a5segs.bend<(uint)(a5len-3) || a5segs.bstart>(uint)(a5len-hlen+4)) return false;
1260 < int hlen2=a5len-wordSize;
1261 < //if (hlen2>a5len-wordSize) hlen2=a5len-wordSize;
1262 < #ifdef DEBUG
1263 <      if (debug && a5segs.Count()>0) {
1264 <        GMessage("  >>>>>2nd. hash: %s\n",seq.chars());
1265 <        }
1266 < #endif
1267 < if (hlen2>hlen) {
1268 <     for (int iw=hlen+1;iw<=hlen2;iw++) {
1269 <         int apstart=a5len-iw-wordSize;
1270 <         fi=0;
1271 <         //int* qv=(int32 *)(adapter5.chars()+apstart);
1272 <         int qv=get3mer_value(adapter5.chars()+apstart);
1273 <         //while ((fi=fast4match(*qv, seq.chars(), rlen, fi))>=0 && fi<=imax) {
1274 <         while ((fi=w3_match(qv, seq.chars(), rlen, fi))>=0 && fi<=imax) {
1275 <           extendMatch(seq.chars(), rlen, fi, adapter5.chars(),
1276 <                      a5len, apstart, wordSize, l5,l3, a5segs, true);
1277 <           fi++;
1278 <           if (fi>imax) break;
1279 <           }
1280 <         } //for each wmer between hlen2 and hlen bases of the adaptor
973 > bool trimmed=false;
974 > GStr wseq(seq.chars());
975 > int wlen=rlen;
976 > for (int ai=0;ai<adapters5.Count();ai++) {
977 >  //if (adapters5[ai].is_empty()) continue;
978 >  int alen=adapters5[ai].seq.length();
979 >  GStr& aseq=adapters5[ai].seq;
980 >  GXAlnInfo* l_bestaln=match_LeftEnd(aseq.chars(), alen, adapters5[ai].pz,
981 >                 wseq.chars(), wlen, gxmem_l, 84);
982 >  if (l_bestaln) {
983 >     trimmed=true;
984 >     l5+=l_bestaln->sr;
985 >     delete l_bestaln;
986 >     if (l5>=rlen) l5=rlen-1;
987 >     if (l3-l5+1<min_read_len) return true;
988 >     wseq=seq.substr(l5,l3-l5+1);
989 >     wlen=wseq.length();
990       }
991 < if (a5segs.bend<(uint)(a5len-3) || a5segs.bstart>(uint)(a5len-hlen+4)) return false;
992 < // lastly, analyze collected a5segs for a possible gapped alignment:
993 < GList<CSegChain> segchains(false,true,false);
1285 < #ifdef DEBUG
1286 < if (debug && a5segs.Count()>0) {
1287 <   GMessage(">>>>>>>>>   Read: %s\n",seq.chars());
1288 <   }
1289 < #endif
1290 < for (int i=0;i<a5segs.Count();i++) {
1291 <   if (a5segs[i]->chain==NULL) {
1292 <       if (a5segs[i]->b.end<(int)(a5len-4)) continue; //don't start a hopeless chain
1293 <       CSegChain* newchain=new CSegChain(true);
1294 <       newchain->setFreeItem(false);
1295 <       newchain->addSegPair(a5segs[i]);
1296 <       a5segs[i]->chain=newchain;
1297 <       segchains.Add(newchain); //just to free them when done
1298 <       }
1299 <   for (int j=i+1;j<a5segs.Count();j++) {
1300 <      CSegChain* chain=a5segs[i]->chain;
1301 <      if (chain->extendChain(a5segs[j])) {
1302 <         a5segs[j]->chain=chain;
1303 < #ifdef DEBUG
1304 <         if (debug) dbgPrintChain(*chain, adapter5.chars());
1305 < #endif      
1306 <      //save time by checking here if the extended chain is already acceptable for trimming
1307 <         if (chain->bend>(uint)(a5len-3) && chain->astart<4 && chain->score>a_min_chain_score) {
1308 <            l5=chain->aend;
1309 <            l3=rlen-1;
1310 <            return true;
1311 <            }
1312 <         } //chain can be extended
1313 <      }
1314 <   } //collect segment alignments into chains
1315 < */
1316 < return false; //no adapter parts found
1317 < }
991 >  }//for each 5' adapter
992 >  return trimmed;
993 > }
994  
995 < //convert qvs to/from phred64 from/to phread33
995 > //convert qvs to/from phred64 from/to phread33
996   void convertPhred(GStr& q) {
997   for (int i=0;i<q.length();i++) q[i]+=qv_cvtadd;
998   }
# Line 1325 | Line 1001
1001   for (int i=0;i<len;i++) q[i]+=qv_cvtadd;
1002   }
1003  
1004 < bool getFastxRec(GLineReader& fq, GStr& rseq, GStr& rqv,
1004 > bool getFastxRec(GLineReader& fq, GStr& rseq, GStr& rqv,
1005            GStr& rname, GStr& rinfo, GStr& infname) {
1006   rseq="";
1007   rqv="";
# Line 1341 | Line 1017
1017        } //raw qseq format
1018   else { // FASTQ or FASTA */
1019   isfasta=(l[0]=='>');
1020 < if (!isfasta && l[0]!='@') GError("Error: fasta/fastq record marker not found(%s)\n%s\n",
1020 > if (!isfasta && l[0]!='@') GError("Error: fasta/fastq record marker not found(%s)\n%s\n",
1021        infname.chars(), l);
1022   GStr s(l);
1023   rname=&(l[1]);
1024   for (int i=0;i<rname.length();i++)
1025 <    if (rname[i]<=' ') {
1026 <       if (i<rname.length()-2) rinfo=rname.substr(i+1);
1027 <       rname.cut(i);
1028 <       break;
1025 >    if (rname[i]<=' ') {
1026 >       if (i<rname.length()-2) rinfo=rname.substr(i+1);
1027 >       rname.cut(i);
1028 >       break;
1029         }
1030    //now get the sequence
1031 < if ((l=fq.getLine())==NULL)
1031 > if ((l=fq.getLine())==NULL)
1032        GError("Error: unexpected EOF after header for read %s (%s)\n",
1033                     rname.chars(), infname.chars());
1034   rseq=l; //this must be the DNA line
1035   while ((l=fq.getLine())!=NULL) {
1036        //seq can span multiple lines
1037        if (l[0]=='>' || l[0]=='+') {
1038 <           fq.pushBack();
1038 >           fq.pushBack();
1039             break; //
1040             }
1041        rseq+=l;
1042 <      } //check for multi-line seq
1042 >      } //check for multi-line seq
1043   if (!isfasta) { //reading fastq quality values, which can also be multi-line
1044      if ((l=fq.getLine())==NULL)
1045          GError("Error: unexpected EOF after sequence for %s\n", rname.chars());
1046      if (l[0]!='+') GError("Error: fastq qv header marker not detected!\n");
1047 <    if ((l=fq.getLine())==NULL)
1047 >    if ((l=fq.getLine())==NULL)
1048          GError("Error: unexpected EOF after qv header for %s\n", rname.chars());
1049      rqv=l;
1050 <    //if (rqv.length()!=rseq.length())
1050 >    //if (rqv.length()!=rseq.length())
1051      //  GError("Error: qv len != seq len for %s\n", rname.chars());
1052      while (rqv.length()<rseq.length() && ((l=fq.getLine())!=NULL)) {
1053        rqv+=l; //append to qv string
1054        }
1055      }// fastq
1056   // } //<-- FASTA or FASTQ
1057 < rseq.upper(); //TODO: what if we care about masking?
1057 > rseq.upper();
1058   return true;
1059   }
1060  
1061 + #ifdef GDEBUG
1062 + void showTrim(GStr& s, int l5, int l3) {
1063 +  if (l5>0) {
1064 +    color_bg(c_red);
1065 +    }
1066 +  for (int i=0;i<s.length()-1;i++) {
1067 +    if (i && i==l5) color_resetbg();
1068 +    fprintf(stderr, "%c", s[i]);
1069 +    if (i==l3) color_bg(c_red);
1070 +   }
1071 +  fprintf(stderr, "%c", s[s.length()-1]);
1072 +  color_reset();
1073 +  fprintf(stderr, "\n");
1074 + }
1075 + #endif
1076 +
1077   char process_read(GStr& rname, GStr& rseq, GStr& rqv, int &l5, int &l3) {
1078 < //returns 0 if the read was untouched, 1 if it was just trimmed
1078 > //returns 0 if the read was untouched, 1 if it was just trimmed
1079   // and a trash code if it was trashed
1080   l5=0;
1081   l3=rseq.length()-1;
1082 + #ifdef GDEBUG
1083 +   //rseq.reverse();
1084 +   GMessage(">%s\n", rname.chars());
1085 +   GMessage("%s\n",rseq.chars());
1086 + #endif
1087   if (l3-l5+1<min_read_len) {
1088     return 's';
1089     }
# Line 1422 | Line 1119
1119     w5=0;
1120     w3=wseq.length()-1;
1121     }
1122 < if (a3len>0) {
1123 <  if (trim_adapter3(wseq, w5, w3)) {
1122 > char trim_code;
1123 > do {
1124 >  trim_code=0;
1125 >  if (trim_poly5(wseq, w5, w3, polyA_seed)) {
1126 >      trim_code='A';
1127 >      }
1128 >  else if (trim_poly5(wseq, w5, w3, polyT_seed)) {
1129 >      trim_code='T';
1130 >      }
1131 >  else if (trim_adapter5(wseq, w5, w3)) {
1132 >      trim_code='5';
1133 >      }
1134 >  if (trim_code) {
1135 >     #ifdef GDEBUG
1136 >      GMessage("#### TRIM by '%c' code ( w5-w3 = %d-%d ):\n",trim_code, w5,w3);
1137 >      showTrim(wseq, w5, w3);
1138 >     #endif
1139       int trimlen=wseq.length()-(w3-w5+1);
1140 <     num_trimmed3++;
1141 <     if (trimlen<min_trimmed3)
1142 <         min_trimmed3=trimlen;
1140 >     num_trimmed5++;
1141 >     if (trimlen<min_trimmed5)
1142 >         min_trimmed5=trimlen;
1143       l5+=w5;
1144       l3-=(wseq.length()-1-w3);
1145       if (w3-w5+1<min_read_len) {
1146 <         return '3';
1146 >         return trim_code;
1147           }
1148        //-- keep only the w5..w3 range
1149        wseq=wseq.substr(w5, w3-w5+1);
1150        if (!wqv.is_empty())
1151           wqv=wqv.substr(w5, w3-w5+1);
1152 <      }//some adapter was trimmed
1153 <   } //adapter trimming
1154 < if (a5len>0) {
1155 <  if (trim_adapter5(wseq, w5, w3)) {
1152 >      }// trimmed at 5' end
1153 > } while (trim_code);
1154 >
1155 > do {
1156 >  trim_code=0;
1157 >  if (trim_poly3(wseq, w5, w3, polyA_seed)) {
1158 >      trim_code='A';
1159 >      }
1160 >  else if (trim_poly3(wseq, w5, w3, polyT_seed)) {
1161 >      trim_code='T';
1162 >      }
1163 >  else if (trim_adapter3(wseq, w5, w3)) {
1164 >      trim_code='3';
1165 >      }
1166 >  if (trim_code) {
1167 >     #ifdef GDEBUG
1168 >     GMessage("#### TRIM by '%c' code ( w5-w3 = %d-%d ):\n",trim_code, w5,w3);
1169 >     showTrim(wseq, w5, w3);
1170 >     #endif
1171       int trimlen=wseq.length()-(w3-w5+1);
1172 <     num_trimmed5++;
1173 <     if (trimlen<min_trimmed5)
1174 <         min_trimmed5=trimlen;
1172 >     num_trimmed3++;
1173 >     if (trimlen<min_trimmed3)
1174 >         min_trimmed3=trimlen;
1175       l5+=w5;
1176       l3-=(wseq.length()-1-w3);
1177       if (w3-w5+1<min_read_len) {
1178 <         return '5';
1178 >         return trim_code;
1179           }
1180        //-- keep only the w5..w3 range
1181        wseq=wseq.substr(w5, w3-w5+1);
1182        if (!wqv.is_empty())
1183           wqv=wqv.substr(w5, w3-w5+1);
1184 <      }//some adapter was trimmed
1185 <   } //adapter trimming
1184 >      }//trimming at 3' end
1185 > } while (trim_code);
1186 >
1187 >
1188   if (doCollapse) {
1189     //keep read for later
1190     FqDupRec* dr=dhash.Find(wseq.chars());
1191     if (dr==NULL) { //new entry
1192 <          //if (prefix.is_empty())
1193 <             dhash.Add(wseq.chars(),
1192 >          //if (prefix.is_empty())
1193 >             dhash.Add(wseq.chars(),
1194                    new FqDupRec(&wqv, rname.chars()));
1195            //else dhash.Add(wseq.chars(), new FqDupRec(wqv.chars(),wqv.length()));
1196           }
# Line 1495 | Line 1224
1224         fprintf(f_out, "%s\n", rseq.chars()); //plain one-line fasta for now
1225         }
1226        else {
1227 <       fprintf(f_out, ">%s%08d\n%s\n", prefix.chars(), outcounter,
1227 >       fprintf(f_out, ">%s%08d\n%s\n", prefix.chars(), outcounter,
1228                            rseq.chars());
1229         }
1230       }
# Line 1506 | Line 1235
1235         fprintf(f_out, "%s\n+\n%s\n", rseq.chars(), rqv.chars());
1236         }
1237        else
1238 <       fprintf(f_out, "@%s_%08d\n%s\n+\n%s\n", prefix.chars(), outcounter,
1238 >       fprintf(f_out, "@%s_%08d\n%s\n+\n%s\n", prefix.chars(), outcounter,
1239                            rseq.chars(),rqv.chars() );
1240       }
1241   }
# Line 1514 | Line 1243
1243   void trash_report(char trashcode, GStr& rname, FILE* freport) {
1244   if (freport==NULL || trashcode<=' ') return;
1245   if (trashcode=='3' || trashcode=='5') {
1246 <   fprintf(freport, "%s\tA%c\n",rname.chars(),trashcode);
1246 >   fprintf(freport, "%s\ta%c\n",rname.chars(),trashcode);
1247     }
1248   else {
1249     fprintf(freport, "%s\t%c\n",rname.chars(),trashcode);
# Line 1606 | Line 1335
1335      }
1336   }
1337  
1338 < void setupFiles(FILE*& f_in, FILE*& f_in2, FILE*& f_out, FILE*& f_out2,
1339 <                       GStr& s, GStr& infname, GStr& infname2) {
1338 > void addAdapter(GVec<CASeqData>& adapters, GStr& seq) {
1339 > //TODO: prepare CASeqData here, and collect hexamers as well
1340 > CASeqData adata(revCompl);
1341 > int idx=adapters.Add(adata);
1342 > if (idx<0) GError("Error: failed to add adaptor!\n");
1343 > adapters[idx].update(seq.chars());
1344 > }
1345 >
1346 >
1347 > int loadAdapters(const char* fname) {
1348 >  GLineReader lr(fname);
1349 >  char* l;
1350 >  while ((l=lr.nextLine())!=NULL) {
1351 >   if (lr.length()<=3 || l[0]=='#') continue;
1352 >   if ( l[0]==' ' || l[0]=='\t' || l[0]==',' ||
1353 >        l[0]==';'|| l[0]==':' ) {
1354 >      int i=1;
1355 >      while (l[i]!=0 && isspace(l[i])) {
1356 >        i++;
1357 >        }
1358 >      if (l[i]!=0) {
1359 >        GStr s(&(l[i]));
1360 >      #ifdef GDEBUG
1361 >          //s.reverse();
1362 >      #endif
1363 >        addAdapter(adapters3, s);
1364 >        continue;
1365 >        }
1366 >      }
1367 >    else {
1368 >      GStr s(l);
1369 >      s.startTokenize("\t ;,:");
1370 >      GStr a5,a3;
1371 >      if (s.nextToken(a5))
1372 >         s.nextToken(a3);
1373 >      a5.upper();
1374 >      a3.upper();
1375 >     #ifdef GDEBUG
1376 >     //   a5.reverse();
1377 >     //   a3.reverse();
1378 >     #endif
1379 >      addAdapter(adapters5, a5);
1380 >      addAdapter(adapters3, a3);
1381 >      }
1382 >   }
1383 >   return adapters5.Count()+adapters3.Count();
1384 > }
1385 >
1386 > void setupFiles(FILE*& f_in, FILE*& f_in2, FILE*& f_out, FILE*& f_out2,
1387 >                       GStr& s, GStr& infname, GStr& infname2) {
1388   // uses outsuffix to generate output file names and open file handles as needed
1389   infname="";
1390   infname2="";
# Line 1635 | Line 1412
1412   s.startTokenize(",:");
1413   s.nextToken(infname);
1414   bool paired=s.nextToken(infname2);
1415 < if (fileExists(infname.chars())==0)
1415 > if (fileExists(infname.chars())==0)
1416      GError("Error: cannot find file %s!\n",infname.chars());
1417   GStr fname(getFileName(infname.chars()));
1418   GStr picmd;
# Line 1657 | Line 1434
1434   if (!paired) return;
1435   if (doCollapse) GError("Error: sorry, -C option cannot be used with paired reads!\n");
1436   // ---- paired reads:-------------
1437 < if (fileExists(infname2.chars())==0)
1437 > if (fileExists(infname2.chars())==0)
1438       GError("Error: cannot find file %s!\n",infname2.chars());
1439   picmd="";
1440   GStr fname2(getFileName(infname2.chars()));

Diff Legend

Removed lines
+ Added lines
< Changed lines
> Changed lines