ViewVC Help
View File | Revision Log | Show Annotations | View Changeset | Root Listing
root/gclib/fqtrim/fqtrim.cpp
(Generate patch)
# Line 2 | Line 2
2   #include "GStr.h"
3   #include "GHash.hh"
4   #include "GList.hh"
5 + #include <ctype.h>
6 + #include "GAlnExtend.h"
7  
8   #define USAGE "Usage:\n\
9 < fqtrim [-5 <5'adapter>] [-3 <3'adapter>] [-l <minlen>] [-q <minqv>] [-C] [-D]\\\n\
10 <   [-p {64|33}] [-n <rename_prefix>] [-o <trimmed.fq>] [-r <discarded.lst>]\\\n\
11 <   [-Q] <input.fq>\n\
9 > fqtrim [{-5 <5adapter> -3 <3adapter>|-f <adapters_file>}] [-a <min_matchlen>]\\\n\
10 >   [-R] [-q <minq> [-t <trim_max_len>]] [-p {64|33}] [-o <outsuffix>]\\\n\
11 >   [-l <minlen>] [-C] [-D] [-Q] [-n <rename_prefix>] [-r <discarded.lst>]\\\n\
12 >    <input.fq>[,<input_mates.fq>\n\
13   \n\
14 < Trim low quality bases at 3' end, optionally trim adapter sequence, filter\n\
15 < for low complexity and collapse duplicate reads\n\
14 > Trim low quality bases at the 3' end and can trim adapter sequence(s), filter\n\
15 > for low complexity and collapse duplicate reads.\n\
16 > If read pairs should be trimmed and kept together (i.e. without discarding\n\
17 > one read in a pair), the two file names should be given delimited by a comma\n\
18 > or a colon character\n\
19   \n\
20   Options:\n\
21   -n  rename all the reads using the <prefix> followed by a read counter;\n\
22      if -C option was given, the suffix \"_x<N>\" is appended, with <N> being\n\
23      the read duplication count\n\
24 < -o  write the trimmed/filtered fastq into <trimmed.fq>(instead of stdout)\n\
24 > -o  unless this parameter is '-', write the trimmed/filtered reads to \n\
25 >    file(s) named <input>.<outsuffix> which will be created in the \n\
26 >    current (working) directory; (writes to stdout if -o- is given);\n\
27 >    a suffix ending with .gz, .gzip or .bz2 will enforce compression\n\
28 > -f  file with adapter sequences to trim, each line having this format:\n\
29 >    <5'-adapter-sequence> <3'-adapter-sequence>\n\
30   -5  trim the given adapter or primer sequence at the 5' end of each read\n\
31      (e.g. -5 CGACAGGTTCAGAGTTCTACAGTCCGACGATC)\n\
32   -3  trim the given adapter sequence at the 3' end of each read\n\
33      (e.g. -3 TCGTATGCCGTCTTCTGCTTG)\n\
34 < -q  trim bases with quality value lower than <minq> (starting the 3' end)\n\
34 > -A  disable polyA/T trimming (enabled by default)\n\
35 > -y  minimum length of exact match to adaptor sequence at the proper end (6)\n\
36 > -q  trim bases with quality value lower than <minq> (starting at the 3' end)\n\
37 > -t  for -q option, maximum trimming at the 3' end is limited to <trim_max_len>\n\
38   -m  maximum percentage of Ns allowed in a read after trimming (default 7)\n\
39   -l  minimum \"clean\" length after trimming that a read must have\n\
40      in order to pass the filter (default: 16)\n\
# Line 34 | Line 48
48   -Q  convert quality values to the other Phred qv type\n\
49   -V  verbose processing\n\
50   "
51 +
52 + //-z  for -o option, the output stream(s) will be first piped into the given\n
53 + //   <zcmd> command, which must output to stdout (e.g. -z 'bzip2 -9 -c')\n
54 +
55 +
56   // example 3' adapter for miRNAs: TCGTATGCCGTCTTCTGCTTG
57  
58 < //For pair ends sequencing:
58 > //For paired reads sequencing:
59   //3' : ACACTCTTTCCCTACACGACGCTCTTCCGATCT
60   //5' : GATCGGAAGAGCGGTTCAGCAGGAATGCCGAG
61 < FILE* f_out=NULL; //stdout if not provided
62 < FILE* f_in=NULL; //input fastq (stdin if not provided)
61 > //FILE* f_out=NULL; //stdout if not provided
62 > //FILE* f_out2=NULL; //for paired reads
63 > //FILE* f_in=NULL; //input fastq (stdin if not provided)
64 > //FILE* f_in2=NULL; //for paired reads
65 >
66   FILE* freport=NULL;
67 +
68   bool debug=false;
69   bool verbose=false;
70   bool doCollapse=false;
71   bool doDust=false;
72 + bool doPolyTrim=true;
73 + bool fastaOutput=false;
74   bool trashReport=false;
75 < //bool rawFormat=false;
75 > bool revCompl=false; //also reverse complement adaptor sequences
76   int min_read_len=16;
77   double max_perc_N=7.0;
78   int dust_cutoff=16;
79   bool isfasta=false;
80   bool convert_phred=false;
81 + GStr outsuffix; // -o
82   GStr prefix;
83 < GStr adapter3;
84 < GStr adapter5;
85 <
86 < int qvtrim_min=0;
83 > GStr zcmd;
84 > int num_trimmed5=0;
85 > int num_trimmed3=0;
86 > int num_trimmedN=0;
87 > int num_trimmedQ=0;
88 > int min_trimmed5=INT_MAX;
89 > int min_trimmed3=INT_MAX;
90  
91 + int qvtrim_qmin=0;
92 + int qvtrim_max=0;  //for -q, do not trim at 3'-end more than this number of bases
93   int qv_phredtype=0; // could be 64 or 33 (0 means undetermined yet)
94   int qv_cvtadd=0; //could be -31 or +31
95  
96 < int a3len=0;
97 < int a5len=0;
98 < // adaptor matching metrics -- for extendMatch() function
99 < const int a_m_score=2; //match score
100 < const int a_mis_score=-3; //mismatch
101 < const int a_dropoff_score=7;
102 < const int a_min_score=8; //an exact match of 4 bases at the proper ends WILL be trimmed
103 < const int a_min_chain_score=15; //for gapped alignments
104 <
105 < class CSegChain;
106 <
107 < class CSegPair {
108 <  public:
109 <   GSeg a;
110 <   GSeg b; //the adapter segment
111 <   int score;
112 <   int flags;
113 <   CSegChain* chain;
114 <   CSegPair(int astart=0, int aend=0, int bstart=0, int bend=0, int mscore=0):a(astart,aend),b(bstart, bend) {
115 <      score=mscore;
116 <      if (score==0) score=a.len()*a_m_score;
117 <      flags=0;
118 <      chain=NULL;
119 <      }
120 <   int len() { return  a.len(); }
121 <   bool operator==(CSegPair& d){
122 <      //return (a.start==d.a.start && a.end==d.a.end && b.start==d.b.start && b.end==d.b.end);
123 <      //make equal even segments that are included into one another:
124 <      return (d.a.start>=a.start && d.a.end<=a.end && d.b.start>=b.start && d.b.end<=b.end);
94 <      }
95 <   bool operator>(CSegPair& d){ //ordering based on b (adaptor) start coord and score
96 <     if (b.start==d.b.start) {
97 <        if (score==d.score) {
98 <           //just try to be consistent:
99 <           if (b.end==d.b.end) {
100 <             return (a.start==d.a.start)?(a.end<d.a.end):(a.start<d.a.start);
101 <             }
102 <           return (b.end>d.b.end);
103 <           }
104 <         else return (score<d.score);
96 > // adaptor matching metrics -- for X-drop ungapped extension
97 > //const int match_reward=2;
98 > //const int mismatch_penalty=3;
99 > const int match_reward=2;
100 > const int mismatch_penalty=4;
101 > const int Xdrop=10;
102 >
103 > const int poly_m_score=2; //match score
104 > const int poly_mis_score=-3; //mismatch
105 > const int poly_dropoff_score=7;
106 > int poly_minScore=12; //i.e. an exact match of 6 bases at the proper ends WILL be trimmed
107 >
108 > const char *polyA_seed="AAAA";
109 > const char *polyT_seed="TTTT";
110 >
111 > struct CASeqData {
112 >   //positional data for every possible hexamer in an adapter
113 >   GVec<uint16>* pz[4096]; //0-based coordinates of all possible hexamers in the adapter sequence
114 >   GVec<uint16>* pzr[4096]; //0-based coordinates of all possible hexamers for the reverse complement of the adapter sequence
115 >   GStr seq; //actual adapter sequence data
116 >   GStr seqr; //reverse complement sequence
117 >   int amlen; //fraction of adapter length matching that's
118 >              //enough to consider the alignment
119 >   bool use_reverse;
120 >   CASeqData(bool rev=false):seq(),seqr(),
121 >             amlen(0), use_reverse(rev) {
122 >     for (int i=0;i<4096;i++) {
123 >        pz[i]=NULL;
124 >        pzr[i]=NULL;
125          }
106     return (b.start>d.b.start);
126       }
108   bool operator<(CSegPair& d){ //ordering based on b (adaptor) coord
109     /*if (b.start==d.b.start && b.end==d.b.end) {
110          return (a.start==d.a.start)?(a.end<d.a.end):(a.start<d.a.start);
111          }
112     return (b.start==d.b.start)?(b.end<d.b.end):(b.start<d.b.start);*/
113     if (b.start==d.b.start) {
114        if (score==d.score) {
115           //just try to be consistent:
116           if (b.end==d.b.end) {
117             return (a.start==d.a.start)?(a.end>d.a.end):(a.start>d.a.start);
118             }
119           return (b.end<d.b.end);
120           }
121         else return (score>d.score);
122        }
123     return (b.start<d.b.start);
124     }
125 };
127  
128 < int cmpSegEnds(pointer sa, pointer sb) { //sort by adaptor seg ends AND score
129 < CSegPair& x = *(CSegPair *)sa;
130 < CSegPair& y = *(CSegPair *)sb;
131 < /*
132 < if (x.b.end==y.b.end) {
133 <     if (x.b.start==y.b.start) {
134 <         if (x.a.end==y.a.end) {
135 <            if (x.a.start==y.a.start) return 0;
136 <            return ((x.a.start>y.a.start) ? -1 : 1);
137 <            }
138 <          else {
139 <            return ((x.a.end>y.a.end) ? -1 : 1);
140 <            }
141 <          }
142 <      else {
143 <       return ((x.b.start>y.b.start) ? -1 : 1);
144 <       }
145 <     }
146 <    else {
146 <     return ((x.b.end>y.b.end) ? -1 : 1);
147 <     }
148 < */
149 <  if (x.b.end==y.b.end) {
150 <     if (x.score==y.score) {
151 <     if (x.b.start==y.b.start) {
152 <         if (x.a.end==y.a.end) {
153 <            if (x.a.start==y.a.start) return 0;
154 <            return ((x.a.start<y.a.start) ? -1 : 1);
155 <            }
156 <          else {
157 <            return ((x.a.end<y.a.end) ? -1 : 1);
158 <            }
159 <          }
160 <      else {
161 <       return ((x.b.start<y.b.start) ? -1 : 1);
162 <       }
163 <      } else return ((x.score>y.score) ? -1 : 1);
164 <     }
165 <    else {
166 <     return ((x.b.end>y.b.end) ? -1 : 1);
128 >   void update(const char* s) {
129 >         seq=s;
130 >         table6mers(seq.chars(), seq.length(), pz);
131 >         amlen=iround(double(seq.length())*0.8);
132 >         if (amlen<12)
133 >                amlen=12;
134 >         if (!use_reverse) return;
135 >         //reverse complement
136 >         seqr=s;
137 >         int slen=seq.length();
138 >         for (int i=0;i<slen;i++)
139 >           seqr[i]=ntComplement(seq[slen-i-1]);
140 >         table6mers(seqr.chars(), seqr.length(), pzr);
141 >   }
142 >
143 >   ~CASeqData() {
144 >     for (int i=0;i<4096;i++) {
145 >       delete pz[i];
146 >       delete pzr[i];
147       }
148 +   }
149 + };
150  
151 < }
151 > GVec<CASeqData> adapters5;
152 > GVec<CASeqData> adapters3;
153  
154 < class CSegChain:public GList<CSegPair> {
155 < public:
173 <   uint astart;
174 <   uint aend;
175 <   uint bstart;
176 <   uint bend;
177 <   int score;
178 <   bool endSort;
179 <  CSegChain(bool aln5=false):GList<CSegPair>(true,true,true) {//sorted, free elements, unique
180 <   //as SegPairs are inserted, they will be sorted by a.start coordinate
181 <   score=0;
182 <   astart=MAX_UINT;
183 <   aend=0;
184 <   bstart=MAX_UINT;
185 <   bend=0;
186 <   endSort=aln5;
187 <   if (aln5) { setSorted(cmpSegEnds); }
188 <   }
189 < bool operator==(CSegChain& d) {
190 <   //return (score==d.score);
191 <    return (astart==d.astart && aend==d.aend && bstart==d.bstart && bend==d.bend);
192 <   }
193 < bool operator>(CSegChain& d) { // order based on b (adaptor) coordinate
194 <   //return (score<d.score);
195 <   if (bstart==d.bstart && bend==d.bend) {
196 <          return (astart==d.astart)?(aend>d.aend):(astart>d.astart);
197 <          }
198 <     return (bstart==d.bstart)?(bend>d.bend):(bstart>d.bstart);
199 <   }
200 < bool operator<(CSegChain& d) {
201 <   //return (score>d.score);
202 <   if (bstart==d.bstart && bend==d.bend) {
203 <          return (astart==d.astart)?(aend<d.aend):(astart<d.astart);
204 <          }
205 <     return (bstart==d.bstart)?(bend<d.bend):(bstart<d.bstart);
206 <   }
207 < void addSegPair(CSegPair* segp) {
208 <   if (AddIfNew(segp)!=segp) return;
209 <   score+=segp->score;
210 <   if (astart>segp->a.start) astart=segp->a.start;
211 <   if (aend<segp->a.end) aend=segp->a.end;
212 <   if (bstart>segp->b.start) bstart=segp->b.start;
213 <   if (bend<segp->b.end) bend=segp->b.end;
214 <   }
215 < //for building actual chains:
216 < bool extendChain(CSegPair* segp) { //segp expected to be "Greater Than" current chain
217 <   int bgap=0;
218 <   int agap=0;
219 <   //if (endSort) {
220 <   if (bstart>segp->b.start) {
221 <      bgap = (int)(bstart-segp->b.end);
222 <      if (abs(bgap)>2) return false;
223 <      agap = (int)(astart-segp->a.end);
224 <      if (abs(agap)>2) return false;
225 <      }
226 <     else {
227 <      bgap = (int) (segp->b.start-bend);
228 <      if (abs(bgap)>2) return false;
229 <      agap = (int)(segp->a.start-aend);
230 <      if (abs(agap)>2) return false;
231 <      }
232 <   if (agap*bgap<0) return false;
233 <   addSegPair(segp);
234 <   score-=abs(agap)+abs(bgap);
235 <   return true;
236 <   }
237 < };
154 > CGreedyAlignData* gxmem_l=NULL;
155 > CGreedyAlignData* gxmem_r=NULL;
156  
157   // element in dhash:
158   class FqDupRec {
# Line 273 | Line 191
191    if (!s.is_empty()) {
192        if (s=='-') f=stdout;
193        else {
194 <       f=fopen(s,"w");
194 >       f=fopen(s.chars(),"w");
195         if (f==NULL) GError("Error creating file: %s\n", s.chars());
196         }
197       }
# Line 284 | Line 202
202  
203   GHash<FqDupRec> dhash; //hash to keep track of duplicates
204  
205 + void addAdapter(GVec<CASeqData>& adapters, GStr& seq);
206 + int loadAdapters(const char* fname);
207 +
208 + void setupFiles(FILE*& f_in, FILE*& f_in2, FILE*& f_out, FILE*& f_out2,
209 +                       GStr& s, GStr& infname, GStr& infname2);
210 + // uses outsuffix to generate output file names and open file handles as needed
211 +
212 + void writeRead(FILE* f_out, GStr& rname, GStr& rinfo, GStr& rseq, GStr& rqv, int& outcounter);
213 + void trash_report(char trashcode, GStr& rname, FILE* freport);
214 +
215 + bool getFastxRec(GLineReader& fq, GStr& rseq, GStr& rqv,
216 +          GStr& rname, GStr& rinfo, GStr& infname);
217 +
218 + char process_read(GStr& rname, GStr& rseq, GStr& rqv, int &l5, int &l3);
219 + //returns 0 if the read was untouched, 1 if it was trimmed and a trash code if it was trashed
220 +
221   bool ntrim(GStr& rseq, int &l5, int &l3); //returns true if any trimming occured
222   bool qtrim(GStr& qvs, int &l5, int &l3); //return true if any trimming occured
223   int dust(GStr& seq);
224 < bool trim_adapter3(GStr& seq, int &l5, int &l3); //returns true if any trimming occured
224 > bool trim_poly5(GStr &seq, int &l5, int &l3, const char* poly_seed); //returns true if any trimming occured
225 > bool trim_poly3(GStr &seq, int &l5, int &l3, const char* poly_seed);
226   bool trim_adapter5(GStr& seq, int &l5, int &l3); //returns true if any trimming occured
227 + bool trim_adapter3(GStr& seq, int &l5, int &l3);
228  
229   void convertPhred(char* q, int len);
230   void convertPhred(GStr& q);
231  
232   int main(int argc, char * const argv[]) {
233 <  GArgs args(argc, argv, "YQDCVl:d:3:5:m:n:r:p:q:o:");
233 >  GArgs args(argc, argv, "YQDCRVAl:d:3:5:m:n:r:p:q:f:t:o:z:a:");
234    int e;
299  int icounter=0; //counter for input reads
300  int outcounter=0; //counter for output reads
235    if ((e=args.isError())>0) {
236        GMessage("%s\nInvalid argument: %s\n", USAGE, argv[e]);
237        exit(224);
238        }
239    debug=(args.getOpt('Y')!=NULL);
240 <  debug=(args.getOpt('V')!=NULL);
240 >  verbose=(args.getOpt('V')!=NULL);
241    convert_phred=(args.getOpt('Q')!=NULL);
242    doCollapse=(args.getOpt('C')!=NULL);
243    doDust=(args.getOpt('D')!=NULL);
244 +  revCompl=(args.getOpt('R')!=NULL);
245 +  if (args.getOpt('A')) doPolyTrim=false;
246    /*
247    rawFormat=(args.getOpt('R')!=NULL);
248    if (rawFormat) {
# Line 315 | Line 251
251    */
252    prefix=args.getOpt('n');
253    GStr s=args.getOpt('l');
254 <  if (!s.is_empty())
254 >  if (!s.is_empty())
255       min_read_len=s.asInt();
256    s=args.getOpt('m');
257 <  if (!s.is_empty())
257 >  if (!s.is_empty())
258       max_perc_N=s.asDouble();
259    s=args.getOpt('d');
260    if (!s.is_empty()) {
# Line 327 | Line 263
263       }
264    s=args.getOpt('q');
265    if (!s.is_empty()) {
266 <     qvtrim_min=s.asInt();
266 >     qvtrim_qmin=s.asInt();
267 >     }
268 >  s=args.getOpt('t');
269 >  if (!s.is_empty()) {
270 >     qvtrim_max=s.asInt();
271       }
272    s=args.getOpt('p');
273    if (!s.is_empty()) {
# Line 340 | Line 280
280          qv_phredtype=64;
281          qv_cvtadd=-31;
282          }
283 <       else
283 >       else
284           GMessage("%s\nInvalid value for -p option (can only be 64 or 33)!\n",USAGE);
285       }
286 <  if (args.getOpt('3')!=NULL) {
287 <    adapter3=args.getOpt('3');
288 <    adapter3.upper();
289 <    a3len=adapter3.length();
290 <    }
291 <  if (args.getOpt('5')!=NULL) {
292 <    adapter5=args.getOpt('5');
293 <    adapter5.upper();
294 <    a5len=adapter5.length();
286 >  s=args.getOpt('f');
287 >  if (!s.is_empty()) {
288 >   loadAdapters(s.chars());
289 >   }
290 >  bool fileAdapters=adapters5.Count()+adapters3.Count();
291 >  s=args.getOpt('5');
292 >  if (!s.is_empty()) {
293 >    if (fileAdapters)
294 >      GError("Error: options -5 and -f cannot be used together!\n");
295 >    s.upper();
296 >    addAdapter(adapters5, s);
297      }
298 +  s=args.getOpt('3');
299 +  if (!s.is_empty()) {
300 +    if (fileAdapters)
301 +      GError("Error: options -3 and -f cannot be used together!\n");
302 +      s.upper();
303 +      addAdapter(adapters3, s);
304 +    }
305 +  s=args.getOpt('y');
306 +  if (!s.is_empty()) {
307 +     int minmatch=s.asInt();
308 +     poly_minScore=minmatch*poly_m_score;
309 +     }
310 +
311 +  if (args.getOpt('o')!=NULL) outsuffix=args.getOpt('o');
312 +                         else outsuffix="-";
313    trashReport=  (args.getOpt('r')!=NULL);
314 <  if (args.startNonOpt()==0) {
314 >  int fcount=args.startNonOpt();
315 >  if (fcount==0) {
316      GMessage(USAGE);
317      exit(224);
318      }
319 <
320 <  openfw(f_out, args, 'o');
321 <  if (f_out==NULL) f_out=stdout;
319 >   if (fcount>1 && doCollapse) {
320 >    GError("%s Sorry, the -C option only works with a single input.\n", USAGE);
321 >    }
322 >  //openfw(f_out, args, 'o');
323 >  //if (f_out==NULL) f_out=stdout;
324    if (trashReport)
325      openfw(freport, args, 'r');
326    char* infile=NULL;
327 +
328 +  if (adapters5.Count()>0)
329 +    //gxmem_l=new CGreedyAlignData(match_reward, mismatch_penalty, Xdrop-2);
330 +        gxmem_l=new CGreedyAlignData(match_reward, mismatch_penalty, Xdrop);
331 +  if (adapters3.Count()>0)
332 +    gxmem_r=new CGreedyAlignData(match_reward, mismatch_penalty, Xdrop);
333 +
334    while ((infile=args.nextNonOpt())!=NULL) {
335 <    GStr infname(infile);
336 <    if (strcmp(infile,"-")==0) {
337 <       f_in=stdin; infname="stdin"; }
338 <     else {
339 <        f_in=fopen(infile,"r");
340 <        if (f_in==NULL)
341 <            GError("Cannot open input file %s!\n",infile);
342 <        }
343 <     GLineReader fq(f_in);
344 <     char* l=NULL;
345 <     while ((l=fq.getLine())!=NULL) {
346 <        GStr rname; //current read name
347 <        GStr rseq;  //current read sequence
348 <        GStr rqv;   //current read quality values
349 <        GStr s;
350 <        /* if (rawFormat) {
351 <          //TODO: implement qseq parsing here
352 <          //if (raw type=N) then continue; //skip invalid/bad records
353 <          
354 <          } //raw qseq format
355 <         else { // FASTQ or FASTA */
356 <          isfasta=(l[0]=='>');
357 <          if (!isfasta && l[0]!='@') GError("Error: fastq record marker not detected!\n");
358 <          s=l;
359 <          rname=&(l[1]);
360 <          icounter++;
361 <          for (int i=0;i<rname.length();i++)
362 <            if (rname[i]<=' ') { rname.cut(i); break; }
363 <          //now get the sequence
364 <          if ((l=fq.getLine())==NULL)
365 <              GError("Error: unexpected EOF after header for %s\n",rname.chars());
366 <          rseq=l; //this must be the DNA line
367 <          while ((l=fq.getLine())!=NULL) {
368 <              //seq can span multiple lines
369 <              if (l[0]=='>' || l[0]=='+') {
370 <                   fq.pushBack();
371 <                   break; //
335 >    //for each input file
336 >    int incounter=0; //counter for input reads
337 >    int outcounter=0; //counter for output reads
338 >    int trash_s=0; //too short from the get go
339 >    int trash_Q=0;
340 >    int trash_N=0;
341 >    int trash_D=0;
342 >    int trash_poly=0;
343 >    int trash_A3=0;
344 >    int trash_A5=0;
345 >    s=infile;
346 >    GStr infname;
347 >    GStr infname2;
348 >    FILE* f_in=NULL;
349 >    FILE* f_in2=NULL;
350 >    FILE* f_out=NULL;
351 >    FILE* f_out2=NULL;
352 >    bool paired_reads=false;
353 >    setupFiles(f_in, f_in2, f_out, f_out2, s, infname, infname2);
354 >    GLineReader fq(f_in);
355 >    GLineReader* fq2=NULL;
356 >    if (f_in2!=NULL) {
357 >       fq2=new GLineReader(f_in2);
358 >       paired_reads=true;
359 >       }
360 >    GStr rseq, rqv, seqid, seqinfo;
361 >    GStr rseq2, rqv2, seqid2, seqinfo2;
362 >    while (getFastxRec(fq, rseq, rqv, seqid, seqinfo, infname)) {
363 >       incounter++;
364 >       int a5=0, a3=0, b5=0, b3=0;
365 >       char tcode=0, tcode2=0;
366 >       tcode=process_read(seqid, rseq, rqv, a5, a3);
367 >       if (fq2!=NULL) {
368 >            getFastxRec(*fq2, rseq2, rqv2, seqid2, seqinfo2, infname2);
369 >            if (seqid.substr(0,seqid.length()-1)!=seqid2.substr(0,seqid2.length()-1)) {
370 >               GError("Error: no paired match for read %s vs %s (%s,%s)\n",
371 >                  seqid.chars(), seqid2.chars(), infname.chars(), infname2.chars());
372 >               }
373 >            tcode2=process_read(seqid2, rseq2, rqv2, b5, b3);
374 >            //decide what to do with this pair and print rseq2 if the pair makes it
375 >            if (tcode>1 && tcode2<=1) {
376 >               //"untrash" rseq
377 >               if (a3-a5+1<min_read_len) {
378 >                   a5=1;
379 >                   if (a3<min_read_len) { a3= GMIN(rseq.length()-1, min_read_len+1); }
380                     }
381 <              rseq+=l;
382 <              } //check for multi-line seq
383 <          if (!isfasta) { //reading fastq quality values, which can also be multi-line
384 <            if ((l=fq.getLine())==NULL)
385 <                GError("Error: unexpected EOF after sequence for %s\n", rname.chars());
386 <            if (l[0]!='+') GError("Error: fastq qv header marker not detected!\n");
387 <            if ((l=fq.getLine())==NULL)
388 <                GError("Error: unexpected EOF after qv header for %s\n", rname.chars());
389 <            rqv=l;
390 <            //if (rqv.length()!=rseq.length())
391 <            //  GError("Error: qv len != seq len for %s\n", rname.chars());
392 <            while (rqv.length()<rseq.length() && ((l=fq.getLine())!=NULL)) {
393 <              rqv+=l; //append to qv string
394 <              }
420 <            }// fastq
421 <        // } //<-- FASTA or FASTQ
422 <        rseq.upper();
423 <        int l5=0;
424 <        int l3=rseq.length()-1;
425 <        if (l3-l5+1<min_read_len) {
426 <           if (trashReport) {
427 <                  fprintf(freport, "%s\ts\t%s\n",rname.chars(), rseq.chars());
428 <                  }
429 <           continue;
430 <           }
431 <        if (ntrim(rseq, l5, l3)) { // N-trimming
432 <           //GMessage("before: %s\n",rseq.chars());
433 <           //GMessage("after : %s (%d)\n",rseq.substr(l5,l3-l5+1).chars(),l3-l5+1);
434 <           if (l3-l5+1<min_read_len) {
435 <             if (trashReport) {
436 <                    fprintf(freport, "%s\tN\t%s\n", rname.chars(), rseq.chars());
437 <                    }
438 <             continue; //invalid read
439 <             }
440 <            //-- keep only the l5..l3 range
441 <           rseq=rseq.substr(l5, l3-l5+1);
442 <           if (!rqv.is_empty())
443 <              rqv=rqv.substr(l5, l3-l5+1);
444 <           l5=0;
445 <           l3=rseq.length()-1;
446 <           }
447 <        if (qvtrim_min!=0 && !rqv.is_empty() && qtrim(rqv, l5, l3)) { // qv-threshold trimming
448 <           if (l3-l5+1<min_read_len) {
449 <             if (trashReport) {
450 <                    fprintf(freport, "%s\tQ\t%s\n", rname.chars(), rseq.chars());
451 <                    }
452 <             continue; //invalid read
453 <             }
454 <            //-- keep only the l5..l3 range
455 <           rseq=rseq.substr(l5, l3-l5+1);
456 <           if (!rqv.is_empty())
457 <              rqv=rqv.substr(l5, l3-l5+1);
458 <           } //qv trimming
459 <        if (a3len>0) {
460 <          if (trim_adapter3(rseq, l5, l3)) {
461 <             if (l3-l5+1<min_read_len) {
462 <                 if (trashReport) {
463 <                     fprintf(freport, "%s\tA3\t%s\n",rname.chars(), rseq.chars());
464 <                     }
465 <                 continue;
466 <                 }
467 <              //-- keep only the l5..l3 range
468 <              rseq=rseq.substr(l5, l3-l5+1);
469 <              if (!rqv.is_empty())
470 <                 rqv=rqv.substr(l5, l3-l5+1);
471 <              }//some adapter was trimmed
472 <           } //adapter trimming
473 <        if (a5len>0) {
474 <          if (trim_adapter5(rseq, l5, l3)) {
475 <             if (l3-l5+1<min_read_len) {
476 <                 if (trashReport) {
477 <                     fprintf(freport, "%s\tA5\t%s\n",rname.chars(), rseq.chars());
478 <                     }
479 <                 continue;
381 >               tcode=1;
382 >               }
383 >             else if (tcode<=1 && tcode2>1) {
384 >               //"untrash" rseq2
385 >               if (b3-b5+1<min_read_len) {
386 >                   b5=1;
387 >                   if (b3<min_read_len) { b3= GMIN((rseq2.length()-1),(min_read_len+1)); }
388 >                   }
389 >               tcode2=1;
390 >               }
391 >            if (tcode<=1) { //trimmed or left intact -- write it!
392 >               if (tcode>0) {
393 >                 rseq2=rseq2.substr(b5,b3-b5+1);
394 >                 if (!rqv2.is_empty()) rqv2=rqv2.substr(b5,b3-b5+1);
395                   }
396 <              //-- keep only the l5..l3 range
397 <              rseq=rseq.substr(l5, l3-l5+1);
398 <              if (!rqv.is_empty())
399 <                 rqv=rqv.substr(l5, l3-l5+1);
400 <              }//some adapter was trimmed
401 <           } //adapter trimming
402 <        if (doCollapse) {
403 <           //keep read for later
404 <           FqDupRec* dr=dhash.Find(rseq.chars());
405 <           if (dr==NULL) { //new entry
406 <                  //if (prefix.is_empty())
407 <                     dhash.Add(rseq.chars(),
408 <                          new FqDupRec(&rqv, rname.chars()));
409 <                  //else dhash.Add(rseq.chars(), new FqDupRec(rqv.chars(),rqv.length()));
396 >               int nocounter=0;
397 >               writeRead(f_out2, seqid2, seqinfo2, rseq2, rqv2, nocounter);
398 >               }
399 >            } //pair read
400 >       if (tcode>1) { //trashed
401 >         #ifdef GDEBUG
402 >         GMessage(" !!!!TRASH code = %c\n",tcode);
403 >         #endif
404 >          if (tcode=='s') trash_s++;
405 >          else if (tcode=='A' || tcode=='T') trash_poly++;
406 >            else if (tcode=='Q') trash_Q++;
407 >              else if (tcode=='N') trash_N++;
408 >               else if (tcode=='D') trash_D++;
409 >                else if (tcode=='3') trash_A3++;
410 >                 else if (tcode=='5') trash_A5++;
411 >          if (trashReport) trash_report(tcode, seqid, freport);
412 >          }
413 >         else if (!doCollapse) { //write it
414 >          if (tcode>0) {
415 >            rseq=rseq.substr(a5,a3-a5+1);
416 >            if (!rqv.is_empty()) rqv=rqv.substr(a5,a3-a5+1);
417 >            }
418 >         #ifdef GDEBUG
419 >            GMessage("  After trimming:\n");
420 >            GMessage("%s\n",rseq.chars());
421 >         #endif
422 >          writeRead(f_out, seqid, seqinfo, rseq, rqv, outcounter);
423 >          }
424 >       } //for each fastq record
425 >    delete fq2;
426 >    FRCLOSE(f_in);
427 >    FRCLOSE(f_in2);
428 >    if (doCollapse) {
429 >       outcounter=0;
430 >       int maxdup_count=1;
431 >       char* maxdup_seq=NULL;
432 >       dhash.startIterate();
433 >       FqDupRec* qd=NULL;
434 >       char* seq=NULL;
435 >       while ((qd=dhash.NextData(seq))!=NULL) {
436 >         GStr rseq(seq);
437 >         //do the dusting here
438 >         if (doDust) {
439 >            int dustbases=dust(rseq);
440 >            if (dustbases>(rseq.length()>>1)) {
441 >               if (trashReport && qd->firstname!=NULL) {
442 >                 fprintf(freport, "%s_x%d\tD\n",qd->firstname, qd->count);
443                   }
444 <              else    
445 <                 dr->add(rqv);
446 <           } //collapsing duplicates
447 <         else { //not collapsing duplicates
448 <           //do the dust filter now
449 <           if (doDust) {
450 <             int dustbases=dust(rseq);
451 <             if (dustbases>(rseq.length()>>1)) {
452 <                if (trashReport) {
453 <                  fprintf(freport, "%s\tD\t%s\n",rname.chars(),rseq.chars());
454 <                  }
455 <                continue;
456 <                }
509 <             }
510 <           //print this record here  
511 <           outcounter++;
512 <           if (isfasta) {
513 <            if (prefix.is_empty())
514 <               fprintf(f_out, ">%s\n%s\n", rname.chars(), rseq.chars());
515 <              else
516 <               fprintf(f_out, ">%s%08d\n%s\n", prefix.chars(), outcounter,
517 <                                  rseq.chars());
444 >               trash_D+=qd->count;
445 >               continue;
446 >               }
447 >            }
448 >         outcounter++;
449 >         if (qd->count>maxdup_count) {
450 >            maxdup_count=qd->count;
451 >            maxdup_seq=seq;
452 >            }
453 >         if (isfasta) {
454 >           if (prefix.is_empty()) {
455 >             fprintf(f_out, ">%s_x%d\n%s\n", qd->firstname, qd->count,
456 >                           rseq.chars());
457               }
458 <           else {  //fastq
459 <            if (convert_phred) convertPhred(rqv);
460 <            if (prefix.is_empty())
522 <               fprintf(f_out, "@%s\n%s\n+\n%s\n", rname.chars(), rseq.chars(),rqv.chars());
523 <              else
524 <               fprintf(f_out, "@%s_%08d\n%s\n+\n%s\n", prefix.chars(), outcounter,
525 <                                  rseq.chars(),rqv.chars() );
458 >           else { //use custom read name
459 >             fprintf(f_out, ">%s%08d_x%d\n%s\n", prefix.chars(), outcounter,
460 >                        qd->count, rseq.chars());
461               }
462 <           } //not collapsing duplicates
463 <        } //for each fastq record
464 <   } //while each input file
465 < FRCLOSE(f_in);
466 < if (doCollapse) {
467 <    outcounter=0;
533 <    int maxdup_count=1;
534 <    char* maxdup_seq=NULL;
535 <    dhash.startIterate();
536 <    FqDupRec* qd=NULL;
537 <    char* seq=NULL;
538 <    while ((qd=dhash.NextData(seq))!=NULL) {
539 <      GStr rseq(seq);
540 <      //do the dusting here
541 <      if (doDust) {
542 <         int dustbases=dust(rseq);
543 <         if (dustbases>(rseq.length()>>1)) {
544 <            if (trashReport && qd->firstname!=NULL) {
545 <              fprintf(freport, "%s_x%d\tD\t%s\n",qd->firstname, qd->count,seq);
546 <              }
547 <            continue;
462 >           }
463 >         else { //fastq format
464 >          if (convert_phred) convertPhred(qd->qv, qd->len);
465 >          if (prefix.is_empty()) {
466 >            fprintf(f_out, "@%s_x%d\n%s\n+\n%s\n", qd->firstname, qd->count,
467 >                           rseq.chars(), qd->qv);
468              }
469 +          else { //use custom read name
470 +            fprintf(f_out, "@%s%08d_x%d\n%s\n+\n%s\n", prefix.chars(), outcounter,
471 +                        qd->count, rseq.chars(), qd->qv);
472 +            }
473 +           }
474 +         }//for each element of dhash
475 +       if (maxdup_count>1) {
476 +         GMessage("Maximum read multiplicity: x %d (read: %s)\n",maxdup_count, maxdup_seq);
477           }
478 <      outcounter++;
479 <      if (qd->count>maxdup_count) {
480 <         maxdup_count=qd->count;
481 <         maxdup_seq=seq;
482 <         }
483 <      if (isfasta) {
484 <        if (prefix.is_empty()) {
485 <          fprintf(f_out, ">%s_x%d\n%s\n", qd->firstname, qd->count,
486 <                        rseq.chars());
487 <          }
488 <        else { //use custom read name
489 <          fprintf(f_out, ">%s%08d_x%d\n%s\n", prefix.chars(), outcounter,
490 <                     qd->count, rseq.chars());
478 >       } //collapse entries
479 >    if (verbose) {
480 >       if (paired_reads) {
481 >           GMessage(">Input files : %s , %s\n", infname.chars(), infname2.chars());
482 >           GMessage("Number of input pairs :%9d\n", incounter);
483 >           GMessage("         Output pairs :%9d\n", outcounter);
484 >           }
485 >         else {
486 >           GMessage(">Input file : %s\n", infname.chars());
487 >           GMessage("Number of input reads :%9d\n", incounter);
488 >           GMessage("         Output reads :%9d\n", outcounter);
489 >           }
490 >       GMessage("------------------------------------\n");
491 >       if (num_trimmed5)
492 >          GMessage("           5' trimmed :%9d  (min. trim: %d)\n", num_trimmed5, min_trimmed5);
493 >       if (num_trimmed3)
494 >          GMessage("           3' trimmed :%9d  (min. trim: %d)\n", num_trimmed3, min_trimmed3);
495 >       GMessage("------------------------------------\n");
496 >       if (trash_s>0)
497 >         GMessage("     Trashed by length:%9d\n", trash_s);
498 >       if (trash_N>0)
499 >         GMessage("         Trashed by N%%:%9d\n", trash_N);
500 >       if (trash_Q>0)
501 >         GMessage("Trashed by low quality:%9d\n", trash_Q);
502 >       if (trash_poly>0)
503 >         GMessage("   Trashed by poly-A/T:%9d\n", trash_poly);
504 >       if (trash_A5>0)
505 >         GMessage(" Trashed by 5' adapter:%9d\n", trash_A5);
506 >       if (trash_A3>0)
507 >         GMessage(" Trashed by 3' adapter:%9d\n", trash_A3);
508 >       }
509 >    if (trashReport) {
510 >          FWCLOSE(freport);
511            }
512 <        }
513 <      else { //fastq format
514 <       if (convert_phred) convertPhred(qd->qv, qd->len);
515 <       if (prefix.is_empty()) {
516 <         fprintf(f_out, "@%s_x%d\n%s\n+\n%s\n", qd->firstname, qd->count,
569 <                        rseq.chars(), qd->qv);
570 <         }
571 <       else { //use custom read name
572 <         fprintf(f_out, "@%s%08d_x%d\n%s\n+\n%s\n", prefix.chars(), outcounter,
573 <                     qd->count, rseq.chars(), qd->qv);
574 <         }
575 <        }
576 <      }//for each element of dhash
577 <    if (maxdup_count>1) {
578 <      GMessage("Maximum read multiplicity: x %d (read: %s)\n",maxdup_count, maxdup_seq);
579 <      }
580 <   } //report collapsed dhash entries
581 < GMessage("Number of input reads: %9d\n", icounter);
582 < GMessage("       Output records: %9d\n", outcounter);
583 < if (trashReport) {
584 <    FWCLOSE(freport);
585 <    }
586 <
587 < FWCLOSE(f_out);
512 >    FWCLOSE(f_out);
513 >    FWCLOSE(f_out2);
514 >   } //while each input file
515 > delete gxmem_l;
516 > delete gxmem_r;
517   //getc(stdin);
518   }
519  
# Line 599 | Line 528
528     const char* seq;
529     bool valid;
530     NData() {
531 +    seqlen=0;
532      NCount=0;
533      end5=0;
534      end3=0;
# Line 629 | Line 559
559       perc_N=(n*100.0)/(end5-end3+1);
560       }
561   };
562 <
562 >
563   static NData feat;
564   int perc_lenN=12; // incremental distance from ends, in percentage of
565            // sequence length, where N-trimming is done (default:12 %) (autolimited to 20)
566 <          
566 >
567   void N_analyze(int l5, int l3, int p5, int p3) {
568   /* assumes feat was filled properly */
569   int old_dif, t5,t3,v;
570   if (l3<l5+2 || p5>p3 ) {
571     feat.end5=l5+1;
572     feat.end3=l3+1;
573 <   return;
573 >   return;
574     }
575  
576   t5=feat.NPos[p5]-l5;
577   t3=l3-feat.NPos[p3];
578   old_dif=p3-p5;
579   v=(int)((((double)(l3-l5))*perc_lenN)/100);
580 < if (v>20) v=20; /* enforce N-search limit for very long reads */
580 > if (v>20) v=20; /* enforce N-search limit for very long reads */
581   if (t5 < v ) {
582     l5=feat.NPos[p5]+1;
583     p5++;
# Line 664 | Line 594
594             feat.end3=l3+1;
595             return;
596             }
597 <    else
597 >    else
598        N_analyze(l5,l3, p5,p3);
599   }
600  
601  
602   bool qtrim(GStr& qvs, int &l5, int &l3) {
603 < if (qvtrim_min==0 || qvs.is_empty()) return false;
603 > if (qvtrim_qmin==0 || qvs.is_empty()) return false;
604   if (qv_phredtype==0) {
605    //try to guess the Phred type
606    int vmin=256, vmax=0;
# Line 683 | Line 613
613    if (qv_phredtype==0) {
614      GError("Error: couldn't determine Phred type, please use the -p33 or -p64 !\n");
615      }
616 <  } //guessing the Phred type
616 >  if (verbose)
617 >    GMessage("Input reads have Phred-%d quality values.\n", (qv_phredtype==33 ? 33 : 64));
618 >  } //guessing Phred type
619   for (l3=qvs.length()-1;l3>2;l3--) {
620 <  if (qvs[l3]-qv_phredtype>=qvtrim_min && qvs[l3-1]-qv_phredtype>=qvtrim_min) break;
620 >  if (qvs[l3]-qv_phredtype>=qvtrim_qmin && qvs[l3-1]-qv_phredtype>=qvtrim_qmin) break;
621    }
622   //just in case, check also the 5' the end (?)
623   for (l5=0;l5<qvs.length()-3;l5++) {
624 <  if (qvs[l5]-qv_phredtype>=qvtrim_min && qvs[l5+1]-qv_phredtype>=qvtrim_min) break;
624 >  if (qvs[l5]-qv_phredtype>=qvtrim_qmin && qvs[l5+1]-qv_phredtype>=qvtrim_qmin) break;
625 >  }
626 > if (qvtrim_max>0) {
627 >  if (qvs.length()-1-l3>qvtrim_max) l3=qvs.length()-1-qvtrim_max;
628 >  if (l5>qvtrim_max) l5=qvtrim_max;
629    }
630   return (l5>0 || l3<qvs.length()-1);
631   }
# Line 699 | Line 635
635   feat.init(rseq);
636   l5=feat.end5-1;
637   l3=feat.end3-1;
638 < N_analyze(feat.end5-1, feat.end3-1, 0, feat.NCount-1);
638 > N_analyze(feat.end5-1, feat.end3-1, 0, feat.NCount-1);
639   if (l5==feat.end5-1 && l3==feat.end3-1) {
640      if (feat.perc_N>max_perc_N) {
641             feat.valid=false;
# Line 717 | Line 653
653     return true;
654     }
655   feat.N_calc();
656 <
656 >
657   if (feat.perc_N>max_perc_N) {
658        feat.valid=false;
659        l3=l5+1;
# Line 729 | Line 665
665   //--------------- dust functions ----------------
666   class DNADuster {
667   public:
668 <  int dustword;
669 <  int dustwindow;
670 <  int dustwindow2;
668 >  int dustword;
669 >  int dustwindow;
670 >  int dustwindow2;
671    int dustcutoff;
672    int mv, iv, jv;
673    int counts[32*32*32];
# Line 826 | Line 762
762                      }
763             }
764           }
765 < //return first;
765 > //return first;
766   }
767   };
768  
# Line 844 | Line 780
780   return ncount;
781   }
782  
783 + struct SLocScore {
784 +  int pos;
785 +  int score;
786 +  SLocScore(int p=0,int s=0) {
787 +    pos=p;
788 +    score=s;
789 +    }
790 +  void set(int p, int s) {
791 +    pos=p;
792 +    score=s;
793 +    }
794 +  void add(int p, int add) {
795 +    pos=p;
796 +    score+=add;
797 +    }
798 + };
799  
800 < // ------------------ adapter matching - simple k-mer seed & extend, no indels for now
801 < //when a k-mer match is found, simply try to extend the alignment using a drop-off scheme
802 < //check minimum score and
803 < //for 3' adapter trimming:
804 < //     require that the right end of the alignment for either the adaptor OR the read must be
805 < //     < 3 distance from its right end
806 < // for 5' adapter trimming:
807 < //     require that the left end of the alignment for either the adaptor OR the read must
808 < //     be at coordinate < 3 from start
809 <
810 < bool extendMatch(const char* a, int alen, int ai,
811 <                 const char* b, int blen, int bi, int mlen, int& l5, int& l3, CSegChain& segs, bool end5=false) {
812 < //so the alignment starts at ai in a, bi in b, with a perfect match of length mlen
813 < #ifdef DEBUG
814 < GStr dbg(b);
863 < #endif
864 < //if (debug) {
865 < //  GMessage(">> in %s\n\textending hit: %s at position %d\n", a, (dbg.substr(bi, mlen)).chars(), ai);
866 < //  }
867 < int a_l=ai; //alignment coordinates on a
868 < int a_r=ai+mlen-1;
869 < int b_l=bi; //alignment coordinates on b
870 < int b_r=bi+mlen-1;
871 < int ai_maxscore=ai;
872 < int bi_maxscore=bi;
873 < int score=mlen*a_m_score;
874 < int maxscore=score;
875 < int mism5score=a_mis_score;
876 < if (end5 && ai<(alen>>1)) mism5score-=2; // increase penalty for mismatches at 5' end
877 < //try to extend to the left first, if possible
878 < while (ai>0 && bi>0) {
879 <   ai--;
880 <   bi--;
881 <   score+= (a[ai]==b[bi])? a_m_score : mism5score;
882 <   if (score>maxscore) {
883 <       ai_maxscore=ai;
884 <       bi_maxscore=bi;
885 <       maxscore=score;
886 <       }
887 <     else if (maxscore-score>a_dropoff_score) break;
888 <   }
889 < a_l=ai_maxscore;
890 < b_l=bi_maxscore;
891 < //if (debug) GMessage("  after l-extend: %*s%s\t\t(score=%d)\n",a_l," ",dbg.substr(b_l,b_r-b_l+1).chars(),maxscore);
892 < //now extend to the right
893 < ai_maxscore=a_r;
894 < bi_maxscore=b_r;
895 < ai=a_r;
896 < bi=b_r;
897 < score=maxscore;
898 < //sometimes there are extra AAAAs at the end of the read, ignore those
899 < if (strcmp(&a[alen-4],"AAAA")==0) {
900 <    alen-=3;
901 <    while (a[alen-1]=='A' && alen>ai) alen--;
902 <    }
903 < while (ai<alen-1 && bi<blen-1) {
904 <   ai++;
905 <   bi++;
906 <   //score+= (a[ai]==b[bi])? a_m_score : a_mis_score;
907 <   if (a[ai]==b[bi]) { //match
908 <      score+=a_m_score;
909 <      if (ai>=alen-2) {
910 <           score+=a_m_score-(alen-ai-1);
911 <           }
800 > bool trim_poly3(GStr &seq, int &l5, int &l3, const char* poly_seed) {
801 > if (!doPolyTrim) return false;
802 > int rlen=seq.length();
803 > l5=0;
804 > l3=rlen-1;
805 > int32 seedVal=*(int32*)poly_seed;
806 > char polyChar=poly_seed[0];
807 > //assumes N trimming was already done
808 > //so a poly match should be very close to the end of the read
809 > // -- find the initial match (seed)
810 > int lmin=GMAX((rlen-16), 0);
811 > int li;
812 > for (li=rlen-4;li>lmin;li--) {
813 >   if (seedVal==*(int*)&(seq[li])) {
814 >      break;
815        }
913    else { //mismatch
914      score+=a_mis_score;
915      }  
916   if (score>maxscore) {
917       ai_maxscore=ai;
918       bi_maxscore=bi;
919       maxscore=score;
920       }
921     else if (maxscore-score>a_dropoff_score) break;
816     }
817 <  a_r=ai_maxscore;
818 <  b_r=bi_maxscore;
819 <  int a_ovh3=alen-a_r-1;
820 <  int b_ovh3=blen-b_r-1;
821 <  int mmovh3=(a_ovh3<b_ovh3)? a_ovh3 : b_ovh3;
822 <  int mmovh5=(a_l<b_l)? a_l : b_l;
823 <  //if (debug) GMessage("  after r-extend: %*s%s\t\t(score=%d)\n",a_l," ",dbg.substr(b_l,b_r-b_l+1).chars(),maxscore);
824 < #ifdef DEBUG
825 <  if (debug) GMessage("     extended to: %*s\n",a_r+1,dbg.substr(b_l,b_r-b_l+1).chars());
826 < #endif
827 <  if (maxscore>=a_min_score && mmovh3<2 && mmovh5<2) {
828 <     if (a_l<a_ovh3) {
829 <        //adapter closer to the left end (typical for 5' adapter)
830 <        l5=a_r+1;
831 <        l3=alen-1;
832 <        }
833 <      else {
834 <        //adapter matching at the right end (typical for 3' adapter)
941 <        l5=0;
942 <        l3=a_l-1;
817 > if (li<=lmin) return false;
818 > //seed found, try to extend it both ways
819 > //extend right
820 > int ri=li+3;
821 > SLocScore loc(ri, poly_m_score<<2);
822 > SLocScore maxloc(loc);
823 > //extend right
824 > while (ri<rlen-1) {
825 >   ri++;
826 >   if (seq[ri]==polyChar) {
827 >                loc.add(ri,poly_m_score);
828 >                }
829 >   else if (seq[ri]=='N') {
830 >                loc.add(ri,0);
831 >                }
832 >   else { //mismatch
833 >        loc.add(ri,poly_mis_score);
834 >        if (maxloc.score-loc.score>poly_dropoff_score) break;
835          }
836 <     return true;
837 <     }
838 < else { //keep this segment pair for later (gapped alignment)
947 <   segs.addSegPair(new CSegPair(a_l+1, a_r+1, b_l+1, b_r+1, maxscore));
948 <   //this will also update min & max coordinates in segs (segs.astart, .aend, .bstart, .bend)
949 <   }
950 <  //do not trim:
951 <  l5=0;
952 <  l3=alen-1;
953 <  return false;
954 < }
955 <
956 < /*
957 < int getWordValue(const char* s, int wlen) {
958 < int r=0;
959 < while (wlen--) { r+=(((int)s[wlen])<<wlen) }
960 < return r;
961 < }
962 < */
963 < int get3mer_value(const char* s) {
964 < return (s[0]<<16)+(s[1]<<8)+s[2];
965 < }
966 <
967 < int w3_match(int qv, const char* str, int slen, int start_index=0) {
968 < if (start_index>=slen || start_index<0) return -1;
969 < for (int i=start_index;i<slen-3;i++) {
970 <   int rv=get3mer_value(str+i);
971 <   if (rv==qv) return i;
972 <   }
973 < return -1;
974 < }
975 <
976 < int w3_rmatch(int qv, const char* str, int slen, int end_index=-1) {
977 < if (end_index>=slen) return -1;
978 < if (end_index<0) end_index=slen-1;
979 < for (int i=end_index-2;i>=0;i--) {
980 <   int rv=get3mer_value(str+i);
981 <   if (rv==qv) return i;
982 <   }
983 < return -1;
984 < }
985 <
986 < int fast4match(int32 qv, const char* str, int slen, int start_index=0) {
987 < if (start_index>=slen || start_index<0) return -1;
988 < for (int i=start_index;i<slen-4;i++) {
989 <   int32* rv=(int32*)(str+i);
990 <   if (*rv==qv) return i;
991 <   }
992 < return -1;
993 < }
994 <
995 < int fast4rmatch(int32 qv, const char* str, int slen, int end_index=-1) {
996 < if (end_index>=slen) return -1;
997 < if (end_index<0) end_index=slen-1;
998 < for (int i=end_index-3;i>=0;i--) {
999 <   int32* rv=(int32*)(str+i);
1000 <   if (*rv==qv) return i;
1001 <   }
1002 < return -1;
1003 < }
1004 <
1005 < #ifdef DEBUG
1006 < void dbgPrintChain(CSegChain& chain, const char* aseq) {
1007 <  GStr s(aseq);
1008 <  for (int i=0;i<chain.Count();i++) {
1009 <   CSegPair& seg=*chain[i];
1010 <   GMessage("  dbg chain seg%d: %*s [%d-%d:%d-%d]\n",i,seg.a.start-1+seg.len(),
1011 <            s.substr(seg.b.start-1, seg.len()).chars(), seg.b.start,seg.b.end,seg.a.start,seg.a.end);
836 >   if (maxloc.score<=loc.score) {
837 >      maxloc=loc;
838 >      }
839     }
840 + ri=maxloc.pos;
841 + if (ri<rlen-6) return false; //no trimming wanted, too far from 3' end
842 + //ri = right boundary for the poly match
843 + //extend left
844 + loc.set(li, maxloc.score);
845 + maxloc.pos=li;
846 + while (li>0) {
847 +    li--;
848 +    if (seq[li]==polyChar) {
849 +                 loc.add(li,poly_m_score);
850 +                 }
851 +    else if (seq[li]=='N') {
852 +                 loc.add(li,0);
853 +                 }
854 +    else { //mismatch
855 +         loc.add(li,poly_mis_score);
856 +         if (maxloc.score-loc.score>poly_dropoff_score) break;
857 +         }
858 +    if (maxloc.score<=loc.score) {
859 +       maxloc=loc;
860 +       }
861 +    }
862 + li=maxloc.pos;
863 + if ((maxloc.score==poly_minScore && ri==rlen-1) ||
864 +    (maxloc.score>poly_minScore && ri>=rlen-3) ||
865 +    (maxloc.score>(poly_minScore*3) && ri>=rlen-8)) {
866 +  //trimming this li-ri match at 3' end
867 +    l3=li-1;
868 +    if (l3<0) l3=0;
869 +    return true;
870 +    }
871 + return false;
872   }
1014 #endif
873  
874 < bool trim_adapter3(GStr& seq, int&l5, int &l3) {
874 > bool trim_poly5(GStr &seq, int &l5, int &l3, const char* poly_seed) {
875 > if (!doPolyTrim) return false;
876   int rlen=seq.length();
877   l5=0;
878   l3=rlen-1;
879 < //first try a full match, we might get lucky
880 < int fi=-1;
881 < if ((fi=seq.index(adapter3))>=0) {
882 <   if (fi<rlen-fi-a3len) {//match is closer to the right end
883 <      l5=fi+a3len;
884 <      l3=rlen-1;
885 <      }
886 <    else {
887 <      l5=0;
888 <      l3=fi-1;
879 > int32 seedVal=*(int32*)poly_seed;
880 > char polyChar=poly_seed[0];
881 > //assumes N trimming was already done
882 > //so a poly match should be very close to the end of the read
883 > // -- find the initial match (seed)
884 > int lmax=GMIN(12, rlen-4);//how far from 5' end to look for 4-mer seeds
885 > int li;
886 > for (li=0;li<=lmax;li++) {
887 >   if (seedVal==*(int*)&(seq[li])) {
888 >      break;
889        }
1031   return true;
890     }
891 < #ifdef DEBUG
892 < if (debug) GMessage(">TRIM3 >>   Read: %s\n",seq.chars());
893 < #endif
894 <
895 < //also, for fast detection of other adapter-only reads that start past
896 < // the beginning of the adapter sequence, try to see if the first a3len-4
897 < // bases of the read are a substring of the adapter
898 < if (rlen>a3len-3) {
899 <   GStr rstart=seq.substr(1,a3len-4);
900 <   if ((fi=adapter3.index(rstart))>=0) {
901 <     l3=rlen-1;
902 <     l5=a3len-4;
903 <     while (fi+l5<a3len && l5<l3 && adapter3[fi+l5]==seq[l5]) l5++;
904 <     return true;
905 <     }
906 <  }
907 < CSegChain a3segs; //no chains here, just an ordered collection of segment pairs
908 <  //check the easy cases - 11 bases exact match at the end
909 < int fdlen=11;
910 <  if (a3len<16) {
1053 <   fdlen=a3len>>1;
1054 <   }
1055 < if (fdlen>4) {
1056 <     //check if we're lucky enough to have the last 11 bases of the read a part of the adapter
1057 <     GStr rstart=seq.substr(-fdlen-3,fdlen);
1058 <     if ((fi=adapter3.index(rstart))>=0) {
1059 < #ifdef DEBUG
1060 <       if (debug) GMessage("  W11match found: %*s\n", rlen-3, (adapter3.substr(fi,fdlen)).chars());
1061 < #endif
1062 <       if (extendMatch(seq.chars(), rlen, rlen-fdlen-3,
1063 <                     adapter3.chars(), a3len, fi,  fdlen, l5,l3, a3segs))
1064 <            return true;
1065 <       }
1066 <     //another easy case: first 11 characters of the adaptor found as a substring of the read
1067 <     GStr bstr=adapter3.substr(0, fdlen);
1068 <     if ((fi=seq.rindex(bstr))>=0) {
1069 < #ifdef DEBUG
1070 <       if (debug) GMessage("  A11match found: %*s\n", fi+fdlen, bstr.chars());
1071 < #endif
1072 <       if (extendMatch(seq.chars(), rlen, fi,
1073 <                     adapter3.chars(), a3len, 0,  fdlen, l5,l3, a3segs))
1074 <            return true;
891 > if (li>lmax) return false;
892 > //seed found, try to extend it both ways
893 > //extend left
894 > int ri=li+3; //save rightmost base of the seed
895 > SLocScore loc(li, poly_m_score<<2);
896 > SLocScore maxloc(loc);
897 > while (li>0) {
898 >    li--;
899 >    if (seq[li]==polyChar) {
900 >                 loc.add(li,poly_m_score);
901 >                 }
902 >    else if (seq[li]=='N') {
903 >                 loc.add(li,0);
904 >                 }
905 >    else { //mismatch
906 >         loc.add(li,poly_mis_score);
907 >         if (maxloc.score-loc.score>poly_dropoff_score) break;
908 >         }
909 >    if (maxloc.score<=loc.score) {
910 >       maxloc=loc;
911         }
912 <     } //tried to match 11 bases first
913 <    
914 < //no easy cases, so let's do the wmer hashing for the first 12 bases of the adaptor
915 < //-- only extend if the match is in the 3' (ending) region of the read
916 < int wordSize=3;
917 < int hlen=12;
918 < if (hlen>a3len-wordSize) hlen=a3len-wordSize;
919 < int imin=rlen>>1; //last half of the read, left boundary for the wmer match
920 < if (imin<a3len) { imin=GMIN(a3len, rlen-wordSize); }
921 < imin=rlen-imin;
922 < for (int iw=0;iw<hlen;iw++) {
923 <   //int32* qv=(int32*)(adapter3.chars()+iw);
924 <   int qv=get3mer_value(adapter3.chars()+iw);
925 <   fi=-1;
926 <   //while ((fi=fast4rmatch(*qv, seq.chars(), rlen, fi))>=0 && fi>=imin) {
927 <   while ((fi=w3_rmatch(qv, seq.chars(), rlen, fi))>=0 && fi>=imin) {
928 <     //GMessage(" ... fi=%d after w3_rmatch() (imin=%d)\n", fi, imin);
929 <
930 < #ifdef DEBUG
1095 <     if (debug) GMessage("    Wmatch found: %*s\n", fi+wordSize, (adapter3.substr(iw,wordSize)).chars());
1096 < #endif
1097 <     if (extendMatch(seq.chars(), rlen, fi, adapter3.chars(),
1098 <                   a3len, iw, wordSize, l5,l3, a3segs)) return true;
1099 <     fi--;
1100 <     if (fi<imin) break;
1101 <     }
1102 <   } //for each wmer in the first hlen bases of the adaptor
1103 < /*
1104 < //couldn't find a good trimming extension, hash 12 more bases of the adapter to collect more segment pairs there
1105 < //but only do this if we already have segment pairs collected in the last 12 bases of the adapter
1106 < if (a3segs.bstart>3 || a3segs.bend<(uint)(hlen-wordSize)) return false;
1107 < int hlen2=a3len-wordSize;
1108 < //if (hlen2>a3len-4) hlen2=a3len-4;
1109 < if (hlen2>hlen) {
1110 < #ifdef DEBUG
1111 <     if (debug && a3segs.Count()>0) {
1112 <        GMessage("  >>>>>2nd. hash: %s\n",seq.chars());
912 >    }
913 > li=maxloc.pos;
914 > if (li>5) return false; //no trimming wanted, too far from 5' end
915 > //li = right boundary for the poly match
916 >
917 > //extend right
918 > loc.set(ri, maxloc.score);
919 > maxloc.pos=ri;
920 > while (ri<rlen-1) {
921 >   ri++;
922 >   if (seq[ri]==polyChar) {
923 >                loc.add(ri,poly_m_score);
924 >                }
925 >   else if (seq[ri]=='N') {
926 >                loc.add(ri,0);
927 >                }
928 >   else { //mismatch
929 >        loc.add(ri,poly_mis_score);
930 >        if (maxloc.score-loc.score>poly_dropoff_score) break;
931          }
932 < #endif
933 <     for (int iw=hlen;iw<hlen2;iw++) {
1116 <         //int* qv=(int32 *)(adapter3.chars()+iw);
1117 <         int qv=get3mer_value(adapter3.chars()+iw);
1118 <         fi=-1;
1119 <         //while ((fi=fast4rmatch(*qv, seq.chars(), rlen, fi))>=0 && fi>=imin) {
1120 <         while ((fi=w3_rmatch(qv, seq.chars(), rlen, fi))>=0 && fi>=imin) {
1121 <           extendMatch(seq.chars(), rlen, fi, adapter3.chars(),
1122 <                         a3len, iw, wordSize, l5,l3, a3segs);
1123 <           fi--;
1124 <           if (fi<imin) break;
1125 <           }
1126 <         } //for each wmer between hlen2 and hlen bases of the adaptor
1127 <     }
1128 < //lastly, analyze collected a3segs for a possible gapped alignment:
1129 < GList<CSegChain> segchains(false,true,false);
1130 < #ifdef DEBUG
1131 < if (debug && a3segs.Count()>0) {
1132 <   GMessage(">>>>>>>>>   Read: %s\n",seq.chars());
1133 <   }
1134 < #endif
1135 < for (int i=0;i<a3segs.Count();i++) {
1136 <   if (a3segs[i]->chain==NULL) {
1137 <       if (a3segs[i]->b.start>3) continue; //don't start a hopeless chain
1138 <       CSegChain* newchain=new CSegChain();
1139 <       newchain->setFreeItem(false);
1140 <       newchain->addSegPair(a3segs[i]);
1141 <       a3segs[i]->chain=newchain;
1142 <       segchains.Add(newchain); //just to free them when done
1143 <       }
1144 <   for (int j=i+1;j<a3segs.Count();j++) {
1145 <      CSegChain* chain=a3segs[i]->chain;
1146 <      if (chain->extendChain(a3segs[j])) {
1147 <          a3segs[j]->chain=chain;
1148 < #ifdef DEBUG
1149 <          if (debug) dbgPrintChain(*chain, adapter3.chars());
1150 < #endif      
1151 <          //save time by checking here if the extended chain is already acceptable for trimming
1152 <          if (chain->aend>(uint)(rlen-4) && chain->bstart<4 && chain->score>a_min_chain_score) {
1153 <            l5=0;
1154 <            l3=chain->astart-2;
1155 < #ifdef DEBUG
1156 <          if (debug && a3segs.Count()>0) {
1157 <            GMessage(">>> >> trimmed-3: %*s\n",l3-l5+1,seq.substr(l5,l3-l5+1).chars());
1158 <            }
1159 < #endif
1160 <            return true;
1161 <            }
1162 <          } //chain can be extended
932 >   if (maxloc.score<=loc.score) {
933 >      maxloc=loc;
934        }
935 <   } //collect segment alignments into chains
936 < */  
937 < return false; //no adapter parts found
935 >   }
936 > ri=maxloc.pos;
937 > if ((maxloc.score==poly_minScore && li==0) ||
938 >     (maxloc.score>poly_minScore && li<2)
939 >     || (maxloc.score>(poly_minScore*3) && li<8)) {
940 >    //adjust l5 to reflect this trimming of 5' end
941 >    l5=ri+1;
942 >    if (l5>rlen-1) l5=rlen-1;
943 >    return true;
944 >    }
945 > return false;
946 > }
947 >
948 > bool trim_adapter3(GStr& seq, int&l5, int &l3) {
949 > if (adapters3.Count()==0) return false;
950 > int rlen=seq.length();
951 > l5=0;
952 > l3=rlen-1;
953 > bool trimmed=false;
954 > GStr wseq(seq);
955 > int wlen=rlen;
956 > GXSeqData seqdata;
957 > for (int ai=0;ai<adapters3.Count();ai++) {
958 >  seqdata.update(adapters3[ai].seq.chars(), adapters3[ai].seq.length(),
959 >       adapters3[ai].pz, wseq.chars(), wlen, adapters3[ai].amlen);
960 >  GXAlnInfo* bestaln=match_RightEnd(seqdata, gxmem_r, 86);
961 >  if (bestaln) {
962 >     trimmed=true;
963 >     //keep unmatched region on the left OR right (the longer one)
964 >     if (bestaln->sl > wlen-bestaln->sr) {
965 >         //keep left side
966 >         l3-=(wlen-bestaln->sl+1);
967 >         if (l3<0) l3=0;
968 >         }
969 >     else { //keep right side
970 >         l5+=bestaln->sr;
971 >         if (l5>=rlen) l5=rlen-1;
972 >         }
973 >     delete bestaln;
974 >     if (l3-l5+1<min_read_len) return true;
975 >     wseq=seq.substr(l5,l3-l5+1);
976 >     wlen=wseq.length();
977 >     }
978 >  }//for each 5' adapter
979 >  return trimmed;
980   }
981  
982   bool trim_adapter5(GStr& seq, int&l5, int &l3) {
983 < //if (debug) GMessage("trim_adapter5 on: %s\n", seq.chars());
983 > if (adapters5.Count()==0) return false;
984   int rlen=seq.length();
985   l5=0;
986   l3=rlen-1;
987 < //try to see if adapter is fully included in the read
988 < int fi=-1;
989 < if ((fi=seq.index(adapter5))>=0) {
990 <   if (fi<rlen-fi-a5len) {//match is closer to the right end
991 <      l5=fi+a5len;
992 <      l3=rlen-1;
993 <      }
994 <    else {
995 <      l5=0;
996 <      l3=fi-1;
987 > bool trimmed=false;
988 > GStr wseq(seq);
989 > int wlen=rlen;
990 > GXSeqData seqdata;
991 > for (int ai=0;ai<adapters5.Count();ai++) {
992 >   seqdata.update(adapters5[ai].seq.chars(), adapters5[ai].seq.length(),
993 >       adapters5[ai].pz, wseq.chars(), wlen, adapters5[ai].amlen);
994 >  GXAlnInfo* bestaln=match_LeftEnd(seqdata, gxmem_l, 90);
995 >  if (bestaln) {
996 >     trimmed=true;
997 >     if (bestaln->sl-1 > wlen-bestaln->sr) {
998 >         //keep left side
999 >         l3-=(wlen-bestaln->sl+1);
1000 >         if (l3<0) l3=0;
1001 >         }
1002 >     else { //keep right side
1003 >         l5+=bestaln->sr;
1004 >         if (l5>=rlen) l5=rlen-1;
1005 >         }
1006 >     delete bestaln;
1007 >     if (l3-l5+1<min_read_len) return true;
1008 >     wseq=seq.substr(l5,l3-l5+1);
1009 >     wlen=wseq.length();
1010 >     }
1011 >  }//for each 5' adapter
1012 >  return trimmed;
1013 > }
1014 >
1015 > //convert qvs to/from phred64 from/to phread33
1016 > void convertPhred(GStr& q) {
1017 > for (int i=0;i<q.length();i++) q[i]+=qv_cvtadd;
1018 > }
1019 >
1020 > void convertPhred(char* q, int len) {
1021 > for (int i=0;i<len;i++) q[i]+=qv_cvtadd;
1022 > }
1023 >
1024 > bool getFastxRec(GLineReader& fq, GStr& rseq, GStr& rqv,
1025 >          GStr& rname, GStr& rinfo, GStr& infname) {
1026 > rseq="";
1027 > rqv="";
1028 > rname="";
1029 > rinfo="";
1030 > if (fq.eof()) return false;
1031 > char* l=fq.getLine();
1032 > while (l!=NULL && (l[0]==0 || isspace(l[0]))) l=fq.getLine(); //ignore empty lines
1033 > if (l==NULL) return false;
1034 > /* if (rawFormat) {
1035 >      //TODO: implement raw qseq parsing here
1036 >      //if (raw type=N) then continue; //skip invalid/bad records
1037 >      } //raw qseq format
1038 > else { // FASTQ or FASTA */
1039 > isfasta=(l[0]=='>');
1040 > if (!isfasta && l[0]!='@') GError("Error: fasta/fastq record marker not found(%s)\n%s\n",
1041 >      infname.chars(), l);
1042 > GStr s(l);
1043 > rname=&(l[1]);
1044 > for (int i=0;i<rname.length();i++)
1045 >    if (rname[i]<=' ') {
1046 >       if (i<rname.length()-2) rinfo=rname.substr(i+1);
1047 >       rname.cut(i);
1048 >       break;
1049 >       }
1050 >  //now get the sequence
1051 > if ((l=fq.getLine())==NULL)
1052 >      GError("Error: unexpected EOF after header for read %s (%s)\n",
1053 >                   rname.chars(), infname.chars());
1054 > rseq=l; //this must be the DNA line
1055 > while ((l=fq.getLine())!=NULL) {
1056 >      //seq can span multiple lines
1057 >      if (l[0]=='>' || l[0]=='+') {
1058 >           fq.pushBack();
1059 >           break; //
1060 >           }
1061 >      rseq+=l;
1062 >      } //check for multi-line seq
1063 > if (!isfasta) { //reading fastq quality values, which can also be multi-line
1064 >    if ((l=fq.getLine())==NULL)
1065 >        GError("Error: unexpected EOF after sequence for %s\n", rname.chars());
1066 >    if (l[0]!='+') GError("Error: fastq qv header marker not detected!\n");
1067 >    if ((l=fq.getLine())==NULL)
1068 >        GError("Error: unexpected EOF after qv header for %s\n", rname.chars());
1069 >    rqv=l;
1070 >    //if (rqv.length()!=rseq.length())
1071 >    //  GError("Error: qv len != seq len for %s\n", rname.chars());
1072 >    while (rqv.length()<rseq.length() && ((l=fq.getLine())!=NULL)) {
1073 >      rqv+=l; //append to qv string
1074        }
1075 <   return true;
1076 <   }
1077 < #ifdef DEBUG
1078 < if (debug) GMessage(">TRIM5 >>   Read: %s\n",seq.chars());
1075 >    }// fastq
1076 > // } //<-- FASTA or FASTQ
1077 > rseq.upper();
1078 > return true;
1079 > }
1080 >
1081 > #ifdef GDEBUG
1082 > void showTrim(GStr& s, int l5, int l3) {
1083 >  if (l5>0 || l3==0) {
1084 >    color_bg(c_red);
1085 >    }
1086 >  for (int i=0;i<s.length()-1;i++) {
1087 >    if (i && i==l5) color_resetbg();
1088 >    fprintf(stderr, "%c", s[i]);
1089 >    if (i && i==l3) color_bg(c_red);
1090 >   }
1091 >  fprintf(stderr, "%c", s[s.length()-1]);
1092 >  color_reset();
1093 >  fprintf(stderr, "\n");
1094 > }
1095   #endif
1096  
1097 < CSegChain a5segs(true); //list of segment pairs to analyze later if no extendMatch succeeded
1097 > char process_read(GStr& rname, GStr& rseq, GStr& rqv, int &l5, int &l3) {
1098 > //returns 0 if the read was untouched, 1 if it was just trimmed
1099 > // and a trash code if it was trashed
1100 > l5=0;
1101 > l3=rseq.length()-1;
1102 > #ifdef GDEBUG
1103 >   //rseq.reverse();
1104 >   GMessage(">%s\n", rname.chars());
1105 >   GMessage("%s\n",rseq.chars());
1106 > #endif
1107 > if (l3-l5+1<min_read_len) {
1108 >   return 's';
1109 >   }
1110 > GStr wseq(rseq.chars());
1111 > GStr wqv(rqv.chars());
1112 > int w5=l5;
1113 > int w3=l3;
1114 > //first do the q-based trimming
1115 > if (qvtrim_qmin!=0 && !wqv.is_empty() && qtrim(wqv, w5, w3)) { // qv-threshold trimming
1116 >   if (w3-w5+1<min_read_len) {
1117 >     return 'Q'; //invalid read
1118 >     }
1119 >    //-- keep only the w5..w3 range
1120 >   l5=w5;
1121 >   l3=w3;
1122 >   wseq=wseq.substr(w5, w3-w5+1);
1123 >   if (!wqv.is_empty())
1124 >      wqv=wqv.substr(w5, w3-w5+1);
1125 >   } //qv trimming
1126 > // N-trimming on the remaining read seq
1127 > if (ntrim(wseq, w5, w3)) {
1128 >   //GMessage("before: %s\n",wseq.chars());
1129 >   //GMessage("after : %s (%d)\n",wseq.substr(w5,w3-w5+1).chars(),w3-w5+1);
1130 >   l5+=w5;
1131 >   l3-=(wseq.length()-1-w3);
1132 >   if (w3-w5+1<min_read_len) {
1133 >     return 'N'; //to be trashed
1134 >     }
1135 >    //-- keep only the w5..w3 range
1136 >   wseq=wseq.substr(w5, w3-w5+1);
1137 >   if (!wqv.is_empty())
1138 >      wqv=wqv.substr(w5, w3-w5+1);
1139 >   w5=0;
1140 >   w3=wseq.length()-1;
1141 >   }
1142 > char trim_code;
1143 > do {
1144 >  trim_code=0;
1145 >  if (trim_poly5(wseq, w5, w3, polyA_seed)) {
1146 >      trim_code='A';
1147 >      }
1148 >  else if (trim_poly5(wseq, w5, w3, polyT_seed)) {
1149 >      trim_code='T';
1150 >      }
1151 >  else if (trim_adapter5(wseq, w5, w3)) {
1152 >      trim_code='5';
1153 >      }
1154 >  if (trim_code) {
1155 >     #ifdef GDEBUG
1156 >      GMessage("#### TRIM by '%c' code ( w5-w3 = %d-%d ):\n",trim_code, w5,w3);
1157 >      showTrim(wseq, w5, w3);
1158 >     #endif
1159 >     int trimlen=wseq.length()-(w3-w5+1);
1160 >     num_trimmed5++;
1161 >     if (trimlen<min_trimmed5)
1162 >         min_trimmed5=trimlen;
1163 >     l5+=w5;
1164 >     l3-=(wseq.length()-1-w3);
1165 >     if (w3-w5+1<min_read_len) {
1166 >         return trim_code;
1167 >         }
1168 >      //-- keep only the w5..w3 range
1169 >      wseq=wseq.substr(w5, w3-w5+1);
1170 >      if (!wqv.is_empty())
1171 >         wqv=wqv.substr(w5, w3-w5+1);
1172 >      }// trimmed at 5' end
1173 > } while (trim_code);
1174 >
1175 > do {
1176 >  trim_code=0;
1177 >  if (trim_poly3(wseq, w5, w3, polyA_seed)) {
1178 >      trim_code='A';
1179 >      }
1180 >  else if (trim_poly3(wseq, w5, w3, polyT_seed)) {
1181 >      trim_code='T';
1182 >      }
1183 >  else if (trim_adapter3(wseq, w5, w3)) {
1184 >      trim_code='3';
1185 >      }
1186 >  if (trim_code) {
1187 >     #ifdef GDEBUG
1188 >     GMessage("#### TRIM by '%c' code ( w5-w3 = %d-%d ):\n",trim_code, w5,w3);
1189 >     showTrim(wseq, w5, w3);
1190 >     #endif
1191 >     int trimlen=wseq.length()-(w3-w5+1);
1192 >     num_trimmed3++;
1193 >     if (trimlen<min_trimmed3)
1194 >         min_trimmed3=trimlen;
1195 >     l5+=w5;
1196 >     l3-=(wseq.length()-1-w3);
1197 >     if (w3-w5+1<min_read_len) {
1198 >         return trim_code;
1199 >         }
1200 >      //-- keep only the w5..w3 range
1201 >      wseq=wseq.substr(w5, w3-w5+1);
1202 >      if (!wqv.is_empty())
1203 >         wqv=wqv.substr(w5, w3-w5+1);
1204 >      }//trimming at 3' end
1205 > } while (trim_code);
1206 >
1207 >
1208 > if (doCollapse) {
1209 >   //keep read for later
1210 >   FqDupRec* dr=dhash.Find(wseq.chars());
1211 >   if (dr==NULL) { //new entry
1212 >          //if (prefix.is_empty())
1213 >             dhash.Add(wseq.chars(),
1214 >                  new FqDupRec(&wqv, rname.chars()));
1215 >          //else dhash.Add(wseq.chars(), new FqDupRec(wqv.chars(),wqv.length()));
1216 >         }
1217 >      else
1218 >         dr->add(wqv);
1219 >   } //collapsing duplicates
1220 > else { //not collapsing duplicates
1221 >   //apply the dust filter now
1222 >   if (doDust) {
1223 >     int dustbases=dust(wseq);
1224 >     if (dustbases>(wseq.length()>>1)) {
1225 >        return 'D';
1226 >        }
1227 >     }
1228 >   } //not collapsing duplicates
1229 > return (l5>0 || l3<rseq.length()-1) ? 1 : 0;
1230 > }
1231 >
1232 > void printHeader(FILE* f_out, char recmarker, GStr& rname, GStr& rinfo) {
1233 > //GMessage("printing Header..%c%s\n",recmarker, rname.chars());
1234 > if (rinfo.is_empty()) fprintf(f_out, "%c%s\n",recmarker,rname.chars());
1235 >  else fprintf(f_out, "%c%s %s\n",recmarker, rname.chars(), rinfo.chars());
1236 > }
1237  
1238 < //try the easy way out first - look for an exact match of 11 bases
1239 < int fdlen=11;
1240 <  if (a5len<16) {
1241 <   fdlen=a5len>>1;
1242 <   }
1243 < if (fdlen>4) {
1244 <     GStr rstart=seq.substr(1,fdlen); //skip the first base as it's sometimes bogus
1200 <     if ((fi=adapter5.index(rstart))>=0) {
1201 < #ifdef DEBUG
1202 <       if (debug) GMessage("  W11match found: %*s\n", 1+fdlen, (adapter3.substr(fi,fdlen)).chars());
1203 < #endif
1204 <       if (extendMatch(seq.chars(), rlen, 1,
1205 <                     adapter5.chars(), a5len, fi,  fdlen, l5,l3, a5segs, true))
1206 <           return true;
1238 > void writeRead(FILE* f_out, GStr& rname, GStr& rinfo, GStr& rseq, GStr& rqv, int& outcounter) {
1239 >   outcounter++;
1240 >   bool asFasta=(rqv.is_empty() || fastaOutput);
1241 >   if (asFasta) {
1242 >    if (prefix.is_empty()) {
1243 >       printHeader(f_out, '>',rname,rinfo);
1244 >       fprintf(f_out, "%s\n", rseq.chars()); //plain one-line fasta for now
1245         }
1246 <     //another easy case: last 11 characters of the adaptor found as a substring of the read
1247 <     GStr bstr=adapter5.substr(-fdlen);
1248 <     if ((fi=seq.index(bstr))>=0) {
1211 < #ifdef DEBUG
1212 <       if (debug) GMessage("  A11match found: %*s\n", fi+fdlen, bstr.chars());
1213 < #endif
1214 <       if (extendMatch(seq.chars(), rlen, fi,
1215 <                     adapter5.chars(), a5len, a5len-fdlen,  fdlen, l5,l3,a5segs,true))
1216 <          return true;
1246 >      else {
1247 >       fprintf(f_out, ">%s%08d\n%s\n", prefix.chars(), outcounter,
1248 >                          rseq.chars());
1249         }
1218     } //tried to matching at most 11 bases first
1219    
1220 //-- no easy cases, do the wmer hashing for the last 12 bases of the adaptor
1221 //-- only extend a wmer if it matches in the 5' (beginning) region of the read
1222 int wordSize=3;
1223 int hlen=12;
1224 if (hlen>a5len-wordSize) hlen=a5len-wordSize;
1225 int imax=rlen>>1; //first half of the read, right boundary for the wmer match
1226 if (imax<a5len) { imax=GMIN(a5len, rlen-wordSize); }
1227 for (int iw=0;iw<=hlen;iw++) {
1228   int apstart=a5len-iw-wordSize;
1229   fi=0;
1230   //int* qv=(int32 *)(adapter5.chars()+apstart);
1231   int qv=get3mer_value(adapter5.chars()+apstart);
1232   //while ((fi=fast4match(*qv, seq.chars(), rlen, fi))>=0 && fi<=imax) {
1233   while ((fi=w3_match(qv, seq.chars(), rlen, fi))>=0 && fi<=imax) {
1234 #ifdef DEBUG
1235     if (debug) GMessage("    Wmatch found: %*s\n", fi+wordSize, (adapter5.substr(apstart,wordSize)).chars());
1236 #endif
1237     if (extendMatch(seq.chars(), rlen, fi, adapter5.chars(),
1238                a5len, apstart, wordSize, l5,l3, a5segs, true)) return true;
1239     fi++;
1240     if (fi>imax) break;
1241     }
1242   } //for each wmer in the last hlen bases of the adaptor
1243 /*
1244
1245 //couldn't find a good trimming extension, hash 12 more bases of the adapter to collect more segment pairs there
1246 //but only do this if we already have segment pairs collected in the last 12 bases of the adapter
1247 if (a5segs.bend<(uint)(a5len-3) || a5segs.bstart>(uint)(a5len-hlen+4)) return false;
1248 int hlen2=a5len-wordSize;
1249 //if (hlen2>a5len-wordSize) hlen2=a5len-wordSize;
1250 #ifdef DEBUG
1251      if (debug && a5segs.Count()>0) {
1252        GMessage("  >>>>>2nd. hash: %s\n",seq.chars());
1253        }
1254 #endif
1255 if (hlen2>hlen) {
1256     for (int iw=hlen+1;iw<=hlen2;iw++) {
1257         int apstart=a5len-iw-wordSize;
1258         fi=0;
1259         //int* qv=(int32 *)(adapter5.chars()+apstart);
1260         int qv=get3mer_value(adapter5.chars()+apstart);
1261         //while ((fi=fast4match(*qv, seq.chars(), rlen, fi))>=0 && fi<=imax) {
1262         while ((fi=w3_match(qv, seq.chars(), rlen, fi))>=0 && fi<=imax) {
1263           extendMatch(seq.chars(), rlen, fi, adapter5.chars(),
1264                      a5len, apstart, wordSize, l5,l3, a5segs, true);
1265           fi++;
1266           if (fi>imax) break;
1267           }
1268         } //for each wmer between hlen2 and hlen bases of the adaptor
1250       }
1251 < if (a5segs.bend<(uint)(a5len-3) || a5segs.bstart>(uint)(a5len-hlen+4)) return false;
1252 < // lastly, analyze collected a5segs for a possible gapped alignment:
1253 < GList<CSegChain> segchains(false,true,false);
1254 < #ifdef DEBUG
1255 < if (debug && a5segs.Count()>0) {
1275 <   GMessage(">>>>>>>>>   Read: %s\n",seq.chars());
1276 <   }
1277 < #endif
1278 < for (int i=0;i<a5segs.Count();i++) {
1279 <   if (a5segs[i]->chain==NULL) {
1280 <       if (a5segs[i]->b.end<(int)(a5len-4)) continue; //don't start a hopeless chain
1281 <       CSegChain* newchain=new CSegChain(true);
1282 <       newchain->setFreeItem(false);
1283 <       newchain->addSegPair(a5segs[i]);
1284 <       a5segs[i]->chain=newchain;
1285 <       segchains.Add(newchain); //just to free them when done
1251 >   else {  //fastq
1252 >    if (convert_phred) convertPhred(rqv);
1253 >    if (prefix.is_empty()) {
1254 >       printHeader(f_out, '@', rname, rinfo);
1255 >       fprintf(f_out, "%s\n+\n%s\n", rseq.chars(), rqv.chars());
1256         }
1257 <   for (int j=i+1;j<a5segs.Count();j++) {
1258 <      CSegChain* chain=a5segs[i]->chain;
1259 <      if (chain->extendChain(a5segs[j])) {
1260 <         a5segs[j]->chain=chain;
1261 < #ifdef DEBUG
1262 <         if (debug) dbgPrintChain(*chain, adapter5.chars());
1263 < #endif      
1264 <      //save time by checking here if the extended chain is already acceptable for trimming
1265 <         if (chain->bend>(uint)(a5len-3) && chain->astart<4 && chain->score>a_min_chain_score) {
1266 <            l5=chain->aend;
1267 <            l3=rlen-1;
1268 <            return true;
1269 <            }
1270 <         } //chain can be extended
1271 <      }
1272 <   } //collect segment alignments into chains
1273 < */
1274 < return false; //no adapter parts found
1257 >      else
1258 >       fprintf(f_out, "@%s_%08d\n%s\n+\n%s\n", prefix.chars(), outcounter,
1259 >                          rseq.chars(),rqv.chars() );
1260 >     }
1261 > }
1262 >
1263 > void trash_report(char trashcode, GStr& rname, FILE* freport) {
1264 > if (freport==NULL || trashcode<=' ') return;
1265 > if (trashcode=='3' || trashcode=='5') {
1266 >   fprintf(freport, "%s\ta%c\n",rname.chars(),trashcode);
1267 >   }
1268 > else {
1269 >   fprintf(freport, "%s\t%c\n",rname.chars(),trashcode);
1270 >   }
1271 > //tcounter++;
1272 > }
1273 >
1274 > GStr getFext(GStr& s, int* xpos=NULL) {
1275 > //s must be a filename without a path
1276 > GStr r("");
1277 > if (xpos!=NULL) *xpos=0;
1278 > if (s.is_empty() || s=="-") return r;
1279 > int p=s.rindex('.');
1280 > int d=s.rindex('/');
1281 > if (p<=0 || p>s.length()-2 || p<s.length()-7 || p<d) return r;
1282 > r=s.substr(p+1);
1283 > if (xpos!=NULL) *xpos=p+1;
1284 > r.lower();
1285 > return r;
1286   }
1287  
1288 < //convert qvs to/from phred64 from/to phread33
1289 < void convertPhred(GStr& q) {
1290 < for (int i=0;i<q.length();i++) q[i]+=qv_cvtadd;
1288 > void baseFileName(GStr& fname) {
1289 > //remove all known extensions, like .txt,fq,fastq,fasta,fa)(.gz .gzip .bz2 .bzip2) .
1290 > int xpos=0;
1291 > GStr fext=getFext(fname, &xpos);
1292 > if (xpos<=1) return;
1293 > bool extdel=false;
1294 > GStr f2;
1295 > if (fext=="z" || fext=="zip") {
1296 >   extdel=true;
1297 >   }
1298 >  else if (fext.length()>=2) {
1299 >   f2=fext.substr(0,2);
1300 >   extdel=(f2=="gz" || f2=="bz");
1301 >   }
1302 > if (extdel) {
1303 >   fname.cut(xpos-1);
1304 >   fext=getFext(fname, &xpos);
1305 >   if (xpos<=1) return;
1306 >   }
1307 > extdel=false;
1308 > if (fext=="f" || fext=="fq" || fext=="txt" || fext=="seq" || fext=="sequence") {
1309 >   extdel=true;
1310 >   }
1311 >  else if (fext.length()>=2) {
1312 >   extdel=(fext.substr(0,2)=="fa");
1313 >   }
1314 > if (extdel) fname.cut(xpos-1);
1315 > GStr fncp(fname);
1316 > fncp.lower();
1317 > fncp.chomp("seq");
1318 > fncp.chomp("sequence");
1319 > fncp.trimR("_.");
1320 > if (fncp.length()<fname.length()) fname.cut(fncp.length());
1321   }
1322  
1323 < void convertPhred(char* q, int len) {
1324 < for (int i=0;i<len;i++) q[i]+=qv_cvtadd;
1323 > FILE* prepOutFile(GStr& infname, GStr& pocmd) {
1324 >  FILE* f_out=NULL;
1325 >  GStr fname(getFileName(infname.chars()));
1326 >  //eliminate known extensions
1327 >  baseFileName(fname);
1328 >  if (outsuffix.is_empty() || outsuffix=="-") { return stdout; }
1329 >    else if (pocmd.is_empty()) {
1330 >               GStr oname(fname);
1331 >               oname.append('.');
1332 >               oname.append(outsuffix);
1333 >               f_out=fopen(oname.chars(),"w");
1334 >               if (f_out==NULL) GError("Error: cannot create '%s'\n",oname.chars());
1335 >               }
1336 >            else {
1337 >              GStr oname(">");
1338 >              oname.append(fname);
1339 >              oname.append('.');
1340 >              oname.append(outsuffix);
1341 >              pocmd.append(oname);
1342 >              f_out=popen(pocmd.chars(), "w");
1343 >              if (f_out==NULL) GError("Error: cannot popen '%s'\n",pocmd.chars());
1344 >              }
1345 > return f_out;
1346 > }
1347 >
1348 > void guess_unzip(GStr& fname, GStr& picmd) {
1349 > GStr fext=getFext(fname);
1350 > if (fext=="gz" || fext=="gzip" || fext=="z") {
1351 >    picmd="gzip -cd ";
1352 >    }
1353 >   else if (fext=="bz2" || fext=="bzip2" || fext=="bz" || fext=="bzip") {
1354 >    picmd="bzip2 -cd ";
1355 >    }
1356 > }
1357 >
1358 > void addAdapter(GVec<CASeqData>& adapters, GStr& seq) {
1359 > //TODO: prepare CASeqData here, and collect hexamers as well
1360 > CASeqData adata(revCompl);
1361 > int idx=adapters.Add(adata);
1362 > if (idx<0) GError("Error: failed to add adaptor!\n");
1363 > adapters[idx].update(seq.chars());
1364   }
1365  
1366 +
1367 + int loadAdapters(const char* fname) {
1368 +  GLineReader lr(fname);
1369 +  char* l;
1370 +  while ((l=lr.nextLine())!=NULL) {
1371 +   if (lr.length()<=3 || l[0]=='#') continue;
1372 +   if ( l[0]==' ' || l[0]=='\t' || l[0]==',' ||
1373 +        l[0]==';'|| l[0]==':' ) {
1374 +      int i=1;
1375 +      while (l[i]!=0 && isspace(l[i])) {
1376 +        i++;
1377 +        }
1378 +      if (l[i]!=0) {
1379 +        GStr s(&(l[i]));
1380 +      #ifdef GDEBUG
1381 +          //s.reverse();
1382 +      #endif
1383 +        addAdapter(adapters3, s);
1384 +        continue;
1385 +        }
1386 +      }
1387 +    else {
1388 +      GStr s(l);
1389 +      s.startTokenize("\t ;,:");
1390 +      GStr a5,a3;
1391 +      if (s.nextToken(a5))
1392 +         s.nextToken(a3);
1393 +      a5.upper();
1394 +      a3.upper();
1395 +     #ifdef GDEBUG
1396 +     //   a5.reverse();
1397 +     //   a3.reverse();
1398 +     #endif
1399 +      addAdapter(adapters5, a5);
1400 +      addAdapter(adapters3, a3);
1401 +      }
1402 +   }
1403 +   return adapters5.Count()+adapters3.Count();
1404 + }
1405 +
1406 + void setupFiles(FILE*& f_in, FILE*& f_in2, FILE*& f_out, FILE*& f_out2,
1407 +                       GStr& s, GStr& infname, GStr& infname2) {
1408 + // uses outsuffix to generate output file names and open file handles as needed
1409 + infname="";
1410 + infname2="";
1411 + f_in=NULL;
1412 + f_in2=NULL;
1413 + f_out=NULL;
1414 + f_out2=NULL;
1415 + //analyze outsuffix intent
1416 + GStr pocmd;
1417 + if (outsuffix=="-") {
1418 +    f_out=stdout;
1419 +    }
1420 +   else {
1421 +    GStr ox=getFext(outsuffix);
1422 +    if (ox.length()>2) ox=ox.substr(0,2);
1423 +    if (ox=="gz") pocmd="gzip -9 -c ";
1424 +        else if (ox=="bz") pocmd="bzip2 -9 -c ";
1425 +    }
1426 + if (s=="-") {
1427 +    f_in=stdin;
1428 +    infname="stdin";
1429 +    f_out=prepOutFile(infname, pocmd);
1430 +    return;
1431 +    } // streaming from stdin
1432 + s.startTokenize(",:");
1433 + s.nextToken(infname);
1434 + bool paired=s.nextToken(infname2);
1435 + if (fileExists(infname.chars())==0)
1436 +    GError("Error: cannot find file %s!\n",infname.chars());
1437 + GStr fname(getFileName(infname.chars()));
1438 + GStr picmd;
1439 + guess_unzip(fname, picmd);
1440 + if (picmd.is_empty()) {
1441 +   f_in=fopen(infname.chars(), "r");
1442 +   if (f_in==NULL) GError("Error opening file '%s'!\n",infname.chars());
1443 +   }
1444 +  else {
1445 +   picmd.append(infname);
1446 +   f_in=popen(picmd.chars(), "r");
1447 +   if (f_in==NULL) GError("Error at popen %s!\n", picmd.chars());
1448 +   }
1449 + if (f_out==stdout) {
1450 +   if (paired) GError("Error: output suffix required for paired reads\n");
1451 +   return;
1452 +   }
1453 + f_out=prepOutFile(infname, pocmd);
1454 + if (!paired) return;
1455 + if (doCollapse) GError("Error: sorry, -C option cannot be used with paired reads!\n");
1456 + // ---- paired reads:-------------
1457 + if (fileExists(infname2.chars())==0)
1458 +     GError("Error: cannot find file %s!\n",infname2.chars());
1459 + picmd="";
1460 + GStr fname2(getFileName(infname2.chars()));
1461 + guess_unzip(fname2, picmd);
1462 + if (picmd.is_empty()) {
1463 +   f_in2=fopen(infname2.chars(), "r");
1464 +   if (f_in2==NULL) GError("Error opening file '%s'!\n",infname2.chars());
1465 +   }
1466 +  else {
1467 +   picmd.append(infname2);
1468 +   f_in2=popen(picmd.chars(), "r");
1469 +   if (f_in2==NULL) GError("Error at popen %s!\n", picmd.chars());
1470 +   }
1471 + f_out2=prepOutFile(infname2, pocmd);
1472 + }

Diff Legend

Removed lines
+ Added lines
< Changed lines
> Changed lines