ViewVC Help
View File | Revision Log | Show Annotations | View Changeset | Root Listing
root/gclib/fqtrim/fqtrim.cpp
(Generate patch)
# Line 2 | Line 2
2   #include "GStr.h"
3   #include "GHash.hh"
4   #include "GList.hh"
5 + #include <ctype.h>
6  
7   #define USAGE "Usage:\n\
8 < fqtrim [-5 <5'adapter>] [-3 <3'adapter>] [-l <minlen>] [-C] [-D] [-Q] \\\n\
9 <    [-n <rename_prefix>] [-o <trimmed.fq>] [-r <discarded.lst>] <input.fq>\n\
8 > fqtrim [{-5 <5adapter> -3 <3adapter>|-f <adapters_file>}] [-a <min_matchlen>]\\\n\
9 >   [-q <minq> [-t <trim_max_len>]] [-p {64|33}] [-o <outsuffix>]\\\n\
10 >   [-l <minlen>] [-C] [-D] [-Q] [-n <rename_prefix>] [-r <discarded.lst>]\\\n\
11 >    <input.fq>[,<input_mates.fq>\n\
12   \n\
13 < Trim low quality bases at 3' end, optional adapter, filter for low complexity and\n\
14 < optionally collapse duplicates in the input fastq data.\n\
13 > Trim low quality bases at the 3' end and can trim adapter sequence(s), filter\n\
14 > for low complexity and collapse duplicate reads.\n\
15 > If read pairs should be trimmed and kept together (i.e. without discarding\n\
16 > one read in a pair), the two file names should be given delimited by a comma\n\
17 > or a colon character\n\
18   \n\
19   Options:\n\
20   -n  rename all the reads using the <prefix> followed by a read counter;\n\
21      if -C option was given, the suffix \"_x<N>\" is appended, with <N> being\n\
22      the read duplication count\n\
23 < -o  write the trimmed/filtered fastq into <trimmed.fq>(instead of stdout)\n\
23 > -o  unless this parameter is '-', write the trimmed/filtered reads to \n\
24 >    file(s) named <input>.<outsuffix> which will be created in the \n\
25 >    current (working) directory; (writes to stdout if -o- is given);\n\
26 >    a suffix ending with .gz, .gzip or .bz2 will enforce compression\n\
27 > -f  file with adapter sequences to trim, each line having this format:\n\
28 >    <5'-adapter-sequence> <3'-adapter-sequence>\n\
29   -5  trim the given adapter or primer sequence at the 5' end of each read\n\
30      (e.g. -5 CGACAGGTTCAGAGTTCTACAGTCCGACGATC)\n\
31   -3  trim the given adapter sequence at the 3' end of each read\n\
32      (e.g. -3 TCGTATGCCGTCTTCTGCTTG)\n\
33 < -l  minimum \"clean\" length at the high quality 5'end that a read must\n\
34 <    have in order to pass the filter (default: 16)\n\
33 > -a  minimum length of exact match to adaptor sequence at the proper end (6)\n\
34 > -q  trim bases with quality value lower than <minq> (starting at the 3' end)\n\
35 > -t  for -q option, maximum trimming at the 3' end is limited to <trim_max_len>\n\
36 > -m  maximum percentage of Ns allowed in a read after trimming (default 7)\n\
37 > -l  minimum \"clean\" length after trimming that a read must have\n\
38 >    in order to pass the filter (default: 16)\n\
39   -r  write a simple \"trash report\" file listing the discarded reads with a\n\
40      one letter code for the reason why they were trashed\n\
41 + -D  apply a low-complexity (dust) filter and discard any read that has over\n\
42 +    50% of its length detected as low complexity\n\
43   -C  collapse duplicate reads and add a prefix with their count to the read\n\
44 <    name\n\
45 < -D  apply a low-complexity (dust) filter and discard any read that has at\n\
46 <    least half of it masked by this\n\
47 < -Q  quality values in the input data are interpreted as phred64, convert\n\
31 <    them to phred33\n\
44 >    name (e.g. for microRNAs)\n\
45 > -p  input is phred64/phred33 (use -p64 or -p33)\n\
46 > -Q  convert quality values to the other Phred qv type\n\
47 > -V  verbose processing\n\
48   "
49 +
50 + //-z  for -o option, the output stream(s) will be first piped into the given\n
51 + //   <zcmd> command, which must output to stdout (e.g. -z 'bzip2 -9 -c')\n
52 +
53 +
54   // example 3' adapter for miRNAs: TCGTATGCCGTCTTCTGCTTG
55 < FILE* f_out=NULL; //stdout if not provided
56 < FILE* f_in=NULL; //input fastq (stdin if not provided)
55 >
56 > //For paired reads sequencing:
57 > //3' : ACACTCTTTCCCTACACGACGCTCTTCCGATCT
58 > //5' : GATCGGAAGAGCGGTTCAGCAGGAATGCCGAG
59 > //FILE* f_out=NULL; //stdout if not provided
60 > //FILE* f_out2=NULL; //for paired reads
61 > //FILE* f_in=NULL; //input fastq (stdin if not provided)
62 > //FILE* f_in2=NULL; //for paired reads
63   FILE* freport=NULL;
64 +
65   bool debug=false;
66 + bool verbose=false;
67   bool doCollapse=false;
68   bool doDust=false;
69 + bool fastaOutput=false;
70   bool trashReport=false;
71 < bool rawFormat=false;
71 > //bool rawFormat=false;
72   int min_read_len=16;
73 + double max_perc_N=7.0;
74   int dust_cutoff=16;
75   bool isfasta=false;
76 < bool phred64=false;
76 > bool convert_phred=false;
77 > GStr outsuffix; // -o
78 > //GStr adapter3;
79 > //GStr adapter5;
80   GStr prefix;
81 < GStr adapter3;
82 < GStr adapter5;
81 > GStr zcmd;
82 > int num_trimmed5=0;
83 > int num_trimmed3=0;
84 > int num_trimmedN=0;
85 > int num_trimmedQ=0;
86 > int min_trimmed5=INT_MAX;
87 > int min_trimmed3=INT_MAX;
88 >
89 > int qvtrim_qmin=0;
90 > int qvtrim_max=0;  //for -q, do not trim at 3'-end more than this number of bases
91 > int qv_phredtype=0; // could be 64 or 33 (0 means undetermined yet)
92 > int qv_cvtadd=0; //could be -31 or +31
93 >
94   int a3len=0;
95   int a5len=0;
96   // adaptor matching metrics -- for extendMatch() function
97 < static const int a_m_score=2; //match score
98 < static const int a_mis_score=-3; //mismatch
99 < static const int a_dropoff_score=7;
100 < static const int a_min_score=7;
97 > const int match_score=2; //match score
98 > const int mismatch_score=-3; //mismatch
99 >
100 > const int a_m_score=2; //match score
101 > const int a_mis_score=-3; //mismatch
102 > const int a_dropoff_score=7;
103 > const int poly_dropoff_score=7;
104 > int a_min_score=12; //an exact match of 6 bases at the proper ends WILL be trimmed
105 > const int a_min_chain_score=15; //for gapped alignments
106 >
107 > class CSegChain;
108 >
109 > class CSegPair {
110 >  public:
111 >   GSeg a;
112 >   GSeg b; //the adapter segment
113 >   int score;
114 >   int flags;
115 >   CSegChain* chain;
116 >   CSegPair(int astart=0, int aend=0, int bstart=0, int bend=0, int mscore=0):a(astart,aend),b(bstart, bend) {
117 >      score=mscore;
118 >      if (score==0) score=a.len()*a_m_score;
119 >      flags=0;
120 >      chain=NULL;
121 >      }
122 >   int len() { return  a.len(); }
123 >   bool operator==(CSegPair& d){
124 >      //return (a.start==d.a.start && a.end==d.a.end && b.start==d.b.start && b.end==d.b.end);
125 >      //make equal even segments that are included into one another:
126 >      return (d.a.start>=a.start && d.a.end<=a.end && d.b.start>=b.start && d.b.end<=b.end);
127 >      }
128 >   bool operator>(CSegPair& d){ //ordering based on b (adaptor) start coord and score
129 >     if (b.start==d.b.start) {
130 >        if (score==d.score) {
131 >           //just try to be consistent:
132 >           if (b.end==d.b.end) {
133 >             return (a.start==d.a.start)?(a.end<d.a.end):(a.start<d.a.start);
134 >             }
135 >           return (b.end>d.b.end);
136 >           }
137 >         else return (score<d.score);
138 >        }
139 >     return (b.start>d.b.start);
140 >     }
141 >   bool operator<(CSegPair& d){ //ordering based on b (adaptor) coord
142 >     /*if (b.start==d.b.start && b.end==d.b.end) {
143 >          return (a.start==d.a.start)?(a.end<d.a.end):(a.start<d.a.start);
144 >          }
145 >     return (b.start==d.b.start)?(b.end<d.b.end):(b.start<d.b.start);*/
146 >     if (b.start==d.b.start) {
147 >        if (score==d.score) {
148 >           //just try to be consistent:
149 >           if (b.end==d.b.end) {
150 >             return (a.start==d.a.start)?(a.end>d.a.end):(a.start>d.a.start);
151 >             }
152 >           return (b.end<d.b.end);
153 >           }
154 >         else return (score>d.score);
155 >        }
156 >     return (b.start<d.b.start);
157 >     }
158 > };
159 >
160 > int cmpSegEnds(pointer sa, pointer sb) { //sort by adaptor seg ends AND score
161 > CSegPair& x = *(CSegPair *)sa;
162 > CSegPair& y = *(CSegPair *)sb;
163 > /*
164 > if (x.b.end==y.b.end) {
165 >     if (x.b.start==y.b.start) {
166 >         if (x.a.end==y.a.end) {
167 >            if (x.a.start==y.a.start) return 0;
168 >            return ((x.a.start>y.a.start) ? -1 : 1);
169 >            }
170 >          else {
171 >            return ((x.a.end>y.a.end) ? -1 : 1);
172 >            }
173 >          }
174 >      else {
175 >       return ((x.b.start>y.b.start) ? -1 : 1);
176 >       }
177 >     }
178 >    else {
179 >     return ((x.b.end>y.b.end) ? -1 : 1);
180 >     }
181 > */
182 >  if (x.b.end==y.b.end) {
183 >     if (x.score==y.score) {
184 >     if (x.b.start==y.b.start) {
185 >         if (x.a.end==y.a.end) {
186 >            if (x.a.start==y.a.start) return 0;
187 >            return ((x.a.start<y.a.start) ? -1 : 1);
188 >            }
189 >          else {
190 >            return ((x.a.end<y.a.end) ? -1 : 1);
191 >            }
192 >          }
193 >      else {
194 >       return ((x.b.start<y.b.start) ? -1 : 1);
195 >       }
196 >      } else return ((x.score>y.score) ? -1 : 1);
197 >     }
198 >    else {
199 >     return ((x.b.end>y.b.end) ? -1 : 1);
200 >     }
201 >
202 > }
203 >
204 > class CSegChain:public GList<CSegPair> {
205 > public:
206 >   uint astart;
207 >   uint aend;
208 >   uint bstart;
209 >   uint bend;
210 >   int score;
211 >   bool endSort;
212 >  CSegChain(bool aln5=false):GList<CSegPair>(true,true,true) {//sorted, free elements, unique
213 >   //as SegPairs are inserted, they will be sorted by a.start coordinate
214 >   score=0;
215 >   astart=MAX_UINT;
216 >   aend=0;
217 >   bstart=MAX_UINT;
218 >   bend=0;
219 >   endSort=aln5;
220 >   if (aln5) { setSorted(cmpSegEnds); }
221 >   }
222 > bool operator==(CSegChain& d) {
223 >   //return (score==d.score);
224 >    return (astart==d.astart && aend==d.aend && bstart==d.bstart && bend==d.bend);
225 >   }
226 > bool operator>(CSegChain& d) { // order based on b (adaptor) coordinate
227 >   //return (score<d.score);
228 >   if (bstart==d.bstart && bend==d.bend) {
229 >          return (astart==d.astart)?(aend>d.aend):(astart>d.astart);
230 >          }
231 >     return (bstart==d.bstart)?(bend>d.bend):(bstart>d.bstart);
232 >   }
233 > bool operator<(CSegChain& d) {
234 >   //return (score>d.score);
235 >   if (bstart==d.bstart && bend==d.bend) {
236 >          return (astart==d.astart)?(aend<d.aend):(astart<d.astart);
237 >          }
238 >     return (bstart==d.bstart)?(bend<d.bend):(bstart<d.bstart);
239 >   }
240 > void addSegPair(CSegPair* segp) {
241 >   if (AddIfNew(segp)!=segp) return;
242 >   score+=segp->score;
243 >   if (astart>segp->a.start) astart=segp->a.start;
244 >   if (aend<segp->a.end) aend=segp->a.end;
245 >   if (bstart>segp->b.start) bstart=segp->b.start;
246 >   if (bend<segp->b.end) bend=segp->b.end;
247 >   }
248 > //for building actual chains:
249 > bool extendChain(CSegPair* segp) { //segp expected to be "Greater Than" current chain
250 >   int bgap=0;
251 >   int agap=0;
252 >   //if (endSort) {
253 >   if (bstart>segp->b.start) {
254 >      bgap = (int)(bstart-segp->b.end);
255 >      if (abs(bgap)>2) return false;
256 >      agap = (int)(astart-segp->a.end);
257 >      if (abs(agap)>2) return false;
258 >      }
259 >     else {
260 >      bgap = (int) (segp->b.start-bend);
261 >      if (abs(bgap)>2) return false;
262 >      agap = (int)(segp->a.start-aend);
263 >      if (abs(agap)>2) return false;
264 >      }
265 >   if (agap*bgap<0) return false;
266 >   addSegPair(segp);
267 >   score-=abs(agap)+abs(bgap);
268 >   return true;
269 >   }
270 > };
271  
272   // element in dhash:
273   class FqDupRec {
# Line 91 | Line 306
306    if (!s.is_empty()) {
307        if (s=='-') f=stdout;
308        else {
309 <       f=fopen(s,"w");
309 >       f=fopen(s.chars(),"w");
310         if (f==NULL) GError("Error creating file: %s\n", s.chars());
311         }
312       }
# Line 102 | Line 317
317  
318   GHash<FqDupRec> dhash; //hash to keep track of duplicates
319  
320 < bool ntrim(GStr& rseq, GStr &rqv, int &l5, int &l3); //returns true if any trimming occured
320 > void setupFiles(FILE*& f_in, FILE*& f_in2, FILE*& f_out, FILE*& f_out2,
321 >                       GStr& s, GStr& infname, GStr& infname2);
322 > // uses outsuffix to generate output file names and open file handles as needed
323  
324 + void writeRead(FILE* f_out, GStr& rname, GStr& rinfo, GStr& rseq, GStr& rqv, int& outcounter);
325 + void trash_report(char trashcode, GStr& rname, FILE* freport);
326 +
327 + bool getFastxRec(GLineReader& fq, GStr& rseq, GStr& rqv,
328 +          GStr& rname, GStr& rinfo, GStr& infname);
329 +
330 + char process_read(GStr& rname, GStr& rseq, GStr& rqv, int &l5, int &l3);
331 + //returns 0 if the read was untouched, 1 if it was trimmed and a trash code if it was trashed
332 +
333 + bool ntrim(GStr& rseq, int &l5, int &l3); //returns true if any trimming occured
334 + bool polytrim(GStr& rseq, int &l5, int &l3); //returns true if any trimming occured
335 + bool qtrim(GStr& qvs, int &l5, int &l3); //return true if any trimming occured
336   int dust(GStr& seq);
337   bool trim_adapter3(GStr& seq, int &l5, int &l3); //returns true if any trimming occured
338   bool trim_adapter5(GStr& seq, int &l5, int &l3); //returns true if any trimming occured
339  
340 < void convertQ64(char* q, int len);
341 < void convertQ64(GStr& q);
340 > void convertPhred(char* q, int len);
341 > void convertPhred(GStr& q);
342  
343   int main(int argc, char * const argv[]) {
344 <  GArgs args(argc, argv, "YQRDCl:d:3:5:n:r:o:");
344 >  GArgs args(argc, argv, "YQDCVl:d:3:5:m:n:r:p:q:f:t:o:z:a:");
345    int e;
117  int icounter=0; //counter for input reads
118  int outcounter=0; //counter for output reads
346    if ((e=args.isError())>0) {
347        GMessage("%s\nInvalid argument: %s\n", USAGE, argv[e]);
348        exit(224);
349        }
350    debug=(args.getOpt('Y')!=NULL);
351 <  phred64=(args.getOpt('Q')!=NULL);
351 >  verbose=(args.getOpt('V')!=NULL);
352 >  convert_phred=(args.getOpt('Q')!=NULL);
353    doCollapse=(args.getOpt('C')!=NULL);
354    doDust=(args.getOpt('D')!=NULL);
355 +  /*
356    rawFormat=(args.getOpt('R')!=NULL);
357    if (rawFormat) {
358      GError("Sorry, raw qseq format parsing is not implemented yet!\n");
359      }
360 +  */
361    prefix=args.getOpt('n');
362    GStr s=args.getOpt('l');
363    if (!s.is_empty())
364       min_read_len=s.asInt();
365 +  s=args.getOpt('m');
366 +  if (!s.is_empty())
367 +     max_perc_N=s.asDouble();
368    s=args.getOpt('d');
369    if (!s.is_empty()) {
370       dust_cutoff=s.asInt();
371       doDust=true;
372       }
373 <    
373 >  s=args.getOpt('q');
374 >  if (!s.is_empty()) {
375 >     qvtrim_qmin=s.asInt();
376 >     }
377 >  s=args.getOpt('t');
378 >  if (!s.is_empty()) {
379 >     qvtrim_max=s.asInt();
380 >     }
381 >  s=args.getOpt('p');
382 >  if (!s.is_empty()) {
383 >     int v=s.asInt();
384 >     if (v==33) {
385 >        qv_phredtype=33;
386 >        qv_cvtadd=31;
387 >        }
388 >      else if (v==64) {
389 >        qv_phredtype=64;
390 >        qv_cvtadd=-31;
391 >        }
392 >       else
393 >         GMessage("%s\nInvalid value for -p option (can only be 64 or 33)!\n",USAGE);
394 >     }
395    if (args.getOpt('3')!=NULL) {
396      adapter3=args.getOpt('3');
397      adapter3.upper();
# Line 148 | Line 402
402      adapter5.upper();
403      a5len=adapter5.length();
404      }
405 +  s=args.getOpt('a');
406 +  if (!s.is_empty()) {
407 +     int a_minmatch=s.asInt();
408 +     a_min_score=a_minmatch<<1;
409 +     }
410 +  
411 +  if (args.getOpt('o')!=NULL) outsuffix=args.getOpt('o');
412 +                         else outsuffix="-";
413    trashReport=  (args.getOpt('r')!=NULL);
414 <  if (args.startNonOpt()==0) {
414 >  int fcount=args.startNonOpt();
415 >  if (fcount==0) {
416      GMessage(USAGE);
417      exit(224);
418      }
419 <
420 <  openfw(f_out, args, 'o');
421 <  if (f_out==NULL) f_out=stdout;
419 >   if (fcount>1 && doCollapse) {
420 >    GError("%s Sorry, the -C option only works with a single input.\n", USAGE);
421 >    }
422 >  //openfw(f_out, args, 'o');
423 >  //if (f_out==NULL) f_out=stdout;
424    if (trashReport)
425      openfw(freport, args, 'r');
426    char* infile=NULL;
427    while ((infile=args.nextNonOpt())!=NULL) {
428 <    GStr infname(infile);
429 <    if (strcmp(infile,"-")==0) {
430 <       f_in=stdin; infname="stdin"; }
431 <     else {
432 <        f_in=fopen(infile,"r");
433 <        if (f_in==NULL)
434 <            GError("Cannot open input file %s!\n",infile);
435 <        }
436 <     GLineReader fq(f_in);
437 <     char* l=NULL;
438 <     while ((l=fq.getLine())!=NULL) {
439 <        GStr rname; //current read name
440 <        GStr rseq;  //current read sequence
441 <        GStr rqv;   //current read quality values
442 <        GStr s;
443 <        if (rawFormat) {
444 <          //TODO: implement qseq parsing here
445 <          //if (raw type=N) then continue; //skip invalid/bad records
446 <          
447 <          } //raw qseq format
448 <         else { // FASTQ or FASTA
449 <          isfasta=(l[0]=='>');
450 <          if (!isfasta && l[0]!='@') GError("Error: fastq record marker not detected!\n");
451 <          s=l;
452 <          rname=&(l[1]);
453 <          icounter++;
454 <          //GMessage("readname=%s\n",rname.chars());
455 <          for (int i=0;i<rname.length();i++)
456 <            if (rname[i]<=' ') { rname.cut(i); break; }
457 <          //now get the sequence
458 <          if ((l=fq.getLine())==NULL)
459 <              GError("Error: unexpected EOF after header for %s\n",rname.chars());
460 <          rseq=l; //this must be the DNA line
461 <          if (isfasta) {
462 <            while ((l=fq.getLine())!=NULL) {
463 <              //fasta can have multiple sequence lines
464 <              if (l[0]=='>') {
465 <                   fq.pushBack();
466 <                   break; //
428 >    int incounter=0; //counter for input reads
429 >    int outcounter=0; //counter for output reads
430 >    int trash_s=0; //too short from the get go
431 >    int trash_Q=0;
432 >    int trash_N=0;
433 >    int trash_D=0;
434 >    int trash_A3=0;
435 >    int trash_A5=0;
436 >    s=infile;
437 >    GStr infname;
438 >    GStr infname2;
439 >    FILE* f_in=NULL;
440 >    FILE* f_in2=NULL;
441 >    FILE* f_out=NULL;
442 >    FILE* f_out2=NULL;
443 >    bool paired_reads=false;
444 >    setupFiles(f_in, f_in2, f_out, f_out2, s, infname, infname2);
445 >    GLineReader fq(f_in);
446 >    GLineReader* fq2=NULL;
447 >    if (f_in2!=NULL) {
448 >       fq2=new GLineReader(f_in2);
449 >       paired_reads=true;
450 >       }
451 >    GStr rseq, rqv, seqid, seqinfo;
452 >    GStr rseq2, rqv2, seqid2, seqinfo2;
453 >    while (getFastxRec(fq, rseq, rqv, seqid, seqinfo, infname)) {
454 >       incounter++;
455 >       int a5=0, a3=0, b5=0, b3=0;
456 >       char tcode=0, tcode2=0;
457 >       tcode=process_read(seqid, rseq, rqv, a5, a3);
458 >       //if (!doCollapse) {
459 >         if (fq2!=NULL) {
460 >            getFastxRec(*fq2, rseq2, rqv2, seqid2, seqinfo2, infname2);
461 >            if (seqid.substr(0,seqid.length()-1)!=seqid2.substr(0,seqid2.length()-1)) {
462 >               GError("Error: no paired match for read %s vs %s (%s,%s)\n",
463 >                  seqid.chars(), seqid2.chars(), infname.chars(), infname2.chars());
464 >               }
465 >            tcode2=process_read(seqid2, rseq2, rqv2, b5, b3);
466 >            //decide what to do with this pair and print rseq2 if the pair makes it
467 >            if (tcode>1 && tcode2<=1) {
468 >               //"untrash" rseq
469 >               if (a3-a5+1<min_read_len) {
470 >                   a5=1;
471 >                   if (a3<min_read_len) { a3= GMIN(rseq.length()-1, min_read_len+1); }
472                     }
473 <              rseq+=l;
474 <              }
475 <            } //multi-line fasta file
476 <          if (!isfasta) {
477 <            if ((l=fq.getLine())==NULL)
478 <                GError("Error: unexpected EOF after sequence for %s\n", rname.chars());
479 <            if (l[0]!='+') GError("Error: fastq qv header marker not detected!\n");
480 <            if ((l=fq.getLine())==NULL)
481 <                GError("Error: unexpected EOF after qv header for %s\n", rname.chars());
482 <            rqv=l;
483 <            if (rqv.length()!=rseq.length())
484 <                GError("Error: qv len != seq len for %s\n", rname.chars());
485 <            }
486 <        } //<-- FASTA or FASTQ
217 <        rseq.upper();
218 <        int l5=0;
219 <        int l3=rseq.length()-1;
220 <        if (l3-l5+1<min_read_len) {
221 <           if (trashReport) {
222 <                  fprintf(freport, "%s\ts\t%s\n",rname.chars(), rseq.chars());
223 <                  }
224 <           continue;
225 <           }
226 <        if (ntrim(rseq, rqv, l5, l3)) {
227 <           if (l3-l5+1<min_read_len) {
228 <             if (trashReport) {
229 <                    fprintf(freport, "%s\tN\t%s\n", rname.chars(), rseq.chars());
230 <                    }
231 <             continue; //invalid read
232 <             }
233 <            //-- keep only the l5..l3 range
234 <           rseq=rseq.substr(l5, l3-l5+1);
235 <           if (!rqv.is_empty())
236 <              rqv=rqv.substr(l5, l3-l5+1);
237 <           }
238 <          
239 <        if (a3len>0) {
240 <          if (trim_adapter3(rseq, l5, l3)) {
241 <             if (l3-l5+1<min_read_len) {
242 <                 if (trashReport) {
243 <                     fprintf(freport, "%s\tA3\t%s\n",rname.chars(), rseq.chars());
244 <                     }
245 <                 continue;
246 <                 }
247 <              //-- keep only the l5..l3 range
248 <              rseq=rseq.substr(l5, l3-l5+1);
249 <              if (!rqv.is_empty())
250 <                 rqv=rqv.substr(l5, l3-l5+1);
251 <              }//some adapter was trimmed
252 <           } //adapter trimming
253 <        if (a5len>0) {
254 <          if (trim_adapter5(rseq, l5, l3)) {
255 <             if (l3-l5+1<min_read_len) {
256 <                 if (trashReport) {
257 <                     fprintf(freport, "%s\tA5\t%s\n",rname.chars(), rseq.chars());
258 <                     }
259 <                 continue;
473 >               tcode=1;
474 >               }
475 >             else if (tcode<=1 && tcode2>1) {
476 >               //"untrash" rseq2
477 >               if (b3-b5+1<min_read_len) {
478 >                   b5=1;
479 >                   if (b3<min_read_len) { b3= GMIN((rseq2.length()-1),(min_read_len+1)); }
480 >                   }
481 >               tcode2=1;
482 >               }
483 >            if (tcode<=1) { //trimmed or left intact -- write it!
484 >               if (tcode>0) {
485 >                 rseq2=rseq2.substr(b5,b3-b5+1);
486 >                 if (!rqv2.is_empty()) rqv2=rqv2.substr(b5,b3-b5+1);
487                   }
488 <              //-- keep only the l5..l3 range
489 <              rseq=rseq.substr(l5, l3-l5+1);
490 <              if (!rqv.is_empty())
491 <                 rqv=rqv.substr(l5, l3-l5+1);
492 <              }//some adapter was trimmed
493 <           } //adapter trimming
494 <        if (doCollapse) {
495 <           //keep read for later
496 <           FqDupRec* dr=dhash.Find(rseq.chars());
497 <           if (dr==NULL) { //new entry
498 <                  //if (prefix.is_empty())
499 <                     dhash.Add(rseq.chars(),
500 <                          new FqDupRec(&rqv, rname.chars()));
501 <                  //else dhash.Add(rseq.chars(), new FqDupRec(rqv.chars(),rqv.length()));
488 >               int nocounter=0;
489 >               writeRead(f_out2, seqid2, seqinfo2, rseq2, rqv2, nocounter);
490 >               }
491 >            } //paired read
492 >       // }
493 >       if (tcode>1) { //trashed
494 >          if (tcode=='s') trash_s++;
495 >            else if (tcode=='Q') trash_Q++;
496 >              else if (tcode=='N') trash_N++;
497 >               else if (tcode=='D') trash_D++;
498 >                else if (tcode=='3') trash_A3++;
499 >                 else if (tcode=='5') trash_A5++;
500 >          if (trashReport) trash_report(tcode, seqid, freport);
501 >          }
502 >         else if (!doCollapse) { //write it
503 >          if (tcode>0) {
504 >            rseq=rseq.substr(a5,a3-a5+1);
505 >            if (!rqv.is_empty()) rqv=rqv.substr(a5,a3-a5+1);
506 >            }
507 >          writeRead(f_out, seqid, seqinfo, rseq, rqv, outcounter);
508 >          }
509 >       } //for each fastq record
510 >    delete fq2;
511 >    FRCLOSE(f_in);
512 >    FRCLOSE(f_in2);
513 >    if (doCollapse) {
514 >       outcounter=0;
515 >       int maxdup_count=1;
516 >       char* maxdup_seq=NULL;
517 >       dhash.startIterate();
518 >       FqDupRec* qd=NULL;
519 >       char* seq=NULL;
520 >       while ((qd=dhash.NextData(seq))!=NULL) {
521 >         GStr rseq(seq);
522 >         //do the dusting here
523 >         if (doDust) {
524 >            int dustbases=dust(rseq);
525 >            if (dustbases>(rseq.length()>>1)) {
526 >               if (trashReport && qd->firstname!=NULL) {
527 >                 fprintf(freport, "%s_x%d\tD\n",qd->firstname, qd->count);
528                   }
529 <              else    
530 <                 dr->add(rqv);
531 <           } //collapsing duplicates
532 <         else { //not collapsing duplicates
533 <           //do the dust filter now
534 <           if (doDust) {
535 <             int dustbases=dust(rseq);
536 <             if (dustbases>(rseq.length()>>1)) {
537 <                if (trashReport) {
538 <                  fprintf(freport, "%s\tD\t%s\n",rname.chars(),rseq.chars());
539 <                  }
540 <                continue;
541 <                }
289 <             }
290 <           //print this record here  
291 <           outcounter++;
292 <           if (isfasta) {
293 <            if (prefix.is_empty())
294 <               fprintf(f_out, ">%s\n%s\n", rname.chars(), rseq.chars());
295 <              else
296 <               fprintf(f_out, ">%s%08d\n%s\n", prefix.chars(), outcounter,
297 <                                  rseq.chars());
529 >               trash_D+=qd->count;
530 >               continue;
531 >               }
532 >            }
533 >         outcounter++;
534 >         if (qd->count>maxdup_count) {
535 >            maxdup_count=qd->count;
536 >            maxdup_seq=seq;
537 >            }
538 >         if (isfasta) {
539 >           if (prefix.is_empty()) {
540 >             fprintf(f_out, ">%s_x%d\n%s\n", qd->firstname, qd->count,
541 >                           rseq.chars());
542               }
543 <           else {  //fastq
544 <            if (phred64) convertQ64(rqv);
545 <            if (prefix.is_empty())
302 <               fprintf(f_out, "@%s\n%s\n+\n%s\n", rname.chars(), rseq.chars(),rqv.chars());
303 <              else
304 <               fprintf(f_out, "@%s_%08d\n%s\n+\n%s\n", prefix.chars(), outcounter,
305 <                                  rseq.chars(),rqv.chars() );
543 >           else { //use custom read name
544 >             fprintf(f_out, ">%s%08d_x%d\n%s\n", prefix.chars(), outcounter,
545 >                        qd->count, rseq.chars());
546               }
547 <           } //not collapsing duplicates
548 <        } //for each fastq record
549 <   } //while each input file
550 < FRCLOSE(f_in);
551 < if (doCollapse) {
552 <    outcounter=0;
313 <    int maxdup_count=1;
314 <    char* maxdup_seq=NULL;
315 <    dhash.startIterate();
316 <    FqDupRec* qd=NULL;
317 <    char* seq=NULL;
318 <    while ((qd=dhash.NextData(seq))!=NULL) {
319 <      GStr rseq(seq);
320 <      //do the dusting here
321 <      if (doDust) {
322 <         int dustbases=dust(rseq);
323 <         if (dustbases>(rseq.length()>>1)) {
324 <            if (trashReport && qd->firstname!=NULL) {
325 <              fprintf(freport, "%s_x%d\tD\t%s\n",qd->firstname, qd->count,seq);
326 <              }
327 <            continue;
547 >           }
548 >         else { //fastq format
549 >          if (convert_phred) convertPhred(qd->qv, qd->len);
550 >          if (prefix.is_empty()) {
551 >            fprintf(f_out, "@%s_x%d\n%s\n+\n%s\n", qd->firstname, qd->count,
552 >                           rseq.chars(), qd->qv);
553              }
554 +          else { //use custom read name
555 +            fprintf(f_out, "@%s%08d_x%d\n%s\n+\n%s\n", prefix.chars(), outcounter,
556 +                        qd->count, rseq.chars(), qd->qv);
557 +            }
558 +           }
559 +         }//for each element of dhash
560 +       if (maxdup_count>1) {
561 +         GMessage("Maximum read multiplicity: x %d (read: %s)\n",maxdup_count, maxdup_seq);
562           }
563 <      outcounter++;
564 <      if (qd->count>maxdup_count) {
565 <         maxdup_count=qd->count;
566 <         maxdup_seq=seq;
567 <         }
568 <      if (isfasta) {
569 <        if (prefix.is_empty()) {
570 <          fprintf(f_out, ">%s_x%d\n%s\n", qd->firstname, qd->count,
571 <                        rseq.chars());
572 <          }
573 <        else { //use custom read name
574 <          fprintf(f_out, ">%s%08d_x%d\n%s\n", prefix.chars(), outcounter,
575 <                     qd->count, rseq.chars());
563 >       } //collapse entries
564 >    if (verbose) {
565 >       if (paired_reads) {
566 >           GMessage(">Input files : %s , %s\n", infname.chars(), infname2.chars());
567 >           GMessage("Number of input pairs :%9d\n", incounter);
568 >           GMessage("         Output pairs :%9d\n", outcounter);
569 >           }
570 >         else {
571 >           GMessage(">Input file : %s\n", infname.chars());
572 >           GMessage("Number of input reads :%9d\n", incounter);
573 >           GMessage("         Output reads :%9d\n", outcounter);
574 >           }
575 >       GMessage("------------------------------------\n");
576 >       if (num_trimmed5)
577 >          GMessage("           5' trimmed :%9d  (min. trim: %d)\n", num_trimmed5, min_trimmed5);
578 >       if (num_trimmed3)
579 >          GMessage("           3' trimmed :%9d  (min. trim: %d)\n", num_trimmed3, min_trimmed3);
580 >       GMessage("------------------------------------\n");
581 >       if (trash_s>0)
582 >         GMessage("     Trashed by length:%9d\n", trash_s);
583 >       if (trash_N>0)
584 >         GMessage("         Trashed by N%%:%9d\n", trash_N);
585 >       if (trash_Q>0)
586 >         GMessage("Trashed by low quality:%9d\n", trash_Q);
587 >       if (trash_A5>0)
588 >         GMessage(" Trashed by 5' adapter:%9d\n", trash_A5);
589 >       if (trash_A3>0)
590 >         GMessage(" Trashed by 3' adapter:%9d\n", trash_A3);
591 >       }
592 >    if (trashReport) {
593 >          FWCLOSE(freport);
594            }
595 <        }
596 <      else { //fastq format
597 <       if (phred64) convertQ64(qd->qv, qd->len);
347 <       if (prefix.is_empty()) {
348 <         fprintf(f_out, "@%s_x%d\n%s\n+\n%s\n", qd->firstname, qd->count,
349 <                        rseq.chars(), qd->qv);
350 <         }
351 <       else { //use custom read name
352 <         fprintf(f_out, "@%s%08d_x%d\n%s\n+\n%s\n", prefix.chars(), outcounter,
353 <                     qd->count, rseq.chars(), qd->qv);
354 <         }
355 <        }
356 <      }//for each element of dhash
357 <    if (maxdup_count>1) {
358 <      GMessage("Maximum read multiplicity: x %d (read: %s)\n",maxdup_count, maxdup_seq);
359 <      }
360 <   } //report collapsed dhash entries
361 < GMessage("Number of input reads: %9d\n", icounter);
362 < GMessage("       Output records: %9d\n", outcounter);
363 < if (trashReport) {
364 <    FWCLOSE(freport);
365 <    }
595 >    FWCLOSE(f_out);
596 >    FWCLOSE(f_out2);
597 >   } //while each input file
598  
599 < FWCLOSE(f_out);
599 > //getc(stdin);
600   }
601  
602   class NData {
# Line 393 | Line 625
625       end5=1;
626       end3=seqlen;
627       for (int i=0;i<seqlen;i++)
628 <        if (rseq[i]=='N') {// if (!ichrInStr(rseq[i], "ACGT")
628 >        if (seq[i]=='N') {// if (!ichrInStr(rseq[i], "ACGT")
629             NPos[NCount]=i;
630             NCount++;
631             }
# Line 416 | Line 648
648   void N_analyze(int l5, int l3, int p5, int p3) {
649   /* assumes feat was filled properly */
650   int old_dif, t5,t3,v;
651 < if (l3-l5<min_read_len || p5>p3 ) {
651 > if (l3<l5+2 || p5>p3 ) {
652     feat.end5=l5+1;
653     feat.end3=l3+1;
654     return;
# Line 448 | Line 680
680   }
681  
682  
683 < bool ntrim(GStr& rseq, GStr &rqv, int &l5, int &l3) {
684 < //count Ns in the sequence
683 > bool qtrim(GStr& qvs, int &l5, int &l3) {
684 > if (qvtrim_qmin==0 || qvs.is_empty()) return false;
685 > if (qv_phredtype==0) {
686 >  //try to guess the Phred type
687 >  int vmin=256, vmax=0;
688 >  for (int i=0;i<qvs.length();i++) {
689 >     if (vmin>qvs[i]) vmin=qvs[i];
690 >     if (vmax<qvs[i]) vmax=qvs[i];
691 >     }
692 >  if (vmin<64) { qv_phredtype=33; qv_cvtadd=31; }
693 >  if (vmax>95) { qv_phredtype=64; qv_cvtadd=-31; }
694 >  if (qv_phredtype==0) {
695 >    GError("Error: couldn't determine Phred type, please use the -p33 or -p64 !\n");
696 >    }
697 >  if (verbose)
698 >    GMessage("Input reads have Phred-%d quality values.\n", (qv_phredtype==33 ? 33 : 64));
699 >  } //guessing Phred type
700 > for (l3=qvs.length()-1;l3>2;l3--) {
701 >  if (qvs[l3]-qv_phredtype>=qvtrim_qmin && qvs[l3-1]-qv_phredtype>=qvtrim_qmin) break;
702 >  }
703 > //just in case, check also the 5' the end (?)
704 > for (l5=0;l5<qvs.length()-3;l5++) {
705 >  if (qvs[l5]-qv_phredtype>=qvtrim_qmin && qvs[l5+1]-qv_phredtype>=qvtrim_qmin) break;
706 >  }
707 > if (qvtrim_max>0) {
708 >  if (qvs.length()-1-l3>qvtrim_max) l3=qvs.length()-1-qvtrim_max;
709 >  if (l5>qvtrim_max) l5=qvtrim_max;
710 >  }
711 > return (l5>0 || l3<qvs.length()-1);
712 > }
713 >
714 > bool ntrim(GStr& rseq, int &l5, int &l3) {
715 > //count Ns in the sequence, trim N-rich ends
716   feat.init(rseq);
717   l5=feat.end5-1;
718   l3=feat.end3-1;
719   N_analyze(feat.end5-1, feat.end3-1, 0, feat.NCount-1);
720 < if (l5==feat.end5-1 && l3==feat.end3-1)
721 <    return false; //nothing changed
720 > if (l5==feat.end5-1 && l3==feat.end3-1) {
721 >    if (feat.perc_N>max_perc_N) {
722 >           feat.valid=false;
723 >           l3=l5+1;
724 >           return true;
725 >           }
726 >      else {
727 >       return false; //nothing changed
728 >       }
729 >    }
730   l5=feat.end5-1;
731   l3=feat.end3-1;
732   if (l3-l5+1<min_read_len) {
# Line 463 | Line 734
734     return true;
735     }
736   feat.N_calc();
737 < if (feat.perc_N>6.2) { //not valid if more than 1 N per 16 bases
737 >
738 > if (feat.perc_N>max_perc_N) {
739        feat.valid=false;
740        l3=l5+1;
741        return true;
# Line 589 | Line 861
861   return ncount;
862   }
863  
864 < // ------------------ adapter matching - triplet matching
865 < //look for matching triplets amongst the first 9 nucleotides of the 3' adaptor
866 < // or the last 9 nucleotides for the 5' adapter
867 < //when a triplet match is found, simply try to extend the alignment using a drop-off scheme
868 < // check minimum score and
597 < // for 3' adapter trimming:
864 >
865 > // ------------------ adapter matching - simple k-mer seed & extend, no indels for now
866 > //when a k-mer match is found, simply try to extend the alignment using a drop-off scheme
867 > //check minimum score and
868 > //for 3' adapter trimming:
869   //     require that the right end of the alignment for either the adaptor OR the read must be
870   //     < 3 distance from its right end
871   // for 5' adapter trimming:
872   //     require that the left end of the alignment for either the adaptor OR the read must
873 < //     be at coordinate 0
873 > //     be at coordinate < 3 from start
874  
875   bool extendMatch(const char* a, int alen, int ai,
876 <                 const char* b, int blen, int bi, int mlen, int& l5, int& l3, bool end5=false) {
876 >                 const char* b, int blen, int bi, int mlen, int& l5, int& l3, CSegChain& segs, bool end5=false) {
877   //so the alignment starts at ai in a, bi in b, with a perfect match of length mlen
878 < //if (debug)
879 < //  GStr dbg(b);
880 < //GMessage(">> in %s\n\textending hit: %s at position %d\n", a, (dbg.substr(ai, mlen)).chars(), ai);
878 > #ifdef DEBUG
879 > GStr dbg(b);
880 > #endif
881 > //if (debug) {
882 > //  GMessage(">> in %s\n\textending hit: %s at position %d\n", a, (dbg.substr(bi, mlen)).chars(), ai);
883 > //  }
884   int a_l=ai; //alignment coordinates on a
885   int a_r=ai+mlen-1;
886   int b_l=bi; //alignment coordinates on b
# Line 670 | Line 944
944    int mmovh3=(a_ovh3<b_ovh3)? a_ovh3 : b_ovh3;
945    int mmovh5=(a_l<b_l)? a_l : b_l;
946    //if (debug) GMessage("  after r-extend: %*s%s\t\t(score=%d)\n",a_l," ",dbg.substr(b_l,b_r-b_l+1).chars(),maxscore);
947 + #ifdef DEBUG
948 +  if (debug) GMessage("     extended to: %*s\n",a_r+1,dbg.substr(b_l,b_r-b_l+1).chars());
949 + #endif
950    if (maxscore>=a_min_score && mmovh3<2 && mmovh5<2) {
951       if (a_l<a_ovh3) {
952          //adapter closer to the left end (typical for 5' adapter)
# Line 683 | Line 960
960          }
961       return true;
962       }
963 + else { //keep this segment pair for later (gapped alignment)
964 +   segs.addSegPair(new CSegPair(a_l+1, a_r+1, b_l+1, b_r+1, maxscore));
965 +   //this will also update min & max coordinates in segs (segs.astart, .aend, .bstart, .bend)
966 +   }
967 +  //do not trim:
968 +  l5=0;
969 +  l3=alen-1;
970    return false;
971   }
972  
973 + /*
974 + int getWordValue(const char* s, int wlen) {
975 + int r=0;
976 + while (wlen--) { r+=(((int)s[wlen])<<wlen) }
977 + return r;
978 + }
979 + */
980 + int get3mer_value(const char* s) {
981 + return (s[0]<<16)+(s[1]<<8)+s[2];
982 + }
983 +
984 + int w3_match(int qv, const char* str, int slen, int start_index=0) {
985 + if (start_index>=slen || start_index<0) return -1;
986 + for (int i=start_index;i<slen-3;i++) {
987 +   int rv=get3mer_value(str+i);
988 +   if (rv==qv) return i;
989 +   }
990 + return -1;
991 + }
992 +
993 + int w3_rmatch(int qv, const char* str, int slen, int end_index=-1) {
994 + if (end_index>=slen) return -1;
995 + if (end_index<0) end_index=slen-1;
996 + for (int i=end_index-2;i>=0;i--) {
997 +   int rv=get3mer_value(str+i);
998 +   if (rv==qv) return i;
999 +   }
1000 + return -1;
1001 + }
1002 +
1003 + int fast4match(int32 qv, const char* str, int slen, int start_index=0) {
1004 + if (start_index>=slen || start_index<0) return -1;
1005 + for (int i=start_index;i<slen-4;i++) {
1006 +   int32* rv=(int32*)(str+i);
1007 +   if (*rv==qv) return i;
1008 +   }
1009 + return -1;
1010 + }
1011 +
1012 + int fast4rmatch(int32 qv, const char* str, int slen, int end_index=-1) {
1013 + if (end_index>=slen) return -1;
1014 + if (end_index<0) end_index=slen-1;
1015 + for (int i=end_index-3;i>=0;i--) {
1016 +   int32* rv=(int32*)(str+i);
1017 +   if (*rv==qv) return i;
1018 +   }
1019 + return -1;
1020 + }
1021 +
1022 + #ifdef DEBUG
1023 + void dbgPrintChain(CSegChain& chain, const char* aseq) {
1024 +  GStr s(aseq);
1025 +  for (int i=0;i<chain.Count();i++) {
1026 +   CSegPair& seg=*chain[i];
1027 +   GMessage("  dbg chain seg%d: %*s [%d-%d:%d-%d]\n",i,seg.a.start-1+seg.len(),
1028 +            s.substr(seg.b.start-1, seg.len()).chars(), seg.b.start,seg.b.end,seg.a.start,seg.a.end);
1029 +   }
1030 + }
1031 + #endif
1032 + const char *pA_seed="AAAA";
1033 + const char *pT_seed="TTTT";
1034 + const int polyA_seed=*(int*)pA_seed;
1035 + const int polyT_seed=*(int*)pT_seed;
1036 +
1037 + bool trim_poly3(GStr &seq, int &l5, int &l3) {
1038 + int rlen=seq.length();
1039 + l5=0;
1040 + l3=rlen-1;
1041 + //assumes N trimming was already done
1042 + //so a poly match should be very close to the end of the read
1043 + //TODO: should we attempt polyA and polyT at the same time, same end?
1044 + // -- find the initial match (seed)
1045 + int rlo=GMAX((rlen-10), 0);
1046 + int si;
1047 + for (si=rlen-5;si>rlo;si--) {
1048 +   if (polyA_seed==*(int*)&(seq.chars()+si)) {
1049 +      break;
1050 +      }
1051 +   }
1052 + if (si<=rlo) return false;
1053 + //seed found, try to extend it to left and right
1054 + int score=4*match_score;
1055 + max_rlo=rlo;
1056 + rlo--;
1057 + while (rlo>=0) {
1058 +    rlo--;
1059 +    score+=
1060 +    
1061 +    }
1062 + }
1063 +
1064   bool trim_adapter3(GStr& seq, int&l5, int &l3) {
1065   int rlen=seq.length();
1066   l5=0;
# Line 699 | Line 1074
1074        }
1075      else {
1076        l5=0;
1077 <      l3=fi-1;
1077 >      l3=fi-1;
1078        }
1079     return true;
1080     }
1081 + #ifdef DEBUG
1082 + if (debug) GMessage(">TRIM3 >>   Read: %s\n",seq.chars());
1083 + #endif
1084 +
1085   //also, for fast detection of other adapter-only reads that start past
1086   // the beginning of the adapter sequence, try to see if the first a3len-4
1087 < // characters of the read are a substring of the adapter
1088 < GStr rstart=seq.substr(0,a3len-4);
1089 < if ((fi=adapter3.index(rstart))>=0) {
1090 <   l3=rlen-1;
1091 <   l5=a3len-4;
1092 <   while (fi+l5<a3len && l5<l3 && adapter3[fi+l5]==seq[l5]) l5++;
1093 <   return true;
1087 > // bases of the read are a substring of the adapter
1088 > if (rlen>a3len-3) {
1089 >   GStr rstart=seq.substr(1,a3len-4);
1090 >   if ((fi=adapter3.index(rstart))>=0) {
1091 >     l3=rlen-1;
1092 >     l5=a3len-4;
1093 >     while (fi+l5<a3len && l5<l3 && adapter3[fi+l5]==seq[l5]) l5++;
1094 >     return true;
1095 >     }
1096 >  }
1097 > CSegChain a3segs; //no chains here, just an ordered collection of segment pairs
1098 >  //check the easy cases - 11 bases exact match at the end
1099 > int fdlen=11;
1100 >  if (a3len<16) {
1101 >   fdlen=a3len>>1;
1102     }
1103 <  
1104 < //another easy case: first half of the adaptor matches
1105 < int a3hlen=a3len>>1;
1106 < GStr ahalf=adapter3.substr(0, a3hlen);
1107 < if ((fi=seq.index(ahalf))>=0) {
1108 <   extendMatch(seq.chars(), rlen, fi,
1109 <                 adapter3.chars(), a3len, 0,  a3hlen, l5,l3);
1110 <   return true;
1103 > if (fdlen>4) {
1104 >     //check if we're lucky enough to have the last 11 bases of the read a part of the adapter
1105 >     GStr rstart=seq.substr(-fdlen-3,fdlen);
1106 >     if ((fi=adapter3.index(rstart))>=0) {
1107 > #ifdef DEBUG
1108 >       if (debug) GMessage("  W11match found: %*s\n", rlen-3, (adapter3.substr(fi,fdlen)).chars());
1109 > #endif
1110 >       if (extendMatch(seq.chars(), rlen, rlen-fdlen-3,
1111 >                     adapter3.chars(), a3len, fi,  fdlen, l5,l3, a3segs))
1112 >            return true;
1113 >       }
1114 >     //another easy case: first 11 characters of the adaptor found as a substring of the read
1115 >     GStr bstr=adapter3.substr(0, fdlen);
1116 >     if ((fi=seq.rindex(bstr))>=0) {
1117 > #ifdef DEBUG
1118 >       if (debug) GMessage("  A11match found: %*s\n", fi+fdlen, bstr.chars());
1119 > #endif
1120 >       if (extendMatch(seq.chars(), rlen, fi,
1121 >                     adapter3.chars(), a3len, 0,  fdlen, l5,l3, a3segs))
1122 >            return true;
1123 >       }
1124 >     } //tried to match 11 bases first
1125 >    
1126 > //no easy cases, so let's do the wmer hashing for the first 12 bases of the adaptor
1127 > //-- only extend if the match is in the 3' (ending) region of the read
1128 > int wordSize=3;
1129 > int hlen=12;
1130 > if (hlen>a3len-wordSize) hlen=a3len-wordSize;
1131 > int imin=rlen>>1; //last half of the read, left boundary for the wmer match
1132 > if (imin<a3len) { imin=GMIN(a3len, rlen-wordSize); }
1133 > imin=rlen-imin;
1134 > for (int iw=0;iw<hlen;iw++) {
1135 >   //int32* qv=(int32*)(adapter3.chars()+iw);
1136 >   int qv=get3mer_value(adapter3.chars()+iw);
1137 >   fi=-1;
1138 >   //while ((fi=fast4rmatch(*qv, seq.chars(), rlen, fi))>=0 && fi>=imin) {
1139 >   while ((fi=w3_rmatch(qv, seq.chars(), rlen, fi))>=0 && fi>=imin) {
1140 >     //GMessage(" ... fi=%d after w3_rmatch() (imin=%d)\n", fi, imin);
1141 >
1142 > #ifdef DEBUG
1143 >     if (debug) GMessage("    Wmatch found: %*s\n", fi+wordSize, (adapter3.substr(iw,wordSize)).chars());
1144 > #endif
1145 >     if (extendMatch(seq.chars(), rlen, fi, adapter3.chars(),
1146 >                   a3len, iw, wordSize, l5,l3, a3segs)) return true;
1147 >     fi--;
1148 >     if (fi<imin) break;
1149 >     }
1150 >   } //for each wmer in the first hlen bases of the adaptor
1151 > /*
1152 > //couldn't find a good trimming extension, hash 12 more bases of the adapter to collect more segment pairs there
1153 > //but only do this if we already have segment pairs collected in the last 12 bases of the adapter
1154 > if (a3segs.bstart>3 || a3segs.bend<(uint)(hlen-wordSize)) return false;
1155 > int hlen2=a3len-wordSize;
1156 > //if (hlen2>a3len-4) hlen2=a3len-4;
1157 > if (hlen2>hlen) {
1158 > #ifdef DEBUG
1159 >     if (debug && a3segs.Count()>0) {
1160 >        GMessage("  >>>>>2nd. hash: %s\n",seq.chars());
1161 >        }
1162 > #endif
1163 >     for (int iw=hlen;iw<hlen2;iw++) {
1164 >         //int* qv=(int32 *)(adapter3.chars()+iw);
1165 >         int qv=get3mer_value(adapter3.chars()+iw);
1166 >         fi=-1;
1167 >         //while ((fi=fast4rmatch(*qv, seq.chars(), rlen, fi))>=0 && fi>=imin) {
1168 >         while ((fi=w3_rmatch(qv, seq.chars(), rlen, fi))>=0 && fi>=imin) {
1169 >           extendMatch(seq.chars(), rlen, fi, adapter3.chars(),
1170 >                         a3len, iw, wordSize, l5,l3, a3segs);
1171 >           fi--;
1172 >           if (fi<imin) break;
1173 >           }
1174 >         } //for each wmer between hlen2 and hlen bases of the adaptor
1175 >     }
1176 > //lastly, analyze collected a3segs for a possible gapped alignment:
1177 > GList<CSegChain> segchains(false,true,false);
1178 > #ifdef DEBUG
1179 > if (debug && a3segs.Count()>0) {
1180 >   GMessage(">>>>>>>>>   Read: %s\n",seq.chars());
1181     }
1182 < //no easy cases, so let's do the word hashing
1183 < for (int iw=0;iw<6;iw++) {
1184 <   GStr aword=adapter3.substr(iw,3);
1185 <   if ((fi=seq.index(aword))>=0  && rlen-fi>3) {
1186 <      if (extendMatch(seq.chars(), rlen, fi, adapter3.chars(),
1187 <                   a3len, iw, 3, l5,l3)) return true;
1182 > #endif
1183 > for (int i=0;i<a3segs.Count();i++) {
1184 >   if (a3segs[i]->chain==NULL) {
1185 >       if (a3segs[i]->b.start>3) continue; //don't start a hopeless chain
1186 >       CSegChain* newchain=new CSegChain();
1187 >       newchain->setFreeItem(false);
1188 >       newchain->addSegPair(a3segs[i]);
1189 >       a3segs[i]->chain=newchain;
1190 >       segchains.Add(newchain); //just to free them when done
1191 >       }
1192 >   for (int j=i+1;j<a3segs.Count();j++) {
1193 >      CSegChain* chain=a3segs[i]->chain;
1194 >      if (chain->extendChain(a3segs[j])) {
1195 >          a3segs[j]->chain=chain;
1196 > #ifdef DEBUG
1197 >          if (debug) dbgPrintChain(*chain, adapter3.chars());
1198 > #endif      
1199 >          //save time by checking here if the extended chain is already acceptable for trimming
1200 >          if (chain->aend>(uint)(rlen-4) && chain->bstart<4 && chain->score>a_min_chain_score) {
1201 >            l5=0;
1202 >            l3=chain->astart-2;
1203 > #ifdef DEBUG
1204 >          if (debug && a3segs.Count()>0) {
1205 >            GMessage(">>> >> trimmed-3: %*s\n",l3-l5+1,seq.substr(l5,l3-l5+1).chars());
1206 >            }
1207 > #endif
1208 >            return true;
1209 >            }
1210 >          } //chain can be extended
1211        }
1212 <   }
1212 >   } //collect segment alignments into chains
1213 > */  
1214   return false; //no adapter parts found
1215   }
1216  
# Line 751 | Line 1232
1232        }
1233     return true;
1234     }
1235 < //for fast detection of adapter-rich reads, check if the first 12
1236 < //characters of the read are a substring of the adapter
1237 < GStr rstart=seq.substr(1,12);
1238 < if ((fi=adapter5.index(rstart))>=0) {
1239 <   //l3=rlen-1;
1240 <   //l5=a5len-4;
1241 <   //while (fi+l5<a5len && l5<l3 && adapter5[fi+l5]==seq[l5]) l5++;
1242 <   //return true;
1243 <   //if (debug) GMessage(" first 12char of the read match adaptor!\n");
1244 <   extendMatch(seq.chars(), rlen, 1,
764 <                 adapter5.chars(), a5len, fi,  12, l5,l3, true);
765 <   return true;
1235 > #ifdef DEBUG
1236 > if (debug) GMessage(">TRIM5 >>   Read: %s\n",seq.chars());
1237 > #endif
1238 >
1239 > CSegChain a5segs(true); //list of segment pairs to analyze later if no extendMatch succeeded
1240 >
1241 > //try the easy way out first - look for an exact match of 11 bases
1242 > int fdlen=11;
1243 >  if (a5len<16) {
1244 >   fdlen=a5len>>1;
1245     }
1246 <  
1247 < //another easy case: last 12 characters of the adaptor found as a substring of the read
1248 < int aplen=12;
1249 < int apstart=a5len-aplen-2;
1250 < if (apstart<0) { apstart=0; aplen=a5len-1; }
1251 < GStr bstr=adapter5.substr(apstart, aplen);
1252 < if ((fi=seq.index(bstr))>=0) {
1253 <   //if (debug) GMessage(" last 12char of adaptor match the read!\n");
1254 <   extendMatch(seq.chars(), rlen, fi,
1255 <                 adapter5.chars(), a5len, apstart,  aplen, l5,l3,true);
1256 <   return true;
1246 > if (fdlen>4) {
1247 >     GStr rstart=seq.substr(1,fdlen); //skip the first base as it's sometimes bogus
1248 >     if ((fi=adapter5.index(rstart))>=0) {
1249 > #ifdef DEBUG
1250 >       if (debug) GMessage("  W11match found: %*s\n", 1+fdlen, (adapter3.substr(fi,fdlen)).chars());
1251 > #endif
1252 >       if (extendMatch(seq.chars(), rlen, 1,
1253 >                     adapter5.chars(), a5len, fi,  fdlen, l5,l3, a5segs, true))
1254 >           return true;
1255 >       }
1256 >     //another easy case: last 11 characters of the adaptor found as a substring of the read
1257 >     GStr bstr=adapter5.substr(-fdlen);
1258 >     if ((fi=seq.index(bstr))>=0) {
1259 > #ifdef DEBUG
1260 >       if (debug) GMessage("  A11match found: %*s\n", fi+fdlen, bstr.chars());
1261 > #endif
1262 >       if (extendMatch(seq.chars(), rlen, fi,
1263 >                     adapter5.chars(), a5len, a5len-fdlen,  fdlen, l5,l3,a5segs,true))
1264 >          return true;
1265 >       }
1266 >     } //tried to matching at most 11 bases first
1267 >    
1268 > //-- no easy cases, do the wmer hashing for the last 12 bases of the adaptor
1269 > //-- only extend a wmer if it matches in the 5' (beginning) region of the read
1270 > int wordSize=3;
1271 > int hlen=12;
1272 > if (hlen>a5len-wordSize) hlen=a5len-wordSize;
1273 > int imax=rlen>>1; //first half of the read, right boundary for the wmer match
1274 > if (imax<a5len) { imax=GMIN(a5len, rlen-wordSize); }
1275 > for (int iw=0;iw<=hlen;iw++) {
1276 >   int apstart=a5len-iw-wordSize;
1277 >   fi=0;
1278 >   //int* qv=(int32 *)(adapter5.chars()+apstart);
1279 >   int qv=get3mer_value(adapter5.chars()+apstart);
1280 >   //while ((fi=fast4match(*qv, seq.chars(), rlen, fi))>=0 && fi<=imax) {
1281 >   while ((fi=w3_match(qv, seq.chars(), rlen, fi))>=0 && fi<=imax) {
1282 > #ifdef DEBUG
1283 >     if (debug) GMessage("    Wmatch found: %*s\n", fi+wordSize, (adapter5.substr(apstart,wordSize)).chars());
1284 > #endif
1285 >     if (extendMatch(seq.chars(), rlen, fi, adapter5.chars(),
1286 >                a5len, apstart, wordSize, l5,l3, a5segs, true)) return true;
1287 >     fi++;
1288 >     if (fi>imax) break;
1289 >     }
1290 >   } //for each wmer in the last hlen bases of the adaptor
1291 > /*
1292 >
1293 > //couldn't find a good trimming extension, hash 12 more bases of the adapter to collect more segment pairs there
1294 > //but only do this if we already have segment pairs collected in the last 12 bases of the adapter
1295 > if (a5segs.bend<(uint)(a5len-3) || a5segs.bstart>(uint)(a5len-hlen+4)) return false;
1296 > int hlen2=a5len-wordSize;
1297 > //if (hlen2>a5len-wordSize) hlen2=a5len-wordSize;
1298 > #ifdef DEBUG
1299 >      if (debug && a5segs.Count()>0) {
1300 >        GMessage("  >>>>>2nd. hash: %s\n",seq.chars());
1301 >        }
1302 > #endif
1303 > if (hlen2>hlen) {
1304 >     for (int iw=hlen+1;iw<=hlen2;iw++) {
1305 >         int apstart=a5len-iw-wordSize;
1306 >         fi=0;
1307 >         //int* qv=(int32 *)(adapter5.chars()+apstart);
1308 >         int qv=get3mer_value(adapter5.chars()+apstart);
1309 >         //while ((fi=fast4match(*qv, seq.chars(), rlen, fi))>=0 && fi<=imax) {
1310 >         while ((fi=w3_match(qv, seq.chars(), rlen, fi))>=0 && fi<=imax) {
1311 >           extendMatch(seq.chars(), rlen, fi, adapter5.chars(),
1312 >                      a5len, apstart, wordSize, l5,l3, a5segs, true);
1313 >           fi++;
1314 >           if (fi>imax) break;
1315 >           }
1316 >         } //for each wmer between hlen2 and hlen bases of the adaptor
1317 >     }
1318 > if (a5segs.bend<(uint)(a5len-3) || a5segs.bstart>(uint)(a5len-hlen+4)) return false;
1319 > // lastly, analyze collected a5segs for a possible gapped alignment:
1320 > GList<CSegChain> segchains(false,true,false);
1321 > #ifdef DEBUG
1322 > if (debug && a5segs.Count()>0) {
1323 >   GMessage(">>>>>>>>>   Read: %s\n",seq.chars());
1324     }
1325 < //no easy cases, find a triplet match as a seed for alignment extension
1326 < //find triplets at the right end of the adapter
1327 < for (int iw=0;iw<6;iw++) {
1328 <   apstart=a5len-iw-4;
1329 <   GStr aword=adapter5.substr(apstart,3);
1330 <   if ((fi=seq.index(aword))>=0) {
1331 <      //if (debug) GMessage("extending wmatch :%*s\n", fi+3, aword.chars());
1332 <      if (extendMatch(seq.chars(), rlen, fi, adapter5.chars(),
1333 <                   a5len, apstart, 3, l5,l3,true)) {
1334 <                     return true;
1335 <                     }
1325 > #endif
1326 > for (int i=0;i<a5segs.Count();i++) {
1327 >   if (a5segs[i]->chain==NULL) {
1328 >       if (a5segs[i]->b.end<(int)(a5len-4)) continue; //don't start a hopeless chain
1329 >       CSegChain* newchain=new CSegChain(true);
1330 >       newchain->setFreeItem(false);
1331 >       newchain->addSegPair(a5segs[i]);
1332 >       a5segs[i]->chain=newchain;
1333 >       segchains.Add(newchain); //just to free them when done
1334 >       }
1335 >   for (int j=i+1;j<a5segs.Count();j++) {
1336 >      CSegChain* chain=a5segs[i]->chain;
1337 >      if (chain->extendChain(a5segs[j])) {
1338 >         a5segs[j]->chain=chain;
1339 > #ifdef DEBUG
1340 >         if (debug) dbgPrintChain(*chain, adapter5.chars());
1341 > #endif      
1342 >      //save time by checking here if the extended chain is already acceptable for trimming
1343 >         if (chain->bend>(uint)(a5len-3) && chain->astart<4 && chain->score>a_min_chain_score) {
1344 >            l5=chain->aend;
1345 >            l3=rlen-1;
1346 >            return true;
1347 >            }
1348 >         } //chain can be extended
1349        }
1350 <   }
1350 >   } //collect segment alignments into chains
1351 > */
1352   return false; //no adapter parts found
1353   }
1354  
1355 < //conversion of phred64 to phread33
1356 < void convertQ64(GStr& q) {
1357 < for (int i=0;i<q.length();i++) q[i]-=31;
1355 > //convert qvs to/from phred64 from/to phread33
1356 > void convertPhred(GStr& q) {
1357 > for (int i=0;i<q.length();i++) q[i]+=qv_cvtadd;
1358 > }
1359 >
1360 > void convertPhred(char* q, int len) {
1361 > for (int i=0;i<len;i++) q[i]+=qv_cvtadd;
1362 > }
1363 >
1364 > bool getFastxRec(GLineReader& fq, GStr& rseq, GStr& rqv,
1365 >          GStr& rname, GStr& rinfo, GStr& infname) {
1366 > rseq="";
1367 > rqv="";
1368 > rname="";
1369 > rinfo="";
1370 > if (fq.eof()) return false;
1371 > char* l=fq.getLine();
1372 > while (l!=NULL && (l[0]==0 || isspace(l[0]))) l=fq.getLine(); //ignore empty lines
1373 > if (l==NULL) return false;
1374 > /* if (rawFormat) {
1375 >      //TODO: implement raw qseq parsing here
1376 >      //if (raw type=N) then continue; //skip invalid/bad records
1377 >      } //raw qseq format
1378 > else { // FASTQ or FASTA */
1379 > isfasta=(l[0]=='>');
1380 > if (!isfasta && l[0]!='@') GError("Error: fasta/fastq record marker not found(%s)\n%s\n",
1381 >      infname.chars(), l);
1382 > GStr s(l);
1383 > rname=&(l[1]);
1384 > for (int i=0;i<rname.length();i++)
1385 >    if (rname[i]<=' ') {
1386 >       if (i<rname.length()-2) rinfo=rname.substr(i+1);
1387 >       rname.cut(i);
1388 >       break;
1389 >       }
1390 >  //now get the sequence
1391 > if ((l=fq.getLine())==NULL)
1392 >      GError("Error: unexpected EOF after header for read %s (%s)\n",
1393 >                   rname.chars(), infname.chars());
1394 > rseq=l; //this must be the DNA line
1395 > while ((l=fq.getLine())!=NULL) {
1396 >      //seq can span multiple lines
1397 >      if (l[0]=='>' || l[0]=='+') {
1398 >           fq.pushBack();
1399 >           break; //
1400 >           }
1401 >      rseq+=l;
1402 >      } //check for multi-line seq
1403 > if (!isfasta) { //reading fastq quality values, which can also be multi-line
1404 >    if ((l=fq.getLine())==NULL)
1405 >        GError("Error: unexpected EOF after sequence for %s\n", rname.chars());
1406 >    if (l[0]!='+') GError("Error: fastq qv header marker not detected!\n");
1407 >    if ((l=fq.getLine())==NULL)
1408 >        GError("Error: unexpected EOF after qv header for %s\n", rname.chars());
1409 >    rqv=l;
1410 >    //if (rqv.length()!=rseq.length())
1411 >    //  GError("Error: qv len != seq len for %s\n", rname.chars());
1412 >    while (rqv.length()<rseq.length() && ((l=fq.getLine())!=NULL)) {
1413 >      rqv+=l; //append to qv string
1414 >      }
1415 >    }// fastq
1416 > // } //<-- FASTA or FASTQ
1417 > rseq.upper(); //TODO: what if we care about masking?
1418 > return true;
1419 > }
1420 >
1421 > char process_read(GStr& rname, GStr& rseq, GStr& rqv, int &l5, int &l3) {
1422 > //returns 0 if the read was untouched, 1 if it was just trimmed
1423 > // and a trash code if it was trashed
1424 > l5=0;
1425 > l3=rseq.length()-1;
1426 > if (l3-l5+1<min_read_len) {
1427 >   return 's';
1428 >   }
1429 > GStr wseq(rseq.chars());
1430 > GStr wqv(rqv.chars());
1431 > int w5=l5;
1432 > int w3=l3;
1433 > //first do the q-based trimming
1434 > if (qvtrim_qmin!=0 && !wqv.is_empty() && qtrim(wqv, w5, w3)) { // qv-threshold trimming
1435 >   if (w3-w5+1<min_read_len) {
1436 >     return 'Q'; //invalid read
1437 >     }
1438 >    //-- keep only the w5..w3 range
1439 >   l5=w5;
1440 >   l3=w3;
1441 >   wseq=wseq.substr(w5, w3-w5+1);
1442 >   if (!wqv.is_empty())
1443 >      wqv=wqv.substr(w5, w3-w5+1);
1444 >   } //qv trimming
1445 > // N-trimming on the remaining read seq
1446 > if (ntrim(wseq, w5, w3)) {
1447 >   //GMessage("before: %s\n",wseq.chars());
1448 >   //GMessage("after : %s (%d)\n",wseq.substr(w5,w3-w5+1).chars(),w3-w5+1);
1449 >   l5+=w5;
1450 >   l3-=(wseq.length()-1-w3);
1451 >   if (w3-w5+1<min_read_len) {
1452 >     return 'N'; //to be trashed
1453 >     }
1454 >    //-- keep only the w5..w3 range
1455 >   wseq=wseq.substr(w5, w3-w5+1);
1456 >   if (!wqv.is_empty())
1457 >      wqv=wqv.substr(w5, w3-w5+1);
1458 >   w5=0;
1459 >   w3=wseq.length()-1;
1460 >   }
1461 > if (a3len>0) {
1462 >  if (trim_adapter3(wseq, w5, w3)) {
1463 >     int trimlen=wseq.length()-(w3-w5+1);
1464 >     num_trimmed3++;
1465 >     if (trimlen<min_trimmed3)
1466 >         min_trimmed3=trimlen;
1467 >     l5+=w5;
1468 >     l3-=(wseq.length()-1-w3);
1469 >     if (w3-w5+1<min_read_len) {
1470 >         return '3';
1471 >         }
1472 >      //-- keep only the w5..w3 range
1473 >      wseq=wseq.substr(w5, w3-w5+1);
1474 >      if (!wqv.is_empty())
1475 >         wqv=wqv.substr(w5, w3-w5+1);
1476 >      }//some adapter was trimmed
1477 >   } //adapter trimming
1478 > if (a5len>0) {
1479 >  if (trim_adapter5(wseq, w5, w3)) {
1480 >     int trimlen=wseq.length()-(w3-w5+1);
1481 >     num_trimmed5++;
1482 >     if (trimlen<min_trimmed5)
1483 >         min_trimmed5=trimlen;
1484 >     l5+=w5;
1485 >     l3-=(wseq.length()-1-w3);
1486 >     if (w3-w5+1<min_read_len) {
1487 >         return '5';
1488 >         }
1489 >      //-- keep only the w5..w3 range
1490 >      wseq=wseq.substr(w5, w3-w5+1);
1491 >      if (!wqv.is_empty())
1492 >         wqv=wqv.substr(w5, w3-w5+1);
1493 >      }//some adapter was trimmed
1494 >   } //adapter trimming
1495 > if (doCollapse) {
1496 >   //keep read for later
1497 >   FqDupRec* dr=dhash.Find(wseq.chars());
1498 >   if (dr==NULL) { //new entry
1499 >          //if (prefix.is_empty())
1500 >             dhash.Add(wseq.chars(),
1501 >                  new FqDupRec(&wqv, rname.chars()));
1502 >          //else dhash.Add(wseq.chars(), new FqDupRec(wqv.chars(),wqv.length()));
1503 >         }
1504 >      else
1505 >         dr->add(wqv);
1506 >   } //collapsing duplicates
1507 > else { //not collapsing duplicates
1508 >   //apply the dust filter now
1509 >   if (doDust) {
1510 >     int dustbases=dust(wseq);
1511 >     if (dustbases>(wseq.length()>>1)) {
1512 >        return 'D';
1513 >        }
1514 >     }
1515 >   } //not collapsing duplicates
1516 > return (l5>0 || l3<rseq.length()-1) ? 1 : 0;
1517 > }
1518 >
1519 > void printHeader(FILE* f_out, char recmarker, GStr& rname, GStr& rinfo) {
1520 > //GMessage("printing Header..%c%s\n",recmarker, rname.chars());
1521 > if (rinfo.is_empty()) fprintf(f_out, "%c%s\n",recmarker,rname.chars());
1522 >  else fprintf(f_out, "%c%s %s\n",recmarker, rname.chars(), rinfo.chars());
1523 > }
1524 >
1525 > void writeRead(FILE* f_out, GStr& rname, GStr& rinfo, GStr& rseq, GStr& rqv, int& outcounter) {
1526 >   outcounter++;
1527 >   bool asFasta=(rqv.is_empty() || fastaOutput);
1528 >   if (asFasta) {
1529 >    if (prefix.is_empty()) {
1530 >       printHeader(f_out, '>',rname,rinfo);
1531 >       fprintf(f_out, "%s\n", rseq.chars()); //plain one-line fasta for now
1532 >       }
1533 >      else {
1534 >       fprintf(f_out, ">%s%08d\n%s\n", prefix.chars(), outcounter,
1535 >                          rseq.chars());
1536 >       }
1537 >     }
1538 >   else {  //fastq
1539 >    if (convert_phred) convertPhred(rqv);
1540 >    if (prefix.is_empty()) {
1541 >       printHeader(f_out, '@', rname, rinfo);
1542 >       fprintf(f_out, "%s\n+\n%s\n", rseq.chars(), rqv.chars());
1543 >       }
1544 >      else
1545 >       fprintf(f_out, "@%s_%08d\n%s\n+\n%s\n", prefix.chars(), outcounter,
1546 >                          rseq.chars(),rqv.chars() );
1547 >     }
1548 > }
1549 >
1550 > void trash_report(char trashcode, GStr& rname, FILE* freport) {
1551 > if (freport==NULL || trashcode<=' ') return;
1552 > if (trashcode=='3' || trashcode=='5') {
1553 >   fprintf(freport, "%s\tA%c\n",rname.chars(),trashcode);
1554 >   }
1555 > else {
1556 >   fprintf(freport, "%s\t%c\n",rname.chars(),trashcode);
1557 >   }
1558 > //tcounter++;
1559 > }
1560 >
1561 > GStr getFext(GStr& s, int* xpos=NULL) {
1562 > //s must be a filename without a path
1563 > GStr r("");
1564 > if (xpos!=NULL) *xpos=0;
1565 > if (s.is_empty() || s=="-") return r;
1566 > int p=s.rindex('.');
1567 > int d=s.rindex('/');
1568 > if (p<=0 || p>s.length()-2 || p<s.length()-7 || p<d) return r;
1569 > r=s.substr(p+1);
1570 > if (xpos!=NULL) *xpos=p+1;
1571 > r.lower();
1572 > return r;
1573 > }
1574 >
1575 > void baseFileName(GStr& fname) {
1576 > //remove all known extensions, like .txt,fq,fastq,fasta,fa)(.gz .gzip .bz2 .bzip2) .
1577 > int xpos=0;
1578 > GStr fext=getFext(fname, &xpos);
1579 > if (xpos<=1) return;
1580 > bool extdel=false;
1581 > GStr f2;
1582 > if (fext=="z" || fext=="zip") {
1583 >   extdel=true;
1584 >   }
1585 >  else if (fext.length()>=2) {
1586 >   f2=fext.substr(0,2);
1587 >   extdel=(f2=="gz" || f2=="bz");
1588 >   }
1589 > if (extdel) {
1590 >   fname.cut(xpos-1);
1591 >   fext=getFext(fname, &xpos);
1592 >   if (xpos<=1) return;
1593 >   }
1594 > extdel=false;
1595 > if (fext=="f" || fext=="fq" || fext=="txt" || fext=="seq" || fext=="sequence") {
1596 >   extdel=true;
1597 >   }
1598 >  else if (fext.length()>=2) {
1599 >   extdel=(fext.substr(0,2)=="fa");
1600 >   }
1601 > if (extdel) fname.cut(xpos-1);
1602 > GStr fncp(fname);
1603 > fncp.lower();
1604 > fncp.chomp("seq");
1605 > fncp.chomp("sequence");
1606 > fncp.trimR("_.");
1607 > if (fncp.length()<fname.length()) fname.cut(fncp.length());
1608   }
1609  
1610 < void convertQ64(char* q, int len) {
1611 < for (int i=0;i<len;i++) q[i]-=31;
1610 > FILE* prepOutFile(GStr& infname, GStr& pocmd) {
1611 >  FILE* f_out=NULL;
1612 >  GStr fname(getFileName(infname.chars()));
1613 >  //eliminate known extensions
1614 >  baseFileName(fname);
1615 >  if (outsuffix.is_empty() || outsuffix=="-") { return stdout; }
1616 >    else if (pocmd.is_empty()) {
1617 >               GStr oname(fname);
1618 >               oname.append('.');
1619 >               oname.append(outsuffix);
1620 >               f_out=fopen(oname.chars(),"w");
1621 >               if (f_out==NULL) GError("Error: cannot create '%s'\n",oname.chars());
1622 >               }
1623 >            else {
1624 >              GStr oname(">");
1625 >              oname.append(fname);
1626 >              oname.append('.');
1627 >              oname.append(outsuffix);
1628 >              pocmd.append(oname);
1629 >              f_out=popen(pocmd.chars(), "w");
1630 >              if (f_out==NULL) GError("Error: cannot popen '%s'\n",pocmd.chars());
1631 >              }
1632 > return f_out;
1633   }
1634  
1635 + void guess_unzip(GStr& fname, GStr& picmd) {
1636 + GStr fext=getFext(fname);
1637 + if (fext=="gz" || fext=="gzip" || fext=="z") {
1638 +    picmd="gzip -cd ";
1639 +    }
1640 +   else if (fext=="bz2" || fext=="bzip2" || fext=="bz" || fext=="bzip") {
1641 +    picmd="bzip2 -cd ";
1642 +    }
1643 + }
1644 +
1645 + void setupFiles(FILE*& f_in, FILE*& f_in2, FILE*& f_out, FILE*& f_out2,
1646 +                       GStr& s, GStr& infname, GStr& infname2) {
1647 + // uses outsuffix to generate output file names and open file handles as needed
1648 + infname="";
1649 + infname2="";
1650 + f_in=NULL;
1651 + f_in2=NULL;
1652 + f_out=NULL;
1653 + f_out2=NULL;
1654 + //analyze outsuffix intent
1655 + GStr pocmd;
1656 + if (outsuffix=="-") {
1657 +    f_out=stdout;
1658 +    }
1659 +   else {
1660 +    GStr ox=getFext(outsuffix);
1661 +    if (ox.length()>2) ox=ox.substr(0,2);
1662 +    if (ox=="gz") pocmd="gzip -9 -c ";
1663 +        else if (ox=="bz") pocmd="bzip2 -9 -c ";
1664 +    }
1665 + if (s=="-") {
1666 +    f_in=stdin;
1667 +    infname="stdin";
1668 +    f_out=prepOutFile(infname, pocmd);
1669 +    return;
1670 +    } // streaming from stdin
1671 + s.startTokenize(",:");
1672 + s.nextToken(infname);
1673 + bool paired=s.nextToken(infname2);
1674 + if (fileExists(infname.chars())==0)
1675 +    GError("Error: cannot find file %s!\n",infname.chars());
1676 + GStr fname(getFileName(infname.chars()));
1677 + GStr picmd;
1678 + guess_unzip(fname, picmd);
1679 + if (picmd.is_empty()) {
1680 +   f_in=fopen(infname.chars(), "r");
1681 +   if (f_in==NULL) GError("Error opening file '%s'!\n",infname.chars());
1682 +   }
1683 +  else {
1684 +   picmd.append(infname);
1685 +   f_in=popen(picmd.chars(), "r");
1686 +   if (f_in==NULL) GError("Error at popen %s!\n", picmd.chars());
1687 +   }
1688 + if (f_out==stdout) {
1689 +   if (paired) GError("Error: output suffix required for paired reads\n");
1690 +   return;
1691 +   }
1692 + f_out=prepOutFile(infname, pocmd);
1693 + if (!paired) return;
1694 + if (doCollapse) GError("Error: sorry, -C option cannot be used with paired reads!\n");
1695 + // ---- paired reads:-------------
1696 + if (fileExists(infname2.chars())==0)
1697 +     GError("Error: cannot find file %s!\n",infname2.chars());
1698 + picmd="";
1699 + GStr fname2(getFileName(infname2.chars()));
1700 + guess_unzip(fname2, picmd);
1701 + if (picmd.is_empty()) {
1702 +   f_in2=fopen(infname2.chars(), "r");
1703 +   if (f_in2==NULL) GError("Error opening file '%s'!\n",infname2.chars());
1704 +   }
1705 +  else {
1706 +   picmd.append(infname2);
1707 +   f_in2=popen(picmd.chars(), "r");
1708 +   if (f_in2==NULL) GError("Error at popen %s!\n", picmd.chars());
1709 +   }
1710 + f_out2=prepOutFile(infname2, pocmd);
1711 + }

Diff Legend

Removed lines
+ Added lines
< Changed lines
> Changed lines