ViewVC Help
View File | Revision Log | Show Annotations | View Changeset | Root Listing
root/gclib/fqtrim/fqtrim.cpp
(Generate patch)
# Line 2 | Line 2
2   #include "GStr.h"
3   #include "GHash.hh"
4   #include "GList.hh"
5 + #include <ctype.h>
6  
7   #define USAGE "Usage:\n\
8 < fqtrim [-5 <5'adapter>] [-3 <3'adapter>] [-l <minlen>] [-C] [-D] [-Q] \\\n\
9 <    [-n <rename_prefix>] [-o <trimmed.fq>] [-r <discarded.lst>] <input.fq>\n\
8 > fqtrim [{-5 <5adapter> -3 <3adapter>|-f <adapters_file>}] [-a <min_matchlen>]\\\n\
9 >   [-q <minq> [-t <trim_max_len>]] [-p {64|33}] [-o <outsuffix>]\\\n\
10 >   [-l <minlen>] [-C] [-D] [-Q] [-n <rename_prefix>] [-r <discarded.lst>]\\\n\
11 >    <input.fq>[,<input_mates.fq>\n\
12   \n\
13 < Trim low quality bases at 3' end, optional adapter, filter for low complexity and\n\
14 < optionally collapse duplicates in the input fastq data.\n\
13 > Trim low quality bases at the 3' end and can trim adapter sequence(s), filter\n\
14 > for low complexity and collapse duplicate reads.\n\
15 > If read pairs should be trimmed and kept together (i.e. without discarding\n\
16 > one read in a pair), the two file names should be given delimited by a comma\n\
17 > or a colon character\n\
18   \n\
19   Options:\n\
20   -n  rename all the reads using the <prefix> followed by a read counter;\n\
21      if -C option was given, the suffix \"_x<N>\" is appended, with <N> being\n\
22      the read duplication count\n\
23 < -o  write the trimmed/filtered fastq into <trimmed.fq>(instead of stdout)\n\
23 > -o  unless this parameter is '-', write the trimmed/filtered reads to \n\
24 >    file(s) named <input>.<outsuffix> which will be created in the \n\
25 >    current (working) directory; (writes to stdout if -o- is given);\n\
26 >    a suffix ending with .gz, .gzip or .bz2 will enforce compression\n\
27 > -f  file with adapter sequences to trim, each line having this format:\n\
28 >    <5'-adapter-sequence> <3'-adapter-sequence>\n\
29   -5  trim the given adapter or primer sequence at the 5' end of each read\n\
30      (e.g. -5 CGACAGGTTCAGAGTTCTACAGTCCGACGATC)\n\
31   -3  trim the given adapter sequence at the 3' end of each read\n\
32      (e.g. -3 TCGTATGCCGTCTTCTGCTTG)\n\
33 < -l  minimum \"clean\" length at the high quality 5'end that a read must\n\
34 <    have in order to pass the filter (default: 16)\n\
33 > -a  minimum length of exact match to adaptor sequence at the proper end (6)\n\
34 > -q  trim bases with quality value lower than <minq> (starting at the 3' end)\n\
35 > -t  for -q option, maximum trimming at the 3' end is limited to <trim_max_len>\n\
36 > -m  maximum percentage of Ns allowed in a read after trimming (default 7)\n\
37 > -l  minimum \"clean\" length after trimming that a read must have\n\
38 >    in order to pass the filter (default: 16)\n\
39   -r  write a simple \"trash report\" file listing the discarded reads with a\n\
40      one letter code for the reason why they were trashed\n\
41 + -D  apply a low-complexity (dust) filter and discard any read that has over\n\
42 +    50% of its length detected as low complexity\n\
43   -C  collapse duplicate reads and add a prefix with their count to the read\n\
44 <    name\n\
45 < -D  apply a low-complexity (dust) filter and discard any read that has at\n\
46 <    least half of it masked by this\n\
47 < -Q  quality values in the input data are interpreted as phred64, convert\n\
31 <    them to phred33\n\
44 >    name (e.g. for microRNAs)\n\
45 > -p  input is phred64/phred33 (use -p64 or -p33)\n\
46 > -Q  convert quality values to the other Phred qv type\n\
47 > -V  verbose processing\n\
48   "
49 +
50 + //-z  for -o option, the output stream(s) will be first piped into the given\n
51 + //   <zcmd> command, which must output to stdout (e.g. -z 'bzip2 -9 -c')\n
52 +
53 +
54   // example 3' adapter for miRNAs: TCGTATGCCGTCTTCTGCTTG
55 < FILE* f_out=NULL; //stdout if not provided
56 < FILE* f_in=NULL; //input fastq (stdin if not provided)
55 >
56 > //For paired reads sequencing:
57 > //3' : ACACTCTTTCCCTACACGACGCTCTTCCGATCT
58 > //5' : GATCGGAAGAGCGGTTCAGCAGGAATGCCGAG
59 > //FILE* f_out=NULL; //stdout if not provided
60 > //FILE* f_out2=NULL; //for paired reads
61 > //FILE* f_in=NULL; //input fastq (stdin if not provided)
62 > //FILE* f_in2=NULL; //for paired reads
63   FILE* freport=NULL;
64 +
65   bool debug=false;
66 + bool verbose=false;
67   bool doCollapse=false;
68   bool doDust=false;
69 + bool fastaOutput=false;
70   bool trashReport=false;
71 < bool rawFormat=false;
71 > //bool rawFormat=false;
72   int min_read_len=16;
73 + double max_perc_N=7.0;
74   int dust_cutoff=16;
75   bool isfasta=false;
76 < bool phred64=false;
76 > bool convert_phred=false;
77 > GStr outsuffix; // -o
78 > //GStr adapter3;
79 > //GStr adapter5;
80   GStr prefix;
81 < GStr adapter3;
82 < GStr adapter5;
81 > GStr zcmd;
82 > int num_trimmed5=0;
83 > int num_trimmed3=0;
84 > int num_trimmedN=0;
85 > int num_trimmedQ=0;
86 > int min_trimmed5=INT_MAX;
87 > int min_trimmed3=INT_MAX;
88 >
89 > int qvtrim_qmin=0;
90 > int qvtrim_max=0;  //for -q, do not trim at 3'-end more than this number of bases
91 > int qv_phredtype=0; // could be 64 or 33 (0 means undetermined yet)
92 > int qv_cvtadd=0; //could be -31 or +31
93 >
94   int a3len=0;
95   int a5len=0;
96   // adaptor matching metrics -- for extendMatch() function
97 < static const int a_m_score=2; //match score
98 < static const int a_mis_score=-3; //mismatch
99 < static const int a_dropoff_score=7;
100 < static const int a_min_score=7;
97 > const int a_m_score=2; //match score
98 > const int a_mis_score=-3; //mismatch
99 > const int a_dropoff_score=7;
100 > int a_min_score=12; //an exact match of 6 bases at the proper ends WILL be trimmed
101 > const int a_min_chain_score=15; //for gapped alignments
102 >
103 > class CSegChain;
104 >
105 > class CSegPair {
106 >  public:
107 >   GSeg a;
108 >   GSeg b; //the adapter segment
109 >   int score;
110 >   int flags;
111 >   CSegChain* chain;
112 >   CSegPair(int astart=0, int aend=0, int bstart=0, int bend=0, int mscore=0):a(astart,aend),b(bstart, bend) {
113 >      score=mscore;
114 >      if (score==0) score=a.len()*a_m_score;
115 >      flags=0;
116 >      chain=NULL;
117 >      }
118 >   int len() { return  a.len(); }
119 >   bool operator==(CSegPair& d){
120 >      //return (a.start==d.a.start && a.end==d.a.end && b.start==d.b.start && b.end==d.b.end);
121 >      //make equal even segments that are included into one another:
122 >      return (d.a.start>=a.start && d.a.end<=a.end && d.b.start>=b.start && d.b.end<=b.end);
123 >      }
124 >   bool operator>(CSegPair& d){ //ordering based on b (adaptor) start coord and score
125 >     if (b.start==d.b.start) {
126 >        if (score==d.score) {
127 >           //just try to be consistent:
128 >           if (b.end==d.b.end) {
129 >             return (a.start==d.a.start)?(a.end<d.a.end):(a.start<d.a.start);
130 >             }
131 >           return (b.end>d.b.end);
132 >           }
133 >         else return (score<d.score);
134 >        }
135 >     return (b.start>d.b.start);
136 >     }
137 >   bool operator<(CSegPair& d){ //ordering based on b (adaptor) coord
138 >     /*if (b.start==d.b.start && b.end==d.b.end) {
139 >          return (a.start==d.a.start)?(a.end<d.a.end):(a.start<d.a.start);
140 >          }
141 >     return (b.start==d.b.start)?(b.end<d.b.end):(b.start<d.b.start);*/
142 >     if (b.start==d.b.start) {
143 >        if (score==d.score) {
144 >           //just try to be consistent:
145 >           if (b.end==d.b.end) {
146 >             return (a.start==d.a.start)?(a.end>d.a.end):(a.start>d.a.start);
147 >             }
148 >           return (b.end<d.b.end);
149 >           }
150 >         else return (score>d.score);
151 >        }
152 >     return (b.start<d.b.start);
153 >     }
154 > };
155 >
156 > int cmpSegEnds(pointer sa, pointer sb) { //sort by adaptor seg ends AND score
157 > CSegPair& x = *(CSegPair *)sa;
158 > CSegPair& y = *(CSegPair *)sb;
159 > /*
160 > if (x.b.end==y.b.end) {
161 >     if (x.b.start==y.b.start) {
162 >         if (x.a.end==y.a.end) {
163 >            if (x.a.start==y.a.start) return 0;
164 >            return ((x.a.start>y.a.start) ? -1 : 1);
165 >            }
166 >          else {
167 >            return ((x.a.end>y.a.end) ? -1 : 1);
168 >            }
169 >          }
170 >      else {
171 >       return ((x.b.start>y.b.start) ? -1 : 1);
172 >       }
173 >     }
174 >    else {
175 >     return ((x.b.end>y.b.end) ? -1 : 1);
176 >     }
177 > */
178 >  if (x.b.end==y.b.end) {
179 >     if (x.score==y.score) {
180 >     if (x.b.start==y.b.start) {
181 >         if (x.a.end==y.a.end) {
182 >            if (x.a.start==y.a.start) return 0;
183 >            return ((x.a.start<y.a.start) ? -1 : 1);
184 >            }
185 >          else {
186 >            return ((x.a.end<y.a.end) ? -1 : 1);
187 >            }
188 >          }
189 >      else {
190 >       return ((x.b.start<y.b.start) ? -1 : 1);
191 >       }
192 >      } else return ((x.score>y.score) ? -1 : 1);
193 >     }
194 >    else {
195 >     return ((x.b.end>y.b.end) ? -1 : 1);
196 >     }
197 >
198 > }
199 >
200 > class CSegChain:public GList<CSegPair> {
201 > public:
202 >   uint astart;
203 >   uint aend;
204 >   uint bstart;
205 >   uint bend;
206 >   int score;
207 >   bool endSort;
208 >  CSegChain(bool aln5=false):GList<CSegPair>(true,true,true) {//sorted, free elements, unique
209 >   //as SegPairs are inserted, they will be sorted by a.start coordinate
210 >   score=0;
211 >   astart=MAX_UINT;
212 >   aend=0;
213 >   bstart=MAX_UINT;
214 >   bend=0;
215 >   endSort=aln5;
216 >   if (aln5) { setSorted(cmpSegEnds); }
217 >   }
218 > bool operator==(CSegChain& d) {
219 >   //return (score==d.score);
220 >    return (astart==d.astart && aend==d.aend && bstart==d.bstart && bend==d.bend);
221 >   }
222 > bool operator>(CSegChain& d) { // order based on b (adaptor) coordinate
223 >   //return (score<d.score);
224 >   if (bstart==d.bstart && bend==d.bend) {
225 >          return (astart==d.astart)?(aend>d.aend):(astart>d.astart);
226 >          }
227 >     return (bstart==d.bstart)?(bend>d.bend):(bstart>d.bstart);
228 >   }
229 > bool operator<(CSegChain& d) {
230 >   //return (score>d.score);
231 >   if (bstart==d.bstart && bend==d.bend) {
232 >          return (astart==d.astart)?(aend<d.aend):(astart<d.astart);
233 >          }
234 >     return (bstart==d.bstart)?(bend<d.bend):(bstart<d.bstart);
235 >   }
236 > void addSegPair(CSegPair* segp) {
237 >   if (AddIfNew(segp)!=segp) return;
238 >   score+=segp->score;
239 >   if (astart>segp->a.start) astart=segp->a.start;
240 >   if (aend<segp->a.end) aend=segp->a.end;
241 >   if (bstart>segp->b.start) bstart=segp->b.start;
242 >   if (bend<segp->b.end) bend=segp->b.end;
243 >   }
244 > //for building actual chains:
245 > bool extendChain(CSegPair* segp) { //segp expected to be "Greater Than" current chain
246 >   int bgap=0;
247 >   int agap=0;
248 >   //if (endSort) {
249 >   if (bstart>segp->b.start) {
250 >      bgap = (int)(bstart-segp->b.end);
251 >      if (abs(bgap)>2) return false;
252 >      agap = (int)(astart-segp->a.end);
253 >      if (abs(agap)>2) return false;
254 >      }
255 >     else {
256 >      bgap = (int) (segp->b.start-bend);
257 >      if (abs(bgap)>2) return false;
258 >      agap = (int)(segp->a.start-aend);
259 >      if (abs(agap)>2) return false;
260 >      }
261 >   if (agap*bgap<0) return false;
262 >   addSegPair(segp);
263 >   score-=abs(agap)+abs(bgap);
264 >   return true;
265 >   }
266 > };
267  
268   // element in dhash:
269   class FqDupRec {
# Line 91 | Line 302
302    if (!s.is_empty()) {
303        if (s=='-') f=stdout;
304        else {
305 <       f=fopen(s,"w");
305 >       f=fopen(s.chars(),"w");
306         if (f==NULL) GError("Error creating file: %s\n", s.chars());
307         }
308       }
# Line 102 | Line 313
313  
314   GHash<FqDupRec> dhash; //hash to keep track of duplicates
315  
316 < bool ntrim(GStr& rseq, GStr &rqv, int &l5, int &l3); //returns true if any trimming occured
316 > void setupFiles(FILE*& f_in, FILE*& f_in2, FILE*& f_out, FILE*& f_out2,
317 >                       GStr& s, GStr& infname, GStr& infname2);
318 > // uses outsuffix to generate output file names and open file handles as needed
319  
320 + void writeRead(FILE* f_out, GStr& rname, GStr& rinfo, GStr& rseq, GStr& rqv, int& outcounter);
321 + void trash_report(char trashcode, GStr& rname, FILE* freport);
322 +
323 + bool getFastxRec(GLineReader& fq, GStr& rseq, GStr& rqv,
324 +          GStr& rname, GStr& rinfo, GStr& infname);
325 +
326 + char process_read(GStr& rname, GStr& rseq, GStr& rqv, int &l5, int &l3);
327 + //returns 0 if the read was untouched, 1 if it was trimmed and a trash code if it was trashed
328 +
329 + bool ntrim(GStr& rseq, int &l5, int &l3); //returns true if any trimming occured
330 + bool qtrim(GStr& qvs, int &l5, int &l3); //return true if any trimming occured
331   int dust(GStr& seq);
332   bool trim_adapter3(GStr& seq, int &l5, int &l3); //returns true if any trimming occured
333   bool trim_adapter5(GStr& seq, int &l5, int &l3); //returns true if any trimming occured
334  
335 < void convertQ64(char* q, int len);
336 < void convertQ64(GStr& q);
335 > void convertPhred(char* q, int len);
336 > void convertPhred(GStr& q);
337  
338   int main(int argc, char * const argv[]) {
339 <  GArgs args(argc, argv, "YQRDCl:d:3:5:n:r:o:");
339 >  GArgs args(argc, argv, "YQDCVl:d:3:5:m:n:r:p:q:f:t:o:z:a:");
340    int e;
117  int icounter=0; //counter for input reads
118  int outcounter=0; //counter for output reads
341    if ((e=args.isError())>0) {
342        GMessage("%s\nInvalid argument: %s\n", USAGE, argv[e]);
343        exit(224);
344        }
345    debug=(args.getOpt('Y')!=NULL);
346 <  phred64=(args.getOpt('Q')!=NULL);
346 >  verbose=(args.getOpt('V')!=NULL);
347 >  convert_phred=(args.getOpt('Q')!=NULL);
348    doCollapse=(args.getOpt('C')!=NULL);
349    doDust=(args.getOpt('D')!=NULL);
350 +  /*
351    rawFormat=(args.getOpt('R')!=NULL);
352    if (rawFormat) {
353      GError("Sorry, raw qseq format parsing is not implemented yet!\n");
354      }
355 +  */
356    prefix=args.getOpt('n');
357    GStr s=args.getOpt('l');
358    if (!s.is_empty())
359       min_read_len=s.asInt();
360 +  s=args.getOpt('m');
361 +  if (!s.is_empty())
362 +     max_perc_N=s.asDouble();
363    s=args.getOpt('d');
364    if (!s.is_empty()) {
365       dust_cutoff=s.asInt();
366       doDust=true;
367       }
368 <    
368 >  s=args.getOpt('q');
369 >  if (!s.is_empty()) {
370 >     qvtrim_qmin=s.asInt();
371 >     }
372 >  s=args.getOpt('t');
373 >  if (!s.is_empty()) {
374 >     qvtrim_max=s.asInt();
375 >     }
376 >  s=args.getOpt('p');
377 >  if (!s.is_empty()) {
378 >     int v=s.asInt();
379 >     if (v==33) {
380 >        qv_phredtype=33;
381 >        qv_cvtadd=31;
382 >        }
383 >      else if (v==64) {
384 >        qv_phredtype=64;
385 >        qv_cvtadd=-31;
386 >        }
387 >       else
388 >         GMessage("%s\nInvalid value for -p option (can only be 64 or 33)!\n",USAGE);
389 >     }
390    if (args.getOpt('3')!=NULL) {
391      adapter3=args.getOpt('3');
392      adapter3.upper();
# Line 148 | Line 397
397      adapter5.upper();
398      a5len=adapter5.length();
399      }
400 +  s=args.getOpt('a');
401 +  if (!s.is_empty()) {
402 +     int a_minmatch=s.asInt();
403 +     a_min_score=a_minmatch<<1;
404 +     }
405 +  
406 +  if (args.getOpt('o')!=NULL) outsuffix=args.getOpt('o');
407 +                         else outsuffix="-";
408    trashReport=  (args.getOpt('r')!=NULL);
409 <  if (args.startNonOpt()==0) {
409 >  int fcount=args.startNonOpt();
410 >  if (fcount==0) {
411      GMessage(USAGE);
412      exit(224);
413      }
414 <
415 <  openfw(f_out, args, 'o');
416 <  if (f_out==NULL) f_out=stdout;
414 >   if (fcount>1 && doCollapse) {
415 >    GError("%s Sorry, the -C option only works with a single input.\n", USAGE);
416 >    }
417 >  //openfw(f_out, args, 'o');
418 >  //if (f_out==NULL) f_out=stdout;
419    if (trashReport)
420      openfw(freport, args, 'r');
421    char* infile=NULL;
422    while ((infile=args.nextNonOpt())!=NULL) {
423 <    GStr infname(infile);
424 <    if (strcmp(infile,"-")==0) {
425 <       f_in=stdin; infname="stdin"; }
426 <     else {
427 <        f_in=fopen(infile,"r");
428 <        if (f_in==NULL)
429 <            GError("Cannot open input file %s!\n",infile);
430 <        }
431 <     GLineReader fq(f_in);
432 <     char* l=NULL;
433 <     while ((l=fq.getLine())!=NULL) {
434 <        GStr rname; //current read name
435 <        GStr rseq;  //current read sequence
436 <        GStr rqv;   //current read quality values
437 <        GStr s;
438 <        if (rawFormat) {
439 <          //TODO: implement qseq parsing here
440 <          //if (raw type=N) then continue; //skip invalid/bad records
441 <          
442 <          } //raw qseq format
443 <         else { // FASTQ or FASTA
444 <          isfasta=(l[0]=='>');
445 <          if (!isfasta && l[0]!='@') GError("Error: fastq record marker not detected!\n");
446 <          s=l;
447 <          rname=&(l[1]);
448 <          icounter++;
449 <          //GMessage("readname=%s\n",rname.chars());
450 <          for (int i=0;i<rname.length();i++)
451 <            if (rname[i]<=' ') { rname.cut(i); break; }
452 <          //now get the sequence
453 <          if ((l=fq.getLine())==NULL)
454 <              GError("Error: unexpected EOF after header for %s\n",rname.chars());
455 <          rseq=l; //this must be the DNA line
456 <          if (isfasta) {
457 <            while ((l=fq.getLine())!=NULL) {
458 <              //fasta can have multiple sequence lines
459 <              if (l[0]=='>') {
460 <                   fq.pushBack();
461 <                   break; //
423 >    int incounter=0; //counter for input reads
424 >    int outcounter=0; //counter for output reads
425 >    int trash_s=0; //too short from the get go
426 >    int trash_Q=0;
427 >    int trash_N=0;
428 >    int trash_D=0;
429 >    int trash_A3=0;
430 >    int trash_A5=0;
431 >    s=infile;
432 >    GStr infname;
433 >    GStr infname2;
434 >    FILE* f_in=NULL;
435 >    FILE* f_in2=NULL;
436 >    FILE* f_out=NULL;
437 >    FILE* f_out2=NULL;
438 >    bool paired_reads=false;
439 >    setupFiles(f_in, f_in2, f_out, f_out2, s, infname, infname2);
440 >    GLineReader fq(f_in);
441 >    GLineReader* fq2=NULL;
442 >    if (f_in2!=NULL) {
443 >       fq2=new GLineReader(f_in2);
444 >       paired_reads=true;
445 >       }
446 >    GStr rseq, rqv, seqid, seqinfo;
447 >    GStr rseq2, rqv2, seqid2, seqinfo2;
448 >    while (getFastxRec(fq, rseq, rqv, seqid, seqinfo, infname)) {
449 >       incounter++;
450 >       int a5=0, a3=0, b5=0, b3=0;
451 >       char tcode=0, tcode2=0;
452 >       tcode=process_read(seqid, rseq, rqv, a5, a3);
453 >       //if (!doCollapse) {
454 >         if (fq2!=NULL) {
455 >            getFastxRec(*fq2, rseq2, rqv2, seqid2, seqinfo2, infname2);
456 >            if (seqid.substr(0,seqid.length()-1)!=seqid2.substr(0,seqid2.length()-1)) {
457 >               GError("Error: no paired match for read %s vs %s (%s,%s)\n",
458 >                  seqid.chars(), seqid2.chars(), infname.chars(), infname2.chars());
459 >               }
460 >            tcode2=process_read(seqid2, rseq2, rqv2, b5, b3);
461 >            //decide what to do with this pair and print rseq2 if the pair makes it
462 >            if (tcode>1 && tcode2<=1) {
463 >               //"untrash" rseq
464 >               if (a3-a5+1<min_read_len) {
465 >                   a5=1;
466 >                   if (a3<min_read_len) { a3= GMIN(rseq.length()-1, min_read_len+1); }
467                     }
468 <              rseq+=l;
469 <              }
470 <            } //multi-line fasta file
471 <          if (!isfasta) {
472 <            if ((l=fq.getLine())==NULL)
473 <                GError("Error: unexpected EOF after sequence for %s\n", rname.chars());
474 <            if (l[0]!='+') GError("Error: fastq qv header marker not detected!\n");
475 <            if ((l=fq.getLine())==NULL)
476 <                GError("Error: unexpected EOF after qv header for %s\n", rname.chars());
477 <            rqv=l;
478 <            if (rqv.length()!=rseq.length())
479 <                GError("Error: qv len != seq len for %s\n", rname.chars());
480 <            }
481 <        } //<-- FASTA or FASTQ
217 <        rseq.upper();
218 <        int l5=0;
219 <        int l3=rseq.length()-1;
220 <        if (l3-l5+1<min_read_len) {
221 <           if (trashReport) {
222 <                  fprintf(freport, "%s\ts\t%s\n",rname.chars(), rseq.chars());
223 <                  }
224 <           continue;
225 <           }
226 <        if (ntrim(rseq, rqv, l5, l3)) {
227 <           if (l3-l5+1<min_read_len) {
228 <             if (trashReport) {
229 <                    fprintf(freport, "%s\tN\t%s\n", rname.chars(), rseq.chars());
230 <                    }
231 <             continue; //invalid read
232 <             }
233 <            //-- keep only the l5..l3 range
234 <           rseq=rseq.substr(l5, l3-l5+1);
235 <           if (!rqv.is_empty())
236 <              rqv=rqv.substr(l5, l3-l5+1);
237 <           }
238 <          
239 <        if (a3len>0) {
240 <          if (trim_adapter3(rseq, l5, l3)) {
241 <             if (l3-l5+1<min_read_len) {
242 <                 if (trashReport) {
243 <                     fprintf(freport, "%s\tA3\t%s\n",rname.chars(), rseq.chars());
244 <                     }
245 <                 continue;
246 <                 }
247 <              //-- keep only the l5..l3 range
248 <              rseq=rseq.substr(l5, l3-l5+1);
249 <              if (!rqv.is_empty())
250 <                 rqv=rqv.substr(l5, l3-l5+1);
251 <              }//some adapter was trimmed
252 <           } //adapter trimming
253 <        if (a5len>0) {
254 <          if (trim_adapter5(rseq, l5, l3)) {
255 <             if (l3-l5+1<min_read_len) {
256 <                 if (trashReport) {
257 <                     fprintf(freport, "%s\tA5\t%s\n",rname.chars(), rseq.chars());
258 <                     }
259 <                 continue;
468 >               tcode=1;
469 >               }
470 >             else if (tcode<=1 && tcode2>1) {
471 >               //"untrash" rseq2
472 >               if (b3-b5+1<min_read_len) {
473 >                   b5=1;
474 >                   if (b3<min_read_len) { b3= GMIN((rseq2.length()-1),(min_read_len+1)); }
475 >                   }
476 >               tcode2=1;
477 >               }
478 >            if (tcode<=1) { //trimmed or left intact -- write it!
479 >               if (tcode>0) {
480 >                 rseq2=rseq2.substr(b5,b3-b5+1);
481 >                 if (!rqv2.is_empty()) rqv2=rqv2.substr(b5,b3-b5+1);
482                   }
483 <              //-- keep only the l5..l3 range
484 <              rseq=rseq.substr(l5, l3-l5+1);
485 <              if (!rqv.is_empty())
486 <                 rqv=rqv.substr(l5, l3-l5+1);
487 <              }//some adapter was trimmed
488 <           } //adapter trimming
489 <        if (doCollapse) {
490 <           //keep read for later
491 <           FqDupRec* dr=dhash.Find(rseq.chars());
492 <           if (dr==NULL) { //new entry
493 <                  //if (prefix.is_empty())
494 <                     dhash.Add(rseq.chars(),
495 <                          new FqDupRec(&rqv, rname.chars()));
496 <                  //else dhash.Add(rseq.chars(), new FqDupRec(rqv.chars(),rqv.length()));
483 >               int nocounter=0;
484 >               writeRead(f_out2, seqid2, seqinfo2, rseq2, rqv2, nocounter);
485 >               }
486 >            } //paired read
487 >       // }
488 >       if (tcode>1) { //trashed
489 >          if (tcode=='s') trash_s++;
490 >            else if (tcode=='Q') trash_Q++;
491 >              else if (tcode=='N') trash_N++;
492 >               else if (tcode=='D') trash_D++;
493 >                else if (tcode=='3') trash_A3++;
494 >                 else if (tcode=='5') trash_A5++;
495 >          if (trashReport) trash_report(tcode, seqid, freport);
496 >          }
497 >         else if (!doCollapse) { //write it
498 >          if (tcode>0) {
499 >            rseq=rseq.substr(a5,a3-a5+1);
500 >            if (!rqv.is_empty()) rqv=rqv.substr(a5,a3-a5+1);
501 >            }
502 >          writeRead(f_out, seqid, seqinfo, rseq, rqv, outcounter);
503 >          }
504 >       } //for each fastq record
505 >    delete fq2;
506 >    FRCLOSE(f_in);
507 >    FRCLOSE(f_in2);
508 >    if (doCollapse) {
509 >       outcounter=0;
510 >       int maxdup_count=1;
511 >       char* maxdup_seq=NULL;
512 >       dhash.startIterate();
513 >       FqDupRec* qd=NULL;
514 >       char* seq=NULL;
515 >       while ((qd=dhash.NextData(seq))!=NULL) {
516 >         GStr rseq(seq);
517 >         //do the dusting here
518 >         if (doDust) {
519 >            int dustbases=dust(rseq);
520 >            if (dustbases>(rseq.length()>>1)) {
521 >               if (trashReport && qd->firstname!=NULL) {
522 >                 fprintf(freport, "%s_x%d\tD\n",qd->firstname, qd->count);
523                   }
524 <              else    
525 <                 dr->add(rqv);
526 <           } //collapsing duplicates
527 <         else { //not collapsing duplicates
528 <           //do the dust filter now
529 <           if (doDust) {
530 <             int dustbases=dust(rseq);
531 <             if (dustbases>(rseq.length()>>1)) {
532 <                if (trashReport) {
533 <                  fprintf(freport, "%s\tD\t%s\n",rname.chars(),rseq.chars());
534 <                  }
535 <                continue;
536 <                }
289 <             }
290 <           //print this record here  
291 <           outcounter++;
292 <           if (isfasta) {
293 <            if (prefix.is_empty())
294 <               fprintf(f_out, ">%s\n%s\n", rname.chars(), rseq.chars());
295 <              else
296 <               fprintf(f_out, ">%s%08d\n%s\n", prefix.chars(), outcounter,
297 <                                  rseq.chars());
524 >               trash_D+=qd->count;
525 >               continue;
526 >               }
527 >            }
528 >         outcounter++;
529 >         if (qd->count>maxdup_count) {
530 >            maxdup_count=qd->count;
531 >            maxdup_seq=seq;
532 >            }
533 >         if (isfasta) {
534 >           if (prefix.is_empty()) {
535 >             fprintf(f_out, ">%s_x%d\n%s\n", qd->firstname, qd->count,
536 >                           rseq.chars());
537               }
538 <           else {  //fastq
539 <            if (phred64) convertQ64(rqv);
540 <            if (prefix.is_empty())
302 <               fprintf(f_out, "@%s\n%s\n+\n%s\n", rname.chars(), rseq.chars(),rqv.chars());
303 <              else
304 <               fprintf(f_out, "@%s_%08d\n%s\n+\n%s\n", prefix.chars(), outcounter,
305 <                                  rseq.chars(),rqv.chars() );
538 >           else { //use custom read name
539 >             fprintf(f_out, ">%s%08d_x%d\n%s\n", prefix.chars(), outcounter,
540 >                        qd->count, rseq.chars());
541               }
542 <           } //not collapsing duplicates
543 <        } //for each fastq record
544 <   } //while each input file
545 < FRCLOSE(f_in);
546 < if (doCollapse) {
547 <    outcounter=0;
313 <    int maxdup_count=1;
314 <    char* maxdup_seq=NULL;
315 <    dhash.startIterate();
316 <    FqDupRec* qd=NULL;
317 <    char* seq=NULL;
318 <    while ((qd=dhash.NextData(seq))!=NULL) {
319 <      GStr rseq(seq);
320 <      //do the dusting here
321 <      if (doDust) {
322 <         int dustbases=dust(rseq);
323 <         if (dustbases>(rseq.length()>>1)) {
324 <            if (trashReport && qd->firstname!=NULL) {
325 <              fprintf(freport, "%s_x%d\tD\t%s\n",qd->firstname, qd->count,seq);
326 <              }
327 <            continue;
542 >           }
543 >         else { //fastq format
544 >          if (convert_phred) convertPhred(qd->qv, qd->len);
545 >          if (prefix.is_empty()) {
546 >            fprintf(f_out, "@%s_x%d\n%s\n+\n%s\n", qd->firstname, qd->count,
547 >                           rseq.chars(), qd->qv);
548              }
549 +          else { //use custom read name
550 +            fprintf(f_out, "@%s%08d_x%d\n%s\n+\n%s\n", prefix.chars(), outcounter,
551 +                        qd->count, rseq.chars(), qd->qv);
552 +            }
553 +           }
554 +         }//for each element of dhash
555 +       if (maxdup_count>1) {
556 +         GMessage("Maximum read multiplicity: x %d (read: %s)\n",maxdup_count, maxdup_seq);
557           }
558 <      outcounter++;
559 <      if (qd->count>maxdup_count) {
560 <         maxdup_count=qd->count;
561 <         maxdup_seq=seq;
562 <         }
563 <      if (isfasta) {
564 <        if (prefix.is_empty()) {
565 <          fprintf(f_out, ">%s_x%d\n%s\n", qd->firstname, qd->count,
566 <                        rseq.chars());
567 <          }
568 <        else { //use custom read name
569 <          fprintf(f_out, ">%s%08d_x%d\n%s\n", prefix.chars(), outcounter,
570 <                     qd->count, rseq.chars());
558 >       } //collapse entries
559 >    if (verbose) {
560 >       if (paired_reads) {
561 >           GMessage(">Input files : %s , %s\n", infname.chars(), infname2.chars());
562 >           GMessage("Number of input pairs :%9d\n", incounter);
563 >           GMessage("         Output pairs :%9d\n", outcounter);
564 >           }
565 >         else {
566 >           GMessage(">Input file : %s\n", infname.chars());
567 >           GMessage("Number of input reads :%9d\n", incounter);
568 >           GMessage("         Output reads :%9d\n", outcounter);
569 >           }
570 >       GMessage("------------------------------------\n");
571 >       if (num_trimmed5)
572 >          GMessage("           5' trimmed :%9d  (min. trim: %d)\n", num_trimmed5, min_trimmed5);
573 >       if (num_trimmed3)
574 >          GMessage("           3' trimmed :%9d  (min. trim: %d)\n", num_trimmed3, min_trimmed3);
575 >       GMessage("------------------------------------\n");
576 >       if (trash_s>0)
577 >         GMessage("     Trashed by length:%9d\n", trash_s);
578 >       if (trash_N>0)
579 >         GMessage("         Trashed by N%%:%9d\n", trash_N);
580 >       if (trash_Q>0)
581 >         GMessage("Trashed by low quality:%9d\n", trash_Q);
582 >       if (trash_A5>0)
583 >         GMessage(" Trashed by 5' adapter:%9d\n", trash_A5);
584 >       if (trash_A3>0)
585 >         GMessage(" Trashed by 3' adapter:%9d\n", trash_A3);
586 >       }
587 >    if (trashReport) {
588 >          FWCLOSE(freport);
589            }
590 <        }
591 <      else { //fastq format
592 <       if (phred64) convertQ64(qd->qv, qd->len);
347 <       if (prefix.is_empty()) {
348 <         fprintf(f_out, "@%s_x%d\n%s\n+\n%s\n", qd->firstname, qd->count,
349 <                        rseq.chars(), qd->qv);
350 <         }
351 <       else { //use custom read name
352 <         fprintf(f_out, "@%s%08d_x%d\n%s\n+\n%s\n", prefix.chars(), outcounter,
353 <                     qd->count, rseq.chars(), qd->qv);
354 <         }
355 <        }
356 <      }//for each element of dhash
357 <    if (maxdup_count>1) {
358 <      GMessage("Maximum read multiplicity: x %d (read: %s)\n",maxdup_count, maxdup_seq);
359 <      }
360 <   } //report collapsed dhash entries
361 < GMessage("Number of input reads: %9d\n", icounter);
362 < GMessage("       Output records: %9d\n", outcounter);
363 < if (trashReport) {
364 <    FWCLOSE(freport);
365 <    }
590 >    FWCLOSE(f_out);
591 >    FWCLOSE(f_out2);
592 >   } //while each input file
593  
594 < FWCLOSE(f_out);
594 > //getc(stdin);
595   }
596  
597   class NData {
# Line 393 | Line 620
620       end5=1;
621       end3=seqlen;
622       for (int i=0;i<seqlen;i++)
623 <        if (rseq[i]=='N') {// if (!ichrInStr(rseq[i], "ACGT")
623 >        if (seq[i]=='N') {// if (!ichrInStr(rseq[i], "ACGT")
624             NPos[NCount]=i;
625             NCount++;
626             }
# Line 416 | Line 643
643   void N_analyze(int l5, int l3, int p5, int p3) {
644   /* assumes feat was filled properly */
645   int old_dif, t5,t3,v;
646 < if (l3-l5<min_read_len || p5>p3 ) {
646 > if (l3<l5+2 || p5>p3 ) {
647     feat.end5=l5+1;
648     feat.end3=l3+1;
649     return;
# Line 448 | Line 675
675   }
676  
677  
678 < bool ntrim(GStr& rseq, GStr &rqv, int &l5, int &l3) {
679 < //count Ns in the sequence
678 > bool qtrim(GStr& qvs, int &l5, int &l3) {
679 > if (qvtrim_qmin==0 || qvs.is_empty()) return false;
680 > if (qv_phredtype==0) {
681 >  //try to guess the Phred type
682 >  int vmin=256, vmax=0;
683 >  for (int i=0;i<qvs.length();i++) {
684 >     if (vmin>qvs[i]) vmin=qvs[i];
685 >     if (vmax<qvs[i]) vmax=qvs[i];
686 >     }
687 >  if (vmin<64) { qv_phredtype=33; qv_cvtadd=31; }
688 >  if (vmax>95) { qv_phredtype=64; qv_cvtadd=-31; }
689 >  if (qv_phredtype==0) {
690 >    GError("Error: couldn't determine Phred type, please use the -p33 or -p64 !\n");
691 >    }
692 >  if (verbose)
693 >    GMessage("Input reads have Phred-%d quality values.\n", (qv_phredtype==33 ? 33 : 64));
694 >  } //guessing Phred type
695 > for (l3=qvs.length()-1;l3>2;l3--) {
696 >  if (qvs[l3]-qv_phredtype>=qvtrim_qmin && qvs[l3-1]-qv_phredtype>=qvtrim_qmin) break;
697 >  }
698 > //just in case, check also the 5' the end (?)
699 > for (l5=0;l5<qvs.length()-3;l5++) {
700 >  if (qvs[l5]-qv_phredtype>=qvtrim_qmin && qvs[l5+1]-qv_phredtype>=qvtrim_qmin) break;
701 >  }
702 > if (qvtrim_max>0) {
703 >  if (qvs.length()-1-l3>qvtrim_max) l3=qvs.length()-1-qvtrim_max;
704 >  if (l5>qvtrim_max) l5=qvtrim_max;
705 >  }
706 > return (l5>0 || l3<qvs.length()-1);
707 > }
708 >
709 > bool ntrim(GStr& rseq, int &l5, int &l3) {
710 > //count Ns in the sequence, trim N-rich ends
711   feat.init(rseq);
712   l5=feat.end5-1;
713   l3=feat.end3-1;
714   N_analyze(feat.end5-1, feat.end3-1, 0, feat.NCount-1);
715 < if (l5==feat.end5-1 && l3==feat.end3-1)
716 <    return false; //nothing changed
715 > if (l5==feat.end5-1 && l3==feat.end3-1) {
716 >    if (feat.perc_N>max_perc_N) {
717 >           feat.valid=false;
718 >           l3=l5+1;
719 >           return true;
720 >           }
721 >      else {
722 >       return false; //nothing changed
723 >       }
724 >    }
725   l5=feat.end5-1;
726   l3=feat.end3-1;
727   if (l3-l5+1<min_read_len) {
# Line 463 | Line 729
729     return true;
730     }
731   feat.N_calc();
732 < if (feat.perc_N>6.2) { //not valid if more than 1 N per 16 bases
732 >
733 > if (feat.perc_N>max_perc_N) {
734        feat.valid=false;
735        l3=l5+1;
736        return true;
# Line 589 | Line 856
856   return ncount;
857   }
858  
859 < // ------------------ adapter matching - triplet matching
860 < //look for matching triplets amongst the first 9 nucleotides of the 3' adaptor
861 < // or the last 9 nucleotides for the 5' adapter
862 < //when a triplet match is found, simply try to extend the alignment using a drop-off scheme
863 < // check minimum score and
597 < // for 3' adapter trimming:
859 >
860 > // ------------------ adapter matching - simple k-mer seed & extend, no indels for now
861 > //when a k-mer match is found, simply try to extend the alignment using a drop-off scheme
862 > //check minimum score and
863 > //for 3' adapter trimming:
864   //     require that the right end of the alignment for either the adaptor OR the read must be
865   //     < 3 distance from its right end
866   // for 5' adapter trimming:
867   //     require that the left end of the alignment for either the adaptor OR the read must
868 < //     be at coordinate 0
868 > //     be at coordinate < 3 from start
869  
870   bool extendMatch(const char* a, int alen, int ai,
871 <                 const char* b, int blen, int bi, int mlen, int& l5, int& l3, bool end5=false) {
871 >                 const char* b, int blen, int bi, int mlen, int& l5, int& l3, CSegChain& segs, bool end5=false) {
872   //so the alignment starts at ai in a, bi in b, with a perfect match of length mlen
873 < //if (debug)
874 < //  GStr dbg(b);
875 < //GMessage(">> in %s\n\textending hit: %s at position %d\n", a, (dbg.substr(ai, mlen)).chars(), ai);
873 > #ifdef DEBUG
874 > GStr dbg(b);
875 > #endif
876 > //if (debug) {
877 > //  GMessage(">> in %s\n\textending hit: %s at position %d\n", a, (dbg.substr(bi, mlen)).chars(), ai);
878 > //  }
879   int a_l=ai; //alignment coordinates on a
880   int a_r=ai+mlen-1;
881   int b_l=bi; //alignment coordinates on b
# Line 670 | Line 939
939    int mmovh3=(a_ovh3<b_ovh3)? a_ovh3 : b_ovh3;
940    int mmovh5=(a_l<b_l)? a_l : b_l;
941    //if (debug) GMessage("  after r-extend: %*s%s\t\t(score=%d)\n",a_l," ",dbg.substr(b_l,b_r-b_l+1).chars(),maxscore);
942 + #ifdef DEBUG
943 +  if (debug) GMessage("     extended to: %*s\n",a_r+1,dbg.substr(b_l,b_r-b_l+1).chars());
944 + #endif
945    if (maxscore>=a_min_score && mmovh3<2 && mmovh5<2) {
946       if (a_l<a_ovh3) {
947          //adapter closer to the left end (typical for 5' adapter)
# Line 683 | Line 955
955          }
956       return true;
957       }
958 + else { //keep this segment pair for later (gapped alignment)
959 +   segs.addSegPair(new CSegPair(a_l+1, a_r+1, b_l+1, b_r+1, maxscore));
960 +   //this will also update min & max coordinates in segs (segs.astart, .aend, .bstart, .bend)
961 +   }
962 +  //do not trim:
963 +  l5=0;
964 +  l3=alen-1;
965    return false;
966   }
967  
968 + /*
969 + int getWordValue(const char* s, int wlen) {
970 + int r=0;
971 + while (wlen--) { r+=(((int)s[wlen])<<wlen) }
972 + return r;
973 + }
974 + */
975 + int get3mer_value(const char* s) {
976 + return (s[0]<<16)+(s[1]<<8)+s[2];
977 + }
978 +
979 + int w3_match(int qv, const char* str, int slen, int start_index=0) {
980 + if (start_index>=slen || start_index<0) return -1;
981 + for (int i=start_index;i<slen-3;i++) {
982 +   int rv=get3mer_value(str+i);
983 +   if (rv==qv) return i;
984 +   }
985 + return -1;
986 + }
987 +
988 + int w3_rmatch(int qv, const char* str, int slen, int end_index=-1) {
989 + if (end_index>=slen) return -1;
990 + if (end_index<0) end_index=slen-1;
991 + for (int i=end_index-2;i>=0;i--) {
992 +   int rv=get3mer_value(str+i);
993 +   if (rv==qv) return i;
994 +   }
995 + return -1;
996 + }
997 +
998 + int fast4match(int32 qv, const char* str, int slen, int start_index=0) {
999 + if (start_index>=slen || start_index<0) return -1;
1000 + for (int i=start_index;i<slen-4;i++) {
1001 +   int32* rv=(int32*)(str+i);
1002 +   if (*rv==qv) return i;
1003 +   }
1004 + return -1;
1005 + }
1006 +
1007 + int fast4rmatch(int32 qv, const char* str, int slen, int end_index=-1) {
1008 + if (end_index>=slen) return -1;
1009 + if (end_index<0) end_index=slen-1;
1010 + for (int i=end_index-3;i>=0;i--) {
1011 +   int32* rv=(int32*)(str+i);
1012 +   if (*rv==qv) return i;
1013 +   }
1014 + return -1;
1015 + }
1016 +
1017 + #ifdef DEBUG
1018 + void dbgPrintChain(CSegChain& chain, const char* aseq) {
1019 +  GStr s(aseq);
1020 +  for (int i=0;i<chain.Count();i++) {
1021 +   CSegPair& seg=*chain[i];
1022 +   GMessage("  dbg chain seg%d: %*s [%d-%d:%d-%d]\n",i,seg.a.start-1+seg.len(),
1023 +            s.substr(seg.b.start-1, seg.len()).chars(), seg.b.start,seg.b.end,seg.a.start,seg.a.end);
1024 +   }
1025 + }
1026 + #endif
1027 +
1028   bool trim_adapter3(GStr& seq, int&l5, int &l3) {
1029   int rlen=seq.length();
1030   l5=0;
# Line 699 | Line 1038
1038        }
1039      else {
1040        l5=0;
1041 <      l3=fi-1;
1041 >      l3=fi-1;
1042        }
1043     return true;
1044     }
1045 + #ifdef DEBUG
1046 + if (debug) GMessage(">TRIM3 >>   Read: %s\n",seq.chars());
1047 + #endif
1048 +
1049   //also, for fast detection of other adapter-only reads that start past
1050   // the beginning of the adapter sequence, try to see if the first a3len-4
1051 < // characters of the read are a substring of the adapter
1052 < GStr rstart=seq.substr(0,a3len-4);
1053 < if ((fi=adapter3.index(rstart))>=0) {
1054 <   l3=rlen-1;
1055 <   l5=a3len-4;
1056 <   while (fi+l5<a3len && l5<l3 && adapter3[fi+l5]==seq[l5]) l5++;
1057 <   return true;
1051 > // bases of the read are a substring of the adapter
1052 > if (rlen>a3len-3) {
1053 >   GStr rstart=seq.substr(1,a3len-4);
1054 >   if ((fi=adapter3.index(rstart))>=0) {
1055 >     l3=rlen-1;
1056 >     l5=a3len-4;
1057 >     while (fi+l5<a3len && l5<l3 && adapter3[fi+l5]==seq[l5]) l5++;
1058 >     return true;
1059 >     }
1060 >  }
1061 > CSegChain a3segs; //no chains here, just an ordered collection of segment pairs
1062 >  //check the easy cases - 11 bases exact match at the end
1063 > int fdlen=11;
1064 >  if (a3len<16) {
1065 >   fdlen=a3len>>1;
1066     }
1067 <  
1068 < //another easy case: first half of the adaptor matches
1069 < int a3hlen=a3len>>1;
1070 < GStr ahalf=adapter3.substr(0, a3hlen);
1071 < if ((fi=seq.index(ahalf))>=0) {
1072 <   extendMatch(seq.chars(), rlen, fi,
1073 <                 adapter3.chars(), a3len, 0,  a3hlen, l5,l3);
1074 <   return true;
1067 > if (fdlen>4) {
1068 >     //check if we're lucky enough to have the last 11 bases of the read a part of the adapter
1069 >     GStr rstart=seq.substr(-fdlen-3,fdlen);
1070 >     if ((fi=adapter3.index(rstart))>=0) {
1071 > #ifdef DEBUG
1072 >       if (debug) GMessage("  W11match found: %*s\n", rlen-3, (adapter3.substr(fi,fdlen)).chars());
1073 > #endif
1074 >       if (extendMatch(seq.chars(), rlen, rlen-fdlen-3,
1075 >                     adapter3.chars(), a3len, fi,  fdlen, l5,l3, a3segs))
1076 >            return true;
1077 >       }
1078 >     //another easy case: first 11 characters of the adaptor found as a substring of the read
1079 >     GStr bstr=adapter3.substr(0, fdlen);
1080 >     if ((fi=seq.rindex(bstr))>=0) {
1081 > #ifdef DEBUG
1082 >       if (debug) GMessage("  A11match found: %*s\n", fi+fdlen, bstr.chars());
1083 > #endif
1084 >       if (extendMatch(seq.chars(), rlen, fi,
1085 >                     adapter3.chars(), a3len, 0,  fdlen, l5,l3, a3segs))
1086 >            return true;
1087 >       }
1088 >     } //tried to match 11 bases first
1089 >    
1090 > //no easy cases, so let's do the wmer hashing for the first 12 bases of the adaptor
1091 > //-- only extend if the match is in the 3' (ending) region of the read
1092 > int wordSize=3;
1093 > int hlen=12;
1094 > if (hlen>a3len-wordSize) hlen=a3len-wordSize;
1095 > int imin=rlen>>1; //last half of the read, left boundary for the wmer match
1096 > if (imin<a3len) { imin=GMIN(a3len, rlen-wordSize); }
1097 > imin=rlen-imin;
1098 > for (int iw=0;iw<hlen;iw++) {
1099 >   //int32* qv=(int32*)(adapter3.chars()+iw);
1100 >   int qv=get3mer_value(adapter3.chars()+iw);
1101 >   fi=-1;
1102 >   //while ((fi=fast4rmatch(*qv, seq.chars(), rlen, fi))>=0 && fi>=imin) {
1103 >   while ((fi=w3_rmatch(qv, seq.chars(), rlen, fi))>=0 && fi>=imin) {
1104 >     //GMessage(" ... fi=%d after w3_rmatch() (imin=%d)\n", fi, imin);
1105 >
1106 > #ifdef DEBUG
1107 >     if (debug) GMessage("    Wmatch found: %*s\n", fi+wordSize, (adapter3.substr(iw,wordSize)).chars());
1108 > #endif
1109 >     if (extendMatch(seq.chars(), rlen, fi, adapter3.chars(),
1110 >                   a3len, iw, wordSize, l5,l3, a3segs)) return true;
1111 >     fi--;
1112 >     if (fi<imin) break;
1113 >     }
1114 >   } //for each wmer in the first hlen bases of the adaptor
1115 > /*
1116 > //couldn't find a good trimming extension, hash 12 more bases of the adapter to collect more segment pairs there
1117 > //but only do this if we already have segment pairs collected in the last 12 bases of the adapter
1118 > if (a3segs.bstart>3 || a3segs.bend<(uint)(hlen-wordSize)) return false;
1119 > int hlen2=a3len-wordSize;
1120 > //if (hlen2>a3len-4) hlen2=a3len-4;
1121 > if (hlen2>hlen) {
1122 > #ifdef DEBUG
1123 >     if (debug && a3segs.Count()>0) {
1124 >        GMessage("  >>>>>2nd. hash: %s\n",seq.chars());
1125 >        }
1126 > #endif
1127 >     for (int iw=hlen;iw<hlen2;iw++) {
1128 >         //int* qv=(int32 *)(adapter3.chars()+iw);
1129 >         int qv=get3mer_value(adapter3.chars()+iw);
1130 >         fi=-1;
1131 >         //while ((fi=fast4rmatch(*qv, seq.chars(), rlen, fi))>=0 && fi>=imin) {
1132 >         while ((fi=w3_rmatch(qv, seq.chars(), rlen, fi))>=0 && fi>=imin) {
1133 >           extendMatch(seq.chars(), rlen, fi, adapter3.chars(),
1134 >                         a3len, iw, wordSize, l5,l3, a3segs);
1135 >           fi--;
1136 >           if (fi<imin) break;
1137 >           }
1138 >         } //for each wmer between hlen2 and hlen bases of the adaptor
1139 >     }
1140 > //lastly, analyze collected a3segs for a possible gapped alignment:
1141 > GList<CSegChain> segchains(false,true,false);
1142 > #ifdef DEBUG
1143 > if (debug && a3segs.Count()>0) {
1144 >   GMessage(">>>>>>>>>   Read: %s\n",seq.chars());
1145     }
1146 < //no easy cases, so let's do the word hashing
1147 < for (int iw=0;iw<6;iw++) {
1148 <   GStr aword=adapter3.substr(iw,3);
1149 <   if ((fi=seq.index(aword))>=0  && rlen-fi>3) {
1150 <      if (extendMatch(seq.chars(), rlen, fi, adapter3.chars(),
1151 <                   a3len, iw, 3, l5,l3)) return true;
1146 > #endif
1147 > for (int i=0;i<a3segs.Count();i++) {
1148 >   if (a3segs[i]->chain==NULL) {
1149 >       if (a3segs[i]->b.start>3) continue; //don't start a hopeless chain
1150 >       CSegChain* newchain=new CSegChain();
1151 >       newchain->setFreeItem(false);
1152 >       newchain->addSegPair(a3segs[i]);
1153 >       a3segs[i]->chain=newchain;
1154 >       segchains.Add(newchain); //just to free them when done
1155 >       }
1156 >   for (int j=i+1;j<a3segs.Count();j++) {
1157 >      CSegChain* chain=a3segs[i]->chain;
1158 >      if (chain->extendChain(a3segs[j])) {
1159 >          a3segs[j]->chain=chain;
1160 > #ifdef DEBUG
1161 >          if (debug) dbgPrintChain(*chain, adapter3.chars());
1162 > #endif      
1163 >          //save time by checking here if the extended chain is already acceptable for trimming
1164 >          if (chain->aend>(uint)(rlen-4) && chain->bstart<4 && chain->score>a_min_chain_score) {
1165 >            l5=0;
1166 >            l3=chain->astart-2;
1167 > #ifdef DEBUG
1168 >          if (debug && a3segs.Count()>0) {
1169 >            GMessage(">>> >> trimmed-3: %*s\n",l3-l5+1,seq.substr(l5,l3-l5+1).chars());
1170 >            }
1171 > #endif
1172 >            return true;
1173 >            }
1174 >          } //chain can be extended
1175        }
1176 <   }
1176 >   } //collect segment alignments into chains
1177 > */  
1178   return false; //no adapter parts found
1179   }
1180  
# Line 751 | Line 1196
1196        }
1197     return true;
1198     }
1199 < //for fast detection of adapter-rich reads, check if the first 12
1200 < //characters of the read are a substring of the adapter
1201 < GStr rstart=seq.substr(1,12);
1202 < if ((fi=adapter5.index(rstart))>=0) {
1203 <   //l3=rlen-1;
1204 <   //l5=a5len-4;
1205 <   //while (fi+l5<a5len && l5<l3 && adapter5[fi+l5]==seq[l5]) l5++;
1206 <   //return true;
1207 <   //if (debug) GMessage(" first 12char of the read match adaptor!\n");
1208 <   extendMatch(seq.chars(), rlen, 1,
764 <                 adapter5.chars(), a5len, fi,  12, l5,l3, true);
765 <   return true;
1199 > #ifdef DEBUG
1200 > if (debug) GMessage(">TRIM5 >>   Read: %s\n",seq.chars());
1201 > #endif
1202 >
1203 > CSegChain a5segs(true); //list of segment pairs to analyze later if no extendMatch succeeded
1204 >
1205 > //try the easy way out first - look for an exact match of 11 bases
1206 > int fdlen=11;
1207 >  if (a5len<16) {
1208 >   fdlen=a5len>>1;
1209     }
1210 <  
1211 < //another easy case: last 12 characters of the adaptor found as a substring of the read
1212 < int aplen=12;
1213 < int apstart=a5len-aplen-2;
1214 < if (apstart<0) { apstart=0; aplen=a5len-1; }
1215 < GStr bstr=adapter5.substr(apstart, aplen);
1216 < if ((fi=seq.index(bstr))>=0) {
1217 <   //if (debug) GMessage(" last 12char of adaptor match the read!\n");
1218 <   extendMatch(seq.chars(), rlen, fi,
1219 <                 adapter5.chars(), a5len, apstart,  aplen, l5,l3,true);
1220 <   return true;
1210 > if (fdlen>4) {
1211 >     GStr rstart=seq.substr(1,fdlen); //skip the first base as it's sometimes bogus
1212 >     if ((fi=adapter5.index(rstart))>=0) {
1213 > #ifdef DEBUG
1214 >       if (debug) GMessage("  W11match found: %*s\n", 1+fdlen, (adapter3.substr(fi,fdlen)).chars());
1215 > #endif
1216 >       if (extendMatch(seq.chars(), rlen, 1,
1217 >                     adapter5.chars(), a5len, fi,  fdlen, l5,l3, a5segs, true))
1218 >           return true;
1219 >       }
1220 >     //another easy case: last 11 characters of the adaptor found as a substring of the read
1221 >     GStr bstr=adapter5.substr(-fdlen);
1222 >     if ((fi=seq.index(bstr))>=0) {
1223 > #ifdef DEBUG
1224 >       if (debug) GMessage("  A11match found: %*s\n", fi+fdlen, bstr.chars());
1225 > #endif
1226 >       if (extendMatch(seq.chars(), rlen, fi,
1227 >                     adapter5.chars(), a5len, a5len-fdlen,  fdlen, l5,l3,a5segs,true))
1228 >          return true;
1229 >       }
1230 >     } //tried to matching at most 11 bases first
1231 >    
1232 > //-- no easy cases, do the wmer hashing for the last 12 bases of the adaptor
1233 > //-- only extend a wmer if it matches in the 5' (beginning) region of the read
1234 > int wordSize=3;
1235 > int hlen=12;
1236 > if (hlen>a5len-wordSize) hlen=a5len-wordSize;
1237 > int imax=rlen>>1; //first half of the read, right boundary for the wmer match
1238 > if (imax<a5len) { imax=GMIN(a5len, rlen-wordSize); }
1239 > for (int iw=0;iw<=hlen;iw++) {
1240 >   int apstart=a5len-iw-wordSize;
1241 >   fi=0;
1242 >   //int* qv=(int32 *)(adapter5.chars()+apstart);
1243 >   int qv=get3mer_value(adapter5.chars()+apstart);
1244 >   //while ((fi=fast4match(*qv, seq.chars(), rlen, fi))>=0 && fi<=imax) {
1245 >   while ((fi=w3_match(qv, seq.chars(), rlen, fi))>=0 && fi<=imax) {
1246 > #ifdef DEBUG
1247 >     if (debug) GMessage("    Wmatch found: %*s\n", fi+wordSize, (adapter5.substr(apstart,wordSize)).chars());
1248 > #endif
1249 >     if (extendMatch(seq.chars(), rlen, fi, adapter5.chars(),
1250 >                a5len, apstart, wordSize, l5,l3, a5segs, true)) return true;
1251 >     fi++;
1252 >     if (fi>imax) break;
1253 >     }
1254 >   } //for each wmer in the last hlen bases of the adaptor
1255 > /*
1256 >
1257 > //couldn't find a good trimming extension, hash 12 more bases of the adapter to collect more segment pairs there
1258 > //but only do this if we already have segment pairs collected in the last 12 bases of the adapter
1259 > if (a5segs.bend<(uint)(a5len-3) || a5segs.bstart>(uint)(a5len-hlen+4)) return false;
1260 > int hlen2=a5len-wordSize;
1261 > //if (hlen2>a5len-wordSize) hlen2=a5len-wordSize;
1262 > #ifdef DEBUG
1263 >      if (debug && a5segs.Count()>0) {
1264 >        GMessage("  >>>>>2nd. hash: %s\n",seq.chars());
1265 >        }
1266 > #endif
1267 > if (hlen2>hlen) {
1268 >     for (int iw=hlen+1;iw<=hlen2;iw++) {
1269 >         int apstart=a5len-iw-wordSize;
1270 >         fi=0;
1271 >         //int* qv=(int32 *)(adapter5.chars()+apstart);
1272 >         int qv=get3mer_value(adapter5.chars()+apstart);
1273 >         //while ((fi=fast4match(*qv, seq.chars(), rlen, fi))>=0 && fi<=imax) {
1274 >         while ((fi=w3_match(qv, seq.chars(), rlen, fi))>=0 && fi<=imax) {
1275 >           extendMatch(seq.chars(), rlen, fi, adapter5.chars(),
1276 >                      a5len, apstart, wordSize, l5,l3, a5segs, true);
1277 >           fi++;
1278 >           if (fi>imax) break;
1279 >           }
1280 >         } //for each wmer between hlen2 and hlen bases of the adaptor
1281 >     }
1282 > if (a5segs.bend<(uint)(a5len-3) || a5segs.bstart>(uint)(a5len-hlen+4)) return false;
1283 > // lastly, analyze collected a5segs for a possible gapped alignment:
1284 > GList<CSegChain> segchains(false,true,false);
1285 > #ifdef DEBUG
1286 > if (debug && a5segs.Count()>0) {
1287 >   GMessage(">>>>>>>>>   Read: %s\n",seq.chars());
1288     }
1289 < //no easy cases, find a triplet match as a seed for alignment extension
1290 < //find triplets at the right end of the adapter
1291 < for (int iw=0;iw<6;iw++) {
1292 <   apstart=a5len-iw-4;
1293 <   GStr aword=adapter5.substr(apstart,3);
1294 <   if ((fi=seq.index(aword))>=0) {
1295 <      //if (debug) GMessage("extending wmatch :%*s\n", fi+3, aword.chars());
1296 <      if (extendMatch(seq.chars(), rlen, fi, adapter5.chars(),
1297 <                   a5len, apstart, 3, l5,l3,true)) {
1298 <                     return true;
1299 <                     }
1289 > #endif
1290 > for (int i=0;i<a5segs.Count();i++) {
1291 >   if (a5segs[i]->chain==NULL) {
1292 >       if (a5segs[i]->b.end<(int)(a5len-4)) continue; //don't start a hopeless chain
1293 >       CSegChain* newchain=new CSegChain(true);
1294 >       newchain->setFreeItem(false);
1295 >       newchain->addSegPair(a5segs[i]);
1296 >       a5segs[i]->chain=newchain;
1297 >       segchains.Add(newchain); //just to free them when done
1298 >       }
1299 >   for (int j=i+1;j<a5segs.Count();j++) {
1300 >      CSegChain* chain=a5segs[i]->chain;
1301 >      if (chain->extendChain(a5segs[j])) {
1302 >         a5segs[j]->chain=chain;
1303 > #ifdef DEBUG
1304 >         if (debug) dbgPrintChain(*chain, adapter5.chars());
1305 > #endif      
1306 >      //save time by checking here if the extended chain is already acceptable for trimming
1307 >         if (chain->bend>(uint)(a5len-3) && chain->astart<4 && chain->score>a_min_chain_score) {
1308 >            l5=chain->aend;
1309 >            l3=rlen-1;
1310 >            return true;
1311 >            }
1312 >         } //chain can be extended
1313        }
1314 <   }
1314 >   } //collect segment alignments into chains
1315 > */
1316   return false; //no adapter parts found
1317   }
1318  
1319 < //conversion of phred64 to phread33
1320 < void convertQ64(GStr& q) {
1321 < for (int i=0;i<q.length();i++) q[i]-=31;
1319 > //convert qvs to/from phred64 from/to phread33
1320 > void convertPhred(GStr& q) {
1321 > for (int i=0;i<q.length();i++) q[i]+=qv_cvtadd;
1322 > }
1323 >
1324 > void convertPhred(char* q, int len) {
1325 > for (int i=0;i<len;i++) q[i]+=qv_cvtadd;
1326 > }
1327 >
1328 > bool getFastxRec(GLineReader& fq, GStr& rseq, GStr& rqv,
1329 >          GStr& rname, GStr& rinfo, GStr& infname) {
1330 > rseq="";
1331 > rqv="";
1332 > rname="";
1333 > rinfo="";
1334 > if (fq.eof()) return false;
1335 > char* l=fq.getLine();
1336 > while (l!=NULL && (l[0]==0 || isspace(l[0]))) l=fq.getLine(); //ignore empty lines
1337 > if (l==NULL) return false;
1338 > /* if (rawFormat) {
1339 >      //TODO: implement raw qseq parsing here
1340 >      //if (raw type=N) then continue; //skip invalid/bad records
1341 >      } //raw qseq format
1342 > else { // FASTQ or FASTA */
1343 > isfasta=(l[0]=='>');
1344 > if (!isfasta && l[0]!='@') GError("Error: fasta/fastq record marker not found(%s)\n%s\n",
1345 >      infname.chars(), l);
1346 > GStr s(l);
1347 > rname=&(l[1]);
1348 > for (int i=0;i<rname.length();i++)
1349 >    if (rname[i]<=' ') {
1350 >       if (i<rname.length()-2) rinfo=rname.substr(i+1);
1351 >       rname.cut(i);
1352 >       break;
1353 >       }
1354 >  //now get the sequence
1355 > if ((l=fq.getLine())==NULL)
1356 >      GError("Error: unexpected EOF after header for read %s (%s)\n",
1357 >                   rname.chars(), infname.chars());
1358 > rseq=l; //this must be the DNA line
1359 > while ((l=fq.getLine())!=NULL) {
1360 >      //seq can span multiple lines
1361 >      if (l[0]=='>' || l[0]=='+') {
1362 >           fq.pushBack();
1363 >           break; //
1364 >           }
1365 >      rseq+=l;
1366 >      } //check for multi-line seq
1367 > if (!isfasta) { //reading fastq quality values, which can also be multi-line
1368 >    if ((l=fq.getLine())==NULL)
1369 >        GError("Error: unexpected EOF after sequence for %s\n", rname.chars());
1370 >    if (l[0]!='+') GError("Error: fastq qv header marker not detected!\n");
1371 >    if ((l=fq.getLine())==NULL)
1372 >        GError("Error: unexpected EOF after qv header for %s\n", rname.chars());
1373 >    rqv=l;
1374 >    //if (rqv.length()!=rseq.length())
1375 >    //  GError("Error: qv len != seq len for %s\n", rname.chars());
1376 >    while (rqv.length()<rseq.length() && ((l=fq.getLine())!=NULL)) {
1377 >      rqv+=l; //append to qv string
1378 >      }
1379 >    }// fastq
1380 > // } //<-- FASTA or FASTQ
1381 > rseq.upper(); //TODO: what if we care about masking?
1382 > return true;
1383 > }
1384 >
1385 > char process_read(GStr& rname, GStr& rseq, GStr& rqv, int &l5, int &l3) {
1386 > //returns 0 if the read was untouched, 1 if it was just trimmed
1387 > // and a trash code if it was trashed
1388 > l5=0;
1389 > l3=rseq.length()-1;
1390 > if (l3-l5+1<min_read_len) {
1391 >   return 's';
1392 >   }
1393 > GStr wseq(rseq.chars());
1394 > GStr wqv(rqv.chars());
1395 > int w5=l5;
1396 > int w3=l3;
1397 > //first do the q-based trimming
1398 > if (qvtrim_qmin!=0 && !wqv.is_empty() && qtrim(wqv, w5, w3)) { // qv-threshold trimming
1399 >   if (w3-w5+1<min_read_len) {
1400 >     return 'Q'; //invalid read
1401 >     }
1402 >    //-- keep only the w5..w3 range
1403 >   l5=w5;
1404 >   l3=w3;
1405 >   wseq=wseq.substr(w5, w3-w5+1);
1406 >   if (!wqv.is_empty())
1407 >      wqv=wqv.substr(w5, w3-w5+1);
1408 >   } //qv trimming
1409 > // N-trimming on the remaining read seq
1410 > if (ntrim(wseq, w5, w3)) {
1411 >   //GMessage("before: %s\n",wseq.chars());
1412 >   //GMessage("after : %s (%d)\n",wseq.substr(w5,w3-w5+1).chars(),w3-w5+1);
1413 >   l5+=w5;
1414 >   l3-=(wseq.length()-1-w3);
1415 >   if (w3-w5+1<min_read_len) {
1416 >     return 'N'; //to be trashed
1417 >     }
1418 >    //-- keep only the w5..w3 range
1419 >   wseq=wseq.substr(w5, w3-w5+1);
1420 >   if (!wqv.is_empty())
1421 >      wqv=wqv.substr(w5, w3-w5+1);
1422 >   w5=0;
1423 >   w3=wseq.length()-1;
1424 >   }
1425 > if (a3len>0) {
1426 >  if (trim_adapter3(wseq, w5, w3)) {
1427 >     int trimlen=wseq.length()-(w3-w5+1);
1428 >     num_trimmed3++;
1429 >     if (trimlen<min_trimmed3)
1430 >         min_trimmed3=trimlen;
1431 >     l5+=w5;
1432 >     l3-=(wseq.length()-1-w3);
1433 >     if (w3-w5+1<min_read_len) {
1434 >         return '3';
1435 >         }
1436 >      //-- keep only the w5..w3 range
1437 >      wseq=wseq.substr(w5, w3-w5+1);
1438 >      if (!wqv.is_empty())
1439 >         wqv=wqv.substr(w5, w3-w5+1);
1440 >      }//some adapter was trimmed
1441 >   } //adapter trimming
1442 > if (a5len>0) {
1443 >  if (trim_adapter5(wseq, w5, w3)) {
1444 >     int trimlen=wseq.length()-(w3-w5+1);
1445 >     num_trimmed5++;
1446 >     if (trimlen<min_trimmed5)
1447 >         min_trimmed5=trimlen;
1448 >     l5+=w5;
1449 >     l3-=(wseq.length()-1-w3);
1450 >     if (w3-w5+1<min_read_len) {
1451 >         return '5';
1452 >         }
1453 >      //-- keep only the w5..w3 range
1454 >      wseq=wseq.substr(w5, w3-w5+1);
1455 >      if (!wqv.is_empty())
1456 >         wqv=wqv.substr(w5, w3-w5+1);
1457 >      }//some adapter was trimmed
1458 >   } //adapter trimming
1459 > if (doCollapse) {
1460 >   //keep read for later
1461 >   FqDupRec* dr=dhash.Find(wseq.chars());
1462 >   if (dr==NULL) { //new entry
1463 >          //if (prefix.is_empty())
1464 >             dhash.Add(wseq.chars(),
1465 >                  new FqDupRec(&wqv, rname.chars()));
1466 >          //else dhash.Add(wseq.chars(), new FqDupRec(wqv.chars(),wqv.length()));
1467 >         }
1468 >      else
1469 >         dr->add(wqv);
1470 >   } //collapsing duplicates
1471 > else { //not collapsing duplicates
1472 >   //apply the dust filter now
1473 >   if (doDust) {
1474 >     int dustbases=dust(wseq);
1475 >     if (dustbases>(wseq.length()>>1)) {
1476 >        return 'D';
1477 >        }
1478 >     }
1479 >   } //not collapsing duplicates
1480 > return (l5>0 || l3<rseq.length()-1) ? 1 : 0;
1481 > }
1482 >
1483 > void printHeader(FILE* f_out, char recmarker, GStr& rname, GStr& rinfo) {
1484 > //GMessage("printing Header..%c%s\n",recmarker, rname.chars());
1485 > if (rinfo.is_empty()) fprintf(f_out, "%c%s\n",recmarker,rname.chars());
1486 >  else fprintf(f_out, "%c%s %s\n",recmarker, rname.chars(), rinfo.chars());
1487 > }
1488 >
1489 > void writeRead(FILE* f_out, GStr& rname, GStr& rinfo, GStr& rseq, GStr& rqv, int& outcounter) {
1490 >   outcounter++;
1491 >   bool asFasta=(rqv.is_empty() || fastaOutput);
1492 >   if (asFasta) {
1493 >    if (prefix.is_empty()) {
1494 >       printHeader(f_out, '>',rname,rinfo);
1495 >       fprintf(f_out, "%s\n", rseq.chars()); //plain one-line fasta for now
1496 >       }
1497 >      else {
1498 >       fprintf(f_out, ">%s%08d\n%s\n", prefix.chars(), outcounter,
1499 >                          rseq.chars());
1500 >       }
1501 >     }
1502 >   else {  //fastq
1503 >    if (convert_phred) convertPhred(rqv);
1504 >    if (prefix.is_empty()) {
1505 >       printHeader(f_out, '@', rname, rinfo);
1506 >       fprintf(f_out, "%s\n+\n%s\n", rseq.chars(), rqv.chars());
1507 >       }
1508 >      else
1509 >       fprintf(f_out, "@%s_%08d\n%s\n+\n%s\n", prefix.chars(), outcounter,
1510 >                          rseq.chars(),rqv.chars() );
1511 >     }
1512 > }
1513 >
1514 > void trash_report(char trashcode, GStr& rname, FILE* freport) {
1515 > if (freport==NULL || trashcode<=' ') return;
1516 > if (trashcode=='3' || trashcode=='5') {
1517 >   fprintf(freport, "%s\tA%c\n",rname.chars(),trashcode);
1518 >   }
1519 > else {
1520 >   fprintf(freport, "%s\t%c\n",rname.chars(),trashcode);
1521 >   }
1522 > //tcounter++;
1523 > }
1524 >
1525 > GStr getFext(GStr& s, int* xpos=NULL) {
1526 > //s must be a filename without a path
1527 > GStr r("");
1528 > if (xpos!=NULL) *xpos=0;
1529 > if (s.is_empty() || s=="-") return r;
1530 > int p=s.rindex('.');
1531 > int d=s.rindex('/');
1532 > if (p<=0 || p>s.length()-2 || p<s.length()-7 || p<d) return r;
1533 > r=s.substr(p+1);
1534 > if (xpos!=NULL) *xpos=p+1;
1535 > r.lower();
1536 > return r;
1537 > }
1538 >
1539 > void baseFileName(GStr& fname) {
1540 > //remove all known extensions, like .txt,fq,fastq,fasta,fa)(.gz .gzip .bz2 .bzip2) .
1541 > int xpos=0;
1542 > GStr fext=getFext(fname, &xpos);
1543 > if (xpos<=1) return;
1544 > bool extdel=false;
1545 > GStr f2;
1546 > if (fext=="z" || fext=="zip") {
1547 >   extdel=true;
1548 >   }
1549 >  else if (fext.length()>=2) {
1550 >   f2=fext.substr(0,2);
1551 >   extdel=(f2=="gz" || f2=="bz");
1552 >   }
1553 > if (extdel) {
1554 >   fname.cut(xpos-1);
1555 >   fext=getFext(fname, &xpos);
1556 >   if (xpos<=1) return;
1557 >   }
1558 > extdel=false;
1559 > if (fext=="f" || fext=="fq" || fext=="txt" || fext=="seq" || fext=="sequence") {
1560 >   extdel=true;
1561 >   }
1562 >  else if (fext.length()>=2) {
1563 >   extdel=(fext.substr(0,2)=="fa");
1564 >   }
1565 > if (extdel) fname.cut(xpos-1);
1566 > GStr fncp(fname);
1567 > fncp.lower();
1568 > fncp.chomp("seq");
1569 > fncp.chomp("sequence");
1570 > fncp.trimR("_.");
1571 > if (fncp.length()<fname.length()) fname.cut(fncp.length());
1572 > }
1573 >
1574 > FILE* prepOutFile(GStr& infname, GStr& pocmd) {
1575 >  FILE* f_out=NULL;
1576 >  GStr fname(getFileName(infname.chars()));
1577 >  //eliminate known extensions
1578 >  baseFileName(fname);
1579 >  if (outsuffix.is_empty() || outsuffix=="-") { return stdout; }
1580 >    else if (pocmd.is_empty()) {
1581 >               GStr oname(fname);
1582 >               oname.append('.');
1583 >               oname.append(outsuffix);
1584 >               f_out=fopen(oname.chars(),"w");
1585 >               if (f_out==NULL) GError("Error: cannot create '%s'\n",oname.chars());
1586 >               }
1587 >            else {
1588 >              GStr oname(">");
1589 >              oname.append(fname);
1590 >              oname.append('.');
1591 >              oname.append(outsuffix);
1592 >              pocmd.append(oname);
1593 >              f_out=popen(pocmd.chars(), "w");
1594 >              if (f_out==NULL) GError("Error: cannot popen '%s'\n",pocmd.chars());
1595 >              }
1596 > return f_out;
1597   }
1598  
1599 < void convertQ64(char* q, int len) {
1600 < for (int i=0;i<len;i++) q[i]-=31;
1599 > void guess_unzip(GStr& fname, GStr& picmd) {
1600 > GStr fext=getFext(fname);
1601 > if (fext=="gz" || fext=="gzip" || fext=="z") {
1602 >    picmd="gzip -cd ";
1603 >    }
1604 >   else if (fext=="bz2" || fext=="bzip2" || fext=="bz" || fext=="bzip") {
1605 >    picmd="bzip2 -cd ";
1606 >    }
1607   }
1608  
1609 + void setupFiles(FILE*& f_in, FILE*& f_in2, FILE*& f_out, FILE*& f_out2,
1610 +                       GStr& s, GStr& infname, GStr& infname2) {
1611 + // uses outsuffix to generate output file names and open file handles as needed
1612 + infname="";
1613 + infname2="";
1614 + f_in=NULL;
1615 + f_in2=NULL;
1616 + f_out=NULL;
1617 + f_out2=NULL;
1618 + //analyze outsuffix intent
1619 + GStr pocmd;
1620 + if (outsuffix=="-") {
1621 +    f_out=stdout;
1622 +    }
1623 +   else {
1624 +    GStr ox=getFext(outsuffix);
1625 +    if (ox.length()>2) ox=ox.substr(0,2);
1626 +    if (ox=="gz") pocmd="gzip -9 -c ";
1627 +        else if (ox=="bz") pocmd="bzip2 -9 -c ";
1628 +    }
1629 + if (s=="-") {
1630 +    f_in=stdin;
1631 +    infname="stdin";
1632 +    f_out=prepOutFile(infname, pocmd);
1633 +    return;
1634 +    } // streaming from stdin
1635 + s.startTokenize(",:");
1636 + s.nextToken(infname);
1637 + bool paired=s.nextToken(infname2);
1638 + if (fileExists(infname.chars())==0)
1639 +    GError("Error: cannot find file %s!\n",infname.chars());
1640 + GStr fname(getFileName(infname.chars()));
1641 + GStr picmd;
1642 + guess_unzip(fname, picmd);
1643 + if (picmd.is_empty()) {
1644 +   f_in=fopen(infname.chars(), "r");
1645 +   if (f_in==NULL) GError("Error opening file '%s'!\n",infname.chars());
1646 +   }
1647 +  else {
1648 +   picmd.append(infname);
1649 +   f_in=popen(picmd.chars(), "r");
1650 +   if (f_in==NULL) GError("Error at popen %s!\n", picmd.chars());
1651 +   }
1652 + if (f_out==stdout) {
1653 +   if (paired) GError("Error: output suffix required for paired reads\n");
1654 +   return;
1655 +   }
1656 + f_out=prepOutFile(infname, pocmd);
1657 + if (!paired) return;
1658 + if (doCollapse) GError("Error: sorry, -C option cannot be used with paired reads!\n");
1659 + // ---- paired reads:-------------
1660 + if (fileExists(infname2.chars())==0)
1661 +     GError("Error: cannot find file %s!\n",infname2.chars());
1662 + picmd="";
1663 + GStr fname2(getFileName(infname2.chars()));
1664 + guess_unzip(fname2, picmd);
1665 + if (picmd.is_empty()) {
1666 +   f_in2=fopen(infname2.chars(), "r");
1667 +   if (f_in2==NULL) GError("Error opening file '%s'!\n",infname2.chars());
1668 +   }
1669 +  else {
1670 +   picmd.append(infname2);
1671 +   f_in2=popen(picmd.chars(), "r");
1672 +   if (f_in2==NULL) GError("Error at popen %s!\n", picmd.chars());
1673 +   }
1674 + f_out2=prepOutFile(infname2, pocmd);
1675 + }

Diff Legend

Removed lines
+ Added lines
< Changed lines
> Changed lines