ViewVC Help
View File | Revision Log | Show Annotations | View Changeset | Root Listing
root/gclib/fqtrim/fqtrim.cpp
(Generate patch)
# Line 2 | Line 2
2   #include "GStr.h"
3   #include "GHash.hh"
4   #include "GList.hh"
5 + #include <ctype.h>
6  
7   #define USAGE "Usage:\n\
8 < fqtrim [-5 <5'adapter>] [-3 <3'adapter>] [-l <minlen>] [-C] [-D] [-Q] \\\n\
9 <    [-n <rename_prefix>] [-o <trimmed.fq>] [-r <discarded.lst>] <input.fq>\n\
8 > fqtrim [-5 <5adapter>] [-3 <3adapter>] [-a <min_matchlen>] [-p {64|33}] [-q <minq> [-t <trim_max>]]\\\n\
9 >   [-n <rename_prefix>] [-o <outsuffix>] [-z <zcmd>] [-r <discarded.lst>]\\\n\
10 >   [-l <minlen>] [-C] [-D] [-Q] <input.fq>[,<input_mates.fq>\n\
11   \n\
12 < Trim low quality bases at 3' end, optional adapter, filter for low complexity and\n\
13 < optionally collapse duplicates in the input fastq data.\n\
12 > Trim low quality bases at the 3' end, optionally trim adapter sequence, filter\n\
13 > for low complexity and collapse duplicate reads\n\
14 > If read pairs should be trimmed and kept together (i.e. without discarding\n\
15 > one read in a pair), the two file names should be given delimited by a comma\n\
16 > or a colon character\n\
17   \n\
18   Options:\n\
19   -n  rename all the reads using the <prefix> followed by a read counter;\n\
20      if -C option was given, the suffix \"_x<N>\" is appended, with <N> being\n\
21      the read duplication count\n\
22 < -o  write the trimmed/filtered fastq into <trimmed.fq>(instead of stdout)\n\
22 > -o  write the trimmed/filtered reads to file(s) named <input>.<outsuffix>\n\
23 >    which will be created in the current (working) directory\n\
24   -5  trim the given adapter or primer sequence at the 5' end of each read\n\
25      (e.g. -5 CGACAGGTTCAGAGTTCTACAGTCCGACGATC)\n\
26   -3  trim the given adapter sequence at the 3' end of each read\n\
27      (e.g. -3 TCGTATGCCGTCTTCTGCTTG)\n\
28 < -l  minimum \"clean\" length at the high quality 5'end that a read must\n\
29 <    have in order to pass the filter (default: 16)\n\
28 > -a  minimum bases to match to adaptor sequence (default 5)\n\
29 > -q  trim bases with quality value lower than <minq> (starting at the 3' end)\n\
30 > -t  for -q option, maximum trimming at the 3' end is limited to <trim_max>\n\
31 > -m  maximum percentage of Ns allowed in a read after trimming (default 7)\n\
32 > -l  minimum \"clean\" length after trimming that a read must have\n\
33 >    in order to pass the filter (default: 16)\n\
34   -r  write a simple \"trash report\" file listing the discarded reads with a\n\
35      one letter code for the reason why they were trashed\n\
36 + -D  apply a low-complexity (dust) filter and discard any read that has over\n\
37 +    50% of its length detected as low complexity\n\
38   -C  collapse duplicate reads and add a prefix with their count to the read\n\
39 <    name\n\
40 < -D  apply a low-complexity (dust) filter and discard any read that has at\n\
41 <    least half of it masked by this\n\
42 < -Q  quality values in the input data are interpreted as phred64, convert\n\
31 <    them to phred33\n\
39 >    name (e.g. for microRNAs)\n\
40 > -p  input is phred64/phred33 (use -p64 or -p33)\n\
41 > -Q  convert quality values to the other Phred qv type\n\
42 > -V  verbose processing\n\
43   "
44 +
45 + //-z  for -o option, the output stream(s) will be first piped into the given\n
46 + //   <zcmd> command, which must output to stdout (e.g. -z 'bzip2 -9 -c')\n
47 +
48 +
49   // example 3' adapter for miRNAs: TCGTATGCCGTCTTCTGCTTG
50 < FILE* f_out=NULL; //stdout if not provided
51 < FILE* f_in=NULL; //input fastq (stdin if not provided)
50 >
51 > //For pair ends sequencing:
52 > //3' : ACACTCTTTCCCTACACGACGCTCTTCCGATCT
53 > //5' : GATCGGAAGAGCGGTTCAGCAGGAATGCCGAG
54 > //FILE* f_out=NULL; //stdout if not provided
55 > //FILE* f_out2=NULL; //for paired reads
56 > //FILE* f_in=NULL; //input fastq (stdin if not provided)
57 > //FILE* f_in2=NULL; //for paired reads
58   FILE* freport=NULL;
59 +
60   bool debug=false;
61 + bool verbose=false;
62   bool doCollapse=false;
63   bool doDust=false;
64 + bool fastaOutput=false;
65   bool trashReport=false;
66 < bool rawFormat=false;
66 > //bool rawFormat=false;
67   int min_read_len=16;
68 + double max_perc_N=7.0;
69   int dust_cutoff=16;
70   bool isfasta=false;
71 < bool phred64=false;
72 < GStr prefix;
71 > bool convert_phred=false;
72 > GStr outsuffix; // -o
73   GStr adapter3;
74   GStr adapter5;
75 + GStr prefix;
76 + GStr zcmd;
77 + int num_trimmed5=0;
78 + int num_trimmed3=0;
79 + int num_trimmedN=0;
80 + int num_trimmedQ=0;
81 + int min_trimmed5=INT_MAX;
82 + int min_trimmed3=INT_MAX;
83 +
84 + int qvtrim_qmin=0;
85 + int qvtrim_max=0;  //for -q, do not trim at 3'-end more than this number of bases
86 + int qv_phredtype=0; // could be 64 or 33 (0 means undetermined yet)
87 + int qv_cvtadd=0; //could be -31 or +31
88 +
89   int a3len=0;
90   int a5len=0;
91   // adaptor matching metrics -- for extendMatch() function
92 < static const int a_m_score=2; //match score
93 < static const int a_mis_score=-3; //mismatch
94 < static const int a_dropoff_score=7;
95 < static const int a_min_score=7;
92 > const int a_m_score=2; //match score
93 > const int a_mis_score=-3; //mismatch
94 > const int a_dropoff_score=7;
95 > int a_min_score=10; //an exact match of 5 bases at the proper ends WILL be trimmed
96 > const int a_min_chain_score=15; //for gapped alignments
97 >
98 > class CSegChain;
99 >
100 > class CSegPair {
101 >  public:
102 >   GSeg a;
103 >   GSeg b; //the adapter segment
104 >   int score;
105 >   int flags;
106 >   CSegChain* chain;
107 >   CSegPair(int astart=0, int aend=0, int bstart=0, int bend=0, int mscore=0):a(astart,aend),b(bstart, bend) {
108 >      score=mscore;
109 >      if (score==0) score=a.len()*a_m_score;
110 >      flags=0;
111 >      chain=NULL;
112 >      }
113 >   int len() { return  a.len(); }
114 >   bool operator==(CSegPair& d){
115 >      //return (a.start==d.a.start && a.end==d.a.end && b.start==d.b.start && b.end==d.b.end);
116 >      //make equal even segments that are included into one another:
117 >      return (d.a.start>=a.start && d.a.end<=a.end && d.b.start>=b.start && d.b.end<=b.end);
118 >      }
119 >   bool operator>(CSegPair& d){ //ordering based on b (adaptor) start coord and score
120 >     if (b.start==d.b.start) {
121 >        if (score==d.score) {
122 >           //just try to be consistent:
123 >           if (b.end==d.b.end) {
124 >             return (a.start==d.a.start)?(a.end<d.a.end):(a.start<d.a.start);
125 >             }
126 >           return (b.end>d.b.end);
127 >           }
128 >         else return (score<d.score);
129 >        }
130 >     return (b.start>d.b.start);
131 >     }
132 >   bool operator<(CSegPair& d){ //ordering based on b (adaptor) coord
133 >     /*if (b.start==d.b.start && b.end==d.b.end) {
134 >          return (a.start==d.a.start)?(a.end<d.a.end):(a.start<d.a.start);
135 >          }
136 >     return (b.start==d.b.start)?(b.end<d.b.end):(b.start<d.b.start);*/
137 >     if (b.start==d.b.start) {
138 >        if (score==d.score) {
139 >           //just try to be consistent:
140 >           if (b.end==d.b.end) {
141 >             return (a.start==d.a.start)?(a.end>d.a.end):(a.start>d.a.start);
142 >             }
143 >           return (b.end<d.b.end);
144 >           }
145 >         else return (score>d.score);
146 >        }
147 >     return (b.start<d.b.start);
148 >     }
149 > };
150 >
151 > int cmpSegEnds(pointer sa, pointer sb) { //sort by adaptor seg ends AND score
152 > CSegPair& x = *(CSegPair *)sa;
153 > CSegPair& y = *(CSegPair *)sb;
154 > /*
155 > if (x.b.end==y.b.end) {
156 >     if (x.b.start==y.b.start) {
157 >         if (x.a.end==y.a.end) {
158 >            if (x.a.start==y.a.start) return 0;
159 >            return ((x.a.start>y.a.start) ? -1 : 1);
160 >            }
161 >          else {
162 >            return ((x.a.end>y.a.end) ? -1 : 1);
163 >            }
164 >          }
165 >      else {
166 >       return ((x.b.start>y.b.start) ? -1 : 1);
167 >       }
168 >     }
169 >    else {
170 >     return ((x.b.end>y.b.end) ? -1 : 1);
171 >     }
172 > */
173 >  if (x.b.end==y.b.end) {
174 >     if (x.score==y.score) {
175 >     if (x.b.start==y.b.start) {
176 >         if (x.a.end==y.a.end) {
177 >            if (x.a.start==y.a.start) return 0;
178 >            return ((x.a.start<y.a.start) ? -1 : 1);
179 >            }
180 >          else {
181 >            return ((x.a.end<y.a.end) ? -1 : 1);
182 >            }
183 >          }
184 >      else {
185 >       return ((x.b.start<y.b.start) ? -1 : 1);
186 >       }
187 >      } else return ((x.score>y.score) ? -1 : 1);
188 >     }
189 >    else {
190 >     return ((x.b.end>y.b.end) ? -1 : 1);
191 >     }
192 >
193 > }
194 >
195 > class CSegChain:public GList<CSegPair> {
196 > public:
197 >   uint astart;
198 >   uint aend;
199 >   uint bstart;
200 >   uint bend;
201 >   int score;
202 >   bool endSort;
203 >  CSegChain(bool aln5=false):GList<CSegPair>(true,true,true) {//sorted, free elements, unique
204 >   //as SegPairs are inserted, they will be sorted by a.start coordinate
205 >   score=0;
206 >   astart=MAX_UINT;
207 >   aend=0;
208 >   bstart=MAX_UINT;
209 >   bend=0;
210 >   endSort=aln5;
211 >   if (aln5) { setSorted(cmpSegEnds); }
212 >   }
213 > bool operator==(CSegChain& d) {
214 >   //return (score==d.score);
215 >    return (astart==d.astart && aend==d.aend && bstart==d.bstart && bend==d.bend);
216 >   }
217 > bool operator>(CSegChain& d) { // order based on b (adaptor) coordinate
218 >   //return (score<d.score);
219 >   if (bstart==d.bstart && bend==d.bend) {
220 >          return (astart==d.astart)?(aend>d.aend):(astart>d.astart);
221 >          }
222 >     return (bstart==d.bstart)?(bend>d.bend):(bstart>d.bstart);
223 >   }
224 > bool operator<(CSegChain& d) {
225 >   //return (score>d.score);
226 >   if (bstart==d.bstart && bend==d.bend) {
227 >          return (astart==d.astart)?(aend<d.aend):(astart<d.astart);
228 >          }
229 >     return (bstart==d.bstart)?(bend<d.bend):(bstart<d.bstart);
230 >   }
231 > void addSegPair(CSegPair* segp) {
232 >   if (AddIfNew(segp)!=segp) return;
233 >   score+=segp->score;
234 >   if (astart>segp->a.start) astart=segp->a.start;
235 >   if (aend<segp->a.end) aend=segp->a.end;
236 >   if (bstart>segp->b.start) bstart=segp->b.start;
237 >   if (bend<segp->b.end) bend=segp->b.end;
238 >   }
239 > //for building actual chains:
240 > bool extendChain(CSegPair* segp) { //segp expected to be "Greater Than" current chain
241 >   int bgap=0;
242 >   int agap=0;
243 >   //if (endSort) {
244 >   if (bstart>segp->b.start) {
245 >      bgap = (int)(bstart-segp->b.end);
246 >      if (abs(bgap)>2) return false;
247 >      agap = (int)(astart-segp->a.end);
248 >      if (abs(agap)>2) return false;
249 >      }
250 >     else {
251 >      bgap = (int) (segp->b.start-bend);
252 >      if (abs(bgap)>2) return false;
253 >      agap = (int)(segp->a.start-aend);
254 >      if (abs(agap)>2) return false;
255 >      }
256 >   if (agap*bgap<0) return false;
257 >   addSegPair(segp);
258 >   score-=abs(agap)+abs(bgap);
259 >   return true;
260 >   }
261 > };
262  
263   // element in dhash:
264   class FqDupRec {
# Line 91 | Line 297
297    if (!s.is_empty()) {
298        if (s=='-') f=stdout;
299        else {
300 <       f=fopen(s,"w");
300 >       f=fopen(s.chars(),"w");
301         if (f==NULL) GError("Error creating file: %s\n", s.chars());
302         }
303       }
# Line 102 | Line 308
308  
309   GHash<FqDupRec> dhash; //hash to keep track of duplicates
310  
311 < bool ntrim(GStr& rseq, GStr &rqv, int &l5, int &l3); //returns true if any trimming occured
311 > void setupFiles(FILE*& f_in, FILE*& f_in2, FILE*& f_out, FILE*& f_out2,
312 >                       GStr& s, GStr& infname, GStr& infname2);
313 > // uses outsuffix to generate output file names and open file handles as needed
314  
315 + void writeRead(FILE* f_out, GStr& rname, GStr& rinfo, GStr& rseq, GStr& rqv, int& outcounter);
316 + void trash_report(char trashcode, GStr& rname, FILE* freport);
317 +
318 + bool getFastxRec(GLineReader& fq, GStr& rseq, GStr& rqv,
319 +          GStr& rname, GStr& rinfo, GStr& infname);
320 +
321 + char process_read(GStr& rname, GStr& rseq, GStr& rqv, int &l5, int &l3);
322 + //returns 0 if the read was untouched, 1 if it was trimmed and a trash code if it was trashed
323 +
324 + bool ntrim(GStr& rseq, int &l5, int &l3); //returns true if any trimming occured
325 + bool qtrim(GStr& qvs, int &l5, int &l3); //return true if any trimming occured
326   int dust(GStr& seq);
327   bool trim_adapter3(GStr& seq, int &l5, int &l3); //returns true if any trimming occured
328   bool trim_adapter5(GStr& seq, int &l5, int &l3); //returns true if any trimming occured
329  
330 < void convertQ64(char* q, int len);
331 < void convertQ64(GStr& q);
330 > void convertPhred(char* q, int len);
331 > void convertPhred(GStr& q);
332  
333   int main(int argc, char * const argv[]) {
334 <  GArgs args(argc, argv, "QRDCl:d:3:5:n:r:o:");
334 >  GArgs args(argc, argv, "YQDCVl:d:3:5:m:n:r:p:q:t:o:z:a:");
335    int e;
117  int icounter=0; //counter for input reads
118  int outcounter=0; //counter for output reads
336    if ((e=args.isError())>0) {
337        GMessage("%s\nInvalid argument: %s\n", USAGE, argv[e]);
338        exit(224);
339        }
340 <  debug=(args.getOpt('D')!=NULL);
341 <  phred64=(args.getOpt('Q')!=NULL);
340 >  debug=(args.getOpt('Y')!=NULL);
341 >  verbose=(args.getOpt('V')!=NULL);
342 >  convert_phred=(args.getOpt('Q')!=NULL);
343    doCollapse=(args.getOpt('C')!=NULL);
344    doDust=(args.getOpt('D')!=NULL);
345 +  /*
346    rawFormat=(args.getOpt('R')!=NULL);
347    if (rawFormat) {
348      GError("Sorry, raw qseq format parsing is not implemented yet!\n");
349      }
350 +  */
351    prefix=args.getOpt('n');
352    GStr s=args.getOpt('l');
353    if (!s.is_empty())
354       min_read_len=s.asInt();
355 +  s=args.getOpt('m');
356 +  if (!s.is_empty())
357 +     max_perc_N=s.asDouble();
358    s=args.getOpt('d');
359    if (!s.is_empty()) {
360       dust_cutoff=s.asInt();
361       doDust=true;
362       }
363 <    
363 >  s=args.getOpt('q');
364 >  if (!s.is_empty()) {
365 >     qvtrim_qmin=s.asInt();
366 >     }
367 >  s=args.getOpt('t');
368 >  if (!s.is_empty()) {
369 >     qvtrim_max=s.asInt();
370 >     }
371 >  s=args.getOpt('p');
372 >  if (!s.is_empty()) {
373 >     int v=s.asInt();
374 >     if (v==33) {
375 >        qv_phredtype=33;
376 >        qv_cvtadd=31;
377 >        }
378 >      else if (v==64) {
379 >        qv_phredtype=64;
380 >        qv_cvtadd=-31;
381 >        }
382 >       else
383 >         GMessage("%s\nInvalid value for -p option (can only be 64 or 33)!\n",USAGE);
384 >     }
385    if (args.getOpt('3')!=NULL) {
386      adapter3=args.getOpt('3');
387      adapter3.upper();
# Line 148 | Line 392
392      adapter5.upper();
393      a5len=adapter5.length();
394      }
395 +  s=args.getOpt('a');
396 +  if (!s.is_empty()) {
397 +     int a_minmatch=s.asInt();
398 +     a_min_score=a_minmatch<<1;
399 +     }
400 +
401 +  if (args.getOpt('o')!=NULL) outsuffix=args.getOpt('o');
402    trashReport=  (args.getOpt('r')!=NULL);
403 <  if (args.startNonOpt()==0) {
403 >  int fcount=args.startNonOpt();
404 >  if (fcount==0) {
405      GMessage(USAGE);
406      exit(224);
407      }
408 <
409 <  openfw(f_out, args, 'o');
408 >   if (fcount>1 && doCollapse) {
409 >    GError("%s Sorry, the -C option only works with a single input.\n", USAGE);
410 >    }
411 >  //openfw(f_out, args, 'o');
412 >  //if (f_out==NULL) f_out=stdout;
413    if (trashReport)
414      openfw(freport, args, 'r');
415    char* infile=NULL;
416    while ((infile=args.nextNonOpt())!=NULL) {
417 <    GStr infname(infile);
418 <    if (strcmp(infile,"-")==0) {
419 <       f_in=stdin; infname="stdin"; }
420 <     else {
421 <        f_in=fopen(infile,"r");
422 <        if (f_in==NULL)
423 <            GError("Cannot open input file %s!\n",infile);
424 <        }
425 <     GLineReader fq(f_in);
426 <     char* l=NULL;
427 <     while ((l=fq.getLine())!=NULL) {
428 <        GStr rname; //current read name
429 <        GStr rseq;  //current read sequence
430 <        GStr rqv;   //current read quality values
431 <        GStr s;
432 <        if (rawFormat) {
433 <          //TODO: implement qseq parsing here
434 <          //if (raw type=N) then continue; //skip invalid/bad records
435 <          
436 <          } //raw qseq format
437 <         else { // FASTQ or FASTA
438 <          isfasta=(l[0]=='>');
439 <          if (!isfasta && l[0]!='@') GError("Error: fastq record marker not detected!\n");
440 <          s=l;
441 <          rname=&(l[1]);
442 <          icounter++;
443 <          //GMessage("readname=%s\n",rname.chars());
444 <          for (int i=0;i<rname.length();i++)
445 <            if (rname[i]<=' ') { rname.cut(i); break; }
446 <          //now get the sequence
447 <          if ((l=fq.getLine())==NULL)
448 <              GError("Error: unexpected EOF after header for %s\n",rname.chars());
449 <          rseq=l; //this must be the DNA line
450 <          if (isfasta) {
451 <            while ((l=fq.getLine())!=NULL) {
452 <              //fasta can have multiple sequence lines
453 <              if (l[0]=='>') {
454 <                   fq.pushBack();
455 <                   break; //
417 >    int incounter=0; //counter for input reads
418 >    int outcounter=0; //counter for output reads
419 >    int trash_s=0; //too short from the get go
420 >    int trash_Q=0;
421 >    int trash_N=0;
422 >    int trash_D=0;
423 >    int trash_A3=0;
424 >    int trash_A5=0;
425 >    s=infile;
426 >    GStr infname;
427 >    GStr infname2;
428 >    FILE* f_in=NULL;
429 >    FILE* f_in2=NULL;
430 >    FILE* f_out=NULL;
431 >    FILE* f_out2=NULL;
432 >    bool paired_reads=false;
433 >    setupFiles(f_in, f_in2, f_out, f_out2, s, infname, infname2);
434 >    GLineReader fq(f_in);
435 >    GLineReader* fq2=NULL;
436 >    if (f_in2!=NULL) {
437 >       fq2=new GLineReader(f_in2);
438 >       paired_reads=true;
439 >       }
440 >    GStr rseq, rqv, seqid, seqinfo;
441 >    GStr rseq2, rqv2, seqid2, seqinfo2;
442 >    while (getFastxRec(fq, rseq, rqv, seqid, seqinfo, infname)) {
443 >       incounter++;
444 >       int a5=0, a3=0, b5=0, b3=0;
445 >       char tcode=0, tcode2=0;
446 >       tcode=process_read(seqid, rseq, rqv, a5, a3);
447 >       //if (!doCollapse) {
448 >         if (fq2!=NULL) {
449 >            getFastxRec(*fq2, rseq2, rqv2, seqid2, seqinfo2, infname2);
450 >            if (seqid.substr(0,seqid.length()-1)!=seqid2.substr(0,seqid2.length()-1)) {
451 >               GError("Error: no paired match for read %s vs %s (%s,%s)\n",
452 >                  seqid.chars(), seqid2.chars(), infname.chars(), infname2.chars());
453 >               }
454 >            tcode2=process_read(seqid2, rseq2, rqv2, b5, b3);
455 >            //decide what to do with this pair and print rseq2 if the pair makes it
456 >            if (tcode>1 && tcode2<=1) {
457 >               //"untrash" rseq
458 >               if (a3-a5+1<min_read_len) {
459 >                   a5=1;
460 >                   if (a3<min_read_len) { a3= GMIN(rseq.length()-1, min_read_len+1); }
461                     }
462 <              rseq+=l;
463 <              }
464 <            } //multi-line fasta file
465 <          if (!isfasta) {
466 <            if ((l=fq.getLine())==NULL)
467 <                GError("Error: unexpected EOF after sequence for %s\n", rname.chars());
468 <            if (l[0]!='+') GError("Error: fastq qv header marker not detected!\n");
469 <            if ((l=fq.getLine())==NULL)
470 <                GError("Error: unexpected EOF after qv header for %s\n", rname.chars());
471 <            rqv=l;
472 <            if (rqv.length()!=rseq.length())
473 <                GError("Error: qv len != seq len for %s\n", rname.chars());
474 <            }
475 <        } //<-- FASTA or FASTQ
216 <        rseq.upper();
217 <        int l5=0;
218 <        int l3=rseq.length()-1;
219 <        if (l3-l5+1<min_read_len) {
220 <           if (trashReport) {
221 <                  fprintf(freport, "%s\ts\t%s\n",rname.chars(), rseq.chars());
222 <                  }
223 <           continue;
224 <           }
225 <        if (ntrim(rseq, rqv, l5, l3)) {
226 <           if (l3-l5+1<min_read_len) {
227 <             if (trashReport) {
228 <                    fprintf(freport, "%s\tN\t%s\n", rname.chars(), rseq.chars());
229 <                    }
230 <             continue; //invalid read
231 <             }
232 <            //-- keep only the l5..l3 range
233 <           rseq=rseq.substr(l5, l3-l5+1);
234 <           if (!rqv.is_empty())
235 <              rqv=rqv.substr(l5, l3-l5+1);
236 <           }
237 <          
238 <        if (a3len>0) {
239 <          if (trim_adapter3(rseq, l5, l3)) {
240 <             if (l3-l5+1<min_read_len) {
241 <                 if (trashReport) {
242 <                     fprintf(freport, "%s\tA3\t%s\n",rname.chars(), rseq.chars());
243 <                     }
244 <                 continue;
245 <                 }
246 <              //-- keep only the l5..l3 range
247 <              rseq=rseq.substr(l5, l3-l5+1);
248 <              if (!rqv.is_empty())
249 <                 rqv=rqv.substr(l5, l3-l5+1);
250 <              }//some adapter was trimmed
251 <           } //adapter trimming
252 <        if (a5len>0) {
253 <          if (trim_adapter5(rseq, l5, l3)) {
254 <             if (l3-l5+1<min_read_len) {
255 <                 if (trashReport) {
256 <                     fprintf(freport, "%s\tA5\t%s\n",rname.chars(), rseq.chars());
257 <                     }
258 <                 continue;
462 >               tcode=1;
463 >               }
464 >             else if (tcode<=1 && tcode2>1) {
465 >               //"untrash" rseq2
466 >               if (b3-b5+1<min_read_len) {
467 >                   b5=1;
468 >                   if (b3<min_read_len) { b3= GMIN((rseq2.length()-1),(min_read_len+1)); }
469 >                   }
470 >               tcode2=1;
471 >               }
472 >            if (tcode<=1) { //trimmed or left intact -- write it!
473 >               if (tcode>0) {
474 >                 rseq2=rseq2.substr(b5,b3-b5+1);
475 >                 if (!rqv2.is_empty()) rqv2=rqv2.substr(b5,b3-b5+1);
476                   }
477 <              //-- keep only the l5..l3 range
478 <              rseq=rseq.substr(l5, l3-l5+1);
479 <              if (!rqv.is_empty())
480 <                 rqv=rqv.substr(l5, l3-l5+1);
481 <              }//some adapter was trimmed
482 <           } //adapter trimming
483 <        if (doCollapse) {
484 <           //keep read for later
485 <           FqDupRec* dr=dhash.Find(rseq.chars());
486 <           if (dr==NULL) { //new entry
487 <                  //if (prefix.is_empty())
488 <                     dhash.Add(rseq.chars(),
489 <                          new FqDupRec(&rqv, rname.chars()));
490 <                  //else dhash.Add(rseq.chars(), new FqDupRec(rqv.chars(),rqv.length()));
477 >               int nocounter=0;
478 >               writeRead(f_out2, seqid2, seqinfo2, rseq2, rqv2, nocounter);
479 >               }
480 >            } //paired read
481 >       // }
482 >       if (tcode>1) { //trashed
483 >          if (tcode=='s') trash_s++;
484 >            else if (tcode=='Q') trash_Q++;
485 >              else if (tcode=='N') trash_N++;
486 >               else if (tcode=='D') trash_D++;
487 >                else if (tcode=='3') trash_A3++;
488 >                 else if (tcode=='5') trash_A5++;
489 >          if (trashReport) trash_report(tcode, seqid, freport);
490 >          }
491 >         else if (!doCollapse) { //write it
492 >          if (tcode>0) {
493 >            rseq=rseq.substr(a5,a3-a5+1);
494 >            if (!rqv.is_empty()) rqv=rqv.substr(a5,a3-a5+1);
495 >            }
496 >          writeRead(f_out, seqid, seqinfo, rseq, rqv, outcounter);
497 >          }
498 >       } //for each fastq record
499 >    delete fq2;
500 >    FRCLOSE(f_in);
501 >    FRCLOSE(f_in2);
502 >    if (doCollapse) {
503 >       outcounter=0;
504 >       int maxdup_count=1;
505 >       char* maxdup_seq=NULL;
506 >       dhash.startIterate();
507 >       FqDupRec* qd=NULL;
508 >       char* seq=NULL;
509 >       while ((qd=dhash.NextData(seq))!=NULL) {
510 >         GStr rseq(seq);
511 >         //do the dusting here
512 >         if (doDust) {
513 >            int dustbases=dust(rseq);
514 >            if (dustbases>(rseq.length()>>1)) {
515 >               if (trashReport && qd->firstname!=NULL) {
516 >                 fprintf(freport, "%s_x%d\tD\n",qd->firstname, qd->count);
517                   }
518 <              else    
519 <                 dr->add(rqv);
520 <           } //collapsing duplicates
521 <         else { //not collapsing duplicates
522 <           //do the dust filter now
523 <           if (doDust) {
524 <             int dustbases=dust(rseq);
525 <             if (dustbases>(rseq.length()>>1)) {
526 <                if (trashReport) {
527 <                  fprintf(freport, "%s\tD\t%s\n",rname.chars(),rseq.chars());
528 <                  }
529 <                continue;
530 <                }
288 <             }
289 <           //print this record here  
290 <           outcounter++;
291 <           if (isfasta) {
292 <            if (prefix.is_empty())
293 <               fprintf(f_out, ">%s\n%s\n", rname.chars(), rseq.chars());
294 <              else
295 <               fprintf(f_out, ">%s%08d\n%s\n", prefix.chars(), outcounter,
296 <                                  rseq.chars());
518 >               trash_D+=qd->count;
519 >               continue;
520 >               }
521 >            }
522 >         outcounter++;
523 >         if (qd->count>maxdup_count) {
524 >            maxdup_count=qd->count;
525 >            maxdup_seq=seq;
526 >            }
527 >         if (isfasta) {
528 >           if (prefix.is_empty()) {
529 >             fprintf(f_out, ">%s_x%d\n%s\n", qd->firstname, qd->count,
530 >                           rseq.chars());
531               }
532 <           else {  //fastq
533 <            if (phred64) convertQ64(rqv);
534 <            if (prefix.is_empty())
301 <               fprintf(f_out, "@%s\n%s\n+\n%s\n", rname.chars(), rseq.chars(),rqv.chars());
302 <              else
303 <               fprintf(f_out, "@%s_%08d\n%s\n+\n%s\n", prefix.chars(), outcounter,
304 <                                  rseq.chars(),rqv.chars() );
532 >           else { //use custom read name
533 >             fprintf(f_out, ">%s%08d_x%d\n%s\n", prefix.chars(), outcounter,
534 >                        qd->count, rseq.chars());
535               }
536 <           } //not collapsing duplicates
537 <        } //for each fastq record
538 <   } //while each input file
539 < FRCLOSE(f_in);
540 < if (doCollapse) {
541 <    outcounter=0;
312 <    int maxdup_count=1;
313 <    char* maxdup_seq=NULL;
314 <    dhash.startIterate();
315 <    FqDupRec* qd=NULL;
316 <    char* seq=NULL;
317 <    while ((qd=dhash.NextData(seq))!=NULL) {
318 <      GStr rseq(seq);
319 <      //do the dusting here
320 <      if (doDust) {
321 <         int dustbases=dust(rseq);
322 <         if (dustbases>(rseq.length()>>1)) {
323 <            if (trashReport && qd->firstname!=NULL) {
324 <              fprintf(freport, "%s_x%d\tD\t%s\n",qd->firstname, qd->count,seq);
325 <              }
326 <            continue;
536 >           }
537 >         else { //fastq format
538 >          if (convert_phred) convertPhred(qd->qv, qd->len);
539 >          if (prefix.is_empty()) {
540 >            fprintf(f_out, "@%s_x%d\n%s\n+\n%s\n", qd->firstname, qd->count,
541 >                           rseq.chars(), qd->qv);
542              }
543 +          else { //use custom read name
544 +            fprintf(f_out, "@%s%08d_x%d\n%s\n+\n%s\n", prefix.chars(), outcounter,
545 +                        qd->count, rseq.chars(), qd->qv);
546 +            }
547 +           }
548 +         }//for each element of dhash
549 +       if (maxdup_count>1) {
550 +         GMessage("Maximum read multiplicity: x %d (read: %s)\n",maxdup_count, maxdup_seq);
551           }
552 <      outcounter++;
553 <      if (qd->count>maxdup_count) {
554 <         maxdup_count=qd->count;
555 <         maxdup_seq=seq;
556 <         }
557 <      if (isfasta) {
558 <        if (prefix.is_empty()) {
559 <          fprintf(f_out, "@%s_x%d\n%s\n", qd->firstname, qd->count,
560 <                        rseq.chars());
561 <          }
562 <        else { //use custom read name
563 <          fprintf(f_out, "@%s%08d_x%d\n%s\n", prefix.chars(), outcounter,
564 <                     qd->count, rseq.chars());
552 >       } //collapse entries
553 >    if (verbose) {
554 >       if (paired_reads) {
555 >           GMessage(">Input files : %s , %s\n", infname.chars(), infname2.chars());
556 >           GMessage("Number of input pairs :%9d\n", incounter);
557 >           GMessage("         Output pairs :%9d\n", outcounter);
558 >           }
559 >         else {
560 >           GMessage(">Input file : %s\n", infname.chars());
561 >           GMessage("Number of input reads :%9d\n", incounter);
562 >           GMessage("         Output reads :%9d\n", outcounter);
563 >           }
564 >       GMessage("------------------------------------\n");
565 >       if (num_trimmed5)
566 >          GMessage("           5' trimmed :%9d  (min. trim: %d)\n", num_trimmed5, min_trimmed5);
567 >       if (num_trimmed3)
568 >          GMessage("           3' trimmed :%9d  (min. trim: %d)\n", num_trimmed3, min_trimmed3);
569 >       GMessage("------------------------------------\n");
570 >       if (trash_s>0)
571 >         GMessage("     Trashed by length:%9d\n", trash_s);
572 >       if (trash_N>0)
573 >         GMessage("         Trashed by N%%:%9d\n", trash_N);
574 >       if (trash_Q>0)
575 >         GMessage("Trashed by low quality:%9d\n", trash_Q);
576 >       if (trash_A5>0)
577 >         GMessage(" Trashed by 5' adapter:%9d\n", trash_A5);
578 >       if (trash_A3>0)
579 >         GMessage(" Trashed by 3' adapter:%9d\n", trash_A3);
580 >       }
581 >    if (trashReport) {
582 >          FWCLOSE(freport);
583            }
584 <        }
585 <      else { //fastq format
586 <       if (phred64) convertQ64(qd->qv, qd->len);
346 <       if (prefix.is_empty()) {
347 <         fprintf(f_out, "@%s_x%d\n%s\n+\n%s\n", qd->firstname, qd->count,
348 <                        rseq.chars(), qd->qv);
349 <         }
350 <       else { //use custom read name
351 <         fprintf(f_out, "@%s%08d_x%d\n%s\n+\n%s\n", prefix.chars(), outcounter,
352 <                     qd->count, rseq.chars(), qd->qv);
353 <         }
354 <        }
355 <      }//for each element of dhash
356 <    if (maxdup_count>1) {
357 <      GMessage("Maximum read multiplicity: x %d (read: %s)\n",maxdup_count, maxdup_seq);
358 <      }
359 <   } //report collapsed dhash entries
360 < GMessage("Number of input reads: %9d\n", icounter);
361 < GMessage("       Output records: %9d\n", outcounter);
362 < if (trashReport) {
363 <    FWCLOSE(freport);
364 <    }
584 >    FWCLOSE(f_out);
585 >    FWCLOSE(f_out2);
586 >   } //while each input file
587  
588 < FWCLOSE(f_out);
588 > //getc(stdin);
589   }
590  
591   class NData {
# Line 392 | Line 614
614       end5=1;
615       end3=seqlen;
616       for (int i=0;i<seqlen;i++)
617 <        if (rseq[i]=='N') {// if (!ichrInStr(rseq[i], "ACGT")
617 >        if (seq[i]=='N') {// if (!ichrInStr(rseq[i], "ACGT")
618             NPos[NCount]=i;
619             NCount++;
620             }
# Line 415 | Line 637
637   void N_analyze(int l5, int l3, int p5, int p3) {
638   /* assumes feat was filled properly */
639   int old_dif, t5,t3,v;
640 < if (l3-l5<min_read_len || p5>p3 ) {
640 > if (l3<l5+2 || p5>p3 ) {
641     feat.end5=l5+1;
642     feat.end3=l3+1;
643     return;
# Line 447 | Line 669
669   }
670  
671  
672 < bool ntrim(GStr& rseq, GStr &rqv, int &l5, int &l3) {
673 < //count Ns in the sequence
672 > bool qtrim(GStr& qvs, int &l5, int &l3) {
673 > if (qvtrim_qmin==0 || qvs.is_empty()) return false;
674 > if (qv_phredtype==0) {
675 >  //try to guess the Phred type
676 >  int vmin=256, vmax=0;
677 >  for (int i=0;i<qvs.length();i++) {
678 >     if (vmin>qvs[i]) vmin=qvs[i];
679 >     if (vmax<qvs[i]) vmax=qvs[i];
680 >     }
681 >  if (vmin<64) { qv_phredtype=33; qv_cvtadd=31; }
682 >  if (vmax>95) { qv_phredtype=64; qv_cvtadd=-31; }
683 >  if (qv_phredtype==0) {
684 >    GError("Error: couldn't determine Phred type, please use the -p33 or -p64 !\n");
685 >    }
686 >  if (verbose)
687 >    GMessage("Input reads have Phred-%d quality values.\n", (qv_phredtype==33 ? 33 : 64));
688 >  } //guessing Phred type
689 > for (l3=qvs.length()-1;l3>2;l3--) {
690 >  if (qvs[l3]-qv_phredtype>=qvtrim_qmin && qvs[l3-1]-qv_phredtype>=qvtrim_qmin) break;
691 >  }
692 > //just in case, check also the 5' the end (?)
693 > for (l5=0;l5<qvs.length()-3;l5++) {
694 >  if (qvs[l5]-qv_phredtype>=qvtrim_qmin && qvs[l5+1]-qv_phredtype>=qvtrim_qmin) break;
695 >  }
696 > if (qvtrim_max>0) {
697 >  if (qvs.length()-1-l3>qvtrim_max) l3=qvs.length()-1-qvtrim_max;
698 >  if (l5>qvtrim_max) l5=qvtrim_max;
699 >  }
700 > return (l5>0 || l3<qvs.length()-1);
701 > }
702 >
703 > bool ntrim(GStr& rseq, int &l5, int &l3) {
704 > //count Ns in the sequence, trim N-rich ends
705   feat.init(rseq);
706   l5=feat.end5-1;
707   l3=feat.end3-1;
708   N_analyze(feat.end5-1, feat.end3-1, 0, feat.NCount-1);
709 < if (l5==feat.end5-1 && l3==feat.end3-1)
710 <    return false; //nothing changed
709 > if (l5==feat.end5-1 && l3==feat.end3-1) {
710 >    if (feat.perc_N>max_perc_N) {
711 >           feat.valid=false;
712 >           l3=l5+1;
713 >           return true;
714 >           }
715 >      else {
716 >       return false; //nothing changed
717 >       }
718 >    }
719   l5=feat.end5-1;
720   l3=feat.end3-1;
721   if (l3-l5+1<min_read_len) {
# Line 462 | Line 723
723     return true;
724     }
725   feat.N_calc();
726 < if (feat.perc_N>6.2) { //not valid if more than 1 N per 16 bases
726 >
727 > if (feat.perc_N>max_perc_N) {
728        feat.valid=false;
729        l3=l5+1;
730        return true;
# Line 588 | Line 850
850   return ncount;
851   }
852  
853 < // ------------------ adapter matching - triplet matching
854 < //look for matching triplets amongst the first 9 nucleotides of the 3' adaptor
855 < // or the last 9 nucleotides for the 5' adapter
856 < //when a triplet match is found, simply try to extend the alignment using a drop-off scheme
857 < // check minimum score and
596 < // for 3' adapter trimming:
853 >
854 > // ------------------ adapter matching - simple k-mer seed & extend, no indels for now
855 > //when a k-mer match is found, simply try to extend the alignment using a drop-off scheme
856 > //check minimum score and
857 > //for 3' adapter trimming:
858   //     require that the right end of the alignment for either the adaptor OR the read must be
859   //     < 3 distance from its right end
860   // for 5' adapter trimming:
861   //     require that the left end of the alignment for either the adaptor OR the read must
862 < //     be at coordinate 0
862 > //     be at coordinate < 3 from start
863  
864   bool extendMatch(const char* a, int alen, int ai,
865 <                 const char* b, int blen, int bi, int mlen, int& l5, int& l3) {
865 >                 const char* b, int blen, int bi, int mlen, int& l5, int& l3, CSegChain& segs, bool end5=false) {
866   //so the alignment starts at ai in a, bi in b, with a perfect match of length mlen
867 < //GStr dbg(a);
868 < //GMessage(">> in %s\n\textending hit: %s at position %d\n", a, (dbg.substr(ai, mlen)).chars(), ai);
867 > #ifdef DEBUG
868 > GStr dbg(b);
869 > #endif
870 > //if (debug) {
871 > //  GMessage(">> in %s\n\textending hit: %s at position %d\n", a, (dbg.substr(bi, mlen)).chars(), ai);
872 > //  }
873   int a_l=ai; //alignment coordinates on a
874   int a_r=ai+mlen-1;
875   int b_l=bi; //alignment coordinates on b
# Line 613 | Line 878
878   int bi_maxscore=bi;
879   int score=mlen*a_m_score;
880   int maxscore=score;
881 < int mism5score=a_mis_score-2; //increase penalty for mismatches at 5' end
881 > int mism5score=a_mis_score;
882 > if (end5 && ai<(alen>>1)) mism5score-=2; // increase penalty for mismatches at 5' end
883   //try to extend to the left first, if possible
884   while (ai>0 && bi>0) {
885     ai--;
# Line 628 | Line 894
894     }
895   a_l=ai_maxscore;
896   b_l=bi_maxscore;
897 + //if (debug) GMessage("  after l-extend: %*s%s\t\t(score=%d)\n",a_l," ",dbg.substr(b_l,b_r-b_l+1).chars(),maxscore);
898   //now extend to the right
899   ai_maxscore=a_r;
900   bi_maxscore=b_r;
# Line 661 | Line 928
928     }
929    a_r=ai_maxscore;
930    b_r=bi_maxscore;
931 <  if (maxscore>=a_min_score && (alen-a_r<3 || blen-b_r<3 || a_l<2 || b_l<2)) {
932 <     if (a_l<alen-a_r-1) {
931 >  int a_ovh3=alen-a_r-1;
932 >  int b_ovh3=blen-b_r-1;
933 >  int mmovh3=(a_ovh3<b_ovh3)? a_ovh3 : b_ovh3;
934 >  int mmovh5=(a_l<b_l)? a_l : b_l;
935 >  //if (debug) GMessage("  after r-extend: %*s%s\t\t(score=%d)\n",a_l," ",dbg.substr(b_l,b_r-b_l+1).chars(),maxscore);
936 > #ifdef DEBUG
937 >  if (debug) GMessage("     extended to: %*s\n",a_r+1,dbg.substr(b_l,b_r-b_l+1).chars());
938 > #endif
939 >  if (maxscore>=a_min_score && mmovh3<2 && mmovh5<2) {
940 >     if (a_l<a_ovh3) {
941          //adapter closer to the left end (typical for 5' adapter)
942          l5=a_r+1;
943          l3=alen-1;
# Line 674 | Line 949
949          }
950       return true;
951       }
952 + else { //keep this segment pair for later (gapped alignment)
953 +   segs.addSegPair(new CSegPair(a_l+1, a_r+1, b_l+1, b_r+1, maxscore));
954 +   //this will also update min & max coordinates in segs (segs.astart, .aend, .bstart, .bend)
955 +   }
956 +  //do not trim:
957 +  l5=0;
958 +  l3=alen-1;
959    return false;
960   }
961  
962 + /*
963 + int getWordValue(const char* s, int wlen) {
964 + int r=0;
965 + while (wlen--) { r+=(((int)s[wlen])<<wlen) }
966 + return r;
967 + }
968 + */
969 + int get3mer_value(const char* s) {
970 + return (s[0]<<16)+(s[1]<<8)+s[2];
971 + }
972 +
973 + int w3_match(int qv, const char* str, int slen, int start_index=0) {
974 + if (start_index>=slen || start_index<0) return -1;
975 + for (int i=start_index;i<slen-3;i++) {
976 +   int rv=get3mer_value(str+i);
977 +   if (rv==qv) return i;
978 +   }
979 + return -1;
980 + }
981 +
982 + int w3_rmatch(int qv, const char* str, int slen, int end_index=-1) {
983 + if (end_index>=slen) return -1;
984 + if (end_index<0) end_index=slen-1;
985 + for (int i=end_index-2;i>=0;i--) {
986 +   int rv=get3mer_value(str+i);
987 +   if (rv==qv) return i;
988 +   }
989 + return -1;
990 + }
991 +
992 + int fast4match(int32 qv, const char* str, int slen, int start_index=0) {
993 + if (start_index>=slen || start_index<0) return -1;
994 + for (int i=start_index;i<slen-4;i++) {
995 +   int32* rv=(int32*)(str+i);
996 +   if (*rv==qv) return i;
997 +   }
998 + return -1;
999 + }
1000 +
1001 + int fast4rmatch(int32 qv, const char* str, int slen, int end_index=-1) {
1002 + if (end_index>=slen) return -1;
1003 + if (end_index<0) end_index=slen-1;
1004 + for (int i=end_index-3;i>=0;i--) {
1005 +   int32* rv=(int32*)(str+i);
1006 +   if (*rv==qv) return i;
1007 +   }
1008 + return -1;
1009 + }
1010 +
1011 + #ifdef DEBUG
1012 + void dbgPrintChain(CSegChain& chain, const char* aseq) {
1013 +  GStr s(aseq);
1014 +  for (int i=0;i<chain.Count();i++) {
1015 +   CSegPair& seg=*chain[i];
1016 +   GMessage("  dbg chain seg%d: %*s [%d-%d:%d-%d]\n",i,seg.a.start-1+seg.len(),
1017 +            s.substr(seg.b.start-1, seg.len()).chars(), seg.b.start,seg.b.end,seg.a.start,seg.a.end);
1018 +   }
1019 + }
1020 + #endif
1021 +
1022   bool trim_adapter3(GStr& seq, int&l5, int &l3) {
1023   int rlen=seq.length();
1024   l5=0;
# Line 690 | Line 1032
1032        }
1033      else {
1034        l5=0;
1035 <      l3=fi-1;
1035 >      l3=fi-1;
1036        }
1037     return true;
1038     }
1039 + #ifdef DEBUG
1040 + if (debug) GMessage(">TRIM3 >>   Read: %s\n",seq.chars());
1041 + #endif
1042 +
1043   //also, for fast detection of other adapter-only reads that start past
1044   // the beginning of the adapter sequence, try to see if the first a3len-4
1045 < // characters of the read are a substring of the adapter
1046 < GStr rstart=seq.substr(0,a3len-4);
1047 < if ((fi=adapter3.index(rstart))>=0) {
1048 <   l3=rlen-1;
1049 <   l5=a3len-4;
1050 <   while (fi+l5<a3len && l5<l3 && adapter3[fi+l5]==seq[l5]) l5++;
1051 <   return true;
1045 > // bases of the read are a substring of the adapter
1046 > if (rlen>a3len-3) {
1047 >   GStr rstart=seq.substr(1,a3len-4);
1048 >   if ((fi=adapter3.index(rstart))>=0) {
1049 >     l3=rlen-1;
1050 >     l5=a3len-4;
1051 >     while (fi+l5<a3len && l5<l3 && adapter3[fi+l5]==seq[l5]) l5++;
1052 >     return true;
1053 >     }
1054 >  }
1055 > CSegChain a3segs; //no chains here, just an ordered collection of segment pairs
1056 >  //check the easy cases - 11 bases exact match at the end
1057 > int fdlen=11;
1058 >  if (a3len<16) {
1059 >   fdlen=a3len>>1;
1060     }
1061 <  
1062 < //another easy case: first half of the adaptor matches
1063 < int a3hlen=a3len>>1;
1064 < GStr ahalf=adapter3.substr(0, a3hlen);
1065 < if ((fi=seq.index(ahalf))>=0) {
1066 <   extendMatch(seq.chars(), rlen, fi,
1067 <                 adapter3.chars(), a3len, 0,  a3hlen, l5,l3);
1068 <   return true;
1061 > if (fdlen>4) {
1062 >     //check if we're lucky enough to have the last 11 bases of the read a part of the adapter
1063 >     GStr rstart=seq.substr(-fdlen-3,fdlen);
1064 >     if ((fi=adapter3.index(rstart))>=0) {
1065 > #ifdef DEBUG
1066 >       if (debug) GMessage("  W11match found: %*s\n", rlen-3, (adapter3.substr(fi,fdlen)).chars());
1067 > #endif
1068 >       if (extendMatch(seq.chars(), rlen, rlen-fdlen-3,
1069 >                     adapter3.chars(), a3len, fi,  fdlen, l5,l3, a3segs))
1070 >            return true;
1071 >       }
1072 >     //another easy case: first 11 characters of the adaptor found as a substring of the read
1073 >     GStr bstr=adapter3.substr(0, fdlen);
1074 >     if ((fi=seq.rindex(bstr))>=0) {
1075 > #ifdef DEBUG
1076 >       if (debug) GMessage("  A11match found: %*s\n", fi+fdlen, bstr.chars());
1077 > #endif
1078 >       if (extendMatch(seq.chars(), rlen, fi,
1079 >                     adapter3.chars(), a3len, 0,  fdlen, l5,l3, a3segs))
1080 >            return true;
1081 >       }
1082 >     } //tried to match 11 bases first
1083 >    
1084 > //no easy cases, so let's do the wmer hashing for the first 12 bases of the adaptor
1085 > //-- only extend if the match is in the 3' (ending) region of the read
1086 > int wordSize=3;
1087 > int hlen=12;
1088 > if (hlen>a3len-wordSize) hlen=a3len-wordSize;
1089 > int imin=rlen>>1; //last half of the read, left boundary for the wmer match
1090 > if (imin<a3len) { imin=GMIN(a3len, rlen-wordSize); }
1091 > imin=rlen-imin;
1092 > for (int iw=0;iw<hlen;iw++) {
1093 >   //int32* qv=(int32*)(adapter3.chars()+iw);
1094 >   int qv=get3mer_value(adapter3.chars()+iw);
1095 >   fi=-1;
1096 >   //while ((fi=fast4rmatch(*qv, seq.chars(), rlen, fi))>=0 && fi>=imin) {
1097 >   while ((fi=w3_rmatch(qv, seq.chars(), rlen, fi))>=0 && fi>=imin) {
1098 >     //GMessage(" ... fi=%d after w3_rmatch() (imin=%d)\n", fi, imin);
1099 >
1100 > #ifdef DEBUG
1101 >     if (debug) GMessage("    Wmatch found: %*s\n", fi+wordSize, (adapter3.substr(iw,wordSize)).chars());
1102 > #endif
1103 >     if (extendMatch(seq.chars(), rlen, fi, adapter3.chars(),
1104 >                   a3len, iw, wordSize, l5,l3, a3segs)) return true;
1105 >     fi--;
1106 >     if (fi<imin) break;
1107 >     }
1108 >   } //for each wmer in the first hlen bases of the adaptor
1109 > /*
1110 > //couldn't find a good trimming extension, hash 12 more bases of the adapter to collect more segment pairs there
1111 > //but only do this if we already have segment pairs collected in the last 12 bases of the adapter
1112 > if (a3segs.bstart>3 || a3segs.bend<(uint)(hlen-wordSize)) return false;
1113 > int hlen2=a3len-wordSize;
1114 > //if (hlen2>a3len-4) hlen2=a3len-4;
1115 > if (hlen2>hlen) {
1116 > #ifdef DEBUG
1117 >     if (debug && a3segs.Count()>0) {
1118 >        GMessage("  >>>>>2nd. hash: %s\n",seq.chars());
1119 >        }
1120 > #endif
1121 >     for (int iw=hlen;iw<hlen2;iw++) {
1122 >         //int* qv=(int32 *)(adapter3.chars()+iw);
1123 >         int qv=get3mer_value(adapter3.chars()+iw);
1124 >         fi=-1;
1125 >         //while ((fi=fast4rmatch(*qv, seq.chars(), rlen, fi))>=0 && fi>=imin) {
1126 >         while ((fi=w3_rmatch(qv, seq.chars(), rlen, fi))>=0 && fi>=imin) {
1127 >           extendMatch(seq.chars(), rlen, fi, adapter3.chars(),
1128 >                         a3len, iw, wordSize, l5,l3, a3segs);
1129 >           fi--;
1130 >           if (fi<imin) break;
1131 >           }
1132 >         } //for each wmer between hlen2 and hlen bases of the adaptor
1133 >     }
1134 > //lastly, analyze collected a3segs for a possible gapped alignment:
1135 > GList<CSegChain> segchains(false,true,false);
1136 > #ifdef DEBUG
1137 > if (debug && a3segs.Count()>0) {
1138 >   GMessage(">>>>>>>>>   Read: %s\n",seq.chars());
1139     }
1140 < //no easy cases, so let's do the word hashing
1141 < for (int iw=0;iw<6;iw++) {
1142 <   GStr aword=adapter3.substr(iw,3);
1143 <   if ((fi=seq.index(aword))>=0  && rlen-fi>3) {
1144 <      if (extendMatch(seq.chars(), rlen, fi, adapter3.chars(),
1145 <                   a3len, iw, 3, l5,l3)) return true;
1140 > #endif
1141 > for (int i=0;i<a3segs.Count();i++) {
1142 >   if (a3segs[i]->chain==NULL) {
1143 >       if (a3segs[i]->b.start>3) continue; //don't start a hopeless chain
1144 >       CSegChain* newchain=new CSegChain();
1145 >       newchain->setFreeItem(false);
1146 >       newchain->addSegPair(a3segs[i]);
1147 >       a3segs[i]->chain=newchain;
1148 >       segchains.Add(newchain); //just to free them when done
1149 >       }
1150 >   for (int j=i+1;j<a3segs.Count();j++) {
1151 >      CSegChain* chain=a3segs[i]->chain;
1152 >      if (chain->extendChain(a3segs[j])) {
1153 >          a3segs[j]->chain=chain;
1154 > #ifdef DEBUG
1155 >          if (debug) dbgPrintChain(*chain, adapter3.chars());
1156 > #endif      
1157 >          //save time by checking here if the extended chain is already acceptable for trimming
1158 >          if (chain->aend>(uint)(rlen-4) && chain->bstart<4 && chain->score>a_min_chain_score) {
1159 >            l5=0;
1160 >            l3=chain->astart-2;
1161 > #ifdef DEBUG
1162 >          if (debug && a3segs.Count()>0) {
1163 >            GMessage(">>> >> trimmed-3: %*s\n",l3-l5+1,seq.substr(l5,l3-l5+1).chars());
1164 >            }
1165 > #endif
1166 >            return true;
1167 >            }
1168 >          } //chain can be extended
1169        }
1170 <   }
1170 >   } //collect segment alignments into chains
1171 > */  
1172   return false; //no adapter parts found
1173   }
1174  
1175   bool trim_adapter5(GStr& seq, int&l5, int &l3) {
1176 + //if (debug) GMessage("trim_adapter5 on: %s\n", seq.chars());
1177   int rlen=seq.length();
1178   l5=0;
1179   l3=rlen-1;
# Line 741 | Line 1190
1190        }
1191     return true;
1192     }
1193 < //for fast detection of adapter-rich reads, check if the first 12
1194 < //characters of the read are a substring of the adapter
1195 < GStr rstart=seq.substr(1,12);
1196 < if ((fi=adapter5.index(rstart))>=0) {
1197 <   //l3=rlen-1;
1198 <   //l5=a5len-4;
1199 <   //while (fi+l5<a5len && l5<l3 && adapter5[fi+l5]==seq[l5]) l5++;
1200 <   //return true;
1201 <   extendMatch(seq.chars(), rlen, 1,
1202 <                 adapter5.chars(), a5len, fi,  12, l5,l3);
754 <   return true;
1193 > #ifdef DEBUG
1194 > if (debug) GMessage(">TRIM5 >>   Read: %s\n",seq.chars());
1195 > #endif
1196 >
1197 > CSegChain a5segs(true); //list of segment pairs to analyze later if no extendMatch succeeded
1198 >
1199 > //try the easy way out first - look for an exact match of 11 bases
1200 > int fdlen=11;
1201 >  if (a5len<16) {
1202 >   fdlen=a5len>>1;
1203     }
1204 <  
1205 < //another easy case: last 12 characters of the adaptor found as a substring of the read
1206 < int aplen=12;
1207 < int apstart=a5len-aplen-2;
1208 < if (apstart<0) { apstart=0; aplen=a5len-1; }
1209 < GStr bstr=adapter5.substr(apstart, aplen);
1210 < if ((fi=seq.index(bstr))>=0) {
1211 <   extendMatch(seq.chars(), rlen, fi,
1212 <                 adapter5.chars(), a5len, apstart,  aplen, l5,l3);
1213 <   return true;
1204 > if (fdlen>4) {
1205 >     GStr rstart=seq.substr(1,fdlen); //skip the first base as it's sometimes bogus
1206 >     if ((fi=adapter5.index(rstart))>=0) {
1207 > #ifdef DEBUG
1208 >       if (debug) GMessage("  W11match found: %*s\n", 1+fdlen, (adapter3.substr(fi,fdlen)).chars());
1209 > #endif
1210 >       if (extendMatch(seq.chars(), rlen, 1,
1211 >                     adapter5.chars(), a5len, fi,  fdlen, l5,l3, a5segs, true))
1212 >           return true;
1213 >       }
1214 >     //another easy case: last 11 characters of the adaptor found as a substring of the read
1215 >     GStr bstr=adapter5.substr(-fdlen);
1216 >     if ((fi=seq.index(bstr))>=0) {
1217 > #ifdef DEBUG
1218 >       if (debug) GMessage("  A11match found: %*s\n", fi+fdlen, bstr.chars());
1219 > #endif
1220 >       if (extendMatch(seq.chars(), rlen, fi,
1221 >                     adapter5.chars(), a5len, a5len-fdlen,  fdlen, l5,l3,a5segs,true))
1222 >          return true;
1223 >       }
1224 >     } //tried to matching at most 11 bases first
1225 >    
1226 > //-- no easy cases, do the wmer hashing for the last 12 bases of the adaptor
1227 > //-- only extend a wmer if it matches in the 5' (beginning) region of the read
1228 > int wordSize=3;
1229 > int hlen=12;
1230 > if (hlen>a5len-wordSize) hlen=a5len-wordSize;
1231 > int imax=rlen>>1; //first half of the read, right boundary for the wmer match
1232 > if (imax<a5len) { imax=GMIN(a5len, rlen-wordSize); }
1233 > for (int iw=0;iw<=hlen;iw++) {
1234 >   int apstart=a5len-iw-wordSize;
1235 >   fi=0;
1236 >   //int* qv=(int32 *)(adapter5.chars()+apstart);
1237 >   int qv=get3mer_value(adapter5.chars()+apstart);
1238 >   //while ((fi=fast4match(*qv, seq.chars(), rlen, fi))>=0 && fi<=imax) {
1239 >   while ((fi=w3_match(qv, seq.chars(), rlen, fi))>=0 && fi<=imax) {
1240 > #ifdef DEBUG
1241 >     if (debug) GMessage("    Wmatch found: %*s\n", fi+wordSize, (adapter5.substr(apstart,wordSize)).chars());
1242 > #endif
1243 >     if (extendMatch(seq.chars(), rlen, fi, adapter5.chars(),
1244 >                a5len, apstart, wordSize, l5,l3, a5segs, true)) return true;
1245 >     fi++;
1246 >     if (fi>imax) break;
1247 >     }
1248 >   } //for each wmer in the last hlen bases of the adaptor
1249 > /*
1250 >
1251 > //couldn't find a good trimming extension, hash 12 more bases of the adapter to collect more segment pairs there
1252 > //but only do this if we already have segment pairs collected in the last 12 bases of the adapter
1253 > if (a5segs.bend<(uint)(a5len-3) || a5segs.bstart>(uint)(a5len-hlen+4)) return false;
1254 > int hlen2=a5len-wordSize;
1255 > //if (hlen2>a5len-wordSize) hlen2=a5len-wordSize;
1256 > #ifdef DEBUG
1257 >      if (debug && a5segs.Count()>0) {
1258 >        GMessage("  >>>>>2nd. hash: %s\n",seq.chars());
1259 >        }
1260 > #endif
1261 > if (hlen2>hlen) {
1262 >     for (int iw=hlen+1;iw<=hlen2;iw++) {
1263 >         int apstart=a5len-iw-wordSize;
1264 >         fi=0;
1265 >         //int* qv=(int32 *)(adapter5.chars()+apstart);
1266 >         int qv=get3mer_value(adapter5.chars()+apstart);
1267 >         //while ((fi=fast4match(*qv, seq.chars(), rlen, fi))>=0 && fi<=imax) {
1268 >         while ((fi=w3_match(qv, seq.chars(), rlen, fi))>=0 && fi<=imax) {
1269 >           extendMatch(seq.chars(), rlen, fi, adapter5.chars(),
1270 >                      a5len, apstart, wordSize, l5,l3, a5segs, true);
1271 >           fi++;
1272 >           if (fi>imax) break;
1273 >           }
1274 >         } //for each wmer between hlen2 and hlen bases of the adaptor
1275 >     }
1276 > if (a5segs.bend<(uint)(a5len-3) || a5segs.bstart>(uint)(a5len-hlen+4)) return false;
1277 > // lastly, analyze collected a5segs for a possible gapped alignment:
1278 > GList<CSegChain> segchains(false,true,false);
1279 > #ifdef DEBUG
1280 > if (debug && a5segs.Count()>0) {
1281 >   GMessage(">>>>>>>>>   Read: %s\n",seq.chars());
1282     }
1283 < //no easy cases, find a triplet match as a seed for alignment extension
1284 < //find triplets at the right end of the adapter
1285 < for (int iw=0;iw<6;iw++) {
1286 <   apstart=a5len-iw-4;
1287 <   GStr aword=adapter5.substr(apstart,3);
1288 <   if ((fi=seq.index(aword))>=0) {
1289 <      if (extendMatch(seq.chars(), rlen, fi, adapter5.chars(),
1290 <                   a5len, apstart, 3, l5,l3)) {
1291 <                     return true;
1292 <                     }
1283 > #endif
1284 > for (int i=0;i<a5segs.Count();i++) {
1285 >   if (a5segs[i]->chain==NULL) {
1286 >       if (a5segs[i]->b.end<(int)(a5len-4)) continue; //don't start a hopeless chain
1287 >       CSegChain* newchain=new CSegChain(true);
1288 >       newchain->setFreeItem(false);
1289 >       newchain->addSegPair(a5segs[i]);
1290 >       a5segs[i]->chain=newchain;
1291 >       segchains.Add(newchain); //just to free them when done
1292 >       }
1293 >   for (int j=i+1;j<a5segs.Count();j++) {
1294 >      CSegChain* chain=a5segs[i]->chain;
1295 >      if (chain->extendChain(a5segs[j])) {
1296 >         a5segs[j]->chain=chain;
1297 > #ifdef DEBUG
1298 >         if (debug) dbgPrintChain(*chain, adapter5.chars());
1299 > #endif      
1300 >      //save time by checking here if the extended chain is already acceptable for trimming
1301 >         if (chain->bend>(uint)(a5len-3) && chain->astart<4 && chain->score>a_min_chain_score) {
1302 >            l5=chain->aend;
1303 >            l3=rlen-1;
1304 >            return true;
1305 >            }
1306 >         } //chain can be extended
1307        }
1308 <   }
1308 >   } //collect segment alignments into chains
1309 > */
1310   return false; //no adapter parts found
1311   }
1312  
1313 < //conversion of phred64 to phread33
1314 < void convertQ64(GStr& q) {
1315 < for (int i=0;i<q.length();i++) q[i]-=31;
1313 > //convert qvs to/from phred64 from/to phread33
1314 > void convertPhred(GStr& q) {
1315 > for (int i=0;i<q.length();i++) q[i]+=qv_cvtadd;
1316 > }
1317 >
1318 > void convertPhred(char* q, int len) {
1319 > for (int i=0;i<len;i++) q[i]+=qv_cvtadd;
1320 > }
1321 >
1322 > bool getFastxRec(GLineReader& fq, GStr& rseq, GStr& rqv,
1323 >          GStr& rname, GStr& rinfo, GStr& infname) {
1324 > rseq="";
1325 > rqv="";
1326 > rname="";
1327 > rinfo="";
1328 > if (fq.eof()) return false;
1329 > char* l=fq.getLine();
1330 > while (l!=NULL && (l[0]==0 || isspace(l[0]))) l=fq.getLine(); //ignore empty lines
1331 > if (l==NULL) return false;
1332 > /* if (rawFormat) {
1333 >      //TODO: implement raw qseq parsing here
1334 >      //if (raw type=N) then continue; //skip invalid/bad records
1335 >      } //raw qseq format
1336 > else { // FASTQ or FASTA */
1337 > isfasta=(l[0]=='>');
1338 > if (!isfasta && l[0]!='@') GError("Error: fasta/fastq record marker not found(%s)\n%s\n",
1339 >      infname.chars(), l);
1340 > GStr s(l);
1341 > rname=&(l[1]);
1342 > for (int i=0;i<rname.length();i++)
1343 >    if (rname[i]<=' ') {
1344 >       if (i<rname.length()-2) rinfo=rname.substr(i+1);
1345 >       rname.cut(i);
1346 >       break;
1347 >       }
1348 >  //now get the sequence
1349 > if ((l=fq.getLine())==NULL)
1350 >      GError("Error: unexpected EOF after header for read %s (%s)\n",
1351 >                   rname.chars(), infname.chars());
1352 > rseq=l; //this must be the DNA line
1353 > while ((l=fq.getLine())!=NULL) {
1354 >      //seq can span multiple lines
1355 >      if (l[0]=='>' || l[0]=='+') {
1356 >           fq.pushBack();
1357 >           break; //
1358 >           }
1359 >      rseq+=l;
1360 >      } //check for multi-line seq
1361 > if (!isfasta) { //reading fastq quality values, which can also be multi-line
1362 >    if ((l=fq.getLine())==NULL)
1363 >        GError("Error: unexpected EOF after sequence for %s\n", rname.chars());
1364 >    if (l[0]!='+') GError("Error: fastq qv header marker not detected!\n");
1365 >    if ((l=fq.getLine())==NULL)
1366 >        GError("Error: unexpected EOF after qv header for %s\n", rname.chars());
1367 >    rqv=l;
1368 >    //if (rqv.length()!=rseq.length())
1369 >    //  GError("Error: qv len != seq len for %s\n", rname.chars());
1370 >    while (rqv.length()<rseq.length() && ((l=fq.getLine())!=NULL)) {
1371 >      rqv+=l; //append to qv string
1372 >      }
1373 >    }// fastq
1374 > // } //<-- FASTA or FASTQ
1375 > rseq.upper(); //TODO: what if we care about masking?
1376 > return true;
1377   }
1378  
1379 < void convertQ64(char* q, int len) {
1380 < for (int i=0;i<len;i++) q[i]-=31;
1379 > char process_read(GStr& rname, GStr& rseq, GStr& rqv, int &l5, int &l3) {
1380 > //returns 0 if the read was untouched, 1 if it was just trimmed
1381 > // and a trash code if it was trashed
1382 > l5=0;
1383 > l3=rseq.length()-1;
1384 > if (l3-l5+1<min_read_len) {
1385 >   return 's';
1386 >   }
1387 > GStr wseq(rseq.chars());
1388 > GStr wqv(rqv.chars());
1389 > int w5=l5;
1390 > int w3=l3;
1391 > //first do the q-based trimming
1392 > if (qvtrim_qmin!=0 && !wqv.is_empty() && qtrim(wqv, w5, w3)) { // qv-threshold trimming
1393 >   if (w3-w5+1<min_read_len) {
1394 >     return 'Q'; //invalid read
1395 >     }
1396 >    //-- keep only the w5..w3 range
1397 >   l5=w5;
1398 >   l3=w3;
1399 >   wseq=wseq.substr(w5, w3-w5+1);
1400 >   if (!wqv.is_empty())
1401 >      wqv=wqv.substr(w5, w3-w5+1);
1402 >   } //qv trimming
1403 > // N-trimming on the remaining read seq
1404 > if (ntrim(wseq, w5, w3)) {
1405 >   //GMessage("before: %s\n",wseq.chars());
1406 >   //GMessage("after : %s (%d)\n",wseq.substr(w5,w3-w5+1).chars(),w3-w5+1);
1407 >   l5+=w5;
1408 >   l3-=(wseq.length()-1-w3);
1409 >   if (w3-w5+1<min_read_len) {
1410 >     return 'N'; //to be trashed
1411 >     }
1412 >    //-- keep only the w5..w3 range
1413 >   wseq=wseq.substr(w5, w3-w5+1);
1414 >   if (!wqv.is_empty())
1415 >      wqv=wqv.substr(w5, w3-w5+1);
1416 >   w5=0;
1417 >   w3=wseq.length()-1;
1418 >   }
1419 > if (a3len>0) {
1420 >  if (trim_adapter3(wseq, w5, w3)) {
1421 >     int trimlen=wseq.length()-(w3-w5+1);
1422 >     num_trimmed3++;
1423 >     if (trimlen<min_trimmed3)
1424 >         min_trimmed3=trimlen;
1425 >     l5+=w5;
1426 >     l3-=(wseq.length()-1-w3);
1427 >     if (w3-w5+1<min_read_len) {
1428 >         return '3';
1429 >         }
1430 >      //-- keep only the w5..w3 range
1431 >      wseq=wseq.substr(w5, w3-w5+1);
1432 >      if (!wqv.is_empty())
1433 >         wqv=wqv.substr(w5, w3-w5+1);
1434 >      }//some adapter was trimmed
1435 >   } //adapter trimming
1436 > if (a5len>0) {
1437 >  if (trim_adapter5(wseq, w5, w3)) {
1438 >     int trimlen=wseq.length()-(w3-w5+1);
1439 >     num_trimmed5++;
1440 >     if (trimlen<min_trimmed5)
1441 >         min_trimmed5=trimlen;
1442 >     l5+=w5;
1443 >     l3-=(wseq.length()-1-w3);
1444 >     if (w3-w5+1<min_read_len) {
1445 >         return '5';
1446 >         }
1447 >      //-- keep only the w5..w3 range
1448 >      wseq=wseq.substr(w5, w3-w5+1);
1449 >      if (!wqv.is_empty())
1450 >         wqv=wqv.substr(w5, w3-w5+1);
1451 >      }//some adapter was trimmed
1452 >   } //adapter trimming
1453 > if (doCollapse) {
1454 >   //keep read for later
1455 >   FqDupRec* dr=dhash.Find(wseq.chars());
1456 >   if (dr==NULL) { //new entry
1457 >          //if (prefix.is_empty())
1458 >             dhash.Add(wseq.chars(),
1459 >                  new FqDupRec(&wqv, rname.chars()));
1460 >          //else dhash.Add(wseq.chars(), new FqDupRec(wqv.chars(),wqv.length()));
1461 >         }
1462 >      else
1463 >         dr->add(wqv);
1464 >   } //collapsing duplicates
1465 > else { //not collapsing duplicates
1466 >   //apply the dust filter now
1467 >   if (doDust) {
1468 >     int dustbases=dust(wseq);
1469 >     if (dustbases>(wseq.length()>>1)) {
1470 >        return 'D';
1471 >        }
1472 >     }
1473 >   } //not collapsing duplicates
1474 > return (l5>0 || l3<rseq.length()-1) ? 1 : 0;
1475   }
1476  
1477 + void printHeader(FILE* f_out, char recmarker, GStr& rname, GStr& rinfo) {
1478 + //GMessage("printing Header..%c%s\n",recmarker, rname.chars());
1479 + if (rinfo.is_empty()) fprintf(f_out, "%c%s\n",recmarker,rname.chars());
1480 +  else fprintf(f_out, "%c%s %s\n",recmarker, rname.chars(), rinfo.chars());
1481 + }
1482 +
1483 + void writeRead(FILE* f_out, GStr& rname, GStr& rinfo, GStr& rseq, GStr& rqv, int& outcounter) {
1484 +   outcounter++;
1485 +   bool asFasta=(rqv.is_empty() || fastaOutput);
1486 +   if (asFasta) {
1487 +    if (prefix.is_empty()) {
1488 +       printHeader(f_out, '>',rname,rinfo);
1489 +       fprintf(f_out, "%s\n", rseq.chars()); //plain one-line fasta for now
1490 +       }
1491 +      else {
1492 +       fprintf(f_out, ">%s%08d\n%s\n", prefix.chars(), outcounter,
1493 +                          rseq.chars());
1494 +       }
1495 +     }
1496 +   else {  //fastq
1497 +    if (convert_phred) convertPhred(rqv);
1498 +    if (prefix.is_empty()) {
1499 +       printHeader(f_out, '@', rname, rinfo);
1500 +       fprintf(f_out, "%s\n+\n%s\n", rseq.chars(), rqv.chars());
1501 +       }
1502 +      else
1503 +       fprintf(f_out, "@%s_%08d\n%s\n+\n%s\n", prefix.chars(), outcounter,
1504 +                          rseq.chars(),rqv.chars() );
1505 +     }
1506 + }
1507 +
1508 + void trash_report(char trashcode, GStr& rname, FILE* freport) {
1509 + if (freport==NULL || trashcode<=' ') return;
1510 + if (trashcode=='3' || trashcode=='5') {
1511 +   fprintf(freport, "%s\tA%c\n",rname.chars(),trashcode);
1512 +   }
1513 + else {
1514 +   fprintf(freport, "%s\t%c\n",rname.chars(),trashcode);
1515 +   }
1516 + //tcounter++;
1517 + }
1518 +
1519 + GStr getFext(GStr& s, int* xpos=NULL) {
1520 + //s must be a filename without a path
1521 + GStr r("");
1522 + if (xpos!=NULL) *xpos=0;
1523 + if (s.is_empty() || s=="-") return r;
1524 + int p=s.rindex('.');
1525 + int d=s.rindex('/');
1526 + if (p<=0 || p>s.length()-2 || p<s.length()-7 || p<d) return r;
1527 + r=s.substr(p+1);
1528 + if (xpos!=NULL) *xpos=p+1;
1529 + r.lower();
1530 + return r;
1531 + }
1532 +
1533 + void baseFileName(GStr& fname) {
1534 + //remove all known extensions, like .txt,fq,fastq,fasta,fa)(.gz .gzip .bz2 .bzip2) .
1535 + int xpos=0;
1536 + GStr fext=getFext(fname, &xpos);
1537 + if (xpos<=1) return;
1538 + bool extdel=false;
1539 + GStr f2;
1540 + if (fext=="z" || fext=="zip") {
1541 +   extdel=true;
1542 +   }
1543 +  else if (fext.length()>=2) {
1544 +   f2=fext.substr(0,2);
1545 +   extdel=(f2=="gz" || f2=="bz");
1546 +   }
1547 + if (extdel) {
1548 +   fname.cut(xpos-1);
1549 +   fext=getFext(fname, &xpos);
1550 +   if (xpos<=1) return;
1551 +   }
1552 + extdel=false;
1553 + if (fext=="f" || fext=="fq" || fext=="txt" || fext=="seq" || fext=="sequence") {
1554 +   extdel=true;
1555 +   }
1556 +  else if (fext.length()>=2) {
1557 +   extdel=(fext.substr(0,2)=="fa");
1558 +   }
1559 + if (extdel) fname.cut(xpos-1);
1560 + GStr fncp(fname);
1561 + fncp.lower();
1562 + fncp.chomp("seq");
1563 + fncp.chomp("sequence");
1564 + fncp.trimR("_.");
1565 + if (fncp.length()<fname.length()) fname.cut(fncp.length());
1566 + }
1567 +
1568 + FILE* prepOutFile(GStr& infname, GStr& pocmd) {
1569 +  FILE* f_out=NULL;
1570 +  GStr fname(getFileName(infname.chars()));
1571 +  //eliminate known extensions
1572 +  baseFileName(fname);
1573 +  if (outsuffix.is_empty() || outsuffix=="-") { return stdout; }
1574 +    else if (pocmd.is_empty()) {
1575 +               GStr oname(fname);
1576 +               oname.append('.');
1577 +               oname.append(outsuffix);
1578 +               f_out=fopen(oname.chars(),"w");
1579 +               if (f_out==NULL) GError("Error: cannot create '%s'\n",oname.chars());
1580 +               }
1581 +            else {
1582 +              GStr oname(">");
1583 +              oname.append(fname);
1584 +              oname.append('.');
1585 +              oname.append(outsuffix);
1586 +              pocmd.append(oname);
1587 +              f_out=popen(pocmd.chars(), "w");
1588 +              if (f_out==NULL) GError("Error: cannot popen '%s'\n",pocmd.chars());
1589 +              }
1590 + return f_out;
1591 + }
1592 +
1593 + void guess_unzip(GStr& fname, GStr& picmd) {
1594 + GStr fext=getFext(fname);
1595 + if (fext=="gz" || fext=="gzip" || fext=="z") {
1596 +    picmd="gzip -cd ";
1597 +    }
1598 +   else if (fext=="bz2" || fext=="bzip2" || fext=="bz" || fext=="bzip") {
1599 +    picmd="bzip2 -cd ";
1600 +    }
1601 + }
1602 +
1603 + void setupFiles(FILE*& f_in, FILE*& f_in2, FILE*& f_out, FILE*& f_out2,
1604 +                       GStr& s, GStr& infname, GStr& infname2) {
1605 + // uses outsuffix to generate output file names and open file handles as needed
1606 + infname="";
1607 + infname2="";
1608 + f_in=NULL;
1609 + f_in2=NULL;
1610 + f_out=NULL;
1611 + f_out2=NULL;
1612 + //analyze outsuffix intent
1613 + GStr pocmd;
1614 + GStr ox=getFext(outsuffix);
1615 + if (ox.length()>2) ox=ox.substr(0,2);
1616 + if (ox=="gz") pocmd="gzip -9 -c ";
1617 +   else if (ox=="bz") pocmd="bzip2 -9 -c ";
1618 + if (s=="-") {
1619 +    f_in=stdin;
1620 +    infname="stdin";
1621 +    f_out=prepOutFile(infname, pocmd);
1622 +    return;
1623 +    } // streaming from stdin
1624 + s.startTokenize(",:");
1625 + s.nextToken(infname);
1626 + bool paired=s.nextToken(infname2);
1627 + if (fileExists(infname.chars())==0)
1628 +    GError("Error: cannot find file %s!\n",infname.chars());
1629 + GStr fname(getFileName(infname.chars()));
1630 + GStr picmd;
1631 + guess_unzip(fname, picmd);
1632 + if (picmd.is_empty()) {
1633 +   f_in=fopen(infname.chars(), "r");
1634 +   if (f_in==NULL) GError("Error opening file '%s'!\n",infname.chars());
1635 +   }
1636 +  else {
1637 +   picmd.append(infname);
1638 +   f_in=popen(picmd.chars(), "r");
1639 +   if (f_in==NULL) GError("Error at popen %s!\n", picmd.chars());
1640 +   }
1641 + f_out=prepOutFile(infname, pocmd);
1642 + if (!paired) return;
1643 + if (doCollapse) GError("Error: sorry, -C option cannot be used with paired reads!\n");
1644 + // ---- paired reads:-------------
1645 + if (fileExists(infname2.chars())==0)
1646 +     GError("Error: cannot find file %s!\n",infname2.chars());
1647 + picmd="";
1648 + GStr fname2(getFileName(infname2.chars()));
1649 + guess_unzip(fname2, picmd);
1650 + if (picmd.is_empty()) {
1651 +   f_in2=fopen(infname2.chars(), "r");
1652 +   if (f_in2==NULL) GError("Error opening file '%s'!\n",infname2.chars());
1653 +   }
1654 +  else {
1655 +   picmd.append(infname2);
1656 +   f_in2=popen(picmd.chars(), "r");
1657 +   if (f_in2==NULL) GError("Error at popen %s!\n", picmd.chars());
1658 +   }
1659 + f_out2=prepOutFile(infname2, pocmd);
1660 + }

Diff Legend

Removed lines
+ Added lines
< Changed lines
> Changed lines