ViewVC Help
View File | Revision Log | Show Annotations | View Changeset | Root Listing
root/gclib/fqtrim/fqtrim.cpp
(Generate patch)
# Line 2 | Line 2
2   #include "GStr.h"
3   #include "GHash.hh"
4   #include "GList.hh"
5 + #include <ctype.h>
6 + #include "GAlnExtend.h"
7  
8   #define USAGE "Usage:\n\
9 < fqtrim [-5 <5'adapter>] [-3 <3'adapter>] [-l <minlen>] [-C] [-D] [-Q] \\\n\
10 <    [-n <rename_prefix>] [-o <trimmed.fq>] [-r <discarded.lst>] <input.fq>\n\
9 > fqtrim [{-5 <5adapter> -3 <3adapter>|-f <adapters_file>}] [-a <min_matchlen>]\\\n\
10 >   [-R] [-q <minq> [-t <trim_max_len>]] [-p {64|33}] [-o <outsuffix>]\\\n\
11 >   [-l <minlen>] [-C] [-D] [-Q] [-n <rename_prefix>] [-r <discarded.lst>]\\\n\
12 >    <input.fq>[,<input_mates.fq>\n\
13   \n\
14 < Trim low quality bases at 3' end, optional adapter, filter for low complexity and\n\
15 < optionally collapse duplicates in the input fastq data.\n\
14 > Trim low quality bases at the 3' end and can trim adapter sequence(s), filter\n\
15 > for low complexity and collapse duplicate reads.\n\
16 > If read pairs should be trimmed and kept together (i.e. without discarding\n\
17 > one read in a pair), the two file names should be given delimited by a comma\n\
18 > or a colon character\n\
19   \n\
20   Options:\n\
21   -n  rename all the reads using the <prefix> followed by a read counter;\n\
22      if -C option was given, the suffix \"_x<N>\" is appended, with <N> being\n\
23      the read duplication count\n\
24 < -o  write the trimmed/filtered fastq into <trimmed.fq>(instead of stdout)\n\
24 > -o  unless this parameter is '-', write the trimmed/filtered reads to \n\
25 >    file(s) named <input>.<outsuffix> which will be created in the \n\
26 >    current (working) directory; (writes to stdout if -o- is given);\n\
27 >    a suffix ending with .gz, .gzip or .bz2 will enforce compression\n\
28 > -f  file with adapter sequences to trim, each line having this format:\n\
29 >    <5'-adapter-sequence> <3'-adapter-sequence>\n\
30   -5  trim the given adapter or primer sequence at the 5' end of each read\n\
31      (e.g. -5 CGACAGGTTCAGAGTTCTACAGTCCGACGATC)\n\
32   -3  trim the given adapter sequence at the 3' end of each read\n\
33      (e.g. -3 TCGTATGCCGTCTTCTGCTTG)\n\
34 < -l  minimum \"clean\" length at the high quality 5'end that a read must\n\
35 <    have in order to pass the filter (default: 16)\n\
34 > -A  disable polyA/T trimming (enabled by default)\n\
35 > -y  minimum length of exact match to adaptor sequence at the proper end (6)\n\
36 > -q  trim bases with quality value lower than <minq> (starting at the 3' end)\n\
37 > -t  for -q option, maximum trimming at the 3' end is limited to <trim_max_len>\n\
38 > -m  maximum percentage of Ns allowed in a read after trimming (default 7)\n\
39 > -l  minimum \"clean\" length after trimming that a read must have\n\
40 >    in order to pass the filter (default: 16)\n\
41   -r  write a simple \"trash report\" file listing the discarded reads with a\n\
42      one letter code for the reason why they were trashed\n\
43 + -D  apply a low-complexity (dust) filter and discard any read that has over\n\
44 +    50% of its length detected as low complexity\n\
45   -C  collapse duplicate reads and add a prefix with their count to the read\n\
46 <    name\n\
47 < -D  apply a low-complexity (dust) filter and discard any read that has at\n\
48 <    least half of it masked by this\n\
49 < -Q  quality values in the input data are interpreted as phred64, convert\n\
31 <    them to phred33\n\
46 >    name (e.g. for microRNAs)\n\
47 > -p  input is phred64/phred33 (use -p64 or -p33)\n\
48 > -Q  convert quality values to the other Phred qv type\n\
49 > -V  verbose processing\n\
50   "
51 +
52 + //-z  for -o option, the output stream(s) will be first piped into the given\n
53 + //   <zcmd> command, which must output to stdout (e.g. -z 'bzip2 -9 -c')\n
54 +
55 +
56   // example 3' adapter for miRNAs: TCGTATGCCGTCTTCTGCTTG
57 < FILE* f_out=NULL; //stdout if not provided
58 < FILE* f_in=NULL; //input fastq (stdin if not provided)
57 >
58 > //For paired reads sequencing:
59 > //3' : ACACTCTTTCCCTACACGACGCTCTTCCGATCT
60 > //5' : GATCGGAAGAGCGGTTCAGCAGGAATGCCGAG
61 > //FILE* f_out=NULL; //stdout if not provided
62 > //FILE* f_out2=NULL; //for paired reads
63 > //FILE* f_in=NULL; //input fastq (stdin if not provided)
64 > //FILE* f_in2=NULL; //for paired reads
65 >
66   FILE* freport=NULL;
67 +
68   bool debug=false;
69 + bool verbose=false;
70   bool doCollapse=false;
71   bool doDust=false;
72 + bool doPolyTrim=true;
73 + bool fastaOutput=false;
74   bool trashReport=false;
75 < bool rawFormat=false;
75 > bool revCompl=false; //also reverse complement adaptor sequences
76   int min_read_len=16;
77 + double max_perc_N=7.0;
78   int dust_cutoff=16;
79   bool isfasta=false;
80 < bool phred64=false;
80 > bool convert_phred=false;
81 > GStr outsuffix; // -o
82   GStr prefix;
83 < GStr adapter3;
84 < GStr adapter5;
85 < int a3len=0;
86 < int a5len=0;
87 < // adaptor matching metrics -- for extendMatch() function
88 < static const int a_m_score=2; //match score
89 < static const int a_mis_score=-3; //mismatch
90 < static const int a_dropoff_score=7;
91 < static const int a_min_score=7;
83 > GStr zcmd;
84 > int num_trimmed5=0;
85 > int num_trimmed3=0;
86 > int num_trimmedN=0;
87 > int num_trimmedQ=0;
88 > int min_trimmed5=INT_MAX;
89 > int min_trimmed3=INT_MAX;
90 >
91 > int qvtrim_qmin=0;
92 > int qvtrim_max=0;  //for -q, do not trim at 3'-end more than this number of bases
93 > int qv_phredtype=0; // could be 64 or 33 (0 means undetermined yet)
94 > int qv_cvtadd=0; //could be -31 or +31
95 >
96 > // adaptor matching metrics -- for X-drop ungapped extension
97 > const int match_reward=2;
98 > const int mismatch_penalty=3;
99 > const int Xdrop=8;
100 >
101 > const int poly_m_score=2; //match score
102 > const int poly_mis_score=-3; //mismatch
103 > const int poly_dropoff_score=7;
104 > int poly_minScore=12; //i.e. an exact match of 6 bases at the proper ends WILL be trimmed
105 >
106 > const char *polyA_seed="AAAA";
107 > const char *polyT_seed="TTTT";
108 >
109 > struct CASeqData {
110 >   //positional data for every possible hexamer in an adapter
111 >   GVec<uint16>* pz[4096]; //0-based coordinates of all possible hexamers in the adapter sequence
112 >   GVec<uint16>* pzr[4096]; //0-based coordinates of all possible hexamers for the reverse complement of the adapter sequence
113 >   GStr seq; //actual adapter sequence data
114 >   GStr seqr; //reverse complement sequence
115 >   bool use_reverse;
116 >   CASeqData(bool rev=false):seq(),seqr() {
117 >     use_reverse=rev;
118 >     for (int i=0;i<4096;i++) {
119 >        pz[i]=NULL;
120 >        pzr[i]=NULL;
121 >        }
122 >     }
123 >
124 >   void update(const char* s) {
125 >   seq=s;
126 >   table6mers(seq.chars(), seq.length(), pz);
127 >   if (!use_reverse) return;
128 >   //reverse complement
129 >   seqr=s;
130 >   int slen=seq.length();
131 >   for (int i=0;i<slen;i++)
132 >     seqr[i]=ntComplement(seq[slen-i-1]);
133 >   table6mers(seqr.chars(), seqr.length(), pzr);
134 >   }
135 >
136 >   ~CASeqData() {
137 >     for (int i=0;i<4096;i++) {
138 >       delete pz[i];
139 >       delete pzr[i];
140 >     }
141 >   }
142 > };
143 >
144 > GVec<CASeqData> adapters5;
145 > GVec<CASeqData> adapters3;
146 >
147 > CGreedyAlignData* gxmem_l=NULL;
148 > CGreedyAlignData* gxmem_r=NULL;
149  
150   // element in dhash:
151   class FqDupRec {
# Line 91 | Line 184
184    if (!s.is_empty()) {
185        if (s=='-') f=stdout;
186        else {
187 <       f=fopen(s,"w");
187 >       f=fopen(s.chars(),"w");
188         if (f==NULL) GError("Error creating file: %s\n", s.chars());
189         }
190       }
# Line 102 | Line 195
195  
196   GHash<FqDupRec> dhash; //hash to keep track of duplicates
197  
198 < bool ntrim(GStr& rseq, GStr &rqv, int &l5, int &l3); //returns true if any trimming occured
199 <
198 > void addAdapter(GVec<CASeqData>& adapters, GStr& seq);
199 > int loadAdapters(const char* fname);
200 >
201 > void setupFiles(FILE*& f_in, FILE*& f_in2, FILE*& f_out, FILE*& f_out2,
202 >                       GStr& s, GStr& infname, GStr& infname2);
203 > // uses outsuffix to generate output file names and open file handles as needed
204 >
205 > void writeRead(FILE* f_out, GStr& rname, GStr& rinfo, GStr& rseq, GStr& rqv, int& outcounter);
206 > void trash_report(char trashcode, GStr& rname, FILE* freport);
207 >
208 > bool getFastxRec(GLineReader& fq, GStr& rseq, GStr& rqv,
209 >          GStr& rname, GStr& rinfo, GStr& infname);
210 >
211 > char process_read(GStr& rname, GStr& rseq, GStr& rqv, int &l5, int &l3);
212 > //returns 0 if the read was untouched, 1 if it was trimmed and a trash code if it was trashed
213 >
214 > bool ntrim(GStr& rseq, int &l5, int &l3); //returns true if any trimming occured
215 > bool qtrim(GStr& qvs, int &l5, int &l3); //return true if any trimming occured
216   int dust(GStr& seq);
217 < bool trim_adapter3(GStr& seq, int &l5, int &l3); //returns true if any trimming occured
217 > bool trim_poly5(GStr &seq, int &l5, int &l3, const char* poly_seed); //returns true if any trimming occured
218 > bool trim_poly3(GStr &seq, int &l5, int &l3, const char* poly_seed);
219   bool trim_adapter5(GStr& seq, int &l5, int &l3); //returns true if any trimming occured
220 + bool trim_adapter3(GStr& seq, int &l5, int &l3);
221  
222 < void convertQ64(char* q, int len);
223 < void convertQ64(GStr& q);
222 > void convertPhred(char* q, int len);
223 > void convertPhred(GStr& q);
224  
225   int main(int argc, char * const argv[]) {
226 <  GArgs args(argc, argv, "YQRDCl:d:3:5:n:r:o:");
226 >  GArgs args(argc, argv, "YQDCRVAl:d:3:5:m:n:r:p:q:f:t:o:z:a:");
227    int e;
117  int icounter=0; //counter for input reads
118  int outcounter=0; //counter for output reads
228    if ((e=args.isError())>0) {
229        GMessage("%s\nInvalid argument: %s\n", USAGE, argv[e]);
230        exit(224);
231        }
232    debug=(args.getOpt('Y')!=NULL);
233 <  phred64=(args.getOpt('Q')!=NULL);
233 >  verbose=(args.getOpt('V')!=NULL);
234 >  convert_phred=(args.getOpt('Q')!=NULL);
235    doCollapse=(args.getOpt('C')!=NULL);
236    doDust=(args.getOpt('D')!=NULL);
237 +  revCompl=(args.getOpt('R')!=NULL);
238 +  if (args.getOpt('A')) doPolyTrim=false;
239 +  /*
240    rawFormat=(args.getOpt('R')!=NULL);
241    if (rawFormat) {
242      GError("Sorry, raw qseq format parsing is not implemented yet!\n");
243      }
244 +  */
245    prefix=args.getOpt('n');
246    GStr s=args.getOpt('l');
247 <  if (!s.is_empty())
247 >  if (!s.is_empty())
248       min_read_len=s.asInt();
249 +  s=args.getOpt('m');
250 +  if (!s.is_empty())
251 +     max_perc_N=s.asDouble();
252    s=args.getOpt('d');
253    if (!s.is_empty()) {
254       dust_cutoff=s.asInt();
255       doDust=true;
256       }
257 <    
258 <  if (args.getOpt('3')!=NULL) {
259 <    adapter3=args.getOpt('3');
260 <    adapter3.upper();
261 <    a3len=adapter3.length();
262 <    }
263 <  if (args.getOpt('5')!=NULL) {
264 <    adapter5=args.getOpt('5');
265 <    adapter5.upper();
266 <    a5len=adapter5.length();
257 >  s=args.getOpt('q');
258 >  if (!s.is_empty()) {
259 >     qvtrim_qmin=s.asInt();
260 >     }
261 >  s=args.getOpt('t');
262 >  if (!s.is_empty()) {
263 >     qvtrim_max=s.asInt();
264 >     }
265 >  s=args.getOpt('p');
266 >  if (!s.is_empty()) {
267 >     int v=s.asInt();
268 >     if (v==33) {
269 >        qv_phredtype=33;
270 >        qv_cvtadd=31;
271 >        }
272 >      else if (v==64) {
273 >        qv_phredtype=64;
274 >        qv_cvtadd=-31;
275 >        }
276 >       else
277 >         GMessage("%s\nInvalid value for -p option (can only be 64 or 33)!\n",USAGE);
278 >     }
279 >  s=args.getOpt('f');
280 >  if (!s.is_empty()) {
281 >   loadAdapters(s.chars());
282 >   }
283 >  bool fileAdapters=adapters5.Count()+adapters3.Count();
284 >  s=args.getOpt('5');
285 >  if (!s.is_empty()) {
286 >    if (fileAdapters)
287 >      GError("Error: options -5 and -f cannot be used together!\n");
288 >    s.upper();
289 >    addAdapter(adapters5, s);
290      }
291 +  s=args.getOpt('3');
292 +  if (!s.is_empty()) {
293 +    if (fileAdapters)
294 +      GError("Error: options -3 and -f cannot be used together!\n");
295 +      s.upper();
296 +      addAdapter(adapters3, s);
297 +    }
298 +  s=args.getOpt('y');
299 +  if (!s.is_empty()) {
300 +     int minmatch=s.asInt();
301 +     poly_minScore=minmatch*poly_m_score;
302 +     }
303 +
304 +  if (args.getOpt('o')!=NULL) outsuffix=args.getOpt('o');
305 +                         else outsuffix="-";
306    trashReport=  (args.getOpt('r')!=NULL);
307 <  if (args.startNonOpt()==0) {
307 >  int fcount=args.startNonOpt();
308 >  if (fcount==0) {
309      GMessage(USAGE);
310      exit(224);
311      }
312 <
313 <  openfw(f_out, args, 'o');
314 <  if (f_out==NULL) f_out=stdout;
312 >   if (fcount>1 && doCollapse) {
313 >    GError("%s Sorry, the -C option only works with a single input.\n", USAGE);
314 >    }
315 >  //openfw(f_out, args, 'o');
316 >  //if (f_out==NULL) f_out=stdout;
317    if (trashReport)
318      openfw(freport, args, 'r');
319    char* infile=NULL;
320 +
321 +  if (adapters5.Count()>0)
322 +    gxmem_l=new CGreedyAlignData(match_reward, mismatch_penalty, Xdrop-2);
323 +  if (adapters3.Count()>0)
324 +    gxmem_r=new CGreedyAlignData(match_reward, mismatch_penalty, Xdrop);
325 +
326    while ((infile=args.nextNonOpt())!=NULL) {
327 <    GStr infname(infile);
328 <    if (strcmp(infile,"-")==0) {
329 <       f_in=stdin; infname="stdin"; }
330 <     else {
331 <        f_in=fopen(infile,"r");
332 <        if (f_in==NULL)
333 <            GError("Cannot open input file %s!\n",infile);
334 <        }
335 <     GLineReader fq(f_in);
336 <     char* l=NULL;
337 <     while ((l=fq.getLine())!=NULL) {
338 <        GStr rname; //current read name
339 <        GStr rseq;  //current read sequence
340 <        GStr rqv;   //current read quality values
341 <        GStr s;
342 <        if (rawFormat) {
343 <          //TODO: implement qseq parsing here
344 <          //if (raw type=N) then continue; //skip invalid/bad records
345 <          
346 <          } //raw qseq format
347 <         else { // FASTQ or FASTA
348 <          isfasta=(l[0]=='>');
349 <          if (!isfasta && l[0]!='@') GError("Error: fastq record marker not detected!\n");
350 <          s=l;
351 <          rname=&(l[1]);
352 <          icounter++;
353 <          //GMessage("readname=%s\n",rname.chars());
354 <          for (int i=0;i<rname.length();i++)
355 <            if (rname[i]<=' ') { rname.cut(i); break; }
356 <          //now get the sequence
357 <          if ((l=fq.getLine())==NULL)
358 <              GError("Error: unexpected EOF after header for %s\n",rname.chars());
359 <          rseq=l; //this must be the DNA line
360 <          if (isfasta) {
361 <            while ((l=fq.getLine())!=NULL) {
362 <              //fasta can have multiple sequence lines
363 <              if (l[0]=='>') {
364 <                   fq.pushBack();
365 <                   break; //
327 >    //for each input file
328 >    int incounter=0; //counter for input reads
329 >    int outcounter=0; //counter for output reads
330 >    int trash_s=0; //too short from the get go
331 >    int trash_Q=0;
332 >    int trash_N=0;
333 >    int trash_D=0;
334 >    int trash_poly=0;
335 >    int trash_A3=0;
336 >    int trash_A5=0;
337 >    s=infile;
338 >    GStr infname;
339 >    GStr infname2;
340 >    FILE* f_in=NULL;
341 >    FILE* f_in2=NULL;
342 >    FILE* f_out=NULL;
343 >    FILE* f_out2=NULL;
344 >    bool paired_reads=false;
345 >    setupFiles(f_in, f_in2, f_out, f_out2, s, infname, infname2);
346 >    GLineReader fq(f_in);
347 >    GLineReader* fq2=NULL;
348 >    if (f_in2!=NULL) {
349 >       fq2=new GLineReader(f_in2);
350 >       paired_reads=true;
351 >       }
352 >    GStr rseq, rqv, seqid, seqinfo;
353 >    GStr rseq2, rqv2, seqid2, seqinfo2;
354 >    while (getFastxRec(fq, rseq, rqv, seqid, seqinfo, infname)) {
355 >       incounter++;
356 >       int a5=0, a3=0, b5=0, b3=0;
357 >       char tcode=0, tcode2=0;
358 >       tcode=process_read(seqid, rseq, rqv, a5, a3);
359 >       if (fq2!=NULL) {
360 >            getFastxRec(*fq2, rseq2, rqv2, seqid2, seqinfo2, infname2);
361 >            if (seqid.substr(0,seqid.length()-1)!=seqid2.substr(0,seqid2.length()-1)) {
362 >               GError("Error: no paired match for read %s vs %s (%s,%s)\n",
363 >                  seqid.chars(), seqid2.chars(), infname.chars(), infname2.chars());
364 >               }
365 >            tcode2=process_read(seqid2, rseq2, rqv2, b5, b3);
366 >            //decide what to do with this pair and print rseq2 if the pair makes it
367 >            if (tcode>1 && tcode2<=1) {
368 >               //"untrash" rseq
369 >               if (a3-a5+1<min_read_len) {
370 >                   a5=1;
371 >                   if (a3<min_read_len) { a3= GMIN(rseq.length()-1, min_read_len+1); }
372                     }
373 <              rseq+=l;
374 <              }
375 <            } //multi-line fasta file
376 <          if (!isfasta) {
377 <            if ((l=fq.getLine())==NULL)
378 <                GError("Error: unexpected EOF after sequence for %s\n", rname.chars());
379 <            if (l[0]!='+') GError("Error: fastq qv header marker not detected!\n");
380 <            if ((l=fq.getLine())==NULL)
381 <                GError("Error: unexpected EOF after qv header for %s\n", rname.chars());
382 <            rqv=l;
383 <            if (rqv.length()!=rseq.length())
384 <                GError("Error: qv len != seq len for %s\n", rname.chars());
385 <            }
386 <        } //<-- FASTA or FASTQ
217 <        rseq.upper();
218 <        int l5=0;
219 <        int l3=rseq.length()-1;
220 <        if (l3-l5+1<min_read_len) {
221 <           if (trashReport) {
222 <                  fprintf(freport, "%s\ts\t%s\n",rname.chars(), rseq.chars());
223 <                  }
224 <           continue;
225 <           }
226 <        if (ntrim(rseq, rqv, l5, l3)) {
227 <           if (l3-l5+1<min_read_len) {
228 <             if (trashReport) {
229 <                    fprintf(freport, "%s\tN\t%s\n", rname.chars(), rseq.chars());
230 <                    }
231 <             continue; //invalid read
232 <             }
233 <            //-- keep only the l5..l3 range
234 <           rseq=rseq.substr(l5, l3-l5+1);
235 <           if (!rqv.is_empty())
236 <              rqv=rqv.substr(l5, l3-l5+1);
237 <           }
238 <          
239 <        if (a3len>0) {
240 <          if (trim_adapter3(rseq, l5, l3)) {
241 <             if (l3-l5+1<min_read_len) {
242 <                 if (trashReport) {
243 <                     fprintf(freport, "%s\tA3\t%s\n",rname.chars(), rseq.chars());
244 <                     }
245 <                 continue;
246 <                 }
247 <              //-- keep only the l5..l3 range
248 <              rseq=rseq.substr(l5, l3-l5+1);
249 <              if (!rqv.is_empty())
250 <                 rqv=rqv.substr(l5, l3-l5+1);
251 <              }//some adapter was trimmed
252 <           } //adapter trimming
253 <        if (a5len>0) {
254 <          if (trim_adapter5(rseq, l5, l3)) {
255 <             if (l3-l5+1<min_read_len) {
256 <                 if (trashReport) {
257 <                     fprintf(freport, "%s\tA5\t%s\n",rname.chars(), rseq.chars());
258 <                     }
259 <                 continue;
373 >               tcode=1;
374 >               }
375 >             else if (tcode<=1 && tcode2>1) {
376 >               //"untrash" rseq2
377 >               if (b3-b5+1<min_read_len) {
378 >                   b5=1;
379 >                   if (b3<min_read_len) { b3= GMIN((rseq2.length()-1),(min_read_len+1)); }
380 >                   }
381 >               tcode2=1;
382 >               }
383 >            if (tcode<=1) { //trimmed or left intact -- write it!
384 >               if (tcode>0) {
385 >                 rseq2=rseq2.substr(b5,b3-b5+1);
386 >                 if (!rqv2.is_empty()) rqv2=rqv2.substr(b5,b3-b5+1);
387                   }
388 <              //-- keep only the l5..l3 range
389 <              rseq=rseq.substr(l5, l3-l5+1);
390 <              if (!rqv.is_empty())
391 <                 rqv=rqv.substr(l5, l3-l5+1);
392 <              }//some adapter was trimmed
393 <           } //adapter trimming
394 <        if (doCollapse) {
395 <           //keep read for later
396 <           FqDupRec* dr=dhash.Find(rseq.chars());
397 <           if (dr==NULL) { //new entry
398 <                  //if (prefix.is_empty())
399 <                     dhash.Add(rseq.chars(),
400 <                          new FqDupRec(&rqv, rname.chars()));
401 <                  //else dhash.Add(rseq.chars(), new FqDupRec(rqv.chars(),rqv.length()));
388 >               int nocounter=0;
389 >               writeRead(f_out2, seqid2, seqinfo2, rseq2, rqv2, nocounter);
390 >               }
391 >            } //pair read
392 >       if (tcode>1) { //trashed
393 >         #ifdef GDEBUG
394 >         GMessage(" !!!!TRASH => 'N'\n");
395 >         #endif
396 >          if (tcode=='s') trash_s++;
397 >          else if (tcode=='A' || tcode=='T') trash_poly++;
398 >            else if (tcode=='Q') trash_Q++;
399 >              else if (tcode=='N') trash_N++;
400 >               else if (tcode=='D') trash_D++;
401 >                else if (tcode=='3') trash_A3++;
402 >                 else if (tcode=='5') trash_A5++;
403 >          if (trashReport) trash_report(tcode, seqid, freport);
404 >          }
405 >         else if (!doCollapse) { //write it
406 >          if (tcode>0) {
407 >            rseq=rseq.substr(a5,a3-a5+1);
408 >            if (!rqv.is_empty()) rqv=rqv.substr(a5,a3-a5+1);
409 >            }
410 >         #ifdef GDEBUG
411 >            GMessage("  After trimming:\n");
412 >            GMessage("%s\n",rseq.chars());
413 >         #endif
414 >          writeRead(f_out, seqid, seqinfo, rseq, rqv, outcounter);
415 >          }
416 >       } //for each fastq record
417 >    delete fq2;
418 >    FRCLOSE(f_in);
419 >    FRCLOSE(f_in2);
420 >    if (doCollapse) {
421 >       outcounter=0;
422 >       int maxdup_count=1;
423 >       char* maxdup_seq=NULL;
424 >       dhash.startIterate();
425 >       FqDupRec* qd=NULL;
426 >       char* seq=NULL;
427 >       while ((qd=dhash.NextData(seq))!=NULL) {
428 >         GStr rseq(seq);
429 >         //do the dusting here
430 >         if (doDust) {
431 >            int dustbases=dust(rseq);
432 >            if (dustbases>(rseq.length()>>1)) {
433 >               if (trashReport && qd->firstname!=NULL) {
434 >                 fprintf(freport, "%s_x%d\tD\n",qd->firstname, qd->count);
435                   }
436 <              else    
437 <                 dr->add(rqv);
438 <           } //collapsing duplicates
439 <         else { //not collapsing duplicates
440 <           //do the dust filter now
441 <           if (doDust) {
442 <             int dustbases=dust(rseq);
443 <             if (dustbases>(rseq.length()>>1)) {
444 <                if (trashReport) {
445 <                  fprintf(freport, "%s\tD\t%s\n",rname.chars(),rseq.chars());
446 <                  }
447 <                continue;
448 <                }
289 <             }
290 <           //print this record here  
291 <           outcounter++;
292 <           if (isfasta) {
293 <            if (prefix.is_empty())
294 <               fprintf(f_out, ">%s\n%s\n", rname.chars(), rseq.chars());
295 <              else
296 <               fprintf(f_out, ">%s%08d\n%s\n", prefix.chars(), outcounter,
297 <                                  rseq.chars());
436 >               trash_D+=qd->count;
437 >               continue;
438 >               }
439 >            }
440 >         outcounter++;
441 >         if (qd->count>maxdup_count) {
442 >            maxdup_count=qd->count;
443 >            maxdup_seq=seq;
444 >            }
445 >         if (isfasta) {
446 >           if (prefix.is_empty()) {
447 >             fprintf(f_out, ">%s_x%d\n%s\n", qd->firstname, qd->count,
448 >                           rseq.chars());
449               }
450 <           else {  //fastq
451 <            if (phred64) convertQ64(rqv);
452 <            if (prefix.is_empty())
302 <               fprintf(f_out, "@%s\n%s\n+\n%s\n", rname.chars(), rseq.chars(),rqv.chars());
303 <              else
304 <               fprintf(f_out, "@%s_%08d\n%s\n+\n%s\n", prefix.chars(), outcounter,
305 <                                  rseq.chars(),rqv.chars() );
450 >           else { //use custom read name
451 >             fprintf(f_out, ">%s%08d_x%d\n%s\n", prefix.chars(), outcounter,
452 >                        qd->count, rseq.chars());
453               }
454 <           } //not collapsing duplicates
455 <        } //for each fastq record
456 <   } //while each input file
457 < FRCLOSE(f_in);
458 < if (doCollapse) {
459 <    outcounter=0;
313 <    int maxdup_count=1;
314 <    char* maxdup_seq=NULL;
315 <    dhash.startIterate();
316 <    FqDupRec* qd=NULL;
317 <    char* seq=NULL;
318 <    while ((qd=dhash.NextData(seq))!=NULL) {
319 <      GStr rseq(seq);
320 <      //do the dusting here
321 <      if (doDust) {
322 <         int dustbases=dust(rseq);
323 <         if (dustbases>(rseq.length()>>1)) {
324 <            if (trashReport && qd->firstname!=NULL) {
325 <              fprintf(freport, "%s_x%d\tD\t%s\n",qd->firstname, qd->count,seq);
326 <              }
327 <            continue;
454 >           }
455 >         else { //fastq format
456 >          if (convert_phred) convertPhred(qd->qv, qd->len);
457 >          if (prefix.is_empty()) {
458 >            fprintf(f_out, "@%s_x%d\n%s\n+\n%s\n", qd->firstname, qd->count,
459 >                           rseq.chars(), qd->qv);
460              }
461 +          else { //use custom read name
462 +            fprintf(f_out, "@%s%08d_x%d\n%s\n+\n%s\n", prefix.chars(), outcounter,
463 +                        qd->count, rseq.chars(), qd->qv);
464 +            }
465 +           }
466 +         }//for each element of dhash
467 +       if (maxdup_count>1) {
468 +         GMessage("Maximum read multiplicity: x %d (read: %s)\n",maxdup_count, maxdup_seq);
469           }
470 <      outcounter++;
471 <      if (qd->count>maxdup_count) {
472 <         maxdup_count=qd->count;
473 <         maxdup_seq=seq;
474 <         }
475 <      if (isfasta) {
476 <        if (prefix.is_empty()) {
477 <          fprintf(f_out, ">%s_x%d\n%s\n", qd->firstname, qd->count,
478 <                        rseq.chars());
479 <          }
480 <        else { //use custom read name
481 <          fprintf(f_out, ">%s%08d_x%d\n%s\n", prefix.chars(), outcounter,
482 <                     qd->count, rseq.chars());
470 >       } //collapse entries
471 >    if (verbose) {
472 >       if (paired_reads) {
473 >           GMessage(">Input files : %s , %s\n", infname.chars(), infname2.chars());
474 >           GMessage("Number of input pairs :%9d\n", incounter);
475 >           GMessage("         Output pairs :%9d\n", outcounter);
476 >           }
477 >         else {
478 >           GMessage(">Input file : %s\n", infname.chars());
479 >           GMessage("Number of input reads :%9d\n", incounter);
480 >           GMessage("         Output reads :%9d\n", outcounter);
481 >           }
482 >       GMessage("------------------------------------\n");
483 >       if (num_trimmed5)
484 >          GMessage("           5' trimmed :%9d  (min. trim: %d)\n", num_trimmed5, min_trimmed5);
485 >       if (num_trimmed3)
486 >          GMessage("           3' trimmed :%9d  (min. trim: %d)\n", num_trimmed3, min_trimmed3);
487 >       GMessage("------------------------------------\n");
488 >       if (trash_s>0)
489 >         GMessage("     Trashed by length:%9d\n", trash_s);
490 >       if (trash_N>0)
491 >         GMessage("         Trashed by N%%:%9d\n", trash_N);
492 >       if (trash_Q>0)
493 >         GMessage("Trashed by low quality:%9d\n", trash_Q);
494 >       if (trash_poly>0)
495 >         GMessage("   Trashed by poly-A/T:%9d\n", trash_poly);
496 >       if (trash_A5>0)
497 >         GMessage(" Trashed by 5' adapter:%9d\n", trash_A5);
498 >       if (trash_A3>0)
499 >         GMessage(" Trashed by 3' adapter:%9d\n", trash_A3);
500 >       }
501 >    if (trashReport) {
502 >          FWCLOSE(freport);
503            }
504 <        }
505 <      else { //fastq format
506 <       if (phred64) convertQ64(qd->qv, qd->len);
507 <       if (prefix.is_empty()) {
508 <         fprintf(f_out, "@%s_x%d\n%s\n+\n%s\n", qd->firstname, qd->count,
509 <                        rseq.chars(), qd->qv);
350 <         }
351 <       else { //use custom read name
352 <         fprintf(f_out, "@%s%08d_x%d\n%s\n+\n%s\n", prefix.chars(), outcounter,
353 <                     qd->count, rseq.chars(), qd->qv);
354 <         }
355 <        }
356 <      }//for each element of dhash
357 <    if (maxdup_count>1) {
358 <      GMessage("Maximum read multiplicity: x %d (read: %s)\n",maxdup_count, maxdup_seq);
359 <      }
360 <   } //report collapsed dhash entries
361 < GMessage("Number of input reads: %9d\n", icounter);
362 < GMessage("       Output records: %9d\n", outcounter);
363 < if (trashReport) {
364 <    FWCLOSE(freport);
365 <    }
366 <
367 < FWCLOSE(f_out);
504 >    FWCLOSE(f_out);
505 >    FWCLOSE(f_out2);
506 >   } //while each input file
507 > delete gxmem_l;
508 > delete gxmem_r;
509 > //getc(stdin);
510   }
511  
512   class NData {
# Line 378 | Line 520
520     const char* seq;
521     bool valid;
522     NData() {
523 +    seqlen=0;
524      NCount=0;
525      end5=0;
526      end3=0;
# Line 393 | Line 536
536       end5=1;
537       end3=seqlen;
538       for (int i=0;i<seqlen;i++)
539 <        if (rseq[i]=='N') {// if (!ichrInStr(rseq[i], "ACGT")
539 >        if (seq[i]=='N') {// if (!ichrInStr(rseq[i], "ACGT")
540             NPos[NCount]=i;
541             NCount++;
542             }
# Line 408 | Line 551
551       perc_N=(n*100.0)/(end5-end3+1);
552       }
553   };
554 <
554 >
555   static NData feat;
556   int perc_lenN=12; // incremental distance from ends, in percentage of
557            // sequence length, where N-trimming is done (default:12 %) (autolimited to 20)
558 <          
558 >
559   void N_analyze(int l5, int l3, int p5, int p3) {
560   /* assumes feat was filled properly */
561   int old_dif, t5,t3,v;
562 < if (l3-l5<min_read_len || p5>p3 ) {
562 > if (l3<l5+2 || p5>p3 ) {
563     feat.end5=l5+1;
564     feat.end3=l3+1;
565 <   return;
565 >   return;
566     }
567  
568   t5=feat.NPos[p5]-l5;
569   t3=l3-feat.NPos[p3];
570   old_dif=p3-p5;
571   v=(int)((((double)(l3-l5))*perc_lenN)/100);
572 < if (v>20) v=20; /* enforce N-search limit for very long reads */
572 > if (v>20) v=20; /* enforce N-search limit for very long reads */
573   if (t5 < v ) {
574     l5=feat.NPos[p5]+1;
575     p5++;
# Line 443 | Line 586
586             feat.end3=l3+1;
587             return;
588             }
589 <    else
589 >    else
590        N_analyze(l5,l3, p5,p3);
591   }
592  
593  
594 < bool ntrim(GStr& rseq, GStr &rqv, int &l5, int &l3) {
595 < //count Ns in the sequence
594 > bool qtrim(GStr& qvs, int &l5, int &l3) {
595 > if (qvtrim_qmin==0 || qvs.is_empty()) return false;
596 > if (qv_phredtype==0) {
597 >  //try to guess the Phred type
598 >  int vmin=256, vmax=0;
599 >  for (int i=0;i<qvs.length();i++) {
600 >     if (vmin>qvs[i]) vmin=qvs[i];
601 >     if (vmax<qvs[i]) vmax=qvs[i];
602 >     }
603 >  if (vmin<64) { qv_phredtype=33; qv_cvtadd=31; }
604 >  if (vmax>95) { qv_phredtype=64; qv_cvtadd=-31; }
605 >  if (qv_phredtype==0) {
606 >    GError("Error: couldn't determine Phred type, please use the -p33 or -p64 !\n");
607 >    }
608 >  if (verbose)
609 >    GMessage("Input reads have Phred-%d quality values.\n", (qv_phredtype==33 ? 33 : 64));
610 >  } //guessing Phred type
611 > for (l3=qvs.length()-1;l3>2;l3--) {
612 >  if (qvs[l3]-qv_phredtype>=qvtrim_qmin && qvs[l3-1]-qv_phredtype>=qvtrim_qmin) break;
613 >  }
614 > //just in case, check also the 5' the end (?)
615 > for (l5=0;l5<qvs.length()-3;l5++) {
616 >  if (qvs[l5]-qv_phredtype>=qvtrim_qmin && qvs[l5+1]-qv_phredtype>=qvtrim_qmin) break;
617 >  }
618 > if (qvtrim_max>0) {
619 >  if (qvs.length()-1-l3>qvtrim_max) l3=qvs.length()-1-qvtrim_max;
620 >  if (l5>qvtrim_max) l5=qvtrim_max;
621 >  }
622 > return (l5>0 || l3<qvs.length()-1);
623 > }
624 >
625 > bool ntrim(GStr& rseq, int &l5, int &l3) {
626 > //count Ns in the sequence, trim N-rich ends
627   feat.init(rseq);
628   l5=feat.end5-1;
629   l3=feat.end3-1;
630 < N_analyze(feat.end5-1, feat.end3-1, 0, feat.NCount-1);
631 < if (l5==feat.end5-1 && l3==feat.end3-1)
632 <    return false; //nothing changed
630 > N_analyze(feat.end5-1, feat.end3-1, 0, feat.NCount-1);
631 > if (l5==feat.end5-1 && l3==feat.end3-1) {
632 >    if (feat.perc_N>max_perc_N) {
633 >           feat.valid=false;
634 >           l3=l5+1;
635 >           return true;
636 >           }
637 >      else {
638 >       return false; //nothing changed
639 >       }
640 >    }
641   l5=feat.end5-1;
642   l3=feat.end3-1;
643   if (l3-l5+1<min_read_len) {
# Line 463 | Line 645
645     return true;
646     }
647   feat.N_calc();
648 < if (feat.perc_N>6.2) { //not valid if more than 1 N per 16 bases
648 >
649 > if (feat.perc_N>max_perc_N) {
650        feat.valid=false;
651        l3=l5+1;
652        return true;
# Line 474 | Line 657
657   //--------------- dust functions ----------------
658   class DNADuster {
659   public:
660 <  int dustword;
661 <  int dustwindow;
662 <  int dustwindow2;
660 >  int dustword;
661 >  int dustwindow;
662 >  int dustwindow2;
663    int dustcutoff;
664    int mv, iv, jv;
665    int counts[32*32*32];
# Line 571 | Line 754
754                      }
755             }
756           }
757 < //return first;
757 > //return first;
758   }
759   };
760  
# Line 589 | Line 772
772   return ncount;
773   }
774  
775 < // ------------------ adapter matching - triplet matching
776 < //look for matching triplets amongst the first 9 nucleotides of the 3' adaptor
777 < // or the last 9 nucleotides for the 5' adapter
778 < //when a triplet match is found, simply try to extend the alignment using a drop-off scheme
779 < // check minimum score and
780 < // for 3' adapter trimming:
781 < //     require that the right end of the alignment for either the adaptor OR the read must be
782 < //     < 3 distance from its right end
783 < // for 5' adapter trimming:
784 < //     require that the left end of the alignment for either the adaptor OR the read must
785 < //     be at coordinate 0
786 <
787 < bool extendMatch(const char* a, int alen, int ai,
788 <                 const char* b, int blen, int bi, int mlen, int& l5, int& l3, bool end5=false) {
789 < //so the alignment starts at ai in a, bi in b, with a perfect match of length mlen
790 < //if (debug)
608 < //  GStr dbg(b);
609 < //GMessage(">> in %s\n\textending hit: %s at position %d\n", a, (dbg.substr(ai, mlen)).chars(), ai);
610 < int a_l=ai; //alignment coordinates on a
611 < int a_r=ai+mlen-1;
612 < int b_l=bi; //alignment coordinates on b
613 < int b_r=bi+mlen-1;
614 < int ai_maxscore=ai;
615 < int bi_maxscore=bi;
616 < int score=mlen*a_m_score;
617 < int maxscore=score;
618 < int mism5score=a_mis_score;
619 < if (end5 && ai<(alen>>1)) mism5score-=2; // increase penalty for mismatches at 5' end
620 < //try to extend to the left first, if possible
621 < while (ai>0 && bi>0) {
622 <   ai--;
623 <   bi--;
624 <   score+= (a[ai]==b[bi])? a_m_score : mism5score;
625 <   if (score>maxscore) {
626 <       ai_maxscore=ai;
627 <       bi_maxscore=bi;
628 <       maxscore=score;
629 <       }
630 <     else if (maxscore-score>a_dropoff_score) break;
631 <   }
632 < a_l=ai_maxscore;
633 < b_l=bi_maxscore;
634 < //if (debug) GMessage("  after l-extend: %*s%s\t\t(score=%d)\n",a_l," ",dbg.substr(b_l,b_r-b_l+1).chars(),maxscore);
635 < //now extend to the right
636 < ai_maxscore=a_r;
637 < bi_maxscore=b_r;
638 < ai=a_r;
639 < bi=b_r;
640 < score=maxscore;
641 < //sometimes there are extra AAAAs at the end of the read, ignore those
642 < if (strcmp(&a[alen-4],"AAAA")==0) {
643 <    alen-=3;
644 <    while (a[alen-1]=='A' && alen>ai) alen--;
645 <    }
646 < while (ai<alen-1 && bi<blen-1) {
647 <   ai++;
648 <   bi++;
649 <   //score+= (a[ai]==b[bi])? a_m_score : a_mis_score;
650 <   if (a[ai]==b[bi]) { //match
651 <      score+=a_m_score;
652 <      if (ai>=alen-2) {
653 <           score+=a_m_score-(alen-ai-1);
654 <           }
655 <      }
656 <    else { //mismatch
657 <      score+=a_mis_score;
658 <      }  
659 <   if (score>maxscore) {
660 <       ai_maxscore=ai;
661 <       bi_maxscore=bi;
662 <       maxscore=score;
663 <       }
664 <     else if (maxscore-score>a_dropoff_score) break;
665 <   }
666 <  a_r=ai_maxscore;
667 <  b_r=bi_maxscore;
668 <  int a_ovh3=alen-a_r-1;
669 <  int b_ovh3=blen-b_r-1;
670 <  int mmovh3=(a_ovh3<b_ovh3)? a_ovh3 : b_ovh3;
671 <  int mmovh5=(a_l<b_l)? a_l : b_l;
672 <  //if (debug) GMessage("  after r-extend: %*s%s\t\t(score=%d)\n",a_l," ",dbg.substr(b_l,b_r-b_l+1).chars(),maxscore);
673 <  if (maxscore>=a_min_score && mmovh3<2 && mmovh5<2) {
674 <     if (a_l<a_ovh3) {
675 <        //adapter closer to the left end (typical for 5' adapter)
676 <        l5=a_r+1;
677 <        l3=alen-1;
678 <        }
679 <      else {
680 <        //adapter matching at the right end (typical for 3' adapter)
681 <        l5=0;
682 <        l3=a_l-1;
683 <        }
684 <     return true;
685 <     }
686 <  //do not trim:
687 <  l5=0;
688 <  l3=alen-1;
689 <  return false;
690 < }
775 > struct SLocScore {
776 >  int pos;
777 >  int score;
778 >  SLocScore(int p=0,int s=0) {
779 >    pos=p;
780 >    score=s;
781 >    }
782 >  void set(int p, int s) {
783 >    pos=p;
784 >    score=s;
785 >    }
786 >  void add(int p, int add) {
787 >    pos=p;
788 >    score+=add;
789 >    }
790 > };
791  
792 < bool trim_adapter3(GStr& seq, int&l5, int &l3) {
792 > bool trim_poly3(GStr &seq, int &l5, int &l3, const char* poly_seed) {
793 > if (!doPolyTrim) return false;
794   int rlen=seq.length();
795   l5=0;
796   l3=rlen-1;
797 < //first try a full match, we might get lucky
798 < int fi=-1;
799 < if ((fi=seq.index(adapter3))>=0) {
800 <   if (fi<rlen-fi-a3len) {//match is closer to the right end
801 <      l5=fi+a3len;
802 <      l3=rlen-1;
803 <      }
804 <    else {
805 <      l5=0;
806 <      l3=fi-1;
797 > int32 seedVal=*(int32*)poly_seed;
798 > char polyChar=poly_seed[0];
799 > //assumes N trimming was already done
800 > //so a poly match should be very close to the end of the read
801 > // -- find the initial match (seed)
802 > int lmin=GMAX((rlen-16), 0);
803 > int li;
804 > for (li=rlen-4;li>lmin;li--) {
805 >   if (seedVal==*(int*)&(seq[li])) {
806 >      break;
807        }
707   return true;
808     }
809 < //also, for fast detection of other adapter-only reads that start past
810 < // the beginning of the adapter sequence, try to see if the first a3len-4
811 < // characters of the read are a substring of the adapter
812 < GStr rstart=seq.substr(0,a3len-4);
813 < if ((fi=adapter3.index(rstart))>=0) {
814 <   l3=rlen-1;
815 <   l5=a3len-4;
816 <   while (fi+l5<a3len && l5<l3 && adapter3[fi+l5]==seq[l5]) l5++;
817 <   return true;
809 > if (li<=lmin) return false;
810 > //seed found, try to extend it both ways
811 > //extend right
812 > int ri=li+3;
813 > SLocScore loc(ri, poly_m_score<<2);
814 > SLocScore maxloc(loc);
815 > //extend right
816 > while (ri<rlen-1) {
817 >   ri++;
818 >   if (seq[ri]==polyChar) {
819 >                loc.add(ri,poly_m_score);
820 >                }
821 >   else if (seq[ri]=='N') {
822 >                loc.add(ri,0);
823 >                }
824 >   else { //mismatch
825 >        loc.add(ri,poly_mis_score);
826 >        if (maxloc.score-loc.score>poly_dropoff_score) break;
827 >        }
828 >   if (maxloc.score<=loc.score) {
829 >      maxloc=loc;
830 >      }
831     }
832 <  
833 < //another easy case: first half of the adaptor matches
834 < int a3hlen=a3len>>1;
835 < GStr ahalf=adapter3.substr(0, a3hlen);
836 < if ((fi=seq.index(ahalf))>=0) {
837 <   extendMatch(seq.chars(), rlen, fi,
838 <                 adapter3.chars(), a3len, 0,  a3hlen, l5,l3);
839 <   return true;
832 > ri=maxloc.pos;
833 > if (ri<rlen-6) return false; //no trimming wanted, too far from 3' end
834 > //ri = right boundary for the poly match
835 > //extend left
836 > loc.set(li, maxloc.score);
837 > maxloc.pos=li;
838 > while (li>0) {
839 >    li--;
840 >    if (seq[li]==polyChar) {
841 >                 loc.add(li,poly_m_score);
842 >                 }
843 >    else if (seq[li]=='N') {
844 >                 loc.add(li,0);
845 >                 }
846 >    else { //mismatch
847 >         loc.add(li,poly_mis_score);
848 >         if (maxloc.score-loc.score>poly_dropoff_score) break;
849 >         }
850 >    if (maxloc.score<=loc.score) {
851 >       maxloc=loc;
852 >       }
853 >    }
854 > li=maxloc.pos;
855 > if ((maxloc.score==poly_minScore && ri==rlen-1) ||
856 >    (maxloc.score>poly_minScore && ri>=rlen-3) ||
857 >    (maxloc.score>(poly_minScore*3) && ri>=rlen-8)) {
858 >  //trimming this li-ri match at 3' end
859 >    l3=li-1;
860 >    if (l3<0) l3=0;
861 >    return true;
862 >    }
863 > return false;
864 > }
865 >
866 > bool trim_poly5(GStr &seq, int &l5, int &l3, const char* poly_seed) {
867 > if (!doPolyTrim) return false;
868 > int rlen=seq.length();
869 > l5=0;
870 > l3=rlen-1;
871 > int32 seedVal=*(int32*)poly_seed;
872 > char polyChar=poly_seed[0];
873 > //assumes N trimming was already done
874 > //so a poly match should be very close to the end of the read
875 > // -- find the initial match (seed)
876 > int lmax=GMIN(12, rlen-4);//how far from 5' end to look for 4-mer seeds
877 > int li;
878 > for (li=0;li<=lmax;li++) {
879 >   if (seedVal==*(int*)&(seq[li])) {
880 >      break;
881 >      }
882     }
883 < //no easy cases, so let's do the word hashing
884 < for (int iw=0;iw<6;iw++) {
885 <   GStr aword=adapter3.substr(iw,3);
886 <   if ((fi=seq.index(aword))>=0  && rlen-fi>3) {
887 <      if (extendMatch(seq.chars(), rlen, fi, adapter3.chars(),
888 <                   a3len, iw, 3, l5,l3)) return true;
883 > if (li>lmax) return false;
884 > //seed found, try to extend it both ways
885 > //extend left
886 > int ri=li+3; //save rightmost base of the seed
887 > SLocScore loc(li, poly_m_score<<2);
888 > SLocScore maxloc(loc);
889 > while (li>0) {
890 >    li--;
891 >    if (seq[li]==polyChar) {
892 >                 loc.add(li,poly_m_score);
893 >                 }
894 >    else if (seq[li]=='N') {
895 >                 loc.add(li,0);
896 >                 }
897 >    else { //mismatch
898 >         loc.add(li,poly_mis_score);
899 >         if (maxloc.score-loc.score>poly_dropoff_score) break;
900 >         }
901 >    if (maxloc.score<=loc.score) {
902 >       maxloc=loc;
903 >       }
904 >    }
905 > li=maxloc.pos;
906 > if (li>5) return false; //no trimming wanted, too far from 5' end
907 > //li = right boundary for the poly match
908 >
909 > //extend right
910 > loc.set(ri, maxloc.score);
911 > maxloc.pos=ri;
912 > while (ri<rlen-1) {
913 >   ri++;
914 >   if (seq[ri]==polyChar) {
915 >                loc.add(ri,poly_m_score);
916 >                }
917 >   else if (seq[ri]=='N') {
918 >                loc.add(ri,0);
919 >                }
920 >   else { //mismatch
921 >        loc.add(ri,poly_mis_score);
922 >        if (maxloc.score-loc.score>poly_dropoff_score) break;
923 >        }
924 >   if (maxloc.score<=loc.score) {
925 >      maxloc=loc;
926        }
927     }
928 < return false; //no adapter parts found
928 > ri=maxloc.pos;
929 > if ((maxloc.score==poly_minScore && li==0) ||
930 >     (maxloc.score>poly_minScore && li<2)
931 >     || (maxloc.score>(poly_minScore*3) && li<8)) {
932 >    //adjust l5 to reflect this trimming of 5' end
933 >    l5=ri+1;
934 >    if (l5>rlen-1) l5=rlen-1;
935 >    return true;
936 >    }
937 > return false;
938 > }
939 >
940 > bool trim_adapter3(GStr& seq, int&l5, int &l3) {
941 > if (adapters3.Count()==0) return false;
942 > int rlen=seq.length();
943 > l5=0;
944 > l3=rlen-1;
945 > bool trimmed=false;
946 > GStr wseq(seq.chars());
947 > int wlen=rlen;
948 > for (int ai=0;ai<adapters3.Count();ai++) {
949 >  //if (adapters3[ai].is_empty()) continue;
950 >  int alen=adapters3[ai].seq.length();
951 >  GStr& aseq=adapters3[ai].seq;
952 >  GXAlnInfo* r_bestaln=match_RightEnd(aseq.chars(), alen, adapters3[ai].pz,
953 >                            wseq.chars(), wlen, gxmem_r, 74);
954 >  if (r_bestaln) {
955 >     trimmed=true;
956 >     //keep unmatched region on the left, if any
957 >     l3-=(wlen-r_bestaln->sl+1);
958 >     delete r_bestaln;
959 >     if (l3<0) l3=0;
960 >     if (l3-l5+1<min_read_len) return true;
961 >     wseq=seq.substr(l5,l3-l5+1);
962 >     wlen=wseq.length();
963 >     }
964 >  }//for each 5' adapter
965 >  return trimmed;
966   }
967  
968   bool trim_adapter5(GStr& seq, int&l5, int &l3) {
969 < //if (debug) GMessage("trim_adapter5 on: %s\n", seq.chars());
969 > if (adapters5.Count()==0) return false;
970   int rlen=seq.length();
971   l5=0;
972   l3=rlen-1;
973 < //try to see if adapter is fully included in the read
974 < int fi=-1;
975 < if ((fi=seq.index(adapter5))>=0) {
976 <   if (fi<rlen-fi-a5len) {//match is closer to the right end
977 <      l5=fi+a5len;
978 <      l3=rlen-1;
979 <      }
980 <    else {
981 <      l5=0;
982 <      l3=fi-1;
973 > bool trimmed=false;
974 > GStr wseq(seq.chars());
975 > int wlen=rlen;
976 > for (int ai=0;ai<adapters5.Count();ai++) {
977 >  //if (adapters5[ai].is_empty()) continue;
978 >  int alen=adapters5[ai].seq.length();
979 >  GStr& aseq=adapters5[ai].seq;
980 >  GXAlnInfo* l_bestaln=match_LeftEnd(aseq.chars(), alen, adapters5[ai].pz,
981 >                 wseq.chars(), wlen, gxmem_l, 84);
982 >  if (l_bestaln) {
983 >     trimmed=true;
984 >     l5+=l_bestaln->sr;
985 >     delete l_bestaln;
986 >     if (l5>=rlen) l5=rlen-1;
987 >     if (l3-l5+1<min_read_len) return true;
988 >     wseq=seq.substr(l5,l3-l5+1);
989 >     wlen=wseq.length();
990 >     }
991 >  }//for each 5' adapter
992 >  return trimmed;
993 > }
994 >
995 > //convert qvs to/from phred64 from/to phread33
996 > void convertPhred(GStr& q) {
997 > for (int i=0;i<q.length();i++) q[i]+=qv_cvtadd;
998 > }
999 >
1000 > void convertPhred(char* q, int len) {
1001 > for (int i=0;i<len;i++) q[i]+=qv_cvtadd;
1002 > }
1003 >
1004 > bool getFastxRec(GLineReader& fq, GStr& rseq, GStr& rqv,
1005 >          GStr& rname, GStr& rinfo, GStr& infname) {
1006 > rseq="";
1007 > rqv="";
1008 > rname="";
1009 > rinfo="";
1010 > if (fq.eof()) return false;
1011 > char* l=fq.getLine();
1012 > while (l!=NULL && (l[0]==0 || isspace(l[0]))) l=fq.getLine(); //ignore empty lines
1013 > if (l==NULL) return false;
1014 > /* if (rawFormat) {
1015 >      //TODO: implement raw qseq parsing here
1016 >      //if (raw type=N) then continue; //skip invalid/bad records
1017 >      } //raw qseq format
1018 > else { // FASTQ or FASTA */
1019 > isfasta=(l[0]=='>');
1020 > if (!isfasta && l[0]!='@') GError("Error: fasta/fastq record marker not found(%s)\n%s\n",
1021 >      infname.chars(), l);
1022 > GStr s(l);
1023 > rname=&(l[1]);
1024 > for (int i=0;i<rname.length();i++)
1025 >    if (rname[i]<=' ') {
1026 >       if (i<rname.length()-2) rinfo=rname.substr(i+1);
1027 >       rname.cut(i);
1028 >       break;
1029 >       }
1030 >  //now get the sequence
1031 > if ((l=fq.getLine())==NULL)
1032 >      GError("Error: unexpected EOF after header for read %s (%s)\n",
1033 >                   rname.chars(), infname.chars());
1034 > rseq=l; //this must be the DNA line
1035 > while ((l=fq.getLine())!=NULL) {
1036 >      //seq can span multiple lines
1037 >      if (l[0]=='>' || l[0]=='+') {
1038 >           fq.pushBack();
1039 >           break; //
1040 >           }
1041 >      rseq+=l;
1042 >      } //check for multi-line seq
1043 > if (!isfasta) { //reading fastq quality values, which can also be multi-line
1044 >    if ((l=fq.getLine())==NULL)
1045 >        GError("Error: unexpected EOF after sequence for %s\n", rname.chars());
1046 >    if (l[0]!='+') GError("Error: fastq qv header marker not detected!\n");
1047 >    if ((l=fq.getLine())==NULL)
1048 >        GError("Error: unexpected EOF after qv header for %s\n", rname.chars());
1049 >    rqv=l;
1050 >    //if (rqv.length()!=rseq.length())
1051 >    //  GError("Error: qv len != seq len for %s\n", rname.chars());
1052 >    while (rqv.length()<rseq.length() && ((l=fq.getLine())!=NULL)) {
1053 >      rqv+=l; //append to qv string
1054        }
1055 <   return true;
1055 >    }// fastq
1056 > // } //<-- FASTA or FASTQ
1057 > rseq.upper();
1058 > return true;
1059 > }
1060 >
1061 > #ifdef GDEBUG
1062 > void showTrim(GStr& s, int l5, int l3) {
1063 >  if (l5>0) {
1064 >    color_bg(c_red);
1065 >    }
1066 >  for (int i=0;i<s.length()-1;i++) {
1067 >    if (i && i==l5) color_resetbg();
1068 >    fprintf(stderr, "%c", s[i]);
1069 >    if (i==l3) color_bg(c_red);
1070     }
1071 < //for fast detection of adapter-rich reads, check if the first 12
1072 < //characters of the read are a substring of the adapter
1073 < GStr rstart=seq.substr(1,12);
1074 < if ((fi=adapter5.index(rstart))>=0) {
1075 <   //l3=rlen-1;
1076 <   //l5=a5len-4;
1077 <   //while (fi+l5<a5len && l5<l3 && adapter5[fi+l5]==seq[l5]) l5++;
1078 <   //return true;
1079 <   //if (debug) GMessage(" first 12char of the read match adaptor!\n");
1080 <   extendMatch(seq.chars(), rlen, 1,
1081 <                 adapter5.chars(), a5len, fi,  12, l5,l3, true);
1082 <   return true;
1071 >  fprintf(stderr, "%c", s[s.length()-1]);
1072 >  color_reset();
1073 >  fprintf(stderr, "\n");
1074 > }
1075 > #endif
1076 >
1077 > char process_read(GStr& rname, GStr& rseq, GStr& rqv, int &l5, int &l3) {
1078 > //returns 0 if the read was untouched, 1 if it was just trimmed
1079 > // and a trash code if it was trashed
1080 > l5=0;
1081 > l3=rseq.length()-1;
1082 > #ifdef GDEBUG
1083 >   //rseq.reverse();
1084 >   GMessage(">%s\n", rname.chars());
1085 >   GMessage("%s\n",rseq.chars());
1086 > #endif
1087 > if (l3-l5+1<min_read_len) {
1088 >   return 's';
1089     }
1090 <  
1091 < //another easy case: last 12 characters of the adaptor found as a substring of the read
1092 < int aplen=12;
1093 < int apstart=a5len-aplen-2;
1094 < if (apstart<0) { apstart=0; aplen=a5len-1; }
1095 < GStr bstr=adapter5.substr(apstart, aplen);
1096 < if ((fi=seq.index(bstr))>=0) {
1097 <   //if (debug) GMessage(" last 12char of adaptor match the read!\n");
1098 <   extendMatch(seq.chars(), rlen, fi,
1099 <                 adapter5.chars(), a5len, apstart,  aplen, l5,l3,true);
1100 <   return true;
1090 > GStr wseq(rseq.chars());
1091 > GStr wqv(rqv.chars());
1092 > int w5=l5;
1093 > int w3=l3;
1094 > //first do the q-based trimming
1095 > if (qvtrim_qmin!=0 && !wqv.is_empty() && qtrim(wqv, w5, w3)) { // qv-threshold trimming
1096 >   if (w3-w5+1<min_read_len) {
1097 >     return 'Q'; //invalid read
1098 >     }
1099 >    //-- keep only the w5..w3 range
1100 >   l5=w5;
1101 >   l3=w3;
1102 >   wseq=wseq.substr(w5, w3-w5+1);
1103 >   if (!wqv.is_empty())
1104 >      wqv=wqv.substr(w5, w3-w5+1);
1105 >   } //qv trimming
1106 > // N-trimming on the remaining read seq
1107 > if (ntrim(wseq, w5, w3)) {
1108 >   //GMessage("before: %s\n",wseq.chars());
1109 >   //GMessage("after : %s (%d)\n",wseq.substr(w5,w3-w5+1).chars(),w3-w5+1);
1110 >   l5+=w5;
1111 >   l3-=(wseq.length()-1-w3);
1112 >   if (w3-w5+1<min_read_len) {
1113 >     return 'N'; //to be trashed
1114 >     }
1115 >    //-- keep only the w5..w3 range
1116 >   wseq=wseq.substr(w5, w3-w5+1);
1117 >   if (!wqv.is_empty())
1118 >      wqv=wqv.substr(w5, w3-w5+1);
1119 >   w5=0;
1120 >   w3=wseq.length()-1;
1121     }
1122 < //no easy cases, find a triplet match as a seed for alignment extension
1123 < //find triplets at the right end of the adapter
1124 < for (int iw=0;iw<6;iw++) {
1125 <   apstart=a5len-iw-4;
1126 <   GStr aword=adapter5.substr(apstart,3);
787 <   if ((fi=seq.index(aword))>=0) {
788 <      //if (debug) GMessage("extending wmatch :%*s\n", fi+3, aword.chars());
789 <      if (extendMatch(seq.chars(), rlen, fi, adapter5.chars(),
790 <                   a5len, apstart, 3, l5,l3,true)) {
791 <                     return true;
792 <                     }
1122 > char trim_code;
1123 > do {
1124 >  trim_code=0;
1125 >  if (trim_poly5(wseq, w5, w3, polyA_seed)) {
1126 >      trim_code='A';
1127        }
1128 +  else if (trim_poly5(wseq, w5, w3, polyT_seed)) {
1129 +      trim_code='T';
1130 +      }
1131 +  else if (trim_adapter5(wseq, w5, w3)) {
1132 +      trim_code='5';
1133 +      }
1134 +  if (trim_code) {
1135 +     #ifdef GDEBUG
1136 +      GMessage("#### TRIM by '%c' code ( w5-w3 = %d-%d ):\n",trim_code, w5,w3);
1137 +      showTrim(wseq, w5, w3);
1138 +     #endif
1139 +     int trimlen=wseq.length()-(w3-w5+1);
1140 +     num_trimmed5++;
1141 +     if (trimlen<min_trimmed5)
1142 +         min_trimmed5=trimlen;
1143 +     l5+=w5;
1144 +     l3-=(wseq.length()-1-w3);
1145 +     if (w3-w5+1<min_read_len) {
1146 +         return trim_code;
1147 +         }
1148 +      //-- keep only the w5..w3 range
1149 +      wseq=wseq.substr(w5, w3-w5+1);
1150 +      if (!wqv.is_empty())
1151 +         wqv=wqv.substr(w5, w3-w5+1);
1152 +      }// trimmed at 5' end
1153 + } while (trim_code);
1154 +
1155 + do {
1156 +  trim_code=0;
1157 +  if (trim_poly3(wseq, w5, w3, polyA_seed)) {
1158 +      trim_code='A';
1159 +      }
1160 +  else if (trim_poly3(wseq, w5, w3, polyT_seed)) {
1161 +      trim_code='T';
1162 +      }
1163 +  else if (trim_adapter3(wseq, w5, w3)) {
1164 +      trim_code='3';
1165 +      }
1166 +  if (trim_code) {
1167 +     #ifdef GDEBUG
1168 +     GMessage("#### TRIM by '%c' code ( w5-w3 = %d-%d ):\n",trim_code, w5,w3);
1169 +     showTrim(wseq, w5, w3);
1170 +     #endif
1171 +     int trimlen=wseq.length()-(w3-w5+1);
1172 +     num_trimmed3++;
1173 +     if (trimlen<min_trimmed3)
1174 +         min_trimmed3=trimlen;
1175 +     l5+=w5;
1176 +     l3-=(wseq.length()-1-w3);
1177 +     if (w3-w5+1<min_read_len) {
1178 +         return trim_code;
1179 +         }
1180 +      //-- keep only the w5..w3 range
1181 +      wseq=wseq.substr(w5, w3-w5+1);
1182 +      if (!wqv.is_empty())
1183 +         wqv=wqv.substr(w5, w3-w5+1);
1184 +      }//trimming at 3' end
1185 + } while (trim_code);
1186 +
1187 +
1188 + if (doCollapse) {
1189 +   //keep read for later
1190 +   FqDupRec* dr=dhash.Find(wseq.chars());
1191 +   if (dr==NULL) { //new entry
1192 +          //if (prefix.is_empty())
1193 +             dhash.Add(wseq.chars(),
1194 +                  new FqDupRec(&wqv, rname.chars()));
1195 +          //else dhash.Add(wseq.chars(), new FqDupRec(wqv.chars(),wqv.length()));
1196 +         }
1197 +      else
1198 +         dr->add(wqv);
1199 +   } //collapsing duplicates
1200 + else { //not collapsing duplicates
1201 +   //apply the dust filter now
1202 +   if (doDust) {
1203 +     int dustbases=dust(wseq);
1204 +     if (dustbases>(wseq.length()>>1)) {
1205 +        return 'D';
1206 +        }
1207 +     }
1208 +   } //not collapsing duplicates
1209 + return (l5>0 || l3<rseq.length()-1) ? 1 : 0;
1210 + }
1211 +
1212 + void printHeader(FILE* f_out, char recmarker, GStr& rname, GStr& rinfo) {
1213 + //GMessage("printing Header..%c%s\n",recmarker, rname.chars());
1214 + if (rinfo.is_empty()) fprintf(f_out, "%c%s\n",recmarker,rname.chars());
1215 +  else fprintf(f_out, "%c%s %s\n",recmarker, rname.chars(), rinfo.chars());
1216 + }
1217 +
1218 + void writeRead(FILE* f_out, GStr& rname, GStr& rinfo, GStr& rseq, GStr& rqv, int& outcounter) {
1219 +   outcounter++;
1220 +   bool asFasta=(rqv.is_empty() || fastaOutput);
1221 +   if (asFasta) {
1222 +    if (prefix.is_empty()) {
1223 +       printHeader(f_out, '>',rname,rinfo);
1224 +       fprintf(f_out, "%s\n", rseq.chars()); //plain one-line fasta for now
1225 +       }
1226 +      else {
1227 +       fprintf(f_out, ">%s%08d\n%s\n", prefix.chars(), outcounter,
1228 +                          rseq.chars());
1229 +       }
1230 +     }
1231 +   else {  //fastq
1232 +    if (convert_phred) convertPhred(rqv);
1233 +    if (prefix.is_empty()) {
1234 +       printHeader(f_out, '@', rname, rinfo);
1235 +       fprintf(f_out, "%s\n+\n%s\n", rseq.chars(), rqv.chars());
1236 +       }
1237 +      else
1238 +       fprintf(f_out, "@%s_%08d\n%s\n+\n%s\n", prefix.chars(), outcounter,
1239 +                          rseq.chars(),rqv.chars() );
1240 +     }
1241 + }
1242 +
1243 + void trash_report(char trashcode, GStr& rname, FILE* freport) {
1244 + if (freport==NULL || trashcode<=' ') return;
1245 + if (trashcode=='3' || trashcode=='5') {
1246 +   fprintf(freport, "%s\ta%c\n",rname.chars(),trashcode);
1247 +   }
1248 + else {
1249 +   fprintf(freport, "%s\t%c\n",rname.chars(),trashcode);
1250     }
1251 < return false; //no adapter parts found
1251 > //tcounter++;
1252 > }
1253 >
1254 > GStr getFext(GStr& s, int* xpos=NULL) {
1255 > //s must be a filename without a path
1256 > GStr r("");
1257 > if (xpos!=NULL) *xpos=0;
1258 > if (s.is_empty() || s=="-") return r;
1259 > int p=s.rindex('.');
1260 > int d=s.rindex('/');
1261 > if (p<=0 || p>s.length()-2 || p<s.length()-7 || p<d) return r;
1262 > r=s.substr(p+1);
1263 > if (xpos!=NULL) *xpos=p+1;
1264 > r.lower();
1265 > return r;
1266   }
1267  
1268 < //conversion of phred64 to phread33
1269 < void convertQ64(GStr& q) {
1270 < for (int i=0;i<q.length();i++) q[i]-=31;
1268 > void baseFileName(GStr& fname) {
1269 > //remove all known extensions, like .txt,fq,fastq,fasta,fa)(.gz .gzip .bz2 .bzip2) .
1270 > int xpos=0;
1271 > GStr fext=getFext(fname, &xpos);
1272 > if (xpos<=1) return;
1273 > bool extdel=false;
1274 > GStr f2;
1275 > if (fext=="z" || fext=="zip") {
1276 >   extdel=true;
1277 >   }
1278 >  else if (fext.length()>=2) {
1279 >   f2=fext.substr(0,2);
1280 >   extdel=(f2=="gz" || f2=="bz");
1281 >   }
1282 > if (extdel) {
1283 >   fname.cut(xpos-1);
1284 >   fext=getFext(fname, &xpos);
1285 >   if (xpos<=1) return;
1286 >   }
1287 > extdel=false;
1288 > if (fext=="f" || fext=="fq" || fext=="txt" || fext=="seq" || fext=="sequence") {
1289 >   extdel=true;
1290 >   }
1291 >  else if (fext.length()>=2) {
1292 >   extdel=(fext.substr(0,2)=="fa");
1293 >   }
1294 > if (extdel) fname.cut(xpos-1);
1295 > GStr fncp(fname);
1296 > fncp.lower();
1297 > fncp.chomp("seq");
1298 > fncp.chomp("sequence");
1299 > fncp.trimR("_.");
1300 > if (fncp.length()<fname.length()) fname.cut(fncp.length());
1301 > }
1302 >
1303 > FILE* prepOutFile(GStr& infname, GStr& pocmd) {
1304 >  FILE* f_out=NULL;
1305 >  GStr fname(getFileName(infname.chars()));
1306 >  //eliminate known extensions
1307 >  baseFileName(fname);
1308 >  if (outsuffix.is_empty() || outsuffix=="-") { return stdout; }
1309 >    else if (pocmd.is_empty()) {
1310 >               GStr oname(fname);
1311 >               oname.append('.');
1312 >               oname.append(outsuffix);
1313 >               f_out=fopen(oname.chars(),"w");
1314 >               if (f_out==NULL) GError("Error: cannot create '%s'\n",oname.chars());
1315 >               }
1316 >            else {
1317 >              GStr oname(">");
1318 >              oname.append(fname);
1319 >              oname.append('.');
1320 >              oname.append(outsuffix);
1321 >              pocmd.append(oname);
1322 >              f_out=popen(pocmd.chars(), "w");
1323 >              if (f_out==NULL) GError("Error: cannot popen '%s'\n",pocmd.chars());
1324 >              }
1325 > return f_out;
1326 > }
1327 >
1328 > void guess_unzip(GStr& fname, GStr& picmd) {
1329 > GStr fext=getFext(fname);
1330 > if (fext=="gz" || fext=="gzip" || fext=="z") {
1331 >    picmd="gzip -cd ";
1332 >    }
1333 >   else if (fext=="bz2" || fext=="bzip2" || fext=="bz" || fext=="bzip") {
1334 >    picmd="bzip2 -cd ";
1335 >    }
1336   }
1337  
1338 < void convertQ64(char* q, int len) {
1339 < for (int i=0;i<len;i++) q[i]-=31;
1338 > void addAdapter(GVec<CASeqData>& adapters, GStr& seq) {
1339 > //TODO: prepare CASeqData here, and collect hexamers as well
1340 > CASeqData adata(revCompl);
1341 > int idx=adapters.Add(adata);
1342 > if (idx<0) GError("Error: failed to add adaptor!\n");
1343 > adapters[idx].update(seq.chars());
1344   }
1345  
1346 +
1347 + int loadAdapters(const char* fname) {
1348 +  GLineReader lr(fname);
1349 +  char* l;
1350 +  while ((l=lr.nextLine())!=NULL) {
1351 +   if (lr.length()<=3 || l[0]=='#') continue;
1352 +   if ( l[0]==' ' || l[0]=='\t' || l[0]==',' ||
1353 +        l[0]==';'|| l[0]==':' ) {
1354 +      int i=1;
1355 +      while (l[i]!=0 && isspace(l[i])) {
1356 +        i++;
1357 +        }
1358 +      if (l[i]!=0) {
1359 +        GStr s(&(l[i]));
1360 +      #ifdef GDEBUG
1361 +          //s.reverse();
1362 +      #endif
1363 +        addAdapter(adapters3, s);
1364 +        continue;
1365 +        }
1366 +      }
1367 +    else {
1368 +      GStr s(l);
1369 +      s.startTokenize("\t ;,:");
1370 +      GStr a5,a3;
1371 +      if (s.nextToken(a5))
1372 +         s.nextToken(a3);
1373 +      a5.upper();
1374 +      a3.upper();
1375 +     #ifdef GDEBUG
1376 +     //   a5.reverse();
1377 +     //   a3.reverse();
1378 +     #endif
1379 +      addAdapter(adapters5, a5);
1380 +      addAdapter(adapters3, a3);
1381 +      }
1382 +   }
1383 +   return adapters5.Count()+adapters3.Count();
1384 + }
1385 +
1386 + void setupFiles(FILE*& f_in, FILE*& f_in2, FILE*& f_out, FILE*& f_out2,
1387 +                       GStr& s, GStr& infname, GStr& infname2) {
1388 + // uses outsuffix to generate output file names and open file handles as needed
1389 + infname="";
1390 + infname2="";
1391 + f_in=NULL;
1392 + f_in2=NULL;
1393 + f_out=NULL;
1394 + f_out2=NULL;
1395 + //analyze outsuffix intent
1396 + GStr pocmd;
1397 + if (outsuffix=="-") {
1398 +    f_out=stdout;
1399 +    }
1400 +   else {
1401 +    GStr ox=getFext(outsuffix);
1402 +    if (ox.length()>2) ox=ox.substr(0,2);
1403 +    if (ox=="gz") pocmd="gzip -9 -c ";
1404 +        else if (ox=="bz") pocmd="bzip2 -9 -c ";
1405 +    }
1406 + if (s=="-") {
1407 +    f_in=stdin;
1408 +    infname="stdin";
1409 +    f_out=prepOutFile(infname, pocmd);
1410 +    return;
1411 +    } // streaming from stdin
1412 + s.startTokenize(",:");
1413 + s.nextToken(infname);
1414 + bool paired=s.nextToken(infname2);
1415 + if (fileExists(infname.chars())==0)
1416 +    GError("Error: cannot find file %s!\n",infname.chars());
1417 + GStr fname(getFileName(infname.chars()));
1418 + GStr picmd;
1419 + guess_unzip(fname, picmd);
1420 + if (picmd.is_empty()) {
1421 +   f_in=fopen(infname.chars(), "r");
1422 +   if (f_in==NULL) GError("Error opening file '%s'!\n",infname.chars());
1423 +   }
1424 +  else {
1425 +   picmd.append(infname);
1426 +   f_in=popen(picmd.chars(), "r");
1427 +   if (f_in==NULL) GError("Error at popen %s!\n", picmd.chars());
1428 +   }
1429 + if (f_out==stdout) {
1430 +   if (paired) GError("Error: output suffix required for paired reads\n");
1431 +   return;
1432 +   }
1433 + f_out=prepOutFile(infname, pocmd);
1434 + if (!paired) return;
1435 + if (doCollapse) GError("Error: sorry, -C option cannot be used with paired reads!\n");
1436 + // ---- paired reads:-------------
1437 + if (fileExists(infname2.chars())==0)
1438 +     GError("Error: cannot find file %s!\n",infname2.chars());
1439 + picmd="";
1440 + GStr fname2(getFileName(infname2.chars()));
1441 + guess_unzip(fname2, picmd);
1442 + if (picmd.is_empty()) {
1443 +   f_in2=fopen(infname2.chars(), "r");
1444 +   if (f_in2==NULL) GError("Error opening file '%s'!\n",infname2.chars());
1445 +   }
1446 +  else {
1447 +   picmd.append(infname2);
1448 +   f_in2=popen(picmd.chars(), "r");
1449 +   if (f_in2==NULL) GError("Error at popen %s!\n", picmd.chars());
1450 +   }
1451 + f_out2=prepOutFile(infname2, pocmd);
1452 + }

Diff Legend

Removed lines
+ Added lines
< Changed lines
> Changed lines