ViewVC Help
View File | Revision Log | Show Annotations | View Changeset | Root Listing
root/gclib/fqtrim/fqtrim.cpp
(Generate patch)
# Line 2 | Line 2
2   #include "GStr.h"
3   #include "GHash.hh"
4   #include "GList.hh"
5 + #include <ctype.h>
6 + #include "GAlnExtend.h"
7 + #include <inttypes.h>
8  
9   #define USAGE "Usage:\n\
10 < fqtrim [-5 <5'adapter>] [-3 <3'adapter>] [-l <minlen>] [-C] [-D] [-Q] \\\n\
11 <    [-n <rename_prefix>] [-o <trimmed.fq>] [-r <discarded.lst>] <input.fq>\n\
10 > fqtrim [{-5 <5adaptor> -3 <3adaptor>|-f <adaptors_file>}] [-a <min_matchlen>]\\\n\
11 >   [-R] [-q <minq> [-t <trim_max_len>]] [-p {64|33}] [-o <outsuffix>]\\\n\
12 >   [-l <minlen>] [-C] [-D] [-Q] [-n <rename_prefix>] [-r <discarded.lst>]\\\n\
13 >    <input.fq>[,<input_mates.fq>\n\
14   \n\
15 < Trim low quality bases at 3' end, optional adapter, filter for low complexity and\n\
16 < optionally collapse duplicates in the input fastq data.\n\
15 > Trim low quality bases at the 3' end and can trim adaptor sequence(s), filter\n\
16 > for low complexity and collapse duplicate reads.\n\
17 > If read pairs should be trimmed and kept together (i.e. without discarding\n\
18 > one read in a pair), the two file names should be given delimited by a comma\n\
19 > or a colon character\n\
20   \n\
21   Options:\n\
22   -n  rename all the reads using the <prefix> followed by a read counter;\n\
23      if -C option was given, the suffix \"_x<N>\" is appended, with <N> being\n\
24      the read duplication count\n\
25 < -o  write the trimmed/filtered fastq into <trimmed.fq>(instead of stdout)\n\
26 < -5  trim the given adapter or primer sequence at the 5' end of each read\n\
25 > -o  unless this parameter is '-', write the trimmed/filtered reads to \n\
26 >    file(s) named <input>.<outsuffix> which will be created in the \n\
27 >    current (working) directory; (writes to stdout if -o- is given);\n\
28 >    a suffix ending with .gz, .gzip or .bz2 will enforce compression\n\
29 > -f  file with adaptor sequences to trim, each line having this format:\n\
30 >    <5'-adaptor-sequence> <3'-adaptor-sequence>\n\
31 > -5  trim the given adaptor or primer sequence at the 5' end of each read\n\
32      (e.g. -5 CGACAGGTTCAGAGTTCTACAGTCCGACGATC)\n\
33 < -3  trim the given adapter sequence at the 3' end of each read\n\
33 > -3  trim the given adaptor sequence at the 3' end of each read\n\
34      (e.g. -3 TCGTATGCCGTCTTCTGCTTG)\n\
35 < -l  minimum \"clean\" length at the high quality 5'end that a read must\n\
36 <    have in order to pass the filter (default: 16)\n\
35 > -A  disable polyA/T trimming (enabled by default)\n\
36 > -y  minimum length of exact match to adaptor sequence at the proper end (6)\n\
37 > -q  trim bases with quality value lower than <minq> (starting at the 3' end)\n\
38 > -t  for -q option, maximum trimming at the 3' end is limited to <trim_max_len>\n\
39 > -m  maximum percentage of Ns allowed in a read after trimming (default 7)\n\
40 > -l  minimum \"clean\" length after trimming that a read must have\n\
41 >    in order to pass the filter (default: 16)\n\
42   -r  write a simple \"trash report\" file listing the discarded reads with a\n\
43      one letter code for the reason why they were trashed\n\
44 + -D  apply a low-complexity (dust) filter and discard any read that has over\n\
45 +    50%% of its length detected as low complexity\n\
46   -C  collapse duplicate reads and add a prefix with their count to the read\n\
47 <    name\n\
48 < -D  apply a low-complexity (dust) filter and discard any read that has at\n\
49 <    least half of it masked by this\n\
50 < -Q  quality values in the input data are interpreted as phred64, convert\n\
31 <    them to phred33\n\
47 >    name (e.g. for microRNAs)\n\
48 > -p  input is phred64/phred33 (use -p64 or -p33)\n\
49 > -Q  convert quality values to the other Phred qv type\n\
50 > -V  be verbose when done (print summary counts)\n\
51   "
52 < // example 3' adapter for miRNAs: TCGTATGCCGTCTTCTGCTTG
53 < FILE* f_out=NULL; //stdout if not provided
54 < FILE* f_in=NULL; //input fastq (stdin if not provided)
52 >
53 > //-z  for -o option, the output stream(s) will be first piped into the given\n
54 > //   <zcmd> command, which must output to stdout (e.g. -z 'bzip2 -9 -c')\n
55 >
56 >
57 > // example 3' adaptor for miRNAs: TCGTATGCCGTCTTCTGCTTG
58 >
59 > //For paired reads sequencing:
60 > //3' : ACACTCTTTCCCTACACGACGCTCTTCCGATCT
61 > //5' : GATCGGAAGAGCGGTTCAGCAGGAATGCCGAG
62 > //FILE* f_out=NULL; //stdout if not provided
63 > //FILE* f_out2=NULL; //for paired reads
64 > //FILE* f_in=NULL; //input fastq (stdin if not provided)
65 > //FILE* f_in2=NULL; //for paired reads
66 >
67   FILE* freport=NULL;
68 +
69   bool debug=false;
70 + bool verbose=false;
71   bool doCollapse=false;
72   bool doDust=false;
73 + bool doPolyTrim=true;
74 + bool fastaOutput=false;
75   bool trashReport=false;
76 < bool rawFormat=false;
76 > bool revCompl=false; //also reverse complement adaptor sequences
77   int min_read_len=16;
78 + double max_perc_N=7.0;
79   int dust_cutoff=16;
80   bool isfasta=false;
81 < bool phred64=false;
81 > bool convert_phred=false;
82 > GStr outsuffix; // -o
83   GStr prefix;
84 < GStr adapter3;
85 < GStr adapter5;
86 < int a3len=0;
87 < int a5len=0;
88 < // adaptor matching metrics -- for extendMatch() function
89 < static const int a_m_score=2; //match score
90 < static const int a_mis_score=-3; //mismatch
91 < static const int a_dropoff_score=7;
92 < static const int a_min_score=7;
84 > GStr zcmd;
85 > char isACGT[256];
86 >
87 > uint num_trimN=0; //reads trimmed by N%
88 > uint num_trimQ=0; //reads trimmed by qv threshold
89 > uint num_trimV=0; //reads trimmed by adapter match
90 > uint num_trimA=0; //reads trimmed by polyA
91 > uint num_trimT=0; //reads trimmed by polyT
92 > uint num_trim5=0; //number of reads trimmed at 5' end
93 > uint num_trim3=0; //number of reads trimmed at 3' end
94 >
95 > uint64 b_totalIn=0; //total number of input bases
96 > uint64 b_totalN=0;  //total number of undetermined bases found in input bases
97 >
98 > uint64 b_trimN=0; //total number of bases trimmed due to N-trim
99 > uint64 b_trimQ=0; //number of bases trimmed due to qv threshold
100 > uint64 b_trimV=0; //total number of bases trimmed due to adaptor matches
101 > uint64 b_trimA=0; //number of bases trimmed due to poly-A tails
102 > uint64 b_trimT=0; //number of bases trimmed due to poly-T tails
103 > uint64 b_trim5=0; //total bases trimmed on the 5' side
104 > uint64 b_trim3=0; //total bases trimmed on the 3' side
105 > //int min_trimmed5=INT_MAX;
106 > //int min_trimmed3=INT_MAX;
107 >
108 > int qvtrim_qmin=0;
109 > int qvtrim_max=0;  //for -q, do not trim at 3'-end more than this number of bases
110 > int qv_phredtype=0; // could be 64 or 33 (0 means undetermined yet)
111 > int qv_cvtadd=0; //could be -31 or +31
112 >
113 > // adaptor matching metrics -- for X-drop ungapped extension
114 > //const int match_reward=2;
115 > //const int mismatch_penalty=3;
116 > const int match_reward=2;
117 > const int mismatch_penalty=4;
118 > const int Xdrop=10;
119 >
120 > const int poly_m_score=2; //match score
121 > const int poly_mis_score=-3; //mismatch
122 > const int poly_dropoff_score=7;
123 > int poly_minScore=12; //i.e. an exact match of 6 bases at the proper ends WILL be trimmed
124 >
125 > const char *polyA_seed="AAAA";
126 > const char *polyT_seed="TTTT";
127 >
128 > struct CASeqData {
129 >   //positional data for every possible hexamer in an adaptor
130 >   GVec<uint16>* pz[4096]; //0-based coordinates of all possible hexamers in the adaptor sequence
131 >   GVec<uint16>* pzr[4096]; //0-based coordinates of all possible hexamers for the reverse complement of the adaptor sequence
132 >   GStr seq; //actual adaptor sequence data
133 >   GStr seqr; //reverse complement sequence
134 >   int amlen; //fraction of adaptor length matching that's
135 >              //enough to consider the alignment
136 >   GAlnTrimType trim_type;
137 >   bool use_reverse;
138 >   CASeqData(bool rev=false):seq(),seqr(),
139 >             amlen(0), use_reverse(rev) {
140 >     trim_type=galn_None; //should be updated later!
141 >     for (int i=0;i<4096;i++) {
142 >        pz[i]=NULL;
143 >        pzr[i]=NULL;
144 >        }
145 >     }
146 >
147 >   void update(const char* s) {
148 >         seq=s;
149 >         table6mers(seq.chars(), seq.length(), pz);
150 >         amlen=iround(double(seq.length())*0.8);
151 >         if (amlen<12)
152 >                amlen=12;
153 >         if (!use_reverse) return;
154 >         //reverse complement
155 >         seqr=s;
156 >         int slen=seq.length();
157 >         for (int i=0;i<slen;i++)
158 >           seqr[i]=ntComplement(seq[slen-i-1]);
159 >         table6mers(seqr.chars(), seqr.length(), pzr);
160 >   }
161 >
162 >   ~CASeqData() {
163 >     for (int i=0;i<4096;i++) {
164 >       delete pz[i];
165 >       delete pzr[i];
166 >     }
167 >   }
168 > };
169 >
170 > GVec<CASeqData> adaptors5;
171 > GVec<CASeqData> adaptors3;
172 >
173 > CGreedyAlignData* gxmem_l=NULL;
174 > CGreedyAlignData* gxmem_r=NULL;
175  
176   // element in dhash:
177   class FqDupRec {
# Line 91 | Line 210
210    if (!s.is_empty()) {
211        if (s=='-') f=stdout;
212        else {
213 <       f=fopen(s,"w");
213 >       f=fopen(s.chars(),"w");
214         if (f==NULL) GError("Error creating file: %s\n", s.chars());
215         }
216       }
# Line 102 | Line 221
221  
222   GHash<FqDupRec> dhash; //hash to keep track of duplicates
223  
224 < bool ntrim(GStr& rseq, GStr &rqv, int &l5, int &l3); //returns true if any trimming occured
225 <
224 > void addAdaptor(GVec<CASeqData>& adaptors, GStr& seq, GAlnTrimType trim_type);
225 > int loadAdaptors(const char* fname);
226 >
227 > void setupFiles(FILE*& f_in, FILE*& f_in2, FILE*& f_out, FILE*& f_out2,
228 >                       GStr& s, GStr& infname, GStr& infname2);
229 > // uses outsuffix to generate output file names and open file handles as needed
230 >
231 > void writeRead(FILE* f_out, GStr& rname, GStr& rinfo, GStr& rseq, GStr& rqv, int& outcounter);
232 > void trash_report(char trashcode, GStr& rname, FILE* freport);
233 >
234 > bool getFastxRec(GLineReader& fq, GStr& rseq, GStr& rqv,
235 >          GStr& rname, GStr& rinfo, GStr& infname);
236 >
237 > char process_read(GStr& rname, GStr& rseq, GStr& rqv, int &l5, int &l3);
238 > //returns 0 if the read was untouched, 1 if it was trimmed and a trash code if it was trashed
239 >
240 > bool ntrim(GStr& rseq, int &l5, int &l3); //returns true if any trimming occured
241 > bool qtrim(GStr& qvs, int &l5, int &l3); //return true if any trimming occured
242   int dust(GStr& seq);
243 < bool trim_adapter3(GStr& seq, int &l5, int &l3); //returns true if any trimming occured
244 < bool trim_adapter5(GStr& seq, int &l5, int &l3); //returns true if any trimming occured
243 > bool trim_poly5(GStr &seq, int &l5, int &l3, const char* poly_seed); //returns true if any trimming occured
244 > bool trim_poly3(GStr &seq, int &l5, int &l3, const char* poly_seed);
245 > bool trim_adaptor5(GStr& seq, int &l5, int &l3); //returns true if any trimming occured
246 > bool trim_adaptor3(GStr& seq, int &l5, int &l3);
247  
248 < void convertQ64(char* q, int len);
249 < void convertQ64(GStr& q);
248 > void convertPhred(char* q, int len);
249 > void convertPhred(GStr& q);
250  
251   int main(int argc, char * const argv[]) {
252 <  GArgs args(argc, argv, "YQRDCl:d:3:5:n:r:o:");
252 >  GArgs args(argc, argv, "YQDCRVAl:d:3:5:m:n:r:p:q:f:t:o:z:a:");
253    int e;
117  int icounter=0; //counter for input reads
118  int outcounter=0; //counter for output reads
254    if ((e=args.isError())>0) {
255        GMessage("%s\nInvalid argument: %s\n", USAGE, argv[e]);
256        exit(224);
257        }
258    debug=(args.getOpt('Y')!=NULL);
259 <  phred64=(args.getOpt('Q')!=NULL);
259 >  verbose=(args.getOpt('V')!=NULL);
260 >  convert_phred=(args.getOpt('Q')!=NULL);
261    doCollapse=(args.getOpt('C')!=NULL);
262    doDust=(args.getOpt('D')!=NULL);
263 +  revCompl=(args.getOpt('R')!=NULL);
264 +  if (args.getOpt('A')) doPolyTrim=false;
265 +  /*
266    rawFormat=(args.getOpt('R')!=NULL);
267    if (rawFormat) {
268      GError("Sorry, raw qseq format parsing is not implemented yet!\n");
269      }
270 +  */
271    prefix=args.getOpt('n');
272    GStr s=args.getOpt('l');
273 <  if (!s.is_empty())
273 >  if (!s.is_empty())
274       min_read_len=s.asInt();
275 +  s=args.getOpt('m');
276 +  if (!s.is_empty())
277 +     max_perc_N=s.asDouble();
278    s=args.getOpt('d');
279    if (!s.is_empty()) {
280       dust_cutoff=s.asInt();
281       doDust=true;
282       }
283 <    
284 <  if (args.getOpt('3')!=NULL) {
285 <    adapter3=args.getOpt('3');
286 <    adapter3.upper();
287 <    a3len=adapter3.length();
288 <    }
289 <  if (args.getOpt('5')!=NULL) {
290 <    adapter5=args.getOpt('5');
291 <    adapter5.upper();
292 <    a5len=adapter5.length();
283 >  s=args.getOpt('q');
284 >  if (!s.is_empty()) {
285 >     qvtrim_qmin=s.asInt();
286 >     }
287 >  s=args.getOpt('t');
288 >  if (!s.is_empty()) {
289 >     qvtrim_max=s.asInt();
290 >     }
291 >  s=args.getOpt('p');
292 >  if (!s.is_empty()) {
293 >     int v=s.asInt();
294 >     if (v==33) {
295 >        qv_phredtype=33;
296 >        qv_cvtadd=31;
297 >        }
298 >      else if (v==64) {
299 >        qv_phredtype=64;
300 >        qv_cvtadd=-31;
301 >        }
302 >       else
303 >         GMessage("%s\nInvalid value for -p option (can only be 64 or 33)!\n",USAGE);
304 >     }
305 >  memset((void*)isACGT, 0, 256);
306 >  isACGT['A']=isACGT['a']=isACGT['C']=isACGT['c']=1;
307 >  isACGT['G']=isACGT['g']=isACGT['T']=isACGT['t']=1;
308 >  s=args.getOpt('f');
309 >  if (!s.is_empty()) {
310 >   loadAdaptors(s.chars());
311 >   }
312 >  bool fileAdaptors=adaptors5.Count()+adaptors3.Count();
313 >  s=args.getOpt('5');
314 >  if (!s.is_empty()) {
315 >    if (fileAdaptors)
316 >      GError("Error: options -5 and -f cannot be used together!\n");
317 >    s.upper();
318 >    addAdaptor(adaptors5, s, galn_TrimLeft);
319      }
320 +  s=args.getOpt('3');
321 +  if (!s.is_empty()) {
322 +    if (fileAdaptors)
323 +      GError("Error: options -3 and -f cannot be used together!\n");
324 +      s.upper();
325 +      addAdaptor(adaptors3, s, galn_TrimRight);
326 +    }
327 +  s=args.getOpt('y');
328 +  if (!s.is_empty()) {
329 +     int minmatch=s.asInt();
330 +     poly_minScore=minmatch*poly_m_score;
331 +     }
332 +
333 +  if (args.getOpt('o')!=NULL) outsuffix=args.getOpt('o');
334 +                         else outsuffix="-";
335    trashReport=  (args.getOpt('r')!=NULL);
336 <  if (args.startNonOpt()==0) {
336 >  int fcount=args.startNonOpt();
337 >  if (fcount==0) {
338      GMessage(USAGE);
339      exit(224);
340      }
341 <
342 <  openfw(f_out, args, 'o');
343 <  if (f_out==NULL) f_out=stdout;
341 >   if (fcount>1 && doCollapse) {
342 >    GError("%s Sorry, the -C option only works with a single input.\n", USAGE);
343 >    }
344 >  //openfw(f_out, args, 'o');
345 >  //if (f_out==NULL) f_out=stdout;
346    if (trashReport)
347      openfw(freport, args, 'r');
348    char* infile=NULL;
349 +
350 +  if (adaptors5.Count()>0)
351 +    //gxmem_l=new CGreedyAlignData(match_reward, mismatch_penalty, Xdrop-2);
352 +        gxmem_l=new CGreedyAlignData(match_reward, mismatch_penalty, Xdrop);
353 +  if (adaptors3.Count()>0)
354 +    gxmem_r=new CGreedyAlignData(match_reward, mismatch_penalty, Xdrop);
355 +
356    while ((infile=args.nextNonOpt())!=NULL) {
357 <    GStr infname(infile);
358 <    if (strcmp(infile,"-")==0) {
359 <       f_in=stdin; infname="stdin"; }
360 <     else {
361 <        f_in=fopen(infile,"r");
362 <        if (f_in==NULL)
363 <            GError("Cannot open input file %s!\n",infile);
364 <        }
365 <     GLineReader fq(f_in);
366 <     char* l=NULL;
367 <     while ((l=fq.getLine())!=NULL) {
368 <        GStr rname; //current read name
369 <        GStr rseq;  //current read sequence
370 <        GStr rqv;   //current read quality values
371 <        GStr s;
372 <        if (rawFormat) {
373 <          //TODO: implement qseq parsing here
374 <          //if (raw type=N) then continue; //skip invalid/bad records
375 <          
376 <          } //raw qseq format
377 <         else { // FASTQ or FASTA
378 <          isfasta=(l[0]=='>');
379 <          if (!isfasta && l[0]!='@') GError("Error: fastq record marker not detected!\n");
380 <          s=l;
381 <          rname=&(l[1]);
382 <          icounter++;
383 <          //GMessage("readname=%s\n",rname.chars());
384 <          for (int i=0;i<rname.length();i++)
385 <            if (rname[i]<=' ') { rname.cut(i); break; }
386 <          //now get the sequence
387 <          if ((l=fq.getLine())==NULL)
388 <              GError("Error: unexpected EOF after header for %s\n",rname.chars());
389 <          rseq=l; //this must be the DNA line
390 <          if (isfasta) {
391 <            while ((l=fq.getLine())!=NULL) {
392 <              //fasta can have multiple sequence lines
393 <              if (l[0]=='>') {
394 <                   fq.pushBack();
395 <                   break; //
357 >    //for each input file
358 >    int incounter=0; //counter for input reads
359 >    int outcounter=0; //counter for output reads
360 >    int trash_s=0; //too short from the get go
361 >    int trash_Q=0;
362 >    int trash_N=0;
363 >    int trash_D=0;
364 >    int trash_poly=0;
365 >    int trash_V=0;
366 >    num_trimN=0;
367 >    num_trimQ=0;
368 >    num_trimV=0;
369 >    num_trimA=0;
370 >    num_trimT=0;
371 >    num_trim5=0;
372 >    num_trim3=0;
373 >
374 >    b_totalIn=0;
375 >    b_totalN=0;
376 >    b_trimN=0;
377 >    b_trimQ=0;
378 >    b_trimV=0;
379 >    b_trimA=0;
380 >    b_trimT=0;
381 >    b_trim5=0;
382 >    b_trim3=0;
383 >
384 >    s=infile;
385 >    GStr infname;
386 >    GStr infname2;
387 >    FILE* f_in=NULL;
388 >    FILE* f_in2=NULL;
389 >    FILE* f_out=NULL;
390 >    FILE* f_out2=NULL;
391 >    bool paired_reads=false;
392 >    setupFiles(f_in, f_in2, f_out, f_out2, s, infname, infname2);
393 >    GLineReader fq(f_in);
394 >    GLineReader* fq2=NULL;
395 >    if (f_in2!=NULL) {
396 >       fq2=new GLineReader(f_in2);
397 >       paired_reads=true;
398 >       }
399 >    GStr rseq, rqv, seqid, seqinfo;
400 >    GStr rseq2, rqv2, seqid2, seqinfo2;
401 >    while (getFastxRec(fq, rseq, rqv, seqid, seqinfo, infname)) {
402 >       incounter++;
403 >       int a5=0, a3=0, b5=0, b3=0;
404 >       char tcode=0, tcode2=0;
405 >       tcode=process_read(seqid, rseq, rqv, a5, a3);
406 >       if (a5>0) {
407 >          b_trim5+=a5;
408 >          num_trim5++;
409 >          }
410 >       if (a3<rseq.length()-1) {
411 >          b_trim3+=rseq.length()-a3-1;
412 >          num_trim3++;
413 >          }
414 >       if (fq2!=NULL) {
415 >            getFastxRec(*fq2, rseq2, rqv2, seqid2, seqinfo2, infname2);
416 >            if (seqid.substr(0,seqid.length()-1)!=seqid2.substr(0,seqid2.length()-1)) {
417 >               GError("Error: no paired match for read %s vs %s (%s,%s)\n",
418 >                  seqid.chars(), seqid2.chars(), infname.chars(), infname2.chars());
419 >               }
420 >            tcode2=process_read(seqid2, rseq2, rqv2, b5, b3);
421 >            if (b5>0) {
422 >               b_trim5+=b5;
423 >               num_trim5++;
424 >               }
425 >            if (b3<rseq2.length()-1) {
426 >               b_trim3+=rseq2.length()-b3-1;
427 >               num_trim3++;
428 >               }
429 >            //decide what to do with this pair and print rseq2 if the pair makes it
430 >            if (tcode>1 && tcode2<=1) {
431 >               //"untrash" rseq
432 >               if (a3-a5+1<min_read_len) {
433 >                   a5=1;
434 >                   if (a3<min_read_len) { a3= GMIN(rseq.length()-1, min_read_len+1); }
435                     }
436 <              rseq+=l;
437 <              }
438 <            } //multi-line fasta file
439 <          if (!isfasta) {
440 <            if ((l=fq.getLine())==NULL)
441 <                GError("Error: unexpected EOF after sequence for %s\n", rname.chars());
442 <            if (l[0]!='+') GError("Error: fastq qv header marker not detected!\n");
443 <            if ((l=fq.getLine())==NULL)
444 <                GError("Error: unexpected EOF after qv header for %s\n", rname.chars());
445 <            rqv=l;
446 <            if (rqv.length()!=rseq.length())
447 <                GError("Error: qv len != seq len for %s\n", rname.chars());
448 <            }
449 <        } //<-- FASTA or FASTQ
217 <        rseq.upper();
218 <        int l5=0;
219 <        int l3=rseq.length()-1;
220 <        if (l3-l5+1<min_read_len) {
221 <           if (trashReport) {
222 <                  fprintf(freport, "%s\ts\t%s\n",rname.chars(), rseq.chars());
223 <                  }
224 <           continue;
225 <           }
226 <        if (ntrim(rseq, rqv, l5, l3)) {
227 <           if (l3-l5+1<min_read_len) {
228 <             if (trashReport) {
229 <                    fprintf(freport, "%s\tN\t%s\n", rname.chars(), rseq.chars());
230 <                    }
231 <             continue; //invalid read
232 <             }
233 <            //-- keep only the l5..l3 range
234 <           rseq=rseq.substr(l5, l3-l5+1);
235 <           if (!rqv.is_empty())
236 <              rqv=rqv.substr(l5, l3-l5+1);
237 <           }
238 <          
239 <        if (a3len>0) {
240 <          if (trim_adapter3(rseq, l5, l3)) {
241 <             if (l3-l5+1<min_read_len) {
242 <                 if (trashReport) {
243 <                     fprintf(freport, "%s\tA3\t%s\n",rname.chars(), rseq.chars());
244 <                     }
245 <                 continue;
246 <                 }
247 <              //-- keep only the l5..l3 range
248 <              rseq=rseq.substr(l5, l3-l5+1);
249 <              if (!rqv.is_empty())
250 <                 rqv=rqv.substr(l5, l3-l5+1);
251 <              }//some adapter was trimmed
252 <           } //adapter trimming
253 <        if (a5len>0) {
254 <          if (trim_adapter5(rseq, l5, l3)) {
255 <             if (l3-l5+1<min_read_len) {
256 <                 if (trashReport) {
257 <                     fprintf(freport, "%s\tA5\t%s\n",rname.chars(), rseq.chars());
258 <                     }
259 <                 continue;
436 >               tcode=1;
437 >               }
438 >             else if (tcode<=1 && tcode2>1) {
439 >               //"untrash" rseq2
440 >               if (b3-b5+1<min_read_len) {
441 >                   b5=1;
442 >                   if (b3<min_read_len) { b3= GMIN((rseq2.length()-1),(min_read_len+1)); }
443 >                   }
444 >               tcode2=1;
445 >               }
446 >            if (tcode<=1) { //trimmed or left intact -- write it!
447 >               if (tcode>0) {
448 >                 rseq2=rseq2.substr(b5,b3-b5+1);
449 >                 if (!rqv2.is_empty()) rqv2=rqv2.substr(b5,b3-b5+1);
450                   }
451 <              //-- keep only the l5..l3 range
452 <              rseq=rseq.substr(l5, l3-l5+1);
453 <              if (!rqv.is_empty())
454 <                 rqv=rqv.substr(l5, l3-l5+1);
455 <              }//some adapter was trimmed
456 <           } //adapter trimming
457 <        if (doCollapse) {
458 <           //keep read for later
459 <           FqDupRec* dr=dhash.Find(rseq.chars());
460 <           if (dr==NULL) { //new entry
461 <                  //if (prefix.is_empty())
462 <                     dhash.Add(rseq.chars(),
463 <                          new FqDupRec(&rqv, rname.chars()));
464 <                  //else dhash.Add(rseq.chars(), new FqDupRec(rqv.chars(),rqv.length()));
451 >               int nocounter=0;
452 >               writeRead(f_out2, seqid2, seqinfo2, rseq2, rqv2, nocounter);
453 >               }
454 >            } //pair read
455 >       if (tcode>1) { //trashed
456 >         #ifdef GDEBUG
457 >         GMessage(" !!!!TRASH code = %c\n",tcode);
458 >         #endif
459 >          if (tcode=='s') trash_s++;
460 >          else if (tcode=='A' || tcode=='T') trash_poly++;
461 >            else if (tcode=='Q') trash_Q++;
462 >              else if (tcode=='N') trash_N++;
463 >               else if (tcode=='D') trash_D++;
464 >                else if (tcode=='V') trash_V++;
465 >                 //else if (tcode=='5') trash_A5++;
466 >          if (trashReport) trash_report(tcode, seqid, freport);
467 >          }
468 >         else if (!doCollapse) { //write it
469 >          if (tcode>0) {
470 >            rseq=rseq.substr(a5,a3-a5+1);
471 >            if (!rqv.is_empty()) rqv=rqv.substr(a5,a3-a5+1);
472 >            }
473 >         #ifdef GDEBUG
474 >            GMessage("  After trimming:\n");
475 >            GMessage("%s\n",rseq.chars());
476 >         #endif
477 >          writeRead(f_out, seqid, seqinfo, rseq, rqv, outcounter);
478 >          }
479 >       } //for each fastq record
480 >    delete fq2;
481 >    FRCLOSE(f_in);
482 >    FRCLOSE(f_in2);
483 >    if (doCollapse) {
484 >       outcounter=0;
485 >       int maxdup_count=1;
486 >       char* maxdup_seq=NULL;
487 >       dhash.startIterate();
488 >       FqDupRec* qd=NULL;
489 >       char* seq=NULL;
490 >       while ((qd=dhash.NextData(seq))!=NULL) {
491 >         GStr rseq(seq);
492 >         //do the dusting here
493 >         if (doDust) {
494 >            int dustbases=dust(rseq);
495 >            if (dustbases>(rseq.length()>>1)) {
496 >               if (trashReport && qd->firstname!=NULL) {
497 >                 fprintf(freport, "%s_x%d\tD\n",qd->firstname, qd->count);
498                   }
499 <              else    
500 <                 dr->add(rqv);
501 <           } //collapsing duplicates
502 <         else { //not collapsing duplicates
503 <           //do the dust filter now
504 <           if (doDust) {
505 <             int dustbases=dust(rseq);
506 <             if (dustbases>(rseq.length()>>1)) {
507 <                if (trashReport) {
508 <                  fprintf(freport, "%s\tD\t%s\n",rname.chars(),rseq.chars());
509 <                  }
510 <                continue;
511 <                }
289 <             }
290 <           //print this record here  
291 <           outcounter++;
292 <           if (isfasta) {
293 <            if (prefix.is_empty())
294 <               fprintf(f_out, ">%s\n%s\n", rname.chars(), rseq.chars());
295 <              else
296 <               fprintf(f_out, ">%s%08d\n%s\n", prefix.chars(), outcounter,
297 <                                  rseq.chars());
499 >               trash_D+=qd->count;
500 >               continue;
501 >               }
502 >            }
503 >         outcounter++;
504 >         if (qd->count>maxdup_count) {
505 >            maxdup_count=qd->count;
506 >            maxdup_seq=seq;
507 >            }
508 >         if (isfasta) {
509 >           if (prefix.is_empty()) {
510 >             fprintf(f_out, ">%s_x%d\n%s\n", qd->firstname, qd->count,
511 >                           rseq.chars());
512               }
513 <           else {  //fastq
514 <            if (phred64) convertQ64(rqv);
515 <            if (prefix.is_empty())
302 <               fprintf(f_out, "@%s\n%s\n+\n%s\n", rname.chars(), rseq.chars(),rqv.chars());
303 <              else
304 <               fprintf(f_out, "@%s_%08d\n%s\n+\n%s\n", prefix.chars(), outcounter,
305 <                                  rseq.chars(),rqv.chars() );
513 >           else { //use custom read name
514 >             fprintf(f_out, ">%s%08d_x%d\n%s\n", prefix.chars(), outcounter,
515 >                        qd->count, rseq.chars());
516               }
517 <           } //not collapsing duplicates
518 <        } //for each fastq record
519 <   } //while each input file
520 < FRCLOSE(f_in);
521 < if (doCollapse) {
522 <    outcounter=0;
313 <    int maxdup_count=1;
314 <    char* maxdup_seq=NULL;
315 <    dhash.startIterate();
316 <    FqDupRec* qd=NULL;
317 <    char* seq=NULL;
318 <    while ((qd=dhash.NextData(seq))!=NULL) {
319 <      GStr rseq(seq);
320 <      //do the dusting here
321 <      if (doDust) {
322 <         int dustbases=dust(rseq);
323 <         if (dustbases>(rseq.length()>>1)) {
324 <            if (trashReport && qd->firstname!=NULL) {
325 <              fprintf(freport, "%s_x%d\tD\t%s\n",qd->firstname, qd->count,seq);
326 <              }
327 <            continue;
517 >           }
518 >         else { //fastq format
519 >          if (convert_phred) convertPhred(qd->qv, qd->len);
520 >          if (prefix.is_empty()) {
521 >            fprintf(f_out, "@%s_x%d\n%s\n+\n%s\n", qd->firstname, qd->count,
522 >                           rseq.chars(), qd->qv);
523              }
524 +          else { //use custom read name
525 +            fprintf(f_out, "@%s%08d_x%d\n%s\n+\n%s\n", prefix.chars(), outcounter,
526 +                        qd->count, rseq.chars(), qd->qv);
527 +            }
528 +           }
529 +         }//for each element of dhash
530 +       if (maxdup_count>1) {
531 +         GMessage("Maximum read multiplicity: x %d (read: %s)\n",maxdup_count, maxdup_seq);
532           }
533 <      outcounter++;
534 <      if (qd->count>maxdup_count) {
535 <         maxdup_count=qd->count;
536 <         maxdup_seq=seq;
537 <         }
538 <      if (isfasta) {
539 <        if (prefix.is_empty()) {
540 <          fprintf(f_out, ">%s_x%d\n%s\n", qd->firstname, qd->count,
541 <                        rseq.chars());
542 <          }
543 <        else { //use custom read name
544 <          fprintf(f_out, ">%s%08d_x%d\n%s\n", prefix.chars(), outcounter,
545 <                     qd->count, rseq.chars());
533 >       } //collapse entries
534 >    if (verbose) {
535 >       if (paired_reads) {
536 >           GMessage(">Input files : %s , %s\n", infname.chars(), infname2.chars());
537 >           GMessage("Number of input pairs :%9u\n", incounter);
538 >           GMessage("         Output pairs :%9u\t(%u trashed)\n", outcounter, incounter-outcounter);
539 >           }
540 >         else {
541 >           GMessage(">Input file : %s\n", infname.chars());
542 >           GMessage("Number of input reads :%9d\n", incounter);
543 >           GMessage("         Output reads :%9d  (%u trashed)\n", outcounter, incounter-outcounter);
544 >           }
545 >       GMessage("--------------- Read trimming: ------------------\n");
546 >       if (num_trim5)
547 >          GMessage("           5' trimmed :%9u\n", num_trim5);
548 >       if (num_trim3)
549 >          GMessage("           3' trimmed :%9u\n", num_trim3);
550 >       if (num_trimQ)
551 >          GMessage("         q.v. trimmed :%9u\n", num_trimQ);
552 >       if (num_trimN)
553 >          GMessage("            N trimmed :%9u\n", num_trimN);
554 >       if (num_trimT)
555 >          GMessage("       poly-T trimmed :%9u\n", num_trimT);
556 >       if (num_trimA)
557 >          GMessage("       poly-A trimmed :%9u\n", num_trimA);
558 >       if (num_trimV)
559 >          GMessage("      Adapter trimmed :%9u\n", num_trimV);
560 >       GMessage("------------------------------------\n");
561 >       if (trash_s>0)
562 >         GMessage("Trashed by initial len:%9d\n", trash_s);
563 >       if (trash_N>0)
564 >         GMessage("         Trashed by N%%:%9d\n", trash_N);
565 >       if (trash_Q>0)
566 >         GMessage("Trashed by low quality:%9d\n", trash_Q);
567 >       if (trash_poly>0)
568 >         GMessage("   Trashed by poly-A/T:%9d\n", trash_poly);
569 >       if (trash_V>0)
570 >         GMessage("    Trashed by adaptor:%9d\n", trash_V);
571 >       GMessage("\n--------------- Base counts  --------------------\n");
572 >       GMessage("    Input bases   :%12llu\n", b_totalIn);
573 >       double percN=100.0* ((double)b_totalN/(double)b_totalIn);
574 >       GMessage("          N bases :%12llu (%4.2f%%)\n", b_totalN, percN);
575 >       GMessage("   trimmed from 5':%12llu\n", b_trim5);
576 >       GMessage("   trimmed from 3':%12llu\n", b_trim3);
577 >       GMessage("\n");
578 >       if (b_trimQ)
579 >       GMessage("     q.v. trimmed :%12llu\n", b_trimQ);
580 >       if (b_trimN)
581 >       GMessage("        N trimmed :%12llu\n", b_trimN);
582 >       if (b_trimT)
583 >       GMessage("   poly-T trimmed :%12llu\n", b_trimT);
584 >       if (b_trimA)
585 >       GMessage("   poly-A trimmed :%12llu\n", b_trimA);
586 >       if (b_trimV)
587 >       GMessage("  Adapter trimmed :%12llu\n", b_trimV);
588 >
589 >       }
590 >    if (trashReport) {
591 >          FWCLOSE(freport);
592            }
593 <        }
594 <      else { //fastq format
595 <       if (phred64) convertQ64(qd->qv, qd->len);
596 <       if (prefix.is_empty()) {
597 <         fprintf(f_out, "@%s_x%d\n%s\n+\n%s\n", qd->firstname, qd->count,
598 <                        rseq.chars(), qd->qv);
350 <         }
351 <       else { //use custom read name
352 <         fprintf(f_out, "@%s%08d_x%d\n%s\n+\n%s\n", prefix.chars(), outcounter,
353 <                     qd->count, rseq.chars(), qd->qv);
354 <         }
355 <        }
356 <      }//for each element of dhash
357 <    if (maxdup_count>1) {
358 <      GMessage("Maximum read multiplicity: x %d (read: %s)\n",maxdup_count, maxdup_seq);
359 <      }
360 <   } //report collapsed dhash entries
361 < GMessage("Number of input reads: %9d\n", icounter);
362 < GMessage("       Output records: %9d\n", outcounter);
363 < if (trashReport) {
364 <    FWCLOSE(freport);
365 <    }
366 <
367 < FWCLOSE(f_out);
593 >    FWCLOSE(f_out);
594 >    FWCLOSE(f_out2);
595 >   } //while each input file
596 > delete gxmem_l;
597 > delete gxmem_r;
598 > //getc(stdin);
599   }
600  
601   class NData {
# Line 378 | Line 609
609     const char* seq;
610     bool valid;
611     NData() {
612 +    seqlen=0;
613      NCount=0;
614      end5=0;
615      end3=0;
# Line 393 | Line 625
625       end5=1;
626       end3=seqlen;
627       for (int i=0;i<seqlen;i++)
628 <        if (rseq[i]=='N') {// if (!ichrInStr(rseq[i], "ACGT")
628 >        if (seq[i]=='N') {// if (!ichrInStr(rseq[i], "ACGT")
629             NPos[NCount]=i;
630             NCount++;
631             }
# Line 408 | Line 640
640       perc_N=(n*100.0)/(end5-end3+1);
641       }
642   };
643 <
643 >
644   static NData feat;
645   int perc_lenN=12; // incremental distance from ends, in percentage of
646            // sequence length, where N-trimming is done (default:12 %) (autolimited to 20)
647 <          
647 >
648   void N_analyze(int l5, int l3, int p5, int p3) {
649   /* assumes feat was filled properly */
650   int old_dif, t5,t3,v;
651 < if (l3-l5<min_read_len || p5>p3 ) {
651 > if (l3<l5+2 || p5>p3 ) {
652     feat.end5=l5+1;
653     feat.end3=l3+1;
654 <   return;
654 >   return;
655     }
656  
657   t5=feat.NPos[p5]-l5;
658   t3=l3-feat.NPos[p3];
659   old_dif=p3-p5;
660   v=(int)((((double)(l3-l5))*perc_lenN)/100);
661 < if (v>20) v=20; /* enforce N-search limit for very long reads */
661 > if (v>20) v=20; /* enforce N-search limit for very long reads */
662   if (t5 < v ) {
663     l5=feat.NPos[p5]+1;
664     p5++;
# Line 443 | Line 675
675             feat.end3=l3+1;
676             return;
677             }
678 <    else
678 >    else
679        N_analyze(l5,l3, p5,p3);
680   }
681  
682  
683 < bool ntrim(GStr& rseq, GStr &rqv, int &l5, int &l3) {
684 < //count Ns in the sequence
683 > bool qtrim(GStr& qvs, int &l5, int &l3) {
684 > if (qvtrim_qmin==0 || qvs.is_empty()) return false;
685 > if (qv_phredtype==0) {
686 >  //try to guess the Phred type
687 >  int vmin=256, vmax=0;
688 >  for (int i=0;i<qvs.length();i++) {
689 >     if (vmin>qvs[i]) vmin=qvs[i];
690 >     if (vmax<qvs[i]) vmax=qvs[i];
691 >     }
692 >  if (vmin<64) { qv_phredtype=33; qv_cvtadd=31; }
693 >  if (vmax>95) { qv_phredtype=64; qv_cvtadd=-31; }
694 >  if (qv_phredtype==0) {
695 >    GError("Error: couldn't determine Phred type, please use the -p33 or -p64 !\n");
696 >    }
697 >  if (verbose)
698 >    GMessage("Input reads have Phred-%d quality values.\n", (qv_phredtype==33 ? 33 : 64));
699 >  } //guessing Phred type
700 > for (l3=qvs.length()-1;l3>2;l3--) {
701 >  if (qvs[l3]-qv_phredtype>=qvtrim_qmin && qvs[l3-1]-qv_phredtype>=qvtrim_qmin) break;
702 >  }
703 > //just in case, check also the 5' the end (?)
704 > for (l5=0;l5<qvs.length()-3;l5++) {
705 >  if (qvs[l5]-qv_phredtype>=qvtrim_qmin && qvs[l5+1]-qv_phredtype>=qvtrim_qmin) break;
706 >  }
707 > if (qvtrim_max>0) {
708 >  if (qvs.length()-1-l3>qvtrim_max) l3=qvs.length()-1-qvtrim_max;
709 >  if (l5>qvtrim_max) l5=qvtrim_max;
710 >  }
711 > return (l5>0 || l3<qvs.length()-1);
712 > }
713 >
714 > bool ntrim(GStr& rseq, int &l5, int &l3) {
715 > //count Ns in the sequence, trim N-rich ends
716   feat.init(rseq);
717   l5=feat.end5-1;
718   l3=feat.end3-1;
719 < N_analyze(feat.end5-1, feat.end3-1, 0, feat.NCount-1);
720 < if (l5==feat.end5-1 && l3==feat.end3-1)
721 <    return false; //nothing changed
719 > N_analyze(feat.end5-1, feat.end3-1, 0, feat.NCount-1);
720 > if (l5==feat.end5-1 && l3==feat.end3-1) {
721 >    if (feat.perc_N>max_perc_N) {
722 >           feat.valid=false;
723 >           l3=l5+1;
724 >           return true;
725 >           }
726 >      else {
727 >       return false; //nothing changed
728 >       }
729 >    }
730   l5=feat.end5-1;
731   l3=feat.end3-1;
732   if (l3-l5+1<min_read_len) {
# Line 463 | Line 734
734     return true;
735     }
736   feat.N_calc();
737 < if (feat.perc_N>6.2) { //not valid if more than 1 N per 16 bases
737 >
738 > if (feat.perc_N>max_perc_N) {
739        feat.valid=false;
740        l3=l5+1;
741        return true;
# Line 474 | Line 746
746   //--------------- dust functions ----------------
747   class DNADuster {
748   public:
749 <  int dustword;
750 <  int dustwindow;
751 <  int dustwindow2;
749 >  int dustword;
750 >  int dustwindow;
751 >  int dustwindow2;
752    int dustcutoff;
753    int mv, iv, jv;
754    int counts[32*32*32];
# Line 571 | Line 843
843                      }
844             }
845           }
846 < //return first;
846 > //return first;
847   }
848   };
849  
# Line 589 | Line 861
861   return ncount;
862   }
863  
864 < // ------------------ adapter matching - triplet matching
865 < //look for matching triplets amongst the first 9 nucleotides of the 3' adaptor
866 < // or the last 9 nucleotides for the 5' adapter
867 < //when a triplet match is found, simply try to extend the alignment using a drop-off scheme
868 < // check minimum score and
869 < // for 3' adapter trimming:
870 < //     require that the right end of the alignment for either the adaptor OR the read must be
871 < //     < 3 distance from its right end
872 < // for 5' adapter trimming:
873 < //     require that the left end of the alignment for either the adaptor OR the read must
874 < //     be at coordinate 0
875 <
876 < bool extendMatch(const char* a, int alen, int ai,
877 <                 const char* b, int blen, int bi, int mlen, int& l5, int& l3, bool end5=false) {
878 < //so the alignment starts at ai in a, bi in b, with a perfect match of length mlen
879 < //if (debug)
608 < //  GStr dbg(b);
609 < //GMessage(">> in %s\n\textending hit: %s at position %d\n", a, (dbg.substr(ai, mlen)).chars(), ai);
610 < int a_l=ai; //alignment coordinates on a
611 < int a_r=ai+mlen-1;
612 < int b_l=bi; //alignment coordinates on b
613 < int b_r=bi+mlen-1;
614 < int ai_maxscore=ai;
615 < int bi_maxscore=bi;
616 < int score=mlen*a_m_score;
617 < int maxscore=score;
618 < int mism5score=a_mis_score;
619 < if (end5 && ai<(alen>>1)) mism5score-=2; // increase penalty for mismatches at 5' end
620 < //try to extend to the left first, if possible
621 < while (ai>0 && bi>0) {
622 <   ai--;
623 <   bi--;
624 <   score+= (a[ai]==b[bi])? a_m_score : mism5score;
625 <   if (score>maxscore) {
626 <       ai_maxscore=ai;
627 <       bi_maxscore=bi;
628 <       maxscore=score;
629 <       }
630 <     else if (maxscore-score>a_dropoff_score) break;
631 <   }
632 < a_l=ai_maxscore;
633 < b_l=bi_maxscore;
634 < //if (debug) GMessage("  after l-extend: %*s%s\t\t(score=%d)\n",a_l," ",dbg.substr(b_l,b_r-b_l+1).chars(),maxscore);
635 < //now extend to the right
636 < ai_maxscore=a_r;
637 < bi_maxscore=b_r;
638 < ai=a_r;
639 < bi=b_r;
640 < score=maxscore;
641 < //sometimes there are extra AAAAs at the end of the read, ignore those
642 < if (strcmp(&a[alen-4],"AAAA")==0) {
643 <    alen-=3;
644 <    while (a[alen-1]=='A' && alen>ai) alen--;
645 <    }
646 < while (ai<alen-1 && bi<blen-1) {
647 <   ai++;
648 <   bi++;
649 <   //score+= (a[ai]==b[bi])? a_m_score : a_mis_score;
650 <   if (a[ai]==b[bi]) { //match
651 <      score+=a_m_score;
652 <      if (ai>=alen-2) {
653 <           score+=a_m_score-(alen-ai-1);
654 <           }
655 <      }
656 <    else { //mismatch
657 <      score+=a_mis_score;
658 <      }  
659 <   if (score>maxscore) {
660 <       ai_maxscore=ai;
661 <       bi_maxscore=bi;
662 <       maxscore=score;
663 <       }
664 <     else if (maxscore-score>a_dropoff_score) break;
665 <   }
666 <  a_r=ai_maxscore;
667 <  b_r=bi_maxscore;
668 <  int a_ovh3=alen-a_r-1;
669 <  int b_ovh3=blen-b_r-1;
670 <  int mmovh3=(a_ovh3<b_ovh3)? a_ovh3 : b_ovh3;
671 <  int mmovh5=(a_l<b_l)? a_l : b_l;
672 <  //if (debug) GMessage("  after r-extend: %*s%s\t\t(score=%d)\n",a_l," ",dbg.substr(b_l,b_r-b_l+1).chars(),maxscore);
673 <  if (maxscore>=a_min_score && mmovh3<2 && mmovh5<2) {
674 <     if (a_l<a_ovh3) {
675 <        //adapter closer to the left end (typical for 5' adapter)
676 <        l5=a_r+1;
677 <        l3=alen-1;
678 <        }
679 <      else {
680 <        //adapter matching at the right end (typical for 3' adapter)
681 <        l5=0;
682 <        l3=a_l-1;
683 <        }
684 <     return true;
685 <     }
686 <  //do not trim:
687 <  l5=0;
688 <  l3=alen-1;
689 <  return false;
690 < }
864 > struct SLocScore {
865 >  int pos;
866 >  int score;
867 >  SLocScore(int p=0,int s=0) {
868 >    pos=p;
869 >    score=s;
870 >    }
871 >  void set(int p, int s) {
872 >    pos=p;
873 >    score=s;
874 >    }
875 >  void add(int p, int add) {
876 >    pos=p;
877 >    score+=add;
878 >    }
879 > };
880  
881 < bool trim_adapter3(GStr& seq, int&l5, int &l3) {
881 > bool trim_poly3(GStr &seq, int &l5, int &l3, const char* poly_seed) {
882 > if (!doPolyTrim) return false;
883   int rlen=seq.length();
884   l5=0;
885   l3=rlen-1;
886 < //first try a full match, we might get lucky
887 < int fi=-1;
888 < if ((fi=seq.index(adapter3))>=0) {
889 <   if (fi<rlen-fi-a3len) {//match is closer to the right end
890 <      l5=fi+a3len;
891 <      l3=rlen-1;
892 <      }
893 <    else {
894 <      l5=0;
895 <      l3=fi-1;
886 > int32 seedVal=*(int32*)poly_seed;
887 > char polyChar=poly_seed[0];
888 > //assumes N trimming was already done
889 > //so a poly match should be very close to the end of the read
890 > // -- find the initial match (seed)
891 > int lmin=GMAX((rlen-16), 0);
892 > int li;
893 > for (li=rlen-4;li>lmin;li--) {
894 >   if (seedVal==*(int*)&(seq[li])) {
895 >      break;
896        }
707   return true;
897     }
898 < //also, for fast detection of other adapter-only reads that start past
899 < // the beginning of the adapter sequence, try to see if the first a3len-4
900 < // characters of the read are a substring of the adapter
901 < GStr rstart=seq.substr(0,a3len-4);
902 < if ((fi=adapter3.index(rstart))>=0) {
903 <   l3=rlen-1;
904 <   l5=a3len-4;
905 <   while (fi+l5<a3len && l5<l3 && adapter3[fi+l5]==seq[l5]) l5++;
906 <   return true;
898 > if (li<=lmin) return false;
899 > //seed found, try to extend it both ways
900 > //extend right
901 > int ri=li+3;
902 > SLocScore loc(ri, poly_m_score<<2);
903 > SLocScore maxloc(loc);
904 > //extend right
905 > while (ri<rlen-1) {
906 >   ri++;
907 >   if (seq[ri]==polyChar) {
908 >                loc.add(ri,poly_m_score);
909 >                }
910 >   else if (seq[ri]=='N') {
911 >                loc.add(ri,0);
912 >                }
913 >   else { //mismatch
914 >        loc.add(ri,poly_mis_score);
915 >        if (maxloc.score-loc.score>poly_dropoff_score) break;
916 >        }
917 >   if (maxloc.score<=loc.score) {
918 >      maxloc=loc;
919 >      }
920     }
921 <  
922 < //another easy case: first half of the adaptor matches
923 < int a3hlen=a3len>>1;
924 < GStr ahalf=adapter3.substr(0, a3hlen);
925 < if ((fi=seq.index(ahalf))>=0) {
926 <   extendMatch(seq.chars(), rlen, fi,
927 <                 adapter3.chars(), a3len, 0,  a3hlen, l5,l3);
928 <   return true;
921 > ri=maxloc.pos;
922 > if (ri<rlen-6) return false; //no trimming wanted, too far from 3' end
923 > //ri = right boundary for the poly match
924 > //extend left
925 > loc.set(li, maxloc.score);
926 > maxloc.pos=li;
927 > while (li>0) {
928 >    li--;
929 >    if (seq[li]==polyChar) {
930 >                 loc.add(li,poly_m_score);
931 >                 }
932 >    else if (seq[li]=='N') {
933 >                 loc.add(li,0);
934 >                 }
935 >    else { //mismatch
936 >         loc.add(li,poly_mis_score);
937 >         if (maxloc.score-loc.score>poly_dropoff_score) break;
938 >         }
939 >    if (maxloc.score<=loc.score) {
940 >       maxloc=loc;
941 >       }
942 >    }
943 > li=maxloc.pos;
944 > if ((maxloc.score==poly_minScore && ri==rlen-1) ||
945 >    (maxloc.score>poly_minScore && ri>=rlen-3) ||
946 >    (maxloc.score>(poly_minScore*3) && ri>=rlen-8)) {
947 >  //trimming this li-ri match at 3' end
948 >    l3=li-1;
949 >    if (l3<0) l3=0;
950 >    return true;
951 >    }
952 > return false;
953 > }
954 >
955 > bool trim_poly5(GStr &seq, int &l5, int &l3, const char* poly_seed) {
956 > if (!doPolyTrim) return false;
957 > int rlen=seq.length();
958 > l5=0;
959 > l3=rlen-1;
960 > int32 seedVal=*(int32*)poly_seed;
961 > char polyChar=poly_seed[0];
962 > //assumes N trimming was already done
963 > //so a poly match should be very close to the end of the read
964 > // -- find the initial match (seed)
965 > int lmax=GMIN(12, rlen-4);//how far from 5' end to look for 4-mer seeds
966 > int li;
967 > for (li=0;li<=lmax;li++) {
968 >   if (seedVal==*(int*)&(seq[li])) {
969 >      break;
970 >      }
971     }
972 < //no easy cases, so let's do the word hashing
973 < for (int iw=0;iw<6;iw++) {
974 <   GStr aword=adapter3.substr(iw,3);
975 <   if ((fi=seq.index(aword))>=0  && rlen-fi>3) {
976 <      if (extendMatch(seq.chars(), rlen, fi, adapter3.chars(),
977 <                   a3len, iw, 3, l5,l3)) return true;
972 > if (li>lmax) return false;
973 > //seed found, try to extend it both ways
974 > //extend left
975 > int ri=li+3; //save rightmost base of the seed
976 > SLocScore loc(li, poly_m_score<<2);
977 > SLocScore maxloc(loc);
978 > while (li>0) {
979 >    li--;
980 >    if (seq[li]==polyChar) {
981 >                 loc.add(li,poly_m_score);
982 >                 }
983 >    else if (seq[li]=='N') {
984 >                 loc.add(li,0);
985 >                 }
986 >    else { //mismatch
987 >         loc.add(li,poly_mis_score);
988 >         if (maxloc.score-loc.score>poly_dropoff_score) break;
989 >         }
990 >    if (maxloc.score<=loc.score) {
991 >       maxloc=loc;
992 >       }
993 >    }
994 > li=maxloc.pos;
995 > if (li>5) return false; //no trimming wanted, too far from 5' end
996 > //li = right boundary for the poly match
997 >
998 > //extend right
999 > loc.set(ri, maxloc.score);
1000 > maxloc.pos=ri;
1001 > while (ri<rlen-1) {
1002 >   ri++;
1003 >   if (seq[ri]==polyChar) {
1004 >                loc.add(ri,poly_m_score);
1005 >                }
1006 >   else if (seq[ri]=='N') {
1007 >                loc.add(ri,0);
1008 >                }
1009 >   else { //mismatch
1010 >        loc.add(ri,poly_mis_score);
1011 >        if (maxloc.score-loc.score>poly_dropoff_score) break;
1012 >        }
1013 >   if (maxloc.score<=loc.score) {
1014 >      maxloc=loc;
1015        }
1016     }
1017 < return false; //no adapter parts found
1017 > ri=maxloc.pos;
1018 > if ((maxloc.score==poly_minScore && li==0) ||
1019 >     (maxloc.score>poly_minScore && li<2)
1020 >     || (maxloc.score>(poly_minScore*3) && li<8)) {
1021 >    //adjust l5 to reflect this trimming of 5' end
1022 >    l5=ri+1;
1023 >    if (l5>rlen-1) l5=rlen-1;
1024 >    return true;
1025 >    }
1026 > return false;
1027 > }
1028 >
1029 > bool trim_adaptor3(GStr& seq, int&l5, int &l3) {
1030 > if (adaptors3.Count()==0) return false;
1031 > int rlen=seq.length();
1032 > l5=0;
1033 > l3=rlen-1;
1034 > bool trimmed=false;
1035 > GStr wseq(seq);
1036 > int wlen=rlen;
1037 > GXSeqData seqdata;
1038 > int numruns=revCompl ? 2 : 1;
1039 > GList<GXAlnInfo> bestalns(true, true, false);
1040 > for (int ai=0;ai<adaptors3.Count();ai++) {
1041 >   for (int r=0;r<numruns;r++) {
1042 >     if (r) {
1043 >          seqdata.update(adaptors3[ai].seqr.chars(), adaptors3[ai].seqr.length(),
1044 >                 adaptors3[ai].pzr, wseq.chars(), wlen, adaptors3[ai].amlen);
1045 >        }
1046 >     else {
1047 >            seqdata.update(adaptors3[ai].seq.chars(), adaptors3[ai].seq.length(),
1048 >                 adaptors3[ai].pz, wseq.chars(), wlen, adaptors3[ai].amlen);
1049 >        }
1050 >     GXAlnInfo* aln=match_adaptor(seqdata, adaptors3[ai].trim_type, gxmem_r, 86);
1051 >         if (aln) {
1052 >           if (aln->strong) {
1053 >                   trimmed=true;
1054 >                   bestalns.Add(aln);
1055 >                   break; //will check the rest next time
1056 >                   }
1057 >            else bestalns.Add(aln);
1058 >           }
1059 >   }//forward and reverse adaptors
1060 >   if (trimmed) break; //will check the rest in the next cycle
1061 >  }//for each 3' adaptor
1062 > if (bestalns.Count()>0) {
1063 >           GXAlnInfo* aln=bestalns[0];
1064 >           if (aln->sl-1 > wlen-aln->sr) {
1065 >                   //keep left side
1066 >                   l3-=(wlen-aln->sl+1);
1067 >                   if (l3<0) l3=0;
1068 >                   }
1069 >           else { //keep right side
1070 >                   l5+=aln->sr;
1071 >                   if (l5>=rlen) l5=rlen-1;
1072 >                   }
1073 >           //delete aln;
1074 >           //if (l3-l5+1<min_read_len) return true;
1075 >           wseq=seq.substr(l5,l3-l5+1);
1076 >           wlen=wseq.length();
1077 >           return true; //break the loops here to report a good find
1078 >     }
1079 >  return false;
1080   }
1081  
1082 < bool trim_adapter5(GStr& seq, int&l5, int &l3) {
1083 < //if (debug) GMessage("trim_adapter5 on: %s\n", seq.chars());
1082 > bool trim_adaptor5(GStr& seq, int&l5, int &l3) {
1083 > if (adaptors5.Count()==0) return false;
1084   int rlen=seq.length();
1085   l5=0;
1086   l3=rlen-1;
1087 < //try to see if adapter is fully included in the read
1088 < int fi=-1;
1089 < if ((fi=seq.index(adapter5))>=0) {
1090 <   if (fi<rlen-fi-a5len) {//match is closer to the right end
1091 <      l5=fi+a5len;
1092 <      l3=rlen-1;
1093 <      }
1094 <    else {
1095 <      l5=0;
1096 <      l3=fi-1;
1087 > bool trimmed=false;
1088 > GStr wseq(seq);
1089 > int wlen=rlen;
1090 > GXSeqData seqdata;
1091 > int numruns=revCompl ? 2 : 1;
1092 > GList<GXAlnInfo> bestalns(true, true, false);
1093 > for (int ai=0;ai<adaptors5.Count();ai++) {
1094 >   for (int r=0;r<numruns;r++) {
1095 >     if (r) {
1096 >          seqdata.update(adaptors5[ai].seqr.chars(), adaptors5[ai].seqr.length(),
1097 >                 adaptors5[ai].pzr, wseq.chars(), wlen, adaptors5[ai].amlen);
1098 >        }
1099 >     else {
1100 >            seqdata.update(adaptors5[ai].seq.chars(), adaptors5[ai].seq.length(),
1101 >                 adaptors5[ai].pz, wseq.chars(), wlen, adaptors5[ai].amlen);
1102 >        }
1103 >         GXAlnInfo* aln=match_adaptor(seqdata, adaptors5[ai].trim_type, gxmem_l, 90);
1104 >         if (aln) {
1105 >           if (aln->strong) {
1106 >                   trimmed=true;
1107 >                   bestalns.Add(aln);
1108 >                   break; //will check the rest next time
1109 >                   }
1110 >            else bestalns.Add(aln);
1111 >           }
1112 >         } //forward and reverse?
1113 >   if (trimmed) break; //will check the rest in the next cycle
1114 >  }//for each 5' adaptor
1115 >  if (bestalns.Count()>0) {
1116 >           GXAlnInfo* aln=bestalns[0];
1117 >           if (aln->sl-1 > wlen-aln->sr) {
1118 >                   //keep left side
1119 >                   l3-=(wlen-aln->sl+1);
1120 >                   if (l3<0) l3=0;
1121 >                   }
1122 >           else { //keep right side
1123 >                   l5+=aln->sr;
1124 >                   if (l5>=rlen) l5=rlen-1;
1125 >                   }
1126 >           //delete aln;
1127 >           //if (l3-l5+1<min_read_len) return true;
1128 >           wseq=seq.substr(l5,l3-l5+1);
1129 >           wlen=wseq.length();
1130 >           return true; //break the loops here to report a good find
1131 >     }
1132 >  return false;
1133 > }
1134 >
1135 > //convert qvs to/from phred64 from/to phread33
1136 > void convertPhred(GStr& q) {
1137 > for (int i=0;i<q.length();i++) q[i]+=qv_cvtadd;
1138 > }
1139 >
1140 > void convertPhred(char* q, int len) {
1141 > for (int i=0;i<len;i++) q[i]+=qv_cvtadd;
1142 > }
1143 >
1144 > bool getFastxRec(GLineReader& fq, GStr& rseq, GStr& rqv,
1145 >          GStr& rname, GStr& rinfo, GStr& infname) {
1146 > rseq="";
1147 > rqv="";
1148 > rname="";
1149 > rinfo="";
1150 > if (fq.eof()) return false;
1151 > char* l=fq.getLine();
1152 > while (l!=NULL && (l[0]==0 || isspace(l[0]))) l=fq.getLine(); //ignore empty lines
1153 > if (l==NULL) return false;
1154 > /* if (rawFormat) {
1155 >      //TODO: implement raw qseq parsing here
1156 >      //if (raw type=N) then continue; //skip invalid/bad records
1157 >      } //raw qseq format
1158 > else { // FASTQ or FASTA */
1159 > isfasta=(l[0]=='>');
1160 > if (!isfasta && l[0]!='@') GError("Error: fasta/fastq record marker not found(%s)\n%s\n",
1161 >      infname.chars(), l);
1162 > GStr s(l);
1163 > rname=&(l[1]);
1164 > for (int i=0;i<rname.length();i++)
1165 >    if (rname[i]<=' ') {
1166 >       if (i<rname.length()-2) rinfo=rname.substr(i+1);
1167 >       rname.cut(i);
1168 >       break;
1169 >       }
1170 >  //now get the sequence
1171 > if ((l=fq.getLine())==NULL)
1172 >      GError("Error: unexpected EOF after header for read %s (%s)\n",
1173 >                   rname.chars(), infname.chars());
1174 > rseq=l; //this must be the DNA line
1175 > while ((l=fq.getLine())!=NULL) {
1176 >      //seq can span multiple lines
1177 >      if (l[0]=='>' || l[0]=='+') {
1178 >           fq.pushBack();
1179 >           break; //
1180 >           }
1181 >      rseq+=l;
1182 >      } //check for multi-line seq
1183 > if (!isfasta) { //reading fastq quality values, which can also be multi-line
1184 >    if ((l=fq.getLine())==NULL)
1185 >        GError("Error: unexpected EOF after sequence for %s\n", rname.chars());
1186 >    if (l[0]!='+') GError("Error: fastq qv header marker not detected!\n");
1187 >    if ((l=fq.getLine())==NULL)
1188 >        GError("Error: unexpected EOF after qv header for %s\n", rname.chars());
1189 >    rqv=l;
1190 >    //if (rqv.length()!=rseq.length())
1191 >    //  GError("Error: qv len != seq len for %s\n", rname.chars());
1192 >    while (rqv.length()<rseq.length() && ((l=fq.getLine())!=NULL)) {
1193 >      rqv+=l; //append to qv string
1194        }
1195 <   return true;
1195 >    }// fastq
1196 > // } //<-- FASTA or FASTQ
1197 > rseq.upper();
1198 > return true;
1199 > }
1200 >
1201 > #ifdef GDEBUG
1202 > void showTrim(GStr& s, int l5, int l3) {
1203 >  if (l5>0 || l3==0) {
1204 >    color_bg(c_red);
1205 >    }
1206 >  for (int i=0;i<s.length()-1;i++) {
1207 >    if (i && i==l5) color_resetbg();
1208 >    fprintf(stderr, "%c", s[i]);
1209 >    if (i && i==l3) color_bg(c_red);
1210     }
1211 < //for fast detection of adapter-rich reads, check if the first 12
1212 < //characters of the read are a substring of the adapter
1213 < GStr rstart=seq.substr(1,12);
1214 < if ((fi=adapter5.index(rstart))>=0) {
1215 <   //l3=rlen-1;
1216 <   //l5=a5len-4;
1217 <   //while (fi+l5<a5len && l5<l3 && adapter5[fi+l5]==seq[l5]) l5++;
1218 <   //return true;
1219 <   //if (debug) GMessage(" first 12char of the read match adaptor!\n");
1220 <   extendMatch(seq.chars(), rlen, 1,
1221 <                 adapter5.chars(), a5len, fi,  12, l5,l3, true);
1222 <   return true;
1211 >  fprintf(stderr, "%c", s[s.length()-1]);
1212 >  color_reset();
1213 >  fprintf(stderr, "\n");
1214 > }
1215 > #endif
1216 >
1217 > char process_read(GStr& rname, GStr& rseq, GStr& rqv, int &l5, int &l3) {
1218 > //returns 0 if the read was untouched, 1 if it was just trimmed
1219 > // and a trash code if it was trashed
1220 > l5=0;
1221 > l3=rseq.length()-1;
1222 > #ifdef GDEBUG
1223 >   //rseq.reverse();
1224 >   GMessage(">%s\n", rname.chars());
1225 >   GMessage("%s\n",rseq.chars());
1226 > #endif
1227 > if (l3-l5+1<min_read_len) {
1228 >   return 's';
1229     }
1230 <  
1231 < //another easy case: last 12 characters of the adaptor found as a substring of the read
1232 < int aplen=12;
1233 < int apstart=a5len-aplen-2;
1234 < if (apstart<0) { apstart=0; aplen=a5len-1; }
1235 < GStr bstr=adapter5.substr(apstart, aplen);
1236 < if ((fi=seq.index(bstr))>=0) {
1237 <   //if (debug) GMessage(" last 12char of adaptor match the read!\n");
1238 <   extendMatch(seq.chars(), rlen, fi,
1239 <                 adapter5.chars(), a5len, apstart,  aplen, l5,l3,true);
1240 <   return true;
1230 > GStr wseq(rseq.chars());
1231 > GStr wqv(rqv.chars());
1232 > int w5=l5;
1233 > int w3=l3;
1234 > //count Ns
1235 > b_totalIn+=rseq.length();
1236 > for (int i=0;i<rseq.length();i++) {
1237 > if (isACGT[(int)rseq[i]]==0) b_totalN++;
1238 > }
1239 >
1240 > //first do the q-based trimming
1241 > if (qvtrim_qmin!=0 && !wqv.is_empty() && qtrim(wqv, w5, w3)) { // qv-threshold trimming
1242 >   b_trimQ+=(w5-l5)+(l3-w3);
1243 >   num_trimQ++;
1244 >   if (w3-w5+1<min_read_len) {
1245 >     return 'Q'; //invalid read
1246 >     }
1247 >    //-- keep only the w5..w3 range
1248 >   l5=w5;
1249 >   l3=w3;
1250 >   wseq=wseq.substr(w5, w3-w5+1);
1251 >   if (!wqv.is_empty())
1252 >      wqv=wqv.substr(w5, w3-w5+1);
1253 >   } //qv trimming
1254 > // N-trimming on the remaining read seq
1255 > if (ntrim(wseq, w5, w3)) {
1256 >   //Note: ntrim resets w5 and w3
1257 >   //GMessage("before: %s\n",wseq.chars());
1258 >   //GMessage("after : %s (%d)\n",wseq.substr(w5,w3-w5+1).chars(),w3-w5+1);
1259 >   int trim3=(wseq.length()-1-w3);
1260 >   b_trimN+=w5+trim3;
1261 >   num_trimN++;
1262 >   l5+=w5;
1263 >   l3-=trim3;
1264 >   if (w3-w5+1<min_read_len) {
1265 >     return 'N'; //to be trashed
1266 >     }
1267 >    //-- keep only the w5..w3 range
1268 >   wseq=wseq.substr(w5, w3-w5+1);
1269 >   if (!wqv.is_empty())
1270 >      wqv=wqv.substr(w5, w3-w5+1);
1271 >   w5=0;
1272 >   w3=wseq.length()-1;
1273     }
1274 < //no easy cases, find a triplet match as a seed for alignment extension
1275 < //find triplets at the right end of the adapter
1276 < for (int iw=0;iw<6;iw++) {
1277 <   apstart=a5len-iw-4;
1278 <   GStr aword=adapter5.substr(apstart,3);
1279 <   if ((fi=seq.index(aword))>=0) {
1280 <      //if (debug) GMessage("extending wmatch :%*s\n", fi+3, aword.chars());
1281 <      if (extendMatch(seq.chars(), rlen, fi, adapter5.chars(),
1282 <                   a5len, apstart, 3, l5,l3,true)) {
1283 <                     return true;
1284 <                     }
1274 > char trim_code;
1275 > //clean the more dirty end first - 3'
1276 > bool trimmedA=false;
1277 > bool trimmedT=false;
1278 > bool trimmedV=false;
1279 >
1280 > int prev_w5=0;
1281 > int prev_w3=0;
1282 > bool w3upd=true;
1283 > bool w5upd=true;
1284 > do {
1285 >  trim_code=0;
1286 >  if (w3upd && trim_poly3(wseq, w5, w3, polyA_seed)) {
1287 >      trim_code='A';
1288 >      b_trimA+=(w5+(wseq.length()-1-w3));
1289 >      if (!trimmedA) { num_trimA++; trimmedA=true; }
1290 >      }
1291 >  else if (w3upd && trim_poly3(wseq, w5, w3, polyT_seed)) {
1292 >      trim_code='T';
1293 >      b_trimT+=(w5+(wseq.length()-1-w3));
1294 >      if (!trimmedT) { num_trimT++; trimmedT=true; }
1295 >      }
1296 >  else if (w5upd && trim_poly5(wseq, w5, w3, polyA_seed)) {
1297 >      trim_code='A';
1298 >      b_trimA+=(w5+(wseq.length()-1-w3));
1299 >      if (!trimmedA) { num_trimA++; trimmedA=true; }
1300        }
1301 +  else if (w5upd && trim_poly5(wseq, w5, w3, polyT_seed)) {
1302 +      trim_code='T';
1303 +      b_trimT+=(w5+(wseq.length()-1-w3));
1304 +      if (!trimmedT) { num_trimT++; trimmedT=true; }
1305 +      }
1306 +  else if (trim_adaptor5(wseq, w5, w3)) {
1307 +      trim_code='V';
1308 +      b_trimV+=(w5+(wseq.length()-1-w3));
1309 +      if (!trimmedV) { num_trimV++; trimmedV=true; }
1310 +      }
1311 +  else if (trim_adaptor3(wseq, w5, w3)) {
1312 +      trim_code='V';
1313 +      b_trimV+=(w5+(wseq.length()-1-w3));
1314 +      if (!trimmedV) { num_trimV++; trimmedV=true; }
1315 +      }
1316 +  if (trim_code) {
1317 +     w3upd=(w3!=prev_w3);
1318 +         w5upd=(w5!=prev_w5);
1319 +         if (w3upd) prev_w3=w3;
1320 +         if (w5upd) prev_w5=w5;
1321 +   #ifdef GDEBUG
1322 +     GMessage("#### TRIM by '%c' code ( w5-w3 = %d-%d ):\n",trim_code, w5,w3);
1323 +     showTrim(wseq, w5, w3);
1324 +   #endif
1325 +     //int trimlen=wseq.length()-(w3-w5+1);
1326 +     l5+=w5;
1327 +     l3-=(wseq.length()-1-w3);
1328 +     if (w3-w5+1<min_read_len) {
1329 +         return trim_code;
1330 +         }
1331 +      //-- keep only the w5..w3 range
1332 +      wseq=wseq.substr(w5, w3-w5+1);
1333 +      if (!wqv.is_empty())
1334 +         wqv=wqv.substr(w5, w3-w5+1);
1335 +      }//trimming at 3' end
1336 + } while (trim_code);
1337 + if (doCollapse) {
1338 +   //keep read for later
1339 +   FqDupRec* dr=dhash.Find(wseq.chars());
1340 +   if (dr==NULL) { //new entry
1341 +          //if (prefix.is_empty())
1342 +             dhash.Add(wseq.chars(),
1343 +                  new FqDupRec(&wqv, rname.chars()));
1344 +          //else dhash.Add(wseq.chars(), new FqDupRec(wqv.chars(),wqv.length()));
1345 +         }
1346 +      else
1347 +         dr->add(wqv);
1348 +   } //collapsing duplicates
1349 + else { //not collapsing duplicates
1350 +   //apply the dust filter now
1351 +   if (doDust) {
1352 +     int dustbases=dust(wseq);
1353 +     if (dustbases>(wseq.length()>>1)) {
1354 +        return 'D';
1355 +        }
1356 +     }
1357 +   } //not collapsing duplicates
1358 + return (l5>0 || l3<rseq.length()-1) ? 1 : 0;
1359 + }
1360 +
1361 + void printHeader(FILE* f_out, char recmarker, GStr& rname, GStr& rinfo) {
1362 + //GMessage("printing Header..%c%s\n",recmarker, rname.chars());
1363 + if (rinfo.is_empty()) fprintf(f_out, "%c%s\n",recmarker,rname.chars());
1364 +  else fprintf(f_out, "%c%s %s\n",recmarker, rname.chars(), rinfo.chars());
1365 + }
1366 +
1367 + void writeRead(FILE* f_out, GStr& rname, GStr& rinfo, GStr& rseq, GStr& rqv, int& outcounter) {
1368 +   outcounter++;
1369 +   bool asFasta=(rqv.is_empty() || fastaOutput);
1370 +   if (asFasta) {
1371 +    if (prefix.is_empty()) {
1372 +       printHeader(f_out, '>',rname,rinfo);
1373 +       fprintf(f_out, "%s\n", rseq.chars()); //plain one-line fasta for now
1374 +       }
1375 +      else {
1376 +       fprintf(f_out, ">%s%08d\n%s\n", prefix.chars(), outcounter,
1377 +                          rseq.chars());
1378 +       }
1379 +     }
1380 +   else {  //fastq
1381 +    if (convert_phred) convertPhred(rqv);
1382 +    if (prefix.is_empty()) {
1383 +       printHeader(f_out, '@', rname, rinfo);
1384 +       fprintf(f_out, "%s\n+\n%s\n", rseq.chars(), rqv.chars());
1385 +       }
1386 +      else
1387 +       fprintf(f_out, "@%s_%08d\n%s\n+\n%s\n", prefix.chars(), outcounter,
1388 +                          rseq.chars(),rqv.chars() );
1389 +     }
1390 + }
1391 +
1392 + void trash_report(char trashcode, GStr& rname, FILE* freport) {
1393 + if (freport==NULL || trashcode<=' ') return;
1394 + if (trashcode=='3' || trashcode=='5') {
1395 +   fprintf(freport, "%s\ta%c\n",rname.chars(),trashcode);
1396 +   }
1397 + else {
1398 +   fprintf(freport, "%s\t%c\n",rname.chars(),trashcode);
1399     }
1400 < return false; //no adapter parts found
1400 > //tcounter++;
1401 > }
1402 >
1403 > GStr getFext(GStr& s, int* xpos=NULL) {
1404 > //s must be a filename without a path
1405 > GStr r("");
1406 > if (xpos!=NULL) *xpos=0;
1407 > if (s.is_empty() || s=="-") return r;
1408 > int p=s.rindex('.');
1409 > int d=s.rindex('/');
1410 > if (p<=0 || p>s.length()-2 || p<s.length()-7 || p<d) return r;
1411 > r=s.substr(p+1);
1412 > if (xpos!=NULL) *xpos=p+1;
1413 > r.lower();
1414 > return r;
1415   }
1416  
1417 < //conversion of phred64 to phread33
1418 < void convertQ64(GStr& q) {
1419 < for (int i=0;i<q.length();i++) q[i]-=31;
1417 > void baseFileName(GStr& fname) {
1418 > //remove all known extensions, like .txt,fq,fastq,fasta,fa)(.gz .gzip .bz2 .bzip2) .
1419 > int xpos=0;
1420 > GStr fext=getFext(fname, &xpos);
1421 > if (xpos<=1) return;
1422 > bool extdel=false;
1423 > GStr f2;
1424 > if (fext=="z" || fext=="zip") {
1425 >   extdel=true;
1426 >   }
1427 >  else if (fext.length()>=2) {
1428 >   f2=fext.substr(0,2);
1429 >   extdel=(f2=="gz" || f2=="bz");
1430 >   }
1431 > if (extdel) {
1432 >   fname.cut(xpos-1);
1433 >   fext=getFext(fname, &xpos);
1434 >   if (xpos<=1) return;
1435 >   }
1436 > extdel=false;
1437 > if (fext=="f" || fext=="fq" || fext=="txt" || fext=="seq" || fext=="sequence") {
1438 >   extdel=true;
1439 >   }
1440 >  else if (fext.length()>=2) {
1441 >   extdel=(fext.substr(0,2)=="fa");
1442 >   }
1443 > if (extdel) fname.cut(xpos-1);
1444 > GStr fncp(fname);
1445 > fncp.lower();
1446 > fncp.chomp("seq");
1447 > fncp.chomp("sequence");
1448 > fncp.trimR("_.");
1449 > if (fncp.length()<fname.length()) fname.cut(fncp.length());
1450 > }
1451 >
1452 > FILE* prepOutFile(GStr& infname, GStr& pocmd) {
1453 >  FILE* f_out=NULL;
1454 >  GStr fname(getFileName(infname.chars()));
1455 >  //eliminate known extensions
1456 >  baseFileName(fname);
1457 >  if (outsuffix.is_empty() || outsuffix=="-") { return stdout; }
1458 >    else if (pocmd.is_empty()) {
1459 >               GStr oname(fname);
1460 >               oname.append('.');
1461 >               oname.append(outsuffix);
1462 >               f_out=fopen(oname.chars(),"w");
1463 >               if (f_out==NULL) GError("Error: cannot create '%s'\n",oname.chars());
1464 >               }
1465 >            else {
1466 >              GStr oname(">");
1467 >              oname.append(fname);
1468 >              oname.append('.');
1469 >              oname.append(outsuffix);
1470 >              pocmd.append(oname);
1471 >              f_out=popen(pocmd.chars(), "w");
1472 >              if (f_out==NULL) GError("Error: cannot popen '%s'\n",pocmd.chars());
1473 >              }
1474 > return f_out;
1475 > }
1476 >
1477 > void guess_unzip(GStr& fname, GStr& picmd) {
1478 > GStr fext=getFext(fname);
1479 > if (fext=="gz" || fext=="gzip" || fext=="z") {
1480 >    picmd="gzip -cd ";
1481 >    }
1482 >   else if (fext=="bz2" || fext=="bzip2" || fext=="bz" || fext=="bzip") {
1483 >    picmd="bzip2 -cd ";
1484 >    }
1485 > }
1486 >
1487 > void addAdaptor(GVec<CASeqData>& adaptors, GStr& seq, GAlnTrimType trim_type) {
1488 > //TODO: prepare CASeqData here, and collect hexamers as well
1489 >  if (seq.is_empty() || seq=="-" ||
1490 >          seq=="N/A" || seq==".") return;
1491 >
1492 > CASeqData adata(revCompl);
1493 > int idx=adaptors.Add(adata);
1494 > if (idx<0) GError("Error: failed to add adaptor!\n");
1495 > adaptors[idx].trim_type=trim_type;
1496 > adaptors[idx].update(seq.chars());
1497   }
1498  
1499 < void convertQ64(char* q, int len) {
1500 < for (int i=0;i<len;i++) q[i]-=31;
1499 >
1500 > int loadAdaptors(const char* fname) {
1501 >  GLineReader lr(fname);
1502 >  char* l;
1503 >  while ((l=lr.nextLine())!=NULL) {
1504 >   if (lr.length()<=3 || l[0]=='#') continue;
1505 >   if ( l[0]==' ' || l[0]=='\t' || l[0]==',' ||
1506 >        l[0]==';'|| l[0]==':' ) {
1507 >      int i=1;
1508 >      while (l[i]!=0 && isspace(l[i])) {
1509 >        i++;
1510 >        }
1511 >      if (l[i]!=0) {
1512 >        GStr s(&(l[i]));
1513 >      #ifdef GDEBUG
1514 >          //s.reverse();
1515 >      #endif
1516 >        addAdaptor(adaptors3, s, galn_TrimRight);
1517 >        continue;
1518 >        }
1519 >      }
1520 >    else {
1521 >      GStr s(l);
1522 >      s.startTokenize("\t ;,:");
1523 >      GStr a5,a3;
1524 >      if (s.nextToken(a5))
1525 >            s.nextToken(a3);
1526 >        else continue; //no tokens on this line
1527 >      GAlnTrimType ttype5=galn_TrimLeft;
1528 >      a5.upper();
1529 >      a3.upper();
1530 >      if (a3.is_empty() || a3==a5 || a3=="=") {
1531 >         a3.clear();
1532 >         ttype5=galn_TrimEither;
1533 >         }
1534 >     #ifdef GDEBUG
1535 >     //   a5.reverse();
1536 >     //   a3.reverse();
1537 >     #endif
1538 >      addAdaptor(adaptors5, a5, ttype5);
1539 >      addAdaptor(adaptors3, a3, galn_TrimRight);
1540 >      }
1541 >   }
1542 >   return adaptors5.Count()+adaptors3.Count();
1543   }
1544  
1545 + void setupFiles(FILE*& f_in, FILE*& f_in2, FILE*& f_out, FILE*& f_out2,
1546 +                       GStr& s, GStr& infname, GStr& infname2) {
1547 + // uses outsuffix to generate output file names and open file handles as needed
1548 + infname="";
1549 + infname2="";
1550 + f_in=NULL;
1551 + f_in2=NULL;
1552 + f_out=NULL;
1553 + f_out2=NULL;
1554 + //analyze outsuffix intent
1555 + GStr pocmd;
1556 + if (outsuffix=="-") {
1557 +    f_out=stdout;
1558 +    }
1559 +   else {
1560 +    GStr ox=getFext(outsuffix);
1561 +    if (ox.length()>2) ox=ox.substr(0,2);
1562 +    if (ox=="gz") pocmd="gzip -9 -c ";
1563 +        else if (ox=="bz") pocmd="bzip2 -9 -c ";
1564 +    }
1565 + if (s=="-") {
1566 +    f_in=stdin;
1567 +    infname="stdin";
1568 +    f_out=prepOutFile(infname, pocmd);
1569 +    return;
1570 +    } // streaming from stdin
1571 + s.startTokenize(",:");
1572 + s.nextToken(infname);
1573 + bool paired=s.nextToken(infname2);
1574 + if (fileExists(infname.chars())==0)
1575 +    GError("Error: cannot find file %s!\n",infname.chars());
1576 + GStr fname(getFileName(infname.chars()));
1577 + GStr picmd;
1578 + guess_unzip(fname, picmd);
1579 + if (picmd.is_empty()) {
1580 +   f_in=fopen(infname.chars(), "r");
1581 +   if (f_in==NULL) GError("Error opening file '%s'!\n",infname.chars());
1582 +   }
1583 +  else {
1584 +   picmd.append(infname);
1585 +   f_in=popen(picmd.chars(), "r");
1586 +   if (f_in==NULL) GError("Error at popen %s!\n", picmd.chars());
1587 +   }
1588 + if (f_out==stdout) {
1589 +   if (paired) GError("Error: output suffix required for paired reads\n");
1590 +   return;
1591 +   }
1592 + f_out=prepOutFile(infname, pocmd);
1593 + if (!paired) return;
1594 + if (doCollapse) GError("Error: sorry, -C option cannot be used with paired reads!\n");
1595 + // ---- paired reads:-------------
1596 + if (fileExists(infname2.chars())==0)
1597 +     GError("Error: cannot find file %s!\n",infname2.chars());
1598 + picmd="";
1599 + GStr fname2(getFileName(infname2.chars()));
1600 + guess_unzip(fname2, picmd);
1601 + if (picmd.is_empty()) {
1602 +   f_in2=fopen(infname2.chars(), "r");
1603 +   if (f_in2==NULL) GError("Error opening file '%s'!\n",infname2.chars());
1604 +   }
1605 +  else {
1606 +   picmd.append(infname2);
1607 +   f_in2=popen(picmd.chars(), "r");
1608 +   if (f_in2==NULL) GError("Error at popen %s!\n", picmd.chars());
1609 +   }
1610 + f_out2=prepOutFile(infname2, pocmd);
1611 + }

Diff Legend

Removed lines
+ Added lines
< Changed lines
> Changed lines