ViewVC Help
View File | Revision Log | Show Annotations | View Changeset | Root Listing
root/gclib/fqtrim/fqtrim.cpp
(Generate patch)
# Line 2 | Line 2
2   #include "GStr.h"
3   #include "GHash.hh"
4   #include "GList.hh"
5 + #include <ctype.h>
6 + #include "GAlnExtend.h"
7  
8   #define USAGE "Usage:\n\
9 < fqtrim [-5 <5'adapter>] [-3 <3'adapter>] [-l <minlen>] [-C] [-D] [-Q] \\\n\
10 <    [-n <rename_prefix>] [-o <trimmed.fq>] [-r <discarded.lst>] <input.fq>\n\
9 > fqtrim [{-5 <5adaptor> -3 <3adaptor>|-f <adaptors_file>}] [-a <min_matchlen>]\\\n\
10 >   [-R] [-q <minq> [-t <trim_max_len>]] [-p {64|33}] [-o <outsuffix>]\\\n\
11 >   [-l <minlen>] [-C] [-D] [-Q] [-n <rename_prefix>] [-r <discarded.lst>]\\\n\
12 >    <input.fq>[,<input_mates.fq>\n\
13   \n\
14 < Trim low quality bases at 3' end, optional adapter, filter for low complexity and\n\
15 < optionally collapse duplicates in the input fastq data.\n\
14 > Trim low quality bases at the 3' end and can trim adaptor sequence(s), filter\n\
15 > for low complexity and collapse duplicate reads.\n\
16 > If read pairs should be trimmed and kept together (i.e. without discarding\n\
17 > one read in a pair), the two file names should be given delimited by a comma\n\
18 > or a colon character\n\
19   \n\
20   Options:\n\
21   -n  rename all the reads using the <prefix> followed by a read counter;\n\
22      if -C option was given, the suffix \"_x<N>\" is appended, with <N> being\n\
23      the read duplication count\n\
24 < -o  write the trimmed/filtered fastq into <trimmed.fq>(instead of stdout)\n\
25 < -5  trim the given adapter or primer sequence at the 5' end of each read\n\
24 > -o  unless this parameter is '-', write the trimmed/filtered reads to \n\
25 >    file(s) named <input>.<outsuffix> which will be created in the \n\
26 >    current (working) directory; (writes to stdout if -o- is given);\n\
27 >    a suffix ending with .gz, .gzip or .bz2 will enforce compression\n\
28 > -f  file with adaptor sequences to trim, each line having this format:\n\
29 >    <5'-adaptor-sequence> <3'-adaptor-sequence>\n\
30 > -5  trim the given adaptor or primer sequence at the 5' end of each read\n\
31      (e.g. -5 CGACAGGTTCAGAGTTCTACAGTCCGACGATC)\n\
32 < -3  trim the given adapter sequence at the 3' end of each read\n\
32 > -3  trim the given adaptor sequence at the 3' end of each read\n\
33      (e.g. -3 TCGTATGCCGTCTTCTGCTTG)\n\
34 < -l  minimum \"clean\" length at the high quality 5'end that a read must\n\
35 <    have in order to pass the filter (default: 16)\n\
34 > -A  disable polyA/T trimming (enabled by default)\n\
35 > -y  minimum length of exact match to adaptor sequence at the proper end (6)\n\
36 > -q  trim bases with quality value lower than <minq> (starting at the 3' end)\n\
37 > -t  for -q option, maximum trimming at the 3' end is limited to <trim_max_len>\n\
38 > -m  maximum percentage of Ns allowed in a read after trimming (default 7)\n\
39 > -l  minimum \"clean\" length after trimming that a read must have\n\
40 >    in order to pass the filter (default: 16)\n\
41   -r  write a simple \"trash report\" file listing the discarded reads with a\n\
42      one letter code for the reason why they were trashed\n\
43 + -D  apply a low-complexity (dust) filter and discard any read that has over\n\
44 +    50% of its length detected as low complexity\n\
45   -C  collapse duplicate reads and add a prefix with their count to the read\n\
46 <    name\n\
47 < -D  apply a low-complexity (dust) filter and discard any read that has at\n\
48 <    least half of it masked by this\n\
49 < -Q  quality values in the input data are interpreted as phred64, convert\n\
31 <    them to phred33\n\
46 >    name (e.g. for microRNAs)\n\
47 > -p  input is phred64/phred33 (use -p64 or -p33)\n\
48 > -Q  convert quality values to the other Phred qv type\n\
49 > -V  verbose processing\n\
50   "
51 < // example 3' adapter for miRNAs: TCGTATGCCGTCTTCTGCTTG
52 < FILE* f_out=NULL; //stdout if not provided
53 < FILE* f_in=NULL; //input fastq (stdin if not provided)
51 >
52 > //-z  for -o option, the output stream(s) will be first piped into the given\n
53 > //   <zcmd> command, which must output to stdout (e.g. -z 'bzip2 -9 -c')\n
54 >
55 >
56 > // example 3' adaptor for miRNAs: TCGTATGCCGTCTTCTGCTTG
57 >
58 > //For paired reads sequencing:
59 > //3' : ACACTCTTTCCCTACACGACGCTCTTCCGATCT
60 > //5' : GATCGGAAGAGCGGTTCAGCAGGAATGCCGAG
61 > //FILE* f_out=NULL; //stdout if not provided
62 > //FILE* f_out2=NULL; //for paired reads
63 > //FILE* f_in=NULL; //input fastq (stdin if not provided)
64 > //FILE* f_in2=NULL; //for paired reads
65 >
66   FILE* freport=NULL;
67 +
68   bool debug=false;
69 + bool verbose=false;
70   bool doCollapse=false;
71   bool doDust=false;
72 + bool doPolyTrim=true;
73 + bool fastaOutput=false;
74   bool trashReport=false;
75 < bool rawFormat=false;
75 > bool revCompl=false; //also reverse complement adaptor sequences
76   int min_read_len=16;
77 + double max_perc_N=7.0;
78   int dust_cutoff=16;
79   bool isfasta=false;
80 < bool phred64=false;
80 > bool convert_phred=false;
81 > GStr outsuffix; // -o
82   GStr prefix;
83 < GStr adapter3;
84 < GStr adapter5;
85 < int a3len=0;
86 < int a5len=0;
87 < // adaptor matching metrics -- for extendMatch() function
88 < static const int a_m_score=2; //match score
89 < static const int a_mis_score=-3; //mismatch
90 < static const int a_dropoff_score=7;
91 < static const int a_min_score=7;
83 > GStr zcmd;
84 > int num_trimmed5=0;
85 > int num_trimmed3=0;
86 > int num_trimmedN=0;
87 > int num_trimmedQ=0;
88 > int min_trimmed5=INT_MAX;
89 > int min_trimmed3=INT_MAX;
90 >
91 > int qvtrim_qmin=0;
92 > int qvtrim_max=0;  //for -q, do not trim at 3'-end more than this number of bases
93 > int qv_phredtype=0; // could be 64 or 33 (0 means undetermined yet)
94 > int qv_cvtadd=0; //could be -31 or +31
95 >
96 > // adaptor matching metrics -- for X-drop ungapped extension
97 > //const int match_reward=2;
98 > //const int mismatch_penalty=3;
99 > const int match_reward=2;
100 > const int mismatch_penalty=4;
101 > const int Xdrop=10;
102 >
103 > const int poly_m_score=2; //match score
104 > const int poly_mis_score=-3; //mismatch
105 > const int poly_dropoff_score=7;
106 > int poly_minScore=12; //i.e. an exact match of 6 bases at the proper ends WILL be trimmed
107 >
108 > const char *polyA_seed="AAAA";
109 > const char *polyT_seed="TTTT";
110 >
111 > struct CASeqData {
112 >   //positional data for every possible hexamer in an adaptor
113 >   GVec<uint16>* pz[4096]; //0-based coordinates of all possible hexamers in the adaptor sequence
114 >   GVec<uint16>* pzr[4096]; //0-based coordinates of all possible hexamers for the reverse complement of the adaptor sequence
115 >   GStr seq; //actual adaptor sequence data
116 >   GStr seqr; //reverse complement sequence
117 >   int amlen; //fraction of adaptor length matching that's
118 >              //enough to consider the alignment
119 >   GAlnTrimType trim_type;
120 >   bool use_reverse;
121 >   CASeqData(bool rev=false):seq(),seqr(),
122 >             amlen(0), use_reverse(rev) {
123 >     trim_type=galn_None; //should be updated later!
124 >     for (int i=0;i<4096;i++) {
125 >        pz[i]=NULL;
126 >        pzr[i]=NULL;
127 >        }
128 >     }
129 >
130 >   void update(const char* s) {
131 >         seq=s;
132 >         table6mers(seq.chars(), seq.length(), pz);
133 >         amlen=iround(double(seq.length())*0.8);
134 >         if (amlen<12)
135 >                amlen=12;
136 >         if (!use_reverse) return;
137 >         //reverse complement
138 >         seqr=s;
139 >         int slen=seq.length();
140 >         for (int i=0;i<slen;i++)
141 >           seqr[i]=ntComplement(seq[slen-i-1]);
142 >         table6mers(seqr.chars(), seqr.length(), pzr);
143 >   }
144 >
145 >   ~CASeqData() {
146 >     for (int i=0;i<4096;i++) {
147 >       delete pz[i];
148 >       delete pzr[i];
149 >     }
150 >   }
151 > };
152 >
153 > GVec<CASeqData> adaptors5;
154 > GVec<CASeqData> adaptors3;
155 >
156 > CGreedyAlignData* gxmem_l=NULL;
157 > CGreedyAlignData* gxmem_r=NULL;
158  
159   // element in dhash:
160   class FqDupRec {
# Line 91 | Line 193
193    if (!s.is_empty()) {
194        if (s=='-') f=stdout;
195        else {
196 <       f=fopen(s,"w");
196 >       f=fopen(s.chars(),"w");
197         if (f==NULL) GError("Error creating file: %s\n", s.chars());
198         }
199       }
# Line 102 | Line 204
204  
205   GHash<FqDupRec> dhash; //hash to keep track of duplicates
206  
207 < bool ntrim(GStr& rseq, GStr &rqv, int &l5, int &l3); //returns true if any trimming occured
208 <
207 > void addAdaptor(GVec<CASeqData>& adaptors, GStr& seq, GAlnTrimType trim_type);
208 > int loadAdaptors(const char* fname);
209 >
210 > void setupFiles(FILE*& f_in, FILE*& f_in2, FILE*& f_out, FILE*& f_out2,
211 >                       GStr& s, GStr& infname, GStr& infname2);
212 > // uses outsuffix to generate output file names and open file handles as needed
213 >
214 > void writeRead(FILE* f_out, GStr& rname, GStr& rinfo, GStr& rseq, GStr& rqv, int& outcounter);
215 > void trash_report(char trashcode, GStr& rname, FILE* freport);
216 >
217 > bool getFastxRec(GLineReader& fq, GStr& rseq, GStr& rqv,
218 >          GStr& rname, GStr& rinfo, GStr& infname);
219 >
220 > char process_read(GStr& rname, GStr& rseq, GStr& rqv, int &l5, int &l3);
221 > //returns 0 if the read was untouched, 1 if it was trimmed and a trash code if it was trashed
222 >
223 > bool ntrim(GStr& rseq, int &l5, int &l3); //returns true if any trimming occured
224 > bool qtrim(GStr& qvs, int &l5, int &l3); //return true if any trimming occured
225   int dust(GStr& seq);
226 < bool trim_adapter3(GStr& seq, int &l5, int &l3); //returns true if any trimming occured
227 < bool trim_adapter5(GStr& seq, int &l5, int &l3); //returns true if any trimming occured
226 > bool trim_poly5(GStr &seq, int &l5, int &l3, const char* poly_seed); //returns true if any trimming occured
227 > bool trim_poly3(GStr &seq, int &l5, int &l3, const char* poly_seed);
228 > bool trim_adaptor5(GStr& seq, int &l5, int &l3); //returns true if any trimming occured
229 > bool trim_adaptor3(GStr& seq, int &l5, int &l3);
230  
231 < void convertQ64(char* q, int len);
232 < void convertQ64(GStr& q);
231 > void convertPhred(char* q, int len);
232 > void convertPhred(GStr& q);
233  
234   int main(int argc, char * const argv[]) {
235 <  GArgs args(argc, argv, "YQRDCl:d:3:5:n:r:o:");
235 >  GArgs args(argc, argv, "YQDCRVAl:d:3:5:m:n:r:p:q:f:t:o:z:a:");
236    int e;
117  int icounter=0; //counter for input reads
118  int outcounter=0; //counter for output reads
237    if ((e=args.isError())>0) {
238        GMessage("%s\nInvalid argument: %s\n", USAGE, argv[e]);
239        exit(224);
240        }
241    debug=(args.getOpt('Y')!=NULL);
242 <  phred64=(args.getOpt('Q')!=NULL);
242 >  verbose=(args.getOpt('V')!=NULL);
243 >  convert_phred=(args.getOpt('Q')!=NULL);
244    doCollapse=(args.getOpt('C')!=NULL);
245    doDust=(args.getOpt('D')!=NULL);
246 +  revCompl=(args.getOpt('R')!=NULL);
247 +  if (args.getOpt('A')) doPolyTrim=false;
248 +  /*
249    rawFormat=(args.getOpt('R')!=NULL);
250    if (rawFormat) {
251      GError("Sorry, raw qseq format parsing is not implemented yet!\n");
252      }
253 +  */
254    prefix=args.getOpt('n');
255    GStr s=args.getOpt('l');
256 <  if (!s.is_empty())
256 >  if (!s.is_empty())
257       min_read_len=s.asInt();
258 +  s=args.getOpt('m');
259 +  if (!s.is_empty())
260 +     max_perc_N=s.asDouble();
261    s=args.getOpt('d');
262    if (!s.is_empty()) {
263       dust_cutoff=s.asInt();
264       doDust=true;
265       }
266 <    
267 <  if (args.getOpt('3')!=NULL) {
268 <    adapter3=args.getOpt('3');
269 <    adapter3.upper();
270 <    a3len=adapter3.length();
271 <    }
272 <  if (args.getOpt('5')!=NULL) {
273 <    adapter5=args.getOpt('5');
274 <    adapter5.upper();
275 <    a5len=adapter5.length();
266 >  s=args.getOpt('q');
267 >  if (!s.is_empty()) {
268 >     qvtrim_qmin=s.asInt();
269 >     }
270 >  s=args.getOpt('t');
271 >  if (!s.is_empty()) {
272 >     qvtrim_max=s.asInt();
273 >     }
274 >  s=args.getOpt('p');
275 >  if (!s.is_empty()) {
276 >     int v=s.asInt();
277 >     if (v==33) {
278 >        qv_phredtype=33;
279 >        qv_cvtadd=31;
280 >        }
281 >      else if (v==64) {
282 >        qv_phredtype=64;
283 >        qv_cvtadd=-31;
284 >        }
285 >       else
286 >         GMessage("%s\nInvalid value for -p option (can only be 64 or 33)!\n",USAGE);
287 >     }
288 >  s=args.getOpt('f');
289 >  if (!s.is_empty()) {
290 >   loadAdaptors(s.chars());
291 >   }
292 >  bool fileAdaptors=adaptors5.Count()+adaptors3.Count();
293 >  s=args.getOpt('5');
294 >  if (!s.is_empty()) {
295 >    if (fileAdaptors)
296 >      GError("Error: options -5 and -f cannot be used together!\n");
297 >    s.upper();
298 >    addAdaptor(adaptors5, s, galn_TrimLeft);
299      }
300 +  s=args.getOpt('3');
301 +  if (!s.is_empty()) {
302 +    if (fileAdaptors)
303 +      GError("Error: options -3 and -f cannot be used together!\n");
304 +      s.upper();
305 +      addAdaptor(adaptors3, s, galn_TrimRight);
306 +    }
307 +  s=args.getOpt('y');
308 +  if (!s.is_empty()) {
309 +     int minmatch=s.asInt();
310 +     poly_minScore=minmatch*poly_m_score;
311 +     }
312 +
313 +  if (args.getOpt('o')!=NULL) outsuffix=args.getOpt('o');
314 +                         else outsuffix="-";
315    trashReport=  (args.getOpt('r')!=NULL);
316 <  if (args.startNonOpt()==0) {
316 >  int fcount=args.startNonOpt();
317 >  if (fcount==0) {
318      GMessage(USAGE);
319      exit(224);
320      }
321 <
322 <  openfw(f_out, args, 'o');
321 >   if (fcount>1 && doCollapse) {
322 >    GError("%s Sorry, the -C option only works with a single input.\n", USAGE);
323 >    }
324 >  //openfw(f_out, args, 'o');
325 >  //if (f_out==NULL) f_out=stdout;
326    if (trashReport)
327      openfw(freport, args, 'r');
328    char* infile=NULL;
329 +
330 +  if (adaptors5.Count()>0)
331 +    //gxmem_l=new CGreedyAlignData(match_reward, mismatch_penalty, Xdrop-2);
332 +        gxmem_l=new CGreedyAlignData(match_reward, mismatch_penalty, Xdrop);
333 +  if (adaptors3.Count()>0)
334 +    gxmem_r=new CGreedyAlignData(match_reward, mismatch_penalty, Xdrop);
335 +
336    while ((infile=args.nextNonOpt())!=NULL) {
337 <    GStr infname(infile);
338 <    if (strcmp(infile,"-")==0) {
339 <       f_in=stdin; infname="stdin"; }
340 <     else {
341 <        f_in=fopen(infile,"r");
342 <        if (f_in==NULL)
343 <            GError("Cannot open input file %s!\n",infile);
344 <        }
345 <     GLineReader fq(f_in);
346 <     char* l=NULL;
347 <     while ((l=fq.getLine())!=NULL) {
348 <        GStr rname; //current read name
349 <        GStr rseq;  //current read sequence
350 <        GStr rqv;   //current read quality values
351 <        GStr s;
352 <        if (rawFormat) {
353 <          //TODO: implement qseq parsing here
354 <          //if (raw type=N) then continue; //skip invalid/bad records
355 <          
356 <          } //raw qseq format
357 <         else { // FASTQ or FASTA
358 <          isfasta=(l[0]=='>');
359 <          if (!isfasta && l[0]!='@') GError("Error: fastq record marker not detected!\n");
360 <          s=l;
361 <          rname=&(l[1]);
362 <          icounter++;
363 <          //GMessage("readname=%s\n",rname.chars());
364 <          for (int i=0;i<rname.length();i++)
365 <            if (rname[i]<=' ') { rname.cut(i); break; }
366 <          //now get the sequence
367 <          if ((l=fq.getLine())==NULL)
368 <              GError("Error: unexpected EOF after header for %s\n",rname.chars());
369 <          rseq=l; //this must be the DNA line
370 <          if (isfasta) {
371 <            while ((l=fq.getLine())!=NULL) {
372 <              //fasta can have multiple sequence lines
373 <              if (l[0]=='>') {
374 <                   fq.pushBack();
375 <                   break; //
337 >    //for each input file
338 >    int incounter=0; //counter for input reads
339 >    int outcounter=0; //counter for output reads
340 >    int trash_s=0; //too short from the get go
341 >    int trash_Q=0;
342 >    int trash_N=0;
343 >    int trash_D=0;
344 >    int trash_poly=0;
345 >    int trash_A3=0;
346 >    int trash_A5=0;
347 >    s=infile;
348 >    GStr infname;
349 >    GStr infname2;
350 >    FILE* f_in=NULL;
351 >    FILE* f_in2=NULL;
352 >    FILE* f_out=NULL;
353 >    FILE* f_out2=NULL;
354 >    bool paired_reads=false;
355 >    setupFiles(f_in, f_in2, f_out, f_out2, s, infname, infname2);
356 >    GLineReader fq(f_in);
357 >    GLineReader* fq2=NULL;
358 >    if (f_in2!=NULL) {
359 >       fq2=new GLineReader(f_in2);
360 >       paired_reads=true;
361 >       }
362 >    GStr rseq, rqv, seqid, seqinfo;
363 >    GStr rseq2, rqv2, seqid2, seqinfo2;
364 >    while (getFastxRec(fq, rseq, rqv, seqid, seqinfo, infname)) {
365 >       incounter++;
366 >       int a5=0, a3=0, b5=0, b3=0;
367 >       char tcode=0, tcode2=0;
368 >       tcode=process_read(seqid, rseq, rqv, a5, a3);
369 >       if (fq2!=NULL) {
370 >            getFastxRec(*fq2, rseq2, rqv2, seqid2, seqinfo2, infname2);
371 >            if (seqid.substr(0,seqid.length()-1)!=seqid2.substr(0,seqid2.length()-1)) {
372 >               GError("Error: no paired match for read %s vs %s (%s,%s)\n",
373 >                  seqid.chars(), seqid2.chars(), infname.chars(), infname2.chars());
374 >               }
375 >            tcode2=process_read(seqid2, rseq2, rqv2, b5, b3);
376 >            //decide what to do with this pair and print rseq2 if the pair makes it
377 >            if (tcode>1 && tcode2<=1) {
378 >               //"untrash" rseq
379 >               if (a3-a5+1<min_read_len) {
380 >                   a5=1;
381 >                   if (a3<min_read_len) { a3= GMIN(rseq.length()-1, min_read_len+1); }
382                     }
383 <              rseq+=l;
384 <              }
385 <            } //multi-line fasta file
386 <          if (!isfasta) {
387 <            if ((l=fq.getLine())==NULL)
388 <                GError("Error: unexpected EOF after sequence for %s\n", rname.chars());
389 <            if (l[0]!='+') GError("Error: fastq qv header marker not detected!\n");
390 <            if ((l=fq.getLine())==NULL)
391 <                GError("Error: unexpected EOF after qv header for %s\n", rname.chars());
392 <            rqv=l;
393 <            if (rqv.length()!=rseq.length())
394 <                GError("Error: qv len != seq len for %s\n", rname.chars());
395 <            }
396 <        } //<-- FASTA or FASTQ
216 <        rseq.upper();
217 <        int l5=0;
218 <        int l3=rseq.length()-1;
219 <        if (l3-l5+1<min_read_len) {
220 <           if (trashReport) {
221 <                  fprintf(freport, "%s\ts\t%s\n",rname.chars(), rseq.chars());
222 <                  }
223 <           continue;
224 <           }
225 <        if (ntrim(rseq, rqv, l5, l3)) {
226 <           if (l3-l5+1<min_read_len) {
227 <             if (trashReport) {
228 <                    fprintf(freport, "%s\tN\t%s\n", rname.chars(), rseq.chars());
229 <                    }
230 <             continue; //invalid read
231 <             }
232 <            //-- keep only the l5..l3 range
233 <           rseq=rseq.substr(l5, l3-l5+1);
234 <           if (!rqv.is_empty())
235 <              rqv=rqv.substr(l5, l3-l5+1);
236 <           }
237 <          
238 <        if (a3len>0) {
239 <          if (trim_adapter3(rseq, l5, l3)) {
240 <             if (l3-l5+1<min_read_len) {
241 <                 if (trashReport) {
242 <                     fprintf(freport, "%s\tA3\t%s\n",rname.chars(), rseq.chars());
243 <                     }
244 <                 continue;
245 <                 }
246 <              //-- keep only the l5..l3 range
247 <              rseq=rseq.substr(l5, l3-l5+1);
248 <              if (!rqv.is_empty())
249 <                 rqv=rqv.substr(l5, l3-l5+1);
250 <              }//some adapter was trimmed
251 <           } //adapter trimming
252 <        if (a5len>0) {
253 <          if (trim_adapter5(rseq, l5, l3)) {
254 <             if (l3-l5+1<min_read_len) {
255 <                 if (trashReport) {
256 <                     fprintf(freport, "%s\tA5\t%s\n",rname.chars(), rseq.chars());
257 <                     }
258 <                 continue;
383 >               tcode=1;
384 >               }
385 >             else if (tcode<=1 && tcode2>1) {
386 >               //"untrash" rseq2
387 >               if (b3-b5+1<min_read_len) {
388 >                   b5=1;
389 >                   if (b3<min_read_len) { b3= GMIN((rseq2.length()-1),(min_read_len+1)); }
390 >                   }
391 >               tcode2=1;
392 >               }
393 >            if (tcode<=1) { //trimmed or left intact -- write it!
394 >               if (tcode>0) {
395 >                 rseq2=rseq2.substr(b5,b3-b5+1);
396 >                 if (!rqv2.is_empty()) rqv2=rqv2.substr(b5,b3-b5+1);
397                   }
398 <              //-- keep only the l5..l3 range
399 <              rseq=rseq.substr(l5, l3-l5+1);
400 <              if (!rqv.is_empty())
401 <                 rqv=rqv.substr(l5, l3-l5+1);
402 <              }//some adapter was trimmed
403 <           } //adapter trimming
404 <        if (doCollapse) {
405 <           //keep read for later
406 <           FqDupRec* dr=dhash.Find(rseq.chars());
407 <           if (dr==NULL) { //new entry
408 <                  //if (prefix.is_empty())
409 <                     dhash.Add(rseq.chars(),
410 <                          new FqDupRec(&rqv, rname.chars()));
411 <                  //else dhash.Add(rseq.chars(), new FqDupRec(rqv.chars(),rqv.length()));
398 >               int nocounter=0;
399 >               writeRead(f_out2, seqid2, seqinfo2, rseq2, rqv2, nocounter);
400 >               }
401 >            } //pair read
402 >       if (tcode>1) { //trashed
403 >         #ifdef GDEBUG
404 >         GMessage(" !!!!TRASH code = %c\n",tcode);
405 >         #endif
406 >          if (tcode=='s') trash_s++;
407 >          else if (tcode=='A' || tcode=='T') trash_poly++;
408 >            else if (tcode=='Q') trash_Q++;
409 >              else if (tcode=='N') trash_N++;
410 >               else if (tcode=='D') trash_D++;
411 >                else if (tcode=='3') trash_A3++;
412 >                 else if (tcode=='5') trash_A5++;
413 >          if (trashReport) trash_report(tcode, seqid, freport);
414 >          }
415 >         else if (!doCollapse) { //write it
416 >          if (tcode>0) {
417 >            rseq=rseq.substr(a5,a3-a5+1);
418 >            if (!rqv.is_empty()) rqv=rqv.substr(a5,a3-a5+1);
419 >            }
420 >         #ifdef GDEBUG
421 >            GMessage("  After trimming:\n");
422 >            GMessage("%s\n",rseq.chars());
423 >         #endif
424 >          writeRead(f_out, seqid, seqinfo, rseq, rqv, outcounter);
425 >          }
426 >       } //for each fastq record
427 >    delete fq2;
428 >    FRCLOSE(f_in);
429 >    FRCLOSE(f_in2);
430 >    if (doCollapse) {
431 >       outcounter=0;
432 >       int maxdup_count=1;
433 >       char* maxdup_seq=NULL;
434 >       dhash.startIterate();
435 >       FqDupRec* qd=NULL;
436 >       char* seq=NULL;
437 >       while ((qd=dhash.NextData(seq))!=NULL) {
438 >         GStr rseq(seq);
439 >         //do the dusting here
440 >         if (doDust) {
441 >            int dustbases=dust(rseq);
442 >            if (dustbases>(rseq.length()>>1)) {
443 >               if (trashReport && qd->firstname!=NULL) {
444 >                 fprintf(freport, "%s_x%d\tD\n",qd->firstname, qd->count);
445                   }
446 <              else    
447 <                 dr->add(rqv);
448 <           } //collapsing duplicates
449 <         else { //not collapsing duplicates
450 <           //do the dust filter now
451 <           if (doDust) {
452 <             int dustbases=dust(rseq);
453 <             if (dustbases>(rseq.length()>>1)) {
454 <                if (trashReport) {
455 <                  fprintf(freport, "%s\tD\t%s\n",rname.chars(),rseq.chars());
456 <                  }
457 <                continue;
458 <                }
288 <             }
289 <           //print this record here  
290 <           outcounter++;
291 <           if (isfasta) {
292 <            if (prefix.is_empty())
293 <               fprintf(f_out, ">%s\n%s\n", rname.chars(), rseq.chars());
294 <              else
295 <               fprintf(f_out, ">%s%08d\n%s\n", prefix.chars(), outcounter,
296 <                                  rseq.chars());
446 >               trash_D+=qd->count;
447 >               continue;
448 >               }
449 >            }
450 >         outcounter++;
451 >         if (qd->count>maxdup_count) {
452 >            maxdup_count=qd->count;
453 >            maxdup_seq=seq;
454 >            }
455 >         if (isfasta) {
456 >           if (prefix.is_empty()) {
457 >             fprintf(f_out, ">%s_x%d\n%s\n", qd->firstname, qd->count,
458 >                           rseq.chars());
459               }
460 <           else {  //fastq
461 <            if (phred64) convertQ64(rqv);
462 <            if (prefix.is_empty())
301 <               fprintf(f_out, "@%s\n%s\n+\n%s\n", rname.chars(), rseq.chars(),rqv.chars());
302 <              else
303 <               fprintf(f_out, "@%s_%08d\n%s\n+\n%s\n", prefix.chars(), outcounter,
304 <                                  rseq.chars(),rqv.chars() );
460 >           else { //use custom read name
461 >             fprintf(f_out, ">%s%08d_x%d\n%s\n", prefix.chars(), outcounter,
462 >                        qd->count, rseq.chars());
463               }
464 <           } //not collapsing duplicates
465 <        } //for each fastq record
466 <   } //while each input file
467 < FRCLOSE(f_in);
468 < if (doCollapse) {
469 <    outcounter=0;
312 <    int maxdup_count=1;
313 <    char* maxdup_seq=NULL;
314 <    dhash.startIterate();
315 <    FqDupRec* qd=NULL;
316 <    char* seq=NULL;
317 <    while ((qd=dhash.NextData(seq))!=NULL) {
318 <      GStr rseq(seq);
319 <      //do the dusting here
320 <      if (doDust) {
321 <         int dustbases=dust(rseq);
322 <         if (dustbases>(rseq.length()>>1)) {
323 <            if (trashReport && qd->firstname!=NULL) {
324 <              fprintf(freport, "%s_x%d\tD\t%s\n",qd->firstname, qd->count,seq);
325 <              }
326 <            continue;
464 >           }
465 >         else { //fastq format
466 >          if (convert_phred) convertPhred(qd->qv, qd->len);
467 >          if (prefix.is_empty()) {
468 >            fprintf(f_out, "@%s_x%d\n%s\n+\n%s\n", qd->firstname, qd->count,
469 >                           rseq.chars(), qd->qv);
470              }
471 +          else { //use custom read name
472 +            fprintf(f_out, "@%s%08d_x%d\n%s\n+\n%s\n", prefix.chars(), outcounter,
473 +                        qd->count, rseq.chars(), qd->qv);
474 +            }
475 +           }
476 +         }//for each element of dhash
477 +       if (maxdup_count>1) {
478 +         GMessage("Maximum read multiplicity: x %d (read: %s)\n",maxdup_count, maxdup_seq);
479           }
480 <      outcounter++;
481 <      if (qd->count>maxdup_count) {
482 <         maxdup_count=qd->count;
483 <         maxdup_seq=seq;
484 <         }
485 <      if (isfasta) {
486 <        if (prefix.is_empty()) {
487 <          fprintf(f_out, "@%s_x%d\n%s\n", qd->firstname, qd->count,
488 <                        rseq.chars());
489 <          }
490 <        else { //use custom read name
491 <          fprintf(f_out, "@%s%08d_x%d\n%s\n", prefix.chars(), outcounter,
492 <                     qd->count, rseq.chars());
480 >       } //collapse entries
481 >    if (verbose) {
482 >       if (paired_reads) {
483 >           GMessage(">Input files : %s , %s\n", infname.chars(), infname2.chars());
484 >           GMessage("Number of input pairs :%9d\n", incounter);
485 >           GMessage("         Output pairs :%9d\n", outcounter);
486 >           }
487 >         else {
488 >           GMessage(">Input file : %s\n", infname.chars());
489 >           GMessage("Number of input reads :%9d\n", incounter);
490 >           GMessage("         Output reads :%9d\n", outcounter);
491 >           }
492 >       GMessage("------------------------------------\n");
493 >       if (num_trimmed5)
494 >          GMessage("           5' trimmed :%9d  (min. trim: %d)\n", num_trimmed5, min_trimmed5);
495 >       if (num_trimmed3)
496 >          GMessage("           3' trimmed :%9d  (min. trim: %d)\n", num_trimmed3, min_trimmed3);
497 >       GMessage("------------------------------------\n");
498 >       if (trash_s>0)
499 >         GMessage("     Trashed by length:%9d\n", trash_s);
500 >       if (trash_N>0)
501 >         GMessage("         Trashed by N%%:%9d\n", trash_N);
502 >       if (trash_Q>0)
503 >         GMessage("Trashed by low quality:%9d\n", trash_Q);
504 >       if (trash_poly>0)
505 >         GMessage("   Trashed by poly-A/T:%9d\n", trash_poly);
506 >       if (trash_A5>0)
507 >         GMessage(" Trashed by 5' adaptor:%9d\n", trash_A5);
508 >       if (trash_A3>0)
509 >         GMessage(" Trashed by 3' adaptor:%9d\n", trash_A3);
510 >       }
511 >    if (trashReport) {
512 >          FWCLOSE(freport);
513            }
514 <        }
515 <      else { //fastq format
516 <       if (phred64) convertQ64(qd->qv, qd->len);
517 <       if (prefix.is_empty()) {
518 <         fprintf(f_out, "@%s_x%d\n%s\n+\n%s\n", qd->firstname, qd->count,
519 <                        rseq.chars(), qd->qv);
349 <         }
350 <       else { //use custom read name
351 <         fprintf(f_out, "@%s%08d_x%d\n%s\n+\n%s\n", prefix.chars(), outcounter,
352 <                     qd->count, rseq.chars(), qd->qv);
353 <         }
354 <        }
355 <      }//for each element of dhash
356 <    if (maxdup_count>1) {
357 <      GMessage("Maximum read multiplicity: x %d (read: %s)\n",maxdup_count, maxdup_seq);
358 <      }
359 <   } //report collapsed dhash entries
360 < GMessage("Number of input reads: %9d\n", icounter);
361 < GMessage("       Output records: %9d\n", outcounter);
362 < if (trashReport) {
363 <    FWCLOSE(freport);
364 <    }
365 <
366 < FWCLOSE(f_out);
514 >    FWCLOSE(f_out);
515 >    FWCLOSE(f_out2);
516 >   } //while each input file
517 > delete gxmem_l;
518 > delete gxmem_r;
519 > //getc(stdin);
520   }
521  
522   class NData {
# Line 377 | Line 530
530     const char* seq;
531     bool valid;
532     NData() {
533 +    seqlen=0;
534      NCount=0;
535      end5=0;
536      end3=0;
# Line 392 | Line 546
546       end5=1;
547       end3=seqlen;
548       for (int i=0;i<seqlen;i++)
549 <        if (rseq[i]=='N') {// if (!ichrInStr(rseq[i], "ACGT")
549 >        if (seq[i]=='N') {// if (!ichrInStr(rseq[i], "ACGT")
550             NPos[NCount]=i;
551             NCount++;
552             }
# Line 407 | Line 561
561       perc_N=(n*100.0)/(end5-end3+1);
562       }
563   };
564 <
564 >
565   static NData feat;
566   int perc_lenN=12; // incremental distance from ends, in percentage of
567            // sequence length, where N-trimming is done (default:12 %) (autolimited to 20)
568 <          
568 >
569   void N_analyze(int l5, int l3, int p5, int p3) {
570   /* assumes feat was filled properly */
571   int old_dif, t5,t3,v;
572 < if (l3-l5<min_read_len || p5>p3 ) {
572 > if (l3<l5+2 || p5>p3 ) {
573     feat.end5=l5+1;
574     feat.end3=l3+1;
575 <   return;
575 >   return;
576     }
577  
578   t5=feat.NPos[p5]-l5;
579   t3=l3-feat.NPos[p3];
580   old_dif=p3-p5;
581   v=(int)((((double)(l3-l5))*perc_lenN)/100);
582 < if (v>20) v=20; /* enforce N-search limit for very long reads */
582 > if (v>20) v=20; /* enforce N-search limit for very long reads */
583   if (t5 < v ) {
584     l5=feat.NPos[p5]+1;
585     p5++;
# Line 442 | Line 596
596             feat.end3=l3+1;
597             return;
598             }
599 <    else
599 >    else
600        N_analyze(l5,l3, p5,p3);
601   }
602  
603  
604 < bool ntrim(GStr& rseq, GStr &rqv, int &l5, int &l3) {
605 < //count Ns in the sequence
604 > bool qtrim(GStr& qvs, int &l5, int &l3) {
605 > if (qvtrim_qmin==0 || qvs.is_empty()) return false;
606 > if (qv_phredtype==0) {
607 >  //try to guess the Phred type
608 >  int vmin=256, vmax=0;
609 >  for (int i=0;i<qvs.length();i++) {
610 >     if (vmin>qvs[i]) vmin=qvs[i];
611 >     if (vmax<qvs[i]) vmax=qvs[i];
612 >     }
613 >  if (vmin<64) { qv_phredtype=33; qv_cvtadd=31; }
614 >  if (vmax>95) { qv_phredtype=64; qv_cvtadd=-31; }
615 >  if (qv_phredtype==0) {
616 >    GError("Error: couldn't determine Phred type, please use the -p33 or -p64 !\n");
617 >    }
618 >  if (verbose)
619 >    GMessage("Input reads have Phred-%d quality values.\n", (qv_phredtype==33 ? 33 : 64));
620 >  } //guessing Phred type
621 > for (l3=qvs.length()-1;l3>2;l3--) {
622 >  if (qvs[l3]-qv_phredtype>=qvtrim_qmin && qvs[l3-1]-qv_phredtype>=qvtrim_qmin) break;
623 >  }
624 > //just in case, check also the 5' the end (?)
625 > for (l5=0;l5<qvs.length()-3;l5++) {
626 >  if (qvs[l5]-qv_phredtype>=qvtrim_qmin && qvs[l5+1]-qv_phredtype>=qvtrim_qmin) break;
627 >  }
628 > if (qvtrim_max>0) {
629 >  if (qvs.length()-1-l3>qvtrim_max) l3=qvs.length()-1-qvtrim_max;
630 >  if (l5>qvtrim_max) l5=qvtrim_max;
631 >  }
632 > return (l5>0 || l3<qvs.length()-1);
633 > }
634 >
635 > bool ntrim(GStr& rseq, int &l5, int &l3) {
636 > //count Ns in the sequence, trim N-rich ends
637   feat.init(rseq);
638   l5=feat.end5-1;
639   l3=feat.end3-1;
640 < N_analyze(feat.end5-1, feat.end3-1, 0, feat.NCount-1);
641 < if (l5==feat.end5-1 && l3==feat.end3-1)
642 <    return false; //nothing changed
640 > N_analyze(feat.end5-1, feat.end3-1, 0, feat.NCount-1);
641 > if (l5==feat.end5-1 && l3==feat.end3-1) {
642 >    if (feat.perc_N>max_perc_N) {
643 >           feat.valid=false;
644 >           l3=l5+1;
645 >           return true;
646 >           }
647 >      else {
648 >       return false; //nothing changed
649 >       }
650 >    }
651   l5=feat.end5-1;
652   l3=feat.end3-1;
653   if (l3-l5+1<min_read_len) {
# Line 462 | Line 655
655     return true;
656     }
657   feat.N_calc();
658 < if (feat.perc_N>6.2) { //not valid if more than 1 N per 16 bases
658 >
659 > if (feat.perc_N>max_perc_N) {
660        feat.valid=false;
661        l3=l5+1;
662        return true;
# Line 473 | Line 667
667   //--------------- dust functions ----------------
668   class DNADuster {
669   public:
670 <  int dustword;
671 <  int dustwindow;
672 <  int dustwindow2;
670 >  int dustword;
671 >  int dustwindow;
672 >  int dustwindow2;
673    int dustcutoff;
674    int mv, iv, jv;
675    int counts[32*32*32];
# Line 570 | Line 764
764                      }
765             }
766           }
767 < //return first;
767 > //return first;
768   }
769   };
770  
# Line 588 | Line 782
782   return ncount;
783   }
784  
785 < // ------------------ adapter matching - triplet matching
786 < //look for matching triplets amongst the first 9 nucleotides of the 3' adaptor
787 < // or the last 9 nucleotides for the 5' adapter
788 < //when a triplet match is found, simply try to extend the alignment using a drop-off scheme
789 < // check minimum score and
790 < // for 3' adapter trimming:
791 < //     require that the right end of the alignment for either the adaptor OR the read must be
792 < //     < 3 distance from its right end
793 < // for 5' adapter trimming:
794 < //     require that the left end of the alignment for either the adaptor OR the read must
795 < //     be at coordinate 0
796 <
797 < bool extendMatch(const char* a, int alen, int ai,
798 <                 const char* b, int blen, int bi, int mlen, int& l5, int& l3, bool end5=false) {
799 < //so the alignment starts at ai in a, bi in b, with a perfect match of length mlen
800 < //if (debug)
801 < //  GStr dbg(b);
802 < //GMessage(">> in %s\n\textending hit: %s at position %d\n", a, (dbg.substr(ai, mlen)).chars(), ai);
803 < int a_l=ai; //alignment coordinates on a
804 < int a_r=ai+mlen-1;
805 < int b_l=bi; //alignment coordinates on b
806 < int b_r=bi+mlen-1;
807 < int ai_maxscore=ai;
808 < int bi_maxscore=bi;
809 < int score=mlen*a_m_score;
810 < int maxscore=score;
811 < int mism5score=a_mis_score;
812 < if (end5 && ai<(alen>>1)) mism5score-=2; // increase penalty for mismatches at 5' end
813 < //try to extend to the left first, if possible
814 < while (ai>0 && bi>0) {
815 <   ai--;
816 <   bi--;
817 <   score+= (a[ai]==b[bi])? a_m_score : mism5score;
818 <   if (score>maxscore) {
819 <       ai_maxscore=ai;
820 <       bi_maxscore=bi;
821 <       maxscore=score;
822 <       }
823 <     else if (maxscore-score>a_dropoff_score) break;
824 <   }
825 < a_l=ai_maxscore;
826 < b_l=bi_maxscore;
827 < //if (debug) GMessage("  after l-extend: %*s%s\t\t(score=%d)\n",a_l," ",dbg.substr(b_l,b_r-b_l+1).chars(),maxscore);
828 < //now extend to the right
829 < ai_maxscore=a_r;
830 < bi_maxscore=b_r;
831 < ai=a_r;
832 < bi=b_r;
833 < score=maxscore;
834 < //sometimes there are extra AAAAs at the end of the read, ignore those
835 < if (strcmp(&a[alen-4],"AAAA")==0) {
836 <    alen-=3;
837 <    while (a[alen-1]=='A' && alen>ai) alen--;
838 <    }
839 < while (ai<alen-1 && bi<blen-1) {
646 <   ai++;
647 <   bi++;
648 <   //score+= (a[ai]==b[bi])? a_m_score : a_mis_score;
649 <   if (a[ai]==b[bi]) { //match
650 <      score+=a_m_score;
651 <      if (ai>=alen-2) {
652 <           score+=a_m_score-(alen-ai-1);
653 <           }
785 > struct SLocScore {
786 >  int pos;
787 >  int score;
788 >  SLocScore(int p=0,int s=0) {
789 >    pos=p;
790 >    score=s;
791 >    }
792 >  void set(int p, int s) {
793 >    pos=p;
794 >    score=s;
795 >    }
796 >  void add(int p, int add) {
797 >    pos=p;
798 >    score+=add;
799 >    }
800 > };
801 >
802 > bool trim_poly3(GStr &seq, int &l5, int &l3, const char* poly_seed) {
803 > if (!doPolyTrim) return false;
804 > int rlen=seq.length();
805 > l5=0;
806 > l3=rlen-1;
807 > int32 seedVal=*(int32*)poly_seed;
808 > char polyChar=poly_seed[0];
809 > //assumes N trimming was already done
810 > //so a poly match should be very close to the end of the read
811 > // -- find the initial match (seed)
812 > int lmin=GMAX((rlen-16), 0);
813 > int li;
814 > for (li=rlen-4;li>lmin;li--) {
815 >   if (seedVal==*(int*)&(seq[li])) {
816 >      break;
817 >      }
818 >   }
819 > if (li<=lmin) return false;
820 > //seed found, try to extend it both ways
821 > //extend right
822 > int ri=li+3;
823 > SLocScore loc(ri, poly_m_score<<2);
824 > SLocScore maxloc(loc);
825 > //extend right
826 > while (ri<rlen-1) {
827 >   ri++;
828 >   if (seq[ri]==polyChar) {
829 >                loc.add(ri,poly_m_score);
830 >                }
831 >   else if (seq[ri]=='N') {
832 >                loc.add(ri,0);
833 >                }
834 >   else { //mismatch
835 >        loc.add(ri,poly_mis_score);
836 >        if (maxloc.score-loc.score>poly_dropoff_score) break;
837 >        }
838 >   if (maxloc.score<=loc.score) {
839 >      maxloc=loc;
840        }
841 +   }
842 + ri=maxloc.pos;
843 + if (ri<rlen-6) return false; //no trimming wanted, too far from 3' end
844 + //ri = right boundary for the poly match
845 + //extend left
846 + loc.set(li, maxloc.score);
847 + maxloc.pos=li;
848 + while (li>0) {
849 +    li--;
850 +    if (seq[li]==polyChar) {
851 +                 loc.add(li,poly_m_score);
852 +                 }
853 +    else if (seq[li]=='N') {
854 +                 loc.add(li,0);
855 +                 }
856      else { //mismatch
857 <      score+=a_mis_score;
858 <      }  
859 <   if (score>maxscore) {
860 <       ai_maxscore=ai;
861 <       bi_maxscore=bi;
862 <       maxscore=score;
863 <       }
864 <     else if (maxscore-score>a_dropoff_score) break;
865 <   }
866 <  a_r=ai_maxscore;
867 <  b_r=bi_maxscore;
868 <  int a_ovh3=alen-a_r-1;
869 <  int b_ovh3=blen-b_r-1;
870 <  int mmovh3=(a_ovh3<b_ovh3)? a_ovh3 : b_ovh3;
871 <  int mmovh5=(a_l<b_l)? a_l : b_l;
872 <  //if (debug) GMessage("  after r-extend: %*s%s\t\t(score=%d)\n",a_l," ",dbg.substr(b_l,b_r-b_l+1).chars(),maxscore);
873 <  if (maxscore>=a_min_score && mmovh3<2 && mmovh5<2) {
874 <     if (a_l<a_ovh3) {
875 <        //adapter closer to the left end (typical for 5' adapter)
876 <        l5=a_r+1;
877 <        l3=alen-1;
857 >         loc.add(li,poly_mis_score);
858 >         if (maxloc.score-loc.score>poly_dropoff_score) break;
859 >         }
860 >    if (maxloc.score<=loc.score) {
861 >       maxloc=loc;
862 >       }
863 >    }
864 > li=maxloc.pos;
865 > if ((maxloc.score==poly_minScore && ri==rlen-1) ||
866 >    (maxloc.score>poly_minScore && ri>=rlen-3) ||
867 >    (maxloc.score>(poly_minScore*3) && ri>=rlen-8)) {
868 >  //trimming this li-ri match at 3' end
869 >    l3=li-1;
870 >    if (l3<0) l3=0;
871 >    return true;
872 >    }
873 > return false;
874 > }
875 >
876 > bool trim_poly5(GStr &seq, int &l5, int &l3, const char* poly_seed) {
877 > if (!doPolyTrim) return false;
878 > int rlen=seq.length();
879 > l5=0;
880 > l3=rlen-1;
881 > int32 seedVal=*(int32*)poly_seed;
882 > char polyChar=poly_seed[0];
883 > //assumes N trimming was already done
884 > //so a poly match should be very close to the end of the read
885 > // -- find the initial match (seed)
886 > int lmax=GMIN(12, rlen-4);//how far from 5' end to look for 4-mer seeds
887 > int li;
888 > for (li=0;li<=lmax;li++) {
889 >   if (seedVal==*(int*)&(seq[li])) {
890 >      break;
891 >      }
892 >   }
893 > if (li>lmax) return false;
894 > //seed found, try to extend it both ways
895 > //extend left
896 > int ri=li+3; //save rightmost base of the seed
897 > SLocScore loc(li, poly_m_score<<2);
898 > SLocScore maxloc(loc);
899 > while (li>0) {
900 >    li--;
901 >    if (seq[li]==polyChar) {
902 >                 loc.add(li,poly_m_score);
903 >                 }
904 >    else if (seq[li]=='N') {
905 >                 loc.add(li,0);
906 >                 }
907 >    else { //mismatch
908 >         loc.add(li,poly_mis_score);
909 >         if (maxloc.score-loc.score>poly_dropoff_score) break;
910 >         }
911 >    if (maxloc.score<=loc.score) {
912 >       maxloc=loc;
913 >       }
914 >    }
915 > li=maxloc.pos;
916 > if (li>5) return false; //no trimming wanted, too far from 5' end
917 > //li = right boundary for the poly match
918 >
919 > //extend right
920 > loc.set(ri, maxloc.score);
921 > maxloc.pos=ri;
922 > while (ri<rlen-1) {
923 >   ri++;
924 >   if (seq[ri]==polyChar) {
925 >                loc.add(ri,poly_m_score);
926 >                }
927 >   else if (seq[ri]=='N') {
928 >                loc.add(ri,0);
929 >                }
930 >   else { //mismatch
931 >        loc.add(ri,poly_mis_score);
932 >        if (maxloc.score-loc.score>poly_dropoff_score) break;
933          }
934 <      else {
935 <        //adapter matching at the right end (typical for 3' adapter)
936 <        l5=0;
937 <        l3=a_l-1;
934 >   if (maxloc.score<=loc.score) {
935 >      maxloc=loc;
936 >      }
937 >   }
938 > ri=maxloc.pos;
939 > if ((maxloc.score==poly_minScore && li==0) ||
940 >     (maxloc.score>poly_minScore && li<2)
941 >     || (maxloc.score>(poly_minScore*3) && li<8)) {
942 >    //adjust l5 to reflect this trimming of 5' end
943 >    l5=ri+1;
944 >    if (l5>rlen-1) l5=rlen-1;
945 >    return true;
946 >    }
947 > return false;
948 > }
949 >
950 > bool trim_adaptor3(GStr& seq, int&l5, int &l3) {
951 > if (adaptors3.Count()==0) return false;
952 > int rlen=seq.length();
953 > l5=0;
954 > l3=rlen-1;
955 > bool trimmed=false;
956 > GStr wseq(seq);
957 > int wlen=rlen;
958 > GXSeqData seqdata;
959 > int numruns=revCompl ? 2 : 1;
960 > GList<GXAlnInfo> bestalns(true, true, false);
961 > for (int ai=0;ai<adaptors3.Count();ai++) {
962 >   for (int r=0;r<numruns;r++) {
963 >     if (r) {
964 >          seqdata.update(adaptors3[ai].seqr.chars(), adaptors3[ai].seqr.length(),
965 >                 adaptors3[ai].pzr, wseq.chars(), wlen, adaptors3[ai].amlen);
966          }
967 <     return true;
967 >     else {
968 >            seqdata.update(adaptors3[ai].seq.chars(), adaptors3[ai].seq.length(),
969 >                 adaptors3[ai].pz, wseq.chars(), wlen, adaptors3[ai].amlen);
970 >        }
971 >     GXAlnInfo* aln=match_adaptor(seqdata, adaptors3[ai].trim_type, gxmem_r, 86);
972 >         if (aln) {
973 >           if (aln->strong) {
974 >                   trimmed=true;
975 >                   bestalns.Add(aln);
976 >                   break; //will check the rest next time
977 >                   }
978 >            else bestalns.Add(aln);
979 >           }
980 >   }//forward and reverse adaptors
981 >   if (trimmed) break; //will check the rest in the next cycle
982 >  }//for each 3' adaptor
983 > if (bestalns.Count()>0) {
984 >           GXAlnInfo* aln=bestalns[0];
985 >           if (aln->sl-1 > wlen-aln->sr) {
986 >                   //keep left side
987 >                   l3-=(wlen-aln->sl+1);
988 >                   if (l3<0) l3=0;
989 >                   }
990 >           else { //keep right side
991 >                   l5+=aln->sr;
992 >                   if (l5>=rlen) l5=rlen-1;
993 >                   }
994 >           //delete aln;
995 >           //if (l3-l5+1<min_read_len) return true;
996 >           wseq=seq.substr(l5,l3-l5+1);
997 >           wlen=wseq.length();
998 >           return true; //break the loops here to report a good find
999       }
1000    return false;
1001   }
1002  
1003 < bool trim_adapter3(GStr& seq, int&l5, int &l3) {
1003 > bool trim_adaptor5(GStr& seq, int&l5, int &l3) {
1004 > if (adaptors5.Count()==0) return false;
1005   int rlen=seq.length();
1006   l5=0;
1007   l3=rlen-1;
1008 < //first try a full match, we might get lucky
1009 < int fi=-1;
1010 < if ((fi=seq.index(adapter3))>=0) {
1011 <   if (fi<rlen-fi-a3len) {//match is closer to the right end
1012 <      l5=fi+a3len;
1013 <      l3=rlen-1;
1014 <      }
1015 <    else {
1016 <      l5=0;
1017 <      l3=fi-1;
1008 > bool trimmed=false;
1009 > GStr wseq(seq);
1010 > int wlen=rlen;
1011 > GXSeqData seqdata;
1012 > int numruns=revCompl ? 2 : 1;
1013 > GList<GXAlnInfo> bestalns(true, true, false);
1014 > for (int ai=0;ai<adaptors5.Count();ai++) {
1015 >   for (int r=0;r<numruns;r++) {
1016 >     if (r) {
1017 >          seqdata.update(adaptors5[ai].seqr.chars(), adaptors5[ai].seqr.length(),
1018 >                 adaptors5[ai].pzr, wseq.chars(), wlen, adaptors5[ai].amlen);
1019 >        }
1020 >     else {
1021 >            seqdata.update(adaptors5[ai].seq.chars(), adaptors5[ai].seq.length(),
1022 >                 adaptors5[ai].pz, wseq.chars(), wlen, adaptors5[ai].amlen);
1023 >        }
1024 >         GXAlnInfo* aln=match_adaptor(seqdata, adaptors5[ai].trim_type, gxmem_l, 90);
1025 >         if (aln) {
1026 >           if (aln->strong) {
1027 >                   trimmed=true;
1028 >                   bestalns.Add(aln);
1029 >                   break; //will check the rest next time
1030 >                   }
1031 >            else bestalns.Add(aln);
1032 >           }
1033 >         } //forward and reverse?
1034 >   if (trimmed) break; //will check the rest in the next cycle
1035 >  }//for each 5' adaptor
1036 >  if (bestalns.Count()>0) {
1037 >           GXAlnInfo* aln=bestalns[0];
1038 >           if (aln->sl-1 > wlen-aln->sr) {
1039 >                   //keep left side
1040 >                   l3-=(wlen-aln->sl+1);
1041 >                   if (l3<0) l3=0;
1042 >                   }
1043 >           else { //keep right side
1044 >                   l5+=aln->sr;
1045 >                   if (l5>=rlen) l5=rlen-1;
1046 >                   }
1047 >           //delete aln;
1048 >           //if (l3-l5+1<min_read_len) return true;
1049 >           wseq=seq.substr(l5,l3-l5+1);
1050 >           wlen=wseq.length();
1051 >           return true; //break the loops here to report a good find
1052 >     }
1053 >  return false;
1054 > }
1055 >
1056 > //convert qvs to/from phred64 from/to phread33
1057 > void convertPhred(GStr& q) {
1058 > for (int i=0;i<q.length();i++) q[i]+=qv_cvtadd;
1059 > }
1060 >
1061 > void convertPhred(char* q, int len) {
1062 > for (int i=0;i<len;i++) q[i]+=qv_cvtadd;
1063 > }
1064 >
1065 > bool getFastxRec(GLineReader& fq, GStr& rseq, GStr& rqv,
1066 >          GStr& rname, GStr& rinfo, GStr& infname) {
1067 > rseq="";
1068 > rqv="";
1069 > rname="";
1070 > rinfo="";
1071 > if (fq.eof()) return false;
1072 > char* l=fq.getLine();
1073 > while (l!=NULL && (l[0]==0 || isspace(l[0]))) l=fq.getLine(); //ignore empty lines
1074 > if (l==NULL) return false;
1075 > /* if (rawFormat) {
1076 >      //TODO: implement raw qseq parsing here
1077 >      //if (raw type=N) then continue; //skip invalid/bad records
1078 >      } //raw qseq format
1079 > else { // FASTQ or FASTA */
1080 > isfasta=(l[0]=='>');
1081 > if (!isfasta && l[0]!='@') GError("Error: fasta/fastq record marker not found(%s)\n%s\n",
1082 >      infname.chars(), l);
1083 > GStr s(l);
1084 > rname=&(l[1]);
1085 > for (int i=0;i<rname.length();i++)
1086 >    if (rname[i]<=' ') {
1087 >       if (i<rname.length()-2) rinfo=rname.substr(i+1);
1088 >       rname.cut(i);
1089 >       break;
1090 >       }
1091 >  //now get the sequence
1092 > if ((l=fq.getLine())==NULL)
1093 >      GError("Error: unexpected EOF after header for read %s (%s)\n",
1094 >                   rname.chars(), infname.chars());
1095 > rseq=l; //this must be the DNA line
1096 > while ((l=fq.getLine())!=NULL) {
1097 >      //seq can span multiple lines
1098 >      if (l[0]=='>' || l[0]=='+') {
1099 >           fq.pushBack();
1100 >           break; //
1101 >           }
1102 >      rseq+=l;
1103 >      } //check for multi-line seq
1104 > if (!isfasta) { //reading fastq quality values, which can also be multi-line
1105 >    if ((l=fq.getLine())==NULL)
1106 >        GError("Error: unexpected EOF after sequence for %s\n", rname.chars());
1107 >    if (l[0]!='+') GError("Error: fastq qv header marker not detected!\n");
1108 >    if ((l=fq.getLine())==NULL)
1109 >        GError("Error: unexpected EOF after qv header for %s\n", rname.chars());
1110 >    rqv=l;
1111 >    //if (rqv.length()!=rseq.length())
1112 >    //  GError("Error: qv len != seq len for %s\n", rname.chars());
1113 >    while (rqv.length()<rseq.length() && ((l=fq.getLine())!=NULL)) {
1114 >      rqv+=l; //append to qv string
1115        }
1116 <   return true;
1116 >    }// fastq
1117 > // } //<-- FASTA or FASTQ
1118 > rseq.upper();
1119 > return true;
1120 > }
1121 >
1122 > #ifdef GDEBUG
1123 > void showTrim(GStr& s, int l5, int l3) {
1124 >  if (l5>0 || l3==0) {
1125 >    color_bg(c_red);
1126 >    }
1127 >  for (int i=0;i<s.length()-1;i++) {
1128 >    if (i && i==l5) color_resetbg();
1129 >    fprintf(stderr, "%c", s[i]);
1130 >    if (i && i==l3) color_bg(c_red);
1131     }
1132 < //also, for fast detection of other adapter-only reads that start past
1133 < // the beginning of the adapter sequence, try to see if the first a3len-4
1134 < // characters of the read are a substring of the adapter
1135 < GStr rstart=seq.substr(0,a3len-4);
1136 < if ((fi=adapter3.index(rstart))>=0) {
1137 <   l3=rlen-1;
1138 <   l5=a3len-4;
1139 <   while (fi+l5<a3len && l5<l3 && adapter3[fi+l5]==seq[l5]) l5++;
1140 <   return true;
1132 >  fprintf(stderr, "%c", s[s.length()-1]);
1133 >  color_reset();
1134 >  fprintf(stderr, "\n");
1135 > }
1136 > #endif
1137 >
1138 > char process_read(GStr& rname, GStr& rseq, GStr& rqv, int &l5, int &l3) {
1139 > //returns 0 if the read was untouched, 1 if it was just trimmed
1140 > // and a trash code if it was trashed
1141 > l5=0;
1142 > l3=rseq.length()-1;
1143 > #ifdef GDEBUG
1144 >   //rseq.reverse();
1145 >   GMessage(">%s\n", rname.chars());
1146 >   GMessage("%s\n",rseq.chars());
1147 > #endif
1148 > if (l3-l5+1<min_read_len) {
1149 >   return 's';
1150     }
1151 <  
1152 < //another easy case: first half of the adaptor matches
1153 < int a3hlen=a3len>>1;
1154 < GStr ahalf=adapter3.substr(0, a3hlen);
1155 < if ((fi=seq.index(ahalf))>=0) {
1156 <   extendMatch(seq.chars(), rlen, fi,
1157 <                 adapter3.chars(), a3len, 0,  a3hlen, l5,l3);
1158 <   return true;
1151 > GStr wseq(rseq.chars());
1152 > GStr wqv(rqv.chars());
1153 > int w5=l5;
1154 > int w3=l3;
1155 > //first do the q-based trimming
1156 > if (qvtrim_qmin!=0 && !wqv.is_empty() && qtrim(wqv, w5, w3)) { // qv-threshold trimming
1157 >   if (w3-w5+1<min_read_len) {
1158 >     return 'Q'; //invalid read
1159 >     }
1160 >    //-- keep only the w5..w3 range
1161 >   l5=w5;
1162 >   l3=w3;
1163 >   wseq=wseq.substr(w5, w3-w5+1);
1164 >   if (!wqv.is_empty())
1165 >      wqv=wqv.substr(w5, w3-w5+1);
1166 >   } //qv trimming
1167 > // N-trimming on the remaining read seq
1168 > if (ntrim(wseq, w5, w3)) {
1169 >   //GMessage("before: %s\n",wseq.chars());
1170 >   //GMessage("after : %s (%d)\n",wseq.substr(w5,w3-w5+1).chars(),w3-w5+1);
1171 >   l5+=w5;
1172 >   l3-=(wseq.length()-1-w3);
1173 >   if (w3-w5+1<min_read_len) {
1174 >     return 'N'; //to be trashed
1175 >     }
1176 >    //-- keep only the w5..w3 range
1177 >   wseq=wseq.substr(w5, w3-w5+1);
1178 >   if (!wqv.is_empty())
1179 >      wqv=wqv.substr(w5, w3-w5+1);
1180 >   w5=0;
1181 >   w3=wseq.length()-1;
1182     }
1183 < //no easy cases, so let's do the word hashing
1184 < for (int iw=0;iw<6;iw++) {
1185 <   GStr aword=adapter3.substr(iw,3);
1186 <   if ((fi=seq.index(aword))>=0  && rlen-fi>3) {
1187 <      if (extendMatch(seq.chars(), rlen, fi, adapter3.chars(),
1188 <                   a3len, iw, 3, l5,l3)) return true;
1183 > char trim_code;
1184 > //clean the more dirty end first - 3'
1185 > int prev_w5=0;
1186 > int prev_w3=0;
1187 > bool w3upd=true;
1188 > bool w5upd=true;
1189 > do {
1190 >  trim_code=0;
1191 >  if (w3upd && trim_poly3(wseq, w5, w3, polyA_seed)) {
1192 >      trim_code='A';
1193 >      }
1194 >  else if (w3upd && trim_poly3(wseq, w5, w3, polyT_seed)) {
1195 >      trim_code='T';
1196 >      }
1197 >  else if (w5upd && trim_poly5(wseq, w5, w3, polyA_seed)) {
1198 >      trim_code='A';
1199 >      }
1200 >  else if (w5upd && trim_poly5(wseq, w5, w3, polyT_seed)) {
1201 >      trim_code='T';
1202 >      }
1203 >  else if (trim_adaptor5(wseq, w5, w3)) {
1204 >      trim_code='5';
1205 >      }
1206 >  else if (trim_adaptor3(wseq, w5, w3)) {
1207 >      trim_code='3';
1208        }
1209 +  if (trim_code) {
1210 +     w3upd=(w3!=prev_w3);
1211 +         w5upd=(w5!=prev_w5);
1212 +         if (w3upd) prev_w3=w3;
1213 +         if (w5upd) prev_w5=w5;
1214 +   #ifdef GDEBUG
1215 +     GMessage("#### TRIM by '%c' code ( w5-w3 = %d-%d ):\n",trim_code, w5,w3);
1216 +     showTrim(wseq, w5, w3);
1217 +   #endif
1218 +     int trimlen=wseq.length()-(w3-w5+1);
1219 +     num_trimmed3++;
1220 +     if (trimlen<min_trimmed3)
1221 +         min_trimmed3=trimlen;
1222 +     l5+=w5;
1223 +     l3-=(wseq.length()-1-w3);
1224 +     if (w3-w5+1<min_read_len) {
1225 +         return trim_code;
1226 +         }
1227 +      //-- keep only the w5..w3 range
1228 +      wseq=wseq.substr(w5, w3-w5+1);
1229 +      if (!wqv.is_empty())
1230 +         wqv=wqv.substr(w5, w3-w5+1);
1231 +      }//trimming at 3' end
1232 + } while (trim_code);
1233 +
1234 + if (doCollapse) {
1235 +   //keep read for later
1236 +   FqDupRec* dr=dhash.Find(wseq.chars());
1237 +   if (dr==NULL) { //new entry
1238 +          //if (prefix.is_empty())
1239 +             dhash.Add(wseq.chars(),
1240 +                  new FqDupRec(&wqv, rname.chars()));
1241 +          //else dhash.Add(wseq.chars(), new FqDupRec(wqv.chars(),wqv.length()));
1242 +         }
1243 +      else
1244 +         dr->add(wqv);
1245 +   } //collapsing duplicates
1246 + else { //not collapsing duplicates
1247 +   //apply the dust filter now
1248 +   if (doDust) {
1249 +     int dustbases=dust(wseq);
1250 +     if (dustbases>(wseq.length()>>1)) {
1251 +        return 'D';
1252 +        }
1253 +     }
1254 +   } //not collapsing duplicates
1255 + return (l5>0 || l3<rseq.length()-1) ? 1 : 0;
1256 + }
1257 +
1258 + void printHeader(FILE* f_out, char recmarker, GStr& rname, GStr& rinfo) {
1259 + //GMessage("printing Header..%c%s\n",recmarker, rname.chars());
1260 + if (rinfo.is_empty()) fprintf(f_out, "%c%s\n",recmarker,rname.chars());
1261 +  else fprintf(f_out, "%c%s %s\n",recmarker, rname.chars(), rinfo.chars());
1262 + }
1263 +
1264 + void writeRead(FILE* f_out, GStr& rname, GStr& rinfo, GStr& rseq, GStr& rqv, int& outcounter) {
1265 +   outcounter++;
1266 +   bool asFasta=(rqv.is_empty() || fastaOutput);
1267 +   if (asFasta) {
1268 +    if (prefix.is_empty()) {
1269 +       printHeader(f_out, '>',rname,rinfo);
1270 +       fprintf(f_out, "%s\n", rseq.chars()); //plain one-line fasta for now
1271 +       }
1272 +      else {
1273 +       fprintf(f_out, ">%s%08d\n%s\n", prefix.chars(), outcounter,
1274 +                          rseq.chars());
1275 +       }
1276 +     }
1277 +   else {  //fastq
1278 +    if (convert_phred) convertPhred(rqv);
1279 +    if (prefix.is_empty()) {
1280 +       printHeader(f_out, '@', rname, rinfo);
1281 +       fprintf(f_out, "%s\n+\n%s\n", rseq.chars(), rqv.chars());
1282 +       }
1283 +      else
1284 +       fprintf(f_out, "@%s_%08d\n%s\n+\n%s\n", prefix.chars(), outcounter,
1285 +                          rseq.chars(),rqv.chars() );
1286 +     }
1287 + }
1288 +
1289 + void trash_report(char trashcode, GStr& rname, FILE* freport) {
1290 + if (freport==NULL || trashcode<=' ') return;
1291 + if (trashcode=='3' || trashcode=='5') {
1292 +   fprintf(freport, "%s\ta%c\n",rname.chars(),trashcode);
1293     }
1294 < return false; //no adapter parts found
1294 > else {
1295 >   fprintf(freport, "%s\t%c\n",rname.chars(),trashcode);
1296 >   }
1297 > //tcounter++;
1298 > }
1299 >
1300 > GStr getFext(GStr& s, int* xpos=NULL) {
1301 > //s must be a filename without a path
1302 > GStr r("");
1303 > if (xpos!=NULL) *xpos=0;
1304 > if (s.is_empty() || s=="-") return r;
1305 > int p=s.rindex('.');
1306 > int d=s.rindex('/');
1307 > if (p<=0 || p>s.length()-2 || p<s.length()-7 || p<d) return r;
1308 > r=s.substr(p+1);
1309 > if (xpos!=NULL) *xpos=p+1;
1310 > r.lower();
1311 > return r;
1312   }
1313  
1314 < bool trim_adapter5(GStr& seq, int&l5, int &l3) {
1315 < //if (debug) GMessage("trim_adapter5 on: %s\n", seq.chars());
1316 < int rlen=seq.length();
1317 < l5=0;
1318 < l3=rlen-1;
1319 < //try to see if adapter is fully included in the read
1320 < int fi=-1;
1321 < if ((fi=seq.index(adapter5))>=0) {
1322 <   if (fi<rlen-fi-a5len) {//match is closer to the right end
744 <      l5=fi+a5len;
745 <      l3=rlen-1;
746 <      }
747 <    else {
748 <      l5=0;
749 <      l3=fi-1;
750 <      }
751 <   return true;
1314 > void baseFileName(GStr& fname) {
1315 > //remove all known extensions, like .txt,fq,fastq,fasta,fa)(.gz .gzip .bz2 .bzip2) .
1316 > int xpos=0;
1317 > GStr fext=getFext(fname, &xpos);
1318 > if (xpos<=1) return;
1319 > bool extdel=false;
1320 > GStr f2;
1321 > if (fext=="z" || fext=="zip") {
1322 >   extdel=true;
1323     }
1324 < //for fast detection of adapter-rich reads, check if the first 12
1325 < //characters of the read are a substring of the adapter
1326 < GStr rstart=seq.substr(1,12);
756 < if ((fi=adapter5.index(rstart))>=0) {
757 <   //l3=rlen-1;
758 <   //l5=a5len-4;
759 <   //while (fi+l5<a5len && l5<l3 && adapter5[fi+l5]==seq[l5]) l5++;
760 <   //return true;
761 <   //if (debug) GMessage(" first 12char of the read match adaptor!\n");
762 <   extendMatch(seq.chars(), rlen, 1,
763 <                 adapter5.chars(), a5len, fi,  12, l5,l3, true);
764 <   return true;
1324 >  else if (fext.length()>=2) {
1325 >   f2=fext.substr(0,2);
1326 >   extdel=(f2=="gz" || f2=="bz");
1327     }
1328 <  
1329 < //another easy case: last 12 characters of the adaptor found as a substring of the read
1330 < int aplen=12;
1331 < int apstart=a5len-aplen-2;
770 < if (apstart<0) { apstart=0; aplen=a5len-1; }
771 < GStr bstr=adapter5.substr(apstart, aplen);
772 < if ((fi=seq.index(bstr))>=0) {
773 <   //if (debug) GMessage(" last 12char of adaptor match the read!\n");
774 <   extendMatch(seq.chars(), rlen, fi,
775 <                 adapter5.chars(), a5len, apstart,  aplen, l5,l3,true);
776 <   return true;
1328 > if (extdel) {
1329 >   fname.cut(xpos-1);
1330 >   fext=getFext(fname, &xpos);
1331 >   if (xpos<=1) return;
1332     }
1333 < //no easy cases, find a triplet match as a seed for alignment extension
1334 < //find triplets at the right end of the adapter
1335 < for (int iw=0;iw<6;iw++) {
781 <   apstart=a5len-iw-4;
782 <   GStr aword=adapter5.substr(apstart,3);
783 <   if ((fi=seq.index(aword))>=0) {
784 <      //if (debug) GMessage("extending wmatch :%*s\n", fi+3, aword.chars());
785 <      if (extendMatch(seq.chars(), rlen, fi, adapter5.chars(),
786 <                   a5len, apstart, 3, l5,l3,true)) {
787 <                     return true;
788 <                     }
789 <      }
1333 > extdel=false;
1334 > if (fext=="f" || fext=="fq" || fext=="txt" || fext=="seq" || fext=="sequence") {
1335 >   extdel=true;
1336     }
1337 < return false; //no adapter parts found
1338 < }
1337 >  else if (fext.length()>=2) {
1338 >   extdel=(fext.substr(0,2)=="fa");
1339 >   }
1340 > if (extdel) fname.cut(xpos-1);
1341 > GStr fncp(fname);
1342 > fncp.lower();
1343 > fncp.chomp("seq");
1344 > fncp.chomp("sequence");
1345 > fncp.trimR("_.");
1346 > if (fncp.length()<fname.length()) fname.cut(fncp.length());
1347 > }
1348 >
1349 > FILE* prepOutFile(GStr& infname, GStr& pocmd) {
1350 >  FILE* f_out=NULL;
1351 >  GStr fname(getFileName(infname.chars()));
1352 >  //eliminate known extensions
1353 >  baseFileName(fname);
1354 >  if (outsuffix.is_empty() || outsuffix=="-") { return stdout; }
1355 >    else if (pocmd.is_empty()) {
1356 >               GStr oname(fname);
1357 >               oname.append('.');
1358 >               oname.append(outsuffix);
1359 >               f_out=fopen(oname.chars(),"w");
1360 >               if (f_out==NULL) GError("Error: cannot create '%s'\n",oname.chars());
1361 >               }
1362 >            else {
1363 >              GStr oname(">");
1364 >              oname.append(fname);
1365 >              oname.append('.');
1366 >              oname.append(outsuffix);
1367 >              pocmd.append(oname);
1368 >              f_out=popen(pocmd.chars(), "w");
1369 >              if (f_out==NULL) GError("Error: cannot popen '%s'\n",pocmd.chars());
1370 >              }
1371 > return f_out;
1372 > }
1373  
1374 < //conversion of phred64 to phread33
1375 < void convertQ64(GStr& q) {
1376 < for (int i=0;i<q.length();i++) q[i]-=31;
1374 > void guess_unzip(GStr& fname, GStr& picmd) {
1375 > GStr fext=getFext(fname);
1376 > if (fext=="gz" || fext=="gzip" || fext=="z") {
1377 >    picmd="gzip -cd ";
1378 >    }
1379 >   else if (fext=="bz2" || fext=="bzip2" || fext=="bz" || fext=="bzip") {
1380 >    picmd="bzip2 -cd ";
1381 >    }
1382 > }
1383 >
1384 > void addAdaptor(GVec<CASeqData>& adaptors, GStr& seq, GAlnTrimType trim_type) {
1385 > //TODO: prepare CASeqData here, and collect hexamers as well
1386 >  if (seq.is_empty() || seq=="-" ||
1387 >          seq=="N/A" || seq==".") return;
1388 >
1389 > CASeqData adata(revCompl);
1390 > int idx=adaptors.Add(adata);
1391 > if (idx<0) GError("Error: failed to add adaptor!\n");
1392 > adaptors[idx].trim_type=trim_type;
1393 > adaptors[idx].update(seq.chars());
1394   }
1395  
1396 < void convertQ64(char* q, int len) {
1397 < for (int i=0;i<len;i++) q[i]-=31;
1396 >
1397 > int loadAdaptors(const char* fname) {
1398 >  GLineReader lr(fname);
1399 >  char* l;
1400 >  while ((l=lr.nextLine())!=NULL) {
1401 >   if (lr.length()<=3 || l[0]=='#') continue;
1402 >   if ( l[0]==' ' || l[0]=='\t' || l[0]==',' ||
1403 >        l[0]==';'|| l[0]==':' ) {
1404 >      int i=1;
1405 >      while (l[i]!=0 && isspace(l[i])) {
1406 >        i++;
1407 >        }
1408 >      if (l[i]!=0) {
1409 >        GStr s(&(l[i]));
1410 >      #ifdef GDEBUG
1411 >          //s.reverse();
1412 >      #endif
1413 >        addAdaptor(adaptors3, s, galn_TrimRight);
1414 >        continue;
1415 >        }
1416 >      }
1417 >    else {
1418 >      GStr s(l);
1419 >      s.startTokenize("\t ;,:");
1420 >      GStr a5,a3;
1421 >      if (s.nextToken(a5))
1422 >            s.nextToken(a3);
1423 >        else continue; //no tokens on this line
1424 >      GAlnTrimType ttype5=galn_TrimLeft;
1425 >      a5.upper();
1426 >      a3.upper();
1427 >      if (a3.is_empty() || a3==a5 || a3=="=") {
1428 >         a3.clear();
1429 >         ttype5=galn_TrimEither;
1430 >         }
1431 >     #ifdef GDEBUG
1432 >     //   a5.reverse();
1433 >     //   a3.reverse();
1434 >     #endif
1435 >      addAdaptor(adaptors5, a5, ttype5);
1436 >      addAdaptor(adaptors3, a3, galn_TrimRight);
1437 >      }
1438 >   }
1439 >   return adaptors5.Count()+adaptors3.Count();
1440   }
1441  
1442 + void setupFiles(FILE*& f_in, FILE*& f_in2, FILE*& f_out, FILE*& f_out2,
1443 +                       GStr& s, GStr& infname, GStr& infname2) {
1444 + // uses outsuffix to generate output file names and open file handles as needed
1445 + infname="";
1446 + infname2="";
1447 + f_in=NULL;
1448 + f_in2=NULL;
1449 + f_out=NULL;
1450 + f_out2=NULL;
1451 + //analyze outsuffix intent
1452 + GStr pocmd;
1453 + if (outsuffix=="-") {
1454 +    f_out=stdout;
1455 +    }
1456 +   else {
1457 +    GStr ox=getFext(outsuffix);
1458 +    if (ox.length()>2) ox=ox.substr(0,2);
1459 +    if (ox=="gz") pocmd="gzip -9 -c ";
1460 +        else if (ox=="bz") pocmd="bzip2 -9 -c ";
1461 +    }
1462 + if (s=="-") {
1463 +    f_in=stdin;
1464 +    infname="stdin";
1465 +    f_out=prepOutFile(infname, pocmd);
1466 +    return;
1467 +    } // streaming from stdin
1468 + s.startTokenize(",:");
1469 + s.nextToken(infname);
1470 + bool paired=s.nextToken(infname2);
1471 + if (fileExists(infname.chars())==0)
1472 +    GError("Error: cannot find file %s!\n",infname.chars());
1473 + GStr fname(getFileName(infname.chars()));
1474 + GStr picmd;
1475 + guess_unzip(fname, picmd);
1476 + if (picmd.is_empty()) {
1477 +   f_in=fopen(infname.chars(), "r");
1478 +   if (f_in==NULL) GError("Error opening file '%s'!\n",infname.chars());
1479 +   }
1480 +  else {
1481 +   picmd.append(infname);
1482 +   f_in=popen(picmd.chars(), "r");
1483 +   if (f_in==NULL) GError("Error at popen %s!\n", picmd.chars());
1484 +   }
1485 + if (f_out==stdout) {
1486 +   if (paired) GError("Error: output suffix required for paired reads\n");
1487 +   return;
1488 +   }
1489 + f_out=prepOutFile(infname, pocmd);
1490 + if (!paired) return;
1491 + if (doCollapse) GError("Error: sorry, -C option cannot be used with paired reads!\n");
1492 + // ---- paired reads:-------------
1493 + if (fileExists(infname2.chars())==0)
1494 +     GError("Error: cannot find file %s!\n",infname2.chars());
1495 + picmd="";
1496 + GStr fname2(getFileName(infname2.chars()));
1497 + guess_unzip(fname2, picmd);
1498 + if (picmd.is_empty()) {
1499 +   f_in2=fopen(infname2.chars(), "r");
1500 +   if (f_in2==NULL) GError("Error opening file '%s'!\n",infname2.chars());
1501 +   }
1502 +  else {
1503 +   picmd.append(infname2);
1504 +   f_in2=popen(picmd.chars(), "r");
1505 +   if (f_in2==NULL) GError("Error at popen %s!\n", picmd.chars());
1506 +   }
1507 + f_out2=prepOutFile(infname2, pocmd);
1508 + }

Diff Legend

Removed lines
+ Added lines
< Changed lines
> Changed lines