ViewVC Help
View File | Revision Log | Show Annotations | View Changeset | Root Listing
root/gclib/fqtrim/fqtrim.cpp
(Generate patch)
# Line 2 | Line 2
2   #include "GStr.h"
3   #include "GHash.hh"
4   #include "GList.hh"
5 + #include <ctype.h>
6 + #include "GAlnExtend.h"
7  
8   #define USAGE "Usage:\n\
9 < fqtrim [-5 <5'adapter>] [-3 <3'adapter>] [-l <minlen>] [-C] [-D] [-Q] \\\n\
10 <    [-n <rename_prefix>] [-o <trimmed.fq>] [-r <discarded.lst>] <input.fq>\n\
9 > fqtrim [{-5 <5adapter> -3 <3adapter>|-f <adapters_file>}] [-a <min_matchlen>]\\\n\
10 >   [-q <minq> [-t <trim_max_len>]] [-p {64|33}] [-o <outsuffix>]\\\n\
11 >   [-l <minlen>] [-C] [-D] [-Q] [-n <rename_prefix>] [-r <discarded.lst>]\\\n\
12 >    <input.fq>[,<input_mates.fq>\n\
13   \n\
14 < Trim low quality bases at 3' end, optional adapter, filter for low complexity and\n\
15 < optionally collapse duplicates in the input fastq data.\n\
14 > Trim low quality bases at the 3' end and can trim adapter sequence(s), filter\n\
15 > for low complexity and collapse duplicate reads.\n\
16 > If read pairs should be trimmed and kept together (i.e. without discarding\n\
17 > one read in a pair), the two file names should be given delimited by a comma\n\
18 > or a colon character\n\
19   \n\
20   Options:\n\
21   -n  rename all the reads using the <prefix> followed by a read counter;\n\
22      if -C option was given, the suffix \"_x<N>\" is appended, with <N> being\n\
23      the read duplication count\n\
24 < -o  write the trimmed/filtered fastq into <trimmed.fq>(instead of stdout)\n\
24 > -o  unless this parameter is '-', write the trimmed/filtered reads to \n\
25 >    file(s) named <input>.<outsuffix> which will be created in the \n\
26 >    current (working) directory; (writes to stdout if -o- is given);\n\
27 >    a suffix ending with .gz, .gzip or .bz2 will enforce compression\n\
28 > -f  file with adapter sequences to trim, each line having this format:\n\
29 >    <5'-adapter-sequence> <3'-adapter-sequence>\n\
30   -5  trim the given adapter or primer sequence at the 5' end of each read\n\
31      (e.g. -5 CGACAGGTTCAGAGTTCTACAGTCCGACGATC)\n\
32   -3  trim the given adapter sequence at the 3' end of each read\n\
33      (e.g. -3 TCGTATGCCGTCTTCTGCTTG)\n\
34 < -l  minimum \"clean\" length at the high quality 5'end that a read must\n\
35 <    have in order to pass the filter (default: 16)\n\
34 > -A  disable polyA trimming (enabled by default)\n\
35 > -T  enable polyT trimming (disabled by default)\n\
36 > -y  minimum length of exact match to adaptor sequence at the proper end (6)\n\
37 > -q  trim bases with quality value lower than <minq> (starting at the 3' end)\n\
38 > -t  for -q option, maximum trimming at the 3' end is limited to <trim_max_len>\n\
39 > -m  maximum percentage of Ns allowed in a read after trimming (default 7)\n\
40 > -l  minimum \"clean\" length after trimming that a read must have\n\
41 >    in order to pass the filter (default: 16)\n\
42   -r  write a simple \"trash report\" file listing the discarded reads with a\n\
43      one letter code for the reason why they were trashed\n\
44 + -D  apply a low-complexity (dust) filter and discard any read that has over\n\
45 +    50% of its length detected as low complexity\n\
46   -C  collapse duplicate reads and add a prefix with their count to the read\n\
47 <    name\n\
48 < -D  apply a low-complexity (dust) filter and discard any read that has at\n\
49 <    least half of it masked by this\n\
50 < -Q  quality values in the input data are interpreted as phred64, convert\n\
31 <    them to phred33\n\
47 >    name (e.g. for microRNAs)\n\
48 > -p  input is phred64/phred33 (use -p64 or -p33)\n\
49 > -Q  convert quality values to the other Phred qv type\n\
50 > -V  verbose processing\n\
51   "
52 +
53 + //-z  for -o option, the output stream(s) will be first piped into the given\n
54 + //   <zcmd> command, which must output to stdout (e.g. -z 'bzip2 -9 -c')\n
55 +
56 +
57   // example 3' adapter for miRNAs: TCGTATGCCGTCTTCTGCTTG
58 < FILE* f_out=NULL; //stdout if not provided
59 < FILE* f_in=NULL; //input fastq (stdin if not provided)
58 >
59 > //For paired reads sequencing:
60 > //3' : ACACTCTTTCCCTACACGACGCTCTTCCGATCT
61 > //5' : GATCGGAAGAGCGGTTCAGCAGGAATGCCGAG
62 > //FILE* f_out=NULL; //stdout if not provided
63 > //FILE* f_out2=NULL; //for paired reads
64 > //FILE* f_in=NULL; //input fastq (stdin if not provided)
65 > //FILE* f_in2=NULL; //for paired reads
66   FILE* freport=NULL;
67 +
68   bool debug=false;
69 + bool verbose=false;
70   bool doCollapse=false;
71   bool doDust=false;
72 + bool fastaOutput=false;
73   bool trashReport=false;
74 < bool rawFormat=false;
74 > //bool rawFormat=false;
75   int min_read_len=16;
76 + double max_perc_N=7.0;
77   int dust_cutoff=16;
78   bool isfasta=false;
79 < bool phred64=false;
79 > bool convert_phred=false;
80 > GStr outsuffix; // -o
81   GStr prefix;
82 < GStr adapter3;
83 < GStr adapter5;
84 < int a3len=0;
85 < int a5len=0;
86 < // adaptor matching metrics -- for extendMatch() function
87 < static const int a_m_score=2; //match score
88 < static const int a_mis_score=-3; //mismatch
89 < static const int a_dropoff_score=7;
90 < static const int a_min_score=7;
82 > GStr zcmd;
83 > int num_trimmed5=0;
84 > int num_trimmed3=0;
85 > int num_trimmedN=0;
86 > int num_trimmedQ=0;
87 > int min_trimmed5=INT_MAX;
88 > int min_trimmed3=INT_MAX;
89 >
90 > int qvtrim_qmin=0;
91 > int qvtrim_max=0;  //for -q, do not trim at 3'-end more than this number of bases
92 > int qv_phredtype=0; // could be 64 or 33 (0 means undetermined yet)
93 > int qv_cvtadd=0; //could be -31 or +31
94 >
95 > // adaptor matching metrics -- for X-drop ungapped extension
96 > const int poly_m_score=2; //match score
97 > const int poly_mis_score=-3; //mismatch
98 > const int poly_dropoff_score=7;
99 > int poly_minScore=12; //i.e. an exact match of 6 bases at the proper ends WILL be trimmed
100 >
101 > const char *polyA_seed="AAAA";
102 > const char *polyT_seed="TTTT";
103 >
104 > struct CAdapters {
105 >    GStr a5;
106 >    GStr a3;
107 >    CAdapters(const char* s5=NULL, const char* s3=NULL):a5(s5),a3(s3) {
108 >      }
109 > };
110 >
111 > GPVec<CAdapters> adapters;
112  
113   // element in dhash:
114   class FqDupRec {
# Line 91 | Line 147
147    if (!s.is_empty()) {
148        if (s=='-') f=stdout;
149        else {
150 <       f=fopen(s,"w");
150 >       f=fopen(s.chars(),"w");
151         if (f==NULL) GError("Error creating file: %s\n", s.chars());
152         }
153       }
# Line 102 | Line 158
158  
159   GHash<FqDupRec> dhash; //hash to keep track of duplicates
160  
161 < bool ntrim(GStr& rseq, GStr &rqv, int &l5, int &l3); //returns true if any trimming occured
161 > int loadAdapters(const char* fname);
162 >
163 > void setupFiles(FILE*& f_in, FILE*& f_in2, FILE*& f_out, FILE*& f_out2,
164 >                       GStr& s, GStr& infname, GStr& infname2);
165 > // uses outsuffix to generate output file names and open file handles as needed
166  
167 + void writeRead(FILE* f_out, GStr& rname, GStr& rinfo, GStr& rseq, GStr& rqv, int& outcounter);
168 + void trash_report(char trashcode, GStr& rname, FILE* freport);
169 +
170 + bool getFastxRec(GLineReader& fq, GStr& rseq, GStr& rqv,
171 +          GStr& rname, GStr& rinfo, GStr& infname);
172 +
173 + char process_read(GStr& rname, GStr& rseq, GStr& rqv, int &l5, int &l3);
174 + //returns 0 if the read was untouched, 1 if it was trimmed and a trash code if it was trashed
175 +
176 + bool ntrim(GStr& rseq, int &l5, int &l3); //returns true if any trimming occured
177 + bool qtrim(GStr& qvs, int &l5, int &l3); //return true if any trimming occured
178   int dust(GStr& seq);
179 < bool trim_adapter3(GStr& seq, int &l5, int &l3); //returns true if any trimming occured
179 > bool trim_poly5(GStr &seq, int &l5, int &l3, const char* poly_seed); //returns true if any trimming occured
180 > bool trim_poly3(GStr &seq, int &l5, int &l3, const char* poly_seed);
181   bool trim_adapter5(GStr& seq, int &l5, int &l3); //returns true if any trimming occured
182 + bool trim_adapter3(GStr& seq, int &l5, int &l3);
183  
184 < void convertQ64(char* q, int len);
185 < void convertQ64(GStr& q);
184 > void convertPhred(char* q, int len);
185 > void convertPhred(GStr& q);
186  
187   int main(int argc, char * const argv[]) {
188 <  GArgs args(argc, argv, "YQRDCl:d:3:5:n:r:o:");
188 >  GArgs args(argc, argv, "YQDCVl:d:3:5:m:n:r:p:q:f:t:o:z:a:");
189    int e;
117  int icounter=0; //counter for input reads
118  int outcounter=0; //counter for output reads
190    if ((e=args.isError())>0) {
191        GMessage("%s\nInvalid argument: %s\n", USAGE, argv[e]);
192        exit(224);
193        }
194    debug=(args.getOpt('Y')!=NULL);
195 <  phred64=(args.getOpt('Q')!=NULL);
195 >  verbose=(args.getOpt('V')!=NULL);
196 >  convert_phred=(args.getOpt('Q')!=NULL);
197    doCollapse=(args.getOpt('C')!=NULL);
198    doDust=(args.getOpt('D')!=NULL);
199 +  /*
200    rawFormat=(args.getOpt('R')!=NULL);
201    if (rawFormat) {
202      GError("Sorry, raw qseq format parsing is not implemented yet!\n");
203      }
204 +  */
205    prefix=args.getOpt('n');
206    GStr s=args.getOpt('l');
207    if (!s.is_empty())
208       min_read_len=s.asInt();
209 +  s=args.getOpt('m');
210 +  if (!s.is_empty())
211 +     max_perc_N=s.asDouble();
212    s=args.getOpt('d');
213    if (!s.is_empty()) {
214       dust_cutoff=s.asInt();
215       doDust=true;
216       }
217 <    
218 <  if (args.getOpt('3')!=NULL) {
219 <    adapter3=args.getOpt('3');
220 <    adapter3.upper();
221 <    a3len=adapter3.length();
222 <    }
223 <  if (args.getOpt('5')!=NULL) {
224 <    adapter5=args.getOpt('5');
225 <    adapter5.upper();
226 <    a5len=adapter5.length();
217 >  s=args.getOpt('q');
218 >  if (!s.is_empty()) {
219 >     qvtrim_qmin=s.asInt();
220 >     }
221 >  s=args.getOpt('t');
222 >  if (!s.is_empty()) {
223 >     qvtrim_max=s.asInt();
224 >     }
225 >  s=args.getOpt('p');
226 >  if (!s.is_empty()) {
227 >     int v=s.asInt();
228 >     if (v==33) {
229 >        qv_phredtype=33;
230 >        qv_cvtadd=31;
231 >        }
232 >      else if (v==64) {
233 >        qv_phredtype=64;
234 >        qv_cvtadd=-31;
235 >        }
236 >       else
237 >         GMessage("%s\nInvalid value for -p option (can only be 64 or 33)!\n",USAGE);
238 >     }
239 >  s=args.getOpt('f');
240 >  if (!s.is_empty()) {
241 >   loadAdapters(s.chars());
242 >   }
243 >  bool fileAdapters=adapters.Count();
244 >  s=args.getOpt('5');
245 >  if (!s.is_empty()) {
246 >    if (fileAdapters)
247 >      GError("Error: options -5 and -f cannot be used together!\n");
248 >    s.upper();
249 >    adapters.Add(new CAdapters(s.chars()));
250      }
251 +  s=args.getOpt('3');
252 +  if (!s.is_empty()) {
253 +    if (fileAdapters)
254 +      GError("Error: options -3 and -f cannot be used together!\n");
255 +    s.upper();
256 +    if (adapters.Count()>0)
257 +          adapters[0]->a3=s.chars();
258 +     else adapters.Add(NULL, new CAdapters(s.chars()));
259 +    }
260 +  s=args.getOpt('y');
261 +  if (!s.is_empty()) {
262 +     int minmatch=s.asInt();
263 +     poly_minScore=minmatch*poly_m_score;
264 +     }
265 +  
266 +  if (args.getOpt('o')!=NULL) outsuffix=args.getOpt('o');
267 +                         else outsuffix="-";
268    trashReport=  (args.getOpt('r')!=NULL);
269 <  if (args.startNonOpt()==0) {
269 >  int fcount=args.startNonOpt();
270 >  if (fcount==0) {
271      GMessage(USAGE);
272      exit(224);
273      }
274 <
275 <  openfw(f_out, args, 'o');
276 <  if (f_out==NULL) f_out=stdout;
274 >   if (fcount>1 && doCollapse) {
275 >    GError("%s Sorry, the -C option only works with a single input.\n", USAGE);
276 >    }
277 >  //openfw(f_out, args, 'o');
278 >  //if (f_out==NULL) f_out=stdout;
279    if (trashReport)
280      openfw(freport, args, 'r');
281    char* infile=NULL;
282    while ((infile=args.nextNonOpt())!=NULL) {
283 <    GStr infname(infile);
284 <    if (strcmp(infile,"-")==0) {
285 <       f_in=stdin; infname="stdin"; }
286 <     else {
287 <        f_in=fopen(infile,"r");
288 <        if (f_in==NULL)
289 <            GError("Cannot open input file %s!\n",infile);
290 <        }
291 <     GLineReader fq(f_in);
292 <     char* l=NULL;
293 <     while ((l=fq.getLine())!=NULL) {
294 <        GStr rname; //current read name
295 <        GStr rseq;  //current read sequence
296 <        GStr rqv;   //current read quality values
297 <        GStr s;
298 <        if (rawFormat) {
299 <          //TODO: implement qseq parsing here
300 <          //if (raw type=N) then continue; //skip invalid/bad records
301 <          
302 <          } //raw qseq format
303 <         else { // FASTQ or FASTA
304 <          isfasta=(l[0]=='>');
305 <          if (!isfasta && l[0]!='@') GError("Error: fastq record marker not detected!\n");
306 <          s=l;
307 <          rname=&(l[1]);
308 <          icounter++;
309 <          //GMessage("readname=%s\n",rname.chars());
310 <          for (int i=0;i<rname.length();i++)
311 <            if (rname[i]<=' ') { rname.cut(i); break; }
312 <          //now get the sequence
313 <          if ((l=fq.getLine())==NULL)
314 <              GError("Error: unexpected EOF after header for %s\n",rname.chars());
315 <          rseq=l; //this must be the DNA line
316 <          if (isfasta) {
317 <            while ((l=fq.getLine())!=NULL) {
318 <              //fasta can have multiple sequence lines
319 <              if (l[0]=='>') {
320 <                   fq.pushBack();
321 <                   break; //
283 >    int incounter=0; //counter for input reads
284 >    int outcounter=0; //counter for output reads
285 >    int trash_s=0; //too short from the get go
286 >    int trash_Q=0;
287 >    int trash_N=0;
288 >    int trash_D=0;
289 >    int trash_A3=0;
290 >    int trash_A5=0;
291 >    s=infile;
292 >    GStr infname;
293 >    GStr infname2;
294 >    FILE* f_in=NULL;
295 >    FILE* f_in2=NULL;
296 >    FILE* f_out=NULL;
297 >    FILE* f_out2=NULL;
298 >    bool paired_reads=false;
299 >    setupFiles(f_in, f_in2, f_out, f_out2, s, infname, infname2);
300 >    GLineReader fq(f_in);
301 >    GLineReader* fq2=NULL;
302 >    if (f_in2!=NULL) {
303 >       fq2=new GLineReader(f_in2);
304 >       paired_reads=true;
305 >       }
306 >    GStr rseq, rqv, seqid, seqinfo;
307 >    GStr rseq2, rqv2, seqid2, seqinfo2;
308 >    while (getFastxRec(fq, rseq, rqv, seqid, seqinfo, infname)) {
309 >       incounter++;
310 >       int a5=0, a3=0, b5=0, b3=0;
311 >       char tcode=0, tcode2=0;
312 >       tcode=process_read(seqid, rseq, rqv, a5, a3);
313 >       //if (!doCollapse) {
314 >         if (fq2!=NULL) {
315 >            getFastxRec(*fq2, rseq2, rqv2, seqid2, seqinfo2, infname2);
316 >            if (seqid.substr(0,seqid.length()-1)!=seqid2.substr(0,seqid2.length()-1)) {
317 >               GError("Error: no paired match for read %s vs %s (%s,%s)\n",
318 >                  seqid.chars(), seqid2.chars(), infname.chars(), infname2.chars());
319 >               }
320 >            tcode2=process_read(seqid2, rseq2, rqv2, b5, b3);
321 >            //decide what to do with this pair and print rseq2 if the pair makes it
322 >            if (tcode>1 && tcode2<=1) {
323 >               //"untrash" rseq
324 >               if (a3-a5+1<min_read_len) {
325 >                   a5=1;
326 >                   if (a3<min_read_len) { a3= GMIN(rseq.length()-1, min_read_len+1); }
327                     }
328 <              rseq+=l;
329 <              }
330 <            } //multi-line fasta file
331 <          if (!isfasta) {
332 <            if ((l=fq.getLine())==NULL)
333 <                GError("Error: unexpected EOF after sequence for %s\n", rname.chars());
334 <            if (l[0]!='+') GError("Error: fastq qv header marker not detected!\n");
335 <            if ((l=fq.getLine())==NULL)
336 <                GError("Error: unexpected EOF after qv header for %s\n", rname.chars());
337 <            rqv=l;
338 <            if (rqv.length()!=rseq.length())
339 <                GError("Error: qv len != seq len for %s\n", rname.chars());
340 <            }
341 <        } //<-- FASTA or FASTQ
217 <        rseq.upper();
218 <        int l5=0;
219 <        int l3=rseq.length()-1;
220 <        if (l3-l5+1<min_read_len) {
221 <           if (trashReport) {
222 <                  fprintf(freport, "%s\ts\t%s\n",rname.chars(), rseq.chars());
223 <                  }
224 <           continue;
225 <           }
226 <        if (ntrim(rseq, rqv, l5, l3)) {
227 <           if (l3-l5+1<min_read_len) {
228 <             if (trashReport) {
229 <                    fprintf(freport, "%s\tN\t%s\n", rname.chars(), rseq.chars());
230 <                    }
231 <             continue; //invalid read
232 <             }
233 <            //-- keep only the l5..l3 range
234 <           rseq=rseq.substr(l5, l3-l5+1);
235 <           if (!rqv.is_empty())
236 <              rqv=rqv.substr(l5, l3-l5+1);
237 <           }
238 <          
239 <        if (a3len>0) {
240 <          if (trim_adapter3(rseq, l5, l3)) {
241 <             if (l3-l5+1<min_read_len) {
242 <                 if (trashReport) {
243 <                     fprintf(freport, "%s\tA3\t%s\n",rname.chars(), rseq.chars());
244 <                     }
245 <                 continue;
246 <                 }
247 <              //-- keep only the l5..l3 range
248 <              rseq=rseq.substr(l5, l3-l5+1);
249 <              if (!rqv.is_empty())
250 <                 rqv=rqv.substr(l5, l3-l5+1);
251 <              }//some adapter was trimmed
252 <           } //adapter trimming
253 <        if (a5len>0) {
254 <          if (trim_adapter5(rseq, l5, l3)) {
255 <             if (l3-l5+1<min_read_len) {
256 <                 if (trashReport) {
257 <                     fprintf(freport, "%s\tA5\t%s\n",rname.chars(), rseq.chars());
258 <                     }
259 <                 continue;
328 >               tcode=1;
329 >               }
330 >             else if (tcode<=1 && tcode2>1) {
331 >               //"untrash" rseq2
332 >               if (b3-b5+1<min_read_len) {
333 >                   b5=1;
334 >                   if (b3<min_read_len) { b3= GMIN((rseq2.length()-1),(min_read_len+1)); }
335 >                   }
336 >               tcode2=1;
337 >               }
338 >            if (tcode<=1) { //trimmed or left intact -- write it!
339 >               if (tcode>0) {
340 >                 rseq2=rseq2.substr(b5,b3-b5+1);
341 >                 if (!rqv2.is_empty()) rqv2=rqv2.substr(b5,b3-b5+1);
342                   }
343 <              //-- keep only the l5..l3 range
344 <              rseq=rseq.substr(l5, l3-l5+1);
345 <              if (!rqv.is_empty())
346 <                 rqv=rqv.substr(l5, l3-l5+1);
347 <              }//some adapter was trimmed
348 <           } //adapter trimming
349 <        if (doCollapse) {
350 <           //keep read for later
351 <           FqDupRec* dr=dhash.Find(rseq.chars());
352 <           if (dr==NULL) { //new entry
353 <                  //if (prefix.is_empty())
354 <                     dhash.Add(rseq.chars(),
355 <                          new FqDupRec(&rqv, rname.chars()));
356 <                  //else dhash.Add(rseq.chars(), new FqDupRec(rqv.chars(),rqv.length()));
343 >               int nocounter=0;
344 >               writeRead(f_out2, seqid2, seqinfo2, rseq2, rqv2, nocounter);
345 >               }
346 >            } //paired read
347 >       // }
348 >       if (tcode>1) { //trashed
349 >          if (tcode=='s') trash_s++;
350 >            else if (tcode=='Q') trash_Q++;
351 >              else if (tcode=='N') trash_N++;
352 >               else if (tcode=='D') trash_D++;
353 >                else if (tcode=='3') trash_A3++;
354 >                 else if (tcode=='5') trash_A5++;
355 >          if (trashReport) trash_report(tcode, seqid, freport);
356 >          }
357 >         else if (!doCollapse) { //write it
358 >          if (tcode>0) {
359 >            rseq=rseq.substr(a5,a3-a5+1);
360 >            if (!rqv.is_empty()) rqv=rqv.substr(a5,a3-a5+1);
361 >            }
362 >          writeRead(f_out, seqid, seqinfo, rseq, rqv, outcounter);
363 >          }
364 >       } //for each fastq record
365 >    delete fq2;
366 >    FRCLOSE(f_in);
367 >    FRCLOSE(f_in2);
368 >    if (doCollapse) {
369 >       outcounter=0;
370 >       int maxdup_count=1;
371 >       char* maxdup_seq=NULL;
372 >       dhash.startIterate();
373 >       FqDupRec* qd=NULL;
374 >       char* seq=NULL;
375 >       while ((qd=dhash.NextData(seq))!=NULL) {
376 >         GStr rseq(seq);
377 >         //do the dusting here
378 >         if (doDust) {
379 >            int dustbases=dust(rseq);
380 >            if (dustbases>(rseq.length()>>1)) {
381 >               if (trashReport && qd->firstname!=NULL) {
382 >                 fprintf(freport, "%s_x%d\tD\n",qd->firstname, qd->count);
383                   }
384 <              else    
385 <                 dr->add(rqv);
386 <           } //collapsing duplicates
387 <         else { //not collapsing duplicates
388 <           //do the dust filter now
389 <           if (doDust) {
390 <             int dustbases=dust(rseq);
391 <             if (dustbases>(rseq.length()>>1)) {
392 <                if (trashReport) {
393 <                  fprintf(freport, "%s\tD\t%s\n",rname.chars(),rseq.chars());
394 <                  }
395 <                continue;
396 <                }
289 <             }
290 <           //print this record here  
291 <           outcounter++;
292 <           if (isfasta) {
293 <            if (prefix.is_empty())
294 <               fprintf(f_out, ">%s\n%s\n", rname.chars(), rseq.chars());
295 <              else
296 <               fprintf(f_out, ">%s%08d\n%s\n", prefix.chars(), outcounter,
297 <                                  rseq.chars());
384 >               trash_D+=qd->count;
385 >               continue;
386 >               }
387 >            }
388 >         outcounter++;
389 >         if (qd->count>maxdup_count) {
390 >            maxdup_count=qd->count;
391 >            maxdup_seq=seq;
392 >            }
393 >         if (isfasta) {
394 >           if (prefix.is_empty()) {
395 >             fprintf(f_out, ">%s_x%d\n%s\n", qd->firstname, qd->count,
396 >                           rseq.chars());
397               }
398 <           else {  //fastq
399 <            if (phred64) convertQ64(rqv);
400 <            if (prefix.is_empty())
302 <               fprintf(f_out, "@%s\n%s\n+\n%s\n", rname.chars(), rseq.chars(),rqv.chars());
303 <              else
304 <               fprintf(f_out, "@%s_%08d\n%s\n+\n%s\n", prefix.chars(), outcounter,
305 <                                  rseq.chars(),rqv.chars() );
398 >           else { //use custom read name
399 >             fprintf(f_out, ">%s%08d_x%d\n%s\n", prefix.chars(), outcounter,
400 >                        qd->count, rseq.chars());
401               }
402 <           } //not collapsing duplicates
403 <        } //for each fastq record
404 <   } //while each input file
405 < FRCLOSE(f_in);
406 < if (doCollapse) {
407 <    outcounter=0;
313 <    int maxdup_count=1;
314 <    char* maxdup_seq=NULL;
315 <    dhash.startIterate();
316 <    FqDupRec* qd=NULL;
317 <    char* seq=NULL;
318 <    while ((qd=dhash.NextData(seq))!=NULL) {
319 <      GStr rseq(seq);
320 <      //do the dusting here
321 <      if (doDust) {
322 <         int dustbases=dust(rseq);
323 <         if (dustbases>(rseq.length()>>1)) {
324 <            if (trashReport && qd->firstname!=NULL) {
325 <              fprintf(freport, "%s_x%d\tD\t%s\n",qd->firstname, qd->count,seq);
326 <              }
327 <            continue;
402 >           }
403 >         else { //fastq format
404 >          if (convert_phred) convertPhred(qd->qv, qd->len);
405 >          if (prefix.is_empty()) {
406 >            fprintf(f_out, "@%s_x%d\n%s\n+\n%s\n", qd->firstname, qd->count,
407 >                           rseq.chars(), qd->qv);
408              }
409 +          else { //use custom read name
410 +            fprintf(f_out, "@%s%08d_x%d\n%s\n+\n%s\n", prefix.chars(), outcounter,
411 +                        qd->count, rseq.chars(), qd->qv);
412 +            }
413 +           }
414 +         }//for each element of dhash
415 +       if (maxdup_count>1) {
416 +         GMessage("Maximum read multiplicity: x %d (read: %s)\n",maxdup_count, maxdup_seq);
417           }
418 <      outcounter++;
419 <      if (qd->count>maxdup_count) {
420 <         maxdup_count=qd->count;
421 <         maxdup_seq=seq;
422 <         }
423 <      if (isfasta) {
424 <        if (prefix.is_empty()) {
425 <          fprintf(f_out, ">%s_x%d\n%s\n", qd->firstname, qd->count,
426 <                        rseq.chars());
427 <          }
428 <        else { //use custom read name
429 <          fprintf(f_out, ">%s%08d_x%d\n%s\n", prefix.chars(), outcounter,
430 <                     qd->count, rseq.chars());
418 >       } //collapse entries
419 >    if (verbose) {
420 >       if (paired_reads) {
421 >           GMessage(">Input files : %s , %s\n", infname.chars(), infname2.chars());
422 >           GMessage("Number of input pairs :%9d\n", incounter);
423 >           GMessage("         Output pairs :%9d\n", outcounter);
424 >           }
425 >         else {
426 >           GMessage(">Input file : %s\n", infname.chars());
427 >           GMessage("Number of input reads :%9d\n", incounter);
428 >           GMessage("         Output reads :%9d\n", outcounter);
429 >           }
430 >       GMessage("------------------------------------\n");
431 >       if (num_trimmed5)
432 >          GMessage("           5' trimmed :%9d  (min. trim: %d)\n", num_trimmed5, min_trimmed5);
433 >       if (num_trimmed3)
434 >          GMessage("           3' trimmed :%9d  (min. trim: %d)\n", num_trimmed3, min_trimmed3);
435 >       GMessage("------------------------------------\n");
436 >       if (trash_s>0)
437 >         GMessage("     Trashed by length:%9d\n", trash_s);
438 >       if (trash_N>0)
439 >         GMessage("         Trashed by N%%:%9d\n", trash_N);
440 >       if (trash_Q>0)
441 >         GMessage("Trashed by low quality:%9d\n", trash_Q);
442 >       if (trash_A5>0)
443 >         GMessage(" Trashed by 5' adapter:%9d\n", trash_A5);
444 >       if (trash_A3>0)
445 >         GMessage(" Trashed by 3' adapter:%9d\n", trash_A3);
446 >       }
447 >    if (trashReport) {
448 >          FWCLOSE(freport);
449            }
450 <        }
451 <      else { //fastq format
452 <       if (phred64) convertQ64(qd->qv, qd->len);
347 <       if (prefix.is_empty()) {
348 <         fprintf(f_out, "@%s_x%d\n%s\n+\n%s\n", qd->firstname, qd->count,
349 <                        rseq.chars(), qd->qv);
350 <         }
351 <       else { //use custom read name
352 <         fprintf(f_out, "@%s%08d_x%d\n%s\n+\n%s\n", prefix.chars(), outcounter,
353 <                     qd->count, rseq.chars(), qd->qv);
354 <         }
355 <        }
356 <      }//for each element of dhash
357 <    if (maxdup_count>1) {
358 <      GMessage("Maximum read multiplicity: x %d (read: %s)\n",maxdup_count, maxdup_seq);
359 <      }
360 <   } //report collapsed dhash entries
361 < GMessage("Number of input reads: %9d\n", icounter);
362 < GMessage("       Output records: %9d\n", outcounter);
363 < if (trashReport) {
364 <    FWCLOSE(freport);
365 <    }
450 >    FWCLOSE(f_out);
451 >    FWCLOSE(f_out2);
452 >   } //while each input file
453  
454 < FWCLOSE(f_out);
454 > //getc(stdin);
455   }
456  
457   class NData {
# Line 393 | Line 480
480       end5=1;
481       end3=seqlen;
482       for (int i=0;i<seqlen;i++)
483 <        if (rseq[i]=='N') {// if (!ichrInStr(rseq[i], "ACGT")
483 >        if (seq[i]=='N') {// if (!ichrInStr(rseq[i], "ACGT")
484             NPos[NCount]=i;
485             NCount++;
486             }
# Line 416 | Line 503
503   void N_analyze(int l5, int l3, int p5, int p3) {
504   /* assumes feat was filled properly */
505   int old_dif, t5,t3,v;
506 < if (l3-l5<min_read_len || p5>p3 ) {
506 > if (l3<l5+2 || p5>p3 ) {
507     feat.end5=l5+1;
508     feat.end3=l3+1;
509     return;
# Line 448 | Line 535
535   }
536  
537  
538 < bool ntrim(GStr& rseq, GStr &rqv, int &l5, int &l3) {
539 < //count Ns in the sequence
538 > bool qtrim(GStr& qvs, int &l5, int &l3) {
539 > if (qvtrim_qmin==0 || qvs.is_empty()) return false;
540 > if (qv_phredtype==0) {
541 >  //try to guess the Phred type
542 >  int vmin=256, vmax=0;
543 >  for (int i=0;i<qvs.length();i++) {
544 >     if (vmin>qvs[i]) vmin=qvs[i];
545 >     if (vmax<qvs[i]) vmax=qvs[i];
546 >     }
547 >  if (vmin<64) { qv_phredtype=33; qv_cvtadd=31; }
548 >  if (vmax>95) { qv_phredtype=64; qv_cvtadd=-31; }
549 >  if (qv_phredtype==0) {
550 >    GError("Error: couldn't determine Phred type, please use the -p33 or -p64 !\n");
551 >    }
552 >  if (verbose)
553 >    GMessage("Input reads have Phred-%d quality values.\n", (qv_phredtype==33 ? 33 : 64));
554 >  } //guessing Phred type
555 > for (l3=qvs.length()-1;l3>2;l3--) {
556 >  if (qvs[l3]-qv_phredtype>=qvtrim_qmin && qvs[l3-1]-qv_phredtype>=qvtrim_qmin) break;
557 >  }
558 > //just in case, check also the 5' the end (?)
559 > for (l5=0;l5<qvs.length()-3;l5++) {
560 >  if (qvs[l5]-qv_phredtype>=qvtrim_qmin && qvs[l5+1]-qv_phredtype>=qvtrim_qmin) break;
561 >  }
562 > if (qvtrim_max>0) {
563 >  if (qvs.length()-1-l3>qvtrim_max) l3=qvs.length()-1-qvtrim_max;
564 >  if (l5>qvtrim_max) l5=qvtrim_max;
565 >  }
566 > return (l5>0 || l3<qvs.length()-1);
567 > }
568 >
569 > bool ntrim(GStr& rseq, int &l5, int &l3) {
570 > //count Ns in the sequence, trim N-rich ends
571   feat.init(rseq);
572   l5=feat.end5-1;
573   l3=feat.end3-1;
574   N_analyze(feat.end5-1, feat.end3-1, 0, feat.NCount-1);
575 < if (l5==feat.end5-1 && l3==feat.end3-1)
576 <    return false; //nothing changed
575 > if (l5==feat.end5-1 && l3==feat.end3-1) {
576 >    if (feat.perc_N>max_perc_N) {
577 >           feat.valid=false;
578 >           l3=l5+1;
579 >           return true;
580 >           }
581 >      else {
582 >       return false; //nothing changed
583 >       }
584 >    }
585   l5=feat.end5-1;
586   l3=feat.end3-1;
587   if (l3-l5+1<min_read_len) {
# Line 463 | Line 589
589     return true;
590     }
591   feat.N_calc();
592 < if (feat.perc_N>6.2) { //not valid if more than 1 N per 16 bases
592 >
593 > if (feat.perc_N>max_perc_N) {
594        feat.valid=false;
595        l3=l5+1;
596        return true;
# Line 589 | Line 716
716   return ncount;
717   }
718  
719 < // ------------------ adapter matching - triplet matching
720 < //look for matching triplets amongst the first 9 nucleotides of the 3' adaptor
721 < // or the last 9 nucleotides for the 5' adapter
722 < //when a triplet match is found, simply try to extend the alignment using a drop-off scheme
723 < // check minimum score and
724 < // for 3' adapter trimming:
725 < //     require that the right end of the alignment for either the adaptor OR the read must be
726 < //     < 3 distance from its right end
727 < // for 5' adapter trimming:
728 < //     require that the left end of the alignment for either the adaptor OR the read must
729 < //     be at coordinate 0
730 <
731 < bool extendMatch(const char* a, int alen, int ai,
732 <                 const char* b, int blen, int bi, int mlen, int& l5, int& l3, bool end5=false) {
733 < //so the alignment starts at ai in a, bi in b, with a perfect match of length mlen
734 < //if (debug)
735 < //  GStr dbg(b);
736 < //GMessage(">> in %s\n\textending hit: %s at position %d\n", a, (dbg.substr(ai, mlen)).chars(), ai);
737 < int a_l=ai; //alignment coordinates on a
738 < int a_r=ai+mlen-1;
739 < int b_l=bi; //alignment coordinates on b
740 < int b_r=bi+mlen-1;
741 < int ai_maxscore=ai;
742 < int bi_maxscore=bi;
743 < int score=mlen*a_m_score;
744 < int maxscore=score;
745 < int mism5score=a_mis_score;
746 < if (end5 && ai<(alen>>1)) mism5score-=2; // increase penalty for mismatches at 5' end
747 < //try to extend to the left first, if possible
748 < while (ai>0 && bi>0) {
749 <   ai--;
750 <   bi--;
751 <   score+= (a[ai]==b[bi])? a_m_score : mism5score;
752 <   if (score>maxscore) {
753 <       ai_maxscore=ai;
754 <       bi_maxscore=bi;
755 <       maxscore=score;
756 <       }
757 <     else if (maxscore-score>a_dropoff_score) break;
758 <   }
759 < a_l=ai_maxscore;
760 < b_l=bi_maxscore;
761 < //if (debug) GMessage("  after l-extend: %*s%s\t\t(score=%d)\n",a_l," ",dbg.substr(b_l,b_r-b_l+1).chars(),maxscore);
762 < //now extend to the right
763 < ai_maxscore=a_r;
764 < bi_maxscore=b_r;
765 < ai=a_r;
766 < bi=b_r;
767 < score=maxscore;
768 < //sometimes there are extra AAAAs at the end of the read, ignore those
769 < if (strcmp(&a[alen-4],"AAAA")==0) {
770 <    alen-=3;
771 <    while (a[alen-1]=='A' && alen>ai) alen--;
772 <    }
646 < while (ai<alen-1 && bi<blen-1) {
647 <   ai++;
648 <   bi++;
649 <   //score+= (a[ai]==b[bi])? a_m_score : a_mis_score;
650 <   if (a[ai]==b[bi]) { //match
651 <      score+=a_m_score;
652 <      if (ai>=alen-2) {
653 <           score+=a_m_score-(alen-ai-1);
654 <           }
719 > int get3mer_value(const char* s) {
720 > return (s[0]<<16)+(s[1]<<8)+s[2];
721 > }
722 >
723 > int w3_match(int qv, const char* str, int slen, int start_index=0) {
724 > if (start_index>=slen || start_index<0) return -1;
725 > for (int i=start_index;i<slen-3;i++) {
726 >   int rv=get3mer_value(str+i);
727 >   if (rv==qv) return i;
728 >   }
729 > return -1;
730 > }
731 >
732 > int w3_rmatch(int qv, const char* str, int slen, int end_index=-1) {
733 > if (end_index>=slen) return -1;
734 > if (end_index<0) end_index=slen-1;
735 > for (int i=end_index-2;i>=0;i--) {
736 >   int rv=get3mer_value(str+i);
737 >   if (rv==qv) return i;
738 >   }
739 > return -1;
740 > }
741 >
742 > struct SLocScore {
743 >  int pos;
744 >  int score;
745 >  SLocScore(int p=0,int s=0) {
746 >    pos=p;
747 >    score=s;
748 >    }
749 >  void set(int p, int s) {
750 >    pos=p;
751 >    score=s;
752 >    }
753 >  void add(int p, int add) {
754 >    pos=p;
755 >    score+=add;
756 >    }
757 > };
758 >
759 > bool trim_poly3(GStr &seq, int &l5, int &l3, const char* poly_seed) {
760 > int rlen=seq.length();
761 > l5=0;
762 > l3=rlen-1;
763 > int32 seedVal=*(int32*)poly_seed;
764 > char polyChar=poly_seed[0];
765 > //assumes N trimming was already done
766 > //so a poly match should be very close to the end of the read
767 > // -- find the initial match (seed)
768 > int lmin=GMAX((rlen-12), 0);
769 > int li;
770 > for (li=rlen-4;li>lmin;li--) {
771 >   if (seedVal==*(int*)&(seq[li])) {
772 >      break;
773        }
774 <    else { //mismatch
775 <      score+=a_mis_score;
776 <      }  
777 <   if (score>maxscore) {
778 <       ai_maxscore=ai;
779 <       bi_maxscore=bi;
780 <       maxscore=score;
781 <       }
782 <     else if (maxscore-score>a_dropoff_score) break;
783 <   }
784 <  a_r=ai_maxscore;
785 <  b_r=bi_maxscore;
786 <  int a_ovh3=alen-a_r-1;
787 <  int b_ovh3=blen-b_r-1;
788 <  int mmovh3=(a_ovh3<b_ovh3)? a_ovh3 : b_ovh3;
789 <  int mmovh5=(a_l<b_l)? a_l : b_l;
790 <  //if (debug) GMessage("  after r-extend: %*s%s\t\t(score=%d)\n",a_l," ",dbg.substr(b_l,b_r-b_l+1).chars(),maxscore);
791 <  if (maxscore>=a_min_score && mmovh3<2 && mmovh5<2) {
792 <     if (a_l<a_ovh3) {
675 <        //adapter closer to the left end (typical for 5' adapter)
676 <        l5=a_r+1;
677 <        l3=alen-1;
774 >   }
775 > if (li<=lmin) return false;
776 > //seed found, try to extend it both ways
777 > //extend right
778 > int ri=li+3;
779 > SLocScore loc(ri, poly_m_score<<2);
780 > SLocScore maxloc(loc);
781 > //extend right
782 > while (ri<rlen-2) {
783 >   ri++;
784 >   if (seq[ri]==polyChar) {
785 >                loc.add(ri,poly_m_score);
786 >                }
787 >   else if (seq[ri]=='N') {
788 >                loc.add(ri,0);
789 >                }
790 >   else { //mismatch
791 >        loc.add(ri,poly_mis_score);
792 >        if (maxloc.score-loc.score>poly_dropoff_score) break;
793          }
794 <      else {
795 <        //adapter matching at the right end (typical for 3' adapter)
796 <        l5=0;
797 <        l3=a_l-1;
794 >   if (maxloc.score<=loc.score) {
795 >      maxloc=loc;
796 >      }
797 >   }
798 > ri=maxloc.pos;
799 > if (ri<rlen-3) return false; //no trimming wanted, too far from 3' end
800 > //ri = right boundary for the poly match
801 > //extend left
802 > loc.set(li, maxloc.score);
803 > maxloc.pos=li;
804 > while (li>0) {
805 >    li--;
806 >    if (seq[li]==polyChar) {
807 >                 loc.add(li,poly_m_score);
808 >                 }
809 >    else if (seq[li]=='N') {
810 >                 loc.add(li,0);
811 >                 }
812 >    else { //mismatch
813 >         loc.add(li,poly_mis_score);
814 >         if (maxloc.score-loc.score>poly_dropoff_score) break;
815 >         }
816 >    if (maxloc.score<=loc.score) {
817 >       maxloc=loc;
818 >       }
819 >    }
820 > if (maxloc.score>poly_minScore && ri>=rlen-3) {
821 >    l5=li;
822 >    l3=ri;
823 >    return true;
824 >    }
825 > return false;
826 > }
827 >
828 >
829 > bool trim_poly5(GStr &seq, int &l5, int &l3, const char* poly_seed) {
830 > int rlen=seq.length();
831 > l5=0;
832 > l3=rlen-1;
833 > int32 seedVal=*(int32*)poly_seed;
834 > char polyChar=poly_seed[0];
835 > //assumes N trimming was already done
836 > //so a poly match should be very close to the end of the read
837 > // -- find the initial match (seed)
838 > int lmax=GMIN(8, rlen-4);//how far from 5' end to look for 4-mer seeds
839 > int li;
840 > for (li=0;li<=lmax;li++) {
841 >   if (seedVal==*(int*)&(seq[li])) {
842 >      break;
843 >      }
844 >   }
845 > if (li>lmax) return false;
846 > //seed found, try to extend it both ways
847 > //extend left
848 > int ri=li+3; //save rightmost base of the seed
849 > SLocScore loc(li, poly_m_score<<2);
850 > SLocScore maxloc(loc);
851 > while (li>0) {
852 >    li--;
853 >    if (seq[li]==polyChar) {
854 >                 loc.add(li,poly_m_score);
855 >                 }
856 >    else if (seq[li]=='N') {
857 >                 loc.add(li,0);
858 >                 }
859 >    else { //mismatch
860 >         loc.add(li,poly_mis_score);
861 >         if (maxloc.score-loc.score>poly_dropoff_score) break;
862 >         }
863 >    if (maxloc.score<=loc.score) {
864 >       maxloc=loc;
865 >       }
866 >    }
867 > li=maxloc.pos;
868 > if (li>3) return false; //no trimming wanted, too far from 5' end
869 > //li = right boundary for the poly match
870 >
871 > //extend right
872 > loc.set(ri, maxloc.score);
873 > maxloc.pos=ri;
874 > while (ri<rlen-2) {
875 >   ri++;
876 >   if (seq[ri]==polyChar) {
877 >                loc.add(ri,poly_m_score);
878 >                }
879 >   else if (seq[ri]=='N') {
880 >                loc.add(ri,0);
881 >                }
882 >   else { //mismatch
883 >        loc.add(ri,poly_mis_score);
884 >        if (maxloc.score-loc.score>poly_dropoff_score) break;
885          }
886 <     return true;
887 <     }
888 <  //do not trim:
889 <  l5=0;
890 <  l3=alen-1;
891 <  return false;
892 < }
886 >   if (maxloc.score<=loc.score) {
887 >      maxloc=loc;
888 >      }
889 >   }
890 >
891 > if (maxloc.score>poly_minScore && li<=3) {
892 >    l5=li;
893 >    l3=ri;
894 >    return true;
895 >    }
896 > return false;
897 > }
898  
899   bool trim_adapter3(GStr& seq, int&l5, int &l3) {
900   int rlen=seq.length();
# Line 702 | Line 909
909        }
910      else {
911        l5=0;
912 <      l3=fi-1;
912 >      l3=fi-1;
913        }
914     return true;
915     }
916 + #ifdef DEBUG
917 + if (debug) GMessage(">TRIM3 >>   Read: %s\n",seq.chars());
918 + #endif
919 +
920   //also, for fast detection of other adapter-only reads that start past
921   // the beginning of the adapter sequence, try to see if the first a3len-4
922 < // characters of the read are a substring of the adapter
923 < GStr rstart=seq.substr(0,a3len-4);
924 < if ((fi=adapter3.index(rstart))>=0) {
925 <   l3=rlen-1;
926 <   l5=a3len-4;
927 <   while (fi+l5<a3len && l5<l3 && adapter3[fi+l5]==seq[l5]) l5++;
928 <   return true;
922 > // bases of the read are a substring of the adapter
923 > if (rlen>a3len-3) {
924 >   GStr rstart=seq.substr(1,a3len-4);
925 >   if ((fi=adapter3.index(rstart))>=0) {
926 >     l3=rlen-1;
927 >     l5=a3len-4;
928 >     while (fi+l5<a3len && l5<l3 && adapter3[fi+l5]==seq[l5]) l5++;
929 >     return true;
930 >     }
931 >  }
932 > CSegChain a3segs; //no chains here, just an ordered collection of segment pairs
933 >  //check the easy cases - 11 bases exact match at the end
934 > int fdlen=11;
935 >  if (a3len<16) {
936 >   fdlen=a3len>>1;
937     }
938 <  
939 < //another easy case: first half of the adaptor matches
940 < int a3hlen=a3len>>1;
941 < GStr ahalf=adapter3.substr(0, a3hlen);
942 < if ((fi=seq.index(ahalf))>=0) {
943 <   extendMatch(seq.chars(), rlen, fi,
944 <                 adapter3.chars(), a3len, 0,  a3hlen, l5,l3);
945 <   return true;
938 > if (fdlen>4) {
939 >     //check if we're lucky enough to have the last 11 bases of the read a part of the adapter
940 >     GStr rstart=seq.substr(-fdlen-3,fdlen);
941 >     if ((fi=adapter3.index(rstart))>=0) {
942 > #ifdef DEBUG
943 >       if (debug) GMessage("  W11match found: %*s\n", rlen-3, (adapter3.substr(fi,fdlen)).chars());
944 > #endif
945 >       if (extendMatch(seq.chars(), rlen, rlen-fdlen-3,
946 >                     adapter3.chars(), a3len, fi,  fdlen, l5,l3, a3segs))
947 >            return true;
948 >       }
949 >     //another easy case: first 11 characters of the adaptor found as a substring of the read
950 >     GStr bstr=adapter3.substr(0, fdlen);
951 >     if ((fi=seq.rindex(bstr))>=0) {
952 > #ifdef DEBUG
953 >       if (debug) GMessage("  A11match found: %*s\n", fi+fdlen, bstr.chars());
954 > #endif
955 >       if (extendMatch(seq.chars(), rlen, fi,
956 >                     adapter3.chars(), a3len, 0,  fdlen, l5,l3, a3segs))
957 >            return true;
958 >       }
959 >     } //tried to match 11 bases first
960 >    
961 > //no easy cases, so let's do the wmer hashing for the first 12 bases of the adaptor
962 > //-- only extend if the match is in the 3' (ending) region of the read
963 > int wordSize=3;
964 > int hlen=12;
965 > if (hlen>a3len-wordSize) hlen=a3len-wordSize;
966 > int imin=rlen>>1; //last half of the read, left boundary for the wmer match
967 > if (imin<a3len) { imin=GMIN(a3len, rlen-wordSize); }
968 > imin=rlen-imin;
969 > for (int iw=0;iw<hlen;iw++) {
970 >   //int32* qv=(int32*)(adapter3.chars()+iw);
971 >   int qv=get3mer_value(adapter3.chars()+iw);
972 >   fi=-1;
973 >   //while ((fi=fast4rmatch(*qv, seq.chars(), rlen, fi))>=0 && fi>=imin) {
974 >   while ((fi=w3_rmatch(qv, seq.chars(), rlen, fi))>=0 && fi>=imin) {
975 >     //GMessage(" ... fi=%d after w3_rmatch() (imin=%d)\n", fi, imin);
976 >
977 > #ifdef DEBUG
978 >     if (debug) GMessage("    Wmatch found: %*s\n", fi+wordSize, (adapter3.substr(iw,wordSize)).chars());
979 > #endif
980 >     if (extendMatch(seq.chars(), rlen, fi, adapter3.chars(),
981 >                   a3len, iw, wordSize, l5,l3, a3segs)) return true;
982 >     fi--;
983 >     if (fi<imin) break;
984 >     }
985 >   } //for each wmer in the first hlen bases of the adaptor
986 > /*
987 > //couldn't find a good trimming extension, hash 12 more bases of the adapter to collect more segment pairs there
988 > //but only do this if we already have segment pairs collected in the last 12 bases of the adapter
989 > if (a3segs.bstart>3 || a3segs.bend<(uint)(hlen-wordSize)) return false;
990 > int hlen2=a3len-wordSize;
991 > //if (hlen2>a3len-4) hlen2=a3len-4;
992 > if (hlen2>hlen) {
993 > #ifdef DEBUG
994 >     if (debug && a3segs.Count()>0) {
995 >        GMessage("  >>>>>2nd. hash: %s\n",seq.chars());
996 >        }
997 > #endif
998 >     for (int iw=hlen;iw<hlen2;iw++) {
999 >         //int* qv=(int32 *)(adapter3.chars()+iw);
1000 >         int qv=get3mer_value(adapter3.chars()+iw);
1001 >         fi=-1;
1002 >         //while ((fi=fast4rmatch(*qv, seq.chars(), rlen, fi))>=0 && fi>=imin) {
1003 >         while ((fi=w3_rmatch(qv, seq.chars(), rlen, fi))>=0 && fi>=imin) {
1004 >           extendMatch(seq.chars(), rlen, fi, adapter3.chars(),
1005 >                         a3len, iw, wordSize, l5,l3, a3segs);
1006 >           fi--;
1007 >           if (fi<imin) break;
1008 >           }
1009 >         } //for each wmer between hlen2 and hlen bases of the adaptor
1010 >     }
1011 > //lastly, analyze collected a3segs for a possible gapped alignment:
1012 > GList<CSegChain> segchains(false,true,false);
1013 > #ifdef DEBUG
1014 > if (debug && a3segs.Count()>0) {
1015 >   GMessage(">>>>>>>>>   Read: %s\n",seq.chars());
1016     }
1017 < //no easy cases, so let's do the word hashing
1018 < for (int iw=0;iw<6;iw++) {
1019 <   GStr aword=adapter3.substr(iw,3);
1020 <   if ((fi=seq.index(aword))>=0  && rlen-fi>3) {
1021 <      if (extendMatch(seq.chars(), rlen, fi, adapter3.chars(),
1022 <                   a3len, iw, 3, l5,l3)) return true;
1017 > #endif
1018 > for (int i=0;i<a3segs.Count();i++) {
1019 >   if (a3segs[i]->chain==NULL) {
1020 >       if (a3segs[i]->b.start>3) continue; //don't start a hopeless chain
1021 >       CSegChain* newchain=new CSegChain();
1022 >       newchain->setFreeItem(false);
1023 >       newchain->addSegPair(a3segs[i]);
1024 >       a3segs[i]->chain=newchain;
1025 >       segchains.Add(newchain); //just to free them when done
1026 >       }
1027 >   for (int j=i+1;j<a3segs.Count();j++) {
1028 >      CSegChain* chain=a3segs[i]->chain;
1029 >      if (chain->extendChain(a3segs[j])) {
1030 >          a3segs[j]->chain=chain;
1031 > #ifdef DEBUG
1032 >          if (debug) dbgPrintChain(*chain, adapter3.chars());
1033 > #endif      
1034 >          //save time by checking here if the extended chain is already acceptable for trimming
1035 >          if (chain->aend>(uint)(rlen-4) && chain->bstart<4 && chain->score>a_min_chain_score) {
1036 >            l5=0;
1037 >            l3=chain->astart-2;
1038 > #ifdef DEBUG
1039 >          if (debug && a3segs.Count()>0) {
1040 >            GMessage(">>> >> trimmed-3: %*s\n",l3-l5+1,seq.substr(l5,l3-l5+1).chars());
1041 >            }
1042 > #endif
1043 >            return true;
1044 >            }
1045 >          } //chain can be extended
1046        }
1047 <   }
1047 >   } //collect segment alignments into chains
1048 > */  
1049   return false; //no adapter parts found
1050   }
1051  
# Line 743 | Line 1056
1056   l3=rlen-1;
1057   //try to see if adapter is fully included in the read
1058   int fi=-1;
1059 < if ((fi=seq.index(adapter5))>=0) {
1060 <   if (fi<rlen-fi-a5len) {//match is closer to the right end
1061 <      l5=fi+a5len;
1062 <      l3=rlen-1;
1059 > for (int ai=0;ai<adapters.Count();ai++) {
1060 >  if (adapters[ai]->a5.is_empty()) continue;
1061 >  int a5len=adapters[ai]->a5.length();
1062 >  GStr& adapter5=adapters[ai]->a5;
1063 >  if ((fi=seq.index(adapter5))>=0) {
1064 >    if (fi<rlen-fi-a5len) {//match is closer to the right end
1065 >       l5=fi+a5len;
1066 >       l3=rlen-1;
1067 >       }
1068 >     else {
1069 >       l5=0;
1070 >       l3=fi-1;
1071 >       }
1072 >    return true;
1073 >    }
1074 > #ifdef DEBUG
1075 >  if (debug) GMessage(">TRIM5 >>   Read: %s\n",seq.chars());
1076 > #endif
1077 >
1078 >  //try the easy way out first - look for an exact match of 11 bases
1079 >  int fdlen=11;
1080 >   if (a5len<16) {
1081 >    fdlen=a5len>>1;
1082 >    }
1083 >  if (fdlen>4) {
1084 >      GStr rstart=seq.substr(1,fdlen); //skip the first base as it's sometimes bogus
1085 >      if ((fi=adapter5.index(rstart))>=0) {
1086 > #ifdef DEBUG
1087 >        if (debug) GMessage("  W11match found: %*s\n", 1+fdlen, (adapter3.substr(fi,fdlen)).chars());
1088 > #endif
1089 >        if (extendMatch(seq.chars(), rlen, 1,
1090 >                      adapter5.chars(), a5len, fi,  fdlen, l5,l3, a5segs, true))
1091 >            return true;
1092 >        }
1093 >      //another easy case: last 11 characters of the adaptor found as a substring of the read
1094 >      GStr bstr=adapter5.substr(-fdlen);
1095 >      if ((fi=seq.index(bstr))>=0) {
1096 > #ifdef DEBUG
1097 >        if (debug) GMessage("  A11match found: %*s\n", fi+fdlen, bstr.chars());
1098 > #endif
1099 >        if (extendMatch(seq.chars(), rlen, fi,
1100 >                      adapter5.chars(), a5len, a5len-fdlen,  fdlen, l5,l3,a5segs,true))
1101 >           return true;
1102 >        }
1103 >      } //tried to matching at most 11 bases first
1104 >
1105 >  //-- no easy cases, do the wmer hashing for the last 12 bases of the adaptor
1106 >  //-- only extend a wmer if it matches in the 5' (beginning) region of the read
1107 >  int wordSize=3;
1108 >  int hlen=12;
1109 >  if (hlen>a5len-wordSize) hlen=a5len-wordSize;
1110 >  int imax=rlen>>1; //first half of the read, right boundary for the wmer match
1111 >  if (imax<a5len) { imax=GMIN(a5len, rlen-wordSize); }
1112 >  for (int iw=0;iw<=hlen;iw++) {
1113 >    int apstart=a5len-iw-wordSize;
1114 >    fi=0;
1115 >    //int* qv=(int32 *)(adapter5.chars()+apstart);
1116 >    int qv=get3mer_value(adapter5.chars()+apstart);
1117 >    //while ((fi=fast4match(*qv, seq.chars(), rlen, fi))>=0 && fi<=imax) {
1118 >    while ((fi=w3_match(qv, seq.chars(), rlen, fi))>=0 && fi<=imax) {
1119 > #ifdef DEBUG
1120 >      if (debug) GMessage("    Wmatch found: %*s\n", fi+wordSize, (adapter5.substr(apstart,wordSize)).chars());
1121 > #endif
1122 >      if (extendMatch(seq.chars(), rlen, fi, adapter5.chars(),
1123 >                 a5len, apstart, wordSize, l5,l3, a5segs, true)) return true;
1124 >      fi++;
1125 >      if (fi>imax) break;
1126        }
1127 <    else {
1128 <      l5=0;
1129 <      l3=fi-1;
1127 >    } //for each wmer in the last hlen bases of the adaptor
1128 > //if we're here we couldn't find a good extension
1129 >  return false; //no adapter parts found
1130 > }//for each 5' adapter
1131 > }
1132 >
1133 > //convert qvs to/from phred64 from/to phread33
1134 > void convertPhred(GStr& q) {
1135 > for (int i=0;i<q.length();i++) q[i]+=qv_cvtadd;
1136 > }
1137 >
1138 > void convertPhred(char* q, int len) {
1139 > for (int i=0;i<len;i++) q[i]+=qv_cvtadd;
1140 > }
1141 >
1142 > bool getFastxRec(GLineReader& fq, GStr& rseq, GStr& rqv,
1143 >          GStr& rname, GStr& rinfo, GStr& infname) {
1144 > rseq="";
1145 > rqv="";
1146 > rname="";
1147 > rinfo="";
1148 > if (fq.eof()) return false;
1149 > char* l=fq.getLine();
1150 > while (l!=NULL && (l[0]==0 || isspace(l[0]))) l=fq.getLine(); //ignore empty lines
1151 > if (l==NULL) return false;
1152 > /* if (rawFormat) {
1153 >      //TODO: implement raw qseq parsing here
1154 >      //if (raw type=N) then continue; //skip invalid/bad records
1155 >      } //raw qseq format
1156 > else { // FASTQ or FASTA */
1157 > isfasta=(l[0]=='>');
1158 > if (!isfasta && l[0]!='@') GError("Error: fasta/fastq record marker not found(%s)\n%s\n",
1159 >      infname.chars(), l);
1160 > GStr s(l);
1161 > rname=&(l[1]);
1162 > for (int i=0;i<rname.length();i++)
1163 >    if (rname[i]<=' ') {
1164 >       if (i<rname.length()-2) rinfo=rname.substr(i+1);
1165 >       rname.cut(i);
1166 >       break;
1167 >       }
1168 >  //now get the sequence
1169 > if ((l=fq.getLine())==NULL)
1170 >      GError("Error: unexpected EOF after header for read %s (%s)\n",
1171 >                   rname.chars(), infname.chars());
1172 > rseq=l; //this must be the DNA line
1173 > while ((l=fq.getLine())!=NULL) {
1174 >      //seq can span multiple lines
1175 >      if (l[0]=='>' || l[0]=='+') {
1176 >           fq.pushBack();
1177 >           break; //
1178 >           }
1179 >      rseq+=l;
1180 >      } //check for multi-line seq
1181 > if (!isfasta) { //reading fastq quality values, which can also be multi-line
1182 >    if ((l=fq.getLine())==NULL)
1183 >        GError("Error: unexpected EOF after sequence for %s\n", rname.chars());
1184 >    if (l[0]!='+') GError("Error: fastq qv header marker not detected!\n");
1185 >    if ((l=fq.getLine())==NULL)
1186 >        GError("Error: unexpected EOF after qv header for %s\n", rname.chars());
1187 >    rqv=l;
1188 >    //if (rqv.length()!=rseq.length())
1189 >    //  GError("Error: qv len != seq len for %s\n", rname.chars());
1190 >    while (rqv.length()<rseq.length() && ((l=fq.getLine())!=NULL)) {
1191 >      rqv+=l; //append to qv string
1192        }
1193 <   return true;
1193 >    }// fastq
1194 > // } //<-- FASTA or FASTQ
1195 > rseq.upper();
1196 > return true;
1197 > }
1198 >
1199 > char process_read(GStr& rname, GStr& rseq, GStr& rqv, int &l5, int &l3) {
1200 > //returns 0 if the read was untouched, 1 if it was just trimmed
1201 > // and a trash code if it was trashed
1202 > l5=0;
1203 > l3=rseq.length()-1;
1204 > if (l3-l5+1<min_read_len) {
1205 >   return 's';
1206     }
1207 < //for fast detection of adapter-rich reads, check if the first 12
1208 < //characters of the read are a substring of the adapter
1209 < GStr rstart=seq.substr(1,12);
1210 < if ((fi=adapter5.index(rstart))>=0) {
1211 <   //l3=rlen-1;
1212 <   //l5=a5len-4;
1213 <   //while (fi+l5<a5len && l5<l3 && adapter5[fi+l5]==seq[l5]) l5++;
1214 <   //return true;
1215 <   //if (debug) GMessage(" first 12char of the read match adaptor!\n");
1216 <   extendMatch(seq.chars(), rlen, 1,
1217 <                 adapter5.chars(), a5len, fi,  12, l5,l3, true);
1218 <   return true;
1207 > GStr wseq(rseq.chars());
1208 > GStr wqv(rqv.chars());
1209 > int w5=l5;
1210 > int w3=l3;
1211 > //first do the q-based trimming
1212 > if (qvtrim_qmin!=0 && !wqv.is_empty() && qtrim(wqv, w5, w3)) { // qv-threshold trimming
1213 >   if (w3-w5+1<min_read_len) {
1214 >     return 'Q'; //invalid read
1215 >     }
1216 >    //-- keep only the w5..w3 range
1217 >   l5=w5;
1218 >   l3=w3;
1219 >   wseq=wseq.substr(w5, w3-w5+1);
1220 >   if (!wqv.is_empty())
1221 >      wqv=wqv.substr(w5, w3-w5+1);
1222 >   } //qv trimming
1223 > // N-trimming on the remaining read seq
1224 > if (ntrim(wseq, w5, w3)) {
1225 >   //GMessage("before: %s\n",wseq.chars());
1226 >   //GMessage("after : %s (%d)\n",wseq.substr(w5,w3-w5+1).chars(),w3-w5+1);
1227 >   l5+=w5;
1228 >   l3-=(wseq.length()-1-w3);
1229 >   if (w3-w5+1<min_read_len) {
1230 >     return 'N'; //to be trashed
1231 >     }
1232 >    //-- keep only the w5..w3 range
1233 >   wseq=wseq.substr(w5, w3-w5+1);
1234 >   if (!wqv.is_empty())
1235 >      wqv=wqv.substr(w5, w3-w5+1);
1236 >   w5=0;
1237 >   w3=wseq.length()-1;
1238     }
1239 <  
1240 < //another easy case: last 12 characters of the adaptor found as a substring of the read
1241 < int aplen=12;
1242 < int apstart=a5len-aplen-2;
1243 < if (apstart<0) { apstart=0; aplen=a5len-1; }
1244 < GStr bstr=adapter5.substr(apstart, aplen);
1245 < if ((fi=seq.index(bstr))>=0) {
1246 <   //if (debug) GMessage(" last 12char of adaptor match the read!\n");
1247 <   extendMatch(seq.chars(), rlen, fi,
1248 <                 adapter5.chars(), a5len, apstart,  aplen, l5,l3,true);
1249 <   return true;
1239 > if (a3len>0) {
1240 >  if (trim_adapter3(wseq, w5, w3)) {
1241 >     int trimlen=wseq.length()-(w3-w5+1);
1242 >     num_trimmed3++;
1243 >     if (trimlen<min_trimmed3)
1244 >         min_trimmed3=trimlen;
1245 >     l5+=w5;
1246 >     l3-=(wseq.length()-1-w3);
1247 >     if (w3-w5+1<min_read_len) {
1248 >         return '3';
1249 >         }
1250 >      //-- keep only the w5..w3 range
1251 >      wseq=wseq.substr(w5, w3-w5+1);
1252 >      if (!wqv.is_empty())
1253 >         wqv=wqv.substr(w5, w3-w5+1);
1254 >      }//some adapter was trimmed
1255 >   } //adapter trimming
1256 > if (a5len>0) {
1257 >  if (trim_adapter5(wseq, w5, w3)) {
1258 >     int trimlen=wseq.length()-(w3-w5+1);
1259 >     num_trimmed5++;
1260 >     if (trimlen<min_trimmed5)
1261 >         min_trimmed5=trimlen;
1262 >     l5+=w5;
1263 >     l3-=(wseq.length()-1-w3);
1264 >     if (w3-w5+1<min_read_len) {
1265 >         return '5';
1266 >         }
1267 >      //-- keep only the w5..w3 range
1268 >      wseq=wseq.substr(w5, w3-w5+1);
1269 >      if (!wqv.is_empty())
1270 >         wqv=wqv.substr(w5, w3-w5+1);
1271 >      }//some adapter was trimmed
1272 >   } //adapter trimming
1273 > if (doCollapse) {
1274 >   //keep read for later
1275 >   FqDupRec* dr=dhash.Find(wseq.chars());
1276 >   if (dr==NULL) { //new entry
1277 >          //if (prefix.is_empty())
1278 >             dhash.Add(wseq.chars(),
1279 >                  new FqDupRec(&wqv, rname.chars()));
1280 >          //else dhash.Add(wseq.chars(), new FqDupRec(wqv.chars(),wqv.length()));
1281 >         }
1282 >      else
1283 >         dr->add(wqv);
1284 >   } //collapsing duplicates
1285 > else { //not collapsing duplicates
1286 >   //apply the dust filter now
1287 >   if (doDust) {
1288 >     int dustbases=dust(wseq);
1289 >     if (dustbases>(wseq.length()>>1)) {
1290 >        return 'D';
1291 >        }
1292 >     }
1293 >   } //not collapsing duplicates
1294 > return (l5>0 || l3<rseq.length()-1) ? 1 : 0;
1295 > }
1296 >
1297 > void printHeader(FILE* f_out, char recmarker, GStr& rname, GStr& rinfo) {
1298 > //GMessage("printing Header..%c%s\n",recmarker, rname.chars());
1299 > if (rinfo.is_empty()) fprintf(f_out, "%c%s\n",recmarker,rname.chars());
1300 >  else fprintf(f_out, "%c%s %s\n",recmarker, rname.chars(), rinfo.chars());
1301 > }
1302 >
1303 > void writeRead(FILE* f_out, GStr& rname, GStr& rinfo, GStr& rseq, GStr& rqv, int& outcounter) {
1304 >   outcounter++;
1305 >   bool asFasta=(rqv.is_empty() || fastaOutput);
1306 >   if (asFasta) {
1307 >    if (prefix.is_empty()) {
1308 >       printHeader(f_out, '>',rname,rinfo);
1309 >       fprintf(f_out, "%s\n", rseq.chars()); //plain one-line fasta for now
1310 >       }
1311 >      else {
1312 >       fprintf(f_out, ">%s%08d\n%s\n", prefix.chars(), outcounter,
1313 >                          rseq.chars());
1314 >       }
1315 >     }
1316 >   else {  //fastq
1317 >    if (convert_phred) convertPhred(rqv);
1318 >    if (prefix.is_empty()) {
1319 >       printHeader(f_out, '@', rname, rinfo);
1320 >       fprintf(f_out, "%s\n+\n%s\n", rseq.chars(), rqv.chars());
1321 >       }
1322 >      else
1323 >       fprintf(f_out, "@%s_%08d\n%s\n+\n%s\n", prefix.chars(), outcounter,
1324 >                          rseq.chars(),rqv.chars() );
1325 >     }
1326 > }
1327 >
1328 > void trash_report(char trashcode, GStr& rname, FILE* freport) {
1329 > if (freport==NULL || trashcode<=' ') return;
1330 > if (trashcode=='3' || trashcode=='5') {
1331 >   fprintf(freport, "%s\tA%c\n",rname.chars(),trashcode);
1332     }
1333 < //no easy cases, find a triplet match as a seed for alignment extension
1334 < //find triplets at the right end of the adapter
784 < for (int iw=0;iw<6;iw++) {
785 <   apstart=a5len-iw-4;
786 <   GStr aword=adapter5.substr(apstart,3);
787 <   if ((fi=seq.index(aword))>=0) {
788 <      //if (debug) GMessage("extending wmatch :%*s\n", fi+3, aword.chars());
789 <      if (extendMatch(seq.chars(), rlen, fi, adapter5.chars(),
790 <                   a5len, apstart, 3, l5,l3,true)) {
791 <                     return true;
792 <                     }
793 <      }
1333 > else {
1334 >   fprintf(freport, "%s\t%c\n",rname.chars(),trashcode);
1335     }
1336 < return false; //no adapter parts found
1336 > //tcounter++;
1337 > }
1338 >
1339 > GStr getFext(GStr& s, int* xpos=NULL) {
1340 > //s must be a filename without a path
1341 > GStr r("");
1342 > if (xpos!=NULL) *xpos=0;
1343 > if (s.is_empty() || s=="-") return r;
1344 > int p=s.rindex('.');
1345 > int d=s.rindex('/');
1346 > if (p<=0 || p>s.length()-2 || p<s.length()-7 || p<d) return r;
1347 > r=s.substr(p+1);
1348 > if (xpos!=NULL) *xpos=p+1;
1349 > r.lower();
1350 > return r;
1351   }
1352  
1353 < //conversion of phred64 to phread33
1354 < void convertQ64(GStr& q) {
1355 < for (int i=0;i<q.length();i++) q[i]-=31;
1353 > void baseFileName(GStr& fname) {
1354 > //remove all known extensions, like .txt,fq,fastq,fasta,fa)(.gz .gzip .bz2 .bzip2) .
1355 > int xpos=0;
1356 > GStr fext=getFext(fname, &xpos);
1357 > if (xpos<=1) return;
1358 > bool extdel=false;
1359 > GStr f2;
1360 > if (fext=="z" || fext=="zip") {
1361 >   extdel=true;
1362 >   }
1363 >  else if (fext.length()>=2) {
1364 >   f2=fext.substr(0,2);
1365 >   extdel=(f2=="gz" || f2=="bz");
1366 >   }
1367 > if (extdel) {
1368 >   fname.cut(xpos-1);
1369 >   fext=getFext(fname, &xpos);
1370 >   if (xpos<=1) return;
1371 >   }
1372 > extdel=false;
1373 > if (fext=="f" || fext=="fq" || fext=="txt" || fext=="seq" || fext=="sequence") {
1374 >   extdel=true;
1375 >   }
1376 >  else if (fext.length()>=2) {
1377 >   extdel=(fext.substr(0,2)=="fa");
1378 >   }
1379 > if (extdel) fname.cut(xpos-1);
1380 > GStr fncp(fname);
1381 > fncp.lower();
1382 > fncp.chomp("seq");
1383 > fncp.chomp("sequence");
1384 > fncp.trimR("_.");
1385 > if (fncp.length()<fname.length()) fname.cut(fncp.length());
1386 > }
1387 >
1388 > FILE* prepOutFile(GStr& infname, GStr& pocmd) {
1389 >  FILE* f_out=NULL;
1390 >  GStr fname(getFileName(infname.chars()));
1391 >  //eliminate known extensions
1392 >  baseFileName(fname);
1393 >  if (outsuffix.is_empty() || outsuffix=="-") { return stdout; }
1394 >    else if (pocmd.is_empty()) {
1395 >               GStr oname(fname);
1396 >               oname.append('.');
1397 >               oname.append(outsuffix);
1398 >               f_out=fopen(oname.chars(),"w");
1399 >               if (f_out==NULL) GError("Error: cannot create '%s'\n",oname.chars());
1400 >               }
1401 >            else {
1402 >              GStr oname(">");
1403 >              oname.append(fname);
1404 >              oname.append('.');
1405 >              oname.append(outsuffix);
1406 >              pocmd.append(oname);
1407 >              f_out=popen(pocmd.chars(), "w");
1408 >              if (f_out==NULL) GError("Error: cannot popen '%s'\n",pocmd.chars());
1409 >              }
1410 > return f_out;
1411   }
1412  
1413 < void convertQ64(char* q, int len) {
1414 < for (int i=0;i<len;i++) q[i]-=31;
1413 > void guess_unzip(GStr& fname, GStr& picmd) {
1414 > GStr fext=getFext(fname);
1415 > if (fext=="gz" || fext=="gzip" || fext=="z") {
1416 >    picmd="gzip -cd ";
1417 >    }
1418 >   else if (fext=="bz2" || fext=="bzip2" || fext=="bz" || fext=="bzip") {
1419 >    picmd="bzip2 -cd ";
1420 >    }
1421 > }
1422 >
1423 >
1424 > int loadAdapters(const char* fname) {
1425 >  GLineReader lr(fname);
1426 >  char* l;
1427 >  while ((l=lr.nextLine())!=NULL) {
1428 >   if (lr.length()<=3 || l[0]=='#') continue;
1429 >   if ( l[0]==' ' || l[0]=='\t' || l[0]==',' ||
1430 >        l[0]==';'|| l[0]==':' ) {
1431 >      int i=1;
1432 >      while (l[i]!=0 && isspace(l[i])) {
1433 >        i++;
1434 >        }
1435 >      if (l[i]!=0) {
1436 >        adapters.Add(new CAdapters(NULL, &(l[i])));
1437 >        continue;
1438 >        }
1439 >      }
1440 >    else {
1441 >      GStr s(l);
1442 >      s.startTokenize("\t ;,:");
1443 >      GStr a5,a3;
1444 >      if (s.nextToken(a5))
1445 >         s.nextToken(a3);
1446 >      adapters.Add(new CAdapters(a5.is_empty()?NULL:a5.chars(),
1447 >                                a3.is_empty()?NULL:a3.chars()));
1448 >      }
1449 >   }
1450 >   return adapters.Count();
1451   }
1452  
1453 + void setupFiles(FILE*& f_in, FILE*& f_in2, FILE*& f_out, FILE*& f_out2,
1454 +                       GStr& s, GStr& infname, GStr& infname2) {
1455 + // uses outsuffix to generate output file names and open file handles as needed
1456 + infname="";
1457 + infname2="";
1458 + f_in=NULL;
1459 + f_in2=NULL;
1460 + f_out=NULL;
1461 + f_out2=NULL;
1462 + //analyze outsuffix intent
1463 + GStr pocmd;
1464 + if (outsuffix=="-") {
1465 +    f_out=stdout;
1466 +    }
1467 +   else {
1468 +    GStr ox=getFext(outsuffix);
1469 +    if (ox.length()>2) ox=ox.substr(0,2);
1470 +    if (ox=="gz") pocmd="gzip -9 -c ";
1471 +        else if (ox=="bz") pocmd="bzip2 -9 -c ";
1472 +    }
1473 + if (s=="-") {
1474 +    f_in=stdin;
1475 +    infname="stdin";
1476 +    f_out=prepOutFile(infname, pocmd);
1477 +    return;
1478 +    } // streaming from stdin
1479 + s.startTokenize(",:");
1480 + s.nextToken(infname);
1481 + bool paired=s.nextToken(infname2);
1482 + if (fileExists(infname.chars())==0)
1483 +    GError("Error: cannot find file %s!\n",infname.chars());
1484 + GStr fname(getFileName(infname.chars()));
1485 + GStr picmd;
1486 + guess_unzip(fname, picmd);
1487 + if (picmd.is_empty()) {
1488 +   f_in=fopen(infname.chars(), "r");
1489 +   if (f_in==NULL) GError("Error opening file '%s'!\n",infname.chars());
1490 +   }
1491 +  else {
1492 +   picmd.append(infname);
1493 +   f_in=popen(picmd.chars(), "r");
1494 +   if (f_in==NULL) GError("Error at popen %s!\n", picmd.chars());
1495 +   }
1496 + if (f_out==stdout) {
1497 +   if (paired) GError("Error: output suffix required for paired reads\n");
1498 +   return;
1499 +   }
1500 + f_out=prepOutFile(infname, pocmd);
1501 + if (!paired) return;
1502 + if (doCollapse) GError("Error: sorry, -C option cannot be used with paired reads!\n");
1503 + // ---- paired reads:-------------
1504 + if (fileExists(infname2.chars())==0)
1505 +     GError("Error: cannot find file %s!\n",infname2.chars());
1506 + picmd="";
1507 + GStr fname2(getFileName(infname2.chars()));
1508 + guess_unzip(fname2, picmd);
1509 + if (picmd.is_empty()) {
1510 +   f_in2=fopen(infname2.chars(), "r");
1511 +   if (f_in2==NULL) GError("Error opening file '%s'!\n",infname2.chars());
1512 +   }
1513 +  else {
1514 +   picmd.append(infname2);
1515 +   f_in2=popen(picmd.chars(), "r");
1516 +   if (f_in2==NULL) GError("Error at popen %s!\n", picmd.chars());
1517 +   }
1518 + f_out2=prepOutFile(infname2, pocmd);
1519 + }

Diff Legend

Removed lines
+ Added lines
< Changed lines
> Changed lines