ViewVC Help
View File | Revision Log | Show Annotations | View Changeset | Root Listing
root/gclib/fqtrim/fqtrim.cpp
(Generate patch)
# Line 2 | Line 2
2   #include "GStr.h"
3   #include "GHash.hh"
4   #include "GList.hh"
5 + #include <ctype.h>
6 + #include "GAlnExtend.h"
7  
8   #define USAGE "Usage:\n\
9 < fqtrim [-5 <5'adapter>] [-3 <3'adapter>] [-l <minlen>] [-C] [-D] [-Q] \\\n\
10 <    [-n <rename_prefix>] [-o <trimmed.fq>] [-r <discarded.lst>] <input.fq>\n\
9 > fqtrim [{-5 <5adapter> -3 <3adapter>|-f <adapters_file>}] [-a <min_matchlen>]\\\n\
10 >   [-q <minq> [-t <trim_max_len>]] [-p {64|33}] [-o <outsuffix>]\\\n\
11 >   [-l <minlen>] [-C] [-D] [-Q] [-n <rename_prefix>] [-r <discarded.lst>]\\\n\
12 >    <input.fq>[,<input_mates.fq>\n\
13   \n\
14 < Trim low quality bases at 3' end, optional adapter, filter for low complexity and\n\
15 < optionally collapse duplicates in the input fastq data.\n\
14 > Trim low quality bases at the 3' end and can trim adapter sequence(s), filter\n\
15 > for low complexity and collapse duplicate reads.\n\
16 > If read pairs should be trimmed and kept together (i.e. without discarding\n\
17 > one read in a pair), the two file names should be given delimited by a comma\n\
18 > or a colon character\n\
19   \n\
20   Options:\n\
21   -n  rename all the reads using the <prefix> followed by a read counter;\n\
22      if -C option was given, the suffix \"_x<N>\" is appended, with <N> being\n\
23      the read duplication count\n\
24 < -o  write the trimmed/filtered fastq into <trimmed.fq>(instead of stdout)\n\
24 > -o  unless this parameter is '-', write the trimmed/filtered reads to \n\
25 >    file(s) named <input>.<outsuffix> which will be created in the \n\
26 >    current (working) directory; (writes to stdout if -o- is given);\n\
27 >    a suffix ending with .gz, .gzip or .bz2 will enforce compression\n\
28 > -f  file with adapter sequences to trim, each line having this format:\n\
29 >    <5'-adapter-sequence> <3'-adapter-sequence>\n\
30   -5  trim the given adapter or primer sequence at the 5' end of each read\n\
31      (e.g. -5 CGACAGGTTCAGAGTTCTACAGTCCGACGATC)\n\
32   -3  trim the given adapter sequence at the 3' end of each read\n\
33      (e.g. -3 TCGTATGCCGTCTTCTGCTTG)\n\
34 < -l  minimum \"clean\" length at the high quality 5'end that a read must\n\
35 <    have in order to pass the filter (default: 16)\n\
34 > -A  disable polyA/T trimming (enabled by default)\n\
35 > -y  minimum length of exact match to adaptor sequence at the proper end (6)\n\
36 > -q  trim bases with quality value lower than <minq> (starting at the 3' end)\n\
37 > -t  for -q option, maximum trimming at the 3' end is limited to <trim_max_len>\n\
38 > -m  maximum percentage of Ns allowed in a read after trimming (default 7)\n\
39 > -l  minimum \"clean\" length after trimming that a read must have\n\
40 >    in order to pass the filter (default: 16)\n\
41   -r  write a simple \"trash report\" file listing the discarded reads with a\n\
42      one letter code for the reason why they were trashed\n\
43 + -D  apply a low-complexity (dust) filter and discard any read that has over\n\
44 +    50% of its length detected as low complexity\n\
45   -C  collapse duplicate reads and add a prefix with their count to the read\n\
46 <    name\n\
47 < -D  apply a low-complexity (dust) filter and discard any read that has at\n\
48 <    least half of it masked by this\n\
49 < -Q  quality values in the input data are interpreted as phred64, convert\n\
31 <    them to phred33\n\
46 >    name (e.g. for microRNAs)\n\
47 > -p  input is phred64/phred33 (use -p64 or -p33)\n\
48 > -Q  convert quality values to the other Phred qv type\n\
49 > -V  verbose processing\n\
50   "
51 +
52 + //-z  for -o option, the output stream(s) will be first piped into the given\n
53 + //   <zcmd> command, which must output to stdout (e.g. -z 'bzip2 -9 -c')\n
54 +
55 +
56   // example 3' adapter for miRNAs: TCGTATGCCGTCTTCTGCTTG
57 < FILE* f_out=NULL; //stdout if not provided
58 < FILE* f_in=NULL; //input fastq (stdin if not provided)
57 >
58 > //For paired reads sequencing:
59 > //3' : ACACTCTTTCCCTACACGACGCTCTTCCGATCT
60 > //5' : GATCGGAAGAGCGGTTCAGCAGGAATGCCGAG
61 > //FILE* f_out=NULL; //stdout if not provided
62 > //FILE* f_out2=NULL; //for paired reads
63 > //FILE* f_in=NULL; //input fastq (stdin if not provided)
64 > //FILE* f_in2=NULL; //for paired reads
65   FILE* freport=NULL;
66 +
67   bool debug=false;
68 + bool verbose=false;
69   bool doCollapse=false;
70   bool doDust=false;
71 + bool doPolyTrim=true;
72 + bool fastaOutput=false;
73   bool trashReport=false;
74 < bool rawFormat=false;
74 > //bool rawFormat=false;
75   int min_read_len=16;
76 + double max_perc_N=7.0;
77   int dust_cutoff=16;
78   bool isfasta=false;
79 < bool phred64=false;
79 > bool convert_phred=false;
80 > GStr outsuffix; // -o
81   GStr prefix;
82 < GStr adapter3;
83 < GStr adapter5;
84 < int a3len=0;
85 < int a5len=0;
86 < // adaptor matching metrics -- for extendMatch() function
87 < static const int a_m_score=2; //match score
88 < static const int a_mis_score=-3; //mismatch
89 < static const int a_dropoff_score=7;
90 < static const int a_min_score=7;
82 > GStr zcmd;
83 > int num_trimmed5=0;
84 > int num_trimmed3=0;
85 > int num_trimmedN=0;
86 > int num_trimmedQ=0;
87 > int min_trimmed5=INT_MAX;
88 > int min_trimmed3=INT_MAX;
89 >
90 > int qvtrim_qmin=0;
91 > int qvtrim_max=0;  //for -q, do not trim at 3'-end more than this number of bases
92 > int qv_phredtype=0; // could be 64 or 33 (0 means undetermined yet)
93 > int qv_cvtadd=0; //could be -31 or +31
94 >
95 > // adaptor matching metrics -- for X-drop ungapped extension
96 > const int match_reward=2;
97 > const int mismatch_penalty=3;
98 > const int Xdrop=8;
99 >
100 > const int poly_m_score=2; //match score
101 > const int poly_mis_score=-3; //mismatch
102 > const int poly_dropoff_score=7;
103 > int poly_minScore=12; //i.e. an exact match of 6 bases at the proper ends WILL be trimmed
104 >
105 > const char *polyA_seed="AAAA";
106 > const char *polyT_seed="TTTT";
107 > GVec<GStr> adapters5;
108 > GVec<GStr> adapters3;
109 >
110 > CGreedyAlignData* gxmem_l=NULL;
111 > CGreedyAlignData* gxmem_r=NULL;
112  
113   // element in dhash:
114   class FqDupRec {
# Line 91 | Line 147
147    if (!s.is_empty()) {
148        if (s=='-') f=stdout;
149        else {
150 <       f=fopen(s,"w");
150 >       f=fopen(s.chars(),"w");
151         if (f==NULL) GError("Error creating file: %s\n", s.chars());
152         }
153       }
# Line 102 | Line 158
158  
159   GHash<FqDupRec> dhash; //hash to keep track of duplicates
160  
161 < bool ntrim(GStr& rseq, GStr &rqv, int &l5, int &l3); //returns true if any trimming occured
161 > int loadAdapters(const char* fname);
162 >
163 > void setupFiles(FILE*& f_in, FILE*& f_in2, FILE*& f_out, FILE*& f_out2,
164 >                       GStr& s, GStr& infname, GStr& infname2);
165 > // uses outsuffix to generate output file names and open file handles as needed
166  
167 + void writeRead(FILE* f_out, GStr& rname, GStr& rinfo, GStr& rseq, GStr& rqv, int& outcounter);
168 + void trash_report(char trashcode, GStr& rname, FILE* freport);
169 +
170 + bool getFastxRec(GLineReader& fq, GStr& rseq, GStr& rqv,
171 +          GStr& rname, GStr& rinfo, GStr& infname);
172 +
173 + char process_read(GStr& rname, GStr& rseq, GStr& rqv, int &l5, int &l3);
174 + //returns 0 if the read was untouched, 1 if it was trimmed and a trash code if it was trashed
175 +
176 + bool ntrim(GStr& rseq, int &l5, int &l3); //returns true if any trimming occured
177 + bool qtrim(GStr& qvs, int &l5, int &l3); //return true if any trimming occured
178   int dust(GStr& seq);
179 < bool trim_adapter3(GStr& seq, int &l5, int &l3); //returns true if any trimming occured
179 > bool trim_poly5(GStr &seq, int &l5, int &l3, const char* poly_seed); //returns true if any trimming occured
180 > bool trim_poly3(GStr &seq, int &l5, int &l3, const char* poly_seed);
181   bool trim_adapter5(GStr& seq, int &l5, int &l3); //returns true if any trimming occured
182 + bool trim_adapter3(GStr& seq, int &l5, int &l3);
183  
184 < void convertQ64(char* q, int len);
185 < void convertQ64(GStr& q);
184 > void convertPhred(char* q, int len);
185 > void convertPhred(GStr& q);
186  
187   int main(int argc, char * const argv[]) {
188 <  GArgs args(argc, argv, "QRDCl:d:3:5:n:r:o:");
188 >  GArgs args(argc, argv, "YQDCVAl:d:3:5:m:n:r:p:q:f:t:o:z:a:");
189    int e;
117  int icounter=0; //counter for input reads
118  int outcounter=0; //counter for output reads
190    if ((e=args.isError())>0) {
191        GMessage("%s\nInvalid argument: %s\n", USAGE, argv[e]);
192        exit(224);
193        }
194 <  debug=(args.getOpt('D')!=NULL);
195 <  phred64=(args.getOpt('Q')!=NULL);
194 >  debug=(args.getOpt('Y')!=NULL);
195 >  verbose=(args.getOpt('V')!=NULL);
196 >  convert_phred=(args.getOpt('Q')!=NULL);
197    doCollapse=(args.getOpt('C')!=NULL);
198    doDust=(args.getOpt('D')!=NULL);
199 +  if (args.getOpt('A')) doPolyTrim=false;
200 +  /*
201    rawFormat=(args.getOpt('R')!=NULL);
202    if (rawFormat) {
203      GError("Sorry, raw qseq format parsing is not implemented yet!\n");
204      }
205 +  */
206    prefix=args.getOpt('n');
207    GStr s=args.getOpt('l');
208    if (!s.is_empty())
209       min_read_len=s.asInt();
210 +  s=args.getOpt('m');
211 +  if (!s.is_empty())
212 +     max_perc_N=s.asDouble();
213    s=args.getOpt('d');
214    if (!s.is_empty()) {
215       dust_cutoff=s.asInt();
216       doDust=true;
217       }
218 <    
219 <  if (args.getOpt('3')!=NULL) {
220 <    adapter3=args.getOpt('3');
221 <    adapter3.upper();
222 <    a3len=adapter3.length();
223 <    }
224 <  if (args.getOpt('5')!=NULL) {
225 <    adapter5=args.getOpt('5');
226 <    adapter5.upper();
227 <    a5len=adapter5.length();
218 >  s=args.getOpt('q');
219 >  if (!s.is_empty()) {
220 >     qvtrim_qmin=s.asInt();
221 >     }
222 >  s=args.getOpt('t');
223 >  if (!s.is_empty()) {
224 >     qvtrim_max=s.asInt();
225 >     }
226 >  s=args.getOpt('p');
227 >  if (!s.is_empty()) {
228 >     int v=s.asInt();
229 >     if (v==33) {
230 >        qv_phredtype=33;
231 >        qv_cvtadd=31;
232 >        }
233 >      else if (v==64) {
234 >        qv_phredtype=64;
235 >        qv_cvtadd=-31;
236 >        }
237 >       else
238 >         GMessage("%s\nInvalid value for -p option (can only be 64 or 33)!\n",USAGE);
239 >     }
240 >  s=args.getOpt('f');
241 >  if (!s.is_empty()) {
242 >   loadAdapters(s.chars());
243 >   }
244 >  bool fileAdapters=adapters5.Count()+adapters3.Count();
245 >  s=args.getOpt('5');
246 >  if (!s.is_empty()) {
247 >    if (fileAdapters)
248 >      GError("Error: options -5 and -f cannot be used together!\n");
249 >    s.upper();
250 >    adapters5.Add(s);
251 >    }
252 >  s=args.getOpt('3');
253 >  if (!s.is_empty()) {
254 >    if (fileAdapters)
255 >      GError("Error: options -3 and -f cannot be used together!\n");
256 >      s.upper();
257 >      adapters3.Add(s);
258      }
259 +  s=args.getOpt('y');
260 +  if (!s.is_empty()) {
261 +     int minmatch=s.asInt();
262 +     poly_minScore=minmatch*poly_m_score;
263 +     }
264 +  
265 +  if (args.getOpt('o')!=NULL) outsuffix=args.getOpt('o');
266 +                         else outsuffix="-";
267    trashReport=  (args.getOpt('r')!=NULL);
268 <  if (args.startNonOpt()==0) {
268 >  int fcount=args.startNonOpt();
269 >  if (fcount==0) {
270      GMessage(USAGE);
271      exit(224);
272      }
273 <
274 <  openfw(f_out, args, 'o');
273 >   if (fcount>1 && doCollapse) {
274 >    GError("%s Sorry, the -C option only works with a single input.\n", USAGE);
275 >    }
276 >  //openfw(f_out, args, 'o');
277 >  //if (f_out==NULL) f_out=stdout;
278    if (trashReport)
279      openfw(freport, args, 'r');
280    char* infile=NULL;
281 +
282 +  if (adapters5.Count()>0)
283 +    gxmem_l=new CGreedyAlignData(match_reward, mismatch_penalty, Xdrop-2);
284 +  if (adapters3.Count()>0)
285 +    gxmem_r=new CGreedyAlignData(match_reward, mismatch_penalty, Xdrop);
286 +
287    while ((infile=args.nextNonOpt())!=NULL) {
288 <    GStr infname(infile);
289 <    if (strcmp(infile,"-")==0) {
290 <       f_in=stdin; infname="stdin"; }
291 <     else {
292 <        f_in=fopen(infile,"r");
293 <        if (f_in==NULL)
294 <            GError("Cannot open input file %s!\n",infile);
295 <        }
296 <     GLineReader fq(f_in);
297 <     char* l=NULL;
298 <     while ((l=fq.getLine())!=NULL) {
299 <        GStr rname; //current read name
300 <        GStr rseq;  //current read sequence
301 <        GStr rqv;   //current read quality values
302 <        GStr s;
303 <        if (rawFormat) {
304 <          //TODO: implement qseq parsing here
305 <          //if (raw type=N) then continue; //skip invalid/bad records
306 <          
307 <          } //raw qseq format
308 <         else { // FASTQ or FASTA
309 <          isfasta=(l[0]=='>');
310 <          if (!isfasta && l[0]!='@') GError("Error: fastq record marker not detected!\n");
311 <          s=l;
312 <          rname=&(l[1]);
313 <          icounter++;
314 <          //GMessage("readname=%s\n",rname.chars());
315 <          for (int i=0;i<rname.length();i++)
316 <            if (rname[i]<=' ') { rname.cut(i); break; }
317 <          //now get the sequence
318 <          if ((l=fq.getLine())==NULL)
319 <              GError("Error: unexpected EOF after header for %s\n",rname.chars());
320 <          rseq=l; //this must be the DNA line
321 <          if (isfasta) {
322 <            while ((l=fq.getLine())!=NULL) {
323 <              //fasta can have multiple sequence lines
324 <              if (l[0]=='>') {
325 <                   fq.pushBack();
326 <                   break; //
288 >    //for each input file
289 >    int incounter=0; //counter for input reads
290 >    int outcounter=0; //counter for output reads
291 >    int trash_s=0; //too short from the get go
292 >    int trash_Q=0;
293 >    int trash_N=0;
294 >    int trash_D=0;
295 >    int trash_poly=0;
296 >    int trash_A3=0;
297 >    int trash_A5=0;
298 >    s=infile;
299 >    GStr infname;
300 >    GStr infname2;
301 >    FILE* f_in=NULL;
302 >    FILE* f_in2=NULL;
303 >    FILE* f_out=NULL;
304 >    FILE* f_out2=NULL;
305 >    bool paired_reads=false;
306 >    setupFiles(f_in, f_in2, f_out, f_out2, s, infname, infname2);
307 >    GLineReader fq(f_in);
308 >    GLineReader* fq2=NULL;
309 >    if (f_in2!=NULL) {
310 >       fq2=new GLineReader(f_in2);
311 >       paired_reads=true;
312 >       }
313 >    GStr rseq, rqv, seqid, seqinfo;
314 >    GStr rseq2, rqv2, seqid2, seqinfo2;
315 >    while (getFastxRec(fq, rseq, rqv, seqid, seqinfo, infname)) {
316 >       incounter++;
317 >       int a5=0, a3=0, b5=0, b3=0;
318 >       char tcode=0, tcode2=0;
319 >       tcode=process_read(seqid, rseq, rqv, a5, a3);
320 >       if (fq2!=NULL) {
321 >            getFastxRec(*fq2, rseq2, rqv2, seqid2, seqinfo2, infname2);
322 >            if (seqid.substr(0,seqid.length()-1)!=seqid2.substr(0,seqid2.length()-1)) {
323 >               GError("Error: no paired match for read %s vs %s (%s,%s)\n",
324 >                  seqid.chars(), seqid2.chars(), infname.chars(), infname2.chars());
325 >               }
326 >            tcode2=process_read(seqid2, rseq2, rqv2, b5, b3);
327 >            //decide what to do with this pair and print rseq2 if the pair makes it
328 >            if (tcode>1 && tcode2<=1) {
329 >               //"untrash" rseq
330 >               if (a3-a5+1<min_read_len) {
331 >                   a5=1;
332 >                   if (a3<min_read_len) { a3= GMIN(rseq.length()-1, min_read_len+1); }
333                     }
334 <              rseq+=l;
335 <              }
336 <            } //multi-line fasta file
337 <          if (!isfasta) {
338 <            if ((l=fq.getLine())==NULL)
339 <                GError("Error: unexpected EOF after sequence for %s\n", rname.chars());
340 <            if (l[0]!='+') GError("Error: fastq qv header marker not detected!\n");
341 <            if ((l=fq.getLine())==NULL)
342 <                GError("Error: unexpected EOF after qv header for %s\n", rname.chars());
343 <            rqv=l;
344 <            if (rqv.length()!=rseq.length())
345 <                GError("Error: qv len != seq len for %s\n", rname.chars());
346 <            }
347 <        } //<-- FASTA or FASTQ
216 <        rseq.upper();
217 <        int l5=0;
218 <        int l3=rseq.length()-1;
219 <        if (l3-l5+1<min_read_len) {
220 <           if (trashReport) {
221 <                  fprintf(freport, "%s\ts\t%s\n",rname.chars(), rseq.chars());
222 <                  }
223 <           continue;
224 <           }
225 <        if (ntrim(rseq, rqv, l5, l3)) {
226 <           if (l3-l5+1<min_read_len) {
227 <             if (trashReport) {
228 <                    fprintf(freport, "%s\tN\t%s\n", rname.chars(), rseq.chars());
229 <                    }
230 <             continue; //invalid read
231 <             }
232 <            //-- keep only the l5..l3 range
233 <           rseq=rseq.substr(l5, l3-l5+1);
234 <           if (!rqv.is_empty())
235 <              rqv=rqv.substr(l5, l3-l5+1);
236 <           }
237 <          
238 <        if (a3len>0) {
239 <          if (trim_adapter3(rseq, l5, l3)) {
240 <             if (l3-l5+1<min_read_len) {
241 <                 if (trashReport) {
242 <                     fprintf(freport, "%s\tA3\t%s\n",rname.chars(), rseq.chars());
243 <                     }
244 <                 continue;
245 <                 }
246 <              //-- keep only the l5..l3 range
247 <              rseq=rseq.substr(l5, l3-l5+1);
248 <              if (!rqv.is_empty())
249 <                 rqv=rqv.substr(l5, l3-l5+1);
250 <              }//some adapter was trimmed
251 <           } //adapter trimming
252 <        if (a5len>0) {
253 <          if (trim_adapter5(rseq, l5, l3)) {
254 <             if (l3-l5+1<min_read_len) {
255 <                 if (trashReport) {
256 <                     fprintf(freport, "%s\tA5\t%s\n",rname.chars(), rseq.chars());
257 <                     }
258 <                 continue;
334 >               tcode=1;
335 >               }
336 >             else if (tcode<=1 && tcode2>1) {
337 >               //"untrash" rseq2
338 >               if (b3-b5+1<min_read_len) {
339 >                   b5=1;
340 >                   if (b3<min_read_len) { b3= GMIN((rseq2.length()-1),(min_read_len+1)); }
341 >                   }
342 >               tcode2=1;
343 >               }
344 >            if (tcode<=1) { //trimmed or left intact -- write it!
345 >               if (tcode>0) {
346 >                 rseq2=rseq2.substr(b5,b3-b5+1);
347 >                 if (!rqv2.is_empty()) rqv2=rqv2.substr(b5,b3-b5+1);
348                   }
349 <              //-- keep only the l5..l3 range
350 <              rseq=rseq.substr(l5, l3-l5+1);
351 <              if (!rqv.is_empty())
352 <                 rqv=rqv.substr(l5, l3-l5+1);
353 <              }//some adapter was trimmed
354 <           } //adapter trimming
355 <        if (doCollapse) {
356 <           //keep read for later
357 <           FqDupRec* dr=dhash.Find(rseq.chars());
358 <           if (dr==NULL) { //new entry
359 <                  //if (prefix.is_empty())
360 <                     dhash.Add(rseq.chars(),
361 <                          new FqDupRec(&rqv, rname.chars()));
362 <                  //else dhash.Add(rseq.chars(), new FqDupRec(rqv.chars(),rqv.length()));
349 >               int nocounter=0;
350 >               writeRead(f_out2, seqid2, seqinfo2, rseq2, rqv2, nocounter);
351 >               }
352 >            } //pair read
353 >       if (tcode>1) { //trashed
354 >         #ifdef GDEBUG
355 >         GMessage(" !!!!TRASH => 'N'\n");
356 >         #endif
357 >          if (tcode=='s') trash_s++;
358 >          else if (tcode=='A' || tcode=='T') trash_poly++;
359 >            else if (tcode=='Q') trash_Q++;
360 >              else if (tcode=='N') trash_N++;
361 >               else if (tcode=='D') trash_D++;
362 >                else if (tcode=='3') trash_A3++;
363 >                 else if (tcode=='5') trash_A5++;
364 >          if (trashReport) trash_report(tcode, seqid, freport);
365 >          }
366 >         else if (!doCollapse) { //write it
367 >          if (tcode>0) {
368 >            rseq=rseq.substr(a5,a3-a5+1);
369 >            if (!rqv.is_empty()) rqv=rqv.substr(a5,a3-a5+1);
370 >            }
371 >         #ifdef GDEBUG
372 >            GMessage("  After trimming:\n");
373 >            GMessage("<==%s\n",rseq.chars());
374 >         #endif
375 >          writeRead(f_out, seqid, seqinfo, rseq, rqv, outcounter);
376 >          }
377 >       } //for each fastq record
378 >    delete fq2;
379 >    FRCLOSE(f_in);
380 >    FRCLOSE(f_in2);
381 >    if (doCollapse) {
382 >       outcounter=0;
383 >       int maxdup_count=1;
384 >       char* maxdup_seq=NULL;
385 >       dhash.startIterate();
386 >       FqDupRec* qd=NULL;
387 >       char* seq=NULL;
388 >       while ((qd=dhash.NextData(seq))!=NULL) {
389 >         GStr rseq(seq);
390 >         //do the dusting here
391 >         if (doDust) {
392 >            int dustbases=dust(rseq);
393 >            if (dustbases>(rseq.length()>>1)) {
394 >               if (trashReport && qd->firstname!=NULL) {
395 >                 fprintf(freport, "%s_x%d\tD\n",qd->firstname, qd->count);
396                   }
397 <              else    
398 <                 dr->add(rqv);
399 <           } //collapsing duplicates
400 <         else { //not collapsing duplicates
401 <           //do the dust filter now
402 <           if (doDust) {
403 <             int dustbases=dust(rseq);
404 <             if (dustbases>(rseq.length()>>1)) {
405 <                if (trashReport) {
406 <                  fprintf(freport, "%s\tD\t%s\n",rname.chars(),rseq.chars());
407 <                  }
408 <                continue;
409 <                }
288 <             }
289 <           //print this record here  
290 <           outcounter++;
291 <           if (isfasta) {
292 <            if (prefix.is_empty())
293 <               fprintf(f_out, ">%s\n%s\n", rname.chars(), rseq.chars());
294 <              else
295 <               fprintf(f_out, ">%s%08d\n%s\n", prefix.chars(), outcounter,
296 <                                  rseq.chars());
397 >               trash_D+=qd->count;
398 >               continue;
399 >               }
400 >            }
401 >         outcounter++;
402 >         if (qd->count>maxdup_count) {
403 >            maxdup_count=qd->count;
404 >            maxdup_seq=seq;
405 >            }
406 >         if (isfasta) {
407 >           if (prefix.is_empty()) {
408 >             fprintf(f_out, ">%s_x%d\n%s\n", qd->firstname, qd->count,
409 >                           rseq.chars());
410               }
411 <           else {  //fastq
412 <            if (phred64) convertQ64(rqv);
413 <            if (prefix.is_empty())
301 <               fprintf(f_out, "@%s\n%s\n+\n%s\n", rname.chars(), rseq.chars(),rqv.chars());
302 <              else
303 <               fprintf(f_out, "@%s_%08d\n%s\n+\n%s\n", prefix.chars(), outcounter,
304 <                                  rseq.chars(),rqv.chars() );
411 >           else { //use custom read name
412 >             fprintf(f_out, ">%s%08d_x%d\n%s\n", prefix.chars(), outcounter,
413 >                        qd->count, rseq.chars());
414               }
415 <           } //not collapsing duplicates
416 <        } //for each fastq record
417 <   } //while each input file
418 < FRCLOSE(f_in);
419 < if (doCollapse) {
420 <    outcounter=0;
312 <    int maxdup_count=1;
313 <    char* maxdup_seq=NULL;
314 <    dhash.startIterate();
315 <    FqDupRec* qd=NULL;
316 <    char* seq=NULL;
317 <    while ((qd=dhash.NextData(seq))!=NULL) {
318 <      GStr rseq(seq);
319 <      //do the dusting here
320 <      if (doDust) {
321 <         int dustbases=dust(rseq);
322 <         if (dustbases>(rseq.length()>>1)) {
323 <            if (trashReport && qd->firstname!=NULL) {
324 <              fprintf(freport, "%s_x%d\tD\t%s\n",qd->firstname, qd->count,seq);
325 <              }
326 <            continue;
415 >           }
416 >         else { //fastq format
417 >          if (convert_phred) convertPhred(qd->qv, qd->len);
418 >          if (prefix.is_empty()) {
419 >            fprintf(f_out, "@%s_x%d\n%s\n+\n%s\n", qd->firstname, qd->count,
420 >                           rseq.chars(), qd->qv);
421              }
422 +          else { //use custom read name
423 +            fprintf(f_out, "@%s%08d_x%d\n%s\n+\n%s\n", prefix.chars(), outcounter,
424 +                        qd->count, rseq.chars(), qd->qv);
425 +            }
426 +           }
427 +         }//for each element of dhash
428 +       if (maxdup_count>1) {
429 +         GMessage("Maximum read multiplicity: x %d (read: %s)\n",maxdup_count, maxdup_seq);
430           }
431 <      outcounter++;
432 <      if (qd->count>maxdup_count) {
433 <         maxdup_count=qd->count;
434 <         maxdup_seq=seq;
435 <         }
436 <      if (isfasta) {
437 <        if (prefix.is_empty()) {
438 <          fprintf(f_out, "@%s_x%d\n%s\n", qd->firstname, qd->count,
439 <                        rseq.chars());
440 <          }
441 <        else { //use custom read name
442 <          fprintf(f_out, "@%s%08d_x%d\n%s\n", prefix.chars(), outcounter,
443 <                     qd->count, rseq.chars());
431 >       } //collapse entries
432 >    if (verbose) {
433 >       if (paired_reads) {
434 >           GMessage(">Input files : %s , %s\n", infname.chars(), infname2.chars());
435 >           GMessage("Number of input pairs :%9d\n", incounter);
436 >           GMessage("         Output pairs :%9d\n", outcounter);
437 >           }
438 >         else {
439 >           GMessage(">Input file : %s\n", infname.chars());
440 >           GMessage("Number of input reads :%9d\n", incounter);
441 >           GMessage("         Output reads :%9d\n", outcounter);
442 >           }
443 >       GMessage("------------------------------------\n");
444 >       if (num_trimmed5)
445 >          GMessage("           5' trimmed :%9d  (min. trim: %d)\n", num_trimmed5, min_trimmed5);
446 >       if (num_trimmed3)
447 >          GMessage("           3' trimmed :%9d  (min. trim: %d)\n", num_trimmed3, min_trimmed3);
448 >       GMessage("------------------------------------\n");
449 >       if (trash_s>0)
450 >         GMessage("     Trashed by length:%9d\n", trash_s);
451 >       if (trash_N>0)
452 >         GMessage("         Trashed by N%%:%9d\n", trash_N);
453 >       if (trash_Q>0)
454 >         GMessage("Trashed by low quality:%9d\n", trash_Q);
455 >       if (trash_poly>0)
456 >         GMessage("   Trashed by poly-A/T:%9d\n", trash_poly);
457 >       if (trash_A5>0)
458 >         GMessage(" Trashed by 5' adapter:%9d\n", trash_A5);
459 >       if (trash_A3>0)
460 >         GMessage(" Trashed by 3' adapter:%9d\n", trash_A3);
461 >       }
462 >    if (trashReport) {
463 >          FWCLOSE(freport);
464            }
465 <        }
466 <      else { //fastq format
467 <       if (phred64) convertQ64(qd->qv, qd->len);
468 <       if (prefix.is_empty()) {
469 <         fprintf(f_out, "@%s_x%d\n%s\n+\n%s\n", qd->firstname, qd->count,
470 <                        rseq.chars(), qd->qv);
349 <         }
350 <       else { //use custom read name
351 <         fprintf(f_out, "@%s%08d_x%d\n%s\n+\n%s\n", prefix.chars(), outcounter,
352 <                     qd->count, rseq.chars(), qd->qv);
353 <         }
354 <        }
355 <      }//for each element of dhash
356 <    if (maxdup_count>1) {
357 <      GMessage("Maximum read multiplicity: x %d (read: %s)\n",maxdup_count, maxdup_seq);
358 <      }
359 <   } //report collapsed dhash entries
360 < GMessage("Number of input reads: %9d\n", icounter);
361 < GMessage("       Output records: %9d\n", outcounter);
362 < if (trashReport) {
363 <    FWCLOSE(freport);
364 <    }
365 <
366 < FWCLOSE(f_out);
465 >    FWCLOSE(f_out);
466 >    FWCLOSE(f_out2);
467 >   } //while each input file
468 > delete gxmem_l;
469 > delete gxmem_r;
470 > //getc(stdin);
471   }
472  
473   class NData {
# Line 392 | Line 496
496       end5=1;
497       end3=seqlen;
498       for (int i=0;i<seqlen;i++)
499 <        if (rseq[i]=='N') {// if (!ichrInStr(rseq[i], "ACGT")
499 >        if (seq[i]=='N') {// if (!ichrInStr(rseq[i], "ACGT")
500             NPos[NCount]=i;
501             NCount++;
502             }
# Line 415 | Line 519
519   void N_analyze(int l5, int l3, int p5, int p3) {
520   /* assumes feat was filled properly */
521   int old_dif, t5,t3,v;
522 < if (l3-l5<min_read_len || p5>p3 ) {
522 > if (l3<l5+2 || p5>p3 ) {
523     feat.end5=l5+1;
524     feat.end3=l3+1;
525     return;
# Line 447 | Line 551
551   }
552  
553  
554 < bool ntrim(GStr& rseq, GStr &rqv, int &l5, int &l3) {
555 < //count Ns in the sequence
554 > bool qtrim(GStr& qvs, int &l5, int &l3) {
555 > if (qvtrim_qmin==0 || qvs.is_empty()) return false;
556 > if (qv_phredtype==0) {
557 >  //try to guess the Phred type
558 >  int vmin=256, vmax=0;
559 >  for (int i=0;i<qvs.length();i++) {
560 >     if (vmin>qvs[i]) vmin=qvs[i];
561 >     if (vmax<qvs[i]) vmax=qvs[i];
562 >     }
563 >  if (vmin<64) { qv_phredtype=33; qv_cvtadd=31; }
564 >  if (vmax>95) { qv_phredtype=64; qv_cvtadd=-31; }
565 >  if (qv_phredtype==0) {
566 >    GError("Error: couldn't determine Phred type, please use the -p33 or -p64 !\n");
567 >    }
568 >  if (verbose)
569 >    GMessage("Input reads have Phred-%d quality values.\n", (qv_phredtype==33 ? 33 : 64));
570 >  } //guessing Phred type
571 > for (l3=qvs.length()-1;l3>2;l3--) {
572 >  if (qvs[l3]-qv_phredtype>=qvtrim_qmin && qvs[l3-1]-qv_phredtype>=qvtrim_qmin) break;
573 >  }
574 > //just in case, check also the 5' the end (?)
575 > for (l5=0;l5<qvs.length()-3;l5++) {
576 >  if (qvs[l5]-qv_phredtype>=qvtrim_qmin && qvs[l5+1]-qv_phredtype>=qvtrim_qmin) break;
577 >  }
578 > if (qvtrim_max>0) {
579 >  if (qvs.length()-1-l3>qvtrim_max) l3=qvs.length()-1-qvtrim_max;
580 >  if (l5>qvtrim_max) l5=qvtrim_max;
581 >  }
582 > return (l5>0 || l3<qvs.length()-1);
583 > }
584 >
585 > bool ntrim(GStr& rseq, int &l5, int &l3) {
586 > //count Ns in the sequence, trim N-rich ends
587   feat.init(rseq);
588   l5=feat.end5-1;
589   l3=feat.end3-1;
590   N_analyze(feat.end5-1, feat.end3-1, 0, feat.NCount-1);
591 < if (l5==feat.end5-1 && l3==feat.end3-1)
592 <    return false; //nothing changed
591 > if (l5==feat.end5-1 && l3==feat.end3-1) {
592 >    if (feat.perc_N>max_perc_N) {
593 >           feat.valid=false;
594 >           l3=l5+1;
595 >           return true;
596 >           }
597 >      else {
598 >       return false; //nothing changed
599 >       }
600 >    }
601   l5=feat.end5-1;
602   l3=feat.end3-1;
603   if (l3-l5+1<min_read_len) {
# Line 462 | Line 605
605     return true;
606     }
607   feat.N_calc();
608 < if (feat.perc_N>6.2) { //not valid if more than 1 N per 16 bases
608 >
609 > if (feat.perc_N>max_perc_N) {
610        feat.valid=false;
611        l3=l5+1;
612        return true;
# Line 588 | Line 732
732   return ncount;
733   }
734  
735 < // ------------------ adapter matching - triplet matching
736 < //look for matching triplets amongst the first 9 nucleotides of the 3' adaptor
737 < // or the last 9 nucleotides for the 5' adapter
738 < //when a triplet match is found, simply try to extend the alignment using a drop-off scheme
739 < // check minimum score and
740 < // for 3' adapter trimming:
741 < //     require that the right end of the alignment for either the adaptor OR the read must be
742 < //     < 3 distance from its right end
743 < // for 5' adapter trimming:
744 < //     require that the left end of the alignment for either the adaptor OR the read must
745 < //     be at coordinate 0
602 <
603 < bool extendMatch(const char* a, int alen, int ai,
604 <                 const char* b, int blen, int bi, int mlen, int& l5, int& l3) {
605 < //so the alignment starts at ai in a, bi in b, with a perfect match of length mlen
606 < //GStr dbg(a);
607 < //GMessage(">> in %s\n\textending hit: %s at position %d\n", a, (dbg.substr(ai, mlen)).chars(), ai);
608 < int a_l=ai; //alignment coordinates on a
609 < int a_r=ai+mlen-1;
610 < int b_l=bi; //alignment coordinates on b
611 < int b_r=bi+mlen-1;
612 < int ai_maxscore=ai;
613 < int bi_maxscore=bi;
614 < int score=mlen*a_m_score;
615 < int maxscore=score;
616 < int mism5score=a_mis_score-2; //increase penalty for mismatches at 5' end
617 < //try to extend to the left first, if possible
618 < while (ai>0 && bi>0) {
619 <   ai--;
620 <   bi--;
621 <   score+= (a[ai]==b[bi])? a_m_score : mism5score;
622 <   if (score>maxscore) {
623 <       ai_maxscore=ai;
624 <       bi_maxscore=bi;
625 <       maxscore=score;
626 <       }
627 <     else if (maxscore-score>a_dropoff_score) break;
628 <   }
629 < a_l=ai_maxscore;
630 < b_l=bi_maxscore;
631 < //now extend to the right
632 < ai_maxscore=a_r;
633 < bi_maxscore=b_r;
634 < ai=a_r;
635 < bi=b_r;
636 < score=maxscore;
637 < //sometimes there are extra AAAAs at the end of the read, ignore those
638 < if (strcmp(&a[alen-4],"AAAA")==0) {
639 <    alen-=3;
640 <    while (a[alen-1]=='A' && alen>ai) alen--;
641 <    }
642 < while (ai<alen-1 && bi<blen-1) {
643 <   ai++;
644 <   bi++;
645 <   //score+= (a[ai]==b[bi])? a_m_score : a_mis_score;
646 <   if (a[ai]==b[bi]) { //match
647 <      score+=a_m_score;
648 <      if (ai>=alen-2) {
649 <           score+=a_m_score-(alen-ai-1);
650 <           }
651 <      }
652 <    else { //mismatch
653 <      score+=a_mis_score;
654 <      }  
655 <   if (score>maxscore) {
656 <       ai_maxscore=ai;
657 <       bi_maxscore=bi;
658 <       maxscore=score;
659 <       }
660 <     else if (maxscore-score>a_dropoff_score) break;
661 <   }
662 <  a_r=ai_maxscore;
663 <  b_r=bi_maxscore;
664 <  if (maxscore>=a_min_score && (alen-a_r<3 || blen-b_r<3 || a_l<2 || b_l<2)) {
665 <     if (a_l<alen-a_r-1) {
666 <        //adapter closer to the left end (typical for 5' adapter)
667 <        l5=a_r+1;
668 <        l3=alen-1;
669 <        }
670 <      else {
671 <        //adapter matching at the right end (typical for 3' adapter)
672 <        l5=0;
673 <        l3=a_l-1;
674 <        }
675 <     return true;
676 <     }
677 <  return false;
735 > int get3mer_value(const char* s) {
736 > return (s[0]<<16)+(s[1]<<8)+s[2];
737 > }
738 >
739 > int w3_match(int qv, const char* str, int slen, int start_index=0) {
740 > if (start_index>=slen || start_index<0) return -1;
741 > for (int i=start_index;i<slen-3;i++) {
742 >   int rv=get3mer_value(str+i);
743 >   if (rv==qv) return i;
744 >   }
745 > return -1;
746   }
747  
748 < bool trim_adapter3(GStr& seq, int&l5, int &l3) {
748 > int w3_rmatch(int qv, const char* str, int slen, int end_index=-1) {
749 > if (end_index>=slen) return -1;
750 > if (end_index<0) end_index=slen-1;
751 > for (int i=end_index-2;i>=0;i--) {
752 >   int rv=get3mer_value(str+i);
753 >   if (rv==qv) return i;
754 >   }
755 > return -1;
756 > }
757 >
758 > struct SLocScore {
759 >  int pos;
760 >  int score;
761 >  SLocScore(int p=0,int s=0) {
762 >    pos=p;
763 >    score=s;
764 >    }
765 >  void set(int p, int s) {
766 >    pos=p;
767 >    score=s;
768 >    }
769 >  void add(int p, int add) {
770 >    pos=p;
771 >    score+=add;
772 >    }
773 > };
774 >
775 > bool trim_poly3(GStr &seq, int &l5, int &l3, const char* poly_seed) {
776 > if (!doPolyTrim) return false;
777   int rlen=seq.length();
778   l5=0;
779   l3=rlen-1;
780 < //first try a full match, we might get lucky
781 < int fi=-1;
782 < if ((fi=seq.index(adapter3))>=0) {
783 <   if (fi<rlen-fi-a3len) {//match is closer to the right end
784 <      l5=fi+a3len;
785 <      l3=rlen-1;
780 > int32 seedVal=*(int32*)poly_seed;
781 > char polyChar=poly_seed[0];
782 > //assumes N trimming was already done
783 > //so a poly match should be very close to the end of the read
784 > // -- find the initial match (seed)
785 > int lmin=GMAX((rlen-12), 0);
786 > int li;
787 > for (li=rlen-4;li>lmin;li--) {
788 >   if (seedVal==*(int*)&(seq[li])) {
789 >      break;
790        }
691    else {
692      l5=0;
693      l3=fi-1;
694      }
695   return true;
791     }
792 < //also, for fast detection of other adapter-only reads that start past
793 < // the beginning of the adapter sequence, try to see if the first a3len-4
794 < // characters of the read are a substring of the adapter
795 < GStr rstart=seq.substr(0,a3len-4);
796 < if ((fi=adapter3.index(rstart))>=0) {
797 <   l3=rlen-1;
798 <   l5=a3len-4;
799 <   while (fi+l5<a3len && l5<l3 && adapter3[fi+l5]==seq[l5]) l5++;
800 <   return true;
792 > if (li<=lmin) return false;
793 > //seed found, try to extend it both ways
794 > //extend right
795 > int ri=li+3;
796 > SLocScore loc(ri, poly_m_score<<2);
797 > SLocScore maxloc(loc);
798 > //extend right
799 > while (ri<rlen-2) {
800 >   ri++;
801 >   if (seq[ri]==polyChar) {
802 >                loc.add(ri,poly_m_score);
803 >                }
804 >   else if (seq[ri]=='N') {
805 >                loc.add(ri,0);
806 >                }
807 >   else { //mismatch
808 >        loc.add(ri,poly_mis_score);
809 >        if (maxloc.score-loc.score>poly_dropoff_score) break;
810 >        }
811 >   if (maxloc.score<=loc.score) {
812 >      maxloc=loc;
813 >      }
814     }
815 <  
816 < //another easy case: first half of the adaptor matches
817 < int a3hlen=a3len>>1;
818 < GStr ahalf=adapter3.substr(0, a3hlen);
819 < if ((fi=seq.index(ahalf))>=0) {
820 <   extendMatch(seq.chars(), rlen, fi,
821 <                 adapter3.chars(), a3len, 0,  a3hlen, l5,l3);
822 <   return true;
815 > ri=maxloc.pos;
816 > if (ri<rlen-6) return false; //no trimming wanted, too far from 3' end
817 > //ri = right boundary for the poly match
818 > //extend left
819 > loc.set(li, maxloc.score);
820 > maxloc.pos=li;
821 > while (li>0) {
822 >    li--;
823 >    if (seq[li]==polyChar) {
824 >                 loc.add(li,poly_m_score);
825 >                 }
826 >    else if (seq[li]=='N') {
827 >                 loc.add(li,0);
828 >                 }
829 >    else { //mismatch
830 >         loc.add(li,poly_mis_score);
831 >         if (maxloc.score-loc.score>poly_dropoff_score) break;
832 >         }
833 >    if (maxloc.score<=loc.score) {
834 >       maxloc=loc;
835 >       }
836 >    }
837 > if ((maxloc.score>poly_minScore && ri>=rlen-3) ||
838 >    (maxloc.score==poly_minScore && ri==rlen-1) ||
839 >    (maxloc.score>(poly_minScore<<1) && ri>=rlen-6)) {
840 >    l5=li;
841 >    l3=ri;
842 >    return true;
843 >    }
844 > return false;
845 > }
846 >
847 > bool trim_poly5(GStr &seq, int &l5, int &l3, const char* poly_seed) {
848 > if (!doPolyTrim) return false;
849 > int rlen=seq.length();
850 > l5=0;
851 > l3=rlen-1;
852 > int32 seedVal=*(int32*)poly_seed;
853 > char polyChar=poly_seed[0];
854 > //assumes N trimming was already done
855 > //so a poly match should be very close to the end of the read
856 > // -- find the initial match (seed)
857 > int lmax=GMIN(8, rlen-4);//how far from 5' end to look for 4-mer seeds
858 > int li;
859 > for (li=0;li<=lmax;li++) {
860 >   if (seedVal==*(int*)&(seq[li])) {
861 >      break;
862 >      }
863     }
864 < //no easy cases, so let's do the word hashing
865 < for (int iw=0;iw<6;iw++) {
866 <   GStr aword=adapter3.substr(iw,3);
867 <   if ((fi=seq.index(aword))>=0  && rlen-fi>3) {
868 <      if (extendMatch(seq.chars(), rlen, fi, adapter3.chars(),
869 <                   a3len, iw, 3, l5,l3)) return true;
864 > if (li>lmax) return false;
865 > //seed found, try to extend it both ways
866 > //extend left
867 > int ri=li+3; //save rightmost base of the seed
868 > SLocScore loc(li, poly_m_score<<2);
869 > SLocScore maxloc(loc);
870 > while (li>0) {
871 >    li--;
872 >    if (seq[li]==polyChar) {
873 >                 loc.add(li,poly_m_score);
874 >                 }
875 >    else if (seq[li]=='N') {
876 >                 loc.add(li,0);
877 >                 }
878 >    else { //mismatch
879 >         loc.add(li,poly_mis_score);
880 >         if (maxloc.score-loc.score>poly_dropoff_score) break;
881 >         }
882 >    if (maxloc.score<=loc.score) {
883 >       maxloc=loc;
884 >       }
885 >    }
886 > li=maxloc.pos;
887 > if (li>5) return false; //no trimming wanted, too far from 5' end
888 > //li = right boundary for the poly match
889 >
890 > //extend right
891 > loc.set(ri, maxloc.score);
892 > maxloc.pos=ri;
893 > while (ri<rlen-2) {
894 >   ri++;
895 >   if (seq[ri]==polyChar) {
896 >                loc.add(ri,poly_m_score);
897 >                }
898 >   else if (seq[ri]=='N') {
899 >                loc.add(ri,0);
900 >                }
901 >   else { //mismatch
902 >        loc.add(ri,poly_mis_score);
903 >        if (maxloc.score-loc.score>poly_dropoff_score) break;
904 >        }
905 >   if (maxloc.score<=loc.score) {
906 >      maxloc=loc;
907        }
908     }
909 < return false; //no adapter parts found
909 >
910 > if ((maxloc.score==poly_minScore && li==0) ||
911 >     (maxloc.score>poly_minScore && li<2)
912 >     || (maxloc.score>(poly_minScore<<1) && li<6)) {
913 >    l5=li;
914 >    l3=ri;
915 >    return true;
916 >    }
917 > return false;
918 > }
919 >
920 > bool trim_adapter3(GStr& seq, int&l5, int &l3) {
921 > if (adapters3.Count()==0) return false;
922 > int rlen=seq.length();
923 > l5=0;
924 > l3=rlen-1;
925 > bool trimmed=false;
926 > GStr wseq(seq.chars());
927 > int wlen=rlen;
928 > for (int ai=0;ai<adapters3.Count();ai++) {
929 >  if (adapters3[ai].is_empty()) continue;
930 >  int alen=adapters3[ai].length();
931 >  GStr& aseq=adapters3[ai];
932 >  GXAlnInfo* r_bestaln=match_RightEnd(aseq.chars(), alen, wseq.chars(), wlen, gxmem_r, 74);
933 >  if (r_bestaln) {
934 >     trimmed=true;
935 >     //keep unmatched region on the left, if any
936 >     l3-=(wlen-r_bestaln->sl+1);
937 >     delete r_bestaln;
938 >     if (l3<0) l3=0;
939 >     if (l3-l5+1<min_read_len) return true;
940 >     wseq=seq.substr(l5,l3-l5+1);
941 >     wlen=wseq.length();
942 >     }
943 >  }//for each 5' adapter
944 >  return trimmed;
945   }
946  
947   bool trim_adapter5(GStr& seq, int&l5, int &l3) {
948 + if (adapters5.Count()==0) return false;
949   int rlen=seq.length();
950   l5=0;
951   l3=rlen-1;
952 < //try to see if adapter is fully included in the read
953 < int fi=-1;
954 < if ((fi=seq.index(adapter5))>=0) {
955 <   if (fi<rlen-fi-a5len) {//match is closer to the right end
956 <      l5=fi+a5len;
957 <      l3=rlen-1;
958 <      }
959 <    else {
960 <      l5=0;
961 <      l3=fi-1;
952 > bool trimmed=false;
953 > GStr wseq(seq.chars());
954 > int wlen=rlen;
955 > for (int ai=0;ai<adapters5.Count();ai++) {
956 >  if (adapters5[ai].is_empty()) continue;
957 >  int alen=adapters5[ai].length();
958 >  GStr& aseq=adapters5[ai];
959 >  GXAlnInfo* l_bestaln=match_LeftEnd(aseq.chars(), alen, wseq.chars(), wlen, gxmem_l, 84);
960 >  if (l_bestaln) {
961 >     trimmed=true;
962 >     l5+=l_bestaln->sr;
963 >     delete l_bestaln;
964 >     if (l5>=rlen) l5=rlen-1;
965 >     if (l3-l5+1<min_read_len) return true;
966 >     wseq=seq.substr(l5,l3-l5+1);
967 >     wlen=wseq.length();
968 >     }
969 >  }//for each 5' adapter
970 >  return trimmed;
971 > }
972 >
973 > //convert qvs to/from phred64 from/to phread33
974 > void convertPhred(GStr& q) {
975 > for (int i=0;i<q.length();i++) q[i]+=qv_cvtadd;
976 > }
977 >
978 > void convertPhred(char* q, int len) {
979 > for (int i=0;i<len;i++) q[i]+=qv_cvtadd;
980 > }
981 >
982 > bool getFastxRec(GLineReader& fq, GStr& rseq, GStr& rqv,
983 >          GStr& rname, GStr& rinfo, GStr& infname) {
984 > rseq="";
985 > rqv="";
986 > rname="";
987 > rinfo="";
988 > if (fq.eof()) return false;
989 > char* l=fq.getLine();
990 > while (l!=NULL && (l[0]==0 || isspace(l[0]))) l=fq.getLine(); //ignore empty lines
991 > if (l==NULL) return false;
992 > /* if (rawFormat) {
993 >      //TODO: implement raw qseq parsing here
994 >      //if (raw type=N) then continue; //skip invalid/bad records
995 >      } //raw qseq format
996 > else { // FASTQ or FASTA */
997 > isfasta=(l[0]=='>');
998 > if (!isfasta && l[0]!='@') GError("Error: fasta/fastq record marker not found(%s)\n%s\n",
999 >      infname.chars(), l);
1000 > GStr s(l);
1001 > rname=&(l[1]);
1002 > for (int i=0;i<rname.length();i++)
1003 >    if (rname[i]<=' ') {
1004 >       if (i<rname.length()-2) rinfo=rname.substr(i+1);
1005 >       rname.cut(i);
1006 >       break;
1007 >       }
1008 >  //now get the sequence
1009 > if ((l=fq.getLine())==NULL)
1010 >      GError("Error: unexpected EOF after header for read %s (%s)\n",
1011 >                   rname.chars(), infname.chars());
1012 > rseq=l; //this must be the DNA line
1013 > while ((l=fq.getLine())!=NULL) {
1014 >      //seq can span multiple lines
1015 >      if (l[0]=='>' || l[0]=='+') {
1016 >           fq.pushBack();
1017 >           break; //
1018 >           }
1019 >      rseq+=l;
1020 >      } //check for multi-line seq
1021 > if (!isfasta) { //reading fastq quality values, which can also be multi-line
1022 >    if ((l=fq.getLine())==NULL)
1023 >        GError("Error: unexpected EOF after sequence for %s\n", rname.chars());
1024 >    if (l[0]!='+') GError("Error: fastq qv header marker not detected!\n");
1025 >    if ((l=fq.getLine())==NULL)
1026 >        GError("Error: unexpected EOF after qv header for %s\n", rname.chars());
1027 >    rqv=l;
1028 >    //if (rqv.length()!=rseq.length())
1029 >    //  GError("Error: qv len != seq len for %s\n", rname.chars());
1030 >    while (rqv.length()<rseq.length() && ((l=fq.getLine())!=NULL)) {
1031 >      rqv+=l; //append to qv string
1032        }
1033 <   return true;
1034 <   }
1035 < //for fast detection of adapter-rich reads, check if the first 12
1036 < //characters of the read are a substring of the adapter
1037 < GStr rstart=seq.substr(1,12);
1038 < if ((fi=adapter5.index(rstart))>=0) {
1039 <   //l3=rlen-1;
1040 <   //l5=a5len-4;
1041 <   //while (fi+l5<a5len && l5<l3 && adapter5[fi+l5]==seq[l5]) l5++;
1042 <   //return true;
1043 <   extendMatch(seq.chars(), rlen, 1,
1044 <                 adapter5.chars(), a5len, fi,  12, l5,l3);
1045 <   return true;
1033 >    }// fastq
1034 > // } //<-- FASTA or FASTQ
1035 > rseq.upper();
1036 > return true;
1037 > }
1038 >
1039 > char process_read(GStr& rname, GStr& rseq, GStr& rqv, int &l5, int &l3) {
1040 > //returns 0 if the read was untouched, 1 if it was just trimmed
1041 > // and a trash code if it was trashed
1042 > l5=0;
1043 > l3=rseq.length()-1;
1044 > #ifdef GDEBUG
1045 >   GMessage(">%s\n", rname.chars());
1046 >   GMessage("==>%s\n",rseq.chars());
1047 > #endif
1048 > if (l3-l5+1<min_read_len) {
1049 >   return 's';
1050     }
1051 <  
1052 < //another easy case: last 12 characters of the adaptor found as a substring of the read
1053 < int aplen=12;
1054 < int apstart=a5len-aplen-2;
1055 < if (apstart<0) { apstart=0; aplen=a5len-1; }
1056 < GStr bstr=adapter5.substr(apstart, aplen);
1057 < if ((fi=seq.index(bstr))>=0) {
1058 <   extendMatch(seq.chars(), rlen, fi,
1059 <                 adapter5.chars(), a5len, apstart,  aplen, l5,l3);
1060 <   return true;
1051 > GStr wseq(rseq.chars());
1052 > GStr wqv(rqv.chars());
1053 > int w5=l5;
1054 > int w3=l3;
1055 > //first do the q-based trimming
1056 > if (qvtrim_qmin!=0 && !wqv.is_empty() && qtrim(wqv, w5, w3)) { // qv-threshold trimming
1057 >   if (w3-w5+1<min_read_len) {
1058 >     return 'Q'; //invalid read
1059 >     }
1060 >    //-- keep only the w5..w3 range
1061 >   l5=w5;
1062 >   l3=w3;
1063 >   wseq=wseq.substr(w5, w3-w5+1);
1064 >   if (!wqv.is_empty())
1065 >      wqv=wqv.substr(w5, w3-w5+1);
1066 >   } //qv trimming
1067 > // N-trimming on the remaining read seq
1068 > if (ntrim(wseq, w5, w3)) {
1069 >   //GMessage("before: %s\n",wseq.chars());
1070 >   //GMessage("after : %s (%d)\n",wseq.substr(w5,w3-w5+1).chars(),w3-w5+1);
1071 >   l5+=w5;
1072 >   l3-=(wseq.length()-1-w3);
1073 >   if (w3-w5+1<min_read_len) {
1074 >     return 'N'; //to be trashed
1075 >     }
1076 >    //-- keep only the w5..w3 range
1077 >   wseq=wseq.substr(w5, w3-w5+1);
1078 >   if (!wqv.is_empty())
1079 >      wqv=wqv.substr(w5, w3-w5+1);
1080 >   w5=0;
1081 >   w3=wseq.length()-1;
1082     }
1083 < //no easy cases, find a triplet match as a seed for alignment extension
1084 < //find triplets at the right end of the adapter
1085 < for (int iw=0;iw<6;iw++) {
1086 <   apstart=a5len-iw-4;
1087 <   GStr aword=adapter5.substr(apstart,3);
1088 <   if ((fi=seq.index(aword))>=0) {
1089 <      if (extendMatch(seq.chars(), rlen, fi, adapter5.chars(),
1090 <                   a5len, apstart, 3, l5,l3)) {
775 <                     return true;
776 <                     }
1083 > char trim_code;
1084 > do {
1085 >  trim_code=0;
1086 >  if (trim_poly5(wseq, w5, w3, polyA_seed)) {
1087 >      trim_code='A';
1088 >      #ifdef GDEBUG
1089 >       GMessage("\t-trimmed poly-A at 5' end\n");
1090 >      #endif
1091        }
1092 +  else if (trim_poly5(wseq, w5, w3, polyT_seed)) {
1093 +      trim_code='T';
1094 +      #ifdef GDEBUG
1095 +       GMessage("\t-trimmed poly-T at 5' end\n");
1096 +      #endif
1097 +      }
1098 +  else if (trim_adapter5(wseq, w5, w3)) {
1099 +      trim_code='5';
1100 +      #ifdef GDEBUG
1101 +       GMessage("\t-trimmed adapter at 5' end\n");
1102 +      #endif
1103 +      }
1104 +  if (trim_code) {
1105 +     int trimlen=wseq.length()-(w3-w5+1);
1106 +     num_trimmed5++;
1107 +     if (trimlen<min_trimmed5)
1108 +         min_trimmed5=trimlen;
1109 +     l5+=w5;
1110 +     l3-=(wseq.length()-1-w3);
1111 +     if (w3-w5+1<min_read_len) {
1112 +         return trim_code;
1113 +         }
1114 +      //-- keep only the w5..w3 range
1115 +      wseq=wseq.substr(w5, w3-w5+1);
1116 +      if (!wqv.is_empty())
1117 +         wqv=wqv.substr(w5, w3-w5+1);
1118 +      }// trimmed at 5' end
1119 + } while (trim_code);
1120 +
1121 + do {
1122 +  trim_code=0;
1123 +  if (trim_poly3(wseq, w5, w3, polyA_seed)) {
1124 +      trim_code='A';
1125 +      }
1126 +  else if (trim_poly3(wseq, w5, w3, polyT_seed)) {
1127 +      trim_code='T';
1128 +      }
1129 +  else if (trim_adapter3(wseq, w5, w3)) {
1130 +      trim_code='3';
1131 +      }
1132 +  if (trim_code) {
1133 +     int trimlen=wseq.length()-(w3-w5+1);
1134 +     num_trimmed3++;
1135 +     if (trimlen<min_trimmed3)
1136 +         min_trimmed3=trimlen;
1137 +     l5+=w5;
1138 +     l3-=(wseq.length()-1-w3);
1139 +     if (w3-w5+1<min_read_len) {
1140 +         return trim_code;
1141 +         }
1142 +      //-- keep only the w5..w3 range
1143 +      wseq=wseq.substr(w5, w3-w5+1);
1144 +      if (!wqv.is_empty())
1145 +         wqv=wqv.substr(w5, w3-w5+1);
1146 +      }//trimming at 3' end
1147 + } while (trim_code);
1148 +
1149 +
1150 + if (doCollapse) {
1151 +   //keep read for later
1152 +   FqDupRec* dr=dhash.Find(wseq.chars());
1153 +   if (dr==NULL) { //new entry
1154 +          //if (prefix.is_empty())
1155 +             dhash.Add(wseq.chars(),
1156 +                  new FqDupRec(&wqv, rname.chars()));
1157 +          //else dhash.Add(wseq.chars(), new FqDupRec(wqv.chars(),wqv.length()));
1158 +         }
1159 +      else
1160 +         dr->add(wqv);
1161 +   } //collapsing duplicates
1162 + else { //not collapsing duplicates
1163 +   //apply the dust filter now
1164 +   if (doDust) {
1165 +     int dustbases=dust(wseq);
1166 +     if (dustbases>(wseq.length()>>1)) {
1167 +        return 'D';
1168 +        }
1169 +     }
1170 +   } //not collapsing duplicates
1171 + return (l5>0 || l3<rseq.length()-1) ? 1 : 0;
1172 + }
1173 +
1174 + void printHeader(FILE* f_out, char recmarker, GStr& rname, GStr& rinfo) {
1175 + //GMessage("printing Header..%c%s\n",recmarker, rname.chars());
1176 + if (rinfo.is_empty()) fprintf(f_out, "%c%s\n",recmarker,rname.chars());
1177 +  else fprintf(f_out, "%c%s %s\n",recmarker, rname.chars(), rinfo.chars());
1178 + }
1179 +
1180 + void writeRead(FILE* f_out, GStr& rname, GStr& rinfo, GStr& rseq, GStr& rqv, int& outcounter) {
1181 +   outcounter++;
1182 +   bool asFasta=(rqv.is_empty() || fastaOutput);
1183 +   if (asFasta) {
1184 +    if (prefix.is_empty()) {
1185 +       printHeader(f_out, '>',rname,rinfo);
1186 +       fprintf(f_out, "%s\n", rseq.chars()); //plain one-line fasta for now
1187 +       }
1188 +      else {
1189 +       fprintf(f_out, ">%s%08d\n%s\n", prefix.chars(), outcounter,
1190 +                          rseq.chars());
1191 +       }
1192 +     }
1193 +   else {  //fastq
1194 +    if (convert_phred) convertPhred(rqv);
1195 +    if (prefix.is_empty()) {
1196 +       printHeader(f_out, '@', rname, rinfo);
1197 +       fprintf(f_out, "%s\n+\n%s\n", rseq.chars(), rqv.chars());
1198 +       }
1199 +      else
1200 +       fprintf(f_out, "@%s_%08d\n%s\n+\n%s\n", prefix.chars(), outcounter,
1201 +                          rseq.chars(),rqv.chars() );
1202 +     }
1203 + }
1204 +
1205 + void trash_report(char trashcode, GStr& rname, FILE* freport) {
1206 + if (freport==NULL || trashcode<=' ') return;
1207 + if (trashcode=='3' || trashcode=='5') {
1208 +   fprintf(freport, "%s\ta%c\n",rname.chars(),trashcode);
1209 +   }
1210 + else {
1211 +   fprintf(freport, "%s\t%c\n",rname.chars(),trashcode);
1212     }
1213 < return false; //no adapter parts found
1213 > //tcounter++;
1214 > }
1215 >
1216 > GStr getFext(GStr& s, int* xpos=NULL) {
1217 > //s must be a filename without a path
1218 > GStr r("");
1219 > if (xpos!=NULL) *xpos=0;
1220 > if (s.is_empty() || s=="-") return r;
1221 > int p=s.rindex('.');
1222 > int d=s.rindex('/');
1223 > if (p<=0 || p>s.length()-2 || p<s.length()-7 || p<d) return r;
1224 > r=s.substr(p+1);
1225 > if (xpos!=NULL) *xpos=p+1;
1226 > r.lower();
1227 > return r;
1228   }
1229  
1230 < //conversion of phred64 to phread33
1231 < void convertQ64(GStr& q) {
1232 < for (int i=0;i<q.length();i++) q[i]-=31;
1230 > void baseFileName(GStr& fname) {
1231 > //remove all known extensions, like .txt,fq,fastq,fasta,fa)(.gz .gzip .bz2 .bzip2) .
1232 > int xpos=0;
1233 > GStr fext=getFext(fname, &xpos);
1234 > if (xpos<=1) return;
1235 > bool extdel=false;
1236 > GStr f2;
1237 > if (fext=="z" || fext=="zip") {
1238 >   extdel=true;
1239 >   }
1240 >  else if (fext.length()>=2) {
1241 >   f2=fext.substr(0,2);
1242 >   extdel=(f2=="gz" || f2=="bz");
1243 >   }
1244 > if (extdel) {
1245 >   fname.cut(xpos-1);
1246 >   fext=getFext(fname, &xpos);
1247 >   if (xpos<=1) return;
1248 >   }
1249 > extdel=false;
1250 > if (fext=="f" || fext=="fq" || fext=="txt" || fext=="seq" || fext=="sequence") {
1251 >   extdel=true;
1252 >   }
1253 >  else if (fext.length()>=2) {
1254 >   extdel=(fext.substr(0,2)=="fa");
1255 >   }
1256 > if (extdel) fname.cut(xpos-1);
1257 > GStr fncp(fname);
1258 > fncp.lower();
1259 > fncp.chomp("seq");
1260 > fncp.chomp("sequence");
1261 > fncp.trimR("_.");
1262 > if (fncp.length()<fname.length()) fname.cut(fncp.length());
1263   }
1264  
1265 < void convertQ64(char* q, int len) {
1266 < for (int i=0;i<len;i++) q[i]-=31;
1265 > FILE* prepOutFile(GStr& infname, GStr& pocmd) {
1266 >  FILE* f_out=NULL;
1267 >  GStr fname(getFileName(infname.chars()));
1268 >  //eliminate known extensions
1269 >  baseFileName(fname);
1270 >  if (outsuffix.is_empty() || outsuffix=="-") { return stdout; }
1271 >    else if (pocmd.is_empty()) {
1272 >               GStr oname(fname);
1273 >               oname.append('.');
1274 >               oname.append(outsuffix);
1275 >               f_out=fopen(oname.chars(),"w");
1276 >               if (f_out==NULL) GError("Error: cannot create '%s'\n",oname.chars());
1277 >               }
1278 >            else {
1279 >              GStr oname(">");
1280 >              oname.append(fname);
1281 >              oname.append('.');
1282 >              oname.append(outsuffix);
1283 >              pocmd.append(oname);
1284 >              f_out=popen(pocmd.chars(), "w");
1285 >              if (f_out==NULL) GError("Error: cannot popen '%s'\n",pocmd.chars());
1286 >              }
1287 > return f_out;
1288 > }
1289 >
1290 > void guess_unzip(GStr& fname, GStr& picmd) {
1291 > GStr fext=getFext(fname);
1292 > if (fext=="gz" || fext=="gzip" || fext=="z") {
1293 >    picmd="gzip -cd ";
1294 >    }
1295 >   else if (fext=="bz2" || fext=="bzip2" || fext=="bz" || fext=="bzip") {
1296 >    picmd="bzip2 -cd ";
1297 >    }
1298 > }
1299 >
1300 >
1301 > int loadAdapters(const char* fname) {
1302 >  GLineReader lr(fname);
1303 >  char* l;
1304 >  while ((l=lr.nextLine())!=NULL) {
1305 >   if (lr.length()<=3 || l[0]=='#') continue;
1306 >   if ( l[0]==' ' || l[0]=='\t' || l[0]==',' ||
1307 >        l[0]==';'|| l[0]==':' ) {
1308 >      int i=1;
1309 >      while (l[i]!=0 && isspace(l[i])) {
1310 >        i++;
1311 >        }
1312 >      if (l[i]!=0) {
1313 >        GStr s(&(l[i]));
1314 >        adapters3.Add(s);
1315 >        continue;
1316 >        }
1317 >      }
1318 >    else {
1319 >      GStr s(l);
1320 >      s.startTokenize("\t ;,:");
1321 >      GStr a5,a3;
1322 >      if (s.nextToken(a5))
1323 >         s.nextToken(a3);
1324 >      a5.upper();
1325 >      a3.upper();
1326 >      adapters5.Add(a5);
1327 >      adapters3.Add(a3);
1328 >      }
1329 >   }
1330 >   return adapters5.Count()+adapters3.Count();
1331   }
1332  
1333 + void setupFiles(FILE*& f_in, FILE*& f_in2, FILE*& f_out, FILE*& f_out2,
1334 +                       GStr& s, GStr& infname, GStr& infname2) {
1335 + // uses outsuffix to generate output file names and open file handles as needed
1336 + infname="";
1337 + infname2="";
1338 + f_in=NULL;
1339 + f_in2=NULL;
1340 + f_out=NULL;
1341 + f_out2=NULL;
1342 + //analyze outsuffix intent
1343 + GStr pocmd;
1344 + if (outsuffix=="-") {
1345 +    f_out=stdout;
1346 +    }
1347 +   else {
1348 +    GStr ox=getFext(outsuffix);
1349 +    if (ox.length()>2) ox=ox.substr(0,2);
1350 +    if (ox=="gz") pocmd="gzip -9 -c ";
1351 +        else if (ox=="bz") pocmd="bzip2 -9 -c ";
1352 +    }
1353 + if (s=="-") {
1354 +    f_in=stdin;
1355 +    infname="stdin";
1356 +    f_out=prepOutFile(infname, pocmd);
1357 +    return;
1358 +    } // streaming from stdin
1359 + s.startTokenize(",:");
1360 + s.nextToken(infname);
1361 + bool paired=s.nextToken(infname2);
1362 + if (fileExists(infname.chars())==0)
1363 +    GError("Error: cannot find file %s!\n",infname.chars());
1364 + GStr fname(getFileName(infname.chars()));
1365 + GStr picmd;
1366 + guess_unzip(fname, picmd);
1367 + if (picmd.is_empty()) {
1368 +   f_in=fopen(infname.chars(), "r");
1369 +   if (f_in==NULL) GError("Error opening file '%s'!\n",infname.chars());
1370 +   }
1371 +  else {
1372 +   picmd.append(infname);
1373 +   f_in=popen(picmd.chars(), "r");
1374 +   if (f_in==NULL) GError("Error at popen %s!\n", picmd.chars());
1375 +   }
1376 + if (f_out==stdout) {
1377 +   if (paired) GError("Error: output suffix required for paired reads\n");
1378 +   return;
1379 +   }
1380 + f_out=prepOutFile(infname, pocmd);
1381 + if (!paired) return;
1382 + if (doCollapse) GError("Error: sorry, -C option cannot be used with paired reads!\n");
1383 + // ---- paired reads:-------------
1384 + if (fileExists(infname2.chars())==0)
1385 +     GError("Error: cannot find file %s!\n",infname2.chars());
1386 + picmd="";
1387 + GStr fname2(getFileName(infname2.chars()));
1388 + guess_unzip(fname2, picmd);
1389 + if (picmd.is_empty()) {
1390 +   f_in2=fopen(infname2.chars(), "r");
1391 +   if (f_in2==NULL) GError("Error opening file '%s'!\n",infname2.chars());
1392 +   }
1393 +  else {
1394 +   picmd.append(infname2);
1395 +   f_in2=popen(picmd.chars(), "r");
1396 +   if (f_in2==NULL) GError("Error at popen %s!\n", picmd.chars());
1397 +   }
1398 + f_out2=prepOutFile(infname2, pocmd);
1399 + }

Diff Legend

Removed lines
+ Added lines
< Changed lines
> Changed lines