ViewVC Help
View File | Revision Log | Show Annotations | View Changeset | Root Listing
root/gclib/fqtrim/fqtrim.cpp
(Generate patch)
# Line 2 | Line 2
2   #include "GStr.h"
3   #include "GHash.hh"
4   #include "GList.hh"
5 + #include <ctype.h>
6 + #include "GAlnExtend.h"
7  
8   #define USAGE "Usage:\n\
9 < fqtrim [-5 <5'adapter>] [-3 <3'adapter>] [-l <minlen>] [-q <minqv>] [-C] [-D]\\\n\
10 <   [-p {64|33}] [-n <rename_prefix>] [-o <trimmed.fq>] [-r <discarded.lst>]\\\n\
11 <   [-Q] <input.fq>\n\
9 > fqtrim [{-5 <5adapter> -3 <3adapter>|-f <adapters_file>}] [-a <min_matchlen>]\\\n\
10 >   [-q <minq> [-t <trim_max_len>]] [-p {64|33}] [-o <outsuffix>]\\\n\
11 >   [-l <minlen>] [-C] [-D] [-Q] [-n <rename_prefix>] [-r <discarded.lst>]\\\n\
12 >    <input.fq>[,<input_mates.fq>\n\
13   \n\
14 < Trim low quality bases at 3' end, optionally trim adapter sequence, filter\n\
15 < for low complexity and collapse duplicate reads\n\
14 > Trim low quality bases at the 3' end and can trim adapter sequence(s), filter\n\
15 > for low complexity and collapse duplicate reads.\n\
16 > If read pairs should be trimmed and kept together (i.e. without discarding\n\
17 > one read in a pair), the two file names should be given delimited by a comma\n\
18 > or a colon character\n\
19   \n\
20   Options:\n\
21   -n  rename all the reads using the <prefix> followed by a read counter;\n\
22      if -C option was given, the suffix \"_x<N>\" is appended, with <N> being\n\
23      the read duplication count\n\
24 < -o  write the trimmed/filtered fastq into <trimmed.fq>(instead of stdout)\n\
24 > -o  unless this parameter is '-', write the trimmed/filtered reads to \n\
25 >    file(s) named <input>.<outsuffix> which will be created in the \n\
26 >    current (working) directory; (writes to stdout if -o- is given);\n\
27 >    a suffix ending with .gz, .gzip or .bz2 will enforce compression\n\
28 > -f  file with adapter sequences to trim, each line having this format:\n\
29 >    <5'-adapter-sequence> <3'-adapter-sequence>\n\
30   -5  trim the given adapter or primer sequence at the 5' end of each read\n\
31      (e.g. -5 CGACAGGTTCAGAGTTCTACAGTCCGACGATC)\n\
32   -3  trim the given adapter sequence at the 3' end of each read\n\
33      (e.g. -3 TCGTATGCCGTCTTCTGCTTG)\n\
34 < -q  trim bases with quality value lower than <minq> (starting the 3' end)\n\
34 > -A  disable polyA trimming (enabled by default)\n\
35 > -T  enable polyT trimming (disabled by default)\n\
36 > -y  minimum length of exact match to adaptor sequence at the proper end (6)\n\
37 > -q  trim bases with quality value lower than <minq> (starting at the 3' end)\n\
38 > -t  for -q option, maximum trimming at the 3' end is limited to <trim_max_len>\n\
39   -m  maximum percentage of Ns allowed in a read after trimming (default 7)\n\
40   -l  minimum \"clean\" length after trimming that a read must have\n\
41      in order to pass the filter (default: 16)\n\
# Line 34 | Line 49
49   -Q  convert quality values to the other Phred qv type\n\
50   -V  verbose processing\n\
51   "
52 +
53 + //-z  for -o option, the output stream(s) will be first piped into the given\n
54 + //   <zcmd> command, which must output to stdout (e.g. -z 'bzip2 -9 -c')\n
55 +
56 +
57   // example 3' adapter for miRNAs: TCGTATGCCGTCTTCTGCTTG
58  
59 < //For pair ends sequencing:
59 > //For paired reads sequencing:
60   //3' : ACACTCTTTCCCTACACGACGCTCTTCCGATCT
61   //5' : GATCGGAAGAGCGGTTCAGCAGGAATGCCGAG
62 < FILE* f_out=NULL; //stdout if not provided
63 < FILE* f_in=NULL; //input fastq (stdin if not provided)
62 > //FILE* f_out=NULL; //stdout if not provided
63 > //FILE* f_out2=NULL; //for paired reads
64 > //FILE* f_in=NULL; //input fastq (stdin if not provided)
65 > //FILE* f_in2=NULL; //for paired reads
66   FILE* freport=NULL;
67 +
68   bool debug=false;
69   bool verbose=false;
70   bool doCollapse=false;
71   bool doDust=false;
72 + bool fastaOutput=false;
73   bool trashReport=false;
74   //bool rawFormat=false;
75   int min_read_len=16;
# Line 53 | Line 77
77   int dust_cutoff=16;
78   bool isfasta=false;
79   bool convert_phred=false;
80 + GStr outsuffix; // -o
81   GStr prefix;
82 < GStr adapter3;
83 < GStr adapter5;
84 <
85 < int qvtrim_min=0;
82 > GStr zcmd;
83 > int num_trimmed5=0;
84 > int num_trimmed3=0;
85 > int num_trimmedN=0;
86 > int num_trimmedQ=0;
87 > int min_trimmed5=INT_MAX;
88 > int min_trimmed3=INT_MAX;
89  
90 + int qvtrim_qmin=0;
91 + int qvtrim_max=0;  //for -q, do not trim at 3'-end more than this number of bases
92   int qv_phredtype=0; // could be 64 or 33 (0 means undetermined yet)
93   int qv_cvtadd=0; //could be -31 or +31
94  
95 < int a3len=0;
96 < int a5len=0;
97 < // adaptor matching metrics -- for extendMatch() function
98 < const int a_m_score=2; //match score
99 < const int a_mis_score=-3; //mismatch
100 < const int a_dropoff_score=7;
101 < const int a_min_score=8; //an exact match of 4 bases at the proper ends WILL be trimmed
102 < const int a_min_chain_score=15; //for gapped alignments
103 <
104 < class CSegChain;
105 <
106 < class CSegPair {
107 <  public:
78 <   GSeg a;
79 <   GSeg b; //the adapter segment
80 <   int score;
81 <   int flags;
82 <   CSegChain* chain;
83 <   CSegPair(int astart=0, int aend=0, int bstart=0, int bend=0, int mscore=0):a(astart,aend),b(bstart, bend) {
84 <      score=mscore;
85 <      if (score==0) score=a.len()*a_m_score;
86 <      flags=0;
87 <      chain=NULL;
88 <      }
89 <   int len() { return  a.len(); }
90 <   bool operator==(CSegPair& d){
91 <      //return (a.start==d.a.start && a.end==d.a.end && b.start==d.b.start && b.end==d.b.end);
92 <      //make equal even segments that are included into one another:
93 <      return (d.a.start>=a.start && d.a.end<=a.end && d.b.start>=b.start && d.b.end<=b.end);
95 > // adaptor matching metrics -- for X-drop ungapped extension
96 > const int poly_m_score=2; //match score
97 > const int poly_mis_score=-3; //mismatch
98 > const int poly_dropoff_score=7;
99 > int poly_minScore=12; //i.e. an exact match of 6 bases at the proper ends WILL be trimmed
100 >
101 > const char *polyA_seed="AAAA";
102 > const char *polyT_seed="TTTT";
103 >
104 > struct CAdapters {
105 >    GStr a5;
106 >    GStr a3;
107 >    CAdapters(const char* s5=NULL, const char* s3=NULL):a5(s5),a3(s3) {
108        }
109 <   bool operator>(CSegPair& d){ //ordering based on b (adaptor) start coord and score
96 <     if (b.start==d.b.start) {
97 <        if (score==d.score) {
98 <           //just try to be consistent:
99 <           if (b.end==d.b.end) {
100 <             return (a.start==d.a.start)?(a.end<d.a.end):(a.start<d.a.start);
101 <             }
102 <           return (b.end>d.b.end);
103 <           }
104 <         else return (score<d.score);
105 <        }
106 <     return (b.start>d.b.start);
107 <     }
108 <   bool operator<(CSegPair& d){ //ordering based on b (adaptor) coord
109 <     /*if (b.start==d.b.start && b.end==d.b.end) {
110 <          return (a.start==d.a.start)?(a.end<d.a.end):(a.start<d.a.start);
111 <          }
112 <     return (b.start==d.b.start)?(b.end<d.b.end):(b.start<d.b.start);*/
113 <     if (b.start==d.b.start) {
114 <        if (score==d.score) {
115 <           //just try to be consistent:
116 <           if (b.end==d.b.end) {
117 <             return (a.start==d.a.start)?(a.end>d.a.end):(a.start>d.a.start);
118 <             }
119 <           return (b.end<d.b.end);
120 <           }
121 <         else return (score>d.score);
122 <        }
123 <     return (b.start<d.b.start);
124 <     }
125 < };
126 <
127 < int cmpSegEnds(pointer sa, pointer sb) { //sort by adaptor seg ends AND score
128 < CSegPair& x = *(CSegPair *)sa;
129 < CSegPair& y = *(CSegPair *)sb;
130 < /*
131 < if (x.b.end==y.b.end) {
132 <     if (x.b.start==y.b.start) {
133 <         if (x.a.end==y.a.end) {
134 <            if (x.a.start==y.a.start) return 0;
135 <            return ((x.a.start>y.a.start) ? -1 : 1);
136 <            }
137 <          else {
138 <            return ((x.a.end>y.a.end) ? -1 : 1);
139 <            }
140 <          }
141 <      else {
142 <       return ((x.b.start>y.b.start) ? -1 : 1);
143 <       }
144 <     }
145 <    else {
146 <     return ((x.b.end>y.b.end) ? -1 : 1);
147 <     }
148 < */
149 <  if (x.b.end==y.b.end) {
150 <     if (x.score==y.score) {
151 <     if (x.b.start==y.b.start) {
152 <         if (x.a.end==y.a.end) {
153 <            if (x.a.start==y.a.start) return 0;
154 <            return ((x.a.start<y.a.start) ? -1 : 1);
155 <            }
156 <          else {
157 <            return ((x.a.end<y.a.end) ? -1 : 1);
158 <            }
159 <          }
160 <      else {
161 <       return ((x.b.start<y.b.start) ? -1 : 1);
162 <       }
163 <      } else return ((x.score>y.score) ? -1 : 1);
164 <     }
165 <    else {
166 <     return ((x.b.end>y.b.end) ? -1 : 1);
167 <     }
168 <
169 < }
109 > };
110  
111 < class CSegChain:public GList<CSegPair> {
172 < public:
173 <   uint astart;
174 <   uint aend;
175 <   uint bstart;
176 <   uint bend;
177 <   int score;
178 <   bool endSort;
179 <  CSegChain(bool aln5=false):GList<CSegPair>(true,true,true) {//sorted, free elements, unique
180 <   //as SegPairs are inserted, they will be sorted by a.start coordinate
181 <   score=0;
182 <   astart=MAX_UINT;
183 <   aend=0;
184 <   bstart=MAX_UINT;
185 <   bend=0;
186 <   endSort=aln5;
187 <   if (aln5) { setSorted(cmpSegEnds); }
188 <   }
189 < bool operator==(CSegChain& d) {
190 <   //return (score==d.score);
191 <    return (astart==d.astart && aend==d.aend && bstart==d.bstart && bend==d.bend);
192 <   }
193 < bool operator>(CSegChain& d) { // order based on b (adaptor) coordinate
194 <   //return (score<d.score);
195 <   if (bstart==d.bstart && bend==d.bend) {
196 <          return (astart==d.astart)?(aend>d.aend):(astart>d.astart);
197 <          }
198 <     return (bstart==d.bstart)?(bend>d.bend):(bstart>d.bstart);
199 <   }
200 < bool operator<(CSegChain& d) {
201 <   //return (score>d.score);
202 <   if (bstart==d.bstart && bend==d.bend) {
203 <          return (astart==d.astart)?(aend<d.aend):(astart<d.astart);
204 <          }
205 <     return (bstart==d.bstart)?(bend<d.bend):(bstart<d.bstart);
206 <   }
207 < void addSegPair(CSegPair* segp) {
208 <   if (AddIfNew(segp)!=segp) return;
209 <   score+=segp->score;
210 <   if (astart>segp->a.start) astart=segp->a.start;
211 <   if (aend<segp->a.end) aend=segp->a.end;
212 <   if (bstart>segp->b.start) bstart=segp->b.start;
213 <   if (bend<segp->b.end) bend=segp->b.end;
214 <   }
215 < //for building actual chains:
216 < bool extendChain(CSegPair* segp) { //segp expected to be "Greater Than" current chain
217 <   int bgap=0;
218 <   int agap=0;
219 <   //if (endSort) {
220 <   if (bstart>segp->b.start) {
221 <      bgap = (int)(bstart-segp->b.end);
222 <      if (abs(bgap)>2) return false;
223 <      agap = (int)(astart-segp->a.end);
224 <      if (abs(agap)>2) return false;
225 <      }
226 <     else {
227 <      bgap = (int) (segp->b.start-bend);
228 <      if (abs(bgap)>2) return false;
229 <      agap = (int)(segp->a.start-aend);
230 <      if (abs(agap)>2) return false;
231 <      }
232 <   if (agap*bgap<0) return false;
233 <   addSegPair(segp);
234 <   score-=abs(agap)+abs(bgap);
235 <   return true;
236 <   }
237 < };
111 > GPVec<CAdapters> adapters;
112  
113   // element in dhash:
114   class FqDupRec {
# Line 273 | Line 147
147    if (!s.is_empty()) {
148        if (s=='-') f=stdout;
149        else {
150 <       f=fopen(s,"w");
150 >       f=fopen(s.chars(),"w");
151         if (f==NULL) GError("Error creating file: %s\n", s.chars());
152         }
153       }
# Line 284 | Line 158
158  
159   GHash<FqDupRec> dhash; //hash to keep track of duplicates
160  
161 + int loadAdapters(const char* fname);
162 +
163 + void setupFiles(FILE*& f_in, FILE*& f_in2, FILE*& f_out, FILE*& f_out2,
164 +                       GStr& s, GStr& infname, GStr& infname2);
165 + // uses outsuffix to generate output file names and open file handles as needed
166 +
167 + void writeRead(FILE* f_out, GStr& rname, GStr& rinfo, GStr& rseq, GStr& rqv, int& outcounter);
168 + void trash_report(char trashcode, GStr& rname, FILE* freport);
169 +
170 + bool getFastxRec(GLineReader& fq, GStr& rseq, GStr& rqv,
171 +          GStr& rname, GStr& rinfo, GStr& infname);
172 +
173 + char process_read(GStr& rname, GStr& rseq, GStr& rqv, int &l5, int &l3);
174 + //returns 0 if the read was untouched, 1 if it was trimmed and a trash code if it was trashed
175 +
176   bool ntrim(GStr& rseq, int &l5, int &l3); //returns true if any trimming occured
177   bool qtrim(GStr& qvs, int &l5, int &l3); //return true if any trimming occured
178   int dust(GStr& seq);
179 < bool trim_adapter3(GStr& seq, int &l5, int &l3); //returns true if any trimming occured
179 > bool trim_poly5(GStr &seq, int &l5, int &l3, const char* poly_seed); //returns true if any trimming occured
180 > bool trim_poly3(GStr &seq, int &l5, int &l3, const char* poly_seed);
181   bool trim_adapter5(GStr& seq, int &l5, int &l3); //returns true if any trimming occured
182 + bool trim_adapter3(GStr& seq, int &l5, int &l3);
183  
184   void convertPhred(char* q, int len);
185   void convertPhred(GStr& q);
186  
187   int main(int argc, char * const argv[]) {
188 <  GArgs args(argc, argv, "YQDCVl:d:3:5:m:n:r:p:q:o:");
188 >  GArgs args(argc, argv, "YQDCVl:d:3:5:m:n:r:p:q:f:t:o:z:a:");
189    int e;
299  int icounter=0; //counter for input reads
300  int outcounter=0; //counter for output reads
190    if ((e=args.isError())>0) {
191        GMessage("%s\nInvalid argument: %s\n", USAGE, argv[e]);
192        exit(224);
193        }
194    debug=(args.getOpt('Y')!=NULL);
195 <  debug=(args.getOpt('V')!=NULL);
195 >  verbose=(args.getOpt('V')!=NULL);
196    convert_phred=(args.getOpt('Q')!=NULL);
197    doCollapse=(args.getOpt('C')!=NULL);
198    doDust=(args.getOpt('D')!=NULL);
# Line 327 | Line 216
216       }
217    s=args.getOpt('q');
218    if (!s.is_empty()) {
219 <     qvtrim_min=s.asInt();
219 >     qvtrim_qmin=s.asInt();
220 >     }
221 >  s=args.getOpt('t');
222 >  if (!s.is_empty()) {
223 >     qvtrim_max=s.asInt();
224       }
225    s=args.getOpt('p');
226    if (!s.is_empty()) {
# Line 343 | Line 236
236         else
237           GMessage("%s\nInvalid value for -p option (can only be 64 or 33)!\n",USAGE);
238       }
239 <  if (args.getOpt('3')!=NULL) {
240 <    adapter3=args.getOpt('3');
241 <    adapter3.upper();
242 <    a3len=adapter3.length();
243 <    }
244 <  if (args.getOpt('5')!=NULL) {
245 <    adapter5=args.getOpt('5');
246 <    adapter5.upper();
247 <    a5len=adapter5.length();
239 >  s=args.getOpt('f');
240 >  if (!s.is_empty()) {
241 >   loadAdapters(s.chars());
242 >   }
243 >  bool fileAdapters=adapters.Count();
244 >  s=args.getOpt('5');
245 >  if (!s.is_empty()) {
246 >    if (fileAdapters)
247 >      GError("Error: options -5 and -f cannot be used together!\n");
248 >    s.upper();
249 >    adapters.Add(new CAdapters(s.chars()));
250 >    }
251 >  s=args.getOpt('3');
252 >  if (!s.is_empty()) {
253 >    if (fileAdapters)
254 >      GError("Error: options -3 and -f cannot be used together!\n");
255 >    s.upper();
256 >    if (adapters.Count()>0)
257 >          adapters[0]->a3=s.chars();
258 >     else adapters.Add(NULL, new CAdapters(s.chars()));
259      }
260 +  s=args.getOpt('y');
261 +  if (!s.is_empty()) {
262 +     int minmatch=s.asInt();
263 +     poly_minScore=minmatch*poly_m_score;
264 +     }
265 +  
266 +  if (args.getOpt('o')!=NULL) outsuffix=args.getOpt('o');
267 +                         else outsuffix="-";
268    trashReport=  (args.getOpt('r')!=NULL);
269 <  if (args.startNonOpt()==0) {
269 >  int fcount=args.startNonOpt();
270 >  if (fcount==0) {
271      GMessage(USAGE);
272      exit(224);
273      }
274 <
275 <  openfw(f_out, args, 'o');
276 <  if (f_out==NULL) f_out=stdout;
274 >   if (fcount>1 && doCollapse) {
275 >    GError("%s Sorry, the -C option only works with a single input.\n", USAGE);
276 >    }
277 >  //openfw(f_out, args, 'o');
278 >  //if (f_out==NULL) f_out=stdout;
279    if (trashReport)
280      openfw(freport, args, 'r');
281    char* infile=NULL;
282    while ((infile=args.nextNonOpt())!=NULL) {
283 <    GStr infname(infile);
284 <    if (strcmp(infile,"-")==0) {
285 <       f_in=stdin; infname="stdin"; }
286 <     else {
287 <        f_in=fopen(infile,"r");
288 <        if (f_in==NULL)
289 <            GError("Cannot open input file %s!\n",infile);
290 <        }
291 <     GLineReader fq(f_in);
292 <     char* l=NULL;
293 <     while ((l=fq.getLine())!=NULL) {
294 <        GStr rname; //current read name
295 <        GStr rseq;  //current read sequence
296 <        GStr rqv;   //current read quality values
297 <        GStr s;
298 <        /* if (rawFormat) {
299 <          //TODO: implement qseq parsing here
300 <          //if (raw type=N) then continue; //skip invalid/bad records
301 <          
302 <          } //raw qseq format
303 <         else { // FASTQ or FASTA */
304 <          isfasta=(l[0]=='>');
305 <          if (!isfasta && l[0]!='@') GError("Error: fastq record marker not detected!\n");
306 <          s=l;
307 <          rname=&(l[1]);
308 <          icounter++;
309 <          for (int i=0;i<rname.length();i++)
310 <            if (rname[i]<=' ') { rname.cut(i); break; }
311 <          //now get the sequence
312 <          if ((l=fq.getLine())==NULL)
313 <              GError("Error: unexpected EOF after header for %s\n",rname.chars());
314 <          rseq=l; //this must be the DNA line
315 <          while ((l=fq.getLine())!=NULL) {
316 <              //seq can span multiple lines
317 <              if (l[0]=='>' || l[0]=='+') {
318 <                   fq.pushBack();
319 <                   break; //
283 >    int incounter=0; //counter for input reads
284 >    int outcounter=0; //counter for output reads
285 >    int trash_s=0; //too short from the get go
286 >    int trash_Q=0;
287 >    int trash_N=0;
288 >    int trash_D=0;
289 >    int trash_A3=0;
290 >    int trash_A5=0;
291 >    s=infile;
292 >    GStr infname;
293 >    GStr infname2;
294 >    FILE* f_in=NULL;
295 >    FILE* f_in2=NULL;
296 >    FILE* f_out=NULL;
297 >    FILE* f_out2=NULL;
298 >    bool paired_reads=false;
299 >    setupFiles(f_in, f_in2, f_out, f_out2, s, infname, infname2);
300 >    GLineReader fq(f_in);
301 >    GLineReader* fq2=NULL;
302 >    if (f_in2!=NULL) {
303 >       fq2=new GLineReader(f_in2);
304 >       paired_reads=true;
305 >       }
306 >    GStr rseq, rqv, seqid, seqinfo;
307 >    GStr rseq2, rqv2, seqid2, seqinfo2;
308 >    while (getFastxRec(fq, rseq, rqv, seqid, seqinfo, infname)) {
309 >       incounter++;
310 >       int a5=0, a3=0, b5=0, b3=0;
311 >       char tcode=0, tcode2=0;
312 >       tcode=process_read(seqid, rseq, rqv, a5, a3);
313 >       //if (!doCollapse) {
314 >         if (fq2!=NULL) {
315 >            getFastxRec(*fq2, rseq2, rqv2, seqid2, seqinfo2, infname2);
316 >            if (seqid.substr(0,seqid.length()-1)!=seqid2.substr(0,seqid2.length()-1)) {
317 >               GError("Error: no paired match for read %s vs %s (%s,%s)\n",
318 >                  seqid.chars(), seqid2.chars(), infname.chars(), infname2.chars());
319 >               }
320 >            tcode2=process_read(seqid2, rseq2, rqv2, b5, b3);
321 >            //decide what to do with this pair and print rseq2 if the pair makes it
322 >            if (tcode>1 && tcode2<=1) {
323 >               //"untrash" rseq
324 >               if (a3-a5+1<min_read_len) {
325 >                   a5=1;
326 >                   if (a3<min_read_len) { a3= GMIN(rseq.length()-1, min_read_len+1); }
327                     }
328 <              rseq+=l;
329 <              } //check for multi-line seq
330 <          if (!isfasta) { //reading fastq quality values, which can also be multi-line
331 <            if ((l=fq.getLine())==NULL)
332 <                GError("Error: unexpected EOF after sequence for %s\n", rname.chars());
333 <            if (l[0]!='+') GError("Error: fastq qv header marker not detected!\n");
334 <            if ((l=fq.getLine())==NULL)
335 <                GError("Error: unexpected EOF after qv header for %s\n", rname.chars());
336 <            rqv=l;
337 <            //if (rqv.length()!=rseq.length())
338 <            //  GError("Error: qv len != seq len for %s\n", rname.chars());
339 <            while (rqv.length()<rseq.length() && ((l=fq.getLine())!=NULL)) {
340 <              rqv+=l; //append to qv string
341 <              }
420 <            }// fastq
421 <        // } //<-- FASTA or FASTQ
422 <        rseq.upper();
423 <        int l5=0;
424 <        int l3=rseq.length()-1;
425 <        if (l3-l5+1<min_read_len) {
426 <           if (trashReport) {
427 <                  fprintf(freport, "%s\ts\t%s\n",rname.chars(), rseq.chars());
428 <                  }
429 <           continue;
430 <           }
431 <        if (ntrim(rseq, l5, l3)) { // N-trimming
432 <           //GMessage("before: %s\n",rseq.chars());
433 <           //GMessage("after : %s (%d)\n",rseq.substr(l5,l3-l5+1).chars(),l3-l5+1);
434 <           if (l3-l5+1<min_read_len) {
435 <             if (trashReport) {
436 <                    fprintf(freport, "%s\tN\t%s\n", rname.chars(), rseq.chars());
437 <                    }
438 <             continue; //invalid read
439 <             }
440 <            //-- keep only the l5..l3 range
441 <           rseq=rseq.substr(l5, l3-l5+1);
442 <           if (!rqv.is_empty())
443 <              rqv=rqv.substr(l5, l3-l5+1);
444 <           l5=0;
445 <           l3=rseq.length()-1;
446 <           }
447 <        if (qvtrim_min!=0 && !rqv.is_empty() && qtrim(rqv, l5, l3)) { // qv-threshold trimming
448 <           if (l3-l5+1<min_read_len) {
449 <             if (trashReport) {
450 <                    fprintf(freport, "%s\tQ\t%s\n", rname.chars(), rseq.chars());
451 <                    }
452 <             continue; //invalid read
453 <             }
454 <            //-- keep only the l5..l3 range
455 <           rseq=rseq.substr(l5, l3-l5+1);
456 <           if (!rqv.is_empty())
457 <              rqv=rqv.substr(l5, l3-l5+1);
458 <           } //qv trimming
459 <        if (a3len>0) {
460 <          if (trim_adapter3(rseq, l5, l3)) {
461 <             if (l3-l5+1<min_read_len) {
462 <                 if (trashReport) {
463 <                     fprintf(freport, "%s\tA3\t%s\n",rname.chars(), rseq.chars());
464 <                     }
465 <                 continue;
466 <                 }
467 <              //-- keep only the l5..l3 range
468 <              rseq=rseq.substr(l5, l3-l5+1);
469 <              if (!rqv.is_empty())
470 <                 rqv=rqv.substr(l5, l3-l5+1);
471 <              }//some adapter was trimmed
472 <           } //adapter trimming
473 <        if (a5len>0) {
474 <          if (trim_adapter5(rseq, l5, l3)) {
475 <             if (l3-l5+1<min_read_len) {
476 <                 if (trashReport) {
477 <                     fprintf(freport, "%s\tA5\t%s\n",rname.chars(), rseq.chars());
478 <                     }
479 <                 continue;
328 >               tcode=1;
329 >               }
330 >             else if (tcode<=1 && tcode2>1) {
331 >               //"untrash" rseq2
332 >               if (b3-b5+1<min_read_len) {
333 >                   b5=1;
334 >                   if (b3<min_read_len) { b3= GMIN((rseq2.length()-1),(min_read_len+1)); }
335 >                   }
336 >               tcode2=1;
337 >               }
338 >            if (tcode<=1) { //trimmed or left intact -- write it!
339 >               if (tcode>0) {
340 >                 rseq2=rseq2.substr(b5,b3-b5+1);
341 >                 if (!rqv2.is_empty()) rqv2=rqv2.substr(b5,b3-b5+1);
342                   }
343 <              //-- keep only the l5..l3 range
344 <              rseq=rseq.substr(l5, l3-l5+1);
345 <              if (!rqv.is_empty())
346 <                 rqv=rqv.substr(l5, l3-l5+1);
347 <              }//some adapter was trimmed
348 <           } //adapter trimming
349 <        if (doCollapse) {
350 <           //keep read for later
351 <           FqDupRec* dr=dhash.Find(rseq.chars());
352 <           if (dr==NULL) { //new entry
353 <                  //if (prefix.is_empty())
354 <                     dhash.Add(rseq.chars(),
355 <                          new FqDupRec(&rqv, rname.chars()));
356 <                  //else dhash.Add(rseq.chars(), new FqDupRec(rqv.chars(),rqv.length()));
343 >               int nocounter=0;
344 >               writeRead(f_out2, seqid2, seqinfo2, rseq2, rqv2, nocounter);
345 >               }
346 >            } //paired read
347 >       // }
348 >       if (tcode>1) { //trashed
349 >          if (tcode=='s') trash_s++;
350 >            else if (tcode=='Q') trash_Q++;
351 >              else if (tcode=='N') trash_N++;
352 >               else if (tcode=='D') trash_D++;
353 >                else if (tcode=='3') trash_A3++;
354 >                 else if (tcode=='5') trash_A5++;
355 >          if (trashReport) trash_report(tcode, seqid, freport);
356 >          }
357 >         else if (!doCollapse) { //write it
358 >          if (tcode>0) {
359 >            rseq=rseq.substr(a5,a3-a5+1);
360 >            if (!rqv.is_empty()) rqv=rqv.substr(a5,a3-a5+1);
361 >            }
362 >          writeRead(f_out, seqid, seqinfo, rseq, rqv, outcounter);
363 >          }
364 >       } //for each fastq record
365 >    delete fq2;
366 >    FRCLOSE(f_in);
367 >    FRCLOSE(f_in2);
368 >    if (doCollapse) {
369 >       outcounter=0;
370 >       int maxdup_count=1;
371 >       char* maxdup_seq=NULL;
372 >       dhash.startIterate();
373 >       FqDupRec* qd=NULL;
374 >       char* seq=NULL;
375 >       while ((qd=dhash.NextData(seq))!=NULL) {
376 >         GStr rseq(seq);
377 >         //do the dusting here
378 >         if (doDust) {
379 >            int dustbases=dust(rseq);
380 >            if (dustbases>(rseq.length()>>1)) {
381 >               if (trashReport && qd->firstname!=NULL) {
382 >                 fprintf(freport, "%s_x%d\tD\n",qd->firstname, qd->count);
383                   }
384 <              else    
385 <                 dr->add(rqv);
386 <           } //collapsing duplicates
387 <         else { //not collapsing duplicates
388 <           //do the dust filter now
389 <           if (doDust) {
390 <             int dustbases=dust(rseq);
391 <             if (dustbases>(rseq.length()>>1)) {
392 <                if (trashReport) {
393 <                  fprintf(freport, "%s\tD\t%s\n",rname.chars(),rseq.chars());
394 <                  }
395 <                continue;
396 <                }
509 <             }
510 <           //print this record here  
511 <           outcounter++;
512 <           if (isfasta) {
513 <            if (prefix.is_empty())
514 <               fprintf(f_out, ">%s\n%s\n", rname.chars(), rseq.chars());
515 <              else
516 <               fprintf(f_out, ">%s%08d\n%s\n", prefix.chars(), outcounter,
517 <                                  rseq.chars());
384 >               trash_D+=qd->count;
385 >               continue;
386 >               }
387 >            }
388 >         outcounter++;
389 >         if (qd->count>maxdup_count) {
390 >            maxdup_count=qd->count;
391 >            maxdup_seq=seq;
392 >            }
393 >         if (isfasta) {
394 >           if (prefix.is_empty()) {
395 >             fprintf(f_out, ">%s_x%d\n%s\n", qd->firstname, qd->count,
396 >                           rseq.chars());
397               }
398 <           else {  //fastq
399 <            if (convert_phred) convertPhred(rqv);
400 <            if (prefix.is_empty())
522 <               fprintf(f_out, "@%s\n%s\n+\n%s\n", rname.chars(), rseq.chars(),rqv.chars());
523 <              else
524 <               fprintf(f_out, "@%s_%08d\n%s\n+\n%s\n", prefix.chars(), outcounter,
525 <                                  rseq.chars(),rqv.chars() );
398 >           else { //use custom read name
399 >             fprintf(f_out, ">%s%08d_x%d\n%s\n", prefix.chars(), outcounter,
400 >                        qd->count, rseq.chars());
401               }
402 <           } //not collapsing duplicates
403 <        } //for each fastq record
404 <   } //while each input file
405 < FRCLOSE(f_in);
406 < if (doCollapse) {
407 <    outcounter=0;
533 <    int maxdup_count=1;
534 <    char* maxdup_seq=NULL;
535 <    dhash.startIterate();
536 <    FqDupRec* qd=NULL;
537 <    char* seq=NULL;
538 <    while ((qd=dhash.NextData(seq))!=NULL) {
539 <      GStr rseq(seq);
540 <      //do the dusting here
541 <      if (doDust) {
542 <         int dustbases=dust(rseq);
543 <         if (dustbases>(rseq.length()>>1)) {
544 <            if (trashReport && qd->firstname!=NULL) {
545 <              fprintf(freport, "%s_x%d\tD\t%s\n",qd->firstname, qd->count,seq);
546 <              }
547 <            continue;
402 >           }
403 >         else { //fastq format
404 >          if (convert_phred) convertPhred(qd->qv, qd->len);
405 >          if (prefix.is_empty()) {
406 >            fprintf(f_out, "@%s_x%d\n%s\n+\n%s\n", qd->firstname, qd->count,
407 >                           rseq.chars(), qd->qv);
408              }
409 +          else { //use custom read name
410 +            fprintf(f_out, "@%s%08d_x%d\n%s\n+\n%s\n", prefix.chars(), outcounter,
411 +                        qd->count, rseq.chars(), qd->qv);
412 +            }
413 +           }
414 +         }//for each element of dhash
415 +       if (maxdup_count>1) {
416 +         GMessage("Maximum read multiplicity: x %d (read: %s)\n",maxdup_count, maxdup_seq);
417           }
418 <      outcounter++;
419 <      if (qd->count>maxdup_count) {
420 <         maxdup_count=qd->count;
421 <         maxdup_seq=seq;
422 <         }
423 <      if (isfasta) {
424 <        if (prefix.is_empty()) {
425 <          fprintf(f_out, ">%s_x%d\n%s\n", qd->firstname, qd->count,
426 <                        rseq.chars());
427 <          }
428 <        else { //use custom read name
429 <          fprintf(f_out, ">%s%08d_x%d\n%s\n", prefix.chars(), outcounter,
430 <                     qd->count, rseq.chars());
418 >       } //collapse entries
419 >    if (verbose) {
420 >       if (paired_reads) {
421 >           GMessage(">Input files : %s , %s\n", infname.chars(), infname2.chars());
422 >           GMessage("Number of input pairs :%9d\n", incounter);
423 >           GMessage("         Output pairs :%9d\n", outcounter);
424 >           }
425 >         else {
426 >           GMessage(">Input file : %s\n", infname.chars());
427 >           GMessage("Number of input reads :%9d\n", incounter);
428 >           GMessage("         Output reads :%9d\n", outcounter);
429 >           }
430 >       GMessage("------------------------------------\n");
431 >       if (num_trimmed5)
432 >          GMessage("           5' trimmed :%9d  (min. trim: %d)\n", num_trimmed5, min_trimmed5);
433 >       if (num_trimmed3)
434 >          GMessage("           3' trimmed :%9d  (min. trim: %d)\n", num_trimmed3, min_trimmed3);
435 >       GMessage("------------------------------------\n");
436 >       if (trash_s>0)
437 >         GMessage("     Trashed by length:%9d\n", trash_s);
438 >       if (trash_N>0)
439 >         GMessage("         Trashed by N%%:%9d\n", trash_N);
440 >       if (trash_Q>0)
441 >         GMessage("Trashed by low quality:%9d\n", trash_Q);
442 >       if (trash_A5>0)
443 >         GMessage(" Trashed by 5' adapter:%9d\n", trash_A5);
444 >       if (trash_A3>0)
445 >         GMessage(" Trashed by 3' adapter:%9d\n", trash_A3);
446 >       }
447 >    if (trashReport) {
448 >          FWCLOSE(freport);
449            }
450 <        }
451 <      else { //fastq format
452 <       if (convert_phred) convertPhred(qd->qv, qd->len);
567 <       if (prefix.is_empty()) {
568 <         fprintf(f_out, "@%s_x%d\n%s\n+\n%s\n", qd->firstname, qd->count,
569 <                        rseq.chars(), qd->qv);
570 <         }
571 <       else { //use custom read name
572 <         fprintf(f_out, "@%s%08d_x%d\n%s\n+\n%s\n", prefix.chars(), outcounter,
573 <                     qd->count, rseq.chars(), qd->qv);
574 <         }
575 <        }
576 <      }//for each element of dhash
577 <    if (maxdup_count>1) {
578 <      GMessage("Maximum read multiplicity: x %d (read: %s)\n",maxdup_count, maxdup_seq);
579 <      }
580 <   } //report collapsed dhash entries
581 < GMessage("Number of input reads: %9d\n", icounter);
582 < GMessage("       Output records: %9d\n", outcounter);
583 < if (trashReport) {
584 <    FWCLOSE(freport);
585 <    }
450 >    FWCLOSE(f_out);
451 >    FWCLOSE(f_out2);
452 >   } //while each input file
453  
587 FWCLOSE(f_out);
454   //getc(stdin);
455   }
456  
# Line 670 | Line 536
536  
537  
538   bool qtrim(GStr& qvs, int &l5, int &l3) {
539 < if (qvtrim_min==0 || qvs.is_empty()) return false;
539 > if (qvtrim_qmin==0 || qvs.is_empty()) return false;
540   if (qv_phredtype==0) {
541    //try to guess the Phred type
542    int vmin=256, vmax=0;
# Line 683 | Line 549
549    if (qv_phredtype==0) {
550      GError("Error: couldn't determine Phred type, please use the -p33 or -p64 !\n");
551      }
552 <  } //guessing the Phred type
552 >  if (verbose)
553 >    GMessage("Input reads have Phred-%d quality values.\n", (qv_phredtype==33 ? 33 : 64));
554 >  } //guessing Phred type
555   for (l3=qvs.length()-1;l3>2;l3--) {
556 <  if (qvs[l3]-qv_phredtype>=qvtrim_min && qvs[l3-1]-qv_phredtype>=qvtrim_min) break;
556 >  if (qvs[l3]-qv_phredtype>=qvtrim_qmin && qvs[l3-1]-qv_phredtype>=qvtrim_qmin) break;
557    }
558   //just in case, check also the 5' the end (?)
559   for (l5=0;l5<qvs.length()-3;l5++) {
560 <  if (qvs[l5]-qv_phredtype>=qvtrim_min && qvs[l5+1]-qv_phredtype>=qvtrim_min) break;
560 >  if (qvs[l5]-qv_phredtype>=qvtrim_qmin && qvs[l5+1]-qv_phredtype>=qvtrim_qmin) break;
561 >  }
562 > if (qvtrim_max>0) {
563 >  if (qvs.length()-1-l3>qvtrim_max) l3=qvs.length()-1-qvtrim_max;
564 >  if (l5>qvtrim_max) l5=qvtrim_max;
565    }
566   return (l5>0 || l3<qvs.length()-1);
567   }
# Line 844 | Line 716
716   return ncount;
717   }
718  
847
848 // ------------------ adapter matching - simple k-mer seed & extend, no indels for now
849 //when a k-mer match is found, simply try to extend the alignment using a drop-off scheme
850 //check minimum score and
851 //for 3' adapter trimming:
852 //     require that the right end of the alignment for either the adaptor OR the read must be
853 //     < 3 distance from its right end
854 // for 5' adapter trimming:
855 //     require that the left end of the alignment for either the adaptor OR the read must
856 //     be at coordinate < 3 from start
857
858 bool extendMatch(const char* a, int alen, int ai,
859                 const char* b, int blen, int bi, int mlen, int& l5, int& l3, CSegChain& segs, bool end5=false) {
860 //so the alignment starts at ai in a, bi in b, with a perfect match of length mlen
861 #ifdef DEBUG
862 GStr dbg(b);
863 #endif
864 //if (debug) {
865 //  GMessage(">> in %s\n\textending hit: %s at position %d\n", a, (dbg.substr(bi, mlen)).chars(), ai);
866 //  }
867 int a_l=ai; //alignment coordinates on a
868 int a_r=ai+mlen-1;
869 int b_l=bi; //alignment coordinates on b
870 int b_r=bi+mlen-1;
871 int ai_maxscore=ai;
872 int bi_maxscore=bi;
873 int score=mlen*a_m_score;
874 int maxscore=score;
875 int mism5score=a_mis_score;
876 if (end5 && ai<(alen>>1)) mism5score-=2; // increase penalty for mismatches at 5' end
877 //try to extend to the left first, if possible
878 while (ai>0 && bi>0) {
879   ai--;
880   bi--;
881   score+= (a[ai]==b[bi])? a_m_score : mism5score;
882   if (score>maxscore) {
883       ai_maxscore=ai;
884       bi_maxscore=bi;
885       maxscore=score;
886       }
887     else if (maxscore-score>a_dropoff_score) break;
888   }
889 a_l=ai_maxscore;
890 b_l=bi_maxscore;
891 //if (debug) GMessage("  after l-extend: %*s%s\t\t(score=%d)\n",a_l," ",dbg.substr(b_l,b_r-b_l+1).chars(),maxscore);
892 //now extend to the right
893 ai_maxscore=a_r;
894 bi_maxscore=b_r;
895 ai=a_r;
896 bi=b_r;
897 score=maxscore;
898 //sometimes there are extra AAAAs at the end of the read, ignore those
899 if (strcmp(&a[alen-4],"AAAA")==0) {
900    alen-=3;
901    while (a[alen-1]=='A' && alen>ai) alen--;
902    }
903 while (ai<alen-1 && bi<blen-1) {
904   ai++;
905   bi++;
906   //score+= (a[ai]==b[bi])? a_m_score : a_mis_score;
907   if (a[ai]==b[bi]) { //match
908      score+=a_m_score;
909      if (ai>=alen-2) {
910           score+=a_m_score-(alen-ai-1);
911           }
912      }
913    else { //mismatch
914      score+=a_mis_score;
915      }  
916   if (score>maxscore) {
917       ai_maxscore=ai;
918       bi_maxscore=bi;
919       maxscore=score;
920       }
921     else if (maxscore-score>a_dropoff_score) break;
922   }
923  a_r=ai_maxscore;
924  b_r=bi_maxscore;
925  int a_ovh3=alen-a_r-1;
926  int b_ovh3=blen-b_r-1;
927  int mmovh3=(a_ovh3<b_ovh3)? a_ovh3 : b_ovh3;
928  int mmovh5=(a_l<b_l)? a_l : b_l;
929  //if (debug) GMessage("  after r-extend: %*s%s\t\t(score=%d)\n",a_l," ",dbg.substr(b_l,b_r-b_l+1).chars(),maxscore);
930 #ifdef DEBUG
931  if (debug) GMessage("     extended to: %*s\n",a_r+1,dbg.substr(b_l,b_r-b_l+1).chars());
932 #endif
933  if (maxscore>=a_min_score && mmovh3<2 && mmovh5<2) {
934     if (a_l<a_ovh3) {
935        //adapter closer to the left end (typical for 5' adapter)
936        l5=a_r+1;
937        l3=alen-1;
938        }
939      else {
940        //adapter matching at the right end (typical for 3' adapter)
941        l5=0;
942        l3=a_l-1;
943        }
944     return true;
945     }
946 else { //keep this segment pair for later (gapped alignment)
947   segs.addSegPair(new CSegPair(a_l+1, a_r+1, b_l+1, b_r+1, maxscore));
948   //this will also update min & max coordinates in segs (segs.astart, .aend, .bstart, .bend)
949   }
950  //do not trim:
951  l5=0;
952  l3=alen-1;
953  return false;
954 }
955
956 /*
957 int getWordValue(const char* s, int wlen) {
958 int r=0;
959 while (wlen--) { r+=(((int)s[wlen])<<wlen) }
960 return r;
961 }
962 */
719   int get3mer_value(const char* s) {
720   return (s[0]<<16)+(s[1]<<8)+s[2];
721   }
# Line 983 | Line 739
739   return -1;
740   }
741  
742 < int fast4match(int32 qv, const char* str, int slen, int start_index=0) {
743 < if (start_index>=slen || start_index<0) return -1;
744 < for (int i=start_index;i<slen-4;i++) {
745 <   int32* rv=(int32*)(str+i);
746 <   if (*rv==qv) return i;
747 <   }
748 < return -1;
749 < }
742 > struct SLocScore {
743 >  int pos;
744 >  int score;
745 >  SLocScore(int p=0,int s=0) {
746 >    pos=p;
747 >    score=s;
748 >    }
749 >  void set(int p, int s) {
750 >    pos=p;
751 >    score=s;
752 >    }
753 >  void add(int p, int add) {
754 >    pos=p;
755 >    score+=add;
756 >    }
757 > };
758  
759 < int fast4rmatch(int32 qv, const char* str, int slen, int end_index=-1) {
760 < if (end_index>=slen) return -1;
761 < if (end_index<0) end_index=slen-1;
762 < for (int i=end_index-3;i>=0;i--) {
763 <   int32* rv=(int32*)(str+i);
764 <   if (*rv==qv) return i;
759 > bool trim_poly3(GStr &seq, int &l5, int &l3, const char* poly_seed) {
760 > int rlen=seq.length();
761 > l5=0;
762 > l3=rlen-1;
763 > int32 seedVal=*(int32*)poly_seed;
764 > char polyChar=poly_seed[0];
765 > //assumes N trimming was already done
766 > //so a poly match should be very close to the end of the read
767 > // -- find the initial match (seed)
768 > int lmin=GMAX((rlen-12), 0);
769 > int li;
770 > for (li=rlen-4;li>lmin;li--) {
771 >   if (seedVal==*(int*)&(seq[li])) {
772 >      break;
773 >      }
774     }
775 < return -1;
776 < }
775 > if (li<=lmin) return false;
776 > //seed found, try to extend it both ways
777 > //extend right
778 > int ri=li+3;
779 > SLocScore loc(ri, poly_m_score<<2);
780 > SLocScore maxloc(loc);
781 > //extend right
782 > while (ri<rlen-2) {
783 >   ri++;
784 >   if (seq[ri]==polyChar) {
785 >                loc.add(ri,poly_m_score);
786 >                }
787 >   else if (seq[ri]=='N') {
788 >                loc.add(ri,0);
789 >                }
790 >   else { //mismatch
791 >        loc.add(ri,poly_mis_score);
792 >        if (maxloc.score-loc.score>poly_dropoff_score) break;
793 >        }
794 >   if (maxloc.score<=loc.score) {
795 >      maxloc=loc;
796 >      }
797 >   }
798 > ri=maxloc.pos;
799 > if (ri<rlen-3) return false; //no trimming wanted, too far from 3' end
800 > //ri = right boundary for the poly match
801 > //extend left
802 > loc.set(li, maxloc.score);
803 > maxloc.pos=li;
804 > while (li>0) {
805 >    li--;
806 >    if (seq[li]==polyChar) {
807 >                 loc.add(li,poly_m_score);
808 >                 }
809 >    else if (seq[li]=='N') {
810 >                 loc.add(li,0);
811 >                 }
812 >    else { //mismatch
813 >         loc.add(li,poly_mis_score);
814 >         if (maxloc.score-loc.score>poly_dropoff_score) break;
815 >         }
816 >    if (maxloc.score<=loc.score) {
817 >       maxloc=loc;
818 >       }
819 >    }
820 > if (maxloc.score>poly_minScore && ri>=rlen-3) {
821 >    l5=li;
822 >    l3=ri;
823 >    return true;
824 >    }
825 > return false;
826 > }
827  
828 < #ifdef DEBUG
829 < void dbgPrintChain(CSegChain& chain, const char* aseq) {
830 <  GStr s(aseq);
831 <  for (int i=0;i<chain.Count();i++) {
832 <   CSegPair& seg=*chain[i];
833 <   GMessage("  dbg chain seg%d: %*s [%d-%d:%d-%d]\n",i,seg.a.start-1+seg.len(),
834 <            s.substr(seg.b.start-1, seg.len()).chars(), seg.b.start,seg.b.end,seg.a.start,seg.a.end);
828 >
829 > bool trim_poly5(GStr &seq, int &l5, int &l3, const char* poly_seed) {
830 > int rlen=seq.length();
831 > l5=0;
832 > l3=rlen-1;
833 > int32 seedVal=*(int32*)poly_seed;
834 > char polyChar=poly_seed[0];
835 > //assumes N trimming was already done
836 > //so a poly match should be very close to the end of the read
837 > // -- find the initial match (seed)
838 > int lmax=GMIN(8, rlen-4);//how far from 5' end to look for 4-mer seeds
839 > int li;
840 > for (li=0;li<=lmax;li++) {
841 >   if (seedVal==*(int*)&(seq[li])) {
842 >      break;
843 >      }
844 >   }
845 > if (li>lmax) return false;
846 > //seed found, try to extend it both ways
847 > //extend left
848 > int ri=li+3; //save rightmost base of the seed
849 > SLocScore loc(li, poly_m_score<<2);
850 > SLocScore maxloc(loc);
851 > while (li>0) {
852 >    li--;
853 >    if (seq[li]==polyChar) {
854 >                 loc.add(li,poly_m_score);
855 >                 }
856 >    else if (seq[li]=='N') {
857 >                 loc.add(li,0);
858 >                 }
859 >    else { //mismatch
860 >         loc.add(li,poly_mis_score);
861 >         if (maxloc.score-loc.score>poly_dropoff_score) break;
862 >         }
863 >    if (maxloc.score<=loc.score) {
864 >       maxloc=loc;
865 >       }
866 >    }
867 > li=maxloc.pos;
868 > if (li>3) return false; //no trimming wanted, too far from 5' end
869 > //li = right boundary for the poly match
870 >
871 > //extend right
872 > loc.set(ri, maxloc.score);
873 > maxloc.pos=ri;
874 > while (ri<rlen-2) {
875 >   ri++;
876 >   if (seq[ri]==polyChar) {
877 >                loc.add(ri,poly_m_score);
878 >                }
879 >   else if (seq[ri]=='N') {
880 >                loc.add(ri,0);
881 >                }
882 >   else { //mismatch
883 >        loc.add(ri,poly_mis_score);
884 >        if (maxloc.score-loc.score>poly_dropoff_score) break;
885 >        }
886 >   if (maxloc.score<=loc.score) {
887 >      maxloc=loc;
888 >      }
889     }
890 +
891 + if (maxloc.score>poly_minScore && li<=3) {
892 +    l5=li;
893 +    l3=ri;
894 +    return true;
895 +    }
896 + return false;
897   }
1014 #endif
898  
899   bool trim_adapter3(GStr& seq, int&l5, int &l3) {
900   int rlen=seq.length();
# Line 1313 | Line 1196
1196   for (int i=0;i<len;i++) q[i]+=qv_cvtadd;
1197   }
1198  
1199 + bool getFastxRec(GLineReader& fq, GStr& rseq, GStr& rqv,
1200 +          GStr& rname, GStr& rinfo, GStr& infname) {
1201 + rseq="";
1202 + rqv="";
1203 + rname="";
1204 + rinfo="";
1205 + if (fq.eof()) return false;
1206 + char* l=fq.getLine();
1207 + while (l!=NULL && (l[0]==0 || isspace(l[0]))) l=fq.getLine(); //ignore empty lines
1208 + if (l==NULL) return false;
1209 + /* if (rawFormat) {
1210 +      //TODO: implement raw qseq parsing here
1211 +      //if (raw type=N) then continue; //skip invalid/bad records
1212 +      } //raw qseq format
1213 + else { // FASTQ or FASTA */
1214 + isfasta=(l[0]=='>');
1215 + if (!isfasta && l[0]!='@') GError("Error: fasta/fastq record marker not found(%s)\n%s\n",
1216 +      infname.chars(), l);
1217 + GStr s(l);
1218 + rname=&(l[1]);
1219 + for (int i=0;i<rname.length();i++)
1220 +    if (rname[i]<=' ') {
1221 +       if (i<rname.length()-2) rinfo=rname.substr(i+1);
1222 +       rname.cut(i);
1223 +       break;
1224 +       }
1225 +  //now get the sequence
1226 + if ((l=fq.getLine())==NULL)
1227 +      GError("Error: unexpected EOF after header for read %s (%s)\n",
1228 +                   rname.chars(), infname.chars());
1229 + rseq=l; //this must be the DNA line
1230 + while ((l=fq.getLine())!=NULL) {
1231 +      //seq can span multiple lines
1232 +      if (l[0]=='>' || l[0]=='+') {
1233 +           fq.pushBack();
1234 +           break; //
1235 +           }
1236 +      rseq+=l;
1237 +      } //check for multi-line seq
1238 + if (!isfasta) { //reading fastq quality values, which can also be multi-line
1239 +    if ((l=fq.getLine())==NULL)
1240 +        GError("Error: unexpected EOF after sequence for %s\n", rname.chars());
1241 +    if (l[0]!='+') GError("Error: fastq qv header marker not detected!\n");
1242 +    if ((l=fq.getLine())==NULL)
1243 +        GError("Error: unexpected EOF after qv header for %s\n", rname.chars());
1244 +    rqv=l;
1245 +    //if (rqv.length()!=rseq.length())
1246 +    //  GError("Error: qv len != seq len for %s\n", rname.chars());
1247 +    while (rqv.length()<rseq.length() && ((l=fq.getLine())!=NULL)) {
1248 +      rqv+=l; //append to qv string
1249 +      }
1250 +    }// fastq
1251 + // } //<-- FASTA or FASTQ
1252 + rseq.upper();
1253 + return true;
1254 + }
1255 +
1256 + char process_read(GStr& rname, GStr& rseq, GStr& rqv, int &l5, int &l3) {
1257 + //returns 0 if the read was untouched, 1 if it was just trimmed
1258 + // and a trash code if it was trashed
1259 + l5=0;
1260 + l3=rseq.length()-1;
1261 + if (l3-l5+1<min_read_len) {
1262 +   return 's';
1263 +   }
1264 + GStr wseq(rseq.chars());
1265 + GStr wqv(rqv.chars());
1266 + int w5=l5;
1267 + int w3=l3;
1268 + //first do the q-based trimming
1269 + if (qvtrim_qmin!=0 && !wqv.is_empty() && qtrim(wqv, w5, w3)) { // qv-threshold trimming
1270 +   if (w3-w5+1<min_read_len) {
1271 +     return 'Q'; //invalid read
1272 +     }
1273 +    //-- keep only the w5..w3 range
1274 +   l5=w5;
1275 +   l3=w3;
1276 +   wseq=wseq.substr(w5, w3-w5+1);
1277 +   if (!wqv.is_empty())
1278 +      wqv=wqv.substr(w5, w3-w5+1);
1279 +   } //qv trimming
1280 + // N-trimming on the remaining read seq
1281 + if (ntrim(wseq, w5, w3)) {
1282 +   //GMessage("before: %s\n",wseq.chars());
1283 +   //GMessage("after : %s (%d)\n",wseq.substr(w5,w3-w5+1).chars(),w3-w5+1);
1284 +   l5+=w5;
1285 +   l3-=(wseq.length()-1-w3);
1286 +   if (w3-w5+1<min_read_len) {
1287 +     return 'N'; //to be trashed
1288 +     }
1289 +    //-- keep only the w5..w3 range
1290 +   wseq=wseq.substr(w5, w3-w5+1);
1291 +   if (!wqv.is_empty())
1292 +      wqv=wqv.substr(w5, w3-w5+1);
1293 +   w5=0;
1294 +   w3=wseq.length()-1;
1295 +   }
1296 + if (a3len>0) {
1297 +  if (trim_adapter3(wseq, w5, w3)) {
1298 +     int trimlen=wseq.length()-(w3-w5+1);
1299 +     num_trimmed3++;
1300 +     if (trimlen<min_trimmed3)
1301 +         min_trimmed3=trimlen;
1302 +     l5+=w5;
1303 +     l3-=(wseq.length()-1-w3);
1304 +     if (w3-w5+1<min_read_len) {
1305 +         return '3';
1306 +         }
1307 +      //-- keep only the w5..w3 range
1308 +      wseq=wseq.substr(w5, w3-w5+1);
1309 +      if (!wqv.is_empty())
1310 +         wqv=wqv.substr(w5, w3-w5+1);
1311 +      }//some adapter was trimmed
1312 +   } //adapter trimming
1313 + if (a5len>0) {
1314 +  if (trim_adapter5(wseq, w5, w3)) {
1315 +     int trimlen=wseq.length()-(w3-w5+1);
1316 +     num_trimmed5++;
1317 +     if (trimlen<min_trimmed5)
1318 +         min_trimmed5=trimlen;
1319 +     l5+=w5;
1320 +     l3-=(wseq.length()-1-w3);
1321 +     if (w3-w5+1<min_read_len) {
1322 +         return '5';
1323 +         }
1324 +      //-- keep only the w5..w3 range
1325 +      wseq=wseq.substr(w5, w3-w5+1);
1326 +      if (!wqv.is_empty())
1327 +         wqv=wqv.substr(w5, w3-w5+1);
1328 +      }//some adapter was trimmed
1329 +   } //adapter trimming
1330 + if (doCollapse) {
1331 +   //keep read for later
1332 +   FqDupRec* dr=dhash.Find(wseq.chars());
1333 +   if (dr==NULL) { //new entry
1334 +          //if (prefix.is_empty())
1335 +             dhash.Add(wseq.chars(),
1336 +                  new FqDupRec(&wqv, rname.chars()));
1337 +          //else dhash.Add(wseq.chars(), new FqDupRec(wqv.chars(),wqv.length()));
1338 +         }
1339 +      else
1340 +         dr->add(wqv);
1341 +   } //collapsing duplicates
1342 + else { //not collapsing duplicates
1343 +   //apply the dust filter now
1344 +   if (doDust) {
1345 +     int dustbases=dust(wseq);
1346 +     if (dustbases>(wseq.length()>>1)) {
1347 +        return 'D';
1348 +        }
1349 +     }
1350 +   } //not collapsing duplicates
1351 + return (l5>0 || l3<rseq.length()-1) ? 1 : 0;
1352 + }
1353 +
1354 + void printHeader(FILE* f_out, char recmarker, GStr& rname, GStr& rinfo) {
1355 + //GMessage("printing Header..%c%s\n",recmarker, rname.chars());
1356 + if (rinfo.is_empty()) fprintf(f_out, "%c%s\n",recmarker,rname.chars());
1357 +  else fprintf(f_out, "%c%s %s\n",recmarker, rname.chars(), rinfo.chars());
1358 + }
1359 +
1360 + void writeRead(FILE* f_out, GStr& rname, GStr& rinfo, GStr& rseq, GStr& rqv, int& outcounter) {
1361 +   outcounter++;
1362 +   bool asFasta=(rqv.is_empty() || fastaOutput);
1363 +   if (asFasta) {
1364 +    if (prefix.is_empty()) {
1365 +       printHeader(f_out, '>',rname,rinfo);
1366 +       fprintf(f_out, "%s\n", rseq.chars()); //plain one-line fasta for now
1367 +       }
1368 +      else {
1369 +       fprintf(f_out, ">%s%08d\n%s\n", prefix.chars(), outcounter,
1370 +                          rseq.chars());
1371 +       }
1372 +     }
1373 +   else {  //fastq
1374 +    if (convert_phred) convertPhred(rqv);
1375 +    if (prefix.is_empty()) {
1376 +       printHeader(f_out, '@', rname, rinfo);
1377 +       fprintf(f_out, "%s\n+\n%s\n", rseq.chars(), rqv.chars());
1378 +       }
1379 +      else
1380 +       fprintf(f_out, "@%s_%08d\n%s\n+\n%s\n", prefix.chars(), outcounter,
1381 +                          rseq.chars(),rqv.chars() );
1382 +     }
1383 + }
1384 +
1385 + void trash_report(char trashcode, GStr& rname, FILE* freport) {
1386 + if (freport==NULL || trashcode<=' ') return;
1387 + if (trashcode=='3' || trashcode=='5') {
1388 +   fprintf(freport, "%s\tA%c\n",rname.chars(),trashcode);
1389 +   }
1390 + else {
1391 +   fprintf(freport, "%s\t%c\n",rname.chars(),trashcode);
1392 +   }
1393 + //tcounter++;
1394 + }
1395 +
1396 + GStr getFext(GStr& s, int* xpos=NULL) {
1397 + //s must be a filename without a path
1398 + GStr r("");
1399 + if (xpos!=NULL) *xpos=0;
1400 + if (s.is_empty() || s=="-") return r;
1401 + int p=s.rindex('.');
1402 + int d=s.rindex('/');
1403 + if (p<=0 || p>s.length()-2 || p<s.length()-7 || p<d) return r;
1404 + r=s.substr(p+1);
1405 + if (xpos!=NULL) *xpos=p+1;
1406 + r.lower();
1407 + return r;
1408 + }
1409 +
1410 + void baseFileName(GStr& fname) {
1411 + //remove all known extensions, like .txt,fq,fastq,fasta,fa)(.gz .gzip .bz2 .bzip2) .
1412 + int xpos=0;
1413 + GStr fext=getFext(fname, &xpos);
1414 + if (xpos<=1) return;
1415 + bool extdel=false;
1416 + GStr f2;
1417 + if (fext=="z" || fext=="zip") {
1418 +   extdel=true;
1419 +   }
1420 +  else if (fext.length()>=2) {
1421 +   f2=fext.substr(0,2);
1422 +   extdel=(f2=="gz" || f2=="bz");
1423 +   }
1424 + if (extdel) {
1425 +   fname.cut(xpos-1);
1426 +   fext=getFext(fname, &xpos);
1427 +   if (xpos<=1) return;
1428 +   }
1429 + extdel=false;
1430 + if (fext=="f" || fext=="fq" || fext=="txt" || fext=="seq" || fext=="sequence") {
1431 +   extdel=true;
1432 +   }
1433 +  else if (fext.length()>=2) {
1434 +   extdel=(fext.substr(0,2)=="fa");
1435 +   }
1436 + if (extdel) fname.cut(xpos-1);
1437 + GStr fncp(fname);
1438 + fncp.lower();
1439 + fncp.chomp("seq");
1440 + fncp.chomp("sequence");
1441 + fncp.trimR("_.");
1442 + if (fncp.length()<fname.length()) fname.cut(fncp.length());
1443 + }
1444 +
1445 + FILE* prepOutFile(GStr& infname, GStr& pocmd) {
1446 +  FILE* f_out=NULL;
1447 +  GStr fname(getFileName(infname.chars()));
1448 +  //eliminate known extensions
1449 +  baseFileName(fname);
1450 +  if (outsuffix.is_empty() || outsuffix=="-") { return stdout; }
1451 +    else if (pocmd.is_empty()) {
1452 +               GStr oname(fname);
1453 +               oname.append('.');
1454 +               oname.append(outsuffix);
1455 +               f_out=fopen(oname.chars(),"w");
1456 +               if (f_out==NULL) GError("Error: cannot create '%s'\n",oname.chars());
1457 +               }
1458 +            else {
1459 +              GStr oname(">");
1460 +              oname.append(fname);
1461 +              oname.append('.');
1462 +              oname.append(outsuffix);
1463 +              pocmd.append(oname);
1464 +              f_out=popen(pocmd.chars(), "w");
1465 +              if (f_out==NULL) GError("Error: cannot popen '%s'\n",pocmd.chars());
1466 +              }
1467 + return f_out;
1468 + }
1469 +
1470 + void guess_unzip(GStr& fname, GStr& picmd) {
1471 + GStr fext=getFext(fname);
1472 + if (fext=="gz" || fext=="gzip" || fext=="z") {
1473 +    picmd="gzip -cd ";
1474 +    }
1475 +   else if (fext=="bz2" || fext=="bzip2" || fext=="bz" || fext=="bzip") {
1476 +    picmd="bzip2 -cd ";
1477 +    }
1478 + }
1479 +
1480 +
1481 + int loadAdapters(const char* fname) {
1482 +  GLineReader lr(fname);
1483 +  char* l;
1484 +  while ((l=lr.nextLine())!=NULL) {
1485 +   if (lr.length()<=3 || l[0]=='#') continue;
1486 +   char* s5;
1487 +   char* s3;
1488 +   if (isspace(l[0]) || l[0]==',' || l[0]==';'|| l[0]==':') {
1489 +      int i=1;
1490 +      while (l[i]!=0 && isspace(l[i])) {
1491 +        i++;
1492 +        }
1493 +      if (l[i]!=0) {
1494 +        adapters.Add(new CAdapters(NULL, &(l[i])));
1495 +        continue;
1496 +        }
1497 +      }
1498 +    else {
1499 +      p=l;
1500 +      while (*delpos!='\0' && strchr(fTokenDelimiter, *delpos)!=NULL)
1501 +             delpos++;
1502 +      s5.startTokenize("\t ;,:",);
1503 +      s5.nextToken(s3);
1504 +      }
1505 +   }
1506 +
1507 + }
1508 +
1509 + void setupFiles(FILE*& f_in, FILE*& f_in2, FILE*& f_out, FILE*& f_out2,
1510 +                       GStr& s, GStr& infname, GStr& infname2) {
1511 + // uses outsuffix to generate output file names and open file handles as needed
1512 + infname="";
1513 + infname2="";
1514 + f_in=NULL;
1515 + f_in2=NULL;
1516 + f_out=NULL;
1517 + f_out2=NULL;
1518 + //analyze outsuffix intent
1519 + GStr pocmd;
1520 + if (outsuffix=="-") {
1521 +    f_out=stdout;
1522 +    }
1523 +   else {
1524 +    GStr ox=getFext(outsuffix);
1525 +    if (ox.length()>2) ox=ox.substr(0,2);
1526 +    if (ox=="gz") pocmd="gzip -9 -c ";
1527 +        else if (ox=="bz") pocmd="bzip2 -9 -c ";
1528 +    }
1529 + if (s=="-") {
1530 +    f_in=stdin;
1531 +    infname="stdin";
1532 +    f_out=prepOutFile(infname, pocmd);
1533 +    return;
1534 +    } // streaming from stdin
1535 + s.startTokenize(",:");
1536 + s.nextToken(infname);
1537 + bool paired=s.nextToken(infname2);
1538 + if (fileExists(infname.chars())==0)
1539 +    GError("Error: cannot find file %s!\n",infname.chars());
1540 + GStr fname(getFileName(infname.chars()));
1541 + GStr picmd;
1542 + guess_unzip(fname, picmd);
1543 + if (picmd.is_empty()) {
1544 +   f_in=fopen(infname.chars(), "r");
1545 +   if (f_in==NULL) GError("Error opening file '%s'!\n",infname.chars());
1546 +   }
1547 +  else {
1548 +   picmd.append(infname);
1549 +   f_in=popen(picmd.chars(), "r");
1550 +   if (f_in==NULL) GError("Error at popen %s!\n", picmd.chars());
1551 +   }
1552 + if (f_out==stdout) {
1553 +   if (paired) GError("Error: output suffix required for paired reads\n");
1554 +   return;
1555 +   }
1556 + f_out=prepOutFile(infname, pocmd);
1557 + if (!paired) return;
1558 + if (doCollapse) GError("Error: sorry, -C option cannot be used with paired reads!\n");
1559 + // ---- paired reads:-------------
1560 + if (fileExists(infname2.chars())==0)
1561 +     GError("Error: cannot find file %s!\n",infname2.chars());
1562 + picmd="";
1563 + GStr fname2(getFileName(infname2.chars()));
1564 + guess_unzip(fname2, picmd);
1565 + if (picmd.is_empty()) {
1566 +   f_in2=fopen(infname2.chars(), "r");
1567 +   if (f_in2==NULL) GError("Error opening file '%s'!\n",infname2.chars());
1568 +   }
1569 +  else {
1570 +   picmd.append(infname2);
1571 +   f_in2=popen(picmd.chars(), "r");
1572 +   if (f_in2==NULL) GError("Error at popen %s!\n", picmd.chars());
1573 +   }
1574 + f_out2=prepOutFile(infname2, pocmd);
1575 + }

Diff Legend

Removed lines
+ Added lines
< Changed lines
> Changed lines