ViewVC Help
View File | Revision Log | Show Annotations | View Changeset | Root Listing
root/gclib/fqtrim/fqtrim.cpp
(Generate patch)
# Line 3 | Line 3
3   #include "GHash.hh"
4   #include "GList.hh"
5   #include <ctype.h>
6 + #include "GAlnExtend.h"
7 + #include <inttypes.h>
8  
9   #define USAGE "Usage:\n\
10 < fqtrim [{-5 <5adapter> -3 <3adapter>|-f <adapters_file>}] [-a <min_matchlen>]\\\n\
11 <   [-q <minq> [-t <trim_max_len>]] [-p {64|33}] [-o <outsuffix>]\\\n\
10 > fqtrim [{-5 <5adaptor> -3 <3adaptor>|-f <adaptors_file>}] [-a <min_match>]\\\n\
11 >   [-R] [-q <minq> [-t <trim_max_len>]] [-p {64|33}] [-o <outsuffix>]\\\n\
12     [-l <minlen>] [-C] [-D] [-Q] [-n <rename_prefix>] [-r <discarded.lst>]\\\n\
13 <    <input.fq>[,<input_mates.fq>\n\
13 >   [-a <min_poly>] [-A] <input.fq>[,<input_mates.fq>\n\
14   \n\
15 < Trim low quality bases at the 3' end and can trim adapter sequence(s), filter\n\
15 > Trim low quality bases at the 3' end and can trim adaptor sequence(s), filter\n\
16   for low complexity and collapse duplicate reads.\n\
17   If read pairs should be trimmed and kept together (i.e. without discarding\n\
18   one read in a pair), the two file names should be given delimited by a comma\n\
# Line 24 | Line 26
26      file(s) named <input>.<outsuffix> which will be created in the \n\
27      current (working) directory; (writes to stdout if -o- is given);\n\
28      a suffix ending with .gz, .gzip or .bz2 will enforce compression\n\
29 < -f  file with adapter sequences to trim, each line having this format:\n\
30 <    <5'-adapter-sequence> <3'-adapter-sequence>\n\
31 < -5  trim the given adapter or primer sequence at the 5' end of each read\n\
29 > -f  file with adaptor sequences to trim, each line having this format:\n\
30 >    <5'-adaptor-sequence> <3'-adaptor-sequence>\n\
31 > -5  trim the given adaptor or primer sequence at the 5' end of each read\n\
32      (e.g. -5 CGACAGGTTCAGAGTTCTACAGTCCGACGATC)\n\
33 < -3  trim the given adapter sequence at the 3' end of each read\n\
33 > -3  trim the given adaptor sequence at the 3' end of each read\n\
34      (e.g. -3 TCGTATGCCGTCTTCTGCTTG)\n\
35 < -a  minimum length of exact match to adaptor sequence at the proper end (6)\n\
35 > -a  minimum exact match to adaptor sequence to trim at the end (6)\n\
36 > -A  disable polyA/T trimming (enabled by default)\n\
37 > -y  minimum length of poly-A/T run to remove (6)\n\
38   -q  trim bases with quality value lower than <minq> (starting at the 3' end)\n\
39   -t  for -q option, maximum trimming at the 3' end is limited to <trim_max_len>\n\
40   -m  maximum percentage of Ns allowed in a read after trimming (default 7)\n\
# Line 39 | Line 43
43   -r  write a simple \"trash report\" file listing the discarded reads with a\n\
44      one letter code for the reason why they were trashed\n\
45   -D  apply a low-complexity (dust) filter and discard any read that has over\n\
46 <    50% of its length detected as low complexity\n\
46 >    50%% of its length detected as low complexity\n\
47   -C  collapse duplicate reads and add a prefix with their count to the read\n\
48      name (e.g. for microRNAs)\n\
49   -p  input is phred64/phred33 (use -p64 or -p33)\n\
50   -Q  convert quality values to the other Phred qv type\n\
51 < -V  verbose processing\n\
51 > -V  be verbose when done (print summary counts)\n\
52   "
53 <
53 >
54   //-z  for -o option, the output stream(s) will be first piped into the given\n
55   //   <zcmd> command, which must output to stdout (e.g. -z 'bzip2 -9 -c')\n
56  
57 <
58 < // example 3' adapter for miRNAs: TCGTATGCCGTCTTCTGCTTG
57 >
58 > // example 3' adaptor for miRNAs: TCGTATGCCGTCTTCTGCTTG
59  
60   //For paired reads sequencing:
61   //3' : ACACTCTTTCCCTACACGACGCTCTTCCGATCT
# Line 60 | Line 64
64   //FILE* f_out2=NULL; //for paired reads
65   //FILE* f_in=NULL; //input fastq (stdin if not provided)
66   //FILE* f_in2=NULL; //for paired reads
67 +
68   FILE* freport=NULL;
69  
70   bool debug=false;
71   bool verbose=false;
72   bool doCollapse=false;
73   bool doDust=false;
74 + bool doPolyTrim=true;
75   bool fastaOutput=false;
76   bool trashReport=false;
77 < //bool rawFormat=false;
77 > bool revCompl=false; //also reverse complement adaptor sequences
78   int min_read_len=16;
79   double max_perc_N=7.0;
80   int dust_cutoff=16;
81   bool isfasta=false;
82   bool convert_phred=false;
83 < GStr outsuffix; // -o
78 < //GStr adapter3;
79 < //GStr adapter5;
83 > GStr outsuffix; // -o
84   GStr prefix;
85   GStr zcmd;
86 < int num_trimmed5=0;
87 < int num_trimmed3=0;
88 < int num_trimmedN=0;
89 < int num_trimmedQ=0;
90 < int min_trimmed5=INT_MAX;
91 < int min_trimmed3=INT_MAX;
86 > char isACGT[256];
87 >
88 > uint num_trimN=0; //reads trimmed by N%
89 > uint num_trimQ=0; //reads trimmed by qv threshold
90 > uint num_trimV=0; //reads trimmed by adapter match
91 > uint num_trimA=0; //reads trimmed by polyA
92 > uint num_trimT=0; //reads trimmed by polyT
93 > uint num_trim5=0; //number of reads trimmed at 5' end
94 > uint num_trim3=0; //number of reads trimmed at 3' end
95 >
96 > uint64 b_totalIn=0; //total number of input bases
97 > uint64 b_totalN=0;  //total number of undetermined bases found in input bases
98 >
99 > uint64 b_trimN=0; //total number of bases trimmed due to N-trim
100 > uint64 b_trimQ=0; //number of bases trimmed due to qv threshold
101 > uint64 b_trimV=0; //total number of bases trimmed due to adaptor matches
102 > uint64 b_trimA=0; //number of bases trimmed due to poly-A tails
103 > uint64 b_trimT=0; //number of bases trimmed due to poly-T tails
104 > uint64 b_trim5=0; //total bases trimmed on the 5' side
105 > uint64 b_trim3=0; //total bases trimmed on the 3' side
106 > //int min_trimmed5=INT_MAX;
107 > //int min_trimmed3=INT_MAX;
108  
109   int qvtrim_qmin=0;
110   int qvtrim_max=0;  //for -q, do not trim at 3'-end more than this number of bases
111   int qv_phredtype=0; // could be 64 or 33 (0 means undetermined yet)
112   int qv_cvtadd=0; //could be -31 or +31
113  
114 < int a3len=0;
115 < int a5len=0;
116 < // adaptor matching metrics -- for extendMatch() function
117 < const int a_m_score=2; //match score
118 < const int a_mis_score=-3; //mismatch
119 < const int a_dropoff_score=7;
120 < int a_min_score=12; //an exact match of 6 bases at the proper ends WILL be trimmed
121 < const int a_min_chain_score=15; //for gapped alignments
122 <
123 < class CSegChain;
124 <
125 < class CSegPair {
126 <  public:
127 <   GSeg a;
128 <   GSeg b; //the adapter segment
129 <   int score;
130 <   int flags;
131 <   CSegChain* chain;
132 <   CSegPair(int astart=0, int aend=0, int bstart=0, int bend=0, int mscore=0):a(astart,aend),b(bstart, bend) {
133 <      score=mscore;
134 <      if (score==0) score=a.len()*a_m_score;
135 <      flags=0;
136 <      chain=NULL;
137 <      }
138 <   int len() { return  a.len(); }
139 <   bool operator==(CSegPair& d){
140 <      //return (a.start==d.a.start && a.end==d.a.end && b.start==d.b.start && b.end==d.b.end);
141 <      //make equal even segments that are included into one another:
142 <      return (d.a.start>=a.start && d.a.end<=a.end && d.b.start>=b.start && d.b.end<=b.end);
143 <      }
144 <   bool operator>(CSegPair& d){ //ordering based on b (adaptor) start coord and score
145 <     if (b.start==d.b.start) {
146 <        if (score==d.score) {
127 <           //just try to be consistent:
128 <           if (b.end==d.b.end) {
129 <             return (a.start==d.a.start)?(a.end<d.a.end):(a.start<d.a.start);
130 <             }
131 <           return (b.end>d.b.end);
132 <           }
133 <         else return (score<d.score);
134 <        }
135 <     return (b.start>d.b.start);
136 <     }
137 <   bool operator<(CSegPair& d){ //ordering based on b (adaptor) coord
138 <     /*if (b.start==d.b.start && b.end==d.b.end) {
139 <          return (a.start==d.a.start)?(a.end<d.a.end):(a.start<d.a.start);
140 <          }
141 <     return (b.start==d.b.start)?(b.end<d.b.end):(b.start<d.b.start);*/
142 <     if (b.start==d.b.start) {
143 <        if (score==d.score) {
144 <           //just try to be consistent:
145 <           if (b.end==d.b.end) {
146 <             return (a.start==d.a.start)?(a.end>d.a.end):(a.start>d.a.start);
147 <             }
148 <           return (b.end<d.b.end);
149 <           }
150 <         else return (score>d.score);
114 > // adaptor matching metrics -- for X-drop ungapped extension
115 > //const int match_reward=2;
116 > //const int mismatch_penalty=3;
117 > int match_reward=1;
118 > int mismatch_penalty=5;
119 > int Xdrop=16;
120 > int minEndAdapter=6;
121 >
122 >
123 > const int poly_m_score=2; //match score
124 > const int poly_mis_score=-3; //mismatch
125 > const int poly_dropoff_score=7;
126 > int poly_minScore=12; //i.e. an exact match of 6 bases at the proper ends WILL be trimmed
127 >
128 > const char *polyA_seed="AAAA";
129 > const char *polyT_seed="TTTT";
130 >
131 > struct CASeqData {
132 >   //positional data for every possible hexamer in an adaptor
133 >   GVec<uint16>* pz[4096]; //0-based coordinates of all possible hexamers in the adaptor sequence
134 >   GVec<uint16>* pzr[4096]; //0-based coordinates of all possible hexamers for the reverse complement of the adaptor sequence
135 >   GStr seq; //actual adaptor sequence data
136 >   GStr seqr; //reverse complement sequence
137 >   int amlen; //fraction of adaptor length matching that's
138 >              //enough to consider the alignment
139 >   GAlnTrimType trim_type;
140 >   bool use_reverse;
141 >   CASeqData(bool rev=false):seq(),seqr(),
142 >             amlen(0), use_reverse(rev) {
143 >     trim_type=galn_None; //should be updated later!
144 >     for (int i=0;i<4096;i++) {
145 >        pz[i]=NULL;
146 >        pzr[i]=NULL;
147          }
152     return (b.start<d.b.start);
148       }
154 };
149  
150 < int cmpSegEnds(pointer sa, pointer sb) { //sort by adaptor seg ends AND score
151 < CSegPair& x = *(CSegPair *)sa;
152 < CSegPair& y = *(CSegPair *)sb;
153 < /*
154 < if (x.b.end==y.b.end) {
155 <     if (x.b.start==y.b.start) {
156 <         if (x.a.end==y.a.end) {
157 <            if (x.a.start==y.a.start) return 0;
158 <            return ((x.a.start>y.a.start) ? -1 : 1);
159 <            }
160 <          else {
161 <            return ((x.a.end>y.a.end) ? -1 : 1);
162 <            }
163 <          }
164 <      else {
165 <       return ((x.b.start>y.b.start) ? -1 : 1);
166 <       }
173 <     }
174 <    else {
175 <     return ((x.b.end>y.b.end) ? -1 : 1);
176 <     }
177 < */
178 <  if (x.b.end==y.b.end) {
179 <     if (x.score==y.score) {
180 <     if (x.b.start==y.b.start) {
181 <         if (x.a.end==y.a.end) {
182 <            if (x.a.start==y.a.start) return 0;
183 <            return ((x.a.start<y.a.start) ? -1 : 1);
184 <            }
185 <          else {
186 <            return ((x.a.end<y.a.end) ? -1 : 1);
187 <            }
188 <          }
189 <      else {
190 <       return ((x.b.start<y.b.start) ? -1 : 1);
191 <       }
192 <      } else return ((x.score>y.score) ? -1 : 1);
193 <     }
194 <    else {
195 <     return ((x.b.end>y.b.end) ? -1 : 1);
150 >   void update(const char* s) {
151 >         seq=s;
152 >         table6mers(seq.chars(), seq.length(), pz);
153 >         amlen=calc_safelen(seq.length());
154 >         if (!use_reverse) return;
155 >         //reverse complement
156 >         seqr=s;
157 >         int slen=seq.length();
158 >         for (int i=0;i<slen;i++)
159 >           seqr[i]=ntComplement(seq[slen-i-1]);
160 >         table6mers(seqr.chars(), seqr.length(), pzr);
161 >   }
162 >
163 >   ~CASeqData() {
164 >     for (int i=0;i<4096;i++) {
165 >       delete pz[i];
166 >       delete pzr[i];
167       }
168 +   }
169 + };
170  
171 < }
171 > GVec<CASeqData> adaptors5;
172 > GVec<CASeqData> adaptors3;
173  
174 < class CSegChain:public GList<CSegPair> {
175 < public:
202 <   uint astart;
203 <   uint aend;
204 <   uint bstart;
205 <   uint bend;
206 <   int score;
207 <   bool endSort;
208 <  CSegChain(bool aln5=false):GList<CSegPair>(true,true,true) {//sorted, free elements, unique
209 <   //as SegPairs are inserted, they will be sorted by a.start coordinate
210 <   score=0;
211 <   astart=MAX_UINT;
212 <   aend=0;
213 <   bstart=MAX_UINT;
214 <   bend=0;
215 <   endSort=aln5;
216 <   if (aln5) { setSorted(cmpSegEnds); }
217 <   }
218 < bool operator==(CSegChain& d) {
219 <   //return (score==d.score);
220 <    return (astart==d.astart && aend==d.aend && bstart==d.bstart && bend==d.bend);
221 <   }
222 < bool operator>(CSegChain& d) { // order based on b (adaptor) coordinate
223 <   //return (score<d.score);
224 <   if (bstart==d.bstart && bend==d.bend) {
225 <          return (astart==d.astart)?(aend>d.aend):(astart>d.astart);
226 <          }
227 <     return (bstart==d.bstart)?(bend>d.bend):(bstart>d.bstart);
228 <   }
229 < bool operator<(CSegChain& d) {
230 <   //return (score>d.score);
231 <   if (bstart==d.bstart && bend==d.bend) {
232 <          return (astart==d.astart)?(aend<d.aend):(astart<d.astart);
233 <          }
234 <     return (bstart==d.bstart)?(bend<d.bend):(bstart<d.bstart);
235 <   }
236 < void addSegPair(CSegPair* segp) {
237 <   if (AddIfNew(segp)!=segp) return;
238 <   score+=segp->score;
239 <   if (astart>segp->a.start) astart=segp->a.start;
240 <   if (aend<segp->a.end) aend=segp->a.end;
241 <   if (bstart>segp->b.start) bstart=segp->b.start;
242 <   if (bend<segp->b.end) bend=segp->b.end;
243 <   }
244 < //for building actual chains:
245 < bool extendChain(CSegPair* segp) { //segp expected to be "Greater Than" current chain
246 <   int bgap=0;
247 <   int agap=0;
248 <   //if (endSort) {
249 <   if (bstart>segp->b.start) {
250 <      bgap = (int)(bstart-segp->b.end);
251 <      if (abs(bgap)>2) return false;
252 <      agap = (int)(astart-segp->a.end);
253 <      if (abs(agap)>2) return false;
254 <      }
255 <     else {
256 <      bgap = (int) (segp->b.start-bend);
257 <      if (abs(bgap)>2) return false;
258 <      agap = (int)(segp->a.start-aend);
259 <      if (abs(agap)>2) return false;
260 <      }
261 <   if (agap*bgap<0) return false;
262 <   addSegPair(segp);
263 <   score-=abs(agap)+abs(bgap);
264 <   return true;
265 <   }
266 < };
174 > CGreedyAlignData* gxmem_l=NULL;
175 > CGreedyAlignData* gxmem_r=NULL;
176  
177   // element in dhash:
178   class FqDupRec {
# Line 313 | Line 222
222  
223   GHash<FqDupRec> dhash; //hash to keep track of duplicates
224  
225 < void setupFiles(FILE*& f_in, FILE*& f_in2, FILE*& f_out, FILE*& f_out2,
226 <                       GStr& s, GStr& infname, GStr& infname2);
225 > void addAdaptor(GVec<CASeqData>& adaptors, GStr& seq, GAlnTrimType trim_type);
226 > int loadAdaptors(const char* fname);
227 >
228 > void setupFiles(FILE*& f_in, FILE*& f_in2, FILE*& f_out, FILE*& f_out2,
229 >                       GStr& s, GStr& infname, GStr& infname2);
230   // uses outsuffix to generate output file names and open file handles as needed
231 <
231 >
232   void writeRead(FILE* f_out, GStr& rname, GStr& rinfo, GStr& rseq, GStr& rqv, int& outcounter);
233   void trash_report(char trashcode, GStr& rname, FILE* freport);
234  
235 < bool getFastxRec(GLineReader& fq, GStr& rseq, GStr& rqv,
235 > bool getFastxRec(GLineReader& fq, GStr& rseq, GStr& rqv,
236            GStr& rname, GStr& rinfo, GStr& infname);
237  
238 < char process_read(GStr& rname, GStr& rseq, GStr& rqv, int &l5, int &l3);
238 > char process_read(GStr& rname, GStr& rseq, GStr& rqv, int &l5, int &l3);
239   //returns 0 if the read was untouched, 1 if it was trimmed and a trash code if it was trashed
240  
241   bool ntrim(GStr& rseq, int &l5, int &l3); //returns true if any trimming occured
242   bool qtrim(GStr& qvs, int &l5, int &l3); //return true if any trimming occured
243   int dust(GStr& seq);
244 < bool trim_adapter3(GStr& seq, int &l5, int &l3); //returns true if any trimming occured
245 < bool trim_adapter5(GStr& seq, int &l5, int &l3); //returns true if any trimming occured
244 > bool trim_poly5(GStr &seq, int &l5, int &l3, const char* poly_seed); //returns true if any trimming occured
245 > bool trim_poly3(GStr &seq, int &l5, int &l3, const char* poly_seed);
246 > bool trim_adaptor5(GStr& seq, int &l5, int &l3); //returns true if any trimming occured
247 > bool trim_adaptor3(GStr& seq, int &l5, int &l3);
248  
249   void convertPhred(char* q, int len);
250   void convertPhred(GStr& q);
251  
252   int main(int argc, char * const argv[]) {
253 <  GArgs args(argc, argv, "YQDCVl:d:3:5:m:n:r:p:q:f:t:o:z:a:");
253 >  GArgs args(argc, argv, "MATCH=MISMATCH=XDROP=YQDCRVAl:d:3:5:m:n:r:p:q:f:t:o:z:a:y:");
254    int e;
255    if ((e=args.isError())>0) {
256        GMessage("%s\nInvalid argument: %s\n", USAGE, argv[e]);
# Line 347 | Line 261
261    convert_phred=(args.getOpt('Q')!=NULL);
262    doCollapse=(args.getOpt('C')!=NULL);
263    doDust=(args.getOpt('D')!=NULL);
264 +  revCompl=(args.getOpt('R')!=NULL);
265 +  if (args.getOpt('A')) doPolyTrim=false;
266    /*
267    rawFormat=(args.getOpt('R')!=NULL);
268    if (rawFormat) {
# Line 355 | Line 271
271    */
272    prefix=args.getOpt('n');
273    GStr s=args.getOpt('l');
274 <  if (!s.is_empty())
274 >  if (!s.is_empty())
275       min_read_len=s.asInt();
276    s=args.getOpt('m');
277 <  if (!s.is_empty())
277 >  if (!s.is_empty())
278       max_perc_N=s.asDouble();
279    s=args.getOpt('d');
280    if (!s.is_empty()) {
# Line 373 | Line 289
289    if (!s.is_empty()) {
290       qvtrim_max=s.asInt();
291       }
292 +  s=args.getOpt("MATCH");
293 +  if (!s.is_empty())
294 +            match_reward=s.asInt();
295 +  s=args.getOpt("MISMATCH");
296 +  if (!s.is_empty())
297 +            mismatch_penalty=s.asInt();
298 +  s=args.getOpt("XDROP");
299 +  if (!s.is_empty())
300 +            Xdrop=s.asInt();
301 +
302    s=args.getOpt('p');
303    if (!s.is_empty()) {
304       int v=s.asInt();
# Line 384 | Line 310
310          qv_phredtype=64;
311          qv_cvtadd=-31;
312          }
313 <       else
313 >       else
314           GMessage("%s\nInvalid value for -p option (can only be 64 or 33)!\n",USAGE);
315       }
316 <  if (args.getOpt('3')!=NULL) {
317 <    adapter3=args.getOpt('3');
318 <    adapter3.upper();
319 <    a3len=adapter3.length();
320 <    }
321 <  if (args.getOpt('5')!=NULL) {
322 <    adapter5=args.getOpt('5');
323 <    adapter5.upper();
324 <    a5len=adapter5.length();
316 >  memset((void*)isACGT, 0, 256);
317 >  isACGT['A']=isACGT['a']=isACGT['C']=isACGT['c']=1;
318 >  isACGT['G']=isACGT['g']=isACGT['T']=isACGT['t']=1;
319 >  s=args.getOpt('f');
320 >  if (!s.is_empty()) {
321 >   loadAdaptors(s.chars());
322 >   }
323 >  bool fileAdaptors=adaptors5.Count()+adaptors3.Count();
324 >  s=args.getOpt('5');
325 >  if (!s.is_empty()) {
326 >    if (fileAdaptors)
327 >      GError("Error: options -5 and -f cannot be used together!\n");
328 >    s.upper();
329 >    addAdaptor(adaptors5, s, galn_TrimLeft);
330      }
331 +  s=args.getOpt('3');
332 +  if (!s.is_empty()) {
333 +    if (fileAdaptors)
334 +      GError("Error: options -3 and -f cannot be used together!\n");
335 +      s.upper();
336 +      addAdaptor(adaptors3, s, galn_TrimRight);
337 +    }
338 +  s=args.getOpt('y');
339 +  if (!s.is_empty()) {
340 +     int minmatch=s.asInt();
341 +     if (minmatch>2)
342 +        poly_minScore=minmatch*poly_m_score;
343 +     else GMessage("Warning: invalid -y option, ignored.\n");
344 +     }
345    s=args.getOpt('a');
346    if (!s.is_empty()) {
347 <     int a_minmatch=s.asInt();
348 <     a_min_score=a_minmatch<<1;
347 >     int minmatch=s.asInt();
348 >     if (minmatch>2)
349 >        minEndAdapter=minmatch;
350 >     else GMessage("Warning: invalid -a option, ignored.\n");
351       }
352 <  
352 >
353 >
354    if (args.getOpt('o')!=NULL) outsuffix=args.getOpt('o');
355                           else outsuffix="-";
356    trashReport=  (args.getOpt('r')!=NULL);
# Line 419 | Line 367
367    if (trashReport)
368      openfw(freport, args, 'r');
369    char* infile=NULL;
370 +
371 +  if (adaptors5.Count()>0)
372 +    //gxmem_l=new CGreedyAlignData(match_reward, mismatch_penalty, Xdrop-2);
373 +        gxmem_l=new CGreedyAlignData(match_reward, mismatch_penalty, Xdrop);
374 +  if (adaptors3.Count()>0)
375 +    gxmem_r=new CGreedyAlignData(match_reward, mismatch_penalty, Xdrop);
376 +
377    while ((infile=args.nextNonOpt())!=NULL) {
378 +    //for each input file
379      int incounter=0; //counter for input reads
380      int outcounter=0; //counter for output reads
381      int trash_s=0; //too short from the get go
382      int trash_Q=0;
383      int trash_N=0;
384      int trash_D=0;
385 <    int trash_A3=0;
386 <    int trash_A5=0;
385 >    int trash_poly=0;
386 >    int trash_V=0;
387 >    num_trimN=0;
388 >    num_trimQ=0;
389 >    num_trimV=0;
390 >    num_trimA=0;
391 >    num_trimT=0;
392 >    num_trim5=0;
393 >    num_trim3=0;
394 >
395 >    b_totalIn=0;
396 >    b_totalN=0;
397 >    b_trimN=0;
398 >    b_trimQ=0;
399 >    b_trimV=0;
400 >    b_trimA=0;
401 >    b_trimT=0;
402 >    b_trim5=0;
403 >    b_trim3=0;
404 >
405      s=infile;
406      GStr infname;
407      GStr infname2;
# Line 450 | Line 424
424         int a5=0, a3=0, b5=0, b3=0;
425         char tcode=0, tcode2=0;
426         tcode=process_read(seqid, rseq, rqv, a5, a3);
427 <       //if (!doCollapse) {
428 <         if (fq2!=NULL) {
427 >       if (a5>0) {
428 >          b_trim5+=a5;
429 >          num_trim5++;
430 >          }
431 >       if (a3<rseq.length()-1) {
432 >          b_trim3+=rseq.length()-a3-1;
433 >          num_trim3++;
434 >          }
435 >       if (fq2!=NULL) {
436              getFastxRec(*fq2, rseq2, rqv2, seqid2, seqinfo2, infname2);
437              if (seqid.substr(0,seqid.length()-1)!=seqid2.substr(0,seqid2.length()-1)) {
438                 GError("Error: no paired match for read %s vs %s (%s,%s)\n",
439                    seqid.chars(), seqid2.chars(), infname.chars(), infname2.chars());
440                 }
441              tcode2=process_read(seqid2, rseq2, rqv2, b5, b3);
442 +            if (b5>0) {
443 +               b_trim5+=b5;
444 +               num_trim5++;
445 +               }
446 +            if (b3<rseq2.length()-1) {
447 +               b_trim3+=rseq2.length()-b3-1;
448 +               num_trim3++;
449 +               }
450              //decide what to do with this pair and print rseq2 if the pair makes it
451              if (tcode>1 && tcode2<=1) {
452                 //"untrash" rseq
# Line 483 | Line 472
472                 int nocounter=0;
473                 writeRead(f_out2, seqid2, seqinfo2, rseq2, rqv2, nocounter);
474                 }
475 <            } //paired read
487 <       // }
475 >            } //pair read
476         if (tcode>1) { //trashed
477 +         #ifdef GDEBUG
478 +         GMessage(" !!!!TRASH code = %c\n",tcode);
479 +         #endif
480            if (tcode=='s') trash_s++;
481 +          else if (tcode=='A' || tcode=='T') trash_poly++;
482              else if (tcode=='Q') trash_Q++;
483                else if (tcode=='N') trash_N++;
484                 else if (tcode=='D') trash_D++;
485 <                else if (tcode=='3') trash_A3++;
486 <                 else if (tcode=='5') trash_A5++;
485 >                else if (tcode=='V') trash_V++;
486 >                 //else if (tcode=='5') trash_A5++;
487            if (trashReport) trash_report(tcode, seqid, freport);
488            }
489           else if (!doCollapse) { //write it
# Line 499 | Line 491
491              rseq=rseq.substr(a5,a3-a5+1);
492              if (!rqv.is_empty()) rqv=rqv.substr(a5,a3-a5+1);
493              }
494 +         #ifdef GDEBUG
495 +            GMessage("  After trimming:\n");
496 +            GMessage("%s\n",rseq.chars());
497 +         #endif
498            writeRead(f_out, seqid, seqinfo, rseq, rqv, outcounter);
499            }
500         } //for each fastq record
# Line 526 | Line 522
522                 }
523              }
524           outcounter++;
525 <         if (qd->count>maxdup_count) {
525 >         if (qd->count>maxdup_count) {
526              maxdup_count=qd->count;
527              maxdup_seq=seq;
528              }
529           if (isfasta) {
530             if (prefix.is_empty()) {
531 <             fprintf(f_out, ">%s_x%d\n%s\n", qd->firstname, qd->count,
531 >             fprintf(f_out, ">%s_x%d\n%s\n", qd->firstname, qd->count,
532                             rseq.chars());
533               }
534             else { //use custom read name
# Line 543 | Line 539
539           else { //fastq format
540            if (convert_phred) convertPhred(qd->qv, qd->len);
541            if (prefix.is_empty()) {
542 <            fprintf(f_out, "@%s_x%d\n%s\n+\n%s\n", qd->firstname, qd->count,
542 >            fprintf(f_out, "@%s_x%d\n%s\n+\n%s\n", qd->firstname, qd->count,
543                             rseq.chars(), qd->qv);
544              }
545            else { //use custom read name
# Line 559 | Line 555
555      if (verbose) {
556         if (paired_reads) {
557             GMessage(">Input files : %s , %s\n", infname.chars(), infname2.chars());
558 <           GMessage("Number of input pairs :%9d\n", incounter);
559 <           GMessage("         Output pairs :%9d\n", outcounter);
558 >           GMessage("Number of input pairs :%9u\n", incounter);
559 >           GMessage("         Output pairs :%9u\t(%u trashed)\n", outcounter, incounter-outcounter);
560             }
561           else {
562             GMessage(">Input file : %s\n", infname.chars());
563             GMessage("Number of input reads :%9d\n", incounter);
564 <           GMessage("         Output reads :%9d\n", outcounter);
564 >           GMessage("         Output reads :%9d  (%u trashed)\n", outcounter, incounter-outcounter);
565             }
566 <       GMessage("------------------------------------\n");
567 <       if (num_trimmed5)
568 <          GMessage("           5' trimmed :%9d  (min. trim: %d)\n", num_trimmed5, min_trimmed5);
569 <       if (num_trimmed3)
570 <          GMessage("           3' trimmed :%9d  (min. trim: %d)\n", num_trimmed3, min_trimmed3);
566 >       GMessage("--------------- Read trimming: ------------------\n");
567 >       if (num_trim5)
568 >          GMessage("           5' trimmed :%9u\n", num_trim5);
569 >       if (num_trim3)
570 >          GMessage("           3' trimmed :%9u\n", num_trim3);
571 >       if (num_trimQ)
572 >          GMessage("         q.v. trimmed :%9u\n", num_trimQ);
573 >       if (num_trimN)
574 >          GMessage("            N trimmed :%9u\n", num_trimN);
575 >       if (num_trimT)
576 >          GMessage("       poly-T trimmed :%9u\n", num_trimT);
577 >       if (num_trimA)
578 >          GMessage("       poly-A trimmed :%9u\n", num_trimA);
579 >       if (num_trimV)
580 >          GMessage("      Adapter trimmed :%9u\n", num_trimV);
581         GMessage("------------------------------------\n");
582         if (trash_s>0)
583 <         GMessage("     Trashed by length:%9d\n", trash_s);
583 >         GMessage("Trashed by initial len:%9d\n", trash_s);
584         if (trash_N>0)
585           GMessage("         Trashed by N%%:%9d\n", trash_N);
586         if (trash_Q>0)
587           GMessage("Trashed by low quality:%9d\n", trash_Q);
588 <       if (trash_A5>0)
589 <         GMessage(" Trashed by 5' adapter:%9d\n", trash_A5);
590 <       if (trash_A3>0)
591 <         GMessage(" Trashed by 3' adapter:%9d\n", trash_A3);
588 >       if (trash_poly>0)
589 >         GMessage("   Trashed by poly-A/T:%9d\n", trash_poly);
590 >       if (trash_V>0)
591 >         GMessage("    Trashed by adaptor:%9d\n", trash_V);
592 >       GMessage("\n--------------- Base counts  --------------------\n");
593 >       GMessage("    Input bases   :%12llu\n", b_totalIn);
594 >       double percN=100.0* ((double)b_totalN/(double)b_totalIn);
595 >       GMessage("          N bases :%12llu (%4.2f%%)\n", b_totalN, percN);
596 >       GMessage("   trimmed from 5':%12llu\n", b_trim5);
597 >       GMessage("   trimmed from 3':%12llu\n", b_trim3);
598 >       GMessage("\n");
599 >       if (b_trimQ)
600 >       GMessage("     q.v. trimmed :%12llu\n", b_trimQ);
601 >       if (b_trimN)
602 >       GMessage("        N trimmed :%12llu\n", b_trimN);
603 >       if (b_trimT)
604 >       GMessage("   poly-T trimmed :%12llu\n", b_trimT);
605 >       if (b_trimA)
606 >       GMessage("   poly-A trimmed :%12llu\n", b_trimA);
607 >       if (b_trimV)
608 >       GMessage("  Adapter trimmed :%12llu\n", b_trimV);
609 >
610         }
611      if (trashReport) {
612            FWCLOSE(freport);
# Line 590 | Line 614
614      FWCLOSE(f_out);
615      FWCLOSE(f_out2);
616     } //while each input file
617 <
617 > delete gxmem_l;
618 > delete gxmem_r;
619   //getc(stdin);
620   }
621  
# Line 605 | Line 630
630     const char* seq;
631     bool valid;
632     NData() {
633 +    seqlen=0;
634      NCount=0;
635      end5=0;
636      end3=0;
# Line 635 | Line 661
661       perc_N=(n*100.0)/(end5-end3+1);
662       }
663   };
664 <
664 >
665   static NData feat;
666   int perc_lenN=12; // incremental distance from ends, in percentage of
667            // sequence length, where N-trimming is done (default:12 %) (autolimited to 20)
668 <          
668 >
669   void N_analyze(int l5, int l3, int p5, int p3) {
670   /* assumes feat was filled properly */
671   int old_dif, t5,t3,v;
672   if (l3<l5+2 || p5>p3 ) {
673     feat.end5=l5+1;
674     feat.end3=l3+1;
675 <   return;
675 >   return;
676     }
677  
678   t5=feat.NPos[p5]-l5;
679   t3=l3-feat.NPos[p3];
680   old_dif=p3-p5;
681   v=(int)((((double)(l3-l5))*perc_lenN)/100);
682 < if (v>20) v=20; /* enforce N-search limit for very long reads */
682 > if (v>20) v=20; /* enforce N-search limit for very long reads */
683   if (t5 < v ) {
684     l5=feat.NPos[p5]+1;
685     p5++;
# Line 670 | Line 696
696             feat.end3=l3+1;
697             return;
698             }
699 <    else
699 >    else
700        N_analyze(l5,l3, p5,p3);
701   }
702  
# Line 711 | Line 737
737   feat.init(rseq);
738   l5=feat.end5-1;
739   l3=feat.end3-1;
740 < N_analyze(feat.end5-1, feat.end3-1, 0, feat.NCount-1);
740 > N_analyze(feat.end5-1, feat.end3-1, 0, feat.NCount-1);
741   if (l5==feat.end5-1 && l3==feat.end3-1) {
742      if (feat.perc_N>max_perc_N) {
743             feat.valid=false;
# Line 729 | Line 755
755     return true;
756     }
757   feat.N_calc();
758 <
758 >
759   if (feat.perc_N>max_perc_N) {
760        feat.valid=false;
761        l3=l5+1;
# Line 741 | Line 767
767   //--------------- dust functions ----------------
768   class DNADuster {
769   public:
770 <  int dustword;
771 <  int dustwindow;
772 <  int dustwindow2;
770 >  int dustword;
771 >  int dustwindow;
772 >  int dustwindow2;
773    int dustcutoff;
774    int mv, iv, jv;
775    int counts[32*32*32];
# Line 838 | Line 864
864                      }
865             }
866           }
867 < //return first;
867 > //return first;
868   }
869   };
870  
# Line 856 | Line 882
882   return ncount;
883   }
884  
885 + struct SLocScore {
886 +  int pos;
887 +  int score;
888 +  SLocScore(int p=0,int s=0) {
889 +    pos=p;
890 +    score=s;
891 +    }
892 +  void set(int p, int s) {
893 +    pos=p;
894 +    score=s;
895 +    }
896 +  void add(int p, int add) {
897 +    pos=p;
898 +    score+=add;
899 +    }
900 + };
901  
902 < // ------------------ adapter matching - simple k-mer seed & extend, no indels for now
903 < //when a k-mer match is found, simply try to extend the alignment using a drop-off scheme
904 < //check minimum score and
905 < //for 3' adapter trimming:
906 < //     require that the right end of the alignment for either the adaptor OR the read must be
907 < //     < 3 distance from its right end
908 < // for 5' adapter trimming:
909 < //     require that the left end of the alignment for either the adaptor OR the read must
910 < //     be at coordinate < 3 from start
911 <
912 < bool extendMatch(const char* a, int alen, int ai,
913 <                 const char* b, int blen, int bi, int mlen, int& l5, int& l3, CSegChain& segs, bool end5=false) {
914 < //so the alignment starts at ai in a, bi in b, with a perfect match of length mlen
915 < #ifdef DEBUG
916 < GStr dbg(b);
875 < #endif
876 < //if (debug) {
877 < //  GMessage(">> in %s\n\textending hit: %s at position %d\n", a, (dbg.substr(bi, mlen)).chars(), ai);
878 < //  }
879 < int a_l=ai; //alignment coordinates on a
880 < int a_r=ai+mlen-1;
881 < int b_l=bi; //alignment coordinates on b
882 < int b_r=bi+mlen-1;
883 < int ai_maxscore=ai;
884 < int bi_maxscore=bi;
885 < int score=mlen*a_m_score;
886 < int maxscore=score;
887 < int mism5score=a_mis_score;
888 < if (end5 && ai<(alen>>1)) mism5score-=2; // increase penalty for mismatches at 5' end
889 < //try to extend to the left first, if possible
890 < while (ai>0 && bi>0) {
891 <   ai--;
892 <   bi--;
893 <   score+= (a[ai]==b[bi])? a_m_score : mism5score;
894 <   if (score>maxscore) {
895 <       ai_maxscore=ai;
896 <       bi_maxscore=bi;
897 <       maxscore=score;
898 <       }
899 <     else if (maxscore-score>a_dropoff_score) break;
900 <   }
901 < a_l=ai_maxscore;
902 < b_l=bi_maxscore;
903 < //if (debug) GMessage("  after l-extend: %*s%s\t\t(score=%d)\n",a_l," ",dbg.substr(b_l,b_r-b_l+1).chars(),maxscore);
904 < //now extend to the right
905 < ai_maxscore=a_r;
906 < bi_maxscore=b_r;
907 < ai=a_r;
908 < bi=b_r;
909 < score=maxscore;
910 < //sometimes there are extra AAAAs at the end of the read, ignore those
911 < if (strcmp(&a[alen-4],"AAAA")==0) {
912 <    alen-=3;
913 <    while (a[alen-1]=='A' && alen>ai) alen--;
914 <    }
915 < while (ai<alen-1 && bi<blen-1) {
916 <   ai++;
917 <   bi++;
918 <   //score+= (a[ai]==b[bi])? a_m_score : a_mis_score;
919 <   if (a[ai]==b[bi]) { //match
920 <      score+=a_m_score;
921 <      if (ai>=alen-2) {
922 <           score+=a_m_score-(alen-ai-1);
923 <           }
902 > bool trim_poly3(GStr &seq, int &l5, int &l3, const char* poly_seed) {
903 > if (!doPolyTrim) return false;
904 > int rlen=seq.length();
905 > l5=0;
906 > l3=rlen-1;
907 > int32 seedVal=*(int32*)poly_seed;
908 > char polyChar=poly_seed[0];
909 > //assumes N trimming was already done
910 > //so a poly match should be very close to the end of the read
911 > // -- find the initial match (seed)
912 > int lmin=GMAX((rlen-16), 0);
913 > int li;
914 > for (li=rlen-4;li>lmin;li--) {
915 >   if (seedVal==*(int*)&(seq[li])) {
916 >      break;
917        }
925    else { //mismatch
926      score+=a_mis_score;
927      }  
928   if (score>maxscore) {
929       ai_maxscore=ai;
930       bi_maxscore=bi;
931       maxscore=score;
932       }
933     else if (maxscore-score>a_dropoff_score) break;
918     }
919 <  a_r=ai_maxscore;
920 <  b_r=bi_maxscore;
921 <  int a_ovh3=alen-a_r-1;
922 <  int b_ovh3=blen-b_r-1;
923 <  int mmovh3=(a_ovh3<b_ovh3)? a_ovh3 : b_ovh3;
924 <  int mmovh5=(a_l<b_l)? a_l : b_l;
925 <  //if (debug) GMessage("  after r-extend: %*s%s\t\t(score=%d)\n",a_l," ",dbg.substr(b_l,b_r-b_l+1).chars(),maxscore);
926 < #ifdef DEBUG
927 <  if (debug) GMessage("     extended to: %*s\n",a_r+1,dbg.substr(b_l,b_r-b_l+1).chars());
928 < #endif
929 <  if (maxscore>=a_min_score && mmovh3<2 && mmovh5<2) {
930 <     if (a_l<a_ovh3) {
931 <        //adapter closer to the left end (typical for 5' adapter)
932 <        l5=a_r+1;
933 <        l3=alen-1;
919 > if (li<=lmin) return false;
920 > //seed found, try to extend it both ways
921 > //extend right
922 > int ri=li+3;
923 > SLocScore loc(ri, poly_m_score<<2);
924 > SLocScore maxloc(loc);
925 > //extend right
926 > while (ri<rlen-1) {
927 >   ri++;
928 >   if (seq[ri]==polyChar) {
929 >                loc.add(ri,poly_m_score);
930 >                }
931 >   else if (seq[ri]=='N') {
932 >                loc.add(ri,0);
933 >                }
934 >   else { //mismatch
935 >        loc.add(ri,poly_mis_score);
936 >        if (maxloc.score-loc.score>poly_dropoff_score) break;
937          }
938 <      else {
939 <        //adapter matching at the right end (typical for 3' adapter)
940 <        l5=0;
954 <        l3=a_l-1;
955 <        }
956 <     return true;
957 <     }
958 < else { //keep this segment pair for later (gapped alignment)
959 <   segs.addSegPair(new CSegPair(a_l+1, a_r+1, b_l+1, b_r+1, maxscore));
960 <   //this will also update min & max coordinates in segs (segs.astart, .aend, .bstart, .bend)
961 <   }
962 <  //do not trim:
963 <  l5=0;
964 <  l3=alen-1;
965 <  return false;
966 < }
967 <
968 < /*
969 < int getWordValue(const char* s, int wlen) {
970 < int r=0;
971 < while (wlen--) { r+=(((int)s[wlen])<<wlen) }
972 < return r;
973 < }
974 < */
975 < int get3mer_value(const char* s) {
976 < return (s[0]<<16)+(s[1]<<8)+s[2];
977 < }
978 <
979 < int w3_match(int qv, const char* str, int slen, int start_index=0) {
980 < if (start_index>=slen || start_index<0) return -1;
981 < for (int i=start_index;i<slen-3;i++) {
982 <   int rv=get3mer_value(str+i);
983 <   if (rv==qv) return i;
984 <   }
985 < return -1;
986 < }
987 <
988 < int w3_rmatch(int qv, const char* str, int slen, int end_index=-1) {
989 < if (end_index>=slen) return -1;
990 < if (end_index<0) end_index=slen-1;
991 < for (int i=end_index-2;i>=0;i--) {
992 <   int rv=get3mer_value(str+i);
993 <   if (rv==qv) return i;
994 <   }
995 < return -1;
996 < }
997 <
998 < int fast4match(int32 qv, const char* str, int slen, int start_index=0) {
999 < if (start_index>=slen || start_index<0) return -1;
1000 < for (int i=start_index;i<slen-4;i++) {
1001 <   int32* rv=(int32*)(str+i);
1002 <   if (*rv==qv) return i;
1003 <   }
1004 < return -1;
1005 < }
1006 <
1007 < int fast4rmatch(int32 qv, const char* str, int slen, int end_index=-1) {
1008 < if (end_index>=slen) return -1;
1009 < if (end_index<0) end_index=slen-1;
1010 < for (int i=end_index-3;i>=0;i--) {
1011 <   int32* rv=(int32*)(str+i);
1012 <   if (*rv==qv) return i;
1013 <   }
1014 < return -1;
1015 < }
1016 <
1017 < #ifdef DEBUG
1018 < void dbgPrintChain(CSegChain& chain, const char* aseq) {
1019 <  GStr s(aseq);
1020 <  for (int i=0;i<chain.Count();i++) {
1021 <   CSegPair& seg=*chain[i];
1022 <   GMessage("  dbg chain seg%d: %*s [%d-%d:%d-%d]\n",i,seg.a.start-1+seg.len(),
1023 <            s.substr(seg.b.start-1, seg.len()).chars(), seg.b.start,seg.b.end,seg.a.start,seg.a.end);
938 >   if (maxloc.score<=loc.score) {
939 >      maxloc=loc;
940 >      }
941     }
942 + ri=maxloc.pos;
943 + if (ri<rlen-6) return false; //no trimming wanted, too far from 3' end
944 + //ri = right boundary for the poly match
945 + //extend left
946 + loc.set(li, maxloc.score);
947 + maxloc.pos=li;
948 + while (li>0) {
949 +    li--;
950 +    if (seq[li]==polyChar) {
951 +                 loc.add(li,poly_m_score);
952 +                 }
953 +    else if (seq[li]=='N') {
954 +                 loc.add(li,0);
955 +                 }
956 +    else { //mismatch
957 +         loc.add(li,poly_mis_score);
958 +         if (maxloc.score-loc.score>poly_dropoff_score) break;
959 +         }
960 +    if (maxloc.score<=loc.score) {
961 +       maxloc=loc;
962 +       }
963 +    }
964 + li=maxloc.pos;
965 + if ((maxloc.score==poly_minScore && ri==rlen-1) ||
966 +    (maxloc.score>poly_minScore && ri>=rlen-3) ||
967 +    (maxloc.score>(poly_minScore*3) && ri>=rlen-8)) {
968 +  //trimming this li-ri match at 3' end
969 +    l3=li-1;
970 +    if (l3<0) l3=0;
971 +    return true;
972 +    }
973 + return false;
974   }
1026 #endif
975  
976 < bool trim_adapter3(GStr& seq, int&l5, int &l3) {
976 > bool trim_poly5(GStr &seq, int &l5, int &l3, const char* poly_seed) {
977 > if (!doPolyTrim) return false;
978   int rlen=seq.length();
979   l5=0;
980   l3=rlen-1;
981 < //first try a full match, we might get lucky
982 < int fi=-1;
983 < if ((fi=seq.index(adapter3))>=0) {
984 <   if (fi<rlen-fi-a3len) {//match is closer to the right end
985 <      l5=fi+a3len;
986 <      l3=rlen-1;
981 > int32 seedVal=*(int32*)poly_seed;
982 > char polyChar=poly_seed[0];
983 > //assumes N trimming was already done
984 > //so a poly match should be very close to the end of the read
985 > // -- find the initial match (seed)
986 > int lmax=GMIN(12, rlen-4);//how far from 5' end to look for 4-mer seeds
987 > int li;
988 > for (li=0;li<=lmax;li++) {
989 >   if (seedVal==*(int*)&(seq[li])) {
990 >      break;
991        }
1039    else {
1040      l5=0;
1041      l3=fi-1;
1042      }
1043   return true;
992     }
993 < #ifdef DEBUG
994 < if (debug) GMessage(">TRIM3 >>   Read: %s\n",seq.chars());
995 < #endif
996 <
997 < //also, for fast detection of other adapter-only reads that start past
998 < // the beginning of the adapter sequence, try to see if the first a3len-4
999 < // bases of the read are a substring of the adapter
1000 < if (rlen>a3len-3) {
1001 <   GStr rstart=seq.substr(1,a3len-4);
1002 <   if ((fi=adapter3.index(rstart))>=0) {
1003 <     l3=rlen-1;
1004 <     l5=a3len-4;
1005 <     while (fi+l5<a3len && l5<l3 && adapter3[fi+l5]==seq[l5]) l5++;
1006 <     return true;
1007 <     }
1008 <  }
1009 < CSegChain a3segs; //no chains here, just an ordered collection of segment pairs
1010 <  //check the easy cases - 11 bases exact match at the end
1011 < int fdlen=11;
1012 <  if (a3len<16) {
1065 <   fdlen=a3len>>1;
1066 <   }
1067 < if (fdlen>4) {
1068 <     //check if we're lucky enough to have the last 11 bases of the read a part of the adapter
1069 <     GStr rstart=seq.substr(-fdlen-3,fdlen);
1070 <     if ((fi=adapter3.index(rstart))>=0) {
1071 < #ifdef DEBUG
1072 <       if (debug) GMessage("  W11match found: %*s\n", rlen-3, (adapter3.substr(fi,fdlen)).chars());
1073 < #endif
1074 <       if (extendMatch(seq.chars(), rlen, rlen-fdlen-3,
1075 <                     adapter3.chars(), a3len, fi,  fdlen, l5,l3, a3segs))
1076 <            return true;
1077 <       }
1078 <     //another easy case: first 11 characters of the adaptor found as a substring of the read
1079 <     GStr bstr=adapter3.substr(0, fdlen);
1080 <     if ((fi=seq.rindex(bstr))>=0) {
1081 < #ifdef DEBUG
1082 <       if (debug) GMessage("  A11match found: %*s\n", fi+fdlen, bstr.chars());
1083 < #endif
1084 <       if (extendMatch(seq.chars(), rlen, fi,
1085 <                     adapter3.chars(), a3len, 0,  fdlen, l5,l3, a3segs))
1086 <            return true;
993 > if (li>lmax) return false;
994 > //seed found, try to extend it both ways
995 > //extend left
996 > int ri=li+3; //save rightmost base of the seed
997 > SLocScore loc(li, poly_m_score<<2);
998 > SLocScore maxloc(loc);
999 > while (li>0) {
1000 >    li--;
1001 >    if (seq[li]==polyChar) {
1002 >                 loc.add(li,poly_m_score);
1003 >                 }
1004 >    else if (seq[li]=='N') {
1005 >                 loc.add(li,0);
1006 >                 }
1007 >    else { //mismatch
1008 >         loc.add(li,poly_mis_score);
1009 >         if (maxloc.score-loc.score>poly_dropoff_score) break;
1010 >         }
1011 >    if (maxloc.score<=loc.score) {
1012 >       maxloc=loc;
1013         }
1014 <     } //tried to match 11 bases first
1015 <    
1016 < //no easy cases, so let's do the wmer hashing for the first 12 bases of the adaptor
1017 < //-- only extend if the match is in the 3' (ending) region of the read
1018 < int wordSize=3;
1019 < int hlen=12;
1020 < if (hlen>a3len-wordSize) hlen=a3len-wordSize;
1021 < int imin=rlen>>1; //last half of the read, left boundary for the wmer match
1022 < if (imin<a3len) { imin=GMIN(a3len, rlen-wordSize); }
1023 < imin=rlen-imin;
1024 < for (int iw=0;iw<hlen;iw++) {
1025 <   //int32* qv=(int32*)(adapter3.chars()+iw);
1026 <   int qv=get3mer_value(adapter3.chars()+iw);
1027 <   fi=-1;
1028 <   //while ((fi=fast4rmatch(*qv, seq.chars(), rlen, fi))>=0 && fi>=imin) {
1029 <   while ((fi=w3_rmatch(qv, seq.chars(), rlen, fi))>=0 && fi>=imin) {
1030 <     //GMessage(" ... fi=%d after w3_rmatch() (imin=%d)\n", fi, imin);
1014 >    }
1015 > li=maxloc.pos;
1016 > if (li>5) return false; //no trimming wanted, too far from 5' end
1017 > //li = right boundary for the poly match
1018 >
1019 > //extend right
1020 > loc.set(ri, maxloc.score);
1021 > maxloc.pos=ri;
1022 > while (ri<rlen-1) {
1023 >   ri++;
1024 >   if (seq[ri]==polyChar) {
1025 >                loc.add(ri,poly_m_score);
1026 >                }
1027 >   else if (seq[ri]=='N') {
1028 >                loc.add(ri,0);
1029 >                }
1030 >   else { //mismatch
1031 >        loc.add(ri,poly_mis_score);
1032 >        if (maxloc.score-loc.score>poly_dropoff_score) break;
1033 >        }
1034 >   if (maxloc.score<=loc.score) {
1035 >      maxloc=loc;
1036 >      }
1037 >   }
1038 > ri=maxloc.pos;
1039 > if ((maxloc.score==poly_minScore && li==0) ||
1040 >     (maxloc.score>poly_minScore && li<2)
1041 >     || (maxloc.score>(poly_minScore*3) && li<8)) {
1042 >    //adjust l5 to reflect this trimming of 5' end
1043 >    l5=ri+1;
1044 >    if (l5>rlen-1) l5=rlen-1;
1045 >    return true;
1046 >    }
1047 > return false;
1048 > }
1049  
1050 < #ifdef DEBUG
1051 <     if (debug) GMessage("    Wmatch found: %*s\n", fi+wordSize, (adapter3.substr(iw,wordSize)).chars());
1052 < #endif
1053 <     if (extendMatch(seq.chars(), rlen, fi, adapter3.chars(),
1054 <                   a3len, iw, wordSize, l5,l3, a3segs)) return true;
1055 <     fi--;
1056 <     if (fi<imin) break;
1057 <     }
1058 <   } //for each wmer in the first hlen bases of the adaptor
1059 < /*
1060 < //couldn't find a good trimming extension, hash 12 more bases of the adapter to collect more segment pairs there
1061 < //but only do this if we already have segment pairs collected in the last 12 bases of the adapter
1062 < if (a3segs.bstart>3 || a3segs.bend<(uint)(hlen-wordSize)) return false;
1063 < int hlen2=a3len-wordSize;
1064 < //if (hlen2>a3len-4) hlen2=a3len-4;
1065 < if (hlen2>hlen) {
1122 < #ifdef DEBUG
1123 <     if (debug && a3segs.Count()>0) {
1124 <        GMessage("  >>>>>2nd. hash: %s\n",seq.chars());
1050 > bool trim_adaptor3(GStr& seq, int&l5, int &l3) {
1051 > if (adaptors3.Count()==0) return false;
1052 > int rlen=seq.length();
1053 > l5=0;
1054 > l3=rlen-1;
1055 > bool trimmed=false;
1056 > GStr wseq(seq);
1057 > int wlen=rlen;
1058 > GXSeqData seqdata;
1059 > int numruns=revCompl ? 2 : 1;
1060 > GList<GXAlnInfo> bestalns(true, true, false);
1061 > for (int ai=0;ai<adaptors3.Count();ai++) {
1062 >   for (int r=0;r<numruns;r++) {
1063 >     if (r) {
1064 >          seqdata.update(adaptors3[ai].seqr.chars(), adaptors3[ai].seqr.length(),
1065 >                 adaptors3[ai].pzr, wseq.chars(), wlen, adaptors3[ai].amlen);
1066          }
1067 < #endif
1068 <     for (int iw=hlen;iw<hlen2;iw++) {
1069 <         //int* qv=(int32 *)(adapter3.chars()+iw);
1070 <         int qv=get3mer_value(adapter3.chars()+iw);
1071 <         fi=-1;
1072 <         //while ((fi=fast4rmatch(*qv, seq.chars(), rlen, fi))>=0 && fi>=imin) {
1073 <         while ((fi=w3_rmatch(qv, seq.chars(), rlen, fi))>=0 && fi>=imin) {
1074 <           extendMatch(seq.chars(), rlen, fi, adapter3.chars(),
1075 <                         a3len, iw, wordSize, l5,l3, a3segs);
1076 <           fi--;
1077 <           if (fi<imin) break;
1078 <           }
1079 <         } //for each wmer between hlen2 and hlen bases of the adaptor
1067 >     else {
1068 >            seqdata.update(adaptors3[ai].seq.chars(), adaptors3[ai].seq.length(),
1069 >                 adaptors3[ai].pz, wseq.chars(), wlen, adaptors3[ai].amlen);
1070 >        }
1071 >     GXAlnInfo* aln=match_adaptor(seqdata, adaptors3[ai].trim_type, minEndAdapter, gxmem_r, 86);
1072 >         if (aln) {
1073 >           if (aln->strong) {
1074 >                   trimmed=true;
1075 >                   bestalns.Add(aln);
1076 >                   break; //will check the rest next time
1077 >                   }
1078 >            else bestalns.Add(aln);
1079 >           }
1080 >   }//forward and reverse adaptors
1081 >   if (trimmed) break; //will check the rest in the next cycle
1082 >  }//for each 3' adaptor
1083 > if (bestalns.Count()>0) {
1084 >           GXAlnInfo* aln=bestalns[0];
1085 >           if (aln->sl-1 > wlen-aln->sr) {
1086 >                   //keep left side
1087 >                   l3-=(wlen-aln->sl+1);
1088 >                   if (l3<0) l3=0;
1089 >                   }
1090 >           else { //keep right side
1091 >                   l5+=aln->sr;
1092 >                   if (l5>=rlen) l5=rlen-1;
1093 >                   }
1094 >           //delete aln;
1095 >           //if (l3-l5+1<min_read_len) return true;
1096 >           wseq=seq.substr(l5,l3-l5+1);
1097 >           wlen=wseq.length();
1098 >           return true; //break the loops here to report a good find
1099       }
1100 < //lastly, analyze collected a3segs for a possible gapped alignment:
1141 < GList<CSegChain> segchains(false,true,false);
1142 < #ifdef DEBUG
1143 < if (debug && a3segs.Count()>0) {
1144 <   GMessage(">>>>>>>>>   Read: %s\n",seq.chars());
1145 <   }
1146 < #endif
1147 < for (int i=0;i<a3segs.Count();i++) {
1148 <   if (a3segs[i]->chain==NULL) {
1149 <       if (a3segs[i]->b.start>3) continue; //don't start a hopeless chain
1150 <       CSegChain* newchain=new CSegChain();
1151 <       newchain->setFreeItem(false);
1152 <       newchain->addSegPair(a3segs[i]);
1153 <       a3segs[i]->chain=newchain;
1154 <       segchains.Add(newchain); //just to free them when done
1155 <       }
1156 <   for (int j=i+1;j<a3segs.Count();j++) {
1157 <      CSegChain* chain=a3segs[i]->chain;
1158 <      if (chain->extendChain(a3segs[j])) {
1159 <          a3segs[j]->chain=chain;
1160 < #ifdef DEBUG
1161 <          if (debug) dbgPrintChain(*chain, adapter3.chars());
1162 < #endif      
1163 <          //save time by checking here if the extended chain is already acceptable for trimming
1164 <          if (chain->aend>(uint)(rlen-4) && chain->bstart<4 && chain->score>a_min_chain_score) {
1165 <            l5=0;
1166 <            l3=chain->astart-2;
1167 < #ifdef DEBUG
1168 <          if (debug && a3segs.Count()>0) {
1169 <            GMessage(">>> >> trimmed-3: %*s\n",l3-l5+1,seq.substr(l5,l3-l5+1).chars());
1170 <            }
1171 < #endif
1172 <            return true;
1173 <            }
1174 <          } //chain can be extended
1175 <      }
1176 <   } //collect segment alignments into chains
1177 < */  
1178 < return false; //no adapter parts found
1100 >  return false;
1101   }
1102  
1103 < bool trim_adapter5(GStr& seq, int&l5, int &l3) {
1104 < //if (debug) GMessage("trim_adapter5 on: %s\n", seq.chars());
1103 > bool trim_adaptor5(GStr& seq, int&l5, int &l3) {
1104 > if (adaptors5.Count()==0) return false;
1105   int rlen=seq.length();
1106   l5=0;
1107   l3=rlen-1;
1108 < //try to see if adapter is fully included in the read
1109 < int fi=-1;
1110 < if ((fi=seq.index(adapter5))>=0) {
1111 <   if (fi<rlen-fi-a5len) {//match is closer to the right end
1112 <      l5=fi+a5len;
1113 <      l3=rlen-1;
1114 <      }
1115 <    else {
1116 <      l5=0;
1117 <      l3=fi-1;
1118 <      }
1197 <   return true;
1198 <   }
1199 < #ifdef DEBUG
1200 < if (debug) GMessage(">TRIM5 >>   Read: %s\n",seq.chars());
1201 < #endif
1202 <
1203 < CSegChain a5segs(true); //list of segment pairs to analyze later if no extendMatch succeeded
1204 <
1205 < //try the easy way out first - look for an exact match of 11 bases
1206 < int fdlen=11;
1207 <  if (a5len<16) {
1208 <   fdlen=a5len>>1;
1209 <   }
1210 < if (fdlen>4) {
1211 <     GStr rstart=seq.substr(1,fdlen); //skip the first base as it's sometimes bogus
1212 <     if ((fi=adapter5.index(rstart))>=0) {
1213 < #ifdef DEBUG
1214 <       if (debug) GMessage("  W11match found: %*s\n", 1+fdlen, (adapter3.substr(fi,fdlen)).chars());
1215 < #endif
1216 <       if (extendMatch(seq.chars(), rlen, 1,
1217 <                     adapter5.chars(), a5len, fi,  fdlen, l5,l3, a5segs, true))
1218 <           return true;
1219 <       }
1220 <     //another easy case: last 11 characters of the adaptor found as a substring of the read
1221 <     GStr bstr=adapter5.substr(-fdlen);
1222 <     if ((fi=seq.index(bstr))>=0) {
1223 < #ifdef DEBUG
1224 <       if (debug) GMessage("  A11match found: %*s\n", fi+fdlen, bstr.chars());
1225 < #endif
1226 <       if (extendMatch(seq.chars(), rlen, fi,
1227 <                     adapter5.chars(), a5len, a5len-fdlen,  fdlen, l5,l3,a5segs,true))
1228 <          return true;
1229 <       }
1230 <     } //tried to matching at most 11 bases first
1231 <    
1232 < //-- no easy cases, do the wmer hashing for the last 12 bases of the adaptor
1233 < //-- only extend a wmer if it matches in the 5' (beginning) region of the read
1234 < int wordSize=3;
1235 < int hlen=12;
1236 < if (hlen>a5len-wordSize) hlen=a5len-wordSize;
1237 < int imax=rlen>>1; //first half of the read, right boundary for the wmer match
1238 < if (imax<a5len) { imax=GMIN(a5len, rlen-wordSize); }
1239 < for (int iw=0;iw<=hlen;iw++) {
1240 <   int apstart=a5len-iw-wordSize;
1241 <   fi=0;
1242 <   //int* qv=(int32 *)(adapter5.chars()+apstart);
1243 <   int qv=get3mer_value(adapter5.chars()+apstart);
1244 <   //while ((fi=fast4match(*qv, seq.chars(), rlen, fi))>=0 && fi<=imax) {
1245 <   while ((fi=w3_match(qv, seq.chars(), rlen, fi))>=0 && fi<=imax) {
1246 < #ifdef DEBUG
1247 <     if (debug) GMessage("    Wmatch found: %*s\n", fi+wordSize, (adapter5.substr(apstart,wordSize)).chars());
1248 < #endif
1249 <     if (extendMatch(seq.chars(), rlen, fi, adapter5.chars(),
1250 <                a5len, apstart, wordSize, l5,l3, a5segs, true)) return true;
1251 <     fi++;
1252 <     if (fi>imax) break;
1253 <     }
1254 <   } //for each wmer in the last hlen bases of the adaptor
1255 < /*
1256 <
1257 < //couldn't find a good trimming extension, hash 12 more bases of the adapter to collect more segment pairs there
1258 < //but only do this if we already have segment pairs collected in the last 12 bases of the adapter
1259 < if (a5segs.bend<(uint)(a5len-3) || a5segs.bstart>(uint)(a5len-hlen+4)) return false;
1260 < int hlen2=a5len-wordSize;
1261 < //if (hlen2>a5len-wordSize) hlen2=a5len-wordSize;
1262 < #ifdef DEBUG
1263 <      if (debug && a5segs.Count()>0) {
1264 <        GMessage("  >>>>>2nd. hash: %s\n",seq.chars());
1108 > bool trimmed=false;
1109 > GStr wseq(seq);
1110 > int wlen=rlen;
1111 > GXSeqData seqdata;
1112 > int numruns=revCompl ? 2 : 1;
1113 > GList<GXAlnInfo> bestalns(true, true, false);
1114 > for (int ai=0;ai<adaptors5.Count();ai++) {
1115 >   for (int r=0;r<numruns;r++) {
1116 >     if (r) {
1117 >          seqdata.update(adaptors5[ai].seqr.chars(), adaptors5[ai].seqr.length(),
1118 >                 adaptors5[ai].pzr, wseq.chars(), wlen, adaptors5[ai].amlen);
1119          }
1120 < #endif
1121 < if (hlen2>hlen) {
1122 <     for (int iw=hlen+1;iw<=hlen2;iw++) {
1123 <         int apstart=a5len-iw-wordSize;
1124 <         fi=0;
1125 <         //int* qv=(int32 *)(adapter5.chars()+apstart);
1126 <         int qv=get3mer_value(adapter5.chars()+apstart);
1127 <         //while ((fi=fast4match(*qv, seq.chars(), rlen, fi))>=0 && fi<=imax) {
1128 <         while ((fi=w3_match(qv, seq.chars(), rlen, fi))>=0 && fi<=imax) {
1129 <           extendMatch(seq.chars(), rlen, fi, adapter5.chars(),
1130 <                      a5len, apstart, wordSize, l5,l3, a5segs, true);
1131 <           fi++;
1132 <           if (fi>imax) break;
1133 <           }
1134 <         } //for each wmer between hlen2 and hlen bases of the adaptor
1120 >     else {
1121 >            seqdata.update(adaptors5[ai].seq.chars(), adaptors5[ai].seq.length(),
1122 >                 adaptors5[ai].pz, wseq.chars(), wlen, adaptors5[ai].amlen);
1123 >        }
1124 >         GXAlnInfo* aln=match_adaptor(seqdata, adaptors5[ai].trim_type,
1125 >                                                       minEndAdapter, gxmem_l, 90);
1126 >         if (aln) {
1127 >           if (aln->strong) {
1128 >                   trimmed=true;
1129 >                   bestalns.Add(aln);
1130 >                   break; //will check the rest next time
1131 >                   }
1132 >            else bestalns.Add(aln);
1133 >           }
1134 >         } //forward and reverse?
1135 >   if (trimmed) break; //will check the rest in the next cycle
1136 >  }//for each 5' adaptor
1137 >  if (bestalns.Count()>0) {
1138 >           GXAlnInfo* aln=bestalns[0];
1139 >           if (aln->sl-1 > wlen-aln->sr) {
1140 >                   //keep left side
1141 >                   l3-=(wlen-aln->sl+1);
1142 >                   if (l3<0) l3=0;
1143 >                   }
1144 >           else { //keep right side
1145 >                   l5+=aln->sr;
1146 >                   if (l5>=rlen) l5=rlen-1;
1147 >                   }
1148 >           //delete aln;
1149 >           //if (l3-l5+1<min_read_len) return true;
1150 >           wseq=seq.substr(l5,l3-l5+1);
1151 >           wlen=wseq.length();
1152 >           return true; //break the loops here to report a good find
1153       }
1154 < if (a5segs.bend<(uint)(a5len-3) || a5segs.bstart>(uint)(a5len-hlen+4)) return false;
1155 < // lastly, analyze collected a5segs for a possible gapped alignment:
1284 < GList<CSegChain> segchains(false,true,false);
1285 < #ifdef DEBUG
1286 < if (debug && a5segs.Count()>0) {
1287 <   GMessage(">>>>>>>>>   Read: %s\n",seq.chars());
1288 <   }
1289 < #endif
1290 < for (int i=0;i<a5segs.Count();i++) {
1291 <   if (a5segs[i]->chain==NULL) {
1292 <       if (a5segs[i]->b.end<(int)(a5len-4)) continue; //don't start a hopeless chain
1293 <       CSegChain* newchain=new CSegChain(true);
1294 <       newchain->setFreeItem(false);
1295 <       newchain->addSegPair(a5segs[i]);
1296 <       a5segs[i]->chain=newchain;
1297 <       segchains.Add(newchain); //just to free them when done
1298 <       }
1299 <   for (int j=i+1;j<a5segs.Count();j++) {
1300 <      CSegChain* chain=a5segs[i]->chain;
1301 <      if (chain->extendChain(a5segs[j])) {
1302 <         a5segs[j]->chain=chain;
1303 < #ifdef DEBUG
1304 <         if (debug) dbgPrintChain(*chain, adapter5.chars());
1305 < #endif      
1306 <      //save time by checking here if the extended chain is already acceptable for trimming
1307 <         if (chain->bend>(uint)(a5len-3) && chain->astart<4 && chain->score>a_min_chain_score) {
1308 <            l5=chain->aend;
1309 <            l3=rlen-1;
1310 <            return true;
1311 <            }
1312 <         } //chain can be extended
1313 <      }
1314 <   } //collect segment alignments into chains
1315 < */
1316 < return false; //no adapter parts found
1317 < }
1154 >  return false;
1155 > }
1156  
1157 < //convert qvs to/from phred64 from/to phread33
1157 > //convert qvs to/from phred64 from/to phread33
1158   void convertPhred(GStr& q) {
1159   for (int i=0;i<q.length();i++) q[i]+=qv_cvtadd;
1160   }
# Line 1325 | Line 1163
1163   for (int i=0;i<len;i++) q[i]+=qv_cvtadd;
1164   }
1165  
1166 < bool getFastxRec(GLineReader& fq, GStr& rseq, GStr& rqv,
1166 > bool getFastxRec(GLineReader& fq, GStr& rseq, GStr& rqv,
1167            GStr& rname, GStr& rinfo, GStr& infname) {
1168   rseq="";
1169   rqv="";
# Line 1341 | Line 1179
1179        } //raw qseq format
1180   else { // FASTQ or FASTA */
1181   isfasta=(l[0]=='>');
1182 < if (!isfasta && l[0]!='@') GError("Error: fasta/fastq record marker not found(%s)\n%s\n",
1182 > if (!isfasta && l[0]!='@') GError("Error: fasta/fastq record marker not found(%s)\n%s\n",
1183        infname.chars(), l);
1184   GStr s(l);
1185   rname=&(l[1]);
1186   for (int i=0;i<rname.length();i++)
1187 <    if (rname[i]<=' ') {
1188 <       if (i<rname.length()-2) rinfo=rname.substr(i+1);
1189 <       rname.cut(i);
1190 <       break;
1187 >    if (rname[i]<=' ') {
1188 >       if (i<rname.length()-2) rinfo=rname.substr(i+1);
1189 >       rname.cut(i);
1190 >       break;
1191         }
1192    //now get the sequence
1193 < if ((l=fq.getLine())==NULL)
1193 > if ((l=fq.getLine())==NULL)
1194        GError("Error: unexpected EOF after header for read %s (%s)\n",
1195                     rname.chars(), infname.chars());
1196   rseq=l; //this must be the DNA line
1197   while ((l=fq.getLine())!=NULL) {
1198        //seq can span multiple lines
1199        if (l[0]=='>' || l[0]=='+') {
1200 <           fq.pushBack();
1200 >           fq.pushBack();
1201             break; //
1202             }
1203        rseq+=l;
1204 <      } //check for multi-line seq
1204 >      } //check for multi-line seq
1205   if (!isfasta) { //reading fastq quality values, which can also be multi-line
1206      if ((l=fq.getLine())==NULL)
1207          GError("Error: unexpected EOF after sequence for %s\n", rname.chars());
1208      if (l[0]!='+') GError("Error: fastq qv header marker not detected!\n");
1209 <    if ((l=fq.getLine())==NULL)
1209 >    if ((l=fq.getLine())==NULL)
1210          GError("Error: unexpected EOF after qv header for %s\n", rname.chars());
1211      rqv=l;
1212 <    //if (rqv.length()!=rseq.length())
1212 >    //if (rqv.length()!=rseq.length())
1213      //  GError("Error: qv len != seq len for %s\n", rname.chars());
1214      while (rqv.length()<rseq.length() && ((l=fq.getLine())!=NULL)) {
1215        rqv+=l; //append to qv string
1216        }
1217      }// fastq
1218   // } //<-- FASTA or FASTQ
1219 < rseq.upper(); //TODO: what if we care about masking?
1219 > rseq.upper();
1220   return true;
1221   }
1222  
1223 + #ifdef GDEBUG
1224 + void showTrim(GStr& s, int l5, int l3) {
1225 +  if (l5>0 || l3==0) {
1226 +    color_bg(c_red);
1227 +    }
1228 +  for (int i=0;i<s.length()-1;i++) {
1229 +    if (i && i==l5) color_resetbg();
1230 +    fprintf(stderr, "%c", s[i]);
1231 +    if (i && i==l3) color_bg(c_red);
1232 +   }
1233 +  fprintf(stderr, "%c", s[s.length()-1]);
1234 +  color_reset();
1235 +  fprintf(stderr, "\n");
1236 + }
1237 + #endif
1238 +
1239   char process_read(GStr& rname, GStr& rseq, GStr& rqv, int &l5, int &l3) {
1240 < //returns 0 if the read was untouched, 1 if it was just trimmed
1240 > //returns 0 if the read was untouched, 1 if it was just trimmed
1241   // and a trash code if it was trashed
1242   l5=0;
1243   l3=rseq.length()-1;
1244 + #ifdef GDEBUG
1245 +   //rseq.reverse();
1246 +   GMessage(">%s\n", rname.chars());
1247 +   GMessage("%s\n",rseq.chars());
1248 + #endif
1249   if (l3-l5+1<min_read_len) {
1250     return 's';
1251     }
# Line 1394 | Line 1253
1253   GStr wqv(rqv.chars());
1254   int w5=l5;
1255   int w3=l3;
1256 + //count Ns
1257 + b_totalIn+=rseq.length();
1258 + for (int i=0;i<rseq.length();i++) {
1259 + if (isACGT[(int)rseq[i]]==0) b_totalN++;
1260 + }
1261 +
1262   //first do the q-based trimming
1263   if (qvtrim_qmin!=0 && !wqv.is_empty() && qtrim(wqv, w5, w3)) { // qv-threshold trimming
1264 +   b_trimQ+=(w5-l5)+(l3-w3);
1265 +   num_trimQ++;
1266     if (w3-w5+1<min_read_len) {
1267       return 'Q'; //invalid read
1268       }
# Line 1408 | Line 1275
1275     } //qv trimming
1276   // N-trimming on the remaining read seq
1277   if (ntrim(wseq, w5, w3)) {
1278 +   //Note: ntrim resets w5 and w3
1279     //GMessage("before: %s\n",wseq.chars());
1280     //GMessage("after : %s (%d)\n",wseq.substr(w5,w3-w5+1).chars(),w3-w5+1);
1281 +   int trim3=(wseq.length()-1-w3);
1282 +   b_trimN+=w5+trim3;
1283 +   num_trimN++;
1284     l5+=w5;
1285 <   l3-=(wseq.length()-1-w3);
1285 >   l3-=trim3;
1286     if (w3-w5+1<min_read_len) {
1287       return 'N'; //to be trashed
1288       }
# Line 1422 | Line 1293
1293     w5=0;
1294     w3=wseq.length()-1;
1295     }
1296 < if (a3len>0) {
1297 <  if (trim_adapter3(wseq, w5, w3)) {
1298 <     int trimlen=wseq.length()-(w3-w5+1);
1299 <     num_trimmed3++;
1300 <     if (trimlen<min_trimmed3)
1301 <         min_trimmed3=trimlen;
1302 <     l5+=w5;
1303 <     l3-=(wseq.length()-1-w3);
1304 <     if (w3-w5+1<min_read_len) {
1305 <         return '3';
1306 <         }
1307 <      //-- keep only the w5..w3 range
1308 <      wseq=wseq.substr(w5, w3-w5+1);
1309 <      if (!wqv.is_empty())
1310 <         wqv=wqv.substr(w5, w3-w5+1);
1311 <      }//some adapter was trimmed
1312 <   } //adapter trimming
1313 < if (a5len>0) {
1314 <  if (trim_adapter5(wseq, w5, w3)) {
1315 <     int trimlen=wseq.length()-(w3-w5+1);
1316 <     num_trimmed5++;
1317 <     if (trimlen<min_trimmed5)
1318 <         min_trimmed5=trimlen;
1296 > char trim_code;
1297 > //clean the more dirty end first - 3'
1298 > bool trimmedA=false;
1299 > bool trimmedT=false;
1300 > bool trimmedV=false;
1301 >
1302 > int prev_w5=0;
1303 > int prev_w3=0;
1304 > bool w3upd=true;
1305 > bool w5upd=true;
1306 > do {
1307 >  trim_code=0;
1308 >  if (w3upd && trim_poly3(wseq, w5, w3, polyA_seed)) {
1309 >      trim_code='A';
1310 >      b_trimA+=(w5+(wseq.length()-1-w3));
1311 >      if (!trimmedA) { num_trimA++; trimmedA=true; }
1312 >      }
1313 >  else if (w3upd && trim_poly3(wseq, w5, w3, polyT_seed)) {
1314 >      trim_code='T';
1315 >      b_trimT+=(w5+(wseq.length()-1-w3));
1316 >      if (!trimmedT) { num_trimT++; trimmedT=true; }
1317 >      }
1318 >  else if (w5upd && trim_poly5(wseq, w5, w3, polyA_seed)) {
1319 >      trim_code='A';
1320 >      b_trimA+=(w5+(wseq.length()-1-w3));
1321 >      if (!trimmedA) { num_trimA++; trimmedA=true; }
1322 >      }
1323 >  else if (w5upd && trim_poly5(wseq, w5, w3, polyT_seed)) {
1324 >      trim_code='T';
1325 >      b_trimT+=(w5+(wseq.length()-1-w3));
1326 >      if (!trimmedT) { num_trimT++; trimmedT=true; }
1327 >      }
1328 >  else if (trim_adaptor5(wseq, w5, w3)) {
1329 >      trim_code='V';
1330 >      b_trimV+=(w5+(wseq.length()-1-w3));
1331 >      if (!trimmedV) { num_trimV++; trimmedV=true; }
1332 >      }
1333 >  else if (trim_adaptor3(wseq, w5, w3)) {
1334 >      trim_code='V';
1335 >      b_trimV+=(w5+(wseq.length()-1-w3));
1336 >      if (!trimmedV) { num_trimV++; trimmedV=true; }
1337 >      }
1338 >  if (trim_code) {
1339 >     w3upd=(w3!=prev_w3);
1340 >         w5upd=(w5!=prev_w5);
1341 >         if (w3upd) prev_w3=w3;
1342 >         if (w5upd) prev_w5=w5;
1343 >   #ifdef GDEBUG
1344 >     GMessage("#### TRIM by '%c' code ( w5-w3 = %d-%d ):\n",trim_code, w5,w3);
1345 >     showTrim(wseq, w5, w3);
1346 >   #endif
1347 >     //int trimlen=wseq.length()-(w3-w5+1);
1348       l5+=w5;
1349       l3-=(wseq.length()-1-w3);
1350       if (w3-w5+1<min_read_len) {
1351 <         return '5';
1351 >         return trim_code;
1352           }
1353        //-- keep only the w5..w3 range
1354        wseq=wseq.substr(w5, w3-w5+1);
1355        if (!wqv.is_empty())
1356           wqv=wqv.substr(w5, w3-w5+1);
1357 <      }//some adapter was trimmed
1358 <   } //adapter trimming
1357 >      }//trimming at 3' end
1358 > } while (trim_code);
1359   if (doCollapse) {
1360     //keep read for later
1361     FqDupRec* dr=dhash.Find(wseq.chars());
1362     if (dr==NULL) { //new entry
1363 <          //if (prefix.is_empty())
1364 <             dhash.Add(wseq.chars(),
1363 >          //if (prefix.is_empty())
1364 >             dhash.Add(wseq.chars(),
1365                    new FqDupRec(&wqv, rname.chars()));
1366            //else dhash.Add(wseq.chars(), new FqDupRec(wqv.chars(),wqv.length()));
1367           }
# Line 1495 | Line 1395
1395         fprintf(f_out, "%s\n", rseq.chars()); //plain one-line fasta for now
1396         }
1397        else {
1398 <       fprintf(f_out, ">%s%08d\n%s\n", prefix.chars(), outcounter,
1398 >       fprintf(f_out, ">%s%08d\n%s\n", prefix.chars(), outcounter,
1399                            rseq.chars());
1400         }
1401       }
# Line 1506 | Line 1406
1406         fprintf(f_out, "%s\n+\n%s\n", rseq.chars(), rqv.chars());
1407         }
1408        else
1409 <       fprintf(f_out, "@%s_%08d\n%s\n+\n%s\n", prefix.chars(), outcounter,
1409 >       fprintf(f_out, "@%s_%08d\n%s\n+\n%s\n", prefix.chars(), outcounter,
1410                            rseq.chars(),rqv.chars() );
1411       }
1412   }
# Line 1514 | Line 1414
1414   void trash_report(char trashcode, GStr& rname, FILE* freport) {
1415   if (freport==NULL || trashcode<=' ') return;
1416   if (trashcode=='3' || trashcode=='5') {
1417 <   fprintf(freport, "%s\tA%c\n",rname.chars(),trashcode);
1417 >   fprintf(freport, "%s\ta%c\n",rname.chars(),trashcode);
1418     }
1419   else {
1420     fprintf(freport, "%s\t%c\n",rname.chars(),trashcode);
# Line 1606 | Line 1506
1506      }
1507   }
1508  
1509 < void setupFiles(FILE*& f_in, FILE*& f_in2, FILE*& f_out, FILE*& f_out2,
1510 <                       GStr& s, GStr& infname, GStr& infname2) {
1509 > void addAdaptor(GVec<CASeqData>& adaptors, GStr& seq, GAlnTrimType trim_type) {
1510 > //TODO: prepare CASeqData here, and collect hexamers as well
1511 >  if (seq.is_empty() || seq=="-" ||
1512 >          seq=="N/A" || seq==".") return;
1513 >
1514 > CASeqData adata(revCompl);
1515 > int idx=adaptors.Add(adata);
1516 > if (idx<0) GError("Error: failed to add adaptor!\n");
1517 > adaptors[idx].trim_type=trim_type;
1518 > adaptors[idx].update(seq.chars());
1519 > }
1520 >
1521 >
1522 > int loadAdaptors(const char* fname) {
1523 >  GLineReader lr(fname);
1524 >  char* l;
1525 >  while ((l=lr.nextLine())!=NULL) {
1526 >   if (lr.length()<=3 || l[0]=='#') continue;
1527 >   if ( l[0]==' ' || l[0]=='\t' || l[0]==',' ||
1528 >        l[0]==';'|| l[0]==':' ) {
1529 >      int i=1;
1530 >      while (l[i]!=0 && isspace(l[i])) {
1531 >        i++;
1532 >        }
1533 >      if (l[i]!=0) {
1534 >        GStr s(&(l[i]));
1535 >      #ifdef GDEBUG
1536 >          //s.reverse();
1537 >      #endif
1538 >        addAdaptor(adaptors3, s, galn_TrimRight);
1539 >        continue;
1540 >        }
1541 >      }
1542 >    else {
1543 >      GStr s(l);
1544 >      s.startTokenize("\t ;,:");
1545 >      GStr a5,a3;
1546 >      if (s.nextToken(a5))
1547 >            s.nextToken(a3);
1548 >        else continue; //no tokens on this line
1549 >      GAlnTrimType ttype5=galn_TrimLeft;
1550 >      a5.upper();
1551 >      a3.upper();
1552 >      if (a3.is_empty() || a3==a5 || a3=="=") {
1553 >         a3.clear();
1554 >         ttype5=galn_TrimEither;
1555 >         }
1556 >     #ifdef GDEBUG
1557 >     //   a5.reverse();
1558 >     //   a3.reverse();
1559 >     #endif
1560 >      addAdaptor(adaptors5, a5, ttype5);
1561 >      addAdaptor(adaptors3, a3, galn_TrimRight);
1562 >      }
1563 >   }
1564 >   return adaptors5.Count()+adaptors3.Count();
1565 > }
1566 >
1567 > void setupFiles(FILE*& f_in, FILE*& f_in2, FILE*& f_out, FILE*& f_out2,
1568 >                       GStr& s, GStr& infname, GStr& infname2) {
1569   // uses outsuffix to generate output file names and open file handles as needed
1570   infname="";
1571   infname2="";
# Line 1635 | Line 1593
1593   s.startTokenize(",:");
1594   s.nextToken(infname);
1595   bool paired=s.nextToken(infname2);
1596 < if (fileExists(infname.chars())==0)
1596 > if (fileExists(infname.chars())==0)
1597      GError("Error: cannot find file %s!\n",infname.chars());
1598   GStr fname(getFileName(infname.chars()));
1599   GStr picmd;
# Line 1657 | Line 1615
1615   if (!paired) return;
1616   if (doCollapse) GError("Error: sorry, -C option cannot be used with paired reads!\n");
1617   // ---- paired reads:-------------
1618 < if (fileExists(infname2.chars())==0)
1618 > if (fileExists(infname2.chars())==0)
1619       GError("Error: cannot find file %s!\n",infname2.chars());
1620   picmd="";
1621   GStr fname2(getFileName(infname2.chars()));

Diff Legend

Removed lines
+ Added lines
< Changed lines
> Changed lines