ViewVC Help
View File | Revision Log | Show Annotations | View Changeset | Root Listing
root/gclib/fqtrim/fqtrim.cpp
(Generate patch)
# Line 3 | Line 3
3   #include "GHash.hh"
4   #include "GList.hh"
5   #include <ctype.h>
6 + #include "GAlnExtend.h"
7 + #include <inttypes.h>
8  
9   #define USAGE "Usage:\n\
10 < fqtrim [-5 <5adapter>] [-3 <3adapter>] [-a <min_matchlen>] [-p {64|33}] [-q <minq> [-t <trim_max>]]\\\n\
11 <   [-n <rename_prefix>] [-o <outsuffix>] [-z <zcmd>] [-r <discarded.lst>]\\\n\
12 <   [-l <minlen>] [-C] [-D] [-Q] <input.fq>[,<input_mates.fq>\n\
10 > fqtrim [{-5 <5adaptor> -3 <3adaptor>|-f <adaptors_file>}] [-a <min_matchlen>]\\\n\
11 >   [-R] [-q <minq> [-t <trim_max_len>]] [-p {64|33}] [-o <outsuffix>]\\\n\
12 >   [-l <minlen>] [-C] [-D] [-Q] [-n <rename_prefix>] [-r <discarded.lst>]\\\n\
13 >    <input.fq>[,<input_mates.fq>\n\
14   \n\
15 < Trim low quality bases at the 3' end, optionally trim adapter sequence, filter\n\
16 < for low complexity and collapse duplicate reads\n\
15 > Trim low quality bases at the 3' end and can trim adaptor sequence(s), filter\n\
16 > for low complexity and collapse duplicate reads.\n\
17   If read pairs should be trimmed and kept together (i.e. without discarding\n\
18   one read in a pair), the two file names should be given delimited by a comma\n\
19   or a colon character\n\
# Line 19 | Line 22
22   -n  rename all the reads using the <prefix> followed by a read counter;\n\
23      if -C option was given, the suffix \"_x<N>\" is appended, with <N> being\n\
24      the read duplication count\n\
25 < -o  write the trimmed/filtered reads to file(s) named <input>.<outsuffix>\n\
26 <    which will be created in the current (working) directory\n\
27 < -5  trim the given adapter or primer sequence at the 5' end of each read\n\
25 > -o  unless this parameter is '-', write the trimmed/filtered reads to \n\
26 >    file(s) named <input>.<outsuffix> which will be created in the \n\
27 >    current (working) directory; (writes to stdout if -o- is given);\n\
28 >    a suffix ending with .gz, .gzip or .bz2 will enforce compression\n\
29 > -f  file with adaptor sequences to trim, each line having this format:\n\
30 >    <5'-adaptor-sequence> <3'-adaptor-sequence>\n\
31 > -5  trim the given adaptor or primer sequence at the 5' end of each read\n\
32      (e.g. -5 CGACAGGTTCAGAGTTCTACAGTCCGACGATC)\n\
33 < -3  trim the given adapter sequence at the 3' end of each read\n\
33 > -3  trim the given adaptor sequence at the 3' end of each read\n\
34      (e.g. -3 TCGTATGCCGTCTTCTGCTTG)\n\
35 < -a  minimum bases to match to adaptor sequence (default 5)\n\
35 > -A  disable polyA/T trimming (enabled by default)\n\
36 > -y  minimum length of exact match to adaptor sequence at the proper end (6)\n\
37   -q  trim bases with quality value lower than <minq> (starting at the 3' end)\n\
38 < -t  for -q option, maximum trimming at the 3' end is limited to <trim_max>\n\
38 > -t  for -q option, maximum trimming at the 3' end is limited to <trim_max_len>\n\
39   -m  maximum percentage of Ns allowed in a read after trimming (default 7)\n\
40   -l  minimum \"clean\" length after trimming that a read must have\n\
41      in order to pass the filter (default: 16)\n\
42   -r  write a simple \"trash report\" file listing the discarded reads with a\n\
43      one letter code for the reason why they were trashed\n\
44   -D  apply a low-complexity (dust) filter and discard any read that has over\n\
45 <    50% of its length detected as low complexity\n\
45 >    50%% of its length detected as low complexity\n\
46   -C  collapse duplicate reads and add a prefix with their count to the read\n\
47      name (e.g. for microRNAs)\n\
48   -p  input is phred64/phred33 (use -p64 or -p33)\n\
49   -Q  convert quality values to the other Phred qv type\n\
50 < -V  verbose processing\n\
50 > -V  be verbose when done (print summary counts)\n\
51   "
52 <
52 >
53   //-z  for -o option, the output stream(s) will be first piped into the given\n
54   //   <zcmd> command, which must output to stdout (e.g. -z 'bzip2 -9 -c')\n
55  
48
49 // example 3' adapter for miRNAs: TCGTATGCCGTCTTCTGCTTG
56  
57 < //For pair ends sequencing:
57 > // example 3' adaptor for miRNAs: TCGTATGCCGTCTTCTGCTTG
58 >
59 > //For paired reads sequencing:
60   //3' : ACACTCTTTCCCTACACGACGCTCTTCCGATCT
61   //5' : GATCGGAAGAGCGGTTCAGCAGGAATGCCGAG
62   //FILE* f_out=NULL; //stdout if not provided
63   //FILE* f_out2=NULL; //for paired reads
64   //FILE* f_in=NULL; //input fastq (stdin if not provided)
65   //FILE* f_in2=NULL; //for paired reads
66 +
67   FILE* freport=NULL;
68  
69   bool debug=false;
70   bool verbose=false;
71   bool doCollapse=false;
72   bool doDust=false;
73 + bool doPolyTrim=true;
74   bool fastaOutput=false;
75   bool trashReport=false;
76 < //bool rawFormat=false;
76 > bool revCompl=false; //also reverse complement adaptor sequences
77   int min_read_len=16;
78   double max_perc_N=7.0;
79   int dust_cutoff=16;
80   bool isfasta=false;
81   bool convert_phred=false;
82 < GStr outsuffix; // -o
73 < GStr adapter3;
74 < GStr adapter5;
82 > GStr outsuffix; // -o
83   GStr prefix;
84   GStr zcmd;
85 < int num_trimmed5=0;
86 < int num_trimmed3=0;
87 < int num_trimmedN=0;
88 < int num_trimmedQ=0;
89 < int min_trimmed5=INT_MAX;
90 < int min_trimmed3=INT_MAX;
85 > char isACGT[256];
86 >
87 > uint num_trimN=0; //reads trimmed by N%
88 > uint num_trimQ=0; //reads trimmed by qv threshold
89 > uint num_trimV=0; //reads trimmed by adapter match
90 > uint num_trimA=0; //reads trimmed by polyA
91 > uint num_trimT=0; //reads trimmed by polyT
92 > uint num_trim5=0; //number of reads trimmed at 5' end
93 > uint num_trim3=0; //number of reads trimmed at 3' end
94 >
95 > uint64 b_totalIn=0; //total number of input bases
96 > uint64 b_totalN=0;  //total number of undetermined bases found in input bases
97 >
98 > uint64 b_trimN=0; //total number of bases trimmed due to N-trim
99 > uint64 b_trimQ=0; //number of bases trimmed due to qv threshold
100 > uint64 b_trimV=0; //total number of bases trimmed due to adaptor matches
101 > uint64 b_trimA=0; //number of bases trimmed due to poly-A tails
102 > uint64 b_trimT=0; //number of bases trimmed due to poly-T tails
103 > uint64 b_trim5=0; //total bases trimmed on the 5' side
104 > uint64 b_trim3=0; //total bases trimmed on the 3' side
105 > //int min_trimmed5=INT_MAX;
106 > //int min_trimmed3=INT_MAX;
107  
108   int qvtrim_qmin=0;
109   int qvtrim_max=0;  //for -q, do not trim at 3'-end more than this number of bases
110   int qv_phredtype=0; // could be 64 or 33 (0 means undetermined yet)
111   int qv_cvtadd=0; //could be -31 or +31
112  
113 < int a3len=0;
114 < int a5len=0;
115 < // adaptor matching metrics -- for extendMatch() function
116 < const int a_m_score=2; //match score
117 < const int a_mis_score=-3; //mismatch
118 < const int a_dropoff_score=7;
119 < int a_min_score=10; //an exact match of 5 bases at the proper ends WILL be trimmed
120 < const int a_min_chain_score=15; //for gapped alignments
121 <
122 < class CSegChain;
123 <
124 < class CSegPair {
125 <  public:
126 <   GSeg a;
127 <   GSeg b; //the adapter segment
128 <   int score;
129 <   int flags;
130 <   CSegChain* chain;
131 <   CSegPair(int astart=0, int aend=0, int bstart=0, int bend=0, int mscore=0):a(astart,aend),b(bstart, bend) {
132 <      score=mscore;
133 <      if (score==0) score=a.len()*a_m_score;
134 <      flags=0;
135 <      chain=NULL;
136 <      }
137 <   int len() { return  a.len(); }
138 <   bool operator==(CSegPair& d){
139 <      //return (a.start==d.a.start && a.end==d.a.end && b.start==d.b.start && b.end==d.b.end);
140 <      //make equal even segments that are included into one another:
141 <      return (d.a.start>=a.start && d.a.end<=a.end && d.b.start>=b.start && d.b.end<=b.end);
142 <      }
143 <   bool operator>(CSegPair& d){ //ordering based on b (adaptor) start coord and score
120 <     if (b.start==d.b.start) {
121 <        if (score==d.score) {
122 <           //just try to be consistent:
123 <           if (b.end==d.b.end) {
124 <             return (a.start==d.a.start)?(a.end<d.a.end):(a.start<d.a.start);
125 <             }
126 <           return (b.end>d.b.end);
127 <           }
128 <         else return (score<d.score);
129 <        }
130 <     return (b.start>d.b.start);
131 <     }
132 <   bool operator<(CSegPair& d){ //ordering based on b (adaptor) coord
133 <     /*if (b.start==d.b.start && b.end==d.b.end) {
134 <          return (a.start==d.a.start)?(a.end<d.a.end):(a.start<d.a.start);
135 <          }
136 <     return (b.start==d.b.start)?(b.end<d.b.end):(b.start<d.b.start);*/
137 <     if (b.start==d.b.start) {
138 <        if (score==d.score) {
139 <           //just try to be consistent:
140 <           if (b.end==d.b.end) {
141 <             return (a.start==d.a.start)?(a.end>d.a.end):(a.start>d.a.start);
142 <             }
143 <           return (b.end<d.b.end);
144 <           }
145 <         else return (score>d.score);
113 > // adaptor matching metrics -- for X-drop ungapped extension
114 > //const int match_reward=2;
115 > //const int mismatch_penalty=3;
116 > const int match_reward=2;
117 > const int mismatch_penalty=4;
118 > const int Xdrop=10;
119 >
120 > const int poly_m_score=2; //match score
121 > const int poly_mis_score=-3; //mismatch
122 > const int poly_dropoff_score=7;
123 > int poly_minScore=12; //i.e. an exact match of 6 bases at the proper ends WILL be trimmed
124 >
125 > const char *polyA_seed="AAAA";
126 > const char *polyT_seed="TTTT";
127 >
128 > struct CASeqData {
129 >   //positional data for every possible hexamer in an adaptor
130 >   GVec<uint16>* pz[4096]; //0-based coordinates of all possible hexamers in the adaptor sequence
131 >   GVec<uint16>* pzr[4096]; //0-based coordinates of all possible hexamers for the reverse complement of the adaptor sequence
132 >   GStr seq; //actual adaptor sequence data
133 >   GStr seqr; //reverse complement sequence
134 >   int amlen; //fraction of adaptor length matching that's
135 >              //enough to consider the alignment
136 >   GAlnTrimType trim_type;
137 >   bool use_reverse;
138 >   CASeqData(bool rev=false):seq(),seqr(),
139 >             amlen(0), use_reverse(rev) {
140 >     trim_type=galn_None; //should be updated later!
141 >     for (int i=0;i<4096;i++) {
142 >        pz[i]=NULL;
143 >        pzr[i]=NULL;
144          }
147     return (b.start<d.b.start);
145       }
149 };
146  
147 < int cmpSegEnds(pointer sa, pointer sb) { //sort by adaptor seg ends AND score
148 < CSegPair& x = *(CSegPair *)sa;
149 < CSegPair& y = *(CSegPair *)sb;
150 < /*
151 < if (x.b.end==y.b.end) {
152 <     if (x.b.start==y.b.start) {
153 <         if (x.a.end==y.a.end) {
154 <            if (x.a.start==y.a.start) return 0;
155 <            return ((x.a.start>y.a.start) ? -1 : 1);
156 <            }
157 <          else {
158 <            return ((x.a.end>y.a.end) ? -1 : 1);
159 <            }
160 <          }
161 <      else {
162 <       return ((x.b.start>y.b.start) ? -1 : 1);
163 <       }
164 <     }
165 <    else {
170 <     return ((x.b.end>y.b.end) ? -1 : 1);
171 <     }
172 < */
173 <  if (x.b.end==y.b.end) {
174 <     if (x.score==y.score) {
175 <     if (x.b.start==y.b.start) {
176 <         if (x.a.end==y.a.end) {
177 <            if (x.a.start==y.a.start) return 0;
178 <            return ((x.a.start<y.a.start) ? -1 : 1);
179 <            }
180 <          else {
181 <            return ((x.a.end<y.a.end) ? -1 : 1);
182 <            }
183 <          }
184 <      else {
185 <       return ((x.b.start<y.b.start) ? -1 : 1);
186 <       }
187 <      } else return ((x.score>y.score) ? -1 : 1);
188 <     }
189 <    else {
190 <     return ((x.b.end>y.b.end) ? -1 : 1);
147 >   void update(const char* s) {
148 >         seq=s;
149 >         table6mers(seq.chars(), seq.length(), pz);
150 >         amlen=iround(double(seq.length())*0.8);
151 >         if (amlen<12)
152 >                amlen=12;
153 >         if (!use_reverse) return;
154 >         //reverse complement
155 >         seqr=s;
156 >         int slen=seq.length();
157 >         for (int i=0;i<slen;i++)
158 >           seqr[i]=ntComplement(seq[slen-i-1]);
159 >         table6mers(seqr.chars(), seqr.length(), pzr);
160 >   }
161 >
162 >   ~CASeqData() {
163 >     for (int i=0;i<4096;i++) {
164 >       delete pz[i];
165 >       delete pzr[i];
166       }
167 +   }
168 + };
169  
170 < }
170 > GVec<CASeqData> adaptors5;
171 > GVec<CASeqData> adaptors3;
172  
173 < class CSegChain:public GList<CSegPair> {
174 < public:
197 <   uint astart;
198 <   uint aend;
199 <   uint bstart;
200 <   uint bend;
201 <   int score;
202 <   bool endSort;
203 <  CSegChain(bool aln5=false):GList<CSegPair>(true,true,true) {//sorted, free elements, unique
204 <   //as SegPairs are inserted, they will be sorted by a.start coordinate
205 <   score=0;
206 <   astart=MAX_UINT;
207 <   aend=0;
208 <   bstart=MAX_UINT;
209 <   bend=0;
210 <   endSort=aln5;
211 <   if (aln5) { setSorted(cmpSegEnds); }
212 <   }
213 < bool operator==(CSegChain& d) {
214 <   //return (score==d.score);
215 <    return (astart==d.astart && aend==d.aend && bstart==d.bstart && bend==d.bend);
216 <   }
217 < bool operator>(CSegChain& d) { // order based on b (adaptor) coordinate
218 <   //return (score<d.score);
219 <   if (bstart==d.bstart && bend==d.bend) {
220 <          return (astart==d.astart)?(aend>d.aend):(astart>d.astart);
221 <          }
222 <     return (bstart==d.bstart)?(bend>d.bend):(bstart>d.bstart);
223 <   }
224 < bool operator<(CSegChain& d) {
225 <   //return (score>d.score);
226 <   if (bstart==d.bstart && bend==d.bend) {
227 <          return (astart==d.astart)?(aend<d.aend):(astart<d.astart);
228 <          }
229 <     return (bstart==d.bstart)?(bend<d.bend):(bstart<d.bstart);
230 <   }
231 < void addSegPair(CSegPair* segp) {
232 <   if (AddIfNew(segp)!=segp) return;
233 <   score+=segp->score;
234 <   if (astart>segp->a.start) astart=segp->a.start;
235 <   if (aend<segp->a.end) aend=segp->a.end;
236 <   if (bstart>segp->b.start) bstart=segp->b.start;
237 <   if (bend<segp->b.end) bend=segp->b.end;
238 <   }
239 < //for building actual chains:
240 < bool extendChain(CSegPair* segp) { //segp expected to be "Greater Than" current chain
241 <   int bgap=0;
242 <   int agap=0;
243 <   //if (endSort) {
244 <   if (bstart>segp->b.start) {
245 <      bgap = (int)(bstart-segp->b.end);
246 <      if (abs(bgap)>2) return false;
247 <      agap = (int)(astart-segp->a.end);
248 <      if (abs(agap)>2) return false;
249 <      }
250 <     else {
251 <      bgap = (int) (segp->b.start-bend);
252 <      if (abs(bgap)>2) return false;
253 <      agap = (int)(segp->a.start-aend);
254 <      if (abs(agap)>2) return false;
255 <      }
256 <   if (agap*bgap<0) return false;
257 <   addSegPair(segp);
258 <   score-=abs(agap)+abs(bgap);
259 <   return true;
260 <   }
261 < };
173 > CGreedyAlignData* gxmem_l=NULL;
174 > CGreedyAlignData* gxmem_r=NULL;
175  
176   // element in dhash:
177   class FqDupRec {
# Line 308 | Line 221
221  
222   GHash<FqDupRec> dhash; //hash to keep track of duplicates
223  
224 < void setupFiles(FILE*& f_in, FILE*& f_in2, FILE*& f_out, FILE*& f_out2,
225 <                       GStr& s, GStr& infname, GStr& infname2);
224 > void addAdaptor(GVec<CASeqData>& adaptors, GStr& seq, GAlnTrimType trim_type);
225 > int loadAdaptors(const char* fname);
226 >
227 > void setupFiles(FILE*& f_in, FILE*& f_in2, FILE*& f_out, FILE*& f_out2,
228 >                       GStr& s, GStr& infname, GStr& infname2);
229   // uses outsuffix to generate output file names and open file handles as needed
230 <
230 >
231   void writeRead(FILE* f_out, GStr& rname, GStr& rinfo, GStr& rseq, GStr& rqv, int& outcounter);
232   void trash_report(char trashcode, GStr& rname, FILE* freport);
233  
234 < bool getFastxRec(GLineReader& fq, GStr& rseq, GStr& rqv,
234 > bool getFastxRec(GLineReader& fq, GStr& rseq, GStr& rqv,
235            GStr& rname, GStr& rinfo, GStr& infname);
236  
237 < char process_read(GStr& rname, GStr& rseq, GStr& rqv, int &l5, int &l3);
237 > char process_read(GStr& rname, GStr& rseq, GStr& rqv, int &l5, int &l3);
238   //returns 0 if the read was untouched, 1 if it was trimmed and a trash code if it was trashed
239  
240   bool ntrim(GStr& rseq, int &l5, int &l3); //returns true if any trimming occured
241   bool qtrim(GStr& qvs, int &l5, int &l3); //return true if any trimming occured
242   int dust(GStr& seq);
243 < bool trim_adapter3(GStr& seq, int &l5, int &l3); //returns true if any trimming occured
244 < bool trim_adapter5(GStr& seq, int &l5, int &l3); //returns true if any trimming occured
243 > bool trim_poly5(GStr &seq, int &l5, int &l3, const char* poly_seed); //returns true if any trimming occured
244 > bool trim_poly3(GStr &seq, int &l5, int &l3, const char* poly_seed);
245 > bool trim_adaptor5(GStr& seq, int &l5, int &l3); //returns true if any trimming occured
246 > bool trim_adaptor3(GStr& seq, int &l5, int &l3);
247  
248   void convertPhred(char* q, int len);
249   void convertPhred(GStr& q);
250  
251   int main(int argc, char * const argv[]) {
252 <  GArgs args(argc, argv, "YQDCVl:d:3:5:m:n:r:p:q:t:o:z:a:");
252 >  GArgs args(argc, argv, "YQDCRVAl:d:3:5:m:n:r:p:q:f:t:o:z:a:");
253    int e;
254    if ((e=args.isError())>0) {
255        GMessage("%s\nInvalid argument: %s\n", USAGE, argv[e]);
# Line 342 | Line 260
260    convert_phred=(args.getOpt('Q')!=NULL);
261    doCollapse=(args.getOpt('C')!=NULL);
262    doDust=(args.getOpt('D')!=NULL);
263 +  revCompl=(args.getOpt('R')!=NULL);
264 +  if (args.getOpt('A')) doPolyTrim=false;
265    /*
266    rawFormat=(args.getOpt('R')!=NULL);
267    if (rawFormat) {
# Line 350 | Line 270
270    */
271    prefix=args.getOpt('n');
272    GStr s=args.getOpt('l');
273 <  if (!s.is_empty())
273 >  if (!s.is_empty())
274       min_read_len=s.asInt();
275    s=args.getOpt('m');
276 <  if (!s.is_empty())
276 >  if (!s.is_empty())
277       max_perc_N=s.asDouble();
278    s=args.getOpt('d');
279    if (!s.is_empty()) {
# Line 379 | Line 299
299          qv_phredtype=64;
300          qv_cvtadd=-31;
301          }
302 <       else
302 >       else
303           GMessage("%s\nInvalid value for -p option (can only be 64 or 33)!\n",USAGE);
304       }
305 <  if (args.getOpt('3')!=NULL) {
306 <    adapter3=args.getOpt('3');
307 <    adapter3.upper();
308 <    a3len=adapter3.length();
309 <    }
310 <  if (args.getOpt('5')!=NULL) {
311 <    adapter5=args.getOpt('5');
312 <    adapter5.upper();
313 <    a5len=adapter5.length();
305 >  memset((void*)isACGT, 0, 256);
306 >  isACGT['A']=isACGT['a']=isACGT['C']=isACGT['c']=1;
307 >  isACGT['G']=isACGT['g']=isACGT['T']=isACGT['t']=1;
308 >  s=args.getOpt('f');
309 >  if (!s.is_empty()) {
310 >   loadAdaptors(s.chars());
311 >   }
312 >  bool fileAdaptors=adaptors5.Count()+adaptors3.Count();
313 >  s=args.getOpt('5');
314 >  if (!s.is_empty()) {
315 >    if (fileAdaptors)
316 >      GError("Error: options -5 and -f cannot be used together!\n");
317 >    s.upper();
318 >    addAdaptor(adaptors5, s, galn_TrimLeft);
319 >    }
320 >  s=args.getOpt('3');
321 >  if (!s.is_empty()) {
322 >    if (fileAdaptors)
323 >      GError("Error: options -3 and -f cannot be used together!\n");
324 >      s.upper();
325 >      addAdaptor(adaptors3, s, galn_TrimRight);
326      }
327 <  s=args.getOpt('a');
327 >  s=args.getOpt('y');
328    if (!s.is_empty()) {
329 <     int a_minmatch=s.asInt();
330 <     a_min_score=a_minmatch<<1;
329 >     int minmatch=s.asInt();
330 >     poly_minScore=minmatch*poly_m_score;
331       }
332  
333    if (args.getOpt('o')!=NULL) outsuffix=args.getOpt('o');
334 +                         else outsuffix="-";
335    trashReport=  (args.getOpt('r')!=NULL);
336    int fcount=args.startNonOpt();
337    if (fcount==0) {
# Line 413 | Line 346
346    if (trashReport)
347      openfw(freport, args, 'r');
348    char* infile=NULL;
349 +
350 +  if (adaptors5.Count()>0)
351 +    //gxmem_l=new CGreedyAlignData(match_reward, mismatch_penalty, Xdrop-2);
352 +        gxmem_l=new CGreedyAlignData(match_reward, mismatch_penalty, Xdrop);
353 +  if (adaptors3.Count()>0)
354 +    gxmem_r=new CGreedyAlignData(match_reward, mismatch_penalty, Xdrop);
355 +
356    while ((infile=args.nextNonOpt())!=NULL) {
357 +    //for each input file
358      int incounter=0; //counter for input reads
359      int outcounter=0; //counter for output reads
360      int trash_s=0; //too short from the get go
361      int trash_Q=0;
362      int trash_N=0;
363      int trash_D=0;
364 <    int trash_A3=0;
365 <    int trash_A5=0;
364 >    int trash_poly=0;
365 >    int trash_V=0;
366 >    num_trimN=0;
367 >    num_trimQ=0;
368 >    num_trimV=0;
369 >    num_trimA=0;
370 >    num_trimT=0;
371 >    num_trim5=0;
372 >    num_trim3=0;
373 >
374 >    b_totalIn=0;
375 >    b_totalN=0;
376 >    b_trimN=0;
377 >    b_trimQ=0;
378 >    b_trimV=0;
379 >    b_trimA=0;
380 >    b_trimT=0;
381 >    b_trim5=0;
382 >    b_trim3=0;
383 >
384      s=infile;
385      GStr infname;
386      GStr infname2;
# Line 444 | Line 403
403         int a5=0, a3=0, b5=0, b3=0;
404         char tcode=0, tcode2=0;
405         tcode=process_read(seqid, rseq, rqv, a5, a3);
406 <       //if (!doCollapse) {
407 <         if (fq2!=NULL) {
406 >       if (a5>0) {
407 >          b_trim5+=a5;
408 >          num_trim5++;
409 >          }
410 >       if (a3<rseq.length()-1) {
411 >          b_trim3+=rseq.length()-a3-1;
412 >          num_trim3++;
413 >          }
414 >       if (fq2!=NULL) {
415              getFastxRec(*fq2, rseq2, rqv2, seqid2, seqinfo2, infname2);
416              if (seqid.substr(0,seqid.length()-1)!=seqid2.substr(0,seqid2.length()-1)) {
417                 GError("Error: no paired match for read %s vs %s (%s,%s)\n",
418                    seqid.chars(), seqid2.chars(), infname.chars(), infname2.chars());
419                 }
420              tcode2=process_read(seqid2, rseq2, rqv2, b5, b3);
421 +            if (b5>0) {
422 +               b_trim5+=b5;
423 +               num_trim5++;
424 +               }
425 +            if (b3<rseq2.length()-1) {
426 +               b_trim3+=rseq2.length()-b3-1;
427 +               num_trim3++;
428 +               }
429              //decide what to do with this pair and print rseq2 if the pair makes it
430              if (tcode>1 && tcode2<=1) {
431                 //"untrash" rseq
# Line 477 | Line 451
451                 int nocounter=0;
452                 writeRead(f_out2, seqid2, seqinfo2, rseq2, rqv2, nocounter);
453                 }
454 <            } //paired read
481 <       // }
454 >            } //pair read
455         if (tcode>1) { //trashed
456 +         #ifdef GDEBUG
457 +         GMessage(" !!!!TRASH code = %c\n",tcode);
458 +         #endif
459            if (tcode=='s') trash_s++;
460 +          else if (tcode=='A' || tcode=='T') trash_poly++;
461              else if (tcode=='Q') trash_Q++;
462                else if (tcode=='N') trash_N++;
463                 else if (tcode=='D') trash_D++;
464 <                else if (tcode=='3') trash_A3++;
465 <                 else if (tcode=='5') trash_A5++;
464 >                else if (tcode=='V') trash_V++;
465 >                 //else if (tcode=='5') trash_A5++;
466            if (trashReport) trash_report(tcode, seqid, freport);
467            }
468           else if (!doCollapse) { //write it
# Line 493 | Line 470
470              rseq=rseq.substr(a5,a3-a5+1);
471              if (!rqv.is_empty()) rqv=rqv.substr(a5,a3-a5+1);
472              }
473 +         #ifdef GDEBUG
474 +            GMessage("  After trimming:\n");
475 +            GMessage("%s\n",rseq.chars());
476 +         #endif
477            writeRead(f_out, seqid, seqinfo, rseq, rqv, outcounter);
478            }
479         } //for each fastq record
# Line 520 | Line 501
501                 }
502              }
503           outcounter++;
504 <         if (qd->count>maxdup_count) {
504 >         if (qd->count>maxdup_count) {
505              maxdup_count=qd->count;
506              maxdup_seq=seq;
507              }
508           if (isfasta) {
509             if (prefix.is_empty()) {
510 <             fprintf(f_out, ">%s_x%d\n%s\n", qd->firstname, qd->count,
510 >             fprintf(f_out, ">%s_x%d\n%s\n", qd->firstname, qd->count,
511                             rseq.chars());
512               }
513             else { //use custom read name
# Line 537 | Line 518
518           else { //fastq format
519            if (convert_phred) convertPhred(qd->qv, qd->len);
520            if (prefix.is_empty()) {
521 <            fprintf(f_out, "@%s_x%d\n%s\n+\n%s\n", qd->firstname, qd->count,
521 >            fprintf(f_out, "@%s_x%d\n%s\n+\n%s\n", qd->firstname, qd->count,
522                             rseq.chars(), qd->qv);
523              }
524            else { //use custom read name
# Line 553 | Line 534
534      if (verbose) {
535         if (paired_reads) {
536             GMessage(">Input files : %s , %s\n", infname.chars(), infname2.chars());
537 <           GMessage("Number of input pairs :%9d\n", incounter);
538 <           GMessage("         Output pairs :%9d\n", outcounter);
537 >           GMessage("Number of input pairs :%9u\n", incounter);
538 >           GMessage("         Output pairs :%9u\t(%u trashed)\n", outcounter, incounter-outcounter);
539             }
540           else {
541             GMessage(">Input file : %s\n", infname.chars());
542             GMessage("Number of input reads :%9d\n", incounter);
543 <           GMessage("         Output reads :%9d\n", outcounter);
543 >           GMessage("         Output reads :%9d  (%u trashed)\n", outcounter, incounter-outcounter);
544             }
545 <       GMessage("------------------------------------\n");
546 <       if (num_trimmed5)
547 <          GMessage("           5' trimmed :%9d  (min. trim: %d)\n", num_trimmed5, min_trimmed5);
548 <       if (num_trimmed3)
549 <          GMessage("           3' trimmed :%9d  (min. trim: %d)\n", num_trimmed3, min_trimmed3);
545 >       GMessage("--------------- Read trimming: ------------------\n");
546 >       if (num_trim5)
547 >          GMessage("           5' trimmed :%9u\n", num_trim5);
548 >       if (num_trim3)
549 >          GMessage("           3' trimmed :%9u\n", num_trim3);
550 >       if (num_trimQ)
551 >          GMessage("         q.v. trimmed :%9u\n", num_trimQ);
552 >       if (num_trimN)
553 >          GMessage("            N trimmed :%9u\n", num_trimN);
554 >       if (num_trimT)
555 >          GMessage("       poly-T trimmed :%9u\n", num_trimT);
556 >       if (num_trimA)
557 >          GMessage("       poly-A trimmed :%9u\n", num_trimA);
558 >       if (num_trimV)
559 >          GMessage("      Adapter trimmed :%9u\n", num_trimV);
560         GMessage("------------------------------------\n");
561         if (trash_s>0)
562 <         GMessage("     Trashed by length:%9d\n", trash_s);
562 >         GMessage("Trashed by initial len:%9d\n", trash_s);
563         if (trash_N>0)
564           GMessage("         Trashed by N%%:%9d\n", trash_N);
565         if (trash_Q>0)
566           GMessage("Trashed by low quality:%9d\n", trash_Q);
567 <       if (trash_A5>0)
568 <         GMessage(" Trashed by 5' adapter:%9d\n", trash_A5);
569 <       if (trash_A3>0)
570 <         GMessage(" Trashed by 3' adapter:%9d\n", trash_A3);
567 >       if (trash_poly>0)
568 >         GMessage("   Trashed by poly-A/T:%9d\n", trash_poly);
569 >       if (trash_V>0)
570 >         GMessage("    Trashed by adaptor:%9d\n", trash_V);
571 >       GMessage("\n--------------- Base counts  --------------------\n");
572 >       GMessage("    Input bases   :%12llu\n", b_totalIn);
573 >       double percN=100.0* ((double)b_totalN/(double)b_totalIn);
574 >       GMessage("          N bases :%12llu (%4.2f%%)\n", b_totalN, percN);
575 >       GMessage("   trimmed from 5':%12llu\n", b_trim5);
576 >       GMessage("   trimmed from 3':%12llu\n", b_trim3);
577 >       GMessage("\n");
578 >       if (b_trimQ)
579 >       GMessage("     q.v. trimmed :%12llu\n", b_trimQ);
580 >       if (b_trimN)
581 >       GMessage("        N trimmed :%12llu\n", b_trimN);
582 >       if (b_trimT)
583 >       GMessage("   poly-T trimmed :%12llu\n", b_trimT);
584 >       if (b_trimA)
585 >       GMessage("   poly-A trimmed :%12llu\n", b_trimA);
586 >       if (b_trimV)
587 >       GMessage("  Adapter trimmed :%12llu\n", b_trimV);
588 >
589         }
590      if (trashReport) {
591            FWCLOSE(freport);
# Line 584 | Line 593
593      FWCLOSE(f_out);
594      FWCLOSE(f_out2);
595     } //while each input file
596 <
596 > delete gxmem_l;
597 > delete gxmem_r;
598   //getc(stdin);
599   }
600  
# Line 599 | Line 609
609     const char* seq;
610     bool valid;
611     NData() {
612 +    seqlen=0;
613      NCount=0;
614      end5=0;
615      end3=0;
# Line 629 | Line 640
640       perc_N=(n*100.0)/(end5-end3+1);
641       }
642   };
643 <
643 >
644   static NData feat;
645   int perc_lenN=12; // incremental distance from ends, in percentage of
646            // sequence length, where N-trimming is done (default:12 %) (autolimited to 20)
647 <          
647 >
648   void N_analyze(int l5, int l3, int p5, int p3) {
649   /* assumes feat was filled properly */
650   int old_dif, t5,t3,v;
651   if (l3<l5+2 || p5>p3 ) {
652     feat.end5=l5+1;
653     feat.end3=l3+1;
654 <   return;
654 >   return;
655     }
656  
657   t5=feat.NPos[p5]-l5;
658   t3=l3-feat.NPos[p3];
659   old_dif=p3-p5;
660   v=(int)((((double)(l3-l5))*perc_lenN)/100);
661 < if (v>20) v=20; /* enforce N-search limit for very long reads */
661 > if (v>20) v=20; /* enforce N-search limit for very long reads */
662   if (t5 < v ) {
663     l5=feat.NPos[p5]+1;
664     p5++;
# Line 664 | Line 675
675             feat.end3=l3+1;
676             return;
677             }
678 <    else
678 >    else
679        N_analyze(l5,l3, p5,p3);
680   }
681  
# Line 705 | Line 716
716   feat.init(rseq);
717   l5=feat.end5-1;
718   l3=feat.end3-1;
719 < N_analyze(feat.end5-1, feat.end3-1, 0, feat.NCount-1);
719 > N_analyze(feat.end5-1, feat.end3-1, 0, feat.NCount-1);
720   if (l5==feat.end5-1 && l3==feat.end3-1) {
721      if (feat.perc_N>max_perc_N) {
722             feat.valid=false;
# Line 723 | Line 734
734     return true;
735     }
736   feat.N_calc();
737 <
737 >
738   if (feat.perc_N>max_perc_N) {
739        feat.valid=false;
740        l3=l5+1;
# Line 735 | Line 746
746   //--------------- dust functions ----------------
747   class DNADuster {
748   public:
749 <  int dustword;
750 <  int dustwindow;
751 <  int dustwindow2;
749 >  int dustword;
750 >  int dustwindow;
751 >  int dustwindow2;
752    int dustcutoff;
753    int mv, iv, jv;
754    int counts[32*32*32];
# Line 832 | Line 843
843                      }
844             }
845           }
846 < //return first;
846 > //return first;
847   }
848   };
849  
# Line 850 | Line 861
861   return ncount;
862   }
863  
864 + struct SLocScore {
865 +  int pos;
866 +  int score;
867 +  SLocScore(int p=0,int s=0) {
868 +    pos=p;
869 +    score=s;
870 +    }
871 +  void set(int p, int s) {
872 +    pos=p;
873 +    score=s;
874 +    }
875 +  void add(int p, int add) {
876 +    pos=p;
877 +    score+=add;
878 +    }
879 + };
880  
881 < // ------------------ adapter matching - simple k-mer seed & extend, no indels for now
882 < //when a k-mer match is found, simply try to extend the alignment using a drop-off scheme
883 < //check minimum score and
884 < //for 3' adapter trimming:
885 < //     require that the right end of the alignment for either the adaptor OR the read must be
886 < //     < 3 distance from its right end
887 < // for 5' adapter trimming:
888 < //     require that the left end of the alignment for either the adaptor OR the read must
889 < //     be at coordinate < 3 from start
890 <
891 < bool extendMatch(const char* a, int alen, int ai,
892 <                 const char* b, int blen, int bi, int mlen, int& l5, int& l3, CSegChain& segs, bool end5=false) {
893 < //so the alignment starts at ai in a, bi in b, with a perfect match of length mlen
894 < #ifdef DEBUG
895 < GStr dbg(b);
869 < #endif
870 < //if (debug) {
871 < //  GMessage(">> in %s\n\textending hit: %s at position %d\n", a, (dbg.substr(bi, mlen)).chars(), ai);
872 < //  }
873 < int a_l=ai; //alignment coordinates on a
874 < int a_r=ai+mlen-1;
875 < int b_l=bi; //alignment coordinates on b
876 < int b_r=bi+mlen-1;
877 < int ai_maxscore=ai;
878 < int bi_maxscore=bi;
879 < int score=mlen*a_m_score;
880 < int maxscore=score;
881 < int mism5score=a_mis_score;
882 < if (end5 && ai<(alen>>1)) mism5score-=2; // increase penalty for mismatches at 5' end
883 < //try to extend to the left first, if possible
884 < while (ai>0 && bi>0) {
885 <   ai--;
886 <   bi--;
887 <   score+= (a[ai]==b[bi])? a_m_score : mism5score;
888 <   if (score>maxscore) {
889 <       ai_maxscore=ai;
890 <       bi_maxscore=bi;
891 <       maxscore=score;
892 <       }
893 <     else if (maxscore-score>a_dropoff_score) break;
894 <   }
895 < a_l=ai_maxscore;
896 < b_l=bi_maxscore;
897 < //if (debug) GMessage("  after l-extend: %*s%s\t\t(score=%d)\n",a_l," ",dbg.substr(b_l,b_r-b_l+1).chars(),maxscore);
898 < //now extend to the right
899 < ai_maxscore=a_r;
900 < bi_maxscore=b_r;
901 < ai=a_r;
902 < bi=b_r;
903 < score=maxscore;
904 < //sometimes there are extra AAAAs at the end of the read, ignore those
905 < if (strcmp(&a[alen-4],"AAAA")==0) {
906 <    alen-=3;
907 <    while (a[alen-1]=='A' && alen>ai) alen--;
908 <    }
909 < while (ai<alen-1 && bi<blen-1) {
910 <   ai++;
911 <   bi++;
912 <   //score+= (a[ai]==b[bi])? a_m_score : a_mis_score;
913 <   if (a[ai]==b[bi]) { //match
914 <      score+=a_m_score;
915 <      if (ai>=alen-2) {
916 <           score+=a_m_score-(alen-ai-1);
917 <           }
881 > bool trim_poly3(GStr &seq, int &l5, int &l3, const char* poly_seed) {
882 > if (!doPolyTrim) return false;
883 > int rlen=seq.length();
884 > l5=0;
885 > l3=rlen-1;
886 > int32 seedVal=*(int32*)poly_seed;
887 > char polyChar=poly_seed[0];
888 > //assumes N trimming was already done
889 > //so a poly match should be very close to the end of the read
890 > // -- find the initial match (seed)
891 > int lmin=GMAX((rlen-16), 0);
892 > int li;
893 > for (li=rlen-4;li>lmin;li--) {
894 >   if (seedVal==*(int*)&(seq[li])) {
895 >      break;
896        }
919    else { //mismatch
920      score+=a_mis_score;
921      }  
922   if (score>maxscore) {
923       ai_maxscore=ai;
924       bi_maxscore=bi;
925       maxscore=score;
926       }
927     else if (maxscore-score>a_dropoff_score) break;
897     }
898 <  a_r=ai_maxscore;
899 <  b_r=bi_maxscore;
900 <  int a_ovh3=alen-a_r-1;
901 <  int b_ovh3=blen-b_r-1;
902 <  int mmovh3=(a_ovh3<b_ovh3)? a_ovh3 : b_ovh3;
903 <  int mmovh5=(a_l<b_l)? a_l : b_l;
904 <  //if (debug) GMessage("  after r-extend: %*s%s\t\t(score=%d)\n",a_l," ",dbg.substr(b_l,b_r-b_l+1).chars(),maxscore);
905 < #ifdef DEBUG
906 <  if (debug) GMessage("     extended to: %*s\n",a_r+1,dbg.substr(b_l,b_r-b_l+1).chars());
907 < #endif
908 <  if (maxscore>=a_min_score && mmovh3<2 && mmovh5<2) {
909 <     if (a_l<a_ovh3) {
910 <        //adapter closer to the left end (typical for 5' adapter)
911 <        l5=a_r+1;
912 <        l3=alen-1;
898 > if (li<=lmin) return false;
899 > //seed found, try to extend it both ways
900 > //extend right
901 > int ri=li+3;
902 > SLocScore loc(ri, poly_m_score<<2);
903 > SLocScore maxloc(loc);
904 > //extend right
905 > while (ri<rlen-1) {
906 >   ri++;
907 >   if (seq[ri]==polyChar) {
908 >                loc.add(ri,poly_m_score);
909 >                }
910 >   else if (seq[ri]=='N') {
911 >                loc.add(ri,0);
912 >                }
913 >   else { //mismatch
914 >        loc.add(ri,poly_mis_score);
915 >        if (maxloc.score-loc.score>poly_dropoff_score) break;
916          }
917 <      else {
918 <        //adapter matching at the right end (typical for 3' adapter)
919 <        l5=0;
948 <        l3=a_l-1;
949 <        }
950 <     return true;
951 <     }
952 < else { //keep this segment pair for later (gapped alignment)
953 <   segs.addSegPair(new CSegPair(a_l+1, a_r+1, b_l+1, b_r+1, maxscore));
954 <   //this will also update min & max coordinates in segs (segs.astart, .aend, .bstart, .bend)
955 <   }
956 <  //do not trim:
957 <  l5=0;
958 <  l3=alen-1;
959 <  return false;
960 < }
961 <
962 < /*
963 < int getWordValue(const char* s, int wlen) {
964 < int r=0;
965 < while (wlen--) { r+=(((int)s[wlen])<<wlen) }
966 < return r;
967 < }
968 < */
969 < int get3mer_value(const char* s) {
970 < return (s[0]<<16)+(s[1]<<8)+s[2];
971 < }
972 <
973 < int w3_match(int qv, const char* str, int slen, int start_index=0) {
974 < if (start_index>=slen || start_index<0) return -1;
975 < for (int i=start_index;i<slen-3;i++) {
976 <   int rv=get3mer_value(str+i);
977 <   if (rv==qv) return i;
978 <   }
979 < return -1;
980 < }
981 <
982 < int w3_rmatch(int qv, const char* str, int slen, int end_index=-1) {
983 < if (end_index>=slen) return -1;
984 < if (end_index<0) end_index=slen-1;
985 < for (int i=end_index-2;i>=0;i--) {
986 <   int rv=get3mer_value(str+i);
987 <   if (rv==qv) return i;
988 <   }
989 < return -1;
990 < }
991 <
992 < int fast4match(int32 qv, const char* str, int slen, int start_index=0) {
993 < if (start_index>=slen || start_index<0) return -1;
994 < for (int i=start_index;i<slen-4;i++) {
995 <   int32* rv=(int32*)(str+i);
996 <   if (*rv==qv) return i;
997 <   }
998 < return -1;
999 < }
1000 <
1001 < int fast4rmatch(int32 qv, const char* str, int slen, int end_index=-1) {
1002 < if (end_index>=slen) return -1;
1003 < if (end_index<0) end_index=slen-1;
1004 < for (int i=end_index-3;i>=0;i--) {
1005 <   int32* rv=(int32*)(str+i);
1006 <   if (*rv==qv) return i;
1007 <   }
1008 < return -1;
1009 < }
1010 <
1011 < #ifdef DEBUG
1012 < void dbgPrintChain(CSegChain& chain, const char* aseq) {
1013 <  GStr s(aseq);
1014 <  for (int i=0;i<chain.Count();i++) {
1015 <   CSegPair& seg=*chain[i];
1016 <   GMessage("  dbg chain seg%d: %*s [%d-%d:%d-%d]\n",i,seg.a.start-1+seg.len(),
1017 <            s.substr(seg.b.start-1, seg.len()).chars(), seg.b.start,seg.b.end,seg.a.start,seg.a.end);
917 >   if (maxloc.score<=loc.score) {
918 >      maxloc=loc;
919 >      }
920     }
921 + ri=maxloc.pos;
922 + if (ri<rlen-6) return false; //no trimming wanted, too far from 3' end
923 + //ri = right boundary for the poly match
924 + //extend left
925 + loc.set(li, maxloc.score);
926 + maxloc.pos=li;
927 + while (li>0) {
928 +    li--;
929 +    if (seq[li]==polyChar) {
930 +                 loc.add(li,poly_m_score);
931 +                 }
932 +    else if (seq[li]=='N') {
933 +                 loc.add(li,0);
934 +                 }
935 +    else { //mismatch
936 +         loc.add(li,poly_mis_score);
937 +         if (maxloc.score-loc.score>poly_dropoff_score) break;
938 +         }
939 +    if (maxloc.score<=loc.score) {
940 +       maxloc=loc;
941 +       }
942 +    }
943 + li=maxloc.pos;
944 + if ((maxloc.score==poly_minScore && ri==rlen-1) ||
945 +    (maxloc.score>poly_minScore && ri>=rlen-3) ||
946 +    (maxloc.score>(poly_minScore*3) && ri>=rlen-8)) {
947 +  //trimming this li-ri match at 3' end
948 +    l3=li-1;
949 +    if (l3<0) l3=0;
950 +    return true;
951 +    }
952 + return false;
953   }
1020 #endif
954  
955 < bool trim_adapter3(GStr& seq, int&l5, int &l3) {
955 > bool trim_poly5(GStr &seq, int &l5, int &l3, const char* poly_seed) {
956 > if (!doPolyTrim) return false;
957   int rlen=seq.length();
958   l5=0;
959   l3=rlen-1;
960 < //first try a full match, we might get lucky
961 < int fi=-1;
962 < if ((fi=seq.index(adapter3))>=0) {
963 <   if (fi<rlen-fi-a3len) {//match is closer to the right end
964 <      l5=fi+a3len;
965 <      l3=rlen-1;
966 <      }
967 <    else {
968 <      l5=0;
969 <      l3=fi-1;
960 > int32 seedVal=*(int32*)poly_seed;
961 > char polyChar=poly_seed[0];
962 > //assumes N trimming was already done
963 > //so a poly match should be very close to the end of the read
964 > // -- find the initial match (seed)
965 > int lmax=GMIN(12, rlen-4);//how far from 5' end to look for 4-mer seeds
966 > int li;
967 > for (li=0;li<=lmax;li++) {
968 >   if (seedVal==*(int*)&(seq[li])) {
969 >      break;
970        }
1037   return true;
971     }
972 < #ifdef DEBUG
973 < if (debug) GMessage(">TRIM3 >>   Read: %s\n",seq.chars());
974 < #endif
975 <
976 < //also, for fast detection of other adapter-only reads that start past
977 < // the beginning of the adapter sequence, try to see if the first a3len-4
978 < // bases of the read are a substring of the adapter
979 < if (rlen>a3len-3) {
980 <   GStr rstart=seq.substr(1,a3len-4);
981 <   if ((fi=adapter3.index(rstart))>=0) {
982 <     l3=rlen-1;
983 <     l5=a3len-4;
984 <     while (fi+l5<a3len && l5<l3 && adapter3[fi+l5]==seq[l5]) l5++;
985 <     return true;
986 <     }
987 <  }
988 < CSegChain a3segs; //no chains here, just an ordered collection of segment pairs
989 <  //check the easy cases - 11 bases exact match at the end
990 < int fdlen=11;
991 <  if (a3len<16) {
1059 <   fdlen=a3len>>1;
1060 <   }
1061 < if (fdlen>4) {
1062 <     //check if we're lucky enough to have the last 11 bases of the read a part of the adapter
1063 <     GStr rstart=seq.substr(-fdlen-3,fdlen);
1064 <     if ((fi=adapter3.index(rstart))>=0) {
1065 < #ifdef DEBUG
1066 <       if (debug) GMessage("  W11match found: %*s\n", rlen-3, (adapter3.substr(fi,fdlen)).chars());
1067 < #endif
1068 <       if (extendMatch(seq.chars(), rlen, rlen-fdlen-3,
1069 <                     adapter3.chars(), a3len, fi,  fdlen, l5,l3, a3segs))
1070 <            return true;
1071 <       }
1072 <     //another easy case: first 11 characters of the adaptor found as a substring of the read
1073 <     GStr bstr=adapter3.substr(0, fdlen);
1074 <     if ((fi=seq.rindex(bstr))>=0) {
1075 < #ifdef DEBUG
1076 <       if (debug) GMessage("  A11match found: %*s\n", fi+fdlen, bstr.chars());
1077 < #endif
1078 <       if (extendMatch(seq.chars(), rlen, fi,
1079 <                     adapter3.chars(), a3len, 0,  fdlen, l5,l3, a3segs))
1080 <            return true;
972 > if (li>lmax) return false;
973 > //seed found, try to extend it both ways
974 > //extend left
975 > int ri=li+3; //save rightmost base of the seed
976 > SLocScore loc(li, poly_m_score<<2);
977 > SLocScore maxloc(loc);
978 > while (li>0) {
979 >    li--;
980 >    if (seq[li]==polyChar) {
981 >                 loc.add(li,poly_m_score);
982 >                 }
983 >    else if (seq[li]=='N') {
984 >                 loc.add(li,0);
985 >                 }
986 >    else { //mismatch
987 >         loc.add(li,poly_mis_score);
988 >         if (maxloc.score-loc.score>poly_dropoff_score) break;
989 >         }
990 >    if (maxloc.score<=loc.score) {
991 >       maxloc=loc;
992         }
993 <     } //tried to match 11 bases first
994 <    
995 < //no easy cases, so let's do the wmer hashing for the first 12 bases of the adaptor
996 < //-- only extend if the match is in the 3' (ending) region of the read
997 < int wordSize=3;
998 < int hlen=12;
999 < if (hlen>a3len-wordSize) hlen=a3len-wordSize;
1000 < int imin=rlen>>1; //last half of the read, left boundary for the wmer match
1001 < if (imin<a3len) { imin=GMIN(a3len, rlen-wordSize); }
1002 < imin=rlen-imin;
1003 < for (int iw=0;iw<hlen;iw++) {
1004 <   //int32* qv=(int32*)(adapter3.chars()+iw);
1005 <   int qv=get3mer_value(adapter3.chars()+iw);
1006 <   fi=-1;
1007 <   //while ((fi=fast4rmatch(*qv, seq.chars(), rlen, fi))>=0 && fi>=imin) {
1008 <   while ((fi=w3_rmatch(qv, seq.chars(), rlen, fi))>=0 && fi>=imin) {
1009 <     //GMessage(" ... fi=%d after w3_rmatch() (imin=%d)\n", fi, imin);
993 >    }
994 > li=maxloc.pos;
995 > if (li>5) return false; //no trimming wanted, too far from 5' end
996 > //li = right boundary for the poly match
997 >
998 > //extend right
999 > loc.set(ri, maxloc.score);
1000 > maxloc.pos=ri;
1001 > while (ri<rlen-1) {
1002 >   ri++;
1003 >   if (seq[ri]==polyChar) {
1004 >                loc.add(ri,poly_m_score);
1005 >                }
1006 >   else if (seq[ri]=='N') {
1007 >                loc.add(ri,0);
1008 >                }
1009 >   else { //mismatch
1010 >        loc.add(ri,poly_mis_score);
1011 >        if (maxloc.score-loc.score>poly_dropoff_score) break;
1012 >        }
1013 >   if (maxloc.score<=loc.score) {
1014 >      maxloc=loc;
1015 >      }
1016 >   }
1017 > ri=maxloc.pos;
1018 > if ((maxloc.score==poly_minScore && li==0) ||
1019 >     (maxloc.score>poly_minScore && li<2)
1020 >     || (maxloc.score>(poly_minScore*3) && li<8)) {
1021 >    //adjust l5 to reflect this trimming of 5' end
1022 >    l5=ri+1;
1023 >    if (l5>rlen-1) l5=rlen-1;
1024 >    return true;
1025 >    }
1026 > return false;
1027 > }
1028  
1029 < #ifdef DEBUG
1030 <     if (debug) GMessage("    Wmatch found: %*s\n", fi+wordSize, (adapter3.substr(iw,wordSize)).chars());
1031 < #endif
1032 <     if (extendMatch(seq.chars(), rlen, fi, adapter3.chars(),
1033 <                   a3len, iw, wordSize, l5,l3, a3segs)) return true;
1034 <     fi--;
1035 <     if (fi<imin) break;
1036 <     }
1037 <   } //for each wmer in the first hlen bases of the adaptor
1038 < /*
1039 < //couldn't find a good trimming extension, hash 12 more bases of the adapter to collect more segment pairs there
1040 < //but only do this if we already have segment pairs collected in the last 12 bases of the adapter
1041 < if (a3segs.bstart>3 || a3segs.bend<(uint)(hlen-wordSize)) return false;
1042 < int hlen2=a3len-wordSize;
1043 < //if (hlen2>a3len-4) hlen2=a3len-4;
1044 < if (hlen2>hlen) {
1116 < #ifdef DEBUG
1117 <     if (debug && a3segs.Count()>0) {
1118 <        GMessage("  >>>>>2nd. hash: %s\n",seq.chars());
1029 > bool trim_adaptor3(GStr& seq, int&l5, int &l3) {
1030 > if (adaptors3.Count()==0) return false;
1031 > int rlen=seq.length();
1032 > l5=0;
1033 > l3=rlen-1;
1034 > bool trimmed=false;
1035 > GStr wseq(seq);
1036 > int wlen=rlen;
1037 > GXSeqData seqdata;
1038 > int numruns=revCompl ? 2 : 1;
1039 > GList<GXAlnInfo> bestalns(true, true, false);
1040 > for (int ai=0;ai<adaptors3.Count();ai++) {
1041 >   for (int r=0;r<numruns;r++) {
1042 >     if (r) {
1043 >          seqdata.update(adaptors3[ai].seqr.chars(), adaptors3[ai].seqr.length(),
1044 >                 adaptors3[ai].pzr, wseq.chars(), wlen, adaptors3[ai].amlen);
1045          }
1046 < #endif
1047 <     for (int iw=hlen;iw<hlen2;iw++) {
1048 <         //int* qv=(int32 *)(adapter3.chars()+iw);
1049 <         int qv=get3mer_value(adapter3.chars()+iw);
1050 <         fi=-1;
1051 <         //while ((fi=fast4rmatch(*qv, seq.chars(), rlen, fi))>=0 && fi>=imin) {
1052 <         while ((fi=w3_rmatch(qv, seq.chars(), rlen, fi))>=0 && fi>=imin) {
1053 <           extendMatch(seq.chars(), rlen, fi, adapter3.chars(),
1054 <                         a3len, iw, wordSize, l5,l3, a3segs);
1055 <           fi--;
1056 <           if (fi<imin) break;
1057 <           }
1058 <         } //for each wmer between hlen2 and hlen bases of the adaptor
1046 >     else {
1047 >            seqdata.update(adaptors3[ai].seq.chars(), adaptors3[ai].seq.length(),
1048 >                 adaptors3[ai].pz, wseq.chars(), wlen, adaptors3[ai].amlen);
1049 >        }
1050 >     GXAlnInfo* aln=match_adaptor(seqdata, adaptors3[ai].trim_type, gxmem_r, 86);
1051 >         if (aln) {
1052 >           if (aln->strong) {
1053 >                   trimmed=true;
1054 >                   bestalns.Add(aln);
1055 >                   break; //will check the rest next time
1056 >                   }
1057 >            else bestalns.Add(aln);
1058 >           }
1059 >   }//forward and reverse adaptors
1060 >   if (trimmed) break; //will check the rest in the next cycle
1061 >  }//for each 3' adaptor
1062 > if (bestalns.Count()>0) {
1063 >           GXAlnInfo* aln=bestalns[0];
1064 >           if (aln->sl-1 > wlen-aln->sr) {
1065 >                   //keep left side
1066 >                   l3-=(wlen-aln->sl+1);
1067 >                   if (l3<0) l3=0;
1068 >                   }
1069 >           else { //keep right side
1070 >                   l5+=aln->sr;
1071 >                   if (l5>=rlen) l5=rlen-1;
1072 >                   }
1073 >           //delete aln;
1074 >           //if (l3-l5+1<min_read_len) return true;
1075 >           wseq=seq.substr(l5,l3-l5+1);
1076 >           wlen=wseq.length();
1077 >           return true; //break the loops here to report a good find
1078       }
1079 < //lastly, analyze collected a3segs for a possible gapped alignment:
1135 < GList<CSegChain> segchains(false,true,false);
1136 < #ifdef DEBUG
1137 < if (debug && a3segs.Count()>0) {
1138 <   GMessage(">>>>>>>>>   Read: %s\n",seq.chars());
1139 <   }
1140 < #endif
1141 < for (int i=0;i<a3segs.Count();i++) {
1142 <   if (a3segs[i]->chain==NULL) {
1143 <       if (a3segs[i]->b.start>3) continue; //don't start a hopeless chain
1144 <       CSegChain* newchain=new CSegChain();
1145 <       newchain->setFreeItem(false);
1146 <       newchain->addSegPair(a3segs[i]);
1147 <       a3segs[i]->chain=newchain;
1148 <       segchains.Add(newchain); //just to free them when done
1149 <       }
1150 <   for (int j=i+1;j<a3segs.Count();j++) {
1151 <      CSegChain* chain=a3segs[i]->chain;
1152 <      if (chain->extendChain(a3segs[j])) {
1153 <          a3segs[j]->chain=chain;
1154 < #ifdef DEBUG
1155 <          if (debug) dbgPrintChain(*chain, adapter3.chars());
1156 < #endif      
1157 <          //save time by checking here if the extended chain is already acceptable for trimming
1158 <          if (chain->aend>(uint)(rlen-4) && chain->bstart<4 && chain->score>a_min_chain_score) {
1159 <            l5=0;
1160 <            l3=chain->astart-2;
1161 < #ifdef DEBUG
1162 <          if (debug && a3segs.Count()>0) {
1163 <            GMessage(">>> >> trimmed-3: %*s\n",l3-l5+1,seq.substr(l5,l3-l5+1).chars());
1164 <            }
1165 < #endif
1166 <            return true;
1167 <            }
1168 <          } //chain can be extended
1169 <      }
1170 <   } //collect segment alignments into chains
1171 < */  
1172 < return false; //no adapter parts found
1079 >  return false;
1080   }
1081  
1082 < bool trim_adapter5(GStr& seq, int&l5, int &l3) {
1083 < //if (debug) GMessage("trim_adapter5 on: %s\n", seq.chars());
1082 > bool trim_adaptor5(GStr& seq, int&l5, int &l3) {
1083 > if (adaptors5.Count()==0) return false;
1084   int rlen=seq.length();
1085   l5=0;
1086   l3=rlen-1;
1087 < //try to see if adapter is fully included in the read
1088 < int fi=-1;
1089 < if ((fi=seq.index(adapter5))>=0) {
1090 <   if (fi<rlen-fi-a5len) {//match is closer to the right end
1091 <      l5=fi+a5len;
1092 <      l3=rlen-1;
1093 <      }
1094 <    else {
1095 <      l5=0;
1096 <      l3=fi-1;
1097 <      }
1191 <   return true;
1192 <   }
1193 < #ifdef DEBUG
1194 < if (debug) GMessage(">TRIM5 >>   Read: %s\n",seq.chars());
1195 < #endif
1196 <
1197 < CSegChain a5segs(true); //list of segment pairs to analyze later if no extendMatch succeeded
1198 <
1199 < //try the easy way out first - look for an exact match of 11 bases
1200 < int fdlen=11;
1201 <  if (a5len<16) {
1202 <   fdlen=a5len>>1;
1203 <   }
1204 < if (fdlen>4) {
1205 <     GStr rstart=seq.substr(1,fdlen); //skip the first base as it's sometimes bogus
1206 <     if ((fi=adapter5.index(rstart))>=0) {
1207 < #ifdef DEBUG
1208 <       if (debug) GMessage("  W11match found: %*s\n", 1+fdlen, (adapter3.substr(fi,fdlen)).chars());
1209 < #endif
1210 <       if (extendMatch(seq.chars(), rlen, 1,
1211 <                     adapter5.chars(), a5len, fi,  fdlen, l5,l3, a5segs, true))
1212 <           return true;
1213 <       }
1214 <     //another easy case: last 11 characters of the adaptor found as a substring of the read
1215 <     GStr bstr=adapter5.substr(-fdlen);
1216 <     if ((fi=seq.index(bstr))>=0) {
1217 < #ifdef DEBUG
1218 <       if (debug) GMessage("  A11match found: %*s\n", fi+fdlen, bstr.chars());
1219 < #endif
1220 <       if (extendMatch(seq.chars(), rlen, fi,
1221 <                     adapter5.chars(), a5len, a5len-fdlen,  fdlen, l5,l3,a5segs,true))
1222 <          return true;
1223 <       }
1224 <     } //tried to matching at most 11 bases first
1225 <    
1226 < //-- no easy cases, do the wmer hashing for the last 12 bases of the adaptor
1227 < //-- only extend a wmer if it matches in the 5' (beginning) region of the read
1228 < int wordSize=3;
1229 < int hlen=12;
1230 < if (hlen>a5len-wordSize) hlen=a5len-wordSize;
1231 < int imax=rlen>>1; //first half of the read, right boundary for the wmer match
1232 < if (imax<a5len) { imax=GMIN(a5len, rlen-wordSize); }
1233 < for (int iw=0;iw<=hlen;iw++) {
1234 <   int apstart=a5len-iw-wordSize;
1235 <   fi=0;
1236 <   //int* qv=(int32 *)(adapter5.chars()+apstart);
1237 <   int qv=get3mer_value(adapter5.chars()+apstart);
1238 <   //while ((fi=fast4match(*qv, seq.chars(), rlen, fi))>=0 && fi<=imax) {
1239 <   while ((fi=w3_match(qv, seq.chars(), rlen, fi))>=0 && fi<=imax) {
1240 < #ifdef DEBUG
1241 <     if (debug) GMessage("    Wmatch found: %*s\n", fi+wordSize, (adapter5.substr(apstart,wordSize)).chars());
1242 < #endif
1243 <     if (extendMatch(seq.chars(), rlen, fi, adapter5.chars(),
1244 <                a5len, apstart, wordSize, l5,l3, a5segs, true)) return true;
1245 <     fi++;
1246 <     if (fi>imax) break;
1247 <     }
1248 <   } //for each wmer in the last hlen bases of the adaptor
1249 < /*
1250 <
1251 < //couldn't find a good trimming extension, hash 12 more bases of the adapter to collect more segment pairs there
1252 < //but only do this if we already have segment pairs collected in the last 12 bases of the adapter
1253 < if (a5segs.bend<(uint)(a5len-3) || a5segs.bstart>(uint)(a5len-hlen+4)) return false;
1254 < int hlen2=a5len-wordSize;
1255 < //if (hlen2>a5len-wordSize) hlen2=a5len-wordSize;
1256 < #ifdef DEBUG
1257 <      if (debug && a5segs.Count()>0) {
1258 <        GMessage("  >>>>>2nd. hash: %s\n",seq.chars());
1087 > bool trimmed=false;
1088 > GStr wseq(seq);
1089 > int wlen=rlen;
1090 > GXSeqData seqdata;
1091 > int numruns=revCompl ? 2 : 1;
1092 > GList<GXAlnInfo> bestalns(true, true, false);
1093 > for (int ai=0;ai<adaptors5.Count();ai++) {
1094 >   for (int r=0;r<numruns;r++) {
1095 >     if (r) {
1096 >          seqdata.update(adaptors5[ai].seqr.chars(), adaptors5[ai].seqr.length(),
1097 >                 adaptors5[ai].pzr, wseq.chars(), wlen, adaptors5[ai].amlen);
1098          }
1099 < #endif
1100 < if (hlen2>hlen) {
1101 <     for (int iw=hlen+1;iw<=hlen2;iw++) {
1102 <         int apstart=a5len-iw-wordSize;
1103 <         fi=0;
1104 <         //int* qv=(int32 *)(adapter5.chars()+apstart);
1105 <         int qv=get3mer_value(adapter5.chars()+apstart);
1106 <         //while ((fi=fast4match(*qv, seq.chars(), rlen, fi))>=0 && fi<=imax) {
1107 <         while ((fi=w3_match(qv, seq.chars(), rlen, fi))>=0 && fi<=imax) {
1108 <           extendMatch(seq.chars(), rlen, fi, adapter5.chars(),
1109 <                      a5len, apstart, wordSize, l5,l3, a5segs, true);
1110 <           fi++;
1111 <           if (fi>imax) break;
1112 <           }
1113 <         } //for each wmer between hlen2 and hlen bases of the adaptor
1099 >     else {
1100 >            seqdata.update(adaptors5[ai].seq.chars(), adaptors5[ai].seq.length(),
1101 >                 adaptors5[ai].pz, wseq.chars(), wlen, adaptors5[ai].amlen);
1102 >        }
1103 >         GXAlnInfo* aln=match_adaptor(seqdata, adaptors5[ai].trim_type, gxmem_l, 90);
1104 >         if (aln) {
1105 >           if (aln->strong) {
1106 >                   trimmed=true;
1107 >                   bestalns.Add(aln);
1108 >                   break; //will check the rest next time
1109 >                   }
1110 >            else bestalns.Add(aln);
1111 >           }
1112 >         } //forward and reverse?
1113 >   if (trimmed) break; //will check the rest in the next cycle
1114 >  }//for each 5' adaptor
1115 >  if (bestalns.Count()>0) {
1116 >           GXAlnInfo* aln=bestalns[0];
1117 >           if (aln->sl-1 > wlen-aln->sr) {
1118 >                   //keep left side
1119 >                   l3-=(wlen-aln->sl+1);
1120 >                   if (l3<0) l3=0;
1121 >                   }
1122 >           else { //keep right side
1123 >                   l5+=aln->sr;
1124 >                   if (l5>=rlen) l5=rlen-1;
1125 >                   }
1126 >           //delete aln;
1127 >           //if (l3-l5+1<min_read_len) return true;
1128 >           wseq=seq.substr(l5,l3-l5+1);
1129 >           wlen=wseq.length();
1130 >           return true; //break the loops here to report a good find
1131       }
1132 < if (a5segs.bend<(uint)(a5len-3) || a5segs.bstart>(uint)(a5len-hlen+4)) return false;
1133 < // lastly, analyze collected a5segs for a possible gapped alignment:
1278 < GList<CSegChain> segchains(false,true,false);
1279 < #ifdef DEBUG
1280 < if (debug && a5segs.Count()>0) {
1281 <   GMessage(">>>>>>>>>   Read: %s\n",seq.chars());
1282 <   }
1283 < #endif
1284 < for (int i=0;i<a5segs.Count();i++) {
1285 <   if (a5segs[i]->chain==NULL) {
1286 <       if (a5segs[i]->b.end<(int)(a5len-4)) continue; //don't start a hopeless chain
1287 <       CSegChain* newchain=new CSegChain(true);
1288 <       newchain->setFreeItem(false);
1289 <       newchain->addSegPair(a5segs[i]);
1290 <       a5segs[i]->chain=newchain;
1291 <       segchains.Add(newchain); //just to free them when done
1292 <       }
1293 <   for (int j=i+1;j<a5segs.Count();j++) {
1294 <      CSegChain* chain=a5segs[i]->chain;
1295 <      if (chain->extendChain(a5segs[j])) {
1296 <         a5segs[j]->chain=chain;
1297 < #ifdef DEBUG
1298 <         if (debug) dbgPrintChain(*chain, adapter5.chars());
1299 < #endif      
1300 <      //save time by checking here if the extended chain is already acceptable for trimming
1301 <         if (chain->bend>(uint)(a5len-3) && chain->astart<4 && chain->score>a_min_chain_score) {
1302 <            l5=chain->aend;
1303 <            l3=rlen-1;
1304 <            return true;
1305 <            }
1306 <         } //chain can be extended
1307 <      }
1308 <   } //collect segment alignments into chains
1309 < */
1310 < return false; //no adapter parts found
1311 < }
1132 >  return false;
1133 > }
1134  
1135 < //convert qvs to/from phred64 from/to phread33
1135 > //convert qvs to/from phred64 from/to phread33
1136   void convertPhred(GStr& q) {
1137   for (int i=0;i<q.length();i++) q[i]+=qv_cvtadd;
1138   }
# Line 1319 | Line 1141
1141   for (int i=0;i<len;i++) q[i]+=qv_cvtadd;
1142   }
1143  
1144 < bool getFastxRec(GLineReader& fq, GStr& rseq, GStr& rqv,
1144 > bool getFastxRec(GLineReader& fq, GStr& rseq, GStr& rqv,
1145            GStr& rname, GStr& rinfo, GStr& infname) {
1146   rseq="";
1147   rqv="";
# Line 1335 | Line 1157
1157        } //raw qseq format
1158   else { // FASTQ or FASTA */
1159   isfasta=(l[0]=='>');
1160 < if (!isfasta && l[0]!='@') GError("Error: fasta/fastq record marker not found(%s)\n%s\n",
1160 > if (!isfasta && l[0]!='@') GError("Error: fasta/fastq record marker not found(%s)\n%s\n",
1161        infname.chars(), l);
1162   GStr s(l);
1163   rname=&(l[1]);
1164   for (int i=0;i<rname.length();i++)
1165 <    if (rname[i]<=' ') {
1166 <       if (i<rname.length()-2) rinfo=rname.substr(i+1);
1167 <       rname.cut(i);
1168 <       break;
1165 >    if (rname[i]<=' ') {
1166 >       if (i<rname.length()-2) rinfo=rname.substr(i+1);
1167 >       rname.cut(i);
1168 >       break;
1169         }
1170    //now get the sequence
1171 < if ((l=fq.getLine())==NULL)
1171 > if ((l=fq.getLine())==NULL)
1172        GError("Error: unexpected EOF after header for read %s (%s)\n",
1173                     rname.chars(), infname.chars());
1174   rseq=l; //this must be the DNA line
1175   while ((l=fq.getLine())!=NULL) {
1176        //seq can span multiple lines
1177        if (l[0]=='>' || l[0]=='+') {
1178 <           fq.pushBack();
1178 >           fq.pushBack();
1179             break; //
1180             }
1181        rseq+=l;
1182 <      } //check for multi-line seq
1182 >      } //check for multi-line seq
1183   if (!isfasta) { //reading fastq quality values, which can also be multi-line
1184      if ((l=fq.getLine())==NULL)
1185          GError("Error: unexpected EOF after sequence for %s\n", rname.chars());
1186      if (l[0]!='+') GError("Error: fastq qv header marker not detected!\n");
1187 <    if ((l=fq.getLine())==NULL)
1187 >    if ((l=fq.getLine())==NULL)
1188          GError("Error: unexpected EOF after qv header for %s\n", rname.chars());
1189      rqv=l;
1190 <    //if (rqv.length()!=rseq.length())
1190 >    //if (rqv.length()!=rseq.length())
1191      //  GError("Error: qv len != seq len for %s\n", rname.chars());
1192      while (rqv.length()<rseq.length() && ((l=fq.getLine())!=NULL)) {
1193        rqv+=l; //append to qv string
1194        }
1195      }// fastq
1196   // } //<-- FASTA or FASTQ
1197 < rseq.upper(); //TODO: what if we care about masking?
1197 > rseq.upper();
1198   return true;
1199   }
1200  
1201 + #ifdef GDEBUG
1202 + void showTrim(GStr& s, int l5, int l3) {
1203 +  if (l5>0 || l3==0) {
1204 +    color_bg(c_red);
1205 +    }
1206 +  for (int i=0;i<s.length()-1;i++) {
1207 +    if (i && i==l5) color_resetbg();
1208 +    fprintf(stderr, "%c", s[i]);
1209 +    if (i && i==l3) color_bg(c_red);
1210 +   }
1211 +  fprintf(stderr, "%c", s[s.length()-1]);
1212 +  color_reset();
1213 +  fprintf(stderr, "\n");
1214 + }
1215 + #endif
1216 +
1217   char process_read(GStr& rname, GStr& rseq, GStr& rqv, int &l5, int &l3) {
1218 < //returns 0 if the read was untouched, 1 if it was just trimmed
1218 > //returns 0 if the read was untouched, 1 if it was just trimmed
1219   // and a trash code if it was trashed
1220   l5=0;
1221   l3=rseq.length()-1;
1222 + #ifdef GDEBUG
1223 +   //rseq.reverse();
1224 +   GMessage(">%s\n", rname.chars());
1225 +   GMessage("%s\n",rseq.chars());
1226 + #endif
1227   if (l3-l5+1<min_read_len) {
1228     return 's';
1229     }
# Line 1388 | Line 1231
1231   GStr wqv(rqv.chars());
1232   int w5=l5;
1233   int w3=l3;
1234 + //count Ns
1235 + b_totalIn+=rseq.length();
1236 + for (int i=0;i<rseq.length();i++) {
1237 + if (isACGT[(int)rseq[i]]==0) b_totalN++;
1238 + }
1239 +
1240   //first do the q-based trimming
1241   if (qvtrim_qmin!=0 && !wqv.is_empty() && qtrim(wqv, w5, w3)) { // qv-threshold trimming
1242 +   b_trimQ+=(w5-l5)+(l3-w3);
1243 +   num_trimQ++;
1244     if (w3-w5+1<min_read_len) {
1245       return 'Q'; //invalid read
1246       }
# Line 1402 | Line 1253
1253     } //qv trimming
1254   // N-trimming on the remaining read seq
1255   if (ntrim(wseq, w5, w3)) {
1256 +   //Note: ntrim resets w5 and w3
1257     //GMessage("before: %s\n",wseq.chars());
1258     //GMessage("after : %s (%d)\n",wseq.substr(w5,w3-w5+1).chars(),w3-w5+1);
1259 +   int trim3=(wseq.length()-1-w3);
1260 +   b_trimN+=w5+trim3;
1261 +   num_trimN++;
1262     l5+=w5;
1263 <   l3-=(wseq.length()-1-w3);
1263 >   l3-=trim3;
1264     if (w3-w5+1<min_read_len) {
1265       return 'N'; //to be trashed
1266       }
# Line 1416 | Line 1271
1271     w5=0;
1272     w3=wseq.length()-1;
1273     }
1274 < if (a3len>0) {
1275 <  if (trim_adapter3(wseq, w5, w3)) {
1276 <     int trimlen=wseq.length()-(w3-w5+1);
1277 <     num_trimmed3++;
1278 <     if (trimlen<min_trimmed3)
1279 <         min_trimmed3=trimlen;
1280 <     l5+=w5;
1281 <     l3-=(wseq.length()-1-w3);
1282 <     if (w3-w5+1<min_read_len) {
1283 <         return '3';
1284 <         }
1285 <      //-- keep only the w5..w3 range
1286 <      wseq=wseq.substr(w5, w3-w5+1);
1287 <      if (!wqv.is_empty())
1288 <         wqv=wqv.substr(w5, w3-w5+1);
1289 <      }//some adapter was trimmed
1290 <   } //adapter trimming
1291 < if (a5len>0) {
1292 <  if (trim_adapter5(wseq, w5, w3)) {
1293 <     int trimlen=wseq.length()-(w3-w5+1);
1294 <     num_trimmed5++;
1295 <     if (trimlen<min_trimmed5)
1296 <         min_trimmed5=trimlen;
1274 > char trim_code;
1275 > //clean the more dirty end first - 3'
1276 > bool trimmedA=false;
1277 > bool trimmedT=false;
1278 > bool trimmedV=false;
1279 >
1280 > int prev_w5=0;
1281 > int prev_w3=0;
1282 > bool w3upd=true;
1283 > bool w5upd=true;
1284 > do {
1285 >  trim_code=0;
1286 >  if (w3upd && trim_poly3(wseq, w5, w3, polyA_seed)) {
1287 >      trim_code='A';
1288 >      b_trimA+=(w5+(wseq.length()-1-w3));
1289 >      if (!trimmedA) { num_trimA++; trimmedA=true; }
1290 >      }
1291 >  else if (w3upd && trim_poly3(wseq, w5, w3, polyT_seed)) {
1292 >      trim_code='T';
1293 >      b_trimT+=(w5+(wseq.length()-1-w3));
1294 >      if (!trimmedT) { num_trimT++; trimmedT=true; }
1295 >      }
1296 >  else if (w5upd && trim_poly5(wseq, w5, w3, polyA_seed)) {
1297 >      trim_code='A';
1298 >      b_trimA+=(w5+(wseq.length()-1-w3));
1299 >      if (!trimmedA) { num_trimA++; trimmedA=true; }
1300 >      }
1301 >  else if (w5upd && trim_poly5(wseq, w5, w3, polyT_seed)) {
1302 >      trim_code='T';
1303 >      b_trimT+=(w5+(wseq.length()-1-w3));
1304 >      if (!trimmedT) { num_trimT++; trimmedT=true; }
1305 >      }
1306 >  else if (trim_adaptor5(wseq, w5, w3)) {
1307 >      trim_code='V';
1308 >      b_trimV+=(w5+(wseq.length()-1-w3));
1309 >      if (!trimmedV) { num_trimV++; trimmedV=true; }
1310 >      }
1311 >  else if (trim_adaptor3(wseq, w5, w3)) {
1312 >      trim_code='V';
1313 >      b_trimV+=(w5+(wseq.length()-1-w3));
1314 >      if (!trimmedV) { num_trimV++; trimmedV=true; }
1315 >      }
1316 >  if (trim_code) {
1317 >     w3upd=(w3!=prev_w3);
1318 >         w5upd=(w5!=prev_w5);
1319 >         if (w3upd) prev_w3=w3;
1320 >         if (w5upd) prev_w5=w5;
1321 >   #ifdef GDEBUG
1322 >     GMessage("#### TRIM by '%c' code ( w5-w3 = %d-%d ):\n",trim_code, w5,w3);
1323 >     showTrim(wseq, w5, w3);
1324 >   #endif
1325 >     //int trimlen=wseq.length()-(w3-w5+1);
1326       l5+=w5;
1327       l3-=(wseq.length()-1-w3);
1328       if (w3-w5+1<min_read_len) {
1329 <         return '5';
1329 >         return trim_code;
1330           }
1331        //-- keep only the w5..w3 range
1332        wseq=wseq.substr(w5, w3-w5+1);
1333        if (!wqv.is_empty())
1334           wqv=wqv.substr(w5, w3-w5+1);
1335 <      }//some adapter was trimmed
1336 <   } //adapter trimming
1335 >      }//trimming at 3' end
1336 > } while (trim_code);
1337   if (doCollapse) {
1338     //keep read for later
1339     FqDupRec* dr=dhash.Find(wseq.chars());
1340     if (dr==NULL) { //new entry
1341 <          //if (prefix.is_empty())
1342 <             dhash.Add(wseq.chars(),
1341 >          //if (prefix.is_empty())
1342 >             dhash.Add(wseq.chars(),
1343                    new FqDupRec(&wqv, rname.chars()));
1344            //else dhash.Add(wseq.chars(), new FqDupRec(wqv.chars(),wqv.length()));
1345           }
# Line 1489 | Line 1373
1373         fprintf(f_out, "%s\n", rseq.chars()); //plain one-line fasta for now
1374         }
1375        else {
1376 <       fprintf(f_out, ">%s%08d\n%s\n", prefix.chars(), outcounter,
1376 >       fprintf(f_out, ">%s%08d\n%s\n", prefix.chars(), outcounter,
1377                            rseq.chars());
1378         }
1379       }
# Line 1500 | Line 1384
1384         fprintf(f_out, "%s\n+\n%s\n", rseq.chars(), rqv.chars());
1385         }
1386        else
1387 <       fprintf(f_out, "@%s_%08d\n%s\n+\n%s\n", prefix.chars(), outcounter,
1387 >       fprintf(f_out, "@%s_%08d\n%s\n+\n%s\n", prefix.chars(), outcounter,
1388                            rseq.chars(),rqv.chars() );
1389       }
1390   }
# Line 1508 | Line 1392
1392   void trash_report(char trashcode, GStr& rname, FILE* freport) {
1393   if (freport==NULL || trashcode<=' ') return;
1394   if (trashcode=='3' || trashcode=='5') {
1395 <   fprintf(freport, "%s\tA%c\n",rname.chars(),trashcode);
1395 >   fprintf(freport, "%s\ta%c\n",rname.chars(),trashcode);
1396     }
1397   else {
1398     fprintf(freport, "%s\t%c\n",rname.chars(),trashcode);
# Line 1600 | Line 1484
1484      }
1485   }
1486  
1487 < void setupFiles(FILE*& f_in, FILE*& f_in2, FILE*& f_out, FILE*& f_out2,
1488 <                       GStr& s, GStr& infname, GStr& infname2) {
1487 > void addAdaptor(GVec<CASeqData>& adaptors, GStr& seq, GAlnTrimType trim_type) {
1488 > //TODO: prepare CASeqData here, and collect hexamers as well
1489 >  if (seq.is_empty() || seq=="-" ||
1490 >          seq=="N/A" || seq==".") return;
1491 >
1492 > CASeqData adata(revCompl);
1493 > int idx=adaptors.Add(adata);
1494 > if (idx<0) GError("Error: failed to add adaptor!\n");
1495 > adaptors[idx].trim_type=trim_type;
1496 > adaptors[idx].update(seq.chars());
1497 > }
1498 >
1499 >
1500 > int loadAdaptors(const char* fname) {
1501 >  GLineReader lr(fname);
1502 >  char* l;
1503 >  while ((l=lr.nextLine())!=NULL) {
1504 >   if (lr.length()<=3 || l[0]=='#') continue;
1505 >   if ( l[0]==' ' || l[0]=='\t' || l[0]==',' ||
1506 >        l[0]==';'|| l[0]==':' ) {
1507 >      int i=1;
1508 >      while (l[i]!=0 && isspace(l[i])) {
1509 >        i++;
1510 >        }
1511 >      if (l[i]!=0) {
1512 >        GStr s(&(l[i]));
1513 >      #ifdef GDEBUG
1514 >          //s.reverse();
1515 >      #endif
1516 >        addAdaptor(adaptors3, s, galn_TrimRight);
1517 >        continue;
1518 >        }
1519 >      }
1520 >    else {
1521 >      GStr s(l);
1522 >      s.startTokenize("\t ;,:");
1523 >      GStr a5,a3;
1524 >      if (s.nextToken(a5))
1525 >            s.nextToken(a3);
1526 >        else continue; //no tokens on this line
1527 >      GAlnTrimType ttype5=galn_TrimLeft;
1528 >      a5.upper();
1529 >      a3.upper();
1530 >      if (a3.is_empty() || a3==a5 || a3=="=") {
1531 >         a3.clear();
1532 >         ttype5=galn_TrimEither;
1533 >         }
1534 >     #ifdef GDEBUG
1535 >     //   a5.reverse();
1536 >     //   a3.reverse();
1537 >     #endif
1538 >      addAdaptor(adaptors5, a5, ttype5);
1539 >      addAdaptor(adaptors3, a3, galn_TrimRight);
1540 >      }
1541 >   }
1542 >   return adaptors5.Count()+adaptors3.Count();
1543 > }
1544 >
1545 > void setupFiles(FILE*& f_in, FILE*& f_in2, FILE*& f_out, FILE*& f_out2,
1546 >                       GStr& s, GStr& infname, GStr& infname2) {
1547   // uses outsuffix to generate output file names and open file handles as needed
1548   infname="";
1549   infname2="";
# Line 1611 | Line 1553
1553   f_out2=NULL;
1554   //analyze outsuffix intent
1555   GStr pocmd;
1556 < GStr ox=getFext(outsuffix);
1557 < if (ox.length()>2) ox=ox.substr(0,2);
1558 < if (ox=="gz") pocmd="gzip -9 -c ";
1559 <   else if (ox=="bz") pocmd="bzip2 -9 -c ";
1556 > if (outsuffix=="-") {
1557 >    f_out=stdout;
1558 >    }
1559 >   else {
1560 >    GStr ox=getFext(outsuffix);
1561 >    if (ox.length()>2) ox=ox.substr(0,2);
1562 >    if (ox=="gz") pocmd="gzip -9 -c ";
1563 >        else if (ox=="bz") pocmd="bzip2 -9 -c ";
1564 >    }
1565   if (s=="-") {
1566      f_in=stdin;
1567      infname="stdin";
# Line 1624 | Line 1571
1571   s.startTokenize(",:");
1572   s.nextToken(infname);
1573   bool paired=s.nextToken(infname2);
1574 < if (fileExists(infname.chars())==0)
1574 > if (fileExists(infname.chars())==0)
1575      GError("Error: cannot find file %s!\n",infname.chars());
1576   GStr fname(getFileName(infname.chars()));
1577   GStr picmd;
# Line 1638 | Line 1585
1585     f_in=popen(picmd.chars(), "r");
1586     if (f_in==NULL) GError("Error at popen %s!\n", picmd.chars());
1587     }
1588 + if (f_out==stdout) {
1589 +   if (paired) GError("Error: output suffix required for paired reads\n");
1590 +   return;
1591 +   }
1592   f_out=prepOutFile(infname, pocmd);
1593   if (!paired) return;
1594   if (doCollapse) GError("Error: sorry, -C option cannot be used with paired reads!\n");
1595   // ---- paired reads:-------------
1596 < if (fileExists(infname2.chars())==0)
1596 > if (fileExists(infname2.chars())==0)
1597       GError("Error: cannot find file %s!\n",infname2.chars());
1598   picmd="";
1599   GStr fname2(getFileName(infname2.chars()));

Diff Legend

Removed lines
+ Added lines
< Changed lines
> Changed lines