ViewVC Help
View File | Revision Log | Show Annotations | View Changeset | Root Listing
root/gclib/fqtrim/fqtrim.cpp
(Generate patch)
# Line 3 | Line 3
3   #include "GHash.hh"
4   #include "GList.hh"
5   #include <ctype.h>
6 + #include "GAlnExtend.h"
7 + #include <inttypes.h>
8  
9   #define USAGE "Usage:\n\
10 < fqtrim [{-5 <5adapter> -3 <3adapter>|-f <adapters_file>}] [-a <min_matchlen>]\\\n\
11 <   [-q <minq> [-t <trim_max_len>]] [-p {64|33}] [-o <outsuffix>]\\\n\
10 > fqtrim [{-5 <5adaptor> -3 <3adaptor>|-f <adaptors_file>}] [-a <min_matchlen>]\\\n\
11 >   [-R] [-q <minq> [-t <trim_max_len>]] [-p {64|33}] [-o <outsuffix>]\\\n\
12     [-l <minlen>] [-C] [-D] [-Q] [-n <rename_prefix>] [-r <discarded.lst>]\\\n\
13      <input.fq>[,<input_mates.fq>\n\
14   \n\
15 < Trim low quality bases at the 3' end and can trim adapter sequence(s), filter\n\
15 > Trim low quality bases at the 3' end and can trim adaptor sequence(s), filter\n\
16   for low complexity and collapse duplicate reads.\n\
17   If read pairs should be trimmed and kept together (i.e. without discarding\n\
18   one read in a pair), the two file names should be given delimited by a comma\n\
# Line 24 | Line 26
26      file(s) named <input>.<outsuffix> which will be created in the \n\
27      current (working) directory; (writes to stdout if -o- is given);\n\
28      a suffix ending with .gz, .gzip or .bz2 will enforce compression\n\
29 < -f  file with adapter sequences to trim, each line having this format:\n\
30 <    <5'-adapter-sequence> <3'-adapter-sequence>\n\
31 < -5  trim the given adapter or primer sequence at the 5' end of each read\n\
29 > -f  file with adaptor sequences to trim, each line having this format:\n\
30 >    <5'-adaptor-sequence> <3'-adaptor-sequence>\n\
31 > -5  trim the given adaptor or primer sequence at the 5' end of each read\n\
32      (e.g. -5 CGACAGGTTCAGAGTTCTACAGTCCGACGATC)\n\
33 < -3  trim the given adapter sequence at the 3' end of each read\n\
33 > -3  trim the given adaptor sequence at the 3' end of each read\n\
34      (e.g. -3 TCGTATGCCGTCTTCTGCTTG)\n\
35 < -a  minimum length of exact match to adaptor sequence at the proper end (6)\n\
35 > -A  disable polyA/T trimming (enabled by default)\n\
36 > -y  minimum length of exact match to adaptor sequence at the proper end (6)\n\
37   -q  trim bases with quality value lower than <minq> (starting at the 3' end)\n\
38   -t  for -q option, maximum trimming at the 3' end is limited to <trim_max_len>\n\
39   -m  maximum percentage of Ns allowed in a read after trimming (default 7)\n\
# Line 39 | Line 42
42   -r  write a simple \"trash report\" file listing the discarded reads with a\n\
43      one letter code for the reason why they were trashed\n\
44   -D  apply a low-complexity (dust) filter and discard any read that has over\n\
45 <    50% of its length detected as low complexity\n\
45 >    50%% of its length detected as low complexity\n\
46   -C  collapse duplicate reads and add a prefix with their count to the read\n\
47      name (e.g. for microRNAs)\n\
48   -p  input is phred64/phred33 (use -p64 or -p33)\n\
49   -Q  convert quality values to the other Phred qv type\n\
50 < -V  verbose processing\n\
50 > -V  be verbose when done (print summary counts)\n\
51   "
52 <
52 >
53   //-z  for -o option, the output stream(s) will be first piped into the given\n
54   //   <zcmd> command, which must output to stdout (e.g. -z 'bzip2 -9 -c')\n
55  
56 <
57 < // example 3' adapter for miRNAs: TCGTATGCCGTCTTCTGCTTG
56 >
57 > // example 3' adaptor for miRNAs: TCGTATGCCGTCTTCTGCTTG
58  
59   //For paired reads sequencing:
60   //3' : ACACTCTTTCCCTACACGACGCTCTTCCGATCT
# Line 60 | Line 63
63   //FILE* f_out2=NULL; //for paired reads
64   //FILE* f_in=NULL; //input fastq (stdin if not provided)
65   //FILE* f_in2=NULL; //for paired reads
66 +
67   FILE* freport=NULL;
68  
69   bool debug=false;
70   bool verbose=false;
71   bool doCollapse=false;
72   bool doDust=false;
73 + bool doPolyTrim=true;
74   bool fastaOutput=false;
75   bool trashReport=false;
76 < //bool rawFormat=false;
76 > bool revCompl=false; //also reverse complement adaptor sequences
77   int min_read_len=16;
78   double max_perc_N=7.0;
79   int dust_cutoff=16;
80   bool isfasta=false;
81   bool convert_phred=false;
82 < GStr outsuffix; // -o
78 < //GStr adapter3;
79 < //GStr adapter5;
82 > GStr outsuffix; // -o
83   GStr prefix;
84   GStr zcmd;
85 < int num_trimmed5=0;
86 < int num_trimmed3=0;
87 < int num_trimmedN=0;
88 < int num_trimmedQ=0;
89 < int min_trimmed5=INT_MAX;
90 < int min_trimmed3=INT_MAX;
85 > char isACGT[256];
86 >
87 > uint num_trimN=0; //reads trimmed by N%
88 > uint num_trimQ=0; //reads trimmed by qv threshold
89 > uint num_trimV=0; //reads trimmed by adapter match
90 > uint num_trimA=0; //reads trimmed by polyA
91 > uint num_trimT=0; //reads trimmed by polyT
92 > uint num_trim5=0; //number of reads trimmed at 5' end
93 > uint num_trim3=0; //number of reads trimmed at 3' end
94 >
95 > uint64 b_totalIn=0; //total number of input bases
96 > uint64 b_totalN=0;  //total number of undetermined bases found in input bases
97 >
98 > uint64 b_trimN=0; //total number of bases trimmed due to N-trim
99 > uint64 b_trimQ=0; //number of bases trimmed due to qv threshold
100 > uint64 b_trimV=0; //total number of bases trimmed due to adaptor matches
101 > uint64 b_trimA=0; //number of bases trimmed due to poly-A tails
102 > uint64 b_trimT=0; //number of bases trimmed due to poly-T tails
103 > uint64 b_trim5=0; //total bases trimmed on the 5' side
104 > uint64 b_trim3=0; //total bases trimmed on the 3' side
105 > //int min_trimmed5=INT_MAX;
106 > //int min_trimmed3=INT_MAX;
107  
108   int qvtrim_qmin=0;
109   int qvtrim_max=0;  //for -q, do not trim at 3'-end more than this number of bases
110   int qv_phredtype=0; // could be 64 or 33 (0 means undetermined yet)
111   int qv_cvtadd=0; //could be -31 or +31
112  
113 < int a3len=0;
114 < int a5len=0;
115 < // adaptor matching metrics -- for extendMatch() function
116 < const int match_score=2; //match score
117 < const int mismatch_score=-3; //mismatch
118 <
119 < const int a_m_score=2; //match score
120 < const int a_mis_score=-3; //mismatch
121 < const int a_dropoff_score=7;
113 > // adaptor matching metrics -- for X-drop ungapped extension
114 > //const int match_reward=2;
115 > //const int mismatch_penalty=3;
116 > const int match_reward=2;
117 > const int mismatch_penalty=4;
118 > const int Xdrop=10;
119 >
120 > const int poly_m_score=2; //match score
121 > const int poly_mis_score=-3; //mismatch
122   const int poly_dropoff_score=7;
123 < int a_min_score=12; //an exact match of 6 bases at the proper ends WILL be trimmed
105 < const int a_min_chain_score=15; //for gapped alignments
123 > int poly_minScore=12; //i.e. an exact match of 6 bases at the proper ends WILL be trimmed
124  
125 < class CSegChain;
125 > const char *polyA_seed="AAAA";
126 > const char *polyT_seed="TTTT";
127  
128 < class CSegPair {
129 <  public:
130 <   GSeg a;
131 <   GSeg b; //the adapter segment
132 <   int score;
133 <   int flags;
134 <   CSegChain* chain;
135 <   CSegPair(int astart=0, int aend=0, int bstart=0, int bend=0, int mscore=0):a(astart,aend),b(bstart, bend) {
136 <      score=mscore;
137 <      if (score==0) score=a.len()*a_m_score;
138 <      flags=0;
139 <      chain=NULL;
140 <      }
141 <   int len() { return  a.len(); }
142 <   bool operator==(CSegPair& d){
143 <      //return (a.start==d.a.start && a.end==d.a.end && b.start==d.b.start && b.end==d.b.end);
125 <      //make equal even segments that are included into one another:
126 <      return (d.a.start>=a.start && d.a.end<=a.end && d.b.start>=b.start && d.b.end<=b.end);
127 <      }
128 <   bool operator>(CSegPair& d){ //ordering based on b (adaptor) start coord and score
129 <     if (b.start==d.b.start) {
130 <        if (score==d.score) {
131 <           //just try to be consistent:
132 <           if (b.end==d.b.end) {
133 <             return (a.start==d.a.start)?(a.end<d.a.end):(a.start<d.a.start);
134 <             }
135 <           return (b.end>d.b.end);
136 <           }
137 <         else return (score<d.score);
138 <        }
139 <     return (b.start>d.b.start);
140 <     }
141 <   bool operator<(CSegPair& d){ //ordering based on b (adaptor) coord
142 <     /*if (b.start==d.b.start && b.end==d.b.end) {
143 <          return (a.start==d.a.start)?(a.end<d.a.end):(a.start<d.a.start);
144 <          }
145 <     return (b.start==d.b.start)?(b.end<d.b.end):(b.start<d.b.start);*/
146 <     if (b.start==d.b.start) {
147 <        if (score==d.score) {
148 <           //just try to be consistent:
149 <           if (b.end==d.b.end) {
150 <             return (a.start==d.a.start)?(a.end>d.a.end):(a.start>d.a.start);
151 <             }
152 <           return (b.end<d.b.end);
153 <           }
154 <         else return (score>d.score);
128 > struct CASeqData {
129 >   //positional data for every possible hexamer in an adaptor
130 >   GVec<uint16>* pz[4096]; //0-based coordinates of all possible hexamers in the adaptor sequence
131 >   GVec<uint16>* pzr[4096]; //0-based coordinates of all possible hexamers for the reverse complement of the adaptor sequence
132 >   GStr seq; //actual adaptor sequence data
133 >   GStr seqr; //reverse complement sequence
134 >   int amlen; //fraction of adaptor length matching that's
135 >              //enough to consider the alignment
136 >   GAlnTrimType trim_type;
137 >   bool use_reverse;
138 >   CASeqData(bool rev=false):seq(),seqr(),
139 >             amlen(0), use_reverse(rev) {
140 >     trim_type=galn_None; //should be updated later!
141 >     for (int i=0;i<4096;i++) {
142 >        pz[i]=NULL;
143 >        pzr[i]=NULL;
144          }
156     return (b.start<d.b.start);
145       }
158 };
146  
147 < int cmpSegEnds(pointer sa, pointer sb) { //sort by adaptor seg ends AND score
148 < CSegPair& x = *(CSegPair *)sa;
149 < CSegPair& y = *(CSegPair *)sb;
150 < /*
151 < if (x.b.end==y.b.end) {
152 <     if (x.b.start==y.b.start) {
153 <         if (x.a.end==y.a.end) {
154 <            if (x.a.start==y.a.start) return 0;
155 <            return ((x.a.start>y.a.start) ? -1 : 1);
156 <            }
157 <          else {
158 <            return ((x.a.end>y.a.end) ? -1 : 1);
159 <            }
160 <          }
161 <      else {
162 <       return ((x.b.start>y.b.start) ? -1 : 1);
163 <       }
164 <     }
165 <    else {
179 <     return ((x.b.end>y.b.end) ? -1 : 1);
180 <     }
181 < */
182 <  if (x.b.end==y.b.end) {
183 <     if (x.score==y.score) {
184 <     if (x.b.start==y.b.start) {
185 <         if (x.a.end==y.a.end) {
186 <            if (x.a.start==y.a.start) return 0;
187 <            return ((x.a.start<y.a.start) ? -1 : 1);
188 <            }
189 <          else {
190 <            return ((x.a.end<y.a.end) ? -1 : 1);
191 <            }
192 <          }
193 <      else {
194 <       return ((x.b.start<y.b.start) ? -1 : 1);
195 <       }
196 <      } else return ((x.score>y.score) ? -1 : 1);
197 <     }
198 <    else {
199 <     return ((x.b.end>y.b.end) ? -1 : 1);
147 >   void update(const char* s) {
148 >         seq=s;
149 >         table6mers(seq.chars(), seq.length(), pz);
150 >         amlen=iround(double(seq.length())*0.8);
151 >         if (amlen<12)
152 >                amlen=12;
153 >         if (!use_reverse) return;
154 >         //reverse complement
155 >         seqr=s;
156 >         int slen=seq.length();
157 >         for (int i=0;i<slen;i++)
158 >           seqr[i]=ntComplement(seq[slen-i-1]);
159 >         table6mers(seqr.chars(), seqr.length(), pzr);
160 >   }
161 >
162 >   ~CASeqData() {
163 >     for (int i=0;i<4096;i++) {
164 >       delete pz[i];
165 >       delete pzr[i];
166       }
167 +   }
168 + };
169  
170 < }
170 > GVec<CASeqData> adaptors5;
171 > GVec<CASeqData> adaptors3;
172  
173 < class CSegChain:public GList<CSegPair> {
174 < public:
206 <   uint astart;
207 <   uint aend;
208 <   uint bstart;
209 <   uint bend;
210 <   int score;
211 <   bool endSort;
212 <  CSegChain(bool aln5=false):GList<CSegPair>(true,true,true) {//sorted, free elements, unique
213 <   //as SegPairs are inserted, they will be sorted by a.start coordinate
214 <   score=0;
215 <   astart=MAX_UINT;
216 <   aend=0;
217 <   bstart=MAX_UINT;
218 <   bend=0;
219 <   endSort=aln5;
220 <   if (aln5) { setSorted(cmpSegEnds); }
221 <   }
222 < bool operator==(CSegChain& d) {
223 <   //return (score==d.score);
224 <    return (astart==d.astart && aend==d.aend && bstart==d.bstart && bend==d.bend);
225 <   }
226 < bool operator>(CSegChain& d) { // order based on b (adaptor) coordinate
227 <   //return (score<d.score);
228 <   if (bstart==d.bstart && bend==d.bend) {
229 <          return (astart==d.astart)?(aend>d.aend):(astart>d.astart);
230 <          }
231 <     return (bstart==d.bstart)?(bend>d.bend):(bstart>d.bstart);
232 <   }
233 < bool operator<(CSegChain& d) {
234 <   //return (score>d.score);
235 <   if (bstart==d.bstart && bend==d.bend) {
236 <          return (astart==d.astart)?(aend<d.aend):(astart<d.astart);
237 <          }
238 <     return (bstart==d.bstart)?(bend<d.bend):(bstart<d.bstart);
239 <   }
240 < void addSegPair(CSegPair* segp) {
241 <   if (AddIfNew(segp)!=segp) return;
242 <   score+=segp->score;
243 <   if (astart>segp->a.start) astart=segp->a.start;
244 <   if (aend<segp->a.end) aend=segp->a.end;
245 <   if (bstart>segp->b.start) bstart=segp->b.start;
246 <   if (bend<segp->b.end) bend=segp->b.end;
247 <   }
248 < //for building actual chains:
249 < bool extendChain(CSegPair* segp) { //segp expected to be "Greater Than" current chain
250 <   int bgap=0;
251 <   int agap=0;
252 <   //if (endSort) {
253 <   if (bstart>segp->b.start) {
254 <      bgap = (int)(bstart-segp->b.end);
255 <      if (abs(bgap)>2) return false;
256 <      agap = (int)(astart-segp->a.end);
257 <      if (abs(agap)>2) return false;
258 <      }
259 <     else {
260 <      bgap = (int) (segp->b.start-bend);
261 <      if (abs(bgap)>2) return false;
262 <      agap = (int)(segp->a.start-aend);
263 <      if (abs(agap)>2) return false;
264 <      }
265 <   if (agap*bgap<0) return false;
266 <   addSegPair(segp);
267 <   score-=abs(agap)+abs(bgap);
268 <   return true;
269 <   }
270 < };
173 > CGreedyAlignData* gxmem_l=NULL;
174 > CGreedyAlignData* gxmem_r=NULL;
175  
176   // element in dhash:
177   class FqDupRec {
# Line 317 | Line 221
221  
222   GHash<FqDupRec> dhash; //hash to keep track of duplicates
223  
224 < void setupFiles(FILE*& f_in, FILE*& f_in2, FILE*& f_out, FILE*& f_out2,
225 <                       GStr& s, GStr& infname, GStr& infname2);
224 > void addAdaptor(GVec<CASeqData>& adaptors, GStr& seq, GAlnTrimType trim_type);
225 > int loadAdaptors(const char* fname);
226 >
227 > void setupFiles(FILE*& f_in, FILE*& f_in2, FILE*& f_out, FILE*& f_out2,
228 >                       GStr& s, GStr& infname, GStr& infname2);
229   // uses outsuffix to generate output file names and open file handles as needed
230 <
230 >
231   void writeRead(FILE* f_out, GStr& rname, GStr& rinfo, GStr& rseq, GStr& rqv, int& outcounter);
232   void trash_report(char trashcode, GStr& rname, FILE* freport);
233  
234 < bool getFastxRec(GLineReader& fq, GStr& rseq, GStr& rqv,
234 > bool getFastxRec(GLineReader& fq, GStr& rseq, GStr& rqv,
235            GStr& rname, GStr& rinfo, GStr& infname);
236  
237 < char process_read(GStr& rname, GStr& rseq, GStr& rqv, int &l5, int &l3);
237 > char process_read(GStr& rname, GStr& rseq, GStr& rqv, int &l5, int &l3);
238   //returns 0 if the read was untouched, 1 if it was trimmed and a trash code if it was trashed
239  
240   bool ntrim(GStr& rseq, int &l5, int &l3); //returns true if any trimming occured
334 bool polytrim(GStr& rseq, int &l5, int &l3); //returns true if any trimming occured
241   bool qtrim(GStr& qvs, int &l5, int &l3); //return true if any trimming occured
242   int dust(GStr& seq);
243 < bool trim_adapter3(GStr& seq, int &l5, int &l3); //returns true if any trimming occured
244 < bool trim_adapter5(GStr& seq, int &l5, int &l3); //returns true if any trimming occured
243 > bool trim_poly5(GStr &seq, int &l5, int &l3, const char* poly_seed); //returns true if any trimming occured
244 > bool trim_poly3(GStr &seq, int &l5, int &l3, const char* poly_seed);
245 > bool trim_adaptor5(GStr& seq, int &l5, int &l3); //returns true if any trimming occured
246 > bool trim_adaptor3(GStr& seq, int &l5, int &l3);
247  
248   void convertPhred(char* q, int len);
249   void convertPhred(GStr& q);
250  
251   int main(int argc, char * const argv[]) {
252 <  GArgs args(argc, argv, "YQDCVl:d:3:5:m:n:r:p:q:f:t:o:z:a:");
252 >  GArgs args(argc, argv, "YQDCRVAl:d:3:5:m:n:r:p:q:f:t:o:z:a:");
253    int e;
254    if ((e=args.isError())>0) {
255        GMessage("%s\nInvalid argument: %s\n", USAGE, argv[e]);
# Line 352 | Line 260
260    convert_phred=(args.getOpt('Q')!=NULL);
261    doCollapse=(args.getOpt('C')!=NULL);
262    doDust=(args.getOpt('D')!=NULL);
263 +  revCompl=(args.getOpt('R')!=NULL);
264 +  if (args.getOpt('A')) doPolyTrim=false;
265    /*
266    rawFormat=(args.getOpt('R')!=NULL);
267    if (rawFormat) {
# Line 360 | Line 270
270    */
271    prefix=args.getOpt('n');
272    GStr s=args.getOpt('l');
273 <  if (!s.is_empty())
273 >  if (!s.is_empty())
274       min_read_len=s.asInt();
275    s=args.getOpt('m');
276 <  if (!s.is_empty())
276 >  if (!s.is_empty())
277       max_perc_N=s.asDouble();
278    s=args.getOpt('d');
279    if (!s.is_empty()) {
# Line 389 | Line 299
299          qv_phredtype=64;
300          qv_cvtadd=-31;
301          }
302 <       else
302 >       else
303           GMessage("%s\nInvalid value for -p option (can only be 64 or 33)!\n",USAGE);
304       }
305 <  if (args.getOpt('3')!=NULL) {
306 <    adapter3=args.getOpt('3');
307 <    adapter3.upper();
308 <    a3len=adapter3.length();
309 <    }
310 <  if (args.getOpt('5')!=NULL) {
311 <    adapter5=args.getOpt('5');
312 <    adapter5.upper();
313 <    a5len=adapter5.length();
305 >  memset((void*)isACGT, 0, 256);
306 >  isACGT['A']=isACGT['a']=isACGT['C']=isACGT['c']=1;
307 >  isACGT['G']=isACGT['g']=isACGT['T']=isACGT['t']=1;
308 >  s=args.getOpt('f');
309 >  if (!s.is_empty()) {
310 >   loadAdaptors(s.chars());
311 >   }
312 >  bool fileAdaptors=adaptors5.Count()+adaptors3.Count();
313 >  s=args.getOpt('5');
314 >  if (!s.is_empty()) {
315 >    if (fileAdaptors)
316 >      GError("Error: options -5 and -f cannot be used together!\n");
317 >    s.upper();
318 >    addAdaptor(adaptors5, s, galn_TrimLeft);
319 >    }
320 >  s=args.getOpt('3');
321 >  if (!s.is_empty()) {
322 >    if (fileAdaptors)
323 >      GError("Error: options -3 and -f cannot be used together!\n");
324 >      s.upper();
325 >      addAdaptor(adaptors3, s, galn_TrimRight);
326      }
327 <  s=args.getOpt('a');
327 >  s=args.getOpt('y');
328    if (!s.is_empty()) {
329 <     int a_minmatch=s.asInt();
330 <     a_min_score=a_minmatch<<1;
329 >     int minmatch=s.asInt();
330 >     poly_minScore=minmatch*poly_m_score;
331       }
332 <  
332 >
333    if (args.getOpt('o')!=NULL) outsuffix=args.getOpt('o');
334                           else outsuffix="-";
335    trashReport=  (args.getOpt('r')!=NULL);
# Line 424 | Line 346
346    if (trashReport)
347      openfw(freport, args, 'r');
348    char* infile=NULL;
349 +
350 +  if (adaptors5.Count()>0)
351 +    //gxmem_l=new CGreedyAlignData(match_reward, mismatch_penalty, Xdrop-2);
352 +        gxmem_l=new CGreedyAlignData(match_reward, mismatch_penalty, Xdrop);
353 +  if (adaptors3.Count()>0)
354 +    gxmem_r=new CGreedyAlignData(match_reward, mismatch_penalty, Xdrop);
355 +
356    while ((infile=args.nextNonOpt())!=NULL) {
357 +    //for each input file
358      int incounter=0; //counter for input reads
359      int outcounter=0; //counter for output reads
360      int trash_s=0; //too short from the get go
361      int trash_Q=0;
362      int trash_N=0;
363      int trash_D=0;
364 <    int trash_A3=0;
365 <    int trash_A5=0;
364 >    int trash_poly=0;
365 >    int trash_V=0;
366 >    num_trimN=0;
367 >    num_trimQ=0;
368 >    num_trimV=0;
369 >    num_trimA=0;
370 >    num_trimT=0;
371 >    num_trim5=0;
372 >    num_trim3=0;
373 >
374 >    b_totalIn=0;
375 >    b_totalN=0;
376 >    b_trimN=0;
377 >    b_trimQ=0;
378 >    b_trimV=0;
379 >    b_trimA=0;
380 >    b_trimT=0;
381 >    b_trim5=0;
382 >    b_trim3=0;
383 >
384      s=infile;
385      GStr infname;
386      GStr infname2;
# Line 455 | Line 403
403         int a5=0, a3=0, b5=0, b3=0;
404         char tcode=0, tcode2=0;
405         tcode=process_read(seqid, rseq, rqv, a5, a3);
406 <       //if (!doCollapse) {
407 <         if (fq2!=NULL) {
406 >       if (a5>0) {
407 >          b_trim5+=a5;
408 >          num_trim5++;
409 >          }
410 >       if (a3<rseq.length()-1) {
411 >          b_trim3+=rseq.length()-a3-1;
412 >          num_trim3++;
413 >          }
414 >       if (fq2!=NULL) {
415              getFastxRec(*fq2, rseq2, rqv2, seqid2, seqinfo2, infname2);
416              if (seqid.substr(0,seqid.length()-1)!=seqid2.substr(0,seqid2.length()-1)) {
417                 GError("Error: no paired match for read %s vs %s (%s,%s)\n",
418                    seqid.chars(), seqid2.chars(), infname.chars(), infname2.chars());
419                 }
420              tcode2=process_read(seqid2, rseq2, rqv2, b5, b3);
421 +            if (b5>0) {
422 +               b_trim5+=b5;
423 +               num_trim5++;
424 +               }
425 +            if (b3<rseq2.length()-1) {
426 +               b_trim3+=rseq2.length()-b3-1;
427 +               num_trim3++;
428 +               }
429              //decide what to do with this pair and print rseq2 if the pair makes it
430              if (tcode>1 && tcode2<=1) {
431                 //"untrash" rseq
# Line 488 | Line 451
451                 int nocounter=0;
452                 writeRead(f_out2, seqid2, seqinfo2, rseq2, rqv2, nocounter);
453                 }
454 <            } //paired read
492 <       // }
454 >            } //pair read
455         if (tcode>1) { //trashed
456 +         #ifdef GDEBUG
457 +         GMessage(" !!!!TRASH code = %c\n",tcode);
458 +         #endif
459            if (tcode=='s') trash_s++;
460 +          else if (tcode=='A' || tcode=='T') trash_poly++;
461              else if (tcode=='Q') trash_Q++;
462                else if (tcode=='N') trash_N++;
463                 else if (tcode=='D') trash_D++;
464 <                else if (tcode=='3') trash_A3++;
465 <                 else if (tcode=='5') trash_A5++;
464 >                else if (tcode=='V') trash_V++;
465 >                 //else if (tcode=='5') trash_A5++;
466            if (trashReport) trash_report(tcode, seqid, freport);
467            }
468           else if (!doCollapse) { //write it
# Line 504 | Line 470
470              rseq=rseq.substr(a5,a3-a5+1);
471              if (!rqv.is_empty()) rqv=rqv.substr(a5,a3-a5+1);
472              }
473 +         #ifdef GDEBUG
474 +            GMessage("  After trimming:\n");
475 +            GMessage("%s\n",rseq.chars());
476 +         #endif
477            writeRead(f_out, seqid, seqinfo, rseq, rqv, outcounter);
478            }
479         } //for each fastq record
# Line 531 | Line 501
501                 }
502              }
503           outcounter++;
504 <         if (qd->count>maxdup_count) {
504 >         if (qd->count>maxdup_count) {
505              maxdup_count=qd->count;
506              maxdup_seq=seq;
507              }
508           if (isfasta) {
509             if (prefix.is_empty()) {
510 <             fprintf(f_out, ">%s_x%d\n%s\n", qd->firstname, qd->count,
510 >             fprintf(f_out, ">%s_x%d\n%s\n", qd->firstname, qd->count,
511                             rseq.chars());
512               }
513             else { //use custom read name
# Line 548 | Line 518
518           else { //fastq format
519            if (convert_phred) convertPhred(qd->qv, qd->len);
520            if (prefix.is_empty()) {
521 <            fprintf(f_out, "@%s_x%d\n%s\n+\n%s\n", qd->firstname, qd->count,
521 >            fprintf(f_out, "@%s_x%d\n%s\n+\n%s\n", qd->firstname, qd->count,
522                             rseq.chars(), qd->qv);
523              }
524            else { //use custom read name
# Line 564 | Line 534
534      if (verbose) {
535         if (paired_reads) {
536             GMessage(">Input files : %s , %s\n", infname.chars(), infname2.chars());
537 <           GMessage("Number of input pairs :%9d\n", incounter);
538 <           GMessage("         Output pairs :%9d\n", outcounter);
537 >           GMessage("Number of input pairs :%9u\n", incounter);
538 >           GMessage("         Output pairs :%9u\t(%u trashed)\n", outcounter, incounter-outcounter);
539             }
540           else {
541             GMessage(">Input file : %s\n", infname.chars());
542             GMessage("Number of input reads :%9d\n", incounter);
543 <           GMessage("         Output reads :%9d\n", outcounter);
543 >           GMessage("         Output reads :%9d  (%u trashed)\n", outcounter, incounter-outcounter);
544             }
545 <       GMessage("------------------------------------\n");
546 <       if (num_trimmed5)
547 <          GMessage("           5' trimmed :%9d  (min. trim: %d)\n", num_trimmed5, min_trimmed5);
548 <       if (num_trimmed3)
549 <          GMessage("           3' trimmed :%9d  (min. trim: %d)\n", num_trimmed3, min_trimmed3);
545 >       GMessage("--------------- Read trimming: ------------------\n");
546 >       if (num_trim5)
547 >          GMessage("           5' trimmed :%9u\n", num_trim5);
548 >       if (num_trim3)
549 >          GMessage("           3' trimmed :%9u\n", num_trim3);
550 >       if (num_trimQ)
551 >          GMessage("         q.v. trimmed :%9u\n", num_trimQ);
552 >       if (num_trimN)
553 >          GMessage("            N trimmed :%9u\n", num_trimN);
554 >       if (num_trimT)
555 >          GMessage("       poly-T trimmed :%9u\n", num_trimT);
556 >       if (num_trimA)
557 >          GMessage("       poly-A trimmed :%9u\n", num_trimA);
558 >       if (num_trimV)
559 >          GMessage("      Adapter trimmed :%9u\n", num_trimV);
560         GMessage("------------------------------------\n");
561         if (trash_s>0)
562 <         GMessage("     Trashed by length:%9d\n", trash_s);
562 >         GMessage("Trashed by initial len:%9d\n", trash_s);
563         if (trash_N>0)
564           GMessage("         Trashed by N%%:%9d\n", trash_N);
565         if (trash_Q>0)
566           GMessage("Trashed by low quality:%9d\n", trash_Q);
567 <       if (trash_A5>0)
568 <         GMessage(" Trashed by 5' adapter:%9d\n", trash_A5);
569 <       if (trash_A3>0)
570 <         GMessage(" Trashed by 3' adapter:%9d\n", trash_A3);
567 >       if (trash_poly>0)
568 >         GMessage("   Trashed by poly-A/T:%9d\n", trash_poly);
569 >       if (trash_V>0)
570 >         GMessage("    Trashed by adaptor:%9d\n", trash_V);
571 >       GMessage("\n--------------- Base counts  --------------------\n");
572 >       GMessage("    Input bases   :%12llu\n", b_totalIn);
573 >       double percN=100.0* ((double)b_totalN/(double)b_totalIn);
574 >       GMessage("          N bases :%12llu (%4.2f%%)\n", b_totalN, percN);
575 >       GMessage("   trimmed from 5':%12llu\n", b_trim5);
576 >       GMessage("   trimmed from 3':%12llu\n", b_trim3);
577 >       GMessage("\n");
578 >       if (b_trimQ)
579 >       GMessage("     q.v. trimmed :%12llu\n", b_trimQ);
580 >       if (b_trimN)
581 >       GMessage("        N trimmed :%12llu\n", b_trimN);
582 >       if (b_trimT)
583 >       GMessage("   poly-T trimmed :%12llu\n", b_trimT);
584 >       if (b_trimA)
585 >       GMessage("   poly-A trimmed :%12llu\n", b_trimA);
586 >       if (b_trimV)
587 >       GMessage("  Adapter trimmed :%12llu\n", b_trimV);
588 >
589         }
590      if (trashReport) {
591            FWCLOSE(freport);
# Line 595 | Line 593
593      FWCLOSE(f_out);
594      FWCLOSE(f_out2);
595     } //while each input file
596 <
596 > delete gxmem_l;
597 > delete gxmem_r;
598   //getc(stdin);
599   }
600  
# Line 610 | Line 609
609     const char* seq;
610     bool valid;
611     NData() {
612 +    seqlen=0;
613      NCount=0;
614      end5=0;
615      end3=0;
# Line 640 | Line 640
640       perc_N=(n*100.0)/(end5-end3+1);
641       }
642   };
643 <
643 >
644   static NData feat;
645   int perc_lenN=12; // incremental distance from ends, in percentage of
646            // sequence length, where N-trimming is done (default:12 %) (autolimited to 20)
647 <          
647 >
648   void N_analyze(int l5, int l3, int p5, int p3) {
649   /* assumes feat was filled properly */
650   int old_dif, t5,t3,v;
651   if (l3<l5+2 || p5>p3 ) {
652     feat.end5=l5+1;
653     feat.end3=l3+1;
654 <   return;
654 >   return;
655     }
656  
657   t5=feat.NPos[p5]-l5;
658   t3=l3-feat.NPos[p3];
659   old_dif=p3-p5;
660   v=(int)((((double)(l3-l5))*perc_lenN)/100);
661 < if (v>20) v=20; /* enforce N-search limit for very long reads */
661 > if (v>20) v=20; /* enforce N-search limit for very long reads */
662   if (t5 < v ) {
663     l5=feat.NPos[p5]+1;
664     p5++;
# Line 675 | Line 675
675             feat.end3=l3+1;
676             return;
677             }
678 <    else
678 >    else
679        N_analyze(l5,l3, p5,p3);
680   }
681  
# Line 716 | Line 716
716   feat.init(rseq);
717   l5=feat.end5-1;
718   l3=feat.end3-1;
719 < N_analyze(feat.end5-1, feat.end3-1, 0, feat.NCount-1);
719 > N_analyze(feat.end5-1, feat.end3-1, 0, feat.NCount-1);
720   if (l5==feat.end5-1 && l3==feat.end3-1) {
721      if (feat.perc_N>max_perc_N) {
722             feat.valid=false;
# Line 734 | Line 734
734     return true;
735     }
736   feat.N_calc();
737 <
737 >
738   if (feat.perc_N>max_perc_N) {
739        feat.valid=false;
740        l3=l5+1;
# Line 746 | Line 746
746   //--------------- dust functions ----------------
747   class DNADuster {
748   public:
749 <  int dustword;
750 <  int dustwindow;
751 <  int dustwindow2;
749 >  int dustword;
750 >  int dustwindow;
751 >  int dustwindow2;
752    int dustcutoff;
753    int mv, iv, jv;
754    int counts[32*32*32];
# Line 843 | Line 843
843                      }
844             }
845           }
846 < //return first;
846 > //return first;
847   }
848   };
849  
# Line 861 | Line 861
861   return ncount;
862   }
863  
864 + struct SLocScore {
865 +  int pos;
866 +  int score;
867 +  SLocScore(int p=0,int s=0) {
868 +    pos=p;
869 +    score=s;
870 +    }
871 +  void set(int p, int s) {
872 +    pos=p;
873 +    score=s;
874 +    }
875 +  void add(int p, int add) {
876 +    pos=p;
877 +    score+=add;
878 +    }
879 + };
880  
881 < // ------------------ adapter matching - simple k-mer seed & extend, no indels for now
882 < //when a k-mer match is found, simply try to extend the alignment using a drop-off scheme
883 < //check minimum score and
884 < //for 3' adapter trimming:
885 < //     require that the right end of the alignment for either the adaptor OR the read must be
886 < //     < 3 distance from its right end
887 < // for 5' adapter trimming:
888 < //     require that the left end of the alignment for either the adaptor OR the read must
889 < //     be at coordinate < 3 from start
890 <
891 < bool extendMatch(const char* a, int alen, int ai,
892 <                 const char* b, int blen, int bi, int mlen, int& l5, int& l3, CSegChain& segs, bool end5=false) {
893 < //so the alignment starts at ai in a, bi in b, with a perfect match of length mlen
894 < #ifdef DEBUG
895 < GStr dbg(b);
880 < #endif
881 < //if (debug) {
882 < //  GMessage(">> in %s\n\textending hit: %s at position %d\n", a, (dbg.substr(bi, mlen)).chars(), ai);
883 < //  }
884 < int a_l=ai; //alignment coordinates on a
885 < int a_r=ai+mlen-1;
886 < int b_l=bi; //alignment coordinates on b
887 < int b_r=bi+mlen-1;
888 < int ai_maxscore=ai;
889 < int bi_maxscore=bi;
890 < int score=mlen*a_m_score;
891 < int maxscore=score;
892 < int mism5score=a_mis_score;
893 < if (end5 && ai<(alen>>1)) mism5score-=2; // increase penalty for mismatches at 5' end
894 < //try to extend to the left first, if possible
895 < while (ai>0 && bi>0) {
896 <   ai--;
897 <   bi--;
898 <   score+= (a[ai]==b[bi])? a_m_score : mism5score;
899 <   if (score>maxscore) {
900 <       ai_maxscore=ai;
901 <       bi_maxscore=bi;
902 <       maxscore=score;
903 <       }
904 <     else if (maxscore-score>a_dropoff_score) break;
905 <   }
906 < a_l=ai_maxscore;
907 < b_l=bi_maxscore;
908 < //if (debug) GMessage("  after l-extend: %*s%s\t\t(score=%d)\n",a_l," ",dbg.substr(b_l,b_r-b_l+1).chars(),maxscore);
909 < //now extend to the right
910 < ai_maxscore=a_r;
911 < bi_maxscore=b_r;
912 < ai=a_r;
913 < bi=b_r;
914 < score=maxscore;
915 < //sometimes there are extra AAAAs at the end of the read, ignore those
916 < if (strcmp(&a[alen-4],"AAAA")==0) {
917 <    alen-=3;
918 <    while (a[alen-1]=='A' && alen>ai) alen--;
919 <    }
920 < while (ai<alen-1 && bi<blen-1) {
921 <   ai++;
922 <   bi++;
923 <   //score+= (a[ai]==b[bi])? a_m_score : a_mis_score;
924 <   if (a[ai]==b[bi]) { //match
925 <      score+=a_m_score;
926 <      if (ai>=alen-2) {
927 <           score+=a_m_score-(alen-ai-1);
928 <           }
881 > bool trim_poly3(GStr &seq, int &l5, int &l3, const char* poly_seed) {
882 > if (!doPolyTrim) return false;
883 > int rlen=seq.length();
884 > l5=0;
885 > l3=rlen-1;
886 > int32 seedVal=*(int32*)poly_seed;
887 > char polyChar=poly_seed[0];
888 > //assumes N trimming was already done
889 > //so a poly match should be very close to the end of the read
890 > // -- find the initial match (seed)
891 > int lmin=GMAX((rlen-16), 0);
892 > int li;
893 > for (li=rlen-4;li>lmin;li--) {
894 >   if (seedVal==*(int*)&(seq[li])) {
895 >      break;
896        }
930    else { //mismatch
931      score+=a_mis_score;
932      }  
933   if (score>maxscore) {
934       ai_maxscore=ai;
935       bi_maxscore=bi;
936       maxscore=score;
937       }
938     else if (maxscore-score>a_dropoff_score) break;
897     }
898 <  a_r=ai_maxscore;
899 <  b_r=bi_maxscore;
900 <  int a_ovh3=alen-a_r-1;
901 <  int b_ovh3=blen-b_r-1;
902 <  int mmovh3=(a_ovh3<b_ovh3)? a_ovh3 : b_ovh3;
903 <  int mmovh5=(a_l<b_l)? a_l : b_l;
904 <  //if (debug) GMessage("  after r-extend: %*s%s\t\t(score=%d)\n",a_l," ",dbg.substr(b_l,b_r-b_l+1).chars(),maxscore);
905 < #ifdef DEBUG
906 <  if (debug) GMessage("     extended to: %*s\n",a_r+1,dbg.substr(b_l,b_r-b_l+1).chars());
907 < #endif
908 <  if (maxscore>=a_min_score && mmovh3<2 && mmovh5<2) {
909 <     if (a_l<a_ovh3) {
910 <        //adapter closer to the left end (typical for 5' adapter)
911 <        l5=a_r+1;
912 <        l3=alen-1;
898 > if (li<=lmin) return false;
899 > //seed found, try to extend it both ways
900 > //extend right
901 > int ri=li+3;
902 > SLocScore loc(ri, poly_m_score<<2);
903 > SLocScore maxloc(loc);
904 > //extend right
905 > while (ri<rlen-1) {
906 >   ri++;
907 >   if (seq[ri]==polyChar) {
908 >                loc.add(ri,poly_m_score);
909 >                }
910 >   else if (seq[ri]=='N') {
911 >                loc.add(ri,0);
912 >                }
913 >   else { //mismatch
914 >        loc.add(ri,poly_mis_score);
915 >        if (maxloc.score-loc.score>poly_dropoff_score) break;
916          }
917 <      else {
918 <        //adapter matching at the right end (typical for 3' adapter)
919 <        l5=0;
959 <        l3=a_l-1;
960 <        }
961 <     return true;
962 <     }
963 < else { //keep this segment pair for later (gapped alignment)
964 <   segs.addSegPair(new CSegPair(a_l+1, a_r+1, b_l+1, b_r+1, maxscore));
965 <   //this will also update min & max coordinates in segs (segs.astart, .aend, .bstart, .bend)
966 <   }
967 <  //do not trim:
968 <  l5=0;
969 <  l3=alen-1;
970 <  return false;
971 < }
972 <
973 < /*
974 < int getWordValue(const char* s, int wlen) {
975 < int r=0;
976 < while (wlen--) { r+=(((int)s[wlen])<<wlen) }
977 < return r;
978 < }
979 < */
980 < int get3mer_value(const char* s) {
981 < return (s[0]<<16)+(s[1]<<8)+s[2];
982 < }
983 <
984 < int w3_match(int qv, const char* str, int slen, int start_index=0) {
985 < if (start_index>=slen || start_index<0) return -1;
986 < for (int i=start_index;i<slen-3;i++) {
987 <   int rv=get3mer_value(str+i);
988 <   if (rv==qv) return i;
989 <   }
990 < return -1;
991 < }
992 <
993 < int w3_rmatch(int qv, const char* str, int slen, int end_index=-1) {
994 < if (end_index>=slen) return -1;
995 < if (end_index<0) end_index=slen-1;
996 < for (int i=end_index-2;i>=0;i--) {
997 <   int rv=get3mer_value(str+i);
998 <   if (rv==qv) return i;
999 <   }
1000 < return -1;
1001 < }
1002 <
1003 < int fast4match(int32 qv, const char* str, int slen, int start_index=0) {
1004 < if (start_index>=slen || start_index<0) return -1;
1005 < for (int i=start_index;i<slen-4;i++) {
1006 <   int32* rv=(int32*)(str+i);
1007 <   if (*rv==qv) return i;
1008 <   }
1009 < return -1;
1010 < }
1011 <
1012 < int fast4rmatch(int32 qv, const char* str, int slen, int end_index=-1) {
1013 < if (end_index>=slen) return -1;
1014 < if (end_index<0) end_index=slen-1;
1015 < for (int i=end_index-3;i>=0;i--) {
1016 <   int32* rv=(int32*)(str+i);
1017 <   if (*rv==qv) return i;
1018 <   }
1019 < return -1;
1020 < }
1021 <
1022 < #ifdef DEBUG
1023 < void dbgPrintChain(CSegChain& chain, const char* aseq) {
1024 <  GStr s(aseq);
1025 <  for (int i=0;i<chain.Count();i++) {
1026 <   CSegPair& seg=*chain[i];
1027 <   GMessage("  dbg chain seg%d: %*s [%d-%d:%d-%d]\n",i,seg.a.start-1+seg.len(),
1028 <            s.substr(seg.b.start-1, seg.len()).chars(), seg.b.start,seg.b.end,seg.a.start,seg.a.end);
917 >   if (maxloc.score<=loc.score) {
918 >      maxloc=loc;
919 >      }
920     }
921 + ri=maxloc.pos;
922 + if (ri<rlen-6) return false; //no trimming wanted, too far from 3' end
923 + //ri = right boundary for the poly match
924 + //extend left
925 + loc.set(li, maxloc.score);
926 + maxloc.pos=li;
927 + while (li>0) {
928 +    li--;
929 +    if (seq[li]==polyChar) {
930 +                 loc.add(li,poly_m_score);
931 +                 }
932 +    else if (seq[li]=='N') {
933 +                 loc.add(li,0);
934 +                 }
935 +    else { //mismatch
936 +         loc.add(li,poly_mis_score);
937 +         if (maxloc.score-loc.score>poly_dropoff_score) break;
938 +         }
939 +    if (maxloc.score<=loc.score) {
940 +       maxloc=loc;
941 +       }
942 +    }
943 + li=maxloc.pos;
944 + if ((maxloc.score==poly_minScore && ri==rlen-1) ||
945 +    (maxloc.score>poly_minScore && ri>=rlen-3) ||
946 +    (maxloc.score>(poly_minScore*3) && ri>=rlen-8)) {
947 +  //trimming this li-ri match at 3' end
948 +    l3=li-1;
949 +    if (l3<0) l3=0;
950 +    return true;
951 +    }
952 + return false;
953   }
1031 #endif
1032 const char *pA_seed="AAAA";
1033 const char *pT_seed="TTTT";
1034 const int polyA_seed=*(int*)pA_seed;
1035 const int polyT_seed=*(int*)pT_seed;
954  
955 < bool trim_poly3(GStr &seq, int &l5, int &l3) {
955 > bool trim_poly5(GStr &seq, int &l5, int &l3, const char* poly_seed) {
956 > if (!doPolyTrim) return false;
957   int rlen=seq.length();
958   l5=0;
959   l3=rlen-1;
960 + int32 seedVal=*(int32*)poly_seed;
961 + char polyChar=poly_seed[0];
962   //assumes N trimming was already done
963   //so a poly match should be very close to the end of the read
1043 //TODO: should we attempt polyA and polyT at the same time, same end?
964   // -- find the initial match (seed)
965 < int rlo=GMAX((rlen-10), 0);
966 < int si;
967 < for (si=rlen-5;si>rlo;si--) {
968 <   if (polyA_seed==*(int*)&(seq.chars()+si)) {
965 > int lmax=GMIN(12, rlen-4);//how far from 5' end to look for 4-mer seeds
966 > int li;
967 > for (li=0;li<=lmax;li++) {
968 >   if (seedVal==*(int*)&(seq[li])) {
969        break;
970        }
971     }
972 < if (si<=rlo) return false;
973 < //seed found, try to extend it to left and right
974 < int score=4*match_score;
975 < max_rlo=rlo;
976 < rlo--;
977 < while (rlo>=0) {
978 <    rlo--;
979 <    score+=
980 <    
972 > if (li>lmax) return false;
973 > //seed found, try to extend it both ways
974 > //extend left
975 > int ri=li+3; //save rightmost base of the seed
976 > SLocScore loc(li, poly_m_score<<2);
977 > SLocScore maxloc(loc);
978 > while (li>0) {
979 >    li--;
980 >    if (seq[li]==polyChar) {
981 >                 loc.add(li,poly_m_score);
982 >                 }
983 >    else if (seq[li]=='N') {
984 >                 loc.add(li,0);
985 >                 }
986 >    else { //mismatch
987 >         loc.add(li,poly_mis_score);
988 >         if (maxloc.score-loc.score>poly_dropoff_score) break;
989 >         }
990 >    if (maxloc.score<=loc.score) {
991 >       maxloc=loc;
992 >       }
993 >    }
994 > li=maxloc.pos;
995 > if (li>5) return false; //no trimming wanted, too far from 5' end
996 > //li = right boundary for the poly match
997 >
998 > //extend right
999 > loc.set(ri, maxloc.score);
1000 > maxloc.pos=ri;
1001 > while (ri<rlen-1) {
1002 >   ri++;
1003 >   if (seq[ri]==polyChar) {
1004 >                loc.add(ri,poly_m_score);
1005 >                }
1006 >   else if (seq[ri]=='N') {
1007 >                loc.add(ri,0);
1008 >                }
1009 >   else { //mismatch
1010 >        loc.add(ri,poly_mis_score);
1011 >        if (maxloc.score-loc.score>poly_dropoff_score) break;
1012 >        }
1013 >   if (maxloc.score<=loc.score) {
1014 >      maxloc=loc;
1015 >      }
1016 >   }
1017 > ri=maxloc.pos;
1018 > if ((maxloc.score==poly_minScore && li==0) ||
1019 >     (maxloc.score>poly_minScore && li<2)
1020 >     || (maxloc.score>(poly_minScore*3) && li<8)) {
1021 >    //adjust l5 to reflect this trimming of 5' end
1022 >    l5=ri+1;
1023 >    if (l5>rlen-1) l5=rlen-1;
1024 >    return true;
1025      }
1026 + return false;
1027   }
1028  
1029 < bool trim_adapter3(GStr& seq, int&l5, int &l3) {
1029 > bool trim_adaptor3(GStr& seq, int&l5, int &l3) {
1030 > if (adaptors3.Count()==0) return false;
1031   int rlen=seq.length();
1032   l5=0;
1033   l3=rlen-1;
1034 < //first try a full match, we might get lucky
1035 < int fi=-1;
1036 < if ((fi=seq.index(adapter3))>=0) {
1037 <   if (fi<rlen-fi-a3len) {//match is closer to the right end
1038 <      l5=fi+a3len;
1039 <      l3=rlen-1;
1040 <      }
1041 <    else {
1042 <      l5=0;
1043 <      l3=fi-1;
1044 <      }
1079 <   return true;
1080 <   }
1081 < #ifdef DEBUG
1082 < if (debug) GMessage(">TRIM3 >>   Read: %s\n",seq.chars());
1083 < #endif
1084 <
1085 < //also, for fast detection of other adapter-only reads that start past
1086 < // the beginning of the adapter sequence, try to see if the first a3len-4
1087 < // bases of the read are a substring of the adapter
1088 < if (rlen>a3len-3) {
1089 <   GStr rstart=seq.substr(1,a3len-4);
1090 <   if ((fi=adapter3.index(rstart))>=0) {
1091 <     l3=rlen-1;
1092 <     l5=a3len-4;
1093 <     while (fi+l5<a3len && l5<l3 && adapter3[fi+l5]==seq[l5]) l5++;
1094 <     return true;
1095 <     }
1096 <  }
1097 < CSegChain a3segs; //no chains here, just an ordered collection of segment pairs
1098 <  //check the easy cases - 11 bases exact match at the end
1099 < int fdlen=11;
1100 <  if (a3len<16) {
1101 <   fdlen=a3len>>1;
1102 <   }
1103 < if (fdlen>4) {
1104 <     //check if we're lucky enough to have the last 11 bases of the read a part of the adapter
1105 <     GStr rstart=seq.substr(-fdlen-3,fdlen);
1106 <     if ((fi=adapter3.index(rstart))>=0) {
1107 < #ifdef DEBUG
1108 <       if (debug) GMessage("  W11match found: %*s\n", rlen-3, (adapter3.substr(fi,fdlen)).chars());
1109 < #endif
1110 <       if (extendMatch(seq.chars(), rlen, rlen-fdlen-3,
1111 <                     adapter3.chars(), a3len, fi,  fdlen, l5,l3, a3segs))
1112 <            return true;
1113 <       }
1114 <     //another easy case: first 11 characters of the adaptor found as a substring of the read
1115 <     GStr bstr=adapter3.substr(0, fdlen);
1116 <     if ((fi=seq.rindex(bstr))>=0) {
1117 < #ifdef DEBUG
1118 <       if (debug) GMessage("  A11match found: %*s\n", fi+fdlen, bstr.chars());
1119 < #endif
1120 <       if (extendMatch(seq.chars(), rlen, fi,
1121 <                     adapter3.chars(), a3len, 0,  fdlen, l5,l3, a3segs))
1122 <            return true;
1123 <       }
1124 <     } //tried to match 11 bases first
1125 <    
1126 < //no easy cases, so let's do the wmer hashing for the first 12 bases of the adaptor
1127 < //-- only extend if the match is in the 3' (ending) region of the read
1128 < int wordSize=3;
1129 < int hlen=12;
1130 < if (hlen>a3len-wordSize) hlen=a3len-wordSize;
1131 < int imin=rlen>>1; //last half of the read, left boundary for the wmer match
1132 < if (imin<a3len) { imin=GMIN(a3len, rlen-wordSize); }
1133 < imin=rlen-imin;
1134 < for (int iw=0;iw<hlen;iw++) {
1135 <   //int32* qv=(int32*)(adapter3.chars()+iw);
1136 <   int qv=get3mer_value(adapter3.chars()+iw);
1137 <   fi=-1;
1138 <   //while ((fi=fast4rmatch(*qv, seq.chars(), rlen, fi))>=0 && fi>=imin) {
1139 <   while ((fi=w3_rmatch(qv, seq.chars(), rlen, fi))>=0 && fi>=imin) {
1140 <     //GMessage(" ... fi=%d after w3_rmatch() (imin=%d)\n", fi, imin);
1141 <
1142 < #ifdef DEBUG
1143 <     if (debug) GMessage("    Wmatch found: %*s\n", fi+wordSize, (adapter3.substr(iw,wordSize)).chars());
1144 < #endif
1145 <     if (extendMatch(seq.chars(), rlen, fi, adapter3.chars(),
1146 <                   a3len, iw, wordSize, l5,l3, a3segs)) return true;
1147 <     fi--;
1148 <     if (fi<imin) break;
1149 <     }
1150 <   } //for each wmer in the first hlen bases of the adaptor
1151 < /*
1152 < //couldn't find a good trimming extension, hash 12 more bases of the adapter to collect more segment pairs there
1153 < //but only do this if we already have segment pairs collected in the last 12 bases of the adapter
1154 < if (a3segs.bstart>3 || a3segs.bend<(uint)(hlen-wordSize)) return false;
1155 < int hlen2=a3len-wordSize;
1156 < //if (hlen2>a3len-4) hlen2=a3len-4;
1157 < if (hlen2>hlen) {
1158 < #ifdef DEBUG
1159 <     if (debug && a3segs.Count()>0) {
1160 <        GMessage("  >>>>>2nd. hash: %s\n",seq.chars());
1034 > bool trimmed=false;
1035 > GStr wseq(seq);
1036 > int wlen=rlen;
1037 > GXSeqData seqdata;
1038 > int numruns=revCompl ? 2 : 1;
1039 > GList<GXAlnInfo> bestalns(true, true, false);
1040 > for (int ai=0;ai<adaptors3.Count();ai++) {
1041 >   for (int r=0;r<numruns;r++) {
1042 >     if (r) {
1043 >          seqdata.update(adaptors3[ai].seqr.chars(), adaptors3[ai].seqr.length(),
1044 >                 adaptors3[ai].pzr, wseq.chars(), wlen, adaptors3[ai].amlen);
1045          }
1046 < #endif
1047 <     for (int iw=hlen;iw<hlen2;iw++) {
1048 <         //int* qv=(int32 *)(adapter3.chars()+iw);
1049 <         int qv=get3mer_value(adapter3.chars()+iw);
1050 <         fi=-1;
1051 <         //while ((fi=fast4rmatch(*qv, seq.chars(), rlen, fi))>=0 && fi>=imin) {
1052 <         while ((fi=w3_rmatch(qv, seq.chars(), rlen, fi))>=0 && fi>=imin) {
1053 <           extendMatch(seq.chars(), rlen, fi, adapter3.chars(),
1054 <                         a3len, iw, wordSize, l5,l3, a3segs);
1055 <           fi--;
1056 <           if (fi<imin) break;
1057 <           }
1058 <         } //for each wmer between hlen2 and hlen bases of the adaptor
1046 >     else {
1047 >            seqdata.update(adaptors3[ai].seq.chars(), adaptors3[ai].seq.length(),
1048 >                 adaptors3[ai].pz, wseq.chars(), wlen, adaptors3[ai].amlen);
1049 >        }
1050 >     GXAlnInfo* aln=match_adaptor(seqdata, adaptors3[ai].trim_type, gxmem_r, 86);
1051 >         if (aln) {
1052 >           if (aln->strong) {
1053 >                   trimmed=true;
1054 >                   bestalns.Add(aln);
1055 >                   break; //will check the rest next time
1056 >                   }
1057 >            else bestalns.Add(aln);
1058 >           }
1059 >   }//forward and reverse adaptors
1060 >   if (trimmed) break; //will check the rest in the next cycle
1061 >  }//for each 3' adaptor
1062 > if (bestalns.Count()>0) {
1063 >           GXAlnInfo* aln=bestalns[0];
1064 >           if (aln->sl-1 > wlen-aln->sr) {
1065 >                   //keep left side
1066 >                   l3-=(wlen-aln->sl+1);
1067 >                   if (l3<0) l3=0;
1068 >                   }
1069 >           else { //keep right side
1070 >                   l5+=aln->sr;
1071 >                   if (l5>=rlen) l5=rlen-1;
1072 >                   }
1073 >           //delete aln;
1074 >           //if (l3-l5+1<min_read_len) return true;
1075 >           wseq=seq.substr(l5,l3-l5+1);
1076 >           wlen=wseq.length();
1077 >           return true; //break the loops here to report a good find
1078       }
1079 < //lastly, analyze collected a3segs for a possible gapped alignment:
1177 < GList<CSegChain> segchains(false,true,false);
1178 < #ifdef DEBUG
1179 < if (debug && a3segs.Count()>0) {
1180 <   GMessage(">>>>>>>>>   Read: %s\n",seq.chars());
1181 <   }
1182 < #endif
1183 < for (int i=0;i<a3segs.Count();i++) {
1184 <   if (a3segs[i]->chain==NULL) {
1185 <       if (a3segs[i]->b.start>3) continue; //don't start a hopeless chain
1186 <       CSegChain* newchain=new CSegChain();
1187 <       newchain->setFreeItem(false);
1188 <       newchain->addSegPair(a3segs[i]);
1189 <       a3segs[i]->chain=newchain;
1190 <       segchains.Add(newchain); //just to free them when done
1191 <       }
1192 <   for (int j=i+1;j<a3segs.Count();j++) {
1193 <      CSegChain* chain=a3segs[i]->chain;
1194 <      if (chain->extendChain(a3segs[j])) {
1195 <          a3segs[j]->chain=chain;
1196 < #ifdef DEBUG
1197 <          if (debug) dbgPrintChain(*chain, adapter3.chars());
1198 < #endif      
1199 <          //save time by checking here if the extended chain is already acceptable for trimming
1200 <          if (chain->aend>(uint)(rlen-4) && chain->bstart<4 && chain->score>a_min_chain_score) {
1201 <            l5=0;
1202 <            l3=chain->astart-2;
1203 < #ifdef DEBUG
1204 <          if (debug && a3segs.Count()>0) {
1205 <            GMessage(">>> >> trimmed-3: %*s\n",l3-l5+1,seq.substr(l5,l3-l5+1).chars());
1206 <            }
1207 < #endif
1208 <            return true;
1209 <            }
1210 <          } //chain can be extended
1211 <      }
1212 <   } //collect segment alignments into chains
1213 < */  
1214 < return false; //no adapter parts found
1079 >  return false;
1080   }
1081  
1082 < bool trim_adapter5(GStr& seq, int&l5, int &l3) {
1083 < //if (debug) GMessage("trim_adapter5 on: %s\n", seq.chars());
1082 > bool trim_adaptor5(GStr& seq, int&l5, int &l3) {
1083 > if (adaptors5.Count()==0) return false;
1084   int rlen=seq.length();
1085   l5=0;
1086   l3=rlen-1;
1087 < //try to see if adapter is fully included in the read
1088 < int fi=-1;
1089 < if ((fi=seq.index(adapter5))>=0) {
1090 <   if (fi<rlen-fi-a5len) {//match is closer to the right end
1091 <      l5=fi+a5len;
1092 <      l3=rlen-1;
1093 <      }
1094 <    else {
1095 <      l5=0;
1096 <      l3=fi-1;
1097 <      }
1233 <   return true;
1234 <   }
1235 < #ifdef DEBUG
1236 < if (debug) GMessage(">TRIM5 >>   Read: %s\n",seq.chars());
1237 < #endif
1238 <
1239 < CSegChain a5segs(true); //list of segment pairs to analyze later if no extendMatch succeeded
1240 <
1241 < //try the easy way out first - look for an exact match of 11 bases
1242 < int fdlen=11;
1243 <  if (a5len<16) {
1244 <   fdlen=a5len>>1;
1245 <   }
1246 < if (fdlen>4) {
1247 <     GStr rstart=seq.substr(1,fdlen); //skip the first base as it's sometimes bogus
1248 <     if ((fi=adapter5.index(rstart))>=0) {
1249 < #ifdef DEBUG
1250 <       if (debug) GMessage("  W11match found: %*s\n", 1+fdlen, (adapter3.substr(fi,fdlen)).chars());
1251 < #endif
1252 <       if (extendMatch(seq.chars(), rlen, 1,
1253 <                     adapter5.chars(), a5len, fi,  fdlen, l5,l3, a5segs, true))
1254 <           return true;
1255 <       }
1256 <     //another easy case: last 11 characters of the adaptor found as a substring of the read
1257 <     GStr bstr=adapter5.substr(-fdlen);
1258 <     if ((fi=seq.index(bstr))>=0) {
1259 < #ifdef DEBUG
1260 <       if (debug) GMessage("  A11match found: %*s\n", fi+fdlen, bstr.chars());
1261 < #endif
1262 <       if (extendMatch(seq.chars(), rlen, fi,
1263 <                     adapter5.chars(), a5len, a5len-fdlen,  fdlen, l5,l3,a5segs,true))
1264 <          return true;
1265 <       }
1266 <     } //tried to matching at most 11 bases first
1267 <    
1268 < //-- no easy cases, do the wmer hashing for the last 12 bases of the adaptor
1269 < //-- only extend a wmer if it matches in the 5' (beginning) region of the read
1270 < int wordSize=3;
1271 < int hlen=12;
1272 < if (hlen>a5len-wordSize) hlen=a5len-wordSize;
1273 < int imax=rlen>>1; //first half of the read, right boundary for the wmer match
1274 < if (imax<a5len) { imax=GMIN(a5len, rlen-wordSize); }
1275 < for (int iw=0;iw<=hlen;iw++) {
1276 <   int apstart=a5len-iw-wordSize;
1277 <   fi=0;
1278 <   //int* qv=(int32 *)(adapter5.chars()+apstart);
1279 <   int qv=get3mer_value(adapter5.chars()+apstart);
1280 <   //while ((fi=fast4match(*qv, seq.chars(), rlen, fi))>=0 && fi<=imax) {
1281 <   while ((fi=w3_match(qv, seq.chars(), rlen, fi))>=0 && fi<=imax) {
1282 < #ifdef DEBUG
1283 <     if (debug) GMessage("    Wmatch found: %*s\n", fi+wordSize, (adapter5.substr(apstart,wordSize)).chars());
1284 < #endif
1285 <     if (extendMatch(seq.chars(), rlen, fi, adapter5.chars(),
1286 <                a5len, apstart, wordSize, l5,l3, a5segs, true)) return true;
1287 <     fi++;
1288 <     if (fi>imax) break;
1289 <     }
1290 <   } //for each wmer in the last hlen bases of the adaptor
1291 < /*
1292 <
1293 < //couldn't find a good trimming extension, hash 12 more bases of the adapter to collect more segment pairs there
1294 < //but only do this if we already have segment pairs collected in the last 12 bases of the adapter
1295 < if (a5segs.bend<(uint)(a5len-3) || a5segs.bstart>(uint)(a5len-hlen+4)) return false;
1296 < int hlen2=a5len-wordSize;
1297 < //if (hlen2>a5len-wordSize) hlen2=a5len-wordSize;
1298 < #ifdef DEBUG
1299 <      if (debug && a5segs.Count()>0) {
1300 <        GMessage("  >>>>>2nd. hash: %s\n",seq.chars());
1087 > bool trimmed=false;
1088 > GStr wseq(seq);
1089 > int wlen=rlen;
1090 > GXSeqData seqdata;
1091 > int numruns=revCompl ? 2 : 1;
1092 > GList<GXAlnInfo> bestalns(true, true, false);
1093 > for (int ai=0;ai<adaptors5.Count();ai++) {
1094 >   for (int r=0;r<numruns;r++) {
1095 >     if (r) {
1096 >          seqdata.update(adaptors5[ai].seqr.chars(), adaptors5[ai].seqr.length(),
1097 >                 adaptors5[ai].pzr, wseq.chars(), wlen, adaptors5[ai].amlen);
1098          }
1099 < #endif
1100 < if (hlen2>hlen) {
1101 <     for (int iw=hlen+1;iw<=hlen2;iw++) {
1102 <         int apstart=a5len-iw-wordSize;
1103 <         fi=0;
1104 <         //int* qv=(int32 *)(adapter5.chars()+apstart);
1105 <         int qv=get3mer_value(adapter5.chars()+apstart);
1106 <         //while ((fi=fast4match(*qv, seq.chars(), rlen, fi))>=0 && fi<=imax) {
1107 <         while ((fi=w3_match(qv, seq.chars(), rlen, fi))>=0 && fi<=imax) {
1108 <           extendMatch(seq.chars(), rlen, fi, adapter5.chars(),
1109 <                      a5len, apstart, wordSize, l5,l3, a5segs, true);
1110 <           fi++;
1111 <           if (fi>imax) break;
1112 <           }
1113 <         } //for each wmer between hlen2 and hlen bases of the adaptor
1099 >     else {
1100 >            seqdata.update(adaptors5[ai].seq.chars(), adaptors5[ai].seq.length(),
1101 >                 adaptors5[ai].pz, wseq.chars(), wlen, adaptors5[ai].amlen);
1102 >        }
1103 >         GXAlnInfo* aln=match_adaptor(seqdata, adaptors5[ai].trim_type, gxmem_l, 90);
1104 >         if (aln) {
1105 >           if (aln->strong) {
1106 >                   trimmed=true;
1107 >                   bestalns.Add(aln);
1108 >                   break; //will check the rest next time
1109 >                   }
1110 >            else bestalns.Add(aln);
1111 >           }
1112 >         } //forward and reverse?
1113 >   if (trimmed) break; //will check the rest in the next cycle
1114 >  }//for each 5' adaptor
1115 >  if (bestalns.Count()>0) {
1116 >           GXAlnInfo* aln=bestalns[0];
1117 >           if (aln->sl-1 > wlen-aln->sr) {
1118 >                   //keep left side
1119 >                   l3-=(wlen-aln->sl+1);
1120 >                   if (l3<0) l3=0;
1121 >                   }
1122 >           else { //keep right side
1123 >                   l5+=aln->sr;
1124 >                   if (l5>=rlen) l5=rlen-1;
1125 >                   }
1126 >           //delete aln;
1127 >           //if (l3-l5+1<min_read_len) return true;
1128 >           wseq=seq.substr(l5,l3-l5+1);
1129 >           wlen=wseq.length();
1130 >           return true; //break the loops here to report a good find
1131       }
1132 < if (a5segs.bend<(uint)(a5len-3) || a5segs.bstart>(uint)(a5len-hlen+4)) return false;
1133 < // lastly, analyze collected a5segs for a possible gapped alignment:
1320 < GList<CSegChain> segchains(false,true,false);
1321 < #ifdef DEBUG
1322 < if (debug && a5segs.Count()>0) {
1323 <   GMessage(">>>>>>>>>   Read: %s\n",seq.chars());
1324 <   }
1325 < #endif
1326 < for (int i=0;i<a5segs.Count();i++) {
1327 <   if (a5segs[i]->chain==NULL) {
1328 <       if (a5segs[i]->b.end<(int)(a5len-4)) continue; //don't start a hopeless chain
1329 <       CSegChain* newchain=new CSegChain(true);
1330 <       newchain->setFreeItem(false);
1331 <       newchain->addSegPair(a5segs[i]);
1332 <       a5segs[i]->chain=newchain;
1333 <       segchains.Add(newchain); //just to free them when done
1334 <       }
1335 <   for (int j=i+1;j<a5segs.Count();j++) {
1336 <      CSegChain* chain=a5segs[i]->chain;
1337 <      if (chain->extendChain(a5segs[j])) {
1338 <         a5segs[j]->chain=chain;
1339 < #ifdef DEBUG
1340 <         if (debug) dbgPrintChain(*chain, adapter5.chars());
1341 < #endif      
1342 <      //save time by checking here if the extended chain is already acceptable for trimming
1343 <         if (chain->bend>(uint)(a5len-3) && chain->astart<4 && chain->score>a_min_chain_score) {
1344 <            l5=chain->aend;
1345 <            l3=rlen-1;
1346 <            return true;
1347 <            }
1348 <         } //chain can be extended
1349 <      }
1350 <   } //collect segment alignments into chains
1351 < */
1352 < return false; //no adapter parts found
1353 < }
1132 >  return false;
1133 > }
1134  
1135 < //convert qvs to/from phred64 from/to phread33
1135 > //convert qvs to/from phred64 from/to phread33
1136   void convertPhred(GStr& q) {
1137   for (int i=0;i<q.length();i++) q[i]+=qv_cvtadd;
1138   }
# Line 1361 | Line 1141
1141   for (int i=0;i<len;i++) q[i]+=qv_cvtadd;
1142   }
1143  
1144 < bool getFastxRec(GLineReader& fq, GStr& rseq, GStr& rqv,
1144 > bool getFastxRec(GLineReader& fq, GStr& rseq, GStr& rqv,
1145            GStr& rname, GStr& rinfo, GStr& infname) {
1146   rseq="";
1147   rqv="";
# Line 1377 | Line 1157
1157        } //raw qseq format
1158   else { // FASTQ or FASTA */
1159   isfasta=(l[0]=='>');
1160 < if (!isfasta && l[0]!='@') GError("Error: fasta/fastq record marker not found(%s)\n%s\n",
1160 > if (!isfasta && l[0]!='@') GError("Error: fasta/fastq record marker not found(%s)\n%s\n",
1161        infname.chars(), l);
1162   GStr s(l);
1163   rname=&(l[1]);
1164   for (int i=0;i<rname.length();i++)
1165 <    if (rname[i]<=' ') {
1166 <       if (i<rname.length()-2) rinfo=rname.substr(i+1);
1167 <       rname.cut(i);
1168 <       break;
1165 >    if (rname[i]<=' ') {
1166 >       if (i<rname.length()-2) rinfo=rname.substr(i+1);
1167 >       rname.cut(i);
1168 >       break;
1169         }
1170    //now get the sequence
1171 < if ((l=fq.getLine())==NULL)
1171 > if ((l=fq.getLine())==NULL)
1172        GError("Error: unexpected EOF after header for read %s (%s)\n",
1173                     rname.chars(), infname.chars());
1174   rseq=l; //this must be the DNA line
1175   while ((l=fq.getLine())!=NULL) {
1176        //seq can span multiple lines
1177        if (l[0]=='>' || l[0]=='+') {
1178 <           fq.pushBack();
1178 >           fq.pushBack();
1179             break; //
1180             }
1181        rseq+=l;
1182 <      } //check for multi-line seq
1182 >      } //check for multi-line seq
1183   if (!isfasta) { //reading fastq quality values, which can also be multi-line
1184      if ((l=fq.getLine())==NULL)
1185          GError("Error: unexpected EOF after sequence for %s\n", rname.chars());
1186      if (l[0]!='+') GError("Error: fastq qv header marker not detected!\n");
1187 <    if ((l=fq.getLine())==NULL)
1187 >    if ((l=fq.getLine())==NULL)
1188          GError("Error: unexpected EOF after qv header for %s\n", rname.chars());
1189      rqv=l;
1190 <    //if (rqv.length()!=rseq.length())
1190 >    //if (rqv.length()!=rseq.length())
1191      //  GError("Error: qv len != seq len for %s\n", rname.chars());
1192      while (rqv.length()<rseq.length() && ((l=fq.getLine())!=NULL)) {
1193        rqv+=l; //append to qv string
1194        }
1195      }// fastq
1196   // } //<-- FASTA or FASTQ
1197 < rseq.upper(); //TODO: what if we care about masking?
1197 > rseq.upper();
1198   return true;
1199   }
1200  
1201 + #ifdef GDEBUG
1202 + void showTrim(GStr& s, int l5, int l3) {
1203 +  if (l5>0 || l3==0) {
1204 +    color_bg(c_red);
1205 +    }
1206 +  for (int i=0;i<s.length()-1;i++) {
1207 +    if (i && i==l5) color_resetbg();
1208 +    fprintf(stderr, "%c", s[i]);
1209 +    if (i && i==l3) color_bg(c_red);
1210 +   }
1211 +  fprintf(stderr, "%c", s[s.length()-1]);
1212 +  color_reset();
1213 +  fprintf(stderr, "\n");
1214 + }
1215 + #endif
1216 +
1217   char process_read(GStr& rname, GStr& rseq, GStr& rqv, int &l5, int &l3) {
1218 < //returns 0 if the read was untouched, 1 if it was just trimmed
1218 > //returns 0 if the read was untouched, 1 if it was just trimmed
1219   // and a trash code if it was trashed
1220   l5=0;
1221   l3=rseq.length()-1;
1222 + #ifdef GDEBUG
1223 +   //rseq.reverse();
1224 +   GMessage(">%s\n", rname.chars());
1225 +   GMessage("%s\n",rseq.chars());
1226 + #endif
1227   if (l3-l5+1<min_read_len) {
1228     return 's';
1229     }
# Line 1430 | Line 1231
1231   GStr wqv(rqv.chars());
1232   int w5=l5;
1233   int w3=l3;
1234 + //count Ns
1235 + b_totalIn+=rseq.length();
1236 + for (int i=0;i<rseq.length();i++) {
1237 + if (isACGT[(int)rseq[i]]==0) b_totalN++;
1238 + }
1239 +
1240   //first do the q-based trimming
1241   if (qvtrim_qmin!=0 && !wqv.is_empty() && qtrim(wqv, w5, w3)) { // qv-threshold trimming
1242 +   b_trimQ+=(w5-l5)+(l3-w3);
1243 +   num_trimQ++;
1244     if (w3-w5+1<min_read_len) {
1245       return 'Q'; //invalid read
1246       }
# Line 1444 | Line 1253
1253     } //qv trimming
1254   // N-trimming on the remaining read seq
1255   if (ntrim(wseq, w5, w3)) {
1256 +   //Note: ntrim resets w5 and w3
1257     //GMessage("before: %s\n",wseq.chars());
1258     //GMessage("after : %s (%d)\n",wseq.substr(w5,w3-w5+1).chars(),w3-w5+1);
1259 +   int trim3=(wseq.length()-1-w3);
1260 +   b_trimN+=w5+trim3;
1261 +   num_trimN++;
1262     l5+=w5;
1263 <   l3-=(wseq.length()-1-w3);
1263 >   l3-=trim3;
1264     if (w3-w5+1<min_read_len) {
1265       return 'N'; //to be trashed
1266       }
# Line 1458 | Line 1271
1271     w5=0;
1272     w3=wseq.length()-1;
1273     }
1274 < if (a3len>0) {
1275 <  if (trim_adapter3(wseq, w5, w3)) {
1276 <     int trimlen=wseq.length()-(w3-w5+1);
1277 <     num_trimmed3++;
1278 <     if (trimlen<min_trimmed3)
1279 <         min_trimmed3=trimlen;
1280 <     l5+=w5;
1281 <     l3-=(wseq.length()-1-w3);
1282 <     if (w3-w5+1<min_read_len) {
1283 <         return '3';
1284 <         }
1285 <      //-- keep only the w5..w3 range
1286 <      wseq=wseq.substr(w5, w3-w5+1);
1287 <      if (!wqv.is_empty())
1288 <         wqv=wqv.substr(w5, w3-w5+1);
1289 <      }//some adapter was trimmed
1290 <   } //adapter trimming
1291 < if (a5len>0) {
1292 <  if (trim_adapter5(wseq, w5, w3)) {
1293 <     int trimlen=wseq.length()-(w3-w5+1);
1294 <     num_trimmed5++;
1295 <     if (trimlen<min_trimmed5)
1296 <         min_trimmed5=trimlen;
1274 > char trim_code;
1275 > //clean the more dirty end first - 3'
1276 > bool trimmedA=false;
1277 > bool trimmedT=false;
1278 > bool trimmedV=false;
1279 >
1280 > int prev_w5=0;
1281 > int prev_w3=0;
1282 > bool w3upd=true;
1283 > bool w5upd=true;
1284 > do {
1285 >  trim_code=0;
1286 >  if (w3upd && trim_poly3(wseq, w5, w3, polyA_seed)) {
1287 >      trim_code='A';
1288 >      b_trimA+=(w5+(wseq.length()-1-w3));
1289 >      if (!trimmedA) { num_trimA++; trimmedA=true; }
1290 >      }
1291 >  else if (w3upd && trim_poly3(wseq, w5, w3, polyT_seed)) {
1292 >      trim_code='T';
1293 >      b_trimT+=(w5+(wseq.length()-1-w3));
1294 >      if (!trimmedT) { num_trimT++; trimmedT=true; }
1295 >      }
1296 >  else if (w5upd && trim_poly5(wseq, w5, w3, polyA_seed)) {
1297 >      trim_code='A';
1298 >      b_trimA+=(w5+(wseq.length()-1-w3));
1299 >      if (!trimmedA) { num_trimA++; trimmedA=true; }
1300 >      }
1301 >  else if (w5upd && trim_poly5(wseq, w5, w3, polyT_seed)) {
1302 >      trim_code='T';
1303 >      b_trimT+=(w5+(wseq.length()-1-w3));
1304 >      if (!trimmedT) { num_trimT++; trimmedT=true; }
1305 >      }
1306 >  else if (trim_adaptor5(wseq, w5, w3)) {
1307 >      trim_code='V';
1308 >      b_trimV+=(w5+(wseq.length()-1-w3));
1309 >      if (!trimmedV) { num_trimV++; trimmedV=true; }
1310 >      }
1311 >  else if (trim_adaptor3(wseq, w5, w3)) {
1312 >      trim_code='V';
1313 >      b_trimV+=(w5+(wseq.length()-1-w3));
1314 >      if (!trimmedV) { num_trimV++; trimmedV=true; }
1315 >      }
1316 >  if (trim_code) {
1317 >     w3upd=(w3!=prev_w3);
1318 >         w5upd=(w5!=prev_w5);
1319 >         if (w3upd) prev_w3=w3;
1320 >         if (w5upd) prev_w5=w5;
1321 >   #ifdef GDEBUG
1322 >     GMessage("#### TRIM by '%c' code ( w5-w3 = %d-%d ):\n",trim_code, w5,w3);
1323 >     showTrim(wseq, w5, w3);
1324 >   #endif
1325 >     //int trimlen=wseq.length()-(w3-w5+1);
1326       l5+=w5;
1327       l3-=(wseq.length()-1-w3);
1328       if (w3-w5+1<min_read_len) {
1329 <         return '5';
1329 >         return trim_code;
1330           }
1331        //-- keep only the w5..w3 range
1332        wseq=wseq.substr(w5, w3-w5+1);
1333        if (!wqv.is_empty())
1334           wqv=wqv.substr(w5, w3-w5+1);
1335 <      }//some adapter was trimmed
1336 <   } //adapter trimming
1335 >      }//trimming at 3' end
1336 > } while (trim_code);
1337   if (doCollapse) {
1338     //keep read for later
1339     FqDupRec* dr=dhash.Find(wseq.chars());
1340     if (dr==NULL) { //new entry
1341 <          //if (prefix.is_empty())
1342 <             dhash.Add(wseq.chars(),
1341 >          //if (prefix.is_empty())
1342 >             dhash.Add(wseq.chars(),
1343                    new FqDupRec(&wqv, rname.chars()));
1344            //else dhash.Add(wseq.chars(), new FqDupRec(wqv.chars(),wqv.length()));
1345           }
# Line 1531 | Line 1373
1373         fprintf(f_out, "%s\n", rseq.chars()); //plain one-line fasta for now
1374         }
1375        else {
1376 <       fprintf(f_out, ">%s%08d\n%s\n", prefix.chars(), outcounter,
1376 >       fprintf(f_out, ">%s%08d\n%s\n", prefix.chars(), outcounter,
1377                            rseq.chars());
1378         }
1379       }
# Line 1542 | Line 1384
1384         fprintf(f_out, "%s\n+\n%s\n", rseq.chars(), rqv.chars());
1385         }
1386        else
1387 <       fprintf(f_out, "@%s_%08d\n%s\n+\n%s\n", prefix.chars(), outcounter,
1387 >       fprintf(f_out, "@%s_%08d\n%s\n+\n%s\n", prefix.chars(), outcounter,
1388                            rseq.chars(),rqv.chars() );
1389       }
1390   }
# Line 1550 | Line 1392
1392   void trash_report(char trashcode, GStr& rname, FILE* freport) {
1393   if (freport==NULL || trashcode<=' ') return;
1394   if (trashcode=='3' || trashcode=='5') {
1395 <   fprintf(freport, "%s\tA%c\n",rname.chars(),trashcode);
1395 >   fprintf(freport, "%s\ta%c\n",rname.chars(),trashcode);
1396     }
1397   else {
1398     fprintf(freport, "%s\t%c\n",rname.chars(),trashcode);
# Line 1642 | Line 1484
1484      }
1485   }
1486  
1487 < void setupFiles(FILE*& f_in, FILE*& f_in2, FILE*& f_out, FILE*& f_out2,
1488 <                       GStr& s, GStr& infname, GStr& infname2) {
1487 > void addAdaptor(GVec<CASeqData>& adaptors, GStr& seq, GAlnTrimType trim_type) {
1488 > //TODO: prepare CASeqData here, and collect hexamers as well
1489 >  if (seq.is_empty() || seq=="-" ||
1490 >          seq=="N/A" || seq==".") return;
1491 >
1492 > CASeqData adata(revCompl);
1493 > int idx=adaptors.Add(adata);
1494 > if (idx<0) GError("Error: failed to add adaptor!\n");
1495 > adaptors[idx].trim_type=trim_type;
1496 > adaptors[idx].update(seq.chars());
1497 > }
1498 >
1499 >
1500 > int loadAdaptors(const char* fname) {
1501 >  GLineReader lr(fname);
1502 >  char* l;
1503 >  while ((l=lr.nextLine())!=NULL) {
1504 >   if (lr.length()<=3 || l[0]=='#') continue;
1505 >   if ( l[0]==' ' || l[0]=='\t' || l[0]==',' ||
1506 >        l[0]==';'|| l[0]==':' ) {
1507 >      int i=1;
1508 >      while (l[i]!=0 && isspace(l[i])) {
1509 >        i++;
1510 >        }
1511 >      if (l[i]!=0) {
1512 >        GStr s(&(l[i]));
1513 >      #ifdef GDEBUG
1514 >          //s.reverse();
1515 >      #endif
1516 >        addAdaptor(adaptors3, s, galn_TrimRight);
1517 >        continue;
1518 >        }
1519 >      }
1520 >    else {
1521 >      GStr s(l);
1522 >      s.startTokenize("\t ;,:");
1523 >      GStr a5,a3;
1524 >      if (s.nextToken(a5))
1525 >            s.nextToken(a3);
1526 >        else continue; //no tokens on this line
1527 >      GAlnTrimType ttype5=galn_TrimLeft;
1528 >      a5.upper();
1529 >      a3.upper();
1530 >      if (a3.is_empty() || a3==a5 || a3=="=") {
1531 >         a3.clear();
1532 >         ttype5=galn_TrimEither;
1533 >         }
1534 >     #ifdef GDEBUG
1535 >     //   a5.reverse();
1536 >     //   a3.reverse();
1537 >     #endif
1538 >      addAdaptor(adaptors5, a5, ttype5);
1539 >      addAdaptor(adaptors3, a3, galn_TrimRight);
1540 >      }
1541 >   }
1542 >   return adaptors5.Count()+adaptors3.Count();
1543 > }
1544 >
1545 > void setupFiles(FILE*& f_in, FILE*& f_in2, FILE*& f_out, FILE*& f_out2,
1546 >                       GStr& s, GStr& infname, GStr& infname2) {
1547   // uses outsuffix to generate output file names and open file handles as needed
1548   infname="";
1549   infname2="";
# Line 1671 | Line 1571
1571   s.startTokenize(",:");
1572   s.nextToken(infname);
1573   bool paired=s.nextToken(infname2);
1574 < if (fileExists(infname.chars())==0)
1574 > if (fileExists(infname.chars())==0)
1575      GError("Error: cannot find file %s!\n",infname.chars());
1576   GStr fname(getFileName(infname.chars()));
1577   GStr picmd;
# Line 1693 | Line 1593
1593   if (!paired) return;
1594   if (doCollapse) GError("Error: sorry, -C option cannot be used with paired reads!\n");
1595   // ---- paired reads:-------------
1596 < if (fileExists(infname2.chars())==0)
1596 > if (fileExists(infname2.chars())==0)
1597       GError("Error: cannot find file %s!\n",infname2.chars());
1598   picmd="";
1599   GStr fname2(getFileName(infname2.chars()));

Diff Legend

Removed lines
+ Added lines
< Changed lines
> Changed lines