ViewVC Help
View File | Revision Log | Show Annotations | View Changeset | Root Listing
root/gclib/fqtrim/fqtrim.cpp
(Generate patch)
# Line 3 | Line 3
3   #include "GHash.hh"
4   #include "GList.hh"
5   #include <ctype.h>
6 + #include "GAlnExtend.h"
7  
8   #define USAGE "Usage:\n\
9 < fqtrim [{-5 <5adapter> -3 <3adapter>|-f <adapters_file>}] [-a <min_matchlen>]\\\n\
10 <   [-q <minq> [-t <trim_max_len>]] [-p {64|33}] [-o <outsuffix>]\\\n\
9 > fqtrim [{-5 <5adaptor> -3 <3adaptor>|-f <adaptors_file>}] [-a <min_matchlen>]\\\n\
10 >   [-R] [-q <minq> [-t <trim_max_len>]] [-p {64|33}] [-o <outsuffix>]\\\n\
11     [-l <minlen>] [-C] [-D] [-Q] [-n <rename_prefix>] [-r <discarded.lst>]\\\n\
12      <input.fq>[,<input_mates.fq>\n\
13   \n\
14 < Trim low quality bases at the 3' end and can trim adapter sequence(s), filter\n\
14 > Trim low quality bases at the 3' end and can trim adaptor sequence(s), filter\n\
15   for low complexity and collapse duplicate reads.\n\
16   If read pairs should be trimmed and kept together (i.e. without discarding\n\
17   one read in a pair), the two file names should be given delimited by a comma\n\
# Line 24 | Line 25
25      file(s) named <input>.<outsuffix> which will be created in the \n\
26      current (working) directory; (writes to stdout if -o- is given);\n\
27      a suffix ending with .gz, .gzip or .bz2 will enforce compression\n\
28 < -f  file with adapter sequences to trim, each line having this format:\n\
29 <    <5'-adapter-sequence> <3'-adapter-sequence>\n\
30 < -5  trim the given adapter or primer sequence at the 5' end of each read\n\
28 > -f  file with adaptor sequences to trim, each line having this format:\n\
29 >    <5'-adaptor-sequence> <3'-adaptor-sequence>\n\
30 > -5  trim the given adaptor or primer sequence at the 5' end of each read\n\
31      (e.g. -5 CGACAGGTTCAGAGTTCTACAGTCCGACGATC)\n\
32 < -3  trim the given adapter sequence at the 3' end of each read\n\
32 > -3  trim the given adaptor sequence at the 3' end of each read\n\
33      (e.g. -3 TCGTATGCCGTCTTCTGCTTG)\n\
34 < -a  minimum length of exact match to adaptor sequence at the proper end (6)\n\
34 > -A  disable polyA/T trimming (enabled by default)\n\
35 > -y  minimum length of exact match to adaptor sequence at the proper end (6)\n\
36   -q  trim bases with quality value lower than <minq> (starting at the 3' end)\n\
37   -t  for -q option, maximum trimming at the 3' end is limited to <trim_max_len>\n\
38   -m  maximum percentage of Ns allowed in a read after trimming (default 7)\n\
# Line 46 | Line 48
48   -Q  convert quality values to the other Phred qv type\n\
49   -V  verbose processing\n\
50   "
51 <
51 >
52   //-z  for -o option, the output stream(s) will be first piped into the given\n
53   //   <zcmd> command, which must output to stdout (e.g. -z 'bzip2 -9 -c')\n
54  
55 <
56 < // example 3' adapter for miRNAs: TCGTATGCCGTCTTCTGCTTG
55 >
56 > // example 3' adaptor for miRNAs: TCGTATGCCGTCTTCTGCTTG
57  
58   //For paired reads sequencing:
59   //3' : ACACTCTTTCCCTACACGACGCTCTTCCGATCT
# Line 60 | Line 62
62   //FILE* f_out2=NULL; //for paired reads
63   //FILE* f_in=NULL; //input fastq (stdin if not provided)
64   //FILE* f_in2=NULL; //for paired reads
65 +
66   FILE* freport=NULL;
67  
68   bool debug=false;
69   bool verbose=false;
70   bool doCollapse=false;
71   bool doDust=false;
72 + bool doPolyTrim=true;
73   bool fastaOutput=false;
74   bool trashReport=false;
75 < //bool rawFormat=false;
75 > bool revCompl=false; //also reverse complement adaptor sequences
76   int min_read_len=16;
77   double max_perc_N=7.0;
78   int dust_cutoff=16;
79   bool isfasta=false;
80   bool convert_phred=false;
81 < GStr outsuffix; // -o
78 < //GStr adapter3;
79 < //GStr adapter5;
81 > GStr outsuffix; // -o
82   GStr prefix;
83   GStr zcmd;
84   int num_trimmed5=0;
# Line 91 | Line 93
93   int qv_phredtype=0; // could be 64 or 33 (0 means undetermined yet)
94   int qv_cvtadd=0; //could be -31 or +31
95  
96 < int a3len=0;
97 < int a5len=0;
98 < // adaptor matching metrics -- for extendMatch() function
99 < const int match_score=2; //match score
100 < const int mismatch_score=-3; //mismatch
101 <
102 < const int a_m_score=2; //match score
103 < const int a_mis_score=-3; //mismatch
104 < const int a_dropoff_score=7;
96 > // adaptor matching metrics -- for X-drop ungapped extension
97 > //const int match_reward=2;
98 > //const int mismatch_penalty=3;
99 > const int match_reward=2;
100 > const int mismatch_penalty=4;
101 > const int Xdrop=10;
102 >
103 > const int poly_m_score=2; //match score
104 > const int poly_mis_score=-3; //mismatch
105   const int poly_dropoff_score=7;
106 < int a_min_score=12; //an exact match of 6 bases at the proper ends WILL be trimmed
105 < const int a_min_chain_score=15; //for gapped alignments
106 > int poly_minScore=12; //i.e. an exact match of 6 bases at the proper ends WILL be trimmed
107  
108 < class CSegChain;
108 > const char *polyA_seed="AAAA";
109 > const char *polyT_seed="TTTT";
110  
111 < class CSegPair {
112 <  public:
113 <   GSeg a;
114 <   GSeg b; //the adapter segment
115 <   int score;
116 <   int flags;
117 <   CSegChain* chain;
118 <   CSegPair(int astart=0, int aend=0, int bstart=0, int bend=0, int mscore=0):a(astart,aend),b(bstart, bend) {
119 <      score=mscore;
120 <      if (score==0) score=a.len()*a_m_score;
121 <      flags=0;
122 <      chain=NULL;
123 <      }
124 <   int len() { return  a.len(); }
125 <   bool operator==(CSegPair& d){
126 <      //return (a.start==d.a.start && a.end==d.a.end && b.start==d.b.start && b.end==d.b.end);
125 <      //make equal even segments that are included into one another:
126 <      return (d.a.start>=a.start && d.a.end<=a.end && d.b.start>=b.start && d.b.end<=b.end);
127 <      }
128 <   bool operator>(CSegPair& d){ //ordering based on b (adaptor) start coord and score
129 <     if (b.start==d.b.start) {
130 <        if (score==d.score) {
131 <           //just try to be consistent:
132 <           if (b.end==d.b.end) {
133 <             return (a.start==d.a.start)?(a.end<d.a.end):(a.start<d.a.start);
134 <             }
135 <           return (b.end>d.b.end);
136 <           }
137 <         else return (score<d.score);
138 <        }
139 <     return (b.start>d.b.start);
140 <     }
141 <   bool operator<(CSegPair& d){ //ordering based on b (adaptor) coord
142 <     /*if (b.start==d.b.start && b.end==d.b.end) {
143 <          return (a.start==d.a.start)?(a.end<d.a.end):(a.start<d.a.start);
144 <          }
145 <     return (b.start==d.b.start)?(b.end<d.b.end):(b.start<d.b.start);*/
146 <     if (b.start==d.b.start) {
147 <        if (score==d.score) {
148 <           //just try to be consistent:
149 <           if (b.end==d.b.end) {
150 <             return (a.start==d.a.start)?(a.end>d.a.end):(a.start>d.a.start);
151 <             }
152 <           return (b.end<d.b.end);
153 <           }
154 <         else return (score>d.score);
111 > struct CASeqData {
112 >   //positional data for every possible hexamer in an adaptor
113 >   GVec<uint16>* pz[4096]; //0-based coordinates of all possible hexamers in the adaptor sequence
114 >   GVec<uint16>* pzr[4096]; //0-based coordinates of all possible hexamers for the reverse complement of the adaptor sequence
115 >   GStr seq; //actual adaptor sequence data
116 >   GStr seqr; //reverse complement sequence
117 >   int amlen; //fraction of adaptor length matching that's
118 >              //enough to consider the alignment
119 >   GAlnTrimType trim_type;
120 >   bool use_reverse;
121 >   CASeqData(bool rev=false):seq(),seqr(),
122 >             amlen(0), use_reverse(rev) {
123 >     trim_type=galn_None; //should be updated later!
124 >     for (int i=0;i<4096;i++) {
125 >        pz[i]=NULL;
126 >        pzr[i]=NULL;
127          }
156     return (b.start<d.b.start);
128       }
158 };
129  
130 < int cmpSegEnds(pointer sa, pointer sb) { //sort by adaptor seg ends AND score
131 < CSegPair& x = *(CSegPair *)sa;
132 < CSegPair& y = *(CSegPair *)sb;
133 < /*
134 < if (x.b.end==y.b.end) {
135 <     if (x.b.start==y.b.start) {
136 <         if (x.a.end==y.a.end) {
137 <            if (x.a.start==y.a.start) return 0;
138 <            return ((x.a.start>y.a.start) ? -1 : 1);
139 <            }
140 <          else {
141 <            return ((x.a.end>y.a.end) ? -1 : 1);
142 <            }
143 <          }
144 <      else {
145 <       return ((x.b.start>y.b.start) ? -1 : 1);
146 <       }
147 <     }
148 <    else {
179 <     return ((x.b.end>y.b.end) ? -1 : 1);
180 <     }
181 < */
182 <  if (x.b.end==y.b.end) {
183 <     if (x.score==y.score) {
184 <     if (x.b.start==y.b.start) {
185 <         if (x.a.end==y.a.end) {
186 <            if (x.a.start==y.a.start) return 0;
187 <            return ((x.a.start<y.a.start) ? -1 : 1);
188 <            }
189 <          else {
190 <            return ((x.a.end<y.a.end) ? -1 : 1);
191 <            }
192 <          }
193 <      else {
194 <       return ((x.b.start<y.b.start) ? -1 : 1);
195 <       }
196 <      } else return ((x.score>y.score) ? -1 : 1);
197 <     }
198 <    else {
199 <     return ((x.b.end>y.b.end) ? -1 : 1);
130 >   void update(const char* s) {
131 >         seq=s;
132 >         table6mers(seq.chars(), seq.length(), pz);
133 >         amlen=iround(double(seq.length())*0.8);
134 >         if (amlen<12)
135 >                amlen=12;
136 >         if (!use_reverse) return;
137 >         //reverse complement
138 >         seqr=s;
139 >         int slen=seq.length();
140 >         for (int i=0;i<slen;i++)
141 >           seqr[i]=ntComplement(seq[slen-i-1]);
142 >         table6mers(seqr.chars(), seqr.length(), pzr);
143 >   }
144 >
145 >   ~CASeqData() {
146 >     for (int i=0;i<4096;i++) {
147 >       delete pz[i];
148 >       delete pzr[i];
149       }
150 +   }
151 + };
152  
153 < }
153 > GVec<CASeqData> adaptors5;
154 > GVec<CASeqData> adaptors3;
155  
156 < class CSegChain:public GList<CSegPair> {
157 < public:
206 <   uint astart;
207 <   uint aend;
208 <   uint bstart;
209 <   uint bend;
210 <   int score;
211 <   bool endSort;
212 <  CSegChain(bool aln5=false):GList<CSegPair>(true,true,true) {//sorted, free elements, unique
213 <   //as SegPairs are inserted, they will be sorted by a.start coordinate
214 <   score=0;
215 <   astart=MAX_UINT;
216 <   aend=0;
217 <   bstart=MAX_UINT;
218 <   bend=0;
219 <   endSort=aln5;
220 <   if (aln5) { setSorted(cmpSegEnds); }
221 <   }
222 < bool operator==(CSegChain& d) {
223 <   //return (score==d.score);
224 <    return (astart==d.astart && aend==d.aend && bstart==d.bstart && bend==d.bend);
225 <   }
226 < bool operator>(CSegChain& d) { // order based on b (adaptor) coordinate
227 <   //return (score<d.score);
228 <   if (bstart==d.bstart && bend==d.bend) {
229 <          return (astart==d.astart)?(aend>d.aend):(astart>d.astart);
230 <          }
231 <     return (bstart==d.bstart)?(bend>d.bend):(bstart>d.bstart);
232 <   }
233 < bool operator<(CSegChain& d) {
234 <   //return (score>d.score);
235 <   if (bstart==d.bstart && bend==d.bend) {
236 <          return (astart==d.astart)?(aend<d.aend):(astart<d.astart);
237 <          }
238 <     return (bstart==d.bstart)?(bend<d.bend):(bstart<d.bstart);
239 <   }
240 < void addSegPair(CSegPair* segp) {
241 <   if (AddIfNew(segp)!=segp) return;
242 <   score+=segp->score;
243 <   if (astart>segp->a.start) astart=segp->a.start;
244 <   if (aend<segp->a.end) aend=segp->a.end;
245 <   if (bstart>segp->b.start) bstart=segp->b.start;
246 <   if (bend<segp->b.end) bend=segp->b.end;
247 <   }
248 < //for building actual chains:
249 < bool extendChain(CSegPair* segp) { //segp expected to be "Greater Than" current chain
250 <   int bgap=0;
251 <   int agap=0;
252 <   //if (endSort) {
253 <   if (bstart>segp->b.start) {
254 <      bgap = (int)(bstart-segp->b.end);
255 <      if (abs(bgap)>2) return false;
256 <      agap = (int)(astart-segp->a.end);
257 <      if (abs(agap)>2) return false;
258 <      }
259 <     else {
260 <      bgap = (int) (segp->b.start-bend);
261 <      if (abs(bgap)>2) return false;
262 <      agap = (int)(segp->a.start-aend);
263 <      if (abs(agap)>2) return false;
264 <      }
265 <   if (agap*bgap<0) return false;
266 <   addSegPair(segp);
267 <   score-=abs(agap)+abs(bgap);
268 <   return true;
269 <   }
270 < };
156 > CGreedyAlignData* gxmem_l=NULL;
157 > CGreedyAlignData* gxmem_r=NULL;
158  
159   // element in dhash:
160   class FqDupRec {
# Line 317 | Line 204
204  
205   GHash<FqDupRec> dhash; //hash to keep track of duplicates
206  
207 < void setupFiles(FILE*& f_in, FILE*& f_in2, FILE*& f_out, FILE*& f_out2,
208 <                       GStr& s, GStr& infname, GStr& infname2);
207 > void addAdaptor(GVec<CASeqData>& adaptors, GStr& seq, GAlnTrimType trim_type);
208 > int loadAdaptors(const char* fname);
209 >
210 > void setupFiles(FILE*& f_in, FILE*& f_in2, FILE*& f_out, FILE*& f_out2,
211 >                       GStr& s, GStr& infname, GStr& infname2);
212   // uses outsuffix to generate output file names and open file handles as needed
213 <
213 >
214   void writeRead(FILE* f_out, GStr& rname, GStr& rinfo, GStr& rseq, GStr& rqv, int& outcounter);
215   void trash_report(char trashcode, GStr& rname, FILE* freport);
216  
217 < bool getFastxRec(GLineReader& fq, GStr& rseq, GStr& rqv,
217 > bool getFastxRec(GLineReader& fq, GStr& rseq, GStr& rqv,
218            GStr& rname, GStr& rinfo, GStr& infname);
219  
220 < char process_read(GStr& rname, GStr& rseq, GStr& rqv, int &l5, int &l3);
220 > char process_read(GStr& rname, GStr& rseq, GStr& rqv, int &l5, int &l3);
221   //returns 0 if the read was untouched, 1 if it was trimmed and a trash code if it was trashed
222  
223   bool ntrim(GStr& rseq, int &l5, int &l3); //returns true if any trimming occured
334 bool polytrim(GStr& rseq, int &l5, int &l3); //returns true if any trimming occured
224   bool qtrim(GStr& qvs, int &l5, int &l3); //return true if any trimming occured
225   int dust(GStr& seq);
226 < bool trim_adapter3(GStr& seq, int &l5, int &l3); //returns true if any trimming occured
227 < bool trim_adapter5(GStr& seq, int &l5, int &l3); //returns true if any trimming occured
226 > bool trim_poly5(GStr &seq, int &l5, int &l3, const char* poly_seed); //returns true if any trimming occured
227 > bool trim_poly3(GStr &seq, int &l5, int &l3, const char* poly_seed);
228 > bool trim_adaptor5(GStr& seq, int &l5, int &l3); //returns true if any trimming occured
229 > bool trim_adaptor3(GStr& seq, int &l5, int &l3);
230  
231   void convertPhred(char* q, int len);
232   void convertPhred(GStr& q);
233  
234   int main(int argc, char * const argv[]) {
235 <  GArgs args(argc, argv, "YQDCVl:d:3:5:m:n:r:p:q:f:t:o:z:a:");
235 >  GArgs args(argc, argv, "YQDCRVAl:d:3:5:m:n:r:p:q:f:t:o:z:a:");
236    int e;
237    if ((e=args.isError())>0) {
238        GMessage("%s\nInvalid argument: %s\n", USAGE, argv[e]);
# Line 352 | Line 243
243    convert_phred=(args.getOpt('Q')!=NULL);
244    doCollapse=(args.getOpt('C')!=NULL);
245    doDust=(args.getOpt('D')!=NULL);
246 +  revCompl=(args.getOpt('R')!=NULL);
247 +  if (args.getOpt('A')) doPolyTrim=false;
248    /*
249    rawFormat=(args.getOpt('R')!=NULL);
250    if (rawFormat) {
# Line 360 | Line 253
253    */
254    prefix=args.getOpt('n');
255    GStr s=args.getOpt('l');
256 <  if (!s.is_empty())
256 >  if (!s.is_empty())
257       min_read_len=s.asInt();
258    s=args.getOpt('m');
259 <  if (!s.is_empty())
259 >  if (!s.is_empty())
260       max_perc_N=s.asDouble();
261    s=args.getOpt('d');
262    if (!s.is_empty()) {
# Line 389 | Line 282
282          qv_phredtype=64;
283          qv_cvtadd=-31;
284          }
285 <       else
285 >       else
286           GMessage("%s\nInvalid value for -p option (can only be 64 or 33)!\n",USAGE);
287       }
288 <  if (args.getOpt('3')!=NULL) {
289 <    adapter3=args.getOpt('3');
290 <    adapter3.upper();
291 <    a3len=adapter3.length();
292 <    }
293 <  if (args.getOpt('5')!=NULL) {
294 <    adapter5=args.getOpt('5');
295 <    adapter5.upper();
296 <    a5len=adapter5.length();
288 >  s=args.getOpt('f');
289 >  if (!s.is_empty()) {
290 >   loadAdaptors(s.chars());
291 >   }
292 >  bool fileAdaptors=adaptors5.Count()+adaptors3.Count();
293 >  s=args.getOpt('5');
294 >  if (!s.is_empty()) {
295 >    if (fileAdaptors)
296 >      GError("Error: options -5 and -f cannot be used together!\n");
297 >    s.upper();
298 >    addAdaptor(adaptors5, s, galn_TrimLeft);
299      }
300 <  s=args.getOpt('a');
300 >  s=args.getOpt('3');
301    if (!s.is_empty()) {
302 <     int a_minmatch=s.asInt();
303 <     a_min_score=a_minmatch<<1;
302 >    if (fileAdaptors)
303 >      GError("Error: options -3 and -f cannot be used together!\n");
304 >      s.upper();
305 >      addAdaptor(adaptors3, s, galn_TrimRight);
306 >    }
307 >  s=args.getOpt('y');
308 >  if (!s.is_empty()) {
309 >     int minmatch=s.asInt();
310 >     poly_minScore=minmatch*poly_m_score;
311       }
312 <  
312 >
313    if (args.getOpt('o')!=NULL) outsuffix=args.getOpt('o');
314                           else outsuffix="-";
315    trashReport=  (args.getOpt('r')!=NULL);
# Line 424 | Line 326
326    if (trashReport)
327      openfw(freport, args, 'r');
328    char* infile=NULL;
329 +
330 +  if (adaptors5.Count()>0)
331 +    //gxmem_l=new CGreedyAlignData(match_reward, mismatch_penalty, Xdrop-2);
332 +        gxmem_l=new CGreedyAlignData(match_reward, mismatch_penalty, Xdrop);
333 +  if (adaptors3.Count()>0)
334 +    gxmem_r=new CGreedyAlignData(match_reward, mismatch_penalty, Xdrop);
335 +
336    while ((infile=args.nextNonOpt())!=NULL) {
337 +    //for each input file
338      int incounter=0; //counter for input reads
339      int outcounter=0; //counter for output reads
340      int trash_s=0; //too short from the get go
341      int trash_Q=0;
342      int trash_N=0;
343      int trash_D=0;
344 +    int trash_poly=0;
345      int trash_A3=0;
346      int trash_A5=0;
347      s=infile;
# Line 455 | Line 366
366         int a5=0, a3=0, b5=0, b3=0;
367         char tcode=0, tcode2=0;
368         tcode=process_read(seqid, rseq, rqv, a5, a3);
369 <       //if (!doCollapse) {
459 <         if (fq2!=NULL) {
369 >       if (fq2!=NULL) {
370              getFastxRec(*fq2, rseq2, rqv2, seqid2, seqinfo2, infname2);
371              if (seqid.substr(0,seqid.length()-1)!=seqid2.substr(0,seqid2.length()-1)) {
372                 GError("Error: no paired match for read %s vs %s (%s,%s)\n",
# Line 488 | Line 398
398                 int nocounter=0;
399                 writeRead(f_out2, seqid2, seqinfo2, rseq2, rqv2, nocounter);
400                 }
401 <            } //paired read
492 <       // }
401 >            } //pair read
402         if (tcode>1) { //trashed
403 +         #ifdef GDEBUG
404 +         GMessage(" !!!!TRASH code = %c\n",tcode);
405 +         #endif
406            if (tcode=='s') trash_s++;
407 +          else if (tcode=='A' || tcode=='T') trash_poly++;
408              else if (tcode=='Q') trash_Q++;
409                else if (tcode=='N') trash_N++;
410                 else if (tcode=='D') trash_D++;
# Line 504 | Line 417
417              rseq=rseq.substr(a5,a3-a5+1);
418              if (!rqv.is_empty()) rqv=rqv.substr(a5,a3-a5+1);
419              }
420 +         #ifdef GDEBUG
421 +            GMessage("  After trimming:\n");
422 +            GMessage("%s\n",rseq.chars());
423 +         #endif
424            writeRead(f_out, seqid, seqinfo, rseq, rqv, outcounter);
425            }
426         } //for each fastq record
# Line 531 | Line 448
448                 }
449              }
450           outcounter++;
451 <         if (qd->count>maxdup_count) {
451 >         if (qd->count>maxdup_count) {
452              maxdup_count=qd->count;
453              maxdup_seq=seq;
454              }
455           if (isfasta) {
456             if (prefix.is_empty()) {
457 <             fprintf(f_out, ">%s_x%d\n%s\n", qd->firstname, qd->count,
457 >             fprintf(f_out, ">%s_x%d\n%s\n", qd->firstname, qd->count,
458                             rseq.chars());
459               }
460             else { //use custom read name
# Line 548 | Line 465
465           else { //fastq format
466            if (convert_phred) convertPhred(qd->qv, qd->len);
467            if (prefix.is_empty()) {
468 <            fprintf(f_out, "@%s_x%d\n%s\n+\n%s\n", qd->firstname, qd->count,
468 >            fprintf(f_out, "@%s_x%d\n%s\n+\n%s\n", qd->firstname, qd->count,
469                             rseq.chars(), qd->qv);
470              }
471            else { //use custom read name
# Line 584 | Line 501
501           GMessage("         Trashed by N%%:%9d\n", trash_N);
502         if (trash_Q>0)
503           GMessage("Trashed by low quality:%9d\n", trash_Q);
504 +       if (trash_poly>0)
505 +         GMessage("   Trashed by poly-A/T:%9d\n", trash_poly);
506         if (trash_A5>0)
507 <         GMessage(" Trashed by 5' adapter:%9d\n", trash_A5);
507 >         GMessage(" Trashed by 5' adaptor:%9d\n", trash_A5);
508         if (trash_A3>0)
509 <         GMessage(" Trashed by 3' adapter:%9d\n", trash_A3);
509 >         GMessage(" Trashed by 3' adaptor:%9d\n", trash_A3);
510         }
511      if (trashReport) {
512            FWCLOSE(freport);
# Line 595 | Line 514
514      FWCLOSE(f_out);
515      FWCLOSE(f_out2);
516     } //while each input file
517 <
517 > delete gxmem_l;
518 > delete gxmem_r;
519   //getc(stdin);
520   }
521  
# Line 610 | Line 530
530     const char* seq;
531     bool valid;
532     NData() {
533 +    seqlen=0;
534      NCount=0;
535      end5=0;
536      end3=0;
# Line 640 | Line 561
561       perc_N=(n*100.0)/(end5-end3+1);
562       }
563   };
564 <
564 >
565   static NData feat;
566   int perc_lenN=12; // incremental distance from ends, in percentage of
567            // sequence length, where N-trimming is done (default:12 %) (autolimited to 20)
568 <          
568 >
569   void N_analyze(int l5, int l3, int p5, int p3) {
570   /* assumes feat was filled properly */
571   int old_dif, t5,t3,v;
572   if (l3<l5+2 || p5>p3 ) {
573     feat.end5=l5+1;
574     feat.end3=l3+1;
575 <   return;
575 >   return;
576     }
577  
578   t5=feat.NPos[p5]-l5;
579   t3=l3-feat.NPos[p3];
580   old_dif=p3-p5;
581   v=(int)((((double)(l3-l5))*perc_lenN)/100);
582 < if (v>20) v=20; /* enforce N-search limit for very long reads */
582 > if (v>20) v=20; /* enforce N-search limit for very long reads */
583   if (t5 < v ) {
584     l5=feat.NPos[p5]+1;
585     p5++;
# Line 675 | Line 596
596             feat.end3=l3+1;
597             return;
598             }
599 <    else
599 >    else
600        N_analyze(l5,l3, p5,p3);
601   }
602  
# Line 716 | Line 637
637   feat.init(rseq);
638   l5=feat.end5-1;
639   l3=feat.end3-1;
640 < N_analyze(feat.end5-1, feat.end3-1, 0, feat.NCount-1);
640 > N_analyze(feat.end5-1, feat.end3-1, 0, feat.NCount-1);
641   if (l5==feat.end5-1 && l3==feat.end3-1) {
642      if (feat.perc_N>max_perc_N) {
643             feat.valid=false;
# Line 734 | Line 655
655     return true;
656     }
657   feat.N_calc();
658 <
658 >
659   if (feat.perc_N>max_perc_N) {
660        feat.valid=false;
661        l3=l5+1;
# Line 746 | Line 667
667   //--------------- dust functions ----------------
668   class DNADuster {
669   public:
670 <  int dustword;
671 <  int dustwindow;
672 <  int dustwindow2;
670 >  int dustword;
671 >  int dustwindow;
672 >  int dustwindow2;
673    int dustcutoff;
674    int mv, iv, jv;
675    int counts[32*32*32];
# Line 843 | Line 764
764                      }
765             }
766           }
767 < //return first;
767 > //return first;
768   }
769   };
770  
# Line 861 | Line 782
782   return ncount;
783   }
784  
785 + struct SLocScore {
786 +  int pos;
787 +  int score;
788 +  SLocScore(int p=0,int s=0) {
789 +    pos=p;
790 +    score=s;
791 +    }
792 +  void set(int p, int s) {
793 +    pos=p;
794 +    score=s;
795 +    }
796 +  void add(int p, int add) {
797 +    pos=p;
798 +    score+=add;
799 +    }
800 + };
801  
802 < // ------------------ adapter matching - simple k-mer seed & extend, no indels for now
803 < //when a k-mer match is found, simply try to extend the alignment using a drop-off scheme
804 < //check minimum score and
805 < //for 3' adapter trimming:
806 < //     require that the right end of the alignment for either the adaptor OR the read must be
807 < //     < 3 distance from its right end
808 < // for 5' adapter trimming:
809 < //     require that the left end of the alignment for either the adaptor OR the read must
810 < //     be at coordinate < 3 from start
811 <
812 < bool extendMatch(const char* a, int alen, int ai,
813 <                 const char* b, int blen, int bi, int mlen, int& l5, int& l3, CSegChain& segs, bool end5=false) {
814 < //so the alignment starts at ai in a, bi in b, with a perfect match of length mlen
815 < #ifdef DEBUG
816 < GStr dbg(b);
880 < #endif
881 < //if (debug) {
882 < //  GMessage(">> in %s\n\textending hit: %s at position %d\n", a, (dbg.substr(bi, mlen)).chars(), ai);
883 < //  }
884 < int a_l=ai; //alignment coordinates on a
885 < int a_r=ai+mlen-1;
886 < int b_l=bi; //alignment coordinates on b
887 < int b_r=bi+mlen-1;
888 < int ai_maxscore=ai;
889 < int bi_maxscore=bi;
890 < int score=mlen*a_m_score;
891 < int maxscore=score;
892 < int mism5score=a_mis_score;
893 < if (end5 && ai<(alen>>1)) mism5score-=2; // increase penalty for mismatches at 5' end
894 < //try to extend to the left first, if possible
895 < while (ai>0 && bi>0) {
896 <   ai--;
897 <   bi--;
898 <   score+= (a[ai]==b[bi])? a_m_score : mism5score;
899 <   if (score>maxscore) {
900 <       ai_maxscore=ai;
901 <       bi_maxscore=bi;
902 <       maxscore=score;
903 <       }
904 <     else if (maxscore-score>a_dropoff_score) break;
905 <   }
906 < a_l=ai_maxscore;
907 < b_l=bi_maxscore;
908 < //if (debug) GMessage("  after l-extend: %*s%s\t\t(score=%d)\n",a_l," ",dbg.substr(b_l,b_r-b_l+1).chars(),maxscore);
909 < //now extend to the right
910 < ai_maxscore=a_r;
911 < bi_maxscore=b_r;
912 < ai=a_r;
913 < bi=b_r;
914 < score=maxscore;
915 < //sometimes there are extra AAAAs at the end of the read, ignore those
916 < if (strcmp(&a[alen-4],"AAAA")==0) {
917 <    alen-=3;
918 <    while (a[alen-1]=='A' && alen>ai) alen--;
919 <    }
920 < while (ai<alen-1 && bi<blen-1) {
921 <   ai++;
922 <   bi++;
923 <   //score+= (a[ai]==b[bi])? a_m_score : a_mis_score;
924 <   if (a[ai]==b[bi]) { //match
925 <      score+=a_m_score;
926 <      if (ai>=alen-2) {
927 <           score+=a_m_score-(alen-ai-1);
928 <           }
802 > bool trim_poly3(GStr &seq, int &l5, int &l3, const char* poly_seed) {
803 > if (!doPolyTrim) return false;
804 > int rlen=seq.length();
805 > l5=0;
806 > l3=rlen-1;
807 > int32 seedVal=*(int32*)poly_seed;
808 > char polyChar=poly_seed[0];
809 > //assumes N trimming was already done
810 > //so a poly match should be very close to the end of the read
811 > // -- find the initial match (seed)
812 > int lmin=GMAX((rlen-16), 0);
813 > int li;
814 > for (li=rlen-4;li>lmin;li--) {
815 >   if (seedVal==*(int*)&(seq[li])) {
816 >      break;
817        }
930    else { //mismatch
931      score+=a_mis_score;
932      }  
933   if (score>maxscore) {
934       ai_maxscore=ai;
935       bi_maxscore=bi;
936       maxscore=score;
937       }
938     else if (maxscore-score>a_dropoff_score) break;
818     }
819 <  a_r=ai_maxscore;
820 <  b_r=bi_maxscore;
821 <  int a_ovh3=alen-a_r-1;
822 <  int b_ovh3=blen-b_r-1;
823 <  int mmovh3=(a_ovh3<b_ovh3)? a_ovh3 : b_ovh3;
824 <  int mmovh5=(a_l<b_l)? a_l : b_l;
825 <  //if (debug) GMessage("  after r-extend: %*s%s\t\t(score=%d)\n",a_l," ",dbg.substr(b_l,b_r-b_l+1).chars(),maxscore);
826 < #ifdef DEBUG
827 <  if (debug) GMessage("     extended to: %*s\n",a_r+1,dbg.substr(b_l,b_r-b_l+1).chars());
828 < #endif
829 <  if (maxscore>=a_min_score && mmovh3<2 && mmovh5<2) {
830 <     if (a_l<a_ovh3) {
831 <        //adapter closer to the left end (typical for 5' adapter)
832 <        l5=a_r+1;
833 <        l3=alen-1;
819 > if (li<=lmin) return false;
820 > //seed found, try to extend it both ways
821 > //extend right
822 > int ri=li+3;
823 > SLocScore loc(ri, poly_m_score<<2);
824 > SLocScore maxloc(loc);
825 > //extend right
826 > while (ri<rlen-1) {
827 >   ri++;
828 >   if (seq[ri]==polyChar) {
829 >                loc.add(ri,poly_m_score);
830 >                }
831 >   else if (seq[ri]=='N') {
832 >                loc.add(ri,0);
833 >                }
834 >   else { //mismatch
835 >        loc.add(ri,poly_mis_score);
836 >        if (maxloc.score-loc.score>poly_dropoff_score) break;
837          }
838 <      else {
839 <        //adapter matching at the right end (typical for 3' adapter)
840 <        l5=0;
959 <        l3=a_l-1;
960 <        }
961 <     return true;
962 <     }
963 < else { //keep this segment pair for later (gapped alignment)
964 <   segs.addSegPair(new CSegPair(a_l+1, a_r+1, b_l+1, b_r+1, maxscore));
965 <   //this will also update min & max coordinates in segs (segs.astart, .aend, .bstart, .bend)
966 <   }
967 <  //do not trim:
968 <  l5=0;
969 <  l3=alen-1;
970 <  return false;
971 < }
972 <
973 < /*
974 < int getWordValue(const char* s, int wlen) {
975 < int r=0;
976 < while (wlen--) { r+=(((int)s[wlen])<<wlen) }
977 < return r;
978 < }
979 < */
980 < int get3mer_value(const char* s) {
981 < return (s[0]<<16)+(s[1]<<8)+s[2];
982 < }
983 <
984 < int w3_match(int qv, const char* str, int slen, int start_index=0) {
985 < if (start_index>=slen || start_index<0) return -1;
986 < for (int i=start_index;i<slen-3;i++) {
987 <   int rv=get3mer_value(str+i);
988 <   if (rv==qv) return i;
989 <   }
990 < return -1;
991 < }
992 <
993 < int w3_rmatch(int qv, const char* str, int slen, int end_index=-1) {
994 < if (end_index>=slen) return -1;
995 < if (end_index<0) end_index=slen-1;
996 < for (int i=end_index-2;i>=0;i--) {
997 <   int rv=get3mer_value(str+i);
998 <   if (rv==qv) return i;
999 <   }
1000 < return -1;
1001 < }
1002 <
1003 < int fast4match(int32 qv, const char* str, int slen, int start_index=0) {
1004 < if (start_index>=slen || start_index<0) return -1;
1005 < for (int i=start_index;i<slen-4;i++) {
1006 <   int32* rv=(int32*)(str+i);
1007 <   if (*rv==qv) return i;
1008 <   }
1009 < return -1;
1010 < }
1011 <
1012 < int fast4rmatch(int32 qv, const char* str, int slen, int end_index=-1) {
1013 < if (end_index>=slen) return -1;
1014 < if (end_index<0) end_index=slen-1;
1015 < for (int i=end_index-3;i>=0;i--) {
1016 <   int32* rv=(int32*)(str+i);
1017 <   if (*rv==qv) return i;
1018 <   }
1019 < return -1;
1020 < }
1021 <
1022 < #ifdef DEBUG
1023 < void dbgPrintChain(CSegChain& chain, const char* aseq) {
1024 <  GStr s(aseq);
1025 <  for (int i=0;i<chain.Count();i++) {
1026 <   CSegPair& seg=*chain[i];
1027 <   GMessage("  dbg chain seg%d: %*s [%d-%d:%d-%d]\n",i,seg.a.start-1+seg.len(),
1028 <            s.substr(seg.b.start-1, seg.len()).chars(), seg.b.start,seg.b.end,seg.a.start,seg.a.end);
838 >   if (maxloc.score<=loc.score) {
839 >      maxloc=loc;
840 >      }
841     }
842 + ri=maxloc.pos;
843 + if (ri<rlen-6) return false; //no trimming wanted, too far from 3' end
844 + //ri = right boundary for the poly match
845 + //extend left
846 + loc.set(li, maxloc.score);
847 + maxloc.pos=li;
848 + while (li>0) {
849 +    li--;
850 +    if (seq[li]==polyChar) {
851 +                 loc.add(li,poly_m_score);
852 +                 }
853 +    else if (seq[li]=='N') {
854 +                 loc.add(li,0);
855 +                 }
856 +    else { //mismatch
857 +         loc.add(li,poly_mis_score);
858 +         if (maxloc.score-loc.score>poly_dropoff_score) break;
859 +         }
860 +    if (maxloc.score<=loc.score) {
861 +       maxloc=loc;
862 +       }
863 +    }
864 + li=maxloc.pos;
865 + if ((maxloc.score==poly_minScore && ri==rlen-1) ||
866 +    (maxloc.score>poly_minScore && ri>=rlen-3) ||
867 +    (maxloc.score>(poly_minScore*3) && ri>=rlen-8)) {
868 +  //trimming this li-ri match at 3' end
869 +    l3=li-1;
870 +    if (l3<0) l3=0;
871 +    return true;
872 +    }
873 + return false;
874   }
1031 #endif
1032 const char *pA_seed="AAAA";
1033 const char *pT_seed="TTTT";
1034 const int polyA_seed=*(int*)pA_seed;
1035 const int polyT_seed=*(int*)pT_seed;
875  
876 < bool trim_poly3(GStr &seq, int &l5, int &l3) {
876 > bool trim_poly5(GStr &seq, int &l5, int &l3, const char* poly_seed) {
877 > if (!doPolyTrim) return false;
878   int rlen=seq.length();
879   l5=0;
880   l3=rlen-1;
881 + int32 seedVal=*(int32*)poly_seed;
882 + char polyChar=poly_seed[0];
883   //assumes N trimming was already done
884   //so a poly match should be very close to the end of the read
1043 //TODO: should we attempt polyA and polyT at the same time, same end?
885   // -- find the initial match (seed)
886 < int rlo=GMAX((rlen-10), 0);
887 < int si;
888 < for (si=rlen-5;si>rlo;si--) {
889 <   if (polyA_seed==*(int*)&(seq.chars()+si)) {
886 > int lmax=GMIN(12, rlen-4);//how far from 5' end to look for 4-mer seeds
887 > int li;
888 > for (li=0;li<=lmax;li++) {
889 >   if (seedVal==*(int*)&(seq[li])) {
890        break;
891        }
892     }
893 < if (si<=rlo) return false;
894 < //seed found, try to extend it to left and right
895 < int score=4*match_score;
896 < max_rlo=rlo;
897 < rlo--;
898 < while (rlo>=0) {
899 <    rlo--;
900 <    score+=
901 <    
893 > if (li>lmax) return false;
894 > //seed found, try to extend it both ways
895 > //extend left
896 > int ri=li+3; //save rightmost base of the seed
897 > SLocScore loc(li, poly_m_score<<2);
898 > SLocScore maxloc(loc);
899 > while (li>0) {
900 >    li--;
901 >    if (seq[li]==polyChar) {
902 >                 loc.add(li,poly_m_score);
903 >                 }
904 >    else if (seq[li]=='N') {
905 >                 loc.add(li,0);
906 >                 }
907 >    else { //mismatch
908 >         loc.add(li,poly_mis_score);
909 >         if (maxloc.score-loc.score>poly_dropoff_score) break;
910 >         }
911 >    if (maxloc.score<=loc.score) {
912 >       maxloc=loc;
913 >       }
914      }
915 + li=maxloc.pos;
916 + if (li>5) return false; //no trimming wanted, too far from 5' end
917 + //li = right boundary for the poly match
918 +
919 + //extend right
920 + loc.set(ri, maxloc.score);
921 + maxloc.pos=ri;
922 + while (ri<rlen-1) {
923 +   ri++;
924 +   if (seq[ri]==polyChar) {
925 +                loc.add(ri,poly_m_score);
926 +                }
927 +   else if (seq[ri]=='N') {
928 +                loc.add(ri,0);
929 +                }
930 +   else { //mismatch
931 +        loc.add(ri,poly_mis_score);
932 +        if (maxloc.score-loc.score>poly_dropoff_score) break;
933 +        }
934 +   if (maxloc.score<=loc.score) {
935 +      maxloc=loc;
936 +      }
937 +   }
938 + ri=maxloc.pos;
939 + if ((maxloc.score==poly_minScore && li==0) ||
940 +     (maxloc.score>poly_minScore && li<2)
941 +     || (maxloc.score>(poly_minScore*3) && li<8)) {
942 +    //adjust l5 to reflect this trimming of 5' end
943 +    l5=ri+1;
944 +    if (l5>rlen-1) l5=rlen-1;
945 +    return true;
946 +    }
947 + return false;
948   }
949  
950 < bool trim_adapter3(GStr& seq, int&l5, int &l3) {
950 > bool trim_adaptor3(GStr& seq, int&l5, int &l3) {
951 > if (adaptors3.Count()==0) return false;
952   int rlen=seq.length();
953   l5=0;
954   l3=rlen-1;
955 < //first try a full match, we might get lucky
956 < int fi=-1;
957 < if ((fi=seq.index(adapter3))>=0) {
958 <   if (fi<rlen-fi-a3len) {//match is closer to the right end
959 <      l5=fi+a3len;
960 <      l3=rlen-1;
961 <      }
962 <    else {
963 <      l5=0;
964 <      l3=fi-1;
965 <      }
1079 <   return true;
1080 <   }
1081 < #ifdef DEBUG
1082 < if (debug) GMessage(">TRIM3 >>   Read: %s\n",seq.chars());
1083 < #endif
1084 <
1085 < //also, for fast detection of other adapter-only reads that start past
1086 < // the beginning of the adapter sequence, try to see if the first a3len-4
1087 < // bases of the read are a substring of the adapter
1088 < if (rlen>a3len-3) {
1089 <   GStr rstart=seq.substr(1,a3len-4);
1090 <   if ((fi=adapter3.index(rstart))>=0) {
1091 <     l3=rlen-1;
1092 <     l5=a3len-4;
1093 <     while (fi+l5<a3len && l5<l3 && adapter3[fi+l5]==seq[l5]) l5++;
1094 <     return true;
1095 <     }
1096 <  }
1097 < CSegChain a3segs; //no chains here, just an ordered collection of segment pairs
1098 <  //check the easy cases - 11 bases exact match at the end
1099 < int fdlen=11;
1100 <  if (a3len<16) {
1101 <   fdlen=a3len>>1;
1102 <   }
1103 < if (fdlen>4) {
1104 <     //check if we're lucky enough to have the last 11 bases of the read a part of the adapter
1105 <     GStr rstart=seq.substr(-fdlen-3,fdlen);
1106 <     if ((fi=adapter3.index(rstart))>=0) {
1107 < #ifdef DEBUG
1108 <       if (debug) GMessage("  W11match found: %*s\n", rlen-3, (adapter3.substr(fi,fdlen)).chars());
1109 < #endif
1110 <       if (extendMatch(seq.chars(), rlen, rlen-fdlen-3,
1111 <                     adapter3.chars(), a3len, fi,  fdlen, l5,l3, a3segs))
1112 <            return true;
1113 <       }
1114 <     //another easy case: first 11 characters of the adaptor found as a substring of the read
1115 <     GStr bstr=adapter3.substr(0, fdlen);
1116 <     if ((fi=seq.rindex(bstr))>=0) {
1117 < #ifdef DEBUG
1118 <       if (debug) GMessage("  A11match found: %*s\n", fi+fdlen, bstr.chars());
1119 < #endif
1120 <       if (extendMatch(seq.chars(), rlen, fi,
1121 <                     adapter3.chars(), a3len, 0,  fdlen, l5,l3, a3segs))
1122 <            return true;
1123 <       }
1124 <     } //tried to match 11 bases first
1125 <    
1126 < //no easy cases, so let's do the wmer hashing for the first 12 bases of the adaptor
1127 < //-- only extend if the match is in the 3' (ending) region of the read
1128 < int wordSize=3;
1129 < int hlen=12;
1130 < if (hlen>a3len-wordSize) hlen=a3len-wordSize;
1131 < int imin=rlen>>1; //last half of the read, left boundary for the wmer match
1132 < if (imin<a3len) { imin=GMIN(a3len, rlen-wordSize); }
1133 < imin=rlen-imin;
1134 < for (int iw=0;iw<hlen;iw++) {
1135 <   //int32* qv=(int32*)(adapter3.chars()+iw);
1136 <   int qv=get3mer_value(adapter3.chars()+iw);
1137 <   fi=-1;
1138 <   //while ((fi=fast4rmatch(*qv, seq.chars(), rlen, fi))>=0 && fi>=imin) {
1139 <   while ((fi=w3_rmatch(qv, seq.chars(), rlen, fi))>=0 && fi>=imin) {
1140 <     //GMessage(" ... fi=%d after w3_rmatch() (imin=%d)\n", fi, imin);
1141 <
1142 < #ifdef DEBUG
1143 <     if (debug) GMessage("    Wmatch found: %*s\n", fi+wordSize, (adapter3.substr(iw,wordSize)).chars());
1144 < #endif
1145 <     if (extendMatch(seq.chars(), rlen, fi, adapter3.chars(),
1146 <                   a3len, iw, wordSize, l5,l3, a3segs)) return true;
1147 <     fi--;
1148 <     if (fi<imin) break;
1149 <     }
1150 <   } //for each wmer in the first hlen bases of the adaptor
1151 < /*
1152 < //couldn't find a good trimming extension, hash 12 more bases of the adapter to collect more segment pairs there
1153 < //but only do this if we already have segment pairs collected in the last 12 bases of the adapter
1154 < if (a3segs.bstart>3 || a3segs.bend<(uint)(hlen-wordSize)) return false;
1155 < int hlen2=a3len-wordSize;
1156 < //if (hlen2>a3len-4) hlen2=a3len-4;
1157 < if (hlen2>hlen) {
1158 < #ifdef DEBUG
1159 <     if (debug && a3segs.Count()>0) {
1160 <        GMessage("  >>>>>2nd. hash: %s\n",seq.chars());
955 > bool trimmed=false;
956 > GStr wseq(seq);
957 > int wlen=rlen;
958 > GXSeqData seqdata;
959 > int numruns=revCompl ? 2 : 1;
960 >
961 > for (int ai=0;ai<adaptors3.Count();ai++) {
962 >   for (int r=0;r<numruns;r++) {
963 >     if (r) {
964 >          seqdata.update(adaptors3[ai].seqr.chars(), adaptors3[ai].seqr.length(),
965 >                 adaptors3[ai].pzr, wseq.chars(), wlen, adaptors3[ai].amlen);
966          }
967 < #endif
968 <     for (int iw=hlen;iw<hlen2;iw++) {
969 <         //int* qv=(int32 *)(adapter3.chars()+iw);
970 <         int qv=get3mer_value(adapter3.chars()+iw);
971 <         fi=-1;
972 <         //while ((fi=fast4rmatch(*qv, seq.chars(), rlen, fi))>=0 && fi>=imin) {
973 <         while ((fi=w3_rmatch(qv, seq.chars(), rlen, fi))>=0 && fi>=imin) {
974 <           extendMatch(seq.chars(), rlen, fi, adapter3.chars(),
975 <                         a3len, iw, wordSize, l5,l3, a3segs);
976 <           fi--;
977 <           if (fi<imin) break;
978 <           }
979 <         } //for each wmer between hlen2 and hlen bases of the adaptor
980 <     }
981 < //lastly, analyze collected a3segs for a possible gapped alignment:
982 < GList<CSegChain> segchains(false,true,false);
983 < #ifdef DEBUG
984 < if (debug && a3segs.Count()>0) {
985 <   GMessage(">>>>>>>>>   Read: %s\n",seq.chars());
986 <   }
987 < #endif
988 < for (int i=0;i<a3segs.Count();i++) {
989 <   if (a3segs[i]->chain==NULL) {
990 <       if (a3segs[i]->b.start>3) continue; //don't start a hopeless chain
991 <       CSegChain* newchain=new CSegChain();
992 <       newchain->setFreeItem(false);
1188 <       newchain->addSegPair(a3segs[i]);
1189 <       a3segs[i]->chain=newchain;
1190 <       segchains.Add(newchain); //just to free them when done
1191 <       }
1192 <   for (int j=i+1;j<a3segs.Count();j++) {
1193 <      CSegChain* chain=a3segs[i]->chain;
1194 <      if (chain->extendChain(a3segs[j])) {
1195 <          a3segs[j]->chain=chain;
1196 < #ifdef DEBUG
1197 <          if (debug) dbgPrintChain(*chain, adapter3.chars());
1198 < #endif      
1199 <          //save time by checking here if the extended chain is already acceptable for trimming
1200 <          if (chain->aend>(uint)(rlen-4) && chain->bstart<4 && chain->score>a_min_chain_score) {
1201 <            l5=0;
1202 <            l3=chain->astart-2;
1203 < #ifdef DEBUG
1204 <          if (debug && a3segs.Count()>0) {
1205 <            GMessage(">>> >> trimmed-3: %*s\n",l3-l5+1,seq.substr(l5,l3-l5+1).chars());
1206 <            }
1207 < #endif
1208 <            return true;
1209 <            }
1210 <          } //chain can be extended
1211 <      }
1212 <   } //collect segment alignments into chains
1213 < */  
1214 < return false; //no adapter parts found
967 >     else {
968 >            seqdata.update(adaptors3[ai].seq.chars(), adaptors3[ai].seq.length(),
969 >                 adaptors3[ai].pz, wseq.chars(), wlen, adaptors3[ai].amlen);
970 >        }
971 >
972 >  GXAlnInfo* aln=match_adaptor(seqdata, adaptors3[ai].trim_type, gxmem_r, 86);
973 >  if (aln) {
974 >     trimmed=true;
975 >     //keep unmatched region on the left OR right (the longer one)
976 >     if (aln->sl > wlen-aln->sr) {
977 >         //keep left side
978 >         l3-=(wlen-aln->sl+1);
979 >         if (l3<0) l3=0;
980 >         }
981 >     else { //keep right side
982 >         l5+=aln->sr;
983 >         if (l5>=rlen) l5=rlen-1;
984 >         }
985 >     delete aln;
986 >     if (l3-l5+1<min_read_len) return true;
987 >     wseq=seq.substr(l5,l3-l5+1);
988 >     wlen=wseq.length();
989 >     }
990 >   }//forward and reverse adaptors
991 >  }//for each 3' adaptor
992 >  return trimmed;
993   }
994  
995 < bool trim_adapter5(GStr& seq, int&l5, int &l3) {
996 < //if (debug) GMessage("trim_adapter5 on: %s\n", seq.chars());
995 > bool trim_adaptor5(GStr& seq, int&l5, int &l3) {
996 > if (adaptors5.Count()==0) return false;
997   int rlen=seq.length();
998   l5=0;
999   l3=rlen-1;
1000 < //try to see if adapter is fully included in the read
1001 < int fi=-1;
1002 < if ((fi=seq.index(adapter5))>=0) {
1003 <   if (fi<rlen-fi-a5len) {//match is closer to the right end
1004 <      l5=fi+a5len;
1005 <      l3=rlen-1;
1006 <      }
1007 <    else {
1008 <      l5=0;
1009 <      l3=fi-1;
1010 <      }
1233 <   return true;
1234 <   }
1235 < #ifdef DEBUG
1236 < if (debug) GMessage(">TRIM5 >>   Read: %s\n",seq.chars());
1237 < #endif
1238 <
1239 < CSegChain a5segs(true); //list of segment pairs to analyze later if no extendMatch succeeded
1240 <
1241 < //try the easy way out first - look for an exact match of 11 bases
1242 < int fdlen=11;
1243 <  if (a5len<16) {
1244 <   fdlen=a5len>>1;
1245 <   }
1246 < if (fdlen>4) {
1247 <     GStr rstart=seq.substr(1,fdlen); //skip the first base as it's sometimes bogus
1248 <     if ((fi=adapter5.index(rstart))>=0) {
1249 < #ifdef DEBUG
1250 <       if (debug) GMessage("  W11match found: %*s\n", 1+fdlen, (adapter3.substr(fi,fdlen)).chars());
1251 < #endif
1252 <       if (extendMatch(seq.chars(), rlen, 1,
1253 <                     adapter5.chars(), a5len, fi,  fdlen, l5,l3, a5segs, true))
1254 <           return true;
1255 <       }
1256 <     //another easy case: last 11 characters of the adaptor found as a substring of the read
1257 <     GStr bstr=adapter5.substr(-fdlen);
1258 <     if ((fi=seq.index(bstr))>=0) {
1259 < #ifdef DEBUG
1260 <       if (debug) GMessage("  A11match found: %*s\n", fi+fdlen, bstr.chars());
1261 < #endif
1262 <       if (extendMatch(seq.chars(), rlen, fi,
1263 <                     adapter5.chars(), a5len, a5len-fdlen,  fdlen, l5,l3,a5segs,true))
1264 <          return true;
1265 <       }
1266 <     } //tried to matching at most 11 bases first
1267 <    
1268 < //-- no easy cases, do the wmer hashing for the last 12 bases of the adaptor
1269 < //-- only extend a wmer if it matches in the 5' (beginning) region of the read
1270 < int wordSize=3;
1271 < int hlen=12;
1272 < if (hlen>a5len-wordSize) hlen=a5len-wordSize;
1273 < int imax=rlen>>1; //first half of the read, right boundary for the wmer match
1274 < if (imax<a5len) { imax=GMIN(a5len, rlen-wordSize); }
1275 < for (int iw=0;iw<=hlen;iw++) {
1276 <   int apstart=a5len-iw-wordSize;
1277 <   fi=0;
1278 <   //int* qv=(int32 *)(adapter5.chars()+apstart);
1279 <   int qv=get3mer_value(adapter5.chars()+apstart);
1280 <   //while ((fi=fast4match(*qv, seq.chars(), rlen, fi))>=0 && fi<=imax) {
1281 <   while ((fi=w3_match(qv, seq.chars(), rlen, fi))>=0 && fi<=imax) {
1282 < #ifdef DEBUG
1283 <     if (debug) GMessage("    Wmatch found: %*s\n", fi+wordSize, (adapter5.substr(apstart,wordSize)).chars());
1284 < #endif
1285 <     if (extendMatch(seq.chars(), rlen, fi, adapter5.chars(),
1286 <                a5len, apstart, wordSize, l5,l3, a5segs, true)) return true;
1287 <     fi++;
1288 <     if (fi>imax) break;
1289 <     }
1290 <   } //for each wmer in the last hlen bases of the adaptor
1291 < /*
1292 <
1293 < //couldn't find a good trimming extension, hash 12 more bases of the adapter to collect more segment pairs there
1294 < //but only do this if we already have segment pairs collected in the last 12 bases of the adapter
1295 < if (a5segs.bend<(uint)(a5len-3) || a5segs.bstart>(uint)(a5len-hlen+4)) return false;
1296 < int hlen2=a5len-wordSize;
1297 < //if (hlen2>a5len-wordSize) hlen2=a5len-wordSize;
1298 < #ifdef DEBUG
1299 <      if (debug && a5segs.Count()>0) {
1300 <        GMessage("  >>>>>2nd. hash: %s\n",seq.chars());
1000 > bool trimmed=false;
1001 > GStr wseq(seq);
1002 > int wlen=rlen;
1003 > GXSeqData seqdata;
1004 > int numruns=revCompl ? 2 : 1;
1005 > GList<GXAlnInfo> bestalns(true, true, false);
1006 > for (int ai=0;ai<adaptors5.Count();ai++) {
1007 >   for (int r=0;r<numruns;r++) {
1008 >     if (r) {
1009 >          seqdata.update(adaptors5[ai].seqr.chars(), adaptors5[ai].seqr.length(),
1010 >                 adaptors5[ai].pzr, wseq.chars(), wlen, adaptors5[ai].amlen);
1011          }
1012 < #endif
1013 < if (hlen2>hlen) {
1014 <     for (int iw=hlen+1;iw<=hlen2;iw++) {
1015 <         int apstart=a5len-iw-wordSize;
1016 <         fi=0;
1017 <         //int* qv=(int32 *)(adapter5.chars()+apstart);
1018 <         int qv=get3mer_value(adapter5.chars()+apstart);
1019 <         //while ((fi=fast4match(*qv, seq.chars(), rlen, fi))>=0 && fi<=imax) {
1020 <         while ((fi=w3_match(qv, seq.chars(), rlen, fi))>=0 && fi<=imax) {
1021 <           extendMatch(seq.chars(), rlen, fi, adapter5.chars(),
1022 <                      a5len, apstart, wordSize, l5,l3, a5segs, true);
1023 <           fi++;
1024 <           if (fi>imax) break;
1025 <           }
1026 <         } //for each wmer between hlen2 and hlen bases of the adaptor
1012 >     else {
1013 >            seqdata.update(adaptors5[ai].seq.chars(), adaptors5[ai].seq.length(),
1014 >                 adaptors5[ai].pz, wseq.chars(), wlen, adaptors5[ai].amlen);
1015 >        }
1016 >         GXAlnInfo* aln=match_adaptor(seqdata, adaptors5[ai].trim_type, gxmem_l, 90);
1017 >         if (aln) {
1018 >           if (aln->strong) {
1019 >                   trimmed=true;
1020 >                   bestalns.Add(aln);
1021 >                   break; //will check the rest next time
1022 >                   }
1023 >            else bestalns.Add(aln);
1024 >           }
1025 >         } //forward and reverse?
1026 >   if (trimmed) break; //will check the rest in the next cycle
1027 >  }//for each 5' adaptor
1028 >  if (bestalns.Count()>0) {
1029 >           GXAlnInfo* aln=bestalns[0];
1030 >           if (aln->sl-1 > wlen-aln->sr) {
1031 >                   //keep left side
1032 >                   l3-=(wlen-aln->sl+1);
1033 >                   if (l3<0) l3=0;
1034 >                   }
1035 >           else { //keep right side
1036 >                   l5+=aln->sr;
1037 >                   if (l5>=rlen) l5=rlen-1;
1038 >                   }
1039 >           //delete aln;
1040 >           //if (l3-l5+1<min_read_len) return true;
1041 >           wseq=seq.substr(l5,l3-l5+1);
1042 >           wlen=wseq.length();
1043 >           return true; //break the loops here to report a good find
1044       }
1045 < if (a5segs.bend<(uint)(a5len-3) || a5segs.bstart>(uint)(a5len-hlen+4)) return false;
1046 < // lastly, analyze collected a5segs for a possible gapped alignment:
1320 < GList<CSegChain> segchains(false,true,false);
1321 < #ifdef DEBUG
1322 < if (debug && a5segs.Count()>0) {
1323 <   GMessage(">>>>>>>>>   Read: %s\n",seq.chars());
1324 <   }
1325 < #endif
1326 < for (int i=0;i<a5segs.Count();i++) {
1327 <   if (a5segs[i]->chain==NULL) {
1328 <       if (a5segs[i]->b.end<(int)(a5len-4)) continue; //don't start a hopeless chain
1329 <       CSegChain* newchain=new CSegChain(true);
1330 <       newchain->setFreeItem(false);
1331 <       newchain->addSegPair(a5segs[i]);
1332 <       a5segs[i]->chain=newchain;
1333 <       segchains.Add(newchain); //just to free them when done
1334 <       }
1335 <   for (int j=i+1;j<a5segs.Count();j++) {
1336 <      CSegChain* chain=a5segs[i]->chain;
1337 <      if (chain->extendChain(a5segs[j])) {
1338 <         a5segs[j]->chain=chain;
1339 < #ifdef DEBUG
1340 <         if (debug) dbgPrintChain(*chain, adapter5.chars());
1341 < #endif      
1342 <      //save time by checking here if the extended chain is already acceptable for trimming
1343 <         if (chain->bend>(uint)(a5len-3) && chain->astart<4 && chain->score>a_min_chain_score) {
1344 <            l5=chain->aend;
1345 <            l3=rlen-1;
1346 <            return true;
1347 <            }
1348 <         } //chain can be extended
1349 <      }
1350 <   } //collect segment alignments into chains
1351 < */
1352 < return false; //no adapter parts found
1353 < }
1045 >  return false;
1046 > }
1047  
1048 < //convert qvs to/from phred64 from/to phread33
1048 > //convert qvs to/from phred64 from/to phread33
1049   void convertPhred(GStr& q) {
1050   for (int i=0;i<q.length();i++) q[i]+=qv_cvtadd;
1051   }
# Line 1361 | Line 1054
1054   for (int i=0;i<len;i++) q[i]+=qv_cvtadd;
1055   }
1056  
1057 < bool getFastxRec(GLineReader& fq, GStr& rseq, GStr& rqv,
1057 > bool getFastxRec(GLineReader& fq, GStr& rseq, GStr& rqv,
1058            GStr& rname, GStr& rinfo, GStr& infname) {
1059   rseq="";
1060   rqv="";
# Line 1377 | Line 1070
1070        } //raw qseq format
1071   else { // FASTQ or FASTA */
1072   isfasta=(l[0]=='>');
1073 < if (!isfasta && l[0]!='@') GError("Error: fasta/fastq record marker not found(%s)\n%s\n",
1073 > if (!isfasta && l[0]!='@') GError("Error: fasta/fastq record marker not found(%s)\n%s\n",
1074        infname.chars(), l);
1075   GStr s(l);
1076   rname=&(l[1]);
1077   for (int i=0;i<rname.length();i++)
1078 <    if (rname[i]<=' ') {
1079 <       if (i<rname.length()-2) rinfo=rname.substr(i+1);
1080 <       rname.cut(i);
1081 <       break;
1078 >    if (rname[i]<=' ') {
1079 >       if (i<rname.length()-2) rinfo=rname.substr(i+1);
1080 >       rname.cut(i);
1081 >       break;
1082         }
1083    //now get the sequence
1084 < if ((l=fq.getLine())==NULL)
1084 > if ((l=fq.getLine())==NULL)
1085        GError("Error: unexpected EOF after header for read %s (%s)\n",
1086                     rname.chars(), infname.chars());
1087   rseq=l; //this must be the DNA line
1088   while ((l=fq.getLine())!=NULL) {
1089        //seq can span multiple lines
1090        if (l[0]=='>' || l[0]=='+') {
1091 <           fq.pushBack();
1091 >           fq.pushBack();
1092             break; //
1093             }
1094        rseq+=l;
1095 <      } //check for multi-line seq
1095 >      } //check for multi-line seq
1096   if (!isfasta) { //reading fastq quality values, which can also be multi-line
1097      if ((l=fq.getLine())==NULL)
1098          GError("Error: unexpected EOF after sequence for %s\n", rname.chars());
1099      if (l[0]!='+') GError("Error: fastq qv header marker not detected!\n");
1100 <    if ((l=fq.getLine())==NULL)
1100 >    if ((l=fq.getLine())==NULL)
1101          GError("Error: unexpected EOF after qv header for %s\n", rname.chars());
1102      rqv=l;
1103 <    //if (rqv.length()!=rseq.length())
1103 >    //if (rqv.length()!=rseq.length())
1104      //  GError("Error: qv len != seq len for %s\n", rname.chars());
1105      while (rqv.length()<rseq.length() && ((l=fq.getLine())!=NULL)) {
1106        rqv+=l; //append to qv string
1107        }
1108      }// fastq
1109   // } //<-- FASTA or FASTQ
1110 < rseq.upper(); //TODO: what if we care about masking?
1110 > rseq.upper();
1111   return true;
1112   }
1113  
1114 + #ifdef GDEBUG
1115 + void showTrim(GStr& s, int l5, int l3) {
1116 +  if (l5>0 || l3==0) {
1117 +    color_bg(c_red);
1118 +    }
1119 +  for (int i=0;i<s.length()-1;i++) {
1120 +    if (i && i==l5) color_resetbg();
1121 +    fprintf(stderr, "%c", s[i]);
1122 +    if (i && i==l3) color_bg(c_red);
1123 +   }
1124 +  fprintf(stderr, "%c", s[s.length()-1]);
1125 +  color_reset();
1126 +  fprintf(stderr, "\n");
1127 + }
1128 + #endif
1129 +
1130   char process_read(GStr& rname, GStr& rseq, GStr& rqv, int &l5, int &l3) {
1131 < //returns 0 if the read was untouched, 1 if it was just trimmed
1131 > //returns 0 if the read was untouched, 1 if it was just trimmed
1132   // and a trash code if it was trashed
1133   l5=0;
1134   l3=rseq.length()-1;
1135 + #ifdef GDEBUG
1136 +   //rseq.reverse();
1137 +   GMessage(">%s\n", rname.chars());
1138 +   GMessage("%s\n",rseq.chars());
1139 + #endif
1140   if (l3-l5+1<min_read_len) {
1141     return 's';
1142     }
# Line 1458 | Line 1172
1172     w5=0;
1173     w3=wseq.length()-1;
1174     }
1175 < if (a3len>0) {
1176 <  if (trim_adapter3(wseq, w5, w3)) {
1175 > char trim_code;
1176 > //clean the more dirty end first - 3'
1177 > int prev_w5=0;
1178 > int prev_w3=0;
1179 > bool w3upd=true;
1180 > bool w5upd=true;
1181 > do {
1182 >  trim_code=0;
1183 >  if (w3upd && trim_poly3(wseq, w5, w3, polyA_seed)) {
1184 >      trim_code='A';
1185 >      }
1186 >  else if (w3upd && trim_poly3(wseq, w5, w3, polyT_seed)) {
1187 >      trim_code='T';
1188 >      }
1189 >  else if (w5upd && trim_poly5(wseq, w5, w3, polyA_seed)) {
1190 >      trim_code='A';
1191 >      }
1192 >  else if (w5upd && trim_poly5(wseq, w5, w3, polyT_seed)) {
1193 >      trim_code='T';
1194 >      }
1195 >  else if (trim_adaptor5(wseq, w5, w3)) {
1196 >      trim_code='5';
1197 >      }
1198 >  else if (trim_adaptor3(wseq, w5, w3)) {
1199 >      trim_code='3';
1200 >      }
1201 >  if (trim_code) {
1202 >     w3upd=(w3!=prev_w3);
1203 >         w5upd=(w5!=prev_w5);
1204 >         if (w3upd) prev_w3=w3;
1205 >         if (w5upd) prev_w5=w5;
1206 >   #ifdef GDEBUG
1207 >     GMessage("#### TRIM by '%c' code ( w5-w3 = %d-%d ):\n",trim_code, w5,w3);
1208 >     showTrim(wseq, w5, w3);
1209 >   #endif
1210       int trimlen=wseq.length()-(w3-w5+1);
1211       num_trimmed3++;
1212 <     if (trimlen<min_trimmed3)
1212 >     if (trimlen<min_trimmed3)
1213           min_trimmed3=trimlen;
1214       l5+=w5;
1215       l3-=(wseq.length()-1-w3);
1216       if (w3-w5+1<min_read_len) {
1217 <         return '3';
1217 >         return trim_code;
1218           }
1219        //-- keep only the w5..w3 range
1220        wseq=wseq.substr(w5, w3-w5+1);
1221        if (!wqv.is_empty())
1222           wqv=wqv.substr(w5, w3-w5+1);
1223 <      }//some adapter was trimmed
1224 <   } //adapter trimming
1225 < if (a5len>0) {
1479 <  if (trim_adapter5(wseq, w5, w3)) {
1480 <     int trimlen=wseq.length()-(w3-w5+1);
1481 <     num_trimmed5++;
1482 <     if (trimlen<min_trimmed5)
1483 <         min_trimmed5=trimlen;
1484 <     l5+=w5;
1485 <     l3-=(wseq.length()-1-w3);
1486 <     if (w3-w5+1<min_read_len) {
1487 <         return '5';
1488 <         }
1489 <      //-- keep only the w5..w3 range
1490 <      wseq=wseq.substr(w5, w3-w5+1);
1491 <      if (!wqv.is_empty())
1492 <         wqv=wqv.substr(w5, w3-w5+1);
1493 <      }//some adapter was trimmed
1494 <   } //adapter trimming
1223 >      }//trimming at 3' end
1224 > } while (trim_code);
1225 >
1226   if (doCollapse) {
1227     //keep read for later
1228     FqDupRec* dr=dhash.Find(wseq.chars());
1229     if (dr==NULL) { //new entry
1230 <          //if (prefix.is_empty())
1231 <             dhash.Add(wseq.chars(),
1230 >          //if (prefix.is_empty())
1231 >             dhash.Add(wseq.chars(),
1232                    new FqDupRec(&wqv, rname.chars()));
1233            //else dhash.Add(wseq.chars(), new FqDupRec(wqv.chars(),wqv.length()));
1234           }
# Line 1531 | Line 1262
1262         fprintf(f_out, "%s\n", rseq.chars()); //plain one-line fasta for now
1263         }
1264        else {
1265 <       fprintf(f_out, ">%s%08d\n%s\n", prefix.chars(), outcounter,
1265 >       fprintf(f_out, ">%s%08d\n%s\n", prefix.chars(), outcounter,
1266                            rseq.chars());
1267         }
1268       }
# Line 1542 | Line 1273
1273         fprintf(f_out, "%s\n+\n%s\n", rseq.chars(), rqv.chars());
1274         }
1275        else
1276 <       fprintf(f_out, "@%s_%08d\n%s\n+\n%s\n", prefix.chars(), outcounter,
1276 >       fprintf(f_out, "@%s_%08d\n%s\n+\n%s\n", prefix.chars(), outcounter,
1277                            rseq.chars(),rqv.chars() );
1278       }
1279   }
# Line 1550 | Line 1281
1281   void trash_report(char trashcode, GStr& rname, FILE* freport) {
1282   if (freport==NULL || trashcode<=' ') return;
1283   if (trashcode=='3' || trashcode=='5') {
1284 <   fprintf(freport, "%s\tA%c\n",rname.chars(),trashcode);
1284 >   fprintf(freport, "%s\ta%c\n",rname.chars(),trashcode);
1285     }
1286   else {
1287     fprintf(freport, "%s\t%c\n",rname.chars(),trashcode);
# Line 1642 | Line 1373
1373      }
1374   }
1375  
1376 < void setupFiles(FILE*& f_in, FILE*& f_in2, FILE*& f_out, FILE*& f_out2,
1377 <                       GStr& s, GStr& infname, GStr& infname2) {
1376 > void addAdaptor(GVec<CASeqData>& adaptors, GStr& seq, GAlnTrimType trim_type) {
1377 > //TODO: prepare CASeqData here, and collect hexamers as well
1378 >  if (seq.is_empty() || seq=="-" ||
1379 >          seq=="N/A" || seq==".") return;
1380 >
1381 > CASeqData adata(revCompl);
1382 > int idx=adaptors.Add(adata);
1383 > if (idx<0) GError("Error: failed to add adaptor!\n");
1384 > adaptors[idx].trim_type=trim_type;
1385 > adaptors[idx].update(seq.chars());
1386 > }
1387 >
1388 >
1389 > int loadAdaptors(const char* fname) {
1390 >  GLineReader lr(fname);
1391 >  char* l;
1392 >  while ((l=lr.nextLine())!=NULL) {
1393 >   if (lr.length()<=3 || l[0]=='#') continue;
1394 >   if ( l[0]==' ' || l[0]=='\t' || l[0]==',' ||
1395 >        l[0]==';'|| l[0]==':' ) {
1396 >      int i=1;
1397 >      while (l[i]!=0 && isspace(l[i])) {
1398 >        i++;
1399 >        }
1400 >      if (l[i]!=0) {
1401 >        GStr s(&(l[i]));
1402 >      #ifdef GDEBUG
1403 >          //s.reverse();
1404 >      #endif
1405 >        addAdaptor(adaptors3, s, galn_TrimRight);
1406 >        continue;
1407 >        }
1408 >      }
1409 >    else {
1410 >      GStr s(l);
1411 >      s.startTokenize("\t ;,:");
1412 >      GStr a5,a3;
1413 >      if (s.nextToken(a5))
1414 >            s.nextToken(a3);
1415 >        else continue; //no tokens on this line
1416 >      GAlnTrimType ttype5=galn_TrimLeft;
1417 >      a5.upper();
1418 >      a3.upper();
1419 >      if (a3.is_empty() || a3==a5 || a3=="=") {
1420 >         a3.clear();
1421 >         ttype5=galn_TrimEither;
1422 >         }
1423 >     #ifdef GDEBUG
1424 >     //   a5.reverse();
1425 >     //   a3.reverse();
1426 >     #endif
1427 >      addAdaptor(adaptors5, a5, ttype5);
1428 >      addAdaptor(adaptors3, a3, galn_TrimRight);
1429 >      }
1430 >   }
1431 >   return adaptors5.Count()+adaptors3.Count();
1432 > }
1433 >
1434 > void setupFiles(FILE*& f_in, FILE*& f_in2, FILE*& f_out, FILE*& f_out2,
1435 >                       GStr& s, GStr& infname, GStr& infname2) {
1436   // uses outsuffix to generate output file names and open file handles as needed
1437   infname="";
1438   infname2="";
# Line 1671 | Line 1460
1460   s.startTokenize(",:");
1461   s.nextToken(infname);
1462   bool paired=s.nextToken(infname2);
1463 < if (fileExists(infname.chars())==0)
1463 > if (fileExists(infname.chars())==0)
1464      GError("Error: cannot find file %s!\n",infname.chars());
1465   GStr fname(getFileName(infname.chars()));
1466   GStr picmd;
# Line 1693 | Line 1482
1482   if (!paired) return;
1483   if (doCollapse) GError("Error: sorry, -C option cannot be used with paired reads!\n");
1484   // ---- paired reads:-------------
1485 < if (fileExists(infname2.chars())==0)
1485 > if (fileExists(infname2.chars())==0)
1486       GError("Error: cannot find file %s!\n",infname2.chars());
1487   picmd="";
1488   GStr fname2(getFileName(infname2.chars()));

Diff Legend

Removed lines
+ Added lines
< Changed lines
> Changed lines