ViewVC Help
View File | Revision Log | Show Annotations | View Changeset | Root Listing
root/gclib/fqtrim/fqtrim.cpp
(Generate patch)
# Line 4 | Line 4
4   #include "GList.hh"
5   #include <ctype.h>
6   #include "GAlnExtend.h"
7 + #include <inttypes.h>
8  
9   #define USAGE "Usage:\n\
10 < fqtrim [{-5 <5adapter> -3 <3adapter>|-f <adapters_file>}] [-a <min_matchlen>]\\\n\
10 > fqtrim [{-5 <5adaptor> -3 <3adaptor>|-f <adaptors_file>}] [-a <min_match>]\\\n\
11     [-R] [-q <minq> [-t <trim_max_len>]] [-p {64|33}] [-o <outsuffix>]\\\n\
12     [-l <minlen>] [-C] [-D] [-Q] [-n <rename_prefix>] [-r <discarded.lst>]\\\n\
13 <    <input.fq>[,<input_mates.fq>\n\
13 >   [-a <min_poly>] [-A] <input.fq>[,<input_mates.fq>\n\
14   \n\
15 < Trim low quality bases at the 3' end and can trim adapter sequence(s), filter\n\
15 > Trim low quality bases at the 3' end and can trim adaptor sequence(s), filter\n\
16   for low complexity and collapse duplicate reads.\n\
17   If read pairs should be trimmed and kept together (i.e. without discarding\n\
18   one read in a pair), the two file names should be given delimited by a comma\n\
# Line 25 | Line 26
26      file(s) named <input>.<outsuffix> which will be created in the \n\
27      current (working) directory; (writes to stdout if -o- is given);\n\
28      a suffix ending with .gz, .gzip or .bz2 will enforce compression\n\
29 < -f  file with adapter sequences to trim, each line having this format:\n\
30 <    <5'-adapter-sequence> <3'-adapter-sequence>\n\
31 < -5  trim the given adapter or primer sequence at the 5' end of each read\n\
29 > -f  file with adaptor sequences to trim, each line having this format:\n\
30 >    <5'-adaptor-sequence> <3'-adaptor-sequence>\n\
31 > -5  trim the given adaptor or primer sequence at the 5' end of each read\n\
32      (e.g. -5 CGACAGGTTCAGAGTTCTACAGTCCGACGATC)\n\
33 < -3  trim the given adapter sequence at the 3' end of each read\n\
33 > -3  trim the given adaptor sequence at the 3' end of each read\n\
34      (e.g. -3 TCGTATGCCGTCTTCTGCTTG)\n\
35 + -a  minimum exact match to adaptor sequence to trim at the end (6)\n\
36   -A  disable polyA/T trimming (enabled by default)\n\
37 < -y  minimum length of exact match to adaptor sequence at the proper end (6)\n\
37 > -y  minimum length of poly-A/T run to remove (6)\n\
38   -q  trim bases with quality value lower than <minq> (starting at the 3' end)\n\
39   -t  for -q option, maximum trimming at the 3' end is limited to <trim_max_len>\n\
40   -m  maximum percentage of Ns allowed in a read after trimming (default 7)\n\
# Line 41 | Line 43
43   -r  write a simple \"trash report\" file listing the discarded reads with a\n\
44      one letter code for the reason why they were trashed\n\
45   -D  apply a low-complexity (dust) filter and discard any read that has over\n\
46 <    50% of its length detected as low complexity\n\
46 >    50%% of its length detected as low complexity\n\
47   -C  collapse duplicate reads and add a prefix with their count to the read\n\
48      name (e.g. for microRNAs)\n\
49   -p  input is phred64/phred33 (use -p64 or -p33)\n\
50   -Q  convert quality values to the other Phred qv type\n\
51 < -V  verbose processing\n\
51 > -V  be verbose when done (print summary counts)\n\
52   "
53  
54   //-z  for -o option, the output stream(s) will be first piped into the given\n
55   //   <zcmd> command, which must output to stdout (e.g. -z 'bzip2 -9 -c')\n
56  
57  
58 < // example 3' adapter for miRNAs: TCGTATGCCGTCTTCTGCTTG
58 > // example 3' adaptor for miRNAs: TCGTATGCCGTCTTCTGCTTG
59  
60   //For paired reads sequencing:
61   //3' : ACACTCTTTCCCTACACGACGCTCTTCCGATCT
# Line 81 | Line 83
83   GStr outsuffix; // -o
84   GStr prefix;
85   GStr zcmd;
86 < int num_trimmed5=0;
87 < int num_trimmed3=0;
88 < int num_trimmedN=0;
89 < int num_trimmedQ=0;
90 < int min_trimmed5=INT_MAX;
91 < int min_trimmed3=INT_MAX;
86 > char isACGT[256];
87 >
88 > uint num_trimN=0; //reads trimmed by N%
89 > uint num_trimQ=0; //reads trimmed by qv threshold
90 > uint num_trimV=0; //reads trimmed by adapter match
91 > uint num_trimA=0; //reads trimmed by polyA
92 > uint num_trimT=0; //reads trimmed by polyT
93 > uint num_trim5=0; //number of reads trimmed at 5' end
94 > uint num_trim3=0; //number of reads trimmed at 3' end
95 >
96 > uint64 b_totalIn=0; //total number of input bases
97 > uint64 b_totalN=0;  //total number of undetermined bases found in input bases
98 >
99 > uint64 b_trimN=0; //total number of bases trimmed due to N-trim
100 > uint64 b_trimQ=0; //number of bases trimmed due to qv threshold
101 > uint64 b_trimV=0; //total number of bases trimmed due to adaptor matches
102 > uint64 b_trimA=0; //number of bases trimmed due to poly-A tails
103 > uint64 b_trimT=0; //number of bases trimmed due to poly-T tails
104 > uint64 b_trim5=0; //total bases trimmed on the 5' side
105 > uint64 b_trim3=0; //total bases trimmed on the 3' side
106 > //int min_trimmed5=INT_MAX;
107 > //int min_trimmed3=INT_MAX;
108  
109   int qvtrim_qmin=0;
110   int qvtrim_max=0;  //for -q, do not trim at 3'-end more than this number of bases
# Line 94 | Line 112
112   int qv_cvtadd=0; //could be -31 or +31
113  
114   // adaptor matching metrics -- for X-drop ungapped extension
115 < const int match_reward=2;
116 < const int mismatch_penalty=3;
117 < const int Xdrop=8;
115 > //const int match_reward=2;
116 > //const int mismatch_penalty=3;
117 > int match_reward=1;
118 > int mismatch_penalty=5;
119 > int Xdrop=16;
120 > int minEndAdapter=6;
121 >
122  
123   const int poly_m_score=2; //match score
124   const int poly_mis_score=-3; //mismatch
# Line 107 | Line 129
129   const char *polyT_seed="TTTT";
130  
131   struct CASeqData {
132 <   //positional data for every possible hexamer in an adapter
133 <   GVec<uint16>* pz[4096]; //0-based coordinates of all possible hexamers in the adapter sequence
134 <   GVec<uint16>* pzr[4096]; //0-based coordinates of all possible hexamers for the reverse complement of the adapter sequence
135 <   GStr seq; //actual adapter sequence data
132 >   //positional data for every possible hexamer in an adaptor
133 >   GVec<uint16>* pz[4096]; //0-based coordinates of all possible hexamers in the adaptor sequence
134 >   GVec<uint16>* pzr[4096]; //0-based coordinates of all possible hexamers for the reverse complement of the adaptor sequence
135 >   GStr seq; //actual adaptor sequence data
136     GStr seqr; //reverse complement sequence
137 +   int amlen; //fraction of adaptor length matching that's
138 +              //enough to consider the alignment
139 +   GAlnTrimType trim_type;
140     bool use_reverse;
141 <   CASeqData(bool rev=false):seq(),seqr() {
142 <     use_reverse=rev;
141 >   CASeqData(bool rev=false):seq(),seqr(),
142 >             amlen(0), use_reverse(rev) {
143 >     trim_type=galn_None; //should be updated later!
144       for (int i=0;i<4096;i++) {
145          pz[i]=NULL;
146          pzr[i]=NULL;
# Line 122 | Line 148
148       }
149  
150     void update(const char* s) {
151 <   seq=s;
152 <   table6mers(seq.chars(), seq.length(), pz);
153 <   if (!use_reverse) return;
154 <   //reverse complement
155 <   seqr=s;
156 <   int slen=seq.length();
157 <   for (int i=0;i<slen;i++)
158 <     seqr[i]=ntComplement(seq[slen-i-1]);
159 <   table6mers(seqr.chars(), seqr.length(), pzr);
151 >         seq=s;
152 >         table6mers(seq.chars(), seq.length(), pz);
153 >         amlen=calc_safelen(seq.length());
154 >         if (!use_reverse) return;
155 >         //reverse complement
156 >         seqr=s;
157 >         int slen=seq.length();
158 >         for (int i=0;i<slen;i++)
159 >           seqr[i]=ntComplement(seq[slen-i-1]);
160 >         table6mers(seqr.chars(), seqr.length(), pzr);
161     }
162  
163     ~CASeqData() {
# Line 141 | Line 168
168     }
169   };
170  
171 < GVec<CASeqData> adapters5;
172 < GVec<CASeqData> adapters3;
171 > GVec<CASeqData> adaptors5;
172 > GVec<CASeqData> adaptors3;
173  
174   CGreedyAlignData* gxmem_l=NULL;
175   CGreedyAlignData* gxmem_r=NULL;
# Line 195 | Line 222
222  
223   GHash<FqDupRec> dhash; //hash to keep track of duplicates
224  
225 < void addAdapter(GVec<CASeqData>& adapters, GStr& seq);
226 < int loadAdapters(const char* fname);
225 > void addAdaptor(GVec<CASeqData>& adaptors, GStr& seq, GAlnTrimType trim_type);
226 > int loadAdaptors(const char* fname);
227  
228   void setupFiles(FILE*& f_in, FILE*& f_in2, FILE*& f_out, FILE*& f_out2,
229                         GStr& s, GStr& infname, GStr& infname2);
# Line 216 | Line 243
243   int dust(GStr& seq);
244   bool trim_poly5(GStr &seq, int &l5, int &l3, const char* poly_seed); //returns true if any trimming occured
245   bool trim_poly3(GStr &seq, int &l5, int &l3, const char* poly_seed);
246 < bool trim_adapter5(GStr& seq, int &l5, int &l3); //returns true if any trimming occured
247 < bool trim_adapter3(GStr& seq, int &l5, int &l3);
246 > bool trim_adaptor5(GStr& seq, int &l5, int &l3); //returns true if any trimming occured
247 > bool trim_adaptor3(GStr& seq, int &l5, int &l3);
248  
249   void convertPhred(char* q, int len);
250   void convertPhred(GStr& q);
251  
252   int main(int argc, char * const argv[]) {
253 <  GArgs args(argc, argv, "YQDCRVAl:d:3:5:m:n:r:p:q:f:t:o:z:a:");
253 >  GArgs args(argc, argv, "MATCH=MISMATCH=XDROP=YQDCRVAl:d:3:5:m:n:r:p:q:f:t:o:z:a:y:");
254    int e;
255    if ((e=args.isError())>0) {
256        GMessage("%s\nInvalid argument: %s\n", USAGE, argv[e]);
# Line 262 | Line 289
289    if (!s.is_empty()) {
290       qvtrim_max=s.asInt();
291       }
292 +  s=args.getOpt("MATCH");
293 +  if (!s.is_empty())
294 +            match_reward=s.asInt();
295 +  s=args.getOpt("MISMATCH");
296 +  if (!s.is_empty())
297 +            mismatch_penalty=s.asInt();
298 +  s=args.getOpt("XDROP");
299 +  if (!s.is_empty())
300 +            Xdrop=s.asInt();
301 +
302    s=args.getOpt('p');
303    if (!s.is_empty()) {
304       int v=s.asInt();
# Line 276 | Line 313
313         else
314           GMessage("%s\nInvalid value for -p option (can only be 64 or 33)!\n",USAGE);
315       }
316 +  memset((void*)isACGT, 0, 256);
317 +  isACGT['A']=isACGT['a']=isACGT['C']=isACGT['c']=1;
318 +  isACGT['G']=isACGT['g']=isACGT['T']=isACGT['t']=1;
319    s=args.getOpt('f');
320    if (!s.is_empty()) {
321 <   loadAdapters(s.chars());
321 >   loadAdaptors(s.chars());
322     }
323 <  bool fileAdapters=adapters5.Count()+adapters3.Count();
323 >  bool fileAdaptors=adaptors5.Count()+adaptors3.Count();
324    s=args.getOpt('5');
325    if (!s.is_empty()) {
326 <    if (fileAdapters)
326 >    if (fileAdaptors)
327        GError("Error: options -5 and -f cannot be used together!\n");
328      s.upper();
329 <    addAdapter(adapters5, s);
329 >    addAdaptor(adaptors5, s, galn_TrimLeft);
330      }
331    s=args.getOpt('3');
332    if (!s.is_empty()) {
333 <    if (fileAdapters)
333 >    if (fileAdaptors)
334        GError("Error: options -3 and -f cannot be used together!\n");
335        s.upper();
336 <      addAdapter(adapters3, s);
336 >      addAdaptor(adaptors3, s, galn_TrimRight);
337      }
338    s=args.getOpt('y');
339    if (!s.is_empty()) {
340       int minmatch=s.asInt();
341 <     poly_minScore=minmatch*poly_m_score;
341 >     if (minmatch>2)
342 >        poly_minScore=minmatch*poly_m_score;
343 >     else GMessage("Warning: invalid -y option, ignored.\n");
344 >     }
345 >  s=args.getOpt('a');
346 >  if (!s.is_empty()) {
347 >     int minmatch=s.asInt();
348 >     if (minmatch>2)
349 >        minEndAdapter=minmatch;
350 >     else GMessage("Warning: invalid -a option, ignored.\n");
351       }
352  
353 +
354    if (args.getOpt('o')!=NULL) outsuffix=args.getOpt('o');
355                           else outsuffix="-";
356    trashReport=  (args.getOpt('r')!=NULL);
# Line 318 | Line 368
368      openfw(freport, args, 'r');
369    char* infile=NULL;
370  
371 <  if (adapters5.Count()>0)
372 <    gxmem_l=new CGreedyAlignData(match_reward, mismatch_penalty, Xdrop-2);
373 <  if (adapters3.Count()>0)
371 >  if (adaptors5.Count()>0)
372 >    //gxmem_l=new CGreedyAlignData(match_reward, mismatch_penalty, Xdrop-2);
373 >        gxmem_l=new CGreedyAlignData(match_reward, mismatch_penalty, Xdrop);
374 >  if (adaptors3.Count()>0)
375      gxmem_r=new CGreedyAlignData(match_reward, mismatch_penalty, Xdrop);
376  
377    while ((infile=args.nextNonOpt())!=NULL) {
# Line 332 | Line 383
383      int trash_N=0;
384      int trash_D=0;
385      int trash_poly=0;
386 <    int trash_A3=0;
387 <    int trash_A5=0;
386 >    int trash_V=0;
387 >    num_trimN=0;
388 >    num_trimQ=0;
389 >    num_trimV=0;
390 >    num_trimA=0;
391 >    num_trimT=0;
392 >    num_trim5=0;
393 >    num_trim3=0;
394 >
395 >    b_totalIn=0;
396 >    b_totalN=0;
397 >    b_trimN=0;
398 >    b_trimQ=0;
399 >    b_trimV=0;
400 >    b_trimA=0;
401 >    b_trimT=0;
402 >    b_trim5=0;
403 >    b_trim3=0;
404 >
405      s=infile;
406      GStr infname;
407      GStr infname2;
# Line 356 | Line 424
424         int a5=0, a3=0, b5=0, b3=0;
425         char tcode=0, tcode2=0;
426         tcode=process_read(seqid, rseq, rqv, a5, a3);
427 +       if (a5>0) {
428 +          b_trim5+=a5;
429 +          num_trim5++;
430 +          }
431 +       if (a3<rseq.length()-1) {
432 +          b_trim3+=rseq.length()-a3-1;
433 +          num_trim3++;
434 +          }
435         if (fq2!=NULL) {
436              getFastxRec(*fq2, rseq2, rqv2, seqid2, seqinfo2, infname2);
437              if (seqid.substr(0,seqid.length()-1)!=seqid2.substr(0,seqid2.length()-1)) {
# Line 363 | Line 439
439                    seqid.chars(), seqid2.chars(), infname.chars(), infname2.chars());
440                 }
441              tcode2=process_read(seqid2, rseq2, rqv2, b5, b3);
442 +            if (b5>0) {
443 +               b_trim5+=b5;
444 +               num_trim5++;
445 +               }
446 +            if (b3<rseq2.length()-1) {
447 +               b_trim3+=rseq2.length()-b3-1;
448 +               num_trim3++;
449 +               }
450              //decide what to do with this pair and print rseq2 if the pair makes it
451              if (tcode>1 && tcode2<=1) {
452                 //"untrash" rseq
# Line 391 | Line 475
475              } //pair read
476         if (tcode>1) { //trashed
477           #ifdef GDEBUG
478 <         GMessage(" !!!!TRASH => 'N'\n");
478 >         GMessage(" !!!!TRASH code = %c\n",tcode);
479           #endif
480            if (tcode=='s') trash_s++;
481            else if (tcode=='A' || tcode=='T') trash_poly++;
482              else if (tcode=='Q') trash_Q++;
483                else if (tcode=='N') trash_N++;
484                 else if (tcode=='D') trash_D++;
485 <                else if (tcode=='3') trash_A3++;
486 <                 else if (tcode=='5') trash_A5++;
485 >                else if (tcode=='V') trash_V++;
486 >                 //else if (tcode=='5') trash_A5++;
487            if (trashReport) trash_report(tcode, seqid, freport);
488            }
489           else if (!doCollapse) { //write it
# Line 471 | Line 555
555      if (verbose) {
556         if (paired_reads) {
557             GMessage(">Input files : %s , %s\n", infname.chars(), infname2.chars());
558 <           GMessage("Number of input pairs :%9d\n", incounter);
559 <           GMessage("         Output pairs :%9d\n", outcounter);
558 >           GMessage("Number of input pairs :%9u\n", incounter);
559 >           GMessage("         Output pairs :%9u\t(%u trashed)\n", outcounter, incounter-outcounter);
560             }
561           else {
562             GMessage(">Input file : %s\n", infname.chars());
563             GMessage("Number of input reads :%9d\n", incounter);
564 <           GMessage("         Output reads :%9d\n", outcounter);
564 >           GMessage("         Output reads :%9d  (%u trashed)\n", outcounter, incounter-outcounter);
565             }
566 <       GMessage("------------------------------------\n");
567 <       if (num_trimmed5)
568 <          GMessage("           5' trimmed :%9d  (min. trim: %d)\n", num_trimmed5, min_trimmed5);
569 <       if (num_trimmed3)
570 <          GMessage("           3' trimmed :%9d  (min. trim: %d)\n", num_trimmed3, min_trimmed3);
566 >       GMessage("--------------- Read trimming: ------------------\n");
567 >       if (num_trim5)
568 >          GMessage("           5' trimmed :%9u\n", num_trim5);
569 >       if (num_trim3)
570 >          GMessage("           3' trimmed :%9u\n", num_trim3);
571 >       if (num_trimQ)
572 >          GMessage("         q.v. trimmed :%9u\n", num_trimQ);
573 >       if (num_trimN)
574 >          GMessage("            N trimmed :%9u\n", num_trimN);
575 >       if (num_trimT)
576 >          GMessage("       poly-T trimmed :%9u\n", num_trimT);
577 >       if (num_trimA)
578 >          GMessage("       poly-A trimmed :%9u\n", num_trimA);
579 >       if (num_trimV)
580 >          GMessage("      Adapter trimmed :%9u\n", num_trimV);
581         GMessage("------------------------------------\n");
582         if (trash_s>0)
583 <         GMessage("     Trashed by length:%9d\n", trash_s);
583 >         GMessage("Trashed by initial len:%9d\n", trash_s);
584         if (trash_N>0)
585           GMessage("         Trashed by N%%:%9d\n", trash_N);
586         if (trash_Q>0)
587           GMessage("Trashed by low quality:%9d\n", trash_Q);
588         if (trash_poly>0)
589           GMessage("   Trashed by poly-A/T:%9d\n", trash_poly);
590 <       if (trash_A5>0)
591 <         GMessage(" Trashed by 5' adapter:%9d\n", trash_A5);
592 <       if (trash_A3>0)
593 <         GMessage(" Trashed by 3' adapter:%9d\n", trash_A3);
590 >       if (trash_V>0)
591 >         GMessage("    Trashed by adaptor:%9d\n", trash_V);
592 >       GMessage("\n--------------- Base counts  --------------------\n");
593 >       GMessage("    Input bases   :%12llu\n", b_totalIn);
594 >       double percN=100.0* ((double)b_totalN/(double)b_totalIn);
595 >       GMessage("          N bases :%12llu (%4.2f%%)\n", b_totalN, percN);
596 >       GMessage("   trimmed from 5':%12llu\n", b_trim5);
597 >       GMessage("   trimmed from 3':%12llu\n", b_trim3);
598 >       GMessage("\n");
599 >       if (b_trimQ)
600 >       GMessage("     q.v. trimmed :%12llu\n", b_trimQ);
601 >       if (b_trimN)
602 >       GMessage("        N trimmed :%12llu\n", b_trimN);
603 >       if (b_trimT)
604 >       GMessage("   poly-T trimmed :%12llu\n", b_trimT);
605 >       if (b_trimA)
606 >       GMessage("   poly-A trimmed :%12llu\n", b_trimA);
607 >       if (b_trimV)
608 >       GMessage("  Adapter trimmed :%12llu\n", b_trimV);
609 >
610         }
611      if (trashReport) {
612            FWCLOSE(freport);
# Line 937 | Line 1047
1047   return false;
1048   }
1049  
1050 < bool trim_adapter3(GStr& seq, int&l5, int &l3) {
1051 < if (adapters3.Count()==0) return false;
1050 > bool trim_adaptor3(GStr& seq, int&l5, int &l3) {
1051 > if (adaptors3.Count()==0) return false;
1052   int rlen=seq.length();
1053   l5=0;
1054   l3=rlen-1;
1055   bool trimmed=false;
1056 < GStr wseq(seq.chars());
1056 > GStr wseq(seq);
1057   int wlen=rlen;
1058 < for (int ai=0;ai<adapters3.Count();ai++) {
1059 <  //if (adapters3[ai].is_empty()) continue;
1060 <  int alen=adapters3[ai].seq.length();
1061 <  GStr& aseq=adapters3[ai].seq;
1062 <  GXAlnInfo* r_bestaln=match_RightEnd(aseq.chars(), alen, adapters3[ai].pz,
1063 <                            wseq.chars(), wlen, gxmem_r, 74);
1064 <  if (r_bestaln) {
1065 <     trimmed=true;
1066 <     //keep unmatched region on the left, if any
1067 <     l3-=(wlen-r_bestaln->sl+1);
1068 <     delete r_bestaln;
1069 <     if (l3<0) l3=0;
1070 <     if (l3-l5+1<min_read_len) return true;
1071 <     wseq=seq.substr(l5,l3-l5+1);
1072 <     wlen=wseq.length();
1058 > GXSeqData seqdata;
1059 > int numruns=revCompl ? 2 : 1;
1060 > GList<GXAlnInfo> bestalns(true, true, false);
1061 > for (int ai=0;ai<adaptors3.Count();ai++) {
1062 >   for (int r=0;r<numruns;r++) {
1063 >     if (r) {
1064 >          seqdata.update(adaptors3[ai].seqr.chars(), adaptors3[ai].seqr.length(),
1065 >                 adaptors3[ai].pzr, wseq.chars(), wlen, adaptors3[ai].amlen);
1066 >        }
1067 >     else {
1068 >            seqdata.update(adaptors3[ai].seq.chars(), adaptors3[ai].seq.length(),
1069 >                 adaptors3[ai].pz, wseq.chars(), wlen, adaptors3[ai].amlen);
1070 >        }
1071 >     GXAlnInfo* aln=match_adaptor(seqdata, adaptors3[ai].trim_type, minEndAdapter, gxmem_r, 86);
1072 >         if (aln) {
1073 >           if (aln->strong) {
1074 >                   trimmed=true;
1075 >                   bestalns.Add(aln);
1076 >                   break; //will check the rest next time
1077 >                   }
1078 >            else bestalns.Add(aln);
1079 >           }
1080 >   }//forward and reverse adaptors
1081 >   if (trimmed) break; //will check the rest in the next cycle
1082 >  }//for each 3' adaptor
1083 > if (bestalns.Count()>0) {
1084 >           GXAlnInfo* aln=bestalns[0];
1085 >           if (aln->sl-1 > wlen-aln->sr) {
1086 >                   //keep left side
1087 >                   l3-=(wlen-aln->sl+1);
1088 >                   if (l3<0) l3=0;
1089 >                   }
1090 >           else { //keep right side
1091 >                   l5+=aln->sr;
1092 >                   if (l5>=rlen) l5=rlen-1;
1093 >                   }
1094 >           //delete aln;
1095 >           //if (l3-l5+1<min_read_len) return true;
1096 >           wseq=seq.substr(l5,l3-l5+1);
1097 >           wlen=wseq.length();
1098 >           return true; //break the loops here to report a good find
1099       }
1100 <  }//for each 5' adapter
965 <  return trimmed;
1100 >  return false;
1101   }
1102  
1103 < bool trim_adapter5(GStr& seq, int&l5, int &l3) {
1104 < if (adapters5.Count()==0) return false;
1103 > bool trim_adaptor5(GStr& seq, int&l5, int &l3) {
1104 > if (adaptors5.Count()==0) return false;
1105   int rlen=seq.length();
1106   l5=0;
1107   l3=rlen-1;
1108   bool trimmed=false;
1109 < GStr wseq(seq.chars());
1109 > GStr wseq(seq);
1110   int wlen=rlen;
1111 < for (int ai=0;ai<adapters5.Count();ai++) {
1112 <  //if (adapters5[ai].is_empty()) continue;
1113 <  int alen=adapters5[ai].seq.length();
1114 <  GStr& aseq=adapters5[ai].seq;
1115 <  GXAlnInfo* l_bestaln=match_LeftEnd(aseq.chars(), alen, adapters5[ai].pz,
1116 <                 wseq.chars(), wlen, gxmem_l, 84);
1117 <  if (l_bestaln) {
1118 <     trimmed=true;
1119 <     l5+=l_bestaln->sr;
1120 <     delete l_bestaln;
1121 <     if (l5>=rlen) l5=rlen-1;
1122 <     if (l3-l5+1<min_read_len) return true;
1123 <     wseq=seq.substr(l5,l3-l5+1);
1124 <     wlen=wseq.length();
1111 > GXSeqData seqdata;
1112 > int numruns=revCompl ? 2 : 1;
1113 > GList<GXAlnInfo> bestalns(true, true, false);
1114 > for (int ai=0;ai<adaptors5.Count();ai++) {
1115 >   for (int r=0;r<numruns;r++) {
1116 >     if (r) {
1117 >          seqdata.update(adaptors5[ai].seqr.chars(), adaptors5[ai].seqr.length(),
1118 >                 adaptors5[ai].pzr, wseq.chars(), wlen, adaptors5[ai].amlen);
1119 >        }
1120 >     else {
1121 >            seqdata.update(adaptors5[ai].seq.chars(), adaptors5[ai].seq.length(),
1122 >                 adaptors5[ai].pz, wseq.chars(), wlen, adaptors5[ai].amlen);
1123 >        }
1124 >         GXAlnInfo* aln=match_adaptor(seqdata, adaptors5[ai].trim_type,
1125 >                                                       minEndAdapter, gxmem_l, 90);
1126 >         if (aln) {
1127 >           if (aln->strong) {
1128 >                   trimmed=true;
1129 >                   bestalns.Add(aln);
1130 >                   break; //will check the rest next time
1131 >                   }
1132 >            else bestalns.Add(aln);
1133 >           }
1134 >         } //forward and reverse?
1135 >   if (trimmed) break; //will check the rest in the next cycle
1136 >  }//for each 5' adaptor
1137 >  if (bestalns.Count()>0) {
1138 >           GXAlnInfo* aln=bestalns[0];
1139 >           if (aln->sl-1 > wlen-aln->sr) {
1140 >                   //keep left side
1141 >                   l3-=(wlen-aln->sl+1);
1142 >                   if (l3<0) l3=0;
1143 >                   }
1144 >           else { //keep right side
1145 >                   l5+=aln->sr;
1146 >                   if (l5>=rlen) l5=rlen-1;
1147 >                   }
1148 >           //delete aln;
1149 >           //if (l3-l5+1<min_read_len) return true;
1150 >           wseq=seq.substr(l5,l3-l5+1);
1151 >           wlen=wseq.length();
1152 >           return true; //break the loops here to report a good find
1153       }
1154 <  }//for each 5' adapter
992 <  return trimmed;
1154 >  return false;
1155   }
1156  
1157   //convert qvs to/from phred64 from/to phread33
# Line 1060 | Line 1222
1222  
1223   #ifdef GDEBUG
1224   void showTrim(GStr& s, int l5, int l3) {
1225 <  if (l5>0) {
1225 >  if (l5>0 || l3==0) {
1226      color_bg(c_red);
1227      }
1228    for (int i=0;i<s.length()-1;i++) {
1229      if (i && i==l5) color_resetbg();
1230      fprintf(stderr, "%c", s[i]);
1231 <    if (i==l3) color_bg(c_red);
1231 >    if (i && i==l3) color_bg(c_red);
1232     }
1233    fprintf(stderr, "%c", s[s.length()-1]);
1234    color_reset();
# Line 1091 | Line 1253
1253   GStr wqv(rqv.chars());
1254   int w5=l5;
1255   int w3=l3;
1256 + //count Ns
1257 + b_totalIn+=rseq.length();
1258 + for (int i=0;i<rseq.length();i++) {
1259 + if (isACGT[(int)rseq[i]]==0) b_totalN++;
1260 + }
1261 +
1262   //first do the q-based trimming
1263   if (qvtrim_qmin!=0 && !wqv.is_empty() && qtrim(wqv, w5, w3)) { // qv-threshold trimming
1264 +   b_trimQ+=(w5-l5)+(l3-w3);
1265 +   num_trimQ++;
1266     if (w3-w5+1<min_read_len) {
1267       return 'Q'; //invalid read
1268       }
# Line 1105 | Line 1275
1275     } //qv trimming
1276   // N-trimming on the remaining read seq
1277   if (ntrim(wseq, w5, w3)) {
1278 +   //Note: ntrim resets w5 and w3
1279     //GMessage("before: %s\n",wseq.chars());
1280     //GMessage("after : %s (%d)\n",wseq.substr(w5,w3-w5+1).chars(),w3-w5+1);
1281 +   int trim3=(wseq.length()-1-w3);
1282 +   b_trimN+=w5+trim3;
1283 +   num_trimN++;
1284     l5+=w5;
1285 <   l3-=(wseq.length()-1-w3);
1285 >   l3-=trim3;
1286     if (w3-w5+1<min_read_len) {
1287       return 'N'; //to be trashed
1288       }
# Line 1120 | Line 1294
1294     w3=wseq.length()-1;
1295     }
1296   char trim_code;
1297 + //clean the more dirty end first - 3'
1298 + bool trimmedA=false;
1299 + bool trimmedT=false;
1300 + bool trimmedV=false;
1301 +
1302 + int prev_w5=0;
1303 + int prev_w3=0;
1304 + bool w3upd=true;
1305 + bool w5upd=true;
1306   do {
1307    trim_code=0;
1308 <  if (trim_poly5(wseq, w5, w3, polyA_seed)) {
1308 >  if (w3upd && trim_poly3(wseq, w5, w3, polyA_seed)) {
1309        trim_code='A';
1310 +      b_trimA+=(w5+(wseq.length()-1-w3));
1311 +      if (!trimmedA) { num_trimA++; trimmedA=true; }
1312        }
1313 <  else if (trim_poly5(wseq, w5, w3, polyT_seed)) {
1313 >  else if (w3upd && trim_poly3(wseq, w5, w3, polyT_seed)) {
1314        trim_code='T';
1315 +      b_trimT+=(w5+(wseq.length()-1-w3));
1316 +      if (!trimmedT) { num_trimT++; trimmedT=true; }
1317        }
1318 <  else if (trim_adapter5(wseq, w5, w3)) {
1132 <      trim_code='5';
1133 <      }
1134 <  if (trim_code) {
1135 <     #ifdef GDEBUG
1136 <      GMessage("#### TRIM by '%c' code ( w5-w3 = %d-%d ):\n",trim_code, w5,w3);
1137 <      showTrim(wseq, w5, w3);
1138 <     #endif
1139 <     int trimlen=wseq.length()-(w3-w5+1);
1140 <     num_trimmed5++;
1141 <     if (trimlen<min_trimmed5)
1142 <         min_trimmed5=trimlen;
1143 <     l5+=w5;
1144 <     l3-=(wseq.length()-1-w3);
1145 <     if (w3-w5+1<min_read_len) {
1146 <         return trim_code;
1147 <         }
1148 <      //-- keep only the w5..w3 range
1149 <      wseq=wseq.substr(w5, w3-w5+1);
1150 <      if (!wqv.is_empty())
1151 <         wqv=wqv.substr(w5, w3-w5+1);
1152 <      }// trimmed at 5' end
1153 < } while (trim_code);
1154 <
1155 < do {
1156 <  trim_code=0;
1157 <  if (trim_poly3(wseq, w5, w3, polyA_seed)) {
1318 >  else if (w5upd && trim_poly5(wseq, w5, w3, polyA_seed)) {
1319        trim_code='A';
1320 +      b_trimA+=(w5+(wseq.length()-1-w3));
1321 +      if (!trimmedA) { num_trimA++; trimmedA=true; }
1322        }
1323 <  else if (trim_poly3(wseq, w5, w3, polyT_seed)) {
1323 >  else if (w5upd && trim_poly5(wseq, w5, w3, polyT_seed)) {
1324        trim_code='T';
1325 +      b_trimT+=(w5+(wseq.length()-1-w3));
1326 +      if (!trimmedT) { num_trimT++; trimmedT=true; }
1327 +      }
1328 +  else if (trim_adaptor5(wseq, w5, w3)) {
1329 +      trim_code='V';
1330 +      b_trimV+=(w5+(wseq.length()-1-w3));
1331 +      if (!trimmedV) { num_trimV++; trimmedV=true; }
1332        }
1333 <  else if (trim_adapter3(wseq, w5, w3)) {
1334 <      trim_code='3';
1333 >  else if (trim_adaptor3(wseq, w5, w3)) {
1334 >      trim_code='V';
1335 >      b_trimV+=(w5+(wseq.length()-1-w3));
1336 >      if (!trimmedV) { num_trimV++; trimmedV=true; }
1337        }
1338    if (trim_code) {
1339 <     #ifdef GDEBUG
1339 >     w3upd=(w3!=prev_w3);
1340 >         w5upd=(w5!=prev_w5);
1341 >         if (w3upd) prev_w3=w3;
1342 >         if (w5upd) prev_w5=w5;
1343 >   #ifdef GDEBUG
1344       GMessage("#### TRIM by '%c' code ( w5-w3 = %d-%d ):\n",trim_code, w5,w3);
1345       showTrim(wseq, w5, w3);
1346 <     #endif
1347 <     int trimlen=wseq.length()-(w3-w5+1);
1172 <     num_trimmed3++;
1173 <     if (trimlen<min_trimmed3)
1174 <         min_trimmed3=trimlen;
1346 >   #endif
1347 >     //int trimlen=wseq.length()-(w3-w5+1);
1348       l5+=w5;
1349       l3-=(wseq.length()-1-w3);
1350       if (w3-w5+1<min_read_len) {
# Line 1183 | Line 1356
1356           wqv=wqv.substr(w5, w3-w5+1);
1357        }//trimming at 3' end
1358   } while (trim_code);
1186
1187
1359   if (doCollapse) {
1360     //keep read for later
1361     FqDupRec* dr=dhash.Find(wseq.chars());
# Line 1335 | Line 1506
1506      }
1507   }
1508  
1509 < void addAdapter(GVec<CASeqData>& adapters, GStr& seq) {
1509 > void addAdaptor(GVec<CASeqData>& adaptors, GStr& seq, GAlnTrimType trim_type) {
1510   //TODO: prepare CASeqData here, and collect hexamers as well
1511 +  if (seq.is_empty() || seq=="-" ||
1512 +          seq=="N/A" || seq==".") return;
1513 +
1514   CASeqData adata(revCompl);
1515 < int idx=adapters.Add(adata);
1515 > int idx=adaptors.Add(adata);
1516   if (idx<0) GError("Error: failed to add adaptor!\n");
1517 < adapters[idx].update(seq.chars());
1517 > adaptors[idx].trim_type=trim_type;
1518 > adaptors[idx].update(seq.chars());
1519   }
1520  
1521  
1522 < int loadAdapters(const char* fname) {
1522 > int loadAdaptors(const char* fname) {
1523    GLineReader lr(fname);
1524    char* l;
1525    while ((l=lr.nextLine())!=NULL) {
# Line 1360 | Line 1535
1535        #ifdef GDEBUG
1536            //s.reverse();
1537        #endif
1538 <        addAdapter(adapters3, s);
1538 >        addAdaptor(adaptors3, s, galn_TrimRight);
1539          continue;
1540          }
1541        }
# Line 1369 | Line 1544
1544        s.startTokenize("\t ;,:");
1545        GStr a5,a3;
1546        if (s.nextToken(a5))
1547 <         s.nextToken(a3);
1547 >            s.nextToken(a3);
1548 >        else continue; //no tokens on this line
1549 >      GAlnTrimType ttype5=galn_TrimLeft;
1550        a5.upper();
1551        a3.upper();
1552 +      if (a3.is_empty() || a3==a5 || a3=="=") {
1553 +         a3.clear();
1554 +         ttype5=galn_TrimEither;
1555 +         }
1556       #ifdef GDEBUG
1557       //   a5.reverse();
1558       //   a3.reverse();
1559       #endif
1560 <      addAdapter(adapters5, a5);
1561 <      addAdapter(adapters3, a3);
1560 >      addAdaptor(adaptors5, a5, ttype5);
1561 >      addAdaptor(adaptors3, a3, galn_TrimRight);
1562        }
1563     }
1564 <   return adapters5.Count()+adapters3.Count();
1564 >   return adaptors5.Count()+adaptors3.Count();
1565   }
1566  
1567   void setupFiles(FILE*& f_in, FILE*& f_in2, FILE*& f_out, FILE*& f_out2,

Diff Legend

Removed lines
+ Added lines
< Changed lines
> Changed lines