ViewVC Help
View File | Revision Log | Show Annotations | View Changeset | Root Listing
root/gclib/fqtrim/fqtrim.cpp
(Generate patch)
# Line 4 | Line 4
4   #include "GList.hh"
5   #include <ctype.h>
6   #include "GAlnExtend.h"
7 + #include <inttypes.h>
8  
9   #define USAGE "Usage:\n\
10 < fqtrim [{-5 <5adapter> -3 <3adapter>|-f <adapters_file>}] [-a <min_matchlen>]\\\n\
10 > fqtrim [{-5 <5adaptor> -3 <3adaptor>|-f <adaptors_file>}] [-a <min_matchlen>]\\\n\
11     [-R] [-q <minq> [-t <trim_max_len>]] [-p {64|33}] [-o <outsuffix>]\\\n\
12     [-l <minlen>] [-C] [-D] [-Q] [-n <rename_prefix>] [-r <discarded.lst>]\\\n\
13      <input.fq>[,<input_mates.fq>\n\
14   \n\
15 < Trim low quality bases at the 3' end and can trim adapter sequence(s), filter\n\
15 > Trim low quality bases at the 3' end and can trim adaptor sequence(s), filter\n\
16   for low complexity and collapse duplicate reads.\n\
17   If read pairs should be trimmed and kept together (i.e. without discarding\n\
18   one read in a pair), the two file names should be given delimited by a comma\n\
# Line 25 | Line 26
26      file(s) named <input>.<outsuffix> which will be created in the \n\
27      current (working) directory; (writes to stdout if -o- is given);\n\
28      a suffix ending with .gz, .gzip or .bz2 will enforce compression\n\
29 < -f  file with adapter sequences to trim, each line having this format:\n\
30 <    <5'-adapter-sequence> <3'-adapter-sequence>\n\
31 < -5  trim the given adapter or primer sequence at the 5' end of each read\n\
29 > -f  file with adaptor sequences to trim, each line having this format:\n\
30 >    <5'-adaptor-sequence> <3'-adaptor-sequence>\n\
31 > -5  trim the given adaptor or primer sequence at the 5' end of each read\n\
32      (e.g. -5 CGACAGGTTCAGAGTTCTACAGTCCGACGATC)\n\
33 < -3  trim the given adapter sequence at the 3' end of each read\n\
33 > -3  trim the given adaptor sequence at the 3' end of each read\n\
34      (e.g. -3 TCGTATGCCGTCTTCTGCTTG)\n\
35   -A  disable polyA/T trimming (enabled by default)\n\
36   -y  minimum length of exact match to adaptor sequence at the proper end (6)\n\
# Line 41 | Line 42
42   -r  write a simple \"trash report\" file listing the discarded reads with a\n\
43      one letter code for the reason why they were trashed\n\
44   -D  apply a low-complexity (dust) filter and discard any read that has over\n\
45 <    50% of its length detected as low complexity\n\
45 >    50%% of its length detected as low complexity\n\
46   -C  collapse duplicate reads and add a prefix with their count to the read\n\
47      name (e.g. for microRNAs)\n\
48   -p  input is phred64/phred33 (use -p64 or -p33)\n\
49   -Q  convert quality values to the other Phred qv type\n\
50 < -V  verbose processing\n\
50 > -V  be verbose when done (print summary counts)\n\
51   "
52  
53   //-z  for -o option, the output stream(s) will be first piped into the given\n
54   //   <zcmd> command, which must output to stdout (e.g. -z 'bzip2 -9 -c')\n
55  
56  
57 < // example 3' adapter for miRNAs: TCGTATGCCGTCTTCTGCTTG
57 > // example 3' adaptor for miRNAs: TCGTATGCCGTCTTCTGCTTG
58  
59   //For paired reads sequencing:
60   //3' : ACACTCTTTCCCTACACGACGCTCTTCCGATCT
# Line 81 | Line 82
82   GStr outsuffix; // -o
83   GStr prefix;
84   GStr zcmd;
85 < int num_trimmed5=0;
86 < int num_trimmed3=0;
87 < int num_trimmedN=0;
88 < int num_trimmedQ=0;
89 < int min_trimmed5=INT_MAX;
90 < int min_trimmed3=INT_MAX;
85 > char isACGT[256];
86 >
87 > uint num_trimN=0; //reads trimmed by N%
88 > uint num_trimQ=0; //reads trimmed by qv threshold
89 > uint num_trimV=0; //reads trimmed by adapter match
90 > uint num_trimA=0; //reads trimmed by polyA
91 > uint num_trimT=0; //reads trimmed by polyT
92 > uint num_trim5=0; //number of reads trimmed at 5' end
93 > uint num_trim3=0; //number of reads trimmed at 3' end
94 >
95 > uint64 b_totalIn=0; //total number of input bases
96 > uint64 b_totalN=0;  //total number of undetermined bases found in input bases
97 >
98 > uint64 b_trimN=0; //total number of bases trimmed due to N-trim
99 > uint64 b_trimQ=0; //number of bases trimmed due to qv threshold
100 > uint64 b_trimV=0; //total number of bases trimmed due to adaptor matches
101 > uint64 b_trimA=0; //number of bases trimmed due to poly-A tails
102 > uint64 b_trimT=0; //number of bases trimmed due to poly-T tails
103 > uint64 b_trim5=0; //total bases trimmed on the 5' side
104 > uint64 b_trim3=0; //total bases trimmed on the 3' side
105 > //int min_trimmed5=INT_MAX;
106 > //int min_trimmed3=INT_MAX;
107  
108   int qvtrim_qmin=0;
109   int qvtrim_max=0;  //for -q, do not trim at 3'-end more than this number of bases
# Line 94 | Line 111
111   int qv_cvtadd=0; //could be -31 or +31
112  
113   // adaptor matching metrics -- for X-drop ungapped extension
114 + //const int match_reward=2;
115 + //const int mismatch_penalty=3;
116   const int match_reward=2;
117 < const int mismatch_penalty=3;
118 < const int Xdrop=8;
117 > const int mismatch_penalty=4;
118 > const int Xdrop=10;
119  
120   const int poly_m_score=2; //match score
121   const int poly_mis_score=-3; //mismatch
# Line 107 | Line 126
126   const char *polyT_seed="TTTT";
127  
128   struct CASeqData {
129 <   //positional data for every possible hexamer in an adapter
130 <   GVec<uint16>* pz[4096]; //0-based coordinates of all possible hexamers in the adapter sequence
131 <   GVec<uint16>* pzr[4096]; //0-based coordinates of all possible hexamers for the reverse complement of the adapter sequence
132 <   GStr seq; //actual adapter sequence data
129 >   //positional data for every possible hexamer in an adaptor
130 >   GVec<uint16>* pz[4096]; //0-based coordinates of all possible hexamers in the adaptor sequence
131 >   GVec<uint16>* pzr[4096]; //0-based coordinates of all possible hexamers for the reverse complement of the adaptor sequence
132 >   GStr seq; //actual adaptor sequence data
133     GStr seqr; //reverse complement sequence
134 +   int amlen; //fraction of adaptor length matching that's
135 +              //enough to consider the alignment
136 +   GAlnTrimType trim_type;
137     bool use_reverse;
138 <   CASeqData(bool rev=false):seq(),seqr() {
139 <     use_reverse=rev;
138 >   CASeqData(bool rev=false):seq(),seqr(),
139 >             amlen(0), use_reverse(rev) {
140 >     trim_type=galn_None; //should be updated later!
141       for (int i=0;i<4096;i++) {
142          pz[i]=NULL;
143          pzr[i]=NULL;
# Line 122 | Line 145
145       }
146  
147     void update(const char* s) {
148 <   seq=s;
149 <   table6mers(seq.chars(), seq.length(), pz);
150 <   if (!use_reverse) return;
151 <   //reverse complement
152 <   seqr=s;
153 <   int slen=seq.length();
154 <   for (int i=0;i<slen;i++)
155 <     seqr[i]=ntComplement(seq[slen-i-1]);
156 <   table6mers(seqr.chars(), seqr.length(), pzr);
148 >         seq=s;
149 >         table6mers(seq.chars(), seq.length(), pz);
150 >         amlen=iround(double(seq.length())*0.8);
151 >         if (amlen<12)
152 >                amlen=12;
153 >         if (!use_reverse) return;
154 >         //reverse complement
155 >         seqr=s;
156 >         int slen=seq.length();
157 >         for (int i=0;i<slen;i++)
158 >           seqr[i]=ntComplement(seq[slen-i-1]);
159 >         table6mers(seqr.chars(), seqr.length(), pzr);
160     }
161  
162     ~CASeqData() {
# Line 141 | Line 167
167     }
168   };
169  
170 < GVec<CASeqData> adapters5;
171 < GVec<CASeqData> adapters3;
170 > GVec<CASeqData> adaptors5;
171 > GVec<CASeqData> adaptors3;
172  
173   CGreedyAlignData* gxmem_l=NULL;
174   CGreedyAlignData* gxmem_r=NULL;
# Line 195 | Line 221
221  
222   GHash<FqDupRec> dhash; //hash to keep track of duplicates
223  
224 < void addAdapter(GVec<CASeqData>& adapters, GStr& seq);
225 < int loadAdapters(const char* fname);
224 > void addAdaptor(GVec<CASeqData>& adaptors, GStr& seq, GAlnTrimType trim_type);
225 > int loadAdaptors(const char* fname);
226  
227   void setupFiles(FILE*& f_in, FILE*& f_in2, FILE*& f_out, FILE*& f_out2,
228                         GStr& s, GStr& infname, GStr& infname2);
# Line 216 | Line 242
242   int dust(GStr& seq);
243   bool trim_poly5(GStr &seq, int &l5, int &l3, const char* poly_seed); //returns true if any trimming occured
244   bool trim_poly3(GStr &seq, int &l5, int &l3, const char* poly_seed);
245 < bool trim_adapter5(GStr& seq, int &l5, int &l3); //returns true if any trimming occured
246 < bool trim_adapter3(GStr& seq, int &l5, int &l3);
245 > bool trim_adaptor5(GStr& seq, int &l5, int &l3); //returns true if any trimming occured
246 > bool trim_adaptor3(GStr& seq, int &l5, int &l3);
247  
248   void convertPhred(char* q, int len);
249   void convertPhred(GStr& q);
# Line 276 | Line 302
302         else
303           GMessage("%s\nInvalid value for -p option (can only be 64 or 33)!\n",USAGE);
304       }
305 +  memset((void*)isACGT, 0, 256);
306 +  isACGT['A']=isACGT['a']=isACGT['C']=isACGT['c']=1;
307 +  isACGT['G']=isACGT['g']=isACGT['T']=isACGT['t']=1;
308    s=args.getOpt('f');
309    if (!s.is_empty()) {
310 <   loadAdapters(s.chars());
310 >   loadAdaptors(s.chars());
311     }
312 <  bool fileAdapters=adapters5.Count()+adapters3.Count();
312 >  bool fileAdaptors=adaptors5.Count()+adaptors3.Count();
313    s=args.getOpt('5');
314    if (!s.is_empty()) {
315 <    if (fileAdapters)
315 >    if (fileAdaptors)
316        GError("Error: options -5 and -f cannot be used together!\n");
317      s.upper();
318 <    addAdapter(adapters5, s);
318 >    addAdaptor(adaptors5, s, galn_TrimLeft);
319      }
320    s=args.getOpt('3');
321    if (!s.is_empty()) {
322 <    if (fileAdapters)
322 >    if (fileAdaptors)
323        GError("Error: options -3 and -f cannot be used together!\n");
324        s.upper();
325 <      addAdapter(adapters3, s);
325 >      addAdaptor(adaptors3, s, galn_TrimRight);
326      }
327    s=args.getOpt('y');
328    if (!s.is_empty()) {
# Line 318 | Line 347
347      openfw(freport, args, 'r');
348    char* infile=NULL;
349  
350 <  if (adapters5.Count()>0)
351 <    gxmem_l=new CGreedyAlignData(match_reward, mismatch_penalty, Xdrop-2);
352 <  if (adapters3.Count()>0)
350 >  if (adaptors5.Count()>0)
351 >    //gxmem_l=new CGreedyAlignData(match_reward, mismatch_penalty, Xdrop-2);
352 >        gxmem_l=new CGreedyAlignData(match_reward, mismatch_penalty, Xdrop);
353 >  if (adaptors3.Count()>0)
354      gxmem_r=new CGreedyAlignData(match_reward, mismatch_penalty, Xdrop);
355  
356    while ((infile=args.nextNonOpt())!=NULL) {
# Line 332 | Line 362
362      int trash_N=0;
363      int trash_D=0;
364      int trash_poly=0;
365 <    int trash_A3=0;
366 <    int trash_A5=0;
365 >    int trash_V=0;
366 >    num_trimN=0;
367 >    num_trimQ=0;
368 >    num_trimV=0;
369 >    num_trimA=0;
370 >    num_trimT=0;
371 >    num_trim5=0;
372 >    num_trim3=0;
373 >
374 >    b_totalIn=0;
375 >    b_totalN=0;
376 >    b_trimN=0;
377 >    b_trimQ=0;
378 >    b_trimV=0;
379 >    b_trimA=0;
380 >    b_trimT=0;
381 >    b_trim5=0;
382 >    b_trim3=0;
383 >
384      s=infile;
385      GStr infname;
386      GStr infname2;
# Line 356 | Line 403
403         int a5=0, a3=0, b5=0, b3=0;
404         char tcode=0, tcode2=0;
405         tcode=process_read(seqid, rseq, rqv, a5, a3);
406 +       if (a5>0) {
407 +          b_trim5+=a5;
408 +          num_trim5++;
409 +          }
410 +       if (a3<rseq.length()-1) {
411 +          b_trim3+=rseq.length()-a3-1;
412 +          num_trim3++;
413 +          }
414         if (fq2!=NULL) {
415              getFastxRec(*fq2, rseq2, rqv2, seqid2, seqinfo2, infname2);
416              if (seqid.substr(0,seqid.length()-1)!=seqid2.substr(0,seqid2.length()-1)) {
# Line 363 | Line 418
418                    seqid.chars(), seqid2.chars(), infname.chars(), infname2.chars());
419                 }
420              tcode2=process_read(seqid2, rseq2, rqv2, b5, b3);
421 +            if (b5>0) {
422 +               b_trim5+=b5;
423 +               num_trim5++;
424 +               }
425 +            if (b3<rseq2.length()-1) {
426 +               b_trim3+=rseq2.length()-b3-1;
427 +               num_trim3++;
428 +               }
429              //decide what to do with this pair and print rseq2 if the pair makes it
430              if (tcode>1 && tcode2<=1) {
431                 //"untrash" rseq
# Line 391 | Line 454
454              } //pair read
455         if (tcode>1) { //trashed
456           #ifdef GDEBUG
457 <         GMessage(" !!!!TRASH => 'N'\n");
457 >         GMessage(" !!!!TRASH code = %c\n",tcode);
458           #endif
459            if (tcode=='s') trash_s++;
460            else if (tcode=='A' || tcode=='T') trash_poly++;
461              else if (tcode=='Q') trash_Q++;
462                else if (tcode=='N') trash_N++;
463                 else if (tcode=='D') trash_D++;
464 <                else if (tcode=='3') trash_A3++;
465 <                 else if (tcode=='5') trash_A5++;
464 >                else if (tcode=='V') trash_V++;
465 >                 //else if (tcode=='5') trash_A5++;
466            if (trashReport) trash_report(tcode, seqid, freport);
467            }
468           else if (!doCollapse) { //write it
# Line 471 | Line 534
534      if (verbose) {
535         if (paired_reads) {
536             GMessage(">Input files : %s , %s\n", infname.chars(), infname2.chars());
537 <           GMessage("Number of input pairs :%9d\n", incounter);
538 <           GMessage("         Output pairs :%9d\n", outcounter);
537 >           GMessage("Number of input pairs :%9u\n", incounter);
538 >           GMessage("         Output pairs :%9u\t(%u trashed)\n", outcounter, incounter-outcounter);
539             }
540           else {
541             GMessage(">Input file : %s\n", infname.chars());
542             GMessage("Number of input reads :%9d\n", incounter);
543 <           GMessage("         Output reads :%9d\n", outcounter);
543 >           GMessage("         Output reads :%9d  (%u trashed)\n", outcounter, incounter-outcounter);
544             }
545 <       GMessage("------------------------------------\n");
546 <       if (num_trimmed5)
547 <          GMessage("           5' trimmed :%9d  (min. trim: %d)\n", num_trimmed5, min_trimmed5);
548 <       if (num_trimmed3)
549 <          GMessage("           3' trimmed :%9d  (min. trim: %d)\n", num_trimmed3, min_trimmed3);
545 >       GMessage("--------------- Read trimming: ------------------\n");
546 >       if (num_trim5)
547 >          GMessage("           5' trimmed :%9u\n", num_trim5);
548 >       if (num_trim3)
549 >          GMessage("           3' trimmed :%9u\n", num_trim3);
550 >       if (num_trimQ)
551 >          GMessage("         q.v. trimmed :%9u\n", num_trimQ);
552 >       if (num_trimN)
553 >          GMessage("            N trimmed :%9u\n", num_trimN);
554 >       if (num_trimT)
555 >          GMessage("       poly-T trimmed :%9u\n", num_trimT);
556 >       if (num_trimA)
557 >          GMessage("       poly-A trimmed :%9u\n", num_trimA);
558 >       if (num_trimV)
559 >          GMessage("      Adapter trimmed :%9u\n", num_trimV);
560         GMessage("------------------------------------\n");
561         if (trash_s>0)
562 <         GMessage("     Trashed by length:%9d\n", trash_s);
562 >         GMessage("Trashed by initial len:%9d\n", trash_s);
563         if (trash_N>0)
564           GMessage("         Trashed by N%%:%9d\n", trash_N);
565         if (trash_Q>0)
566           GMessage("Trashed by low quality:%9d\n", trash_Q);
567         if (trash_poly>0)
568           GMessage("   Trashed by poly-A/T:%9d\n", trash_poly);
569 <       if (trash_A5>0)
570 <         GMessage(" Trashed by 5' adapter:%9d\n", trash_A5);
571 <       if (trash_A3>0)
572 <         GMessage(" Trashed by 3' adapter:%9d\n", trash_A3);
569 >       if (trash_V>0)
570 >         GMessage("    Trashed by adaptor:%9d\n", trash_V);
571 >       GMessage("\n--------------- Base counts  --------------------\n");
572 >       GMessage("    Input bases   :%12llu\n", b_totalIn);
573 >       double percN=100.0* ((double)b_totalN/(double)b_totalIn);
574 >       GMessage("          N bases :%12llu (%4.2f%%)\n", b_totalN, percN);
575 >       GMessage("   trimmed from 5':%12llu\n", b_trim5);
576 >       GMessage("   trimmed from 3':%12llu\n", b_trim3);
577 >       GMessage("\n");
578 >       if (b_trimQ)
579 >       GMessage("     q.v. trimmed :%12llu\n", b_trimQ);
580 >       if (b_trimN)
581 >       GMessage("        N trimmed :%12llu\n", b_trimN);
582 >       if (b_trimT)
583 >       GMessage("   poly-T trimmed :%12llu\n", b_trimT);
584 >       if (b_trimA)
585 >       GMessage("   poly-A trimmed :%12llu\n", b_trimA);
586 >       if (b_trimV)
587 >       GMessage("  Adapter trimmed :%12llu\n", b_trimV);
588 >
589         }
590      if (trashReport) {
591            FWCLOSE(freport);
# Line 937 | Line 1026
1026   return false;
1027   }
1028  
1029 < bool trim_adapter3(GStr& seq, int&l5, int &l3) {
1030 < if (adapters3.Count()==0) return false;
1029 > bool trim_adaptor3(GStr& seq, int&l5, int &l3) {
1030 > if (adaptors3.Count()==0) return false;
1031   int rlen=seq.length();
1032   l5=0;
1033   l3=rlen-1;
1034   bool trimmed=false;
1035 < GStr wseq(seq.chars());
1035 > GStr wseq(seq);
1036   int wlen=rlen;
1037 < for (int ai=0;ai<adapters3.Count();ai++) {
1038 <  //if (adapters3[ai].is_empty()) continue;
1039 <  int alen=adapters3[ai].seq.length();
1040 <  GStr& aseq=adapters3[ai].seq;
1041 <  GXAlnInfo* r_bestaln=match_RightEnd(aseq.chars(), alen, adapters3[ai].pz,
1042 <                            wseq.chars(), wlen, gxmem_r, 74);
1043 <  if (r_bestaln) {
1044 <     trimmed=true;
1045 <     //keep unmatched region on the left, if any
1046 <     l3-=(wlen-r_bestaln->sl+1);
1047 <     delete r_bestaln;
1048 <     if (l3<0) l3=0;
1049 <     if (l3-l5+1<min_read_len) return true;
1050 <     wseq=seq.substr(l5,l3-l5+1);
1051 <     wlen=wseq.length();
1037 > GXSeqData seqdata;
1038 > int numruns=revCompl ? 2 : 1;
1039 > GList<GXAlnInfo> bestalns(true, true, false);
1040 > for (int ai=0;ai<adaptors3.Count();ai++) {
1041 >   for (int r=0;r<numruns;r++) {
1042 >     if (r) {
1043 >          seqdata.update(adaptors3[ai].seqr.chars(), adaptors3[ai].seqr.length(),
1044 >                 adaptors3[ai].pzr, wseq.chars(), wlen, adaptors3[ai].amlen);
1045 >        }
1046 >     else {
1047 >            seqdata.update(adaptors3[ai].seq.chars(), adaptors3[ai].seq.length(),
1048 >                 adaptors3[ai].pz, wseq.chars(), wlen, adaptors3[ai].amlen);
1049 >        }
1050 >     GXAlnInfo* aln=match_adaptor(seqdata, adaptors3[ai].trim_type, gxmem_r, 86);
1051 >         if (aln) {
1052 >           if (aln->strong) {
1053 >                   trimmed=true;
1054 >                   bestalns.Add(aln);
1055 >                   break; //will check the rest next time
1056 >                   }
1057 >            else bestalns.Add(aln);
1058 >           }
1059 >   }//forward and reverse adaptors
1060 >   if (trimmed) break; //will check the rest in the next cycle
1061 >  }//for each 3' adaptor
1062 > if (bestalns.Count()>0) {
1063 >           GXAlnInfo* aln=bestalns[0];
1064 >           if (aln->sl-1 > wlen-aln->sr) {
1065 >                   //keep left side
1066 >                   l3-=(wlen-aln->sl+1);
1067 >                   if (l3<0) l3=0;
1068 >                   }
1069 >           else { //keep right side
1070 >                   l5+=aln->sr;
1071 >                   if (l5>=rlen) l5=rlen-1;
1072 >                   }
1073 >           //delete aln;
1074 >           //if (l3-l5+1<min_read_len) return true;
1075 >           wseq=seq.substr(l5,l3-l5+1);
1076 >           wlen=wseq.length();
1077 >           return true; //break the loops here to report a good find
1078       }
1079 <  }//for each 5' adapter
965 <  return trimmed;
1079 >  return false;
1080   }
1081  
1082 < bool trim_adapter5(GStr& seq, int&l5, int &l3) {
1083 < if (adapters5.Count()==0) return false;
1082 > bool trim_adaptor5(GStr& seq, int&l5, int &l3) {
1083 > if (adaptors5.Count()==0) return false;
1084   int rlen=seq.length();
1085   l5=0;
1086   l3=rlen-1;
1087   bool trimmed=false;
1088 < GStr wseq(seq.chars());
1088 > GStr wseq(seq);
1089   int wlen=rlen;
1090 < for (int ai=0;ai<adapters5.Count();ai++) {
1091 <  //if (adapters5[ai].is_empty()) continue;
1092 <  int alen=adapters5[ai].seq.length();
1093 <  GStr& aseq=adapters5[ai].seq;
1094 <  GXAlnInfo* l_bestaln=match_LeftEnd(aseq.chars(), alen, adapters5[ai].pz,
1095 <                 wseq.chars(), wlen, gxmem_l, 84);
1096 <  if (l_bestaln) {
1097 <     trimmed=true;
1098 <     l5+=l_bestaln->sr;
1099 <     delete l_bestaln;
1100 <     if (l5>=rlen) l5=rlen-1;
1101 <     if (l3-l5+1<min_read_len) return true;
1102 <     wseq=seq.substr(l5,l3-l5+1);
1103 <     wlen=wseq.length();
1090 > GXSeqData seqdata;
1091 > int numruns=revCompl ? 2 : 1;
1092 > GList<GXAlnInfo> bestalns(true, true, false);
1093 > for (int ai=0;ai<adaptors5.Count();ai++) {
1094 >   for (int r=0;r<numruns;r++) {
1095 >     if (r) {
1096 >          seqdata.update(adaptors5[ai].seqr.chars(), adaptors5[ai].seqr.length(),
1097 >                 adaptors5[ai].pzr, wseq.chars(), wlen, adaptors5[ai].amlen);
1098 >        }
1099 >     else {
1100 >            seqdata.update(adaptors5[ai].seq.chars(), adaptors5[ai].seq.length(),
1101 >                 adaptors5[ai].pz, wseq.chars(), wlen, adaptors5[ai].amlen);
1102 >        }
1103 >         GXAlnInfo* aln=match_adaptor(seqdata, adaptors5[ai].trim_type, gxmem_l, 90);
1104 >         if (aln) {
1105 >           if (aln->strong) {
1106 >                   trimmed=true;
1107 >                   bestalns.Add(aln);
1108 >                   break; //will check the rest next time
1109 >                   }
1110 >            else bestalns.Add(aln);
1111 >           }
1112 >         } //forward and reverse?
1113 >   if (trimmed) break; //will check the rest in the next cycle
1114 >  }//for each 5' adaptor
1115 >  if (bestalns.Count()>0) {
1116 >           GXAlnInfo* aln=bestalns[0];
1117 >           if (aln->sl-1 > wlen-aln->sr) {
1118 >                   //keep left side
1119 >                   l3-=(wlen-aln->sl+1);
1120 >                   if (l3<0) l3=0;
1121 >                   }
1122 >           else { //keep right side
1123 >                   l5+=aln->sr;
1124 >                   if (l5>=rlen) l5=rlen-1;
1125 >                   }
1126 >           //delete aln;
1127 >           //if (l3-l5+1<min_read_len) return true;
1128 >           wseq=seq.substr(l5,l3-l5+1);
1129 >           wlen=wseq.length();
1130 >           return true; //break the loops here to report a good find
1131       }
1132 <  }//for each 5' adapter
992 <  return trimmed;
1132 >  return false;
1133   }
1134  
1135   //convert qvs to/from phred64 from/to phread33
# Line 1060 | Line 1200
1200  
1201   #ifdef GDEBUG
1202   void showTrim(GStr& s, int l5, int l3) {
1203 <  if (l5>0) {
1203 >  if (l5>0 || l3==0) {
1204      color_bg(c_red);
1205      }
1206    for (int i=0;i<s.length()-1;i++) {
1207      if (i && i==l5) color_resetbg();
1208      fprintf(stderr, "%c", s[i]);
1209 <    if (i==l3) color_bg(c_red);
1209 >    if (i && i==l3) color_bg(c_red);
1210     }
1211    fprintf(stderr, "%c", s[s.length()-1]);
1212    color_reset();
# Line 1091 | Line 1231
1231   GStr wqv(rqv.chars());
1232   int w5=l5;
1233   int w3=l3;
1234 + //count Ns
1235 + b_totalIn+=rseq.length();
1236 + for (int i=0;i<rseq.length();i++) {
1237 + if (isACGT[(int)rseq[i]]==0) b_totalN++;
1238 + }
1239 +
1240   //first do the q-based trimming
1241   if (qvtrim_qmin!=0 && !wqv.is_empty() && qtrim(wqv, w5, w3)) { // qv-threshold trimming
1242 +   b_trimQ+=(w5-l5)+(l3-w3);
1243 +   num_trimQ++;
1244     if (w3-w5+1<min_read_len) {
1245       return 'Q'; //invalid read
1246       }
# Line 1105 | Line 1253
1253     } //qv trimming
1254   // N-trimming on the remaining read seq
1255   if (ntrim(wseq, w5, w3)) {
1256 +   //Note: ntrim resets w5 and w3
1257     //GMessage("before: %s\n",wseq.chars());
1258     //GMessage("after : %s (%d)\n",wseq.substr(w5,w3-w5+1).chars(),w3-w5+1);
1259 +   int trim3=(wseq.length()-1-w3);
1260 +   b_trimN+=w5+trim3;
1261 +   num_trimN++;
1262     l5+=w5;
1263 <   l3-=(wseq.length()-1-w3);
1263 >   l3-=trim3;
1264     if (w3-w5+1<min_read_len) {
1265       return 'N'; //to be trashed
1266       }
# Line 1120 | Line 1272
1272     w3=wseq.length()-1;
1273     }
1274   char trim_code;
1275 + //clean the more dirty end first - 3'
1276 + bool trimmedA=false;
1277 + bool trimmedT=false;
1278 + bool trimmedV=false;
1279 +
1280 + int prev_w5=0;
1281 + int prev_w3=0;
1282 + bool w3upd=true;
1283 + bool w5upd=true;
1284   do {
1285    trim_code=0;
1286 <  if (trim_poly5(wseq, w5, w3, polyA_seed)) {
1286 >  if (w3upd && trim_poly3(wseq, w5, w3, polyA_seed)) {
1287        trim_code='A';
1288 +      b_trimA+=(w5+(wseq.length()-1-w3));
1289 +      if (!trimmedA) { num_trimA++; trimmedA=true; }
1290        }
1291 <  else if (trim_poly5(wseq, w5, w3, polyT_seed)) {
1291 >  else if (w3upd && trim_poly3(wseq, w5, w3, polyT_seed)) {
1292        trim_code='T';
1293 +      b_trimT+=(w5+(wseq.length()-1-w3));
1294 +      if (!trimmedT) { num_trimT++; trimmedT=true; }
1295        }
1296 <  else if (trim_adapter5(wseq, w5, w3)) {
1132 <      trim_code='5';
1133 <      }
1134 <  if (trim_code) {
1135 <     #ifdef GDEBUG
1136 <      GMessage("#### TRIM by '%c' code ( w5-w3 = %d-%d ):\n",trim_code, w5,w3);
1137 <      showTrim(wseq, w5, w3);
1138 <     #endif
1139 <     int trimlen=wseq.length()-(w3-w5+1);
1140 <     num_trimmed5++;
1141 <     if (trimlen<min_trimmed5)
1142 <         min_trimmed5=trimlen;
1143 <     l5+=w5;
1144 <     l3-=(wseq.length()-1-w3);
1145 <     if (w3-w5+1<min_read_len) {
1146 <         return trim_code;
1147 <         }
1148 <      //-- keep only the w5..w3 range
1149 <      wseq=wseq.substr(w5, w3-w5+1);
1150 <      if (!wqv.is_empty())
1151 <         wqv=wqv.substr(w5, w3-w5+1);
1152 <      }// trimmed at 5' end
1153 < } while (trim_code);
1154 <
1155 < do {
1156 <  trim_code=0;
1157 <  if (trim_poly3(wseq, w5, w3, polyA_seed)) {
1296 >  else if (w5upd && trim_poly5(wseq, w5, w3, polyA_seed)) {
1297        trim_code='A';
1298 +      b_trimA+=(w5+(wseq.length()-1-w3));
1299 +      if (!trimmedA) { num_trimA++; trimmedA=true; }
1300        }
1301 <  else if (trim_poly3(wseq, w5, w3, polyT_seed)) {
1301 >  else if (w5upd && trim_poly5(wseq, w5, w3, polyT_seed)) {
1302        trim_code='T';
1303 +      b_trimT+=(w5+(wseq.length()-1-w3));
1304 +      if (!trimmedT) { num_trimT++; trimmedT=true; }
1305 +      }
1306 +  else if (trim_adaptor5(wseq, w5, w3)) {
1307 +      trim_code='V';
1308 +      b_trimV+=(w5+(wseq.length()-1-w3));
1309 +      if (!trimmedV) { num_trimV++; trimmedV=true; }
1310        }
1311 <  else if (trim_adapter3(wseq, w5, w3)) {
1312 <      trim_code='3';
1311 >  else if (trim_adaptor3(wseq, w5, w3)) {
1312 >      trim_code='V';
1313 >      b_trimV+=(w5+(wseq.length()-1-w3));
1314 >      if (!trimmedV) { num_trimV++; trimmedV=true; }
1315        }
1316    if (trim_code) {
1317 <     #ifdef GDEBUG
1317 >     w3upd=(w3!=prev_w3);
1318 >         w5upd=(w5!=prev_w5);
1319 >         if (w3upd) prev_w3=w3;
1320 >         if (w5upd) prev_w5=w5;
1321 >   #ifdef GDEBUG
1322       GMessage("#### TRIM by '%c' code ( w5-w3 = %d-%d ):\n",trim_code, w5,w3);
1323       showTrim(wseq, w5, w3);
1324 <     #endif
1325 <     int trimlen=wseq.length()-(w3-w5+1);
1172 <     num_trimmed3++;
1173 <     if (trimlen<min_trimmed3)
1174 <         min_trimmed3=trimlen;
1324 >   #endif
1325 >     //int trimlen=wseq.length()-(w3-w5+1);
1326       l5+=w5;
1327       l3-=(wseq.length()-1-w3);
1328       if (w3-w5+1<min_read_len) {
# Line 1183 | Line 1334
1334           wqv=wqv.substr(w5, w3-w5+1);
1335        }//trimming at 3' end
1336   } while (trim_code);
1186
1187
1337   if (doCollapse) {
1338     //keep read for later
1339     FqDupRec* dr=dhash.Find(wseq.chars());
# Line 1335 | Line 1484
1484      }
1485   }
1486  
1487 < void addAdapter(GVec<CASeqData>& adapters, GStr& seq) {
1487 > void addAdaptor(GVec<CASeqData>& adaptors, GStr& seq, GAlnTrimType trim_type) {
1488   //TODO: prepare CASeqData here, and collect hexamers as well
1489 +  if (seq.is_empty() || seq=="-" ||
1490 +          seq=="N/A" || seq==".") return;
1491 +
1492   CASeqData adata(revCompl);
1493 < int idx=adapters.Add(adata);
1493 > int idx=adaptors.Add(adata);
1494   if (idx<0) GError("Error: failed to add adaptor!\n");
1495 < adapters[idx].update(seq.chars());
1495 > adaptors[idx].trim_type=trim_type;
1496 > adaptors[idx].update(seq.chars());
1497   }
1498  
1499  
1500 < int loadAdapters(const char* fname) {
1500 > int loadAdaptors(const char* fname) {
1501    GLineReader lr(fname);
1502    char* l;
1503    while ((l=lr.nextLine())!=NULL) {
# Line 1360 | Line 1513
1513        #ifdef GDEBUG
1514            //s.reverse();
1515        #endif
1516 <        addAdapter(adapters3, s);
1516 >        addAdaptor(adaptors3, s, galn_TrimRight);
1517          continue;
1518          }
1519        }
# Line 1369 | Line 1522
1522        s.startTokenize("\t ;,:");
1523        GStr a5,a3;
1524        if (s.nextToken(a5))
1525 <         s.nextToken(a3);
1525 >            s.nextToken(a3);
1526 >        else continue; //no tokens on this line
1527 >      GAlnTrimType ttype5=galn_TrimLeft;
1528        a5.upper();
1529        a3.upper();
1530 +      if (a3.is_empty() || a3==a5 || a3=="=") {
1531 +         a3.clear();
1532 +         ttype5=galn_TrimEither;
1533 +         }
1534       #ifdef GDEBUG
1535       //   a5.reverse();
1536       //   a3.reverse();
1537       #endif
1538 <      addAdapter(adapters5, a5);
1539 <      addAdapter(adapters3, a3);
1538 >      addAdaptor(adaptors5, a5, ttype5);
1539 >      addAdaptor(adaptors3, a3, galn_TrimRight);
1540        }
1541     }
1542 <   return adapters5.Count()+adapters3.Count();
1542 >   return adaptors5.Count()+adaptors3.Count();
1543   }
1544  
1545   void setupFiles(FILE*& f_in, FILE*& f_in2, FILE*& f_out, FILE*& f_out2,

Diff Legend

Removed lines
+ Added lines
< Changed lines
> Changed lines