ViewVC Help
View File | Revision Log | Show Annotations | View Changeset | Root Listing
root/gclib/fqtrim/fqtrim.cpp
(Generate patch)
# Line 6 | Line 6
6   #include "GAlnExtend.h"
7  
8   #define USAGE "Usage:\n\
9 < fqtrim [{-5 <5adapter> -3 <3adapter>|-f <adapters_file>}] [-a <min_matchlen>]\\\n\
9 > fqtrim [{-5 <5adaptor> -3 <3adaptor>|-f <adaptors_file>}] [-a <min_matchlen>]\\\n\
10     [-R] [-q <minq> [-t <trim_max_len>]] [-p {64|33}] [-o <outsuffix>]\\\n\
11     [-l <minlen>] [-C] [-D] [-Q] [-n <rename_prefix>] [-r <discarded.lst>]\\\n\
12      <input.fq>[,<input_mates.fq>\n\
13   \n\
14 < Trim low quality bases at the 3' end and can trim adapter sequence(s), filter\n\
14 > Trim low quality bases at the 3' end and can trim adaptor sequence(s), filter\n\
15   for low complexity and collapse duplicate reads.\n\
16   If read pairs should be trimmed and kept together (i.e. without discarding\n\
17   one read in a pair), the two file names should be given delimited by a comma\n\
# Line 25 | Line 25
25      file(s) named <input>.<outsuffix> which will be created in the \n\
26      current (working) directory; (writes to stdout if -o- is given);\n\
27      a suffix ending with .gz, .gzip or .bz2 will enforce compression\n\
28 < -f  file with adapter sequences to trim, each line having this format:\n\
29 <    <5'-adapter-sequence> <3'-adapter-sequence>\n\
30 < -5  trim the given adapter or primer sequence at the 5' end of each read\n\
28 > -f  file with adaptor sequences to trim, each line having this format:\n\
29 >    <5'-adaptor-sequence> <3'-adaptor-sequence>\n\
30 > -5  trim the given adaptor or primer sequence at the 5' end of each read\n\
31      (e.g. -5 CGACAGGTTCAGAGTTCTACAGTCCGACGATC)\n\
32 < -3  trim the given adapter sequence at the 3' end of each read\n\
32 > -3  trim the given adaptor sequence at the 3' end of each read\n\
33      (e.g. -3 TCGTATGCCGTCTTCTGCTTG)\n\
34   -A  disable polyA/T trimming (enabled by default)\n\
35   -y  minimum length of exact match to adaptor sequence at the proper end (6)\n\
# Line 53 | Line 53
53   //   <zcmd> command, which must output to stdout (e.g. -z 'bzip2 -9 -c')\n
54  
55  
56 < // example 3' adapter for miRNAs: TCGTATGCCGTCTTCTGCTTG
56 > // example 3' adaptor for miRNAs: TCGTATGCCGTCTTCTGCTTG
57  
58   //For paired reads sequencing:
59   //3' : ACACTCTTTCCCTACACGACGCTCTTCCGATCT
# Line 109 | Line 109
109   const char *polyT_seed="TTTT";
110  
111   struct CASeqData {
112 <   //positional data for every possible hexamer in an adapter
113 <   GVec<uint16>* pz[4096]; //0-based coordinates of all possible hexamers in the adapter sequence
114 <   GVec<uint16>* pzr[4096]; //0-based coordinates of all possible hexamers for the reverse complement of the adapter sequence
115 <   GStr seq; //actual adapter sequence data
112 >   //positional data for every possible hexamer in an adaptor
113 >   GVec<uint16>* pz[4096]; //0-based coordinates of all possible hexamers in the adaptor sequence
114 >   GVec<uint16>* pzr[4096]; //0-based coordinates of all possible hexamers for the reverse complement of the adaptor sequence
115 >   GStr seq; //actual adaptor sequence data
116     GStr seqr; //reverse complement sequence
117 <   int amlen; //fraction of adapter length matching that's
117 >   int amlen; //fraction of adaptor length matching that's
118                //enough to consider the alignment
119 +   GAlnTrimType trim_type;
120     bool use_reverse;
121     CASeqData(bool rev=false):seq(),seqr(),
122               amlen(0), use_reverse(rev) {
123 +     trim_type=galn_None; //should be updated later!
124       for (int i=0;i<4096;i++) {
125          pz[i]=NULL;
126          pzr[i]=NULL;
# Line 148 | Line 150
150     }
151   };
152  
153 < GVec<CASeqData> adapters5;
154 < GVec<CASeqData> adapters3;
153 > GVec<CASeqData> adaptors5;
154 > GVec<CASeqData> adaptors3;
155  
156   CGreedyAlignData* gxmem_l=NULL;
157   CGreedyAlignData* gxmem_r=NULL;
# Line 202 | Line 204
204  
205   GHash<FqDupRec> dhash; //hash to keep track of duplicates
206  
207 < void addAdapter(GVec<CASeqData>& adapters, GStr& seq);
208 < int loadAdapters(const char* fname);
207 > void addAdaptor(GVec<CASeqData>& adaptors, GStr& seq, GAlnTrimType trim_type);
208 > int loadAdaptors(const char* fname);
209  
210   void setupFiles(FILE*& f_in, FILE*& f_in2, FILE*& f_out, FILE*& f_out2,
211                         GStr& s, GStr& infname, GStr& infname2);
# Line 223 | Line 225
225   int dust(GStr& seq);
226   bool trim_poly5(GStr &seq, int &l5, int &l3, const char* poly_seed); //returns true if any trimming occured
227   bool trim_poly3(GStr &seq, int &l5, int &l3, const char* poly_seed);
228 < bool trim_adapter5(GStr& seq, int &l5, int &l3); //returns true if any trimming occured
229 < bool trim_adapter3(GStr& seq, int &l5, int &l3);
228 > bool trim_adaptor5(GStr& seq, int &l5, int &l3); //returns true if any trimming occured
229 > bool trim_adaptor3(GStr& seq, int &l5, int &l3);
230  
231   void convertPhred(char* q, int len);
232   void convertPhred(GStr& q);
# Line 285 | Line 287
287       }
288    s=args.getOpt('f');
289    if (!s.is_empty()) {
290 <   loadAdapters(s.chars());
290 >   loadAdaptors(s.chars());
291     }
292 <  bool fileAdapters=adapters5.Count()+adapters3.Count();
292 >  bool fileAdaptors=adaptors5.Count()+adaptors3.Count();
293    s=args.getOpt('5');
294    if (!s.is_empty()) {
295 <    if (fileAdapters)
295 >    if (fileAdaptors)
296        GError("Error: options -5 and -f cannot be used together!\n");
297      s.upper();
298 <    addAdapter(adapters5, s);
298 >    addAdaptor(adaptors5, s, galn_TrimLeft);
299      }
300    s=args.getOpt('3');
301    if (!s.is_empty()) {
302 <    if (fileAdapters)
302 >    if (fileAdaptors)
303        GError("Error: options -3 and -f cannot be used together!\n");
304        s.upper();
305 <      addAdapter(adapters3, s);
305 >      addAdaptor(adaptors3, s, galn_TrimRight);
306      }
307    s=args.getOpt('y');
308    if (!s.is_empty()) {
# Line 325 | Line 327
327      openfw(freport, args, 'r');
328    char* infile=NULL;
329  
330 <  if (adapters5.Count()>0)
330 >  if (adaptors5.Count()>0)
331      //gxmem_l=new CGreedyAlignData(match_reward, mismatch_penalty, Xdrop-2);
332          gxmem_l=new CGreedyAlignData(match_reward, mismatch_penalty, Xdrop);
333 <  if (adapters3.Count()>0)
333 >  if (adaptors3.Count()>0)
334      gxmem_r=new CGreedyAlignData(match_reward, mismatch_penalty, Xdrop);
335  
336    while ((infile=args.nextNonOpt())!=NULL) {
# Line 502 | Line 504
504         if (trash_poly>0)
505           GMessage("   Trashed by poly-A/T:%9d\n", trash_poly);
506         if (trash_A5>0)
507 <         GMessage(" Trashed by 5' adapter:%9d\n", trash_A5);
507 >         GMessage(" Trashed by 5' adaptor:%9d\n", trash_A5);
508         if (trash_A3>0)
509 <         GMessage(" Trashed by 3' adapter:%9d\n", trash_A3);
509 >         GMessage(" Trashed by 3' adaptor:%9d\n", trash_A3);
510         }
511      if (trashReport) {
512            FWCLOSE(freport);
# Line 945 | Line 947
947   return false;
948   }
949  
950 < bool trim_adapter3(GStr& seq, int&l5, int &l3) {
951 < if (adapters3.Count()==0) return false;
950 > bool trim_adaptor3(GStr& seq, int&l5, int &l3) {
951 > if (adaptors3.Count()==0) return false;
952   int rlen=seq.length();
953   l5=0;
954   l3=rlen-1;
# Line 954 | Line 956
956   GStr wseq(seq);
957   int wlen=rlen;
958   GXSeqData seqdata;
959 < for (int ai=0;ai<adapters3.Count();ai++) {
960 <  seqdata.update(adapters3[ai].seq.chars(), adapters3[ai].seq.length(),
961 <       adapters3[ai].pz, wseq.chars(), wlen, adapters3[ai].amlen);
962 <  GXAlnInfo* bestaln=match_RightEnd(seqdata, gxmem_r, 86);
959 > int numruns=revCompl ? 2 : 1;
960 > for (int ai=0;ai<adaptors3.Count();ai++) {
961 >   for (int r=0;r<numruns;r++) {
962 >     if (r) {
963 >          seqdata.update(adaptors3[ai].seqr.chars(), adaptors3[ai].seqr.length(),
964 >                 adaptors3[ai].pzr, wseq.chars(), wlen, adaptors3[ai].amlen);
965 >        }
966 >     else {
967 >            seqdata.update(adaptors3[ai].seq.chars(), adaptors3[ai].seq.length(),
968 >                 adaptors3[ai].pz, wseq.chars(), wlen, adaptors3[ai].amlen);
969 >        }
970 >
971 >  GXAlnInfo* bestaln=match_adaptor(seqdata, adaptors3[ai].trim_type, gxmem_r, 86);
972    if (bestaln) {
973       trimmed=true;
974       //keep unmatched region on the left OR right (the longer one)
# Line 975 | Line 986
986       wseq=seq.substr(l5,l3-l5+1);
987       wlen=wseq.length();
988       }
989 <  }//for each 5' adapter
989 >   }//forward and reverse adaptors
990 >  }//for each 3' adaptor
991    return trimmed;
992   }
993  
994 < bool trim_adapter5(GStr& seq, int&l5, int &l3) {
995 < if (adapters5.Count()==0) return false;
994 > bool trim_adaptor5(GStr& seq, int&l5, int &l3) {
995 > if (adaptors5.Count()==0) return false;
996   int rlen=seq.length();
997   l5=0;
998   l3=rlen-1;
# Line 988 | Line 1000
1000   GStr wseq(seq);
1001   int wlen=rlen;
1002   GXSeqData seqdata;
1003 < for (int ai=0;ai<adapters5.Count();ai++) {
1004 <   seqdata.update(adapters5[ai].seq.chars(), adapters5[ai].seq.length(),
1005 <       adapters5[ai].pz, wseq.chars(), wlen, adapters5[ai].amlen);
1006 <  GXAlnInfo* bestaln=match_LeftEnd(seqdata, gxmem_l, 90);
1007 <  if (bestaln) {
1008 <     trimmed=true;
1009 <     if (bestaln->sl-1 > wlen-bestaln->sr) {
1010 <         //keep left side
1011 <         l3-=(wlen-bestaln->sl+1);
1012 <         if (l3<0) l3=0;
1013 <         }
1014 <     else { //keep right side
1015 <         l5+=bestaln->sr;
1016 <         if (l5>=rlen) l5=rlen-1;
1017 <         }
1018 <     delete bestaln;
1019 <     if (l3-l5+1<min_read_len) return true;
1020 <     wseq=seq.substr(l5,l3-l5+1);
1021 <     wlen=wseq.length();
1022 <     }
1023 <  }//for each 5' adapter
1003 > int numruns=revCompl ? 2 : 1;
1004 > for (int ai=0;ai<adaptors5.Count();ai++) {
1005 >   for (int r=0;r<numruns;r++) {
1006 >     if (r) {
1007 >          seqdata.update(adaptors5[ai].seqr.chars(), adaptors5[ai].seqr.length(),
1008 >                 adaptors5[ai].pzr, wseq.chars(), wlen, adaptors5[ai].amlen);
1009 >        }
1010 >     else {
1011 >            seqdata.update(adaptors5[ai].seq.chars(), adaptors5[ai].seq.length(),
1012 >                 adaptors5[ai].pz, wseq.chars(), wlen, adaptors5[ai].amlen);
1013 >        }
1014 >         GXAlnInfo* bestaln=match_adaptor(seqdata, adaptors5[ai].trim_type, gxmem_l, 90);
1015 >         if (bestaln) {
1016 >           trimmed=true;
1017 >           if (bestaln->sl-1 > wlen-bestaln->sr) {
1018 >                   //keep left side
1019 >                   l3-=(wlen-bestaln->sl+1);
1020 >                   if (l3<0) l3=0;
1021 >                   }
1022 >           else { //keep right side
1023 >                   l5+=bestaln->sr;
1024 >                   if (l5>=rlen) l5=rlen-1;
1025 >                   }
1026 >           delete bestaln;
1027 >           if (l3-l5+1<min_read_len) return true;
1028 >           wseq=seq.substr(l5,l3-l5+1);
1029 >           wlen=wseq.length();
1030 >           }
1031 >         } //forward and reverse?
1032 >  }//for each 5' adaptor
1033    return trimmed;
1034   }
1035  
# Line 1140 | Line 1161
1161     w3=wseq.length()-1;
1162     }
1163   char trim_code;
1164 + //clean the more dirty end first - 3'
1165 + int prev_w5=0;
1166 + int prev_w3=0;
1167 + bool w3upd=true;
1168 + bool w5upd=true;
1169   do {
1170    trim_code=0;
1171 <  if (trim_poly5(wseq, w5, w3, polyA_seed)) {
1171 >  if (w3upd && trim_poly3(wseq, w5, w3, polyA_seed)) {
1172        trim_code='A';
1173        }
1174 <  else if (trim_poly5(wseq, w5, w3, polyT_seed)) {
1174 >  else if (w3upd && trim_poly3(wseq, w5, w3, polyT_seed)) {
1175        trim_code='T';
1176        }
1177 <  else if (trim_adapter5(wseq, w5, w3)) {
1152 <      trim_code='5';
1153 <      }
1154 <  if (trim_code) {
1155 <     #ifdef GDEBUG
1156 <      GMessage("#### TRIM by '%c' code ( w5-w3 = %d-%d ):\n",trim_code, w5,w3);
1157 <      showTrim(wseq, w5, w3);
1158 <     #endif
1159 <     int trimlen=wseq.length()-(w3-w5+1);
1160 <     num_trimmed5++;
1161 <     if (trimlen<min_trimmed5)
1162 <         min_trimmed5=trimlen;
1163 <     l5+=w5;
1164 <     l3-=(wseq.length()-1-w3);
1165 <     if (w3-w5+1<min_read_len) {
1166 <         return trim_code;
1167 <         }
1168 <      //-- keep only the w5..w3 range
1169 <      wseq=wseq.substr(w5, w3-w5+1);
1170 <      if (!wqv.is_empty())
1171 <         wqv=wqv.substr(w5, w3-w5+1);
1172 <      }// trimmed at 5' end
1173 < } while (trim_code);
1174 <
1175 < do {
1176 <  trim_code=0;
1177 <  if (trim_poly3(wseq, w5, w3, polyA_seed)) {
1177 >  else if (w5upd && trim_poly5(wseq, w5, w3, polyA_seed)) {
1178        trim_code='A';
1179        }
1180 <  else if (trim_poly3(wseq, w5, w3, polyT_seed)) {
1180 >  else if (w5upd && trim_poly5(wseq, w5, w3, polyT_seed)) {
1181        trim_code='T';
1182        }
1183 <  else if (trim_adapter3(wseq, w5, w3)) {
1183 >  else if (trim_adaptor5(wseq, w5, w3)) {
1184 >      trim_code='5';
1185 >      }
1186 >  else if (trim_adaptor3(wseq, w5, w3)) {
1187        trim_code='3';
1188        }
1189    if (trim_code) {
1190 <     #ifdef GDEBUG
1190 >     w3upd=(w3!=prev_w3);
1191 >         w5upd=(w5!=prev_w5);
1192 >         if (w3upd) prev_w3=w3;
1193 >         if (w5upd) prev_w5=w5;
1194 >   #ifdef GDEBUG
1195       GMessage("#### TRIM by '%c' code ( w5-w3 = %d-%d ):\n",trim_code, w5,w3);
1196       showTrim(wseq, w5, w3);
1197 <     #endif
1197 >   #endif
1198       int trimlen=wseq.length()-(w3-w5+1);
1199       num_trimmed3++;
1200       if (trimlen<min_trimmed3)
# Line 1204 | Line 1211
1211        }//trimming at 3' end
1212   } while (trim_code);
1213  
1207
1214   if (doCollapse) {
1215     //keep read for later
1216     FqDupRec* dr=dhash.Find(wseq.chars());
# Line 1355 | Line 1361
1361      }
1362   }
1363  
1364 < void addAdapter(GVec<CASeqData>& adapters, GStr& seq) {
1364 > void addAdaptor(GVec<CASeqData>& adaptors, GStr& seq, GAlnTrimType trim_type) {
1365   //TODO: prepare CASeqData here, and collect hexamers as well
1366 +  if (seq.is_empty() || seq=="-" ||
1367 +          seq=="N/A" || seq==".") return;
1368 +
1369   CASeqData adata(revCompl);
1370 < int idx=adapters.Add(adata);
1370 > int idx=adaptors.Add(adata);
1371   if (idx<0) GError("Error: failed to add adaptor!\n");
1372 < adapters[idx].update(seq.chars());
1372 > adaptors[idx].trim_type=trim_type;
1373 > adaptors[idx].update(seq.chars());
1374   }
1375  
1376  
1377 < int loadAdapters(const char* fname) {
1377 > int loadAdaptors(const char* fname) {
1378    GLineReader lr(fname);
1379    char* l;
1380    while ((l=lr.nextLine())!=NULL) {
# Line 1380 | Line 1390
1390        #ifdef GDEBUG
1391            //s.reverse();
1392        #endif
1393 <        addAdapter(adapters3, s);
1393 >        addAdaptor(adaptors3, s, galn_TrimRight);
1394          continue;
1395          }
1396        }
# Line 1389 | Line 1399
1399        s.startTokenize("\t ;,:");
1400        GStr a5,a3;
1401        if (s.nextToken(a5))
1402 <         s.nextToken(a3);
1402 >            s.nextToken(a3);
1403 >        else continue; //no tokens on this line
1404 >      GAlnTrimType ttype5=galn_TrimLeft;
1405        a5.upper();
1406        a3.upper();
1407 +      if (a3.is_empty() || a3==a5 || a3=="=") {
1408 +         a3.clear();
1409 +         ttype5=galn_TrimEither;
1410 +         }
1411       #ifdef GDEBUG
1412       //   a5.reverse();
1413       //   a3.reverse();
1414       #endif
1415 <      addAdapter(adapters5, a5);
1416 <      addAdapter(adapters3, a3);
1415 >      addAdaptor(adaptors5, a5, ttype5);
1416 >      addAdaptor(adaptors3, a3, galn_TrimRight);
1417        }
1418     }
1419 <   return adapters5.Count()+adapters3.Count();
1419 >   return adaptors5.Count()+adaptors3.Count();
1420   }
1421  
1422   void setupFiles(FILE*& f_in, FILE*& f_in2, FILE*& f_out, FILE*& f_out2,

Diff Legend

Removed lines
+ Added lines
< Changed lines
> Changed lines