ViewVC Help
View File | Revision Log | Show Annotations | View Changeset | Root Listing
root/gclib/fqtrim/fqtrim.cpp
(Generate patch)
# Line 6 | Line 6
6   #include "GAlnExtend.h"
7  
8   #define USAGE "Usage:\n\
9 < fqtrim [{-5 <5adapter> -3 <3adapter>|-f <adapters_file>}] [-a <min_matchlen>]\\\n\
9 > fqtrim [{-5 <5adaptor> -3 <3adaptor>|-f <adaptors_file>}] [-a <min_matchlen>]\\\n\
10     [-R] [-q <minq> [-t <trim_max_len>]] [-p {64|33}] [-o <outsuffix>]\\\n\
11     [-l <minlen>] [-C] [-D] [-Q] [-n <rename_prefix>] [-r <discarded.lst>]\\\n\
12      <input.fq>[,<input_mates.fq>\n\
13   \n\
14 < Trim low quality bases at the 3' end and can trim adapter sequence(s), filter\n\
14 > Trim low quality bases at the 3' end and can trim adaptor sequence(s), filter\n\
15   for low complexity and collapse duplicate reads.\n\
16   If read pairs should be trimmed and kept together (i.e. without discarding\n\
17   one read in a pair), the two file names should be given delimited by a comma\n\
# Line 25 | Line 25
25      file(s) named <input>.<outsuffix> which will be created in the \n\
26      current (working) directory; (writes to stdout if -o- is given);\n\
27      a suffix ending with .gz, .gzip or .bz2 will enforce compression\n\
28 < -f  file with adapter sequences to trim, each line having this format:\n\
29 <    <5'-adapter-sequence> <3'-adapter-sequence>\n\
30 < -5  trim the given adapter or primer sequence at the 5' end of each read\n\
28 > -f  file with adaptor sequences to trim, each line having this format:\n\
29 >    <5'-adaptor-sequence> <3'-adaptor-sequence>\n\
30 > -5  trim the given adaptor or primer sequence at the 5' end of each read\n\
31      (e.g. -5 CGACAGGTTCAGAGTTCTACAGTCCGACGATC)\n\
32 < -3  trim the given adapter sequence at the 3' end of each read\n\
32 > -3  trim the given adaptor sequence at the 3' end of each read\n\
33      (e.g. -3 TCGTATGCCGTCTTCTGCTTG)\n\
34   -A  disable polyA/T trimming (enabled by default)\n\
35   -y  minimum length of exact match to adaptor sequence at the proper end (6)\n\
# Line 53 | Line 53
53   //   <zcmd> command, which must output to stdout (e.g. -z 'bzip2 -9 -c')\n
54  
55  
56 < // example 3' adapter for miRNAs: TCGTATGCCGTCTTCTGCTTG
56 > // example 3' adaptor for miRNAs: TCGTATGCCGTCTTCTGCTTG
57  
58   //For paired reads sequencing:
59   //3' : ACACTCTTTCCCTACACGACGCTCTTCCGATCT
# Line 94 | Line 94
94   int qv_cvtadd=0; //could be -31 or +31
95  
96   // adaptor matching metrics -- for X-drop ungapped extension
97 + //const int match_reward=2;
98 + //const int mismatch_penalty=3;
99   const int match_reward=2;
100 < const int mismatch_penalty=3;
101 < const int Xdrop=8;
100 > const int mismatch_penalty=4;
101 > const int Xdrop=10;
102  
103   const int poly_m_score=2; //match score
104   const int poly_mis_score=-3; //mismatch
# Line 107 | Line 109
109   const char *polyT_seed="TTTT";
110  
111   struct CASeqData {
112 <   //positional data for every possible hexamer in an adapter
113 <   GVec<uint16>* pz[4096]; //0-based coordinates of all possible hexamers in the adapter sequence
114 <   GVec<uint16>* pzr[4096]; //0-based coordinates of all possible hexamers for the reverse complement of the adapter sequence
115 <   GStr seq; //actual adapter sequence data
112 >   //positional data for every possible hexamer in an adaptor
113 >   GVec<uint16>* pz[4096]; //0-based coordinates of all possible hexamers in the adaptor sequence
114 >   GVec<uint16>* pzr[4096]; //0-based coordinates of all possible hexamers for the reverse complement of the adaptor sequence
115 >   GStr seq; //actual adaptor sequence data
116     GStr seqr; //reverse complement sequence
117 +   int amlen; //fraction of adaptor length matching that's
118 +              //enough to consider the alignment
119 +   GAlnTrimType trim_type;
120     bool use_reverse;
121 <   CASeqData(bool rev=false):seq(),seqr() {
122 <     use_reverse=rev;
121 >   CASeqData(bool rev=false):seq(),seqr(),
122 >             amlen(0), use_reverse(rev) {
123 >     trim_type=galn_None; //should be updated later!
124       for (int i=0;i<4096;i++) {
125          pz[i]=NULL;
126          pzr[i]=NULL;
# Line 122 | Line 128
128       }
129  
130     void update(const char* s) {
131 <   seq=s;
132 <   table6mers(seq.chars(), seq.length(), pz);
133 <   if (!use_reverse) return;
134 <   //reverse complement
135 <   seqr=s;
136 <   int slen=seq.length();
137 <   for (int i=0;i<slen;i++)
138 <     seqr[i]=ntComplement(seq[slen-i-1]);
139 <   table6mers(seqr.chars(), seqr.length(), pzr);
131 >         seq=s;
132 >         table6mers(seq.chars(), seq.length(), pz);
133 >         amlen=iround(double(seq.length())*0.8);
134 >         if (amlen<12)
135 >                amlen=12;
136 >         if (!use_reverse) return;
137 >         //reverse complement
138 >         seqr=s;
139 >         int slen=seq.length();
140 >         for (int i=0;i<slen;i++)
141 >           seqr[i]=ntComplement(seq[slen-i-1]);
142 >         table6mers(seqr.chars(), seqr.length(), pzr);
143     }
144  
145     ~CASeqData() {
# Line 141 | Line 150
150     }
151   };
152  
153 < GVec<CASeqData> adapters5;
154 < GVec<CASeqData> adapters3;
153 > GVec<CASeqData> adaptors5;
154 > GVec<CASeqData> adaptors3;
155  
156   CGreedyAlignData* gxmem_l=NULL;
157   CGreedyAlignData* gxmem_r=NULL;
# Line 195 | Line 204
204  
205   GHash<FqDupRec> dhash; //hash to keep track of duplicates
206  
207 < void addAdapter(GVec<CASeqData>& adapters, GStr& seq);
208 < int loadAdapters(const char* fname);
207 > void addAdaptor(GVec<CASeqData>& adaptors, GStr& seq, GAlnTrimType trim_type);
208 > int loadAdaptors(const char* fname);
209  
210   void setupFiles(FILE*& f_in, FILE*& f_in2, FILE*& f_out, FILE*& f_out2,
211                         GStr& s, GStr& infname, GStr& infname2);
# Line 216 | Line 225
225   int dust(GStr& seq);
226   bool trim_poly5(GStr &seq, int &l5, int &l3, const char* poly_seed); //returns true if any trimming occured
227   bool trim_poly3(GStr &seq, int &l5, int &l3, const char* poly_seed);
228 < bool trim_adapter5(GStr& seq, int &l5, int &l3); //returns true if any trimming occured
229 < bool trim_adapter3(GStr& seq, int &l5, int &l3);
228 > bool trim_adaptor5(GStr& seq, int &l5, int &l3); //returns true if any trimming occured
229 > bool trim_adaptor3(GStr& seq, int &l5, int &l3);
230  
231   void convertPhred(char* q, int len);
232   void convertPhred(GStr& q);
# Line 278 | Line 287
287       }
288    s=args.getOpt('f');
289    if (!s.is_empty()) {
290 <   loadAdapters(s.chars());
290 >   loadAdaptors(s.chars());
291     }
292 <  bool fileAdapters=adapters5.Count()+adapters3.Count();
292 >  bool fileAdaptors=adaptors5.Count()+adaptors3.Count();
293    s=args.getOpt('5');
294    if (!s.is_empty()) {
295 <    if (fileAdapters)
295 >    if (fileAdaptors)
296        GError("Error: options -5 and -f cannot be used together!\n");
297      s.upper();
298 <    addAdapter(adapters5, s);
298 >    addAdaptor(adaptors5, s, galn_TrimLeft);
299      }
300    s=args.getOpt('3');
301    if (!s.is_empty()) {
302 <    if (fileAdapters)
302 >    if (fileAdaptors)
303        GError("Error: options -3 and -f cannot be used together!\n");
304        s.upper();
305 <      addAdapter(adapters3, s);
305 >      addAdaptor(adaptors3, s, galn_TrimRight);
306      }
307    s=args.getOpt('y');
308    if (!s.is_empty()) {
# Line 318 | Line 327
327      openfw(freport, args, 'r');
328    char* infile=NULL;
329  
330 <  if (adapters5.Count()>0)
331 <    gxmem_l=new CGreedyAlignData(match_reward, mismatch_penalty, Xdrop-2);
332 <  if (adapters3.Count()>0)
330 >  if (adaptors5.Count()>0)
331 >    //gxmem_l=new CGreedyAlignData(match_reward, mismatch_penalty, Xdrop-2);
332 >        gxmem_l=new CGreedyAlignData(match_reward, mismatch_penalty, Xdrop);
333 >  if (adaptors3.Count()>0)
334      gxmem_r=new CGreedyAlignData(match_reward, mismatch_penalty, Xdrop);
335  
336    while ((infile=args.nextNonOpt())!=NULL) {
# Line 391 | Line 401
401              } //pair read
402         if (tcode>1) { //trashed
403           #ifdef GDEBUG
404 <         GMessage(" !!!!TRASH => 'N'\n");
404 >         GMessage(" !!!!TRASH code = %c\n",tcode);
405           #endif
406            if (tcode=='s') trash_s++;
407            else if (tcode=='A' || tcode=='T') trash_poly++;
# Line 494 | Line 504
504         if (trash_poly>0)
505           GMessage("   Trashed by poly-A/T:%9d\n", trash_poly);
506         if (trash_A5>0)
507 <         GMessage(" Trashed by 5' adapter:%9d\n", trash_A5);
507 >         GMessage(" Trashed by 5' adaptor:%9d\n", trash_A5);
508         if (trash_A3>0)
509 <         GMessage(" Trashed by 3' adapter:%9d\n", trash_A3);
509 >         GMessage(" Trashed by 3' adaptor:%9d\n", trash_A3);
510         }
511      if (trashReport) {
512            FWCLOSE(freport);
# Line 937 | Line 947
947   return false;
948   }
949  
950 < bool trim_adapter3(GStr& seq, int&l5, int &l3) {
951 < if (adapters3.Count()==0) return false;
950 > bool trim_adaptor3(GStr& seq, int&l5, int &l3) {
951 > if (adaptors3.Count()==0) return false;
952   int rlen=seq.length();
953   l5=0;
954   l3=rlen-1;
955   bool trimmed=false;
956 < GStr wseq(seq.chars());
956 > GStr wseq(seq);
957   int wlen=rlen;
958 < for (int ai=0;ai<adapters3.Count();ai++) {
959 <  //if (adapters3[ai].is_empty()) continue;
960 <  int alen=adapters3[ai].seq.length();
961 <  GStr& aseq=adapters3[ai].seq;
962 <  GXAlnInfo* r_bestaln=match_RightEnd(aseq.chars(), alen, adapters3[ai].pz,
963 <                            wseq.chars(), wlen, gxmem_r, 74);
964 <  if (r_bestaln) {
965 <     trimmed=true;
966 <     //keep unmatched region on the left, if any
967 <     l3-=(wlen-r_bestaln->sl+1);
968 <     delete r_bestaln;
969 <     if (l3<0) l3=0;
970 <     if (l3-l5+1<min_read_len) return true;
971 <     wseq=seq.substr(l5,l3-l5+1);
972 <     wlen=wseq.length();
958 > GXSeqData seqdata;
959 > int numruns=revCompl ? 2 : 1;
960 > GList<GXAlnInfo> bestalns(true, true, false);
961 > for (int ai=0;ai<adaptors3.Count();ai++) {
962 >   for (int r=0;r<numruns;r++) {
963 >     if (r) {
964 >          seqdata.update(adaptors3[ai].seqr.chars(), adaptors3[ai].seqr.length(),
965 >                 adaptors3[ai].pzr, wseq.chars(), wlen, adaptors3[ai].amlen);
966 >        }
967 >     else {
968 >            seqdata.update(adaptors3[ai].seq.chars(), adaptors3[ai].seq.length(),
969 >                 adaptors3[ai].pz, wseq.chars(), wlen, adaptors3[ai].amlen);
970 >        }
971 >     GXAlnInfo* aln=match_adaptor(seqdata, adaptors3[ai].trim_type, gxmem_r, 86);
972 >         if (aln) {
973 >           if (aln->strong) {
974 >                   trimmed=true;
975 >                   bestalns.Add(aln);
976 >                   break; //will check the rest next time
977 >                   }
978 >            else bestalns.Add(aln);
979 >           }
980 >   }//forward and reverse adaptors
981 >   if (trimmed) break; //will check the rest in the next cycle
982 >  }//for each 3' adaptor
983 > if (bestalns.Count()>0) {
984 >           GXAlnInfo* aln=bestalns[0];
985 >           if (aln->sl-1 > wlen-aln->sr) {
986 >                   //keep left side
987 >                   l3-=(wlen-aln->sl+1);
988 >                   if (l3<0) l3=0;
989 >                   }
990 >           else { //keep right side
991 >                   l5+=aln->sr;
992 >                   if (l5>=rlen) l5=rlen-1;
993 >                   }
994 >           //delete aln;
995 >           //if (l3-l5+1<min_read_len) return true;
996 >           wseq=seq.substr(l5,l3-l5+1);
997 >           wlen=wseq.length();
998 >           return true; //break the loops here to report a good find
999       }
1000 <  }//for each 5' adapter
965 <  return trimmed;
1000 >  return false;
1001   }
1002  
1003 < bool trim_adapter5(GStr& seq, int&l5, int &l3) {
1004 < if (adapters5.Count()==0) return false;
1003 > bool trim_adaptor5(GStr& seq, int&l5, int &l3) {
1004 > if (adaptors5.Count()==0) return false;
1005   int rlen=seq.length();
1006   l5=0;
1007   l3=rlen-1;
1008   bool trimmed=false;
1009 < GStr wseq(seq.chars());
1009 > GStr wseq(seq);
1010   int wlen=rlen;
1011 < for (int ai=0;ai<adapters5.Count();ai++) {
1012 <  //if (adapters5[ai].is_empty()) continue;
1013 <  int alen=adapters5[ai].seq.length();
1014 <  GStr& aseq=adapters5[ai].seq;
1015 <  GXAlnInfo* l_bestaln=match_LeftEnd(aseq.chars(), alen, adapters5[ai].pz,
1016 <                 wseq.chars(), wlen, gxmem_l, 84);
1017 <  if (l_bestaln) {
1018 <     trimmed=true;
1019 <     l5+=l_bestaln->sr;
1020 <     delete l_bestaln;
1021 <     if (l5>=rlen) l5=rlen-1;
1022 <     if (l3-l5+1<min_read_len) return true;
1023 <     wseq=seq.substr(l5,l3-l5+1);
1024 <     wlen=wseq.length();
1011 > GXSeqData seqdata;
1012 > int numruns=revCompl ? 2 : 1;
1013 > GList<GXAlnInfo> bestalns(true, true, false);
1014 > for (int ai=0;ai<adaptors5.Count();ai++) {
1015 >   for (int r=0;r<numruns;r++) {
1016 >     if (r) {
1017 >          seqdata.update(adaptors5[ai].seqr.chars(), adaptors5[ai].seqr.length(),
1018 >                 adaptors5[ai].pzr, wseq.chars(), wlen, adaptors5[ai].amlen);
1019 >        }
1020 >     else {
1021 >            seqdata.update(adaptors5[ai].seq.chars(), adaptors5[ai].seq.length(),
1022 >                 adaptors5[ai].pz, wseq.chars(), wlen, adaptors5[ai].amlen);
1023 >        }
1024 >         GXAlnInfo* aln=match_adaptor(seqdata, adaptors5[ai].trim_type, gxmem_l, 90);
1025 >         if (aln) {
1026 >           if (aln->strong) {
1027 >                   trimmed=true;
1028 >                   bestalns.Add(aln);
1029 >                   break; //will check the rest next time
1030 >                   }
1031 >            else bestalns.Add(aln);
1032 >           }
1033 >         } //forward and reverse?
1034 >   if (trimmed) break; //will check the rest in the next cycle
1035 >  }//for each 5' adaptor
1036 >  if (bestalns.Count()>0) {
1037 >           GXAlnInfo* aln=bestalns[0];
1038 >           if (aln->sl-1 > wlen-aln->sr) {
1039 >                   //keep left side
1040 >                   l3-=(wlen-aln->sl+1);
1041 >                   if (l3<0) l3=0;
1042 >                   }
1043 >           else { //keep right side
1044 >                   l5+=aln->sr;
1045 >                   if (l5>=rlen) l5=rlen-1;
1046 >                   }
1047 >           //delete aln;
1048 >           //if (l3-l5+1<min_read_len) return true;
1049 >           wseq=seq.substr(l5,l3-l5+1);
1050 >           wlen=wseq.length();
1051 >           return true; //break the loops here to report a good find
1052       }
1053 <  }//for each 5' adapter
992 <  return trimmed;
1053 >  return false;
1054   }
1055  
1056   //convert qvs to/from phred64 from/to phread33
# Line 1060 | Line 1121
1121  
1122   #ifdef GDEBUG
1123   void showTrim(GStr& s, int l5, int l3) {
1124 <  if (l5>0) {
1124 >  if (l5>0 || l3==0) {
1125      color_bg(c_red);
1126      }
1127    for (int i=0;i<s.length()-1;i++) {
1128      if (i && i==l5) color_resetbg();
1129      fprintf(stderr, "%c", s[i]);
1130 <    if (i==l3) color_bg(c_red);
1130 >    if (i && i==l3) color_bg(c_red);
1131     }
1132    fprintf(stderr, "%c", s[s.length()-1]);
1133    color_reset();
# Line 1120 | Line 1181
1181     w3=wseq.length()-1;
1182     }
1183   char trim_code;
1184 + //clean the more dirty end first - 3'
1185 + int prev_w5=0;
1186 + int prev_w3=0;
1187 + bool w3upd=true;
1188 + bool w5upd=true;
1189   do {
1190    trim_code=0;
1191 <  if (trim_poly5(wseq, w5, w3, polyA_seed)) {
1191 >  if (w3upd && trim_poly3(wseq, w5, w3, polyA_seed)) {
1192        trim_code='A';
1193        }
1194 <  else if (trim_poly5(wseq, w5, w3, polyT_seed)) {
1194 >  else if (w3upd && trim_poly3(wseq, w5, w3, polyT_seed)) {
1195        trim_code='T';
1196        }
1197 <  else if (trim_adapter5(wseq, w5, w3)) {
1132 <      trim_code='5';
1133 <      }
1134 <  if (trim_code) {
1135 <     #ifdef GDEBUG
1136 <      GMessage("#### TRIM by '%c' code ( w5-w3 = %d-%d ):\n",trim_code, w5,w3);
1137 <      showTrim(wseq, w5, w3);
1138 <     #endif
1139 <     int trimlen=wseq.length()-(w3-w5+1);
1140 <     num_trimmed5++;
1141 <     if (trimlen<min_trimmed5)
1142 <         min_trimmed5=trimlen;
1143 <     l5+=w5;
1144 <     l3-=(wseq.length()-1-w3);
1145 <     if (w3-w5+1<min_read_len) {
1146 <         return trim_code;
1147 <         }
1148 <      //-- keep only the w5..w3 range
1149 <      wseq=wseq.substr(w5, w3-w5+1);
1150 <      if (!wqv.is_empty())
1151 <         wqv=wqv.substr(w5, w3-w5+1);
1152 <      }// trimmed at 5' end
1153 < } while (trim_code);
1154 <
1155 < do {
1156 <  trim_code=0;
1157 <  if (trim_poly3(wseq, w5, w3, polyA_seed)) {
1197 >  else if (w5upd && trim_poly5(wseq, w5, w3, polyA_seed)) {
1198        trim_code='A';
1199        }
1200 <  else if (trim_poly3(wseq, w5, w3, polyT_seed)) {
1200 >  else if (w5upd && trim_poly5(wseq, w5, w3, polyT_seed)) {
1201        trim_code='T';
1202        }
1203 <  else if (trim_adapter3(wseq, w5, w3)) {
1203 >  else if (trim_adaptor5(wseq, w5, w3)) {
1204 >      trim_code='5';
1205 >      }
1206 >  else if (trim_adaptor3(wseq, w5, w3)) {
1207        trim_code='3';
1208        }
1209    if (trim_code) {
1210 <     #ifdef GDEBUG
1210 >     w3upd=(w3!=prev_w3);
1211 >         w5upd=(w5!=prev_w5);
1212 >         if (w3upd) prev_w3=w3;
1213 >         if (w5upd) prev_w5=w5;
1214 >   #ifdef GDEBUG
1215       GMessage("#### TRIM by '%c' code ( w5-w3 = %d-%d ):\n",trim_code, w5,w3);
1216       showTrim(wseq, w5, w3);
1217 <     #endif
1217 >   #endif
1218       int trimlen=wseq.length()-(w3-w5+1);
1219       num_trimmed3++;
1220       if (trimlen<min_trimmed3)
# Line 1184 | Line 1231
1231        }//trimming at 3' end
1232   } while (trim_code);
1233  
1187
1234   if (doCollapse) {
1235     //keep read for later
1236     FqDupRec* dr=dhash.Find(wseq.chars());
# Line 1335 | Line 1381
1381      }
1382   }
1383  
1384 < void addAdapter(GVec<CASeqData>& adapters, GStr& seq) {
1384 > void addAdaptor(GVec<CASeqData>& adaptors, GStr& seq, GAlnTrimType trim_type) {
1385   //TODO: prepare CASeqData here, and collect hexamers as well
1386 +  if (seq.is_empty() || seq=="-" ||
1387 +          seq=="N/A" || seq==".") return;
1388 +
1389   CASeqData adata(revCompl);
1390 < int idx=adapters.Add(adata);
1390 > int idx=adaptors.Add(adata);
1391   if (idx<0) GError("Error: failed to add adaptor!\n");
1392 < adapters[idx].update(seq.chars());
1392 > adaptors[idx].trim_type=trim_type;
1393 > adaptors[idx].update(seq.chars());
1394   }
1395  
1396  
1397 < int loadAdapters(const char* fname) {
1397 > int loadAdaptors(const char* fname) {
1398    GLineReader lr(fname);
1399    char* l;
1400    while ((l=lr.nextLine())!=NULL) {
# Line 1360 | Line 1410
1410        #ifdef GDEBUG
1411            //s.reverse();
1412        #endif
1413 <        addAdapter(adapters3, s);
1413 >        addAdaptor(adaptors3, s, galn_TrimRight);
1414          continue;
1415          }
1416        }
# Line 1369 | Line 1419
1419        s.startTokenize("\t ;,:");
1420        GStr a5,a3;
1421        if (s.nextToken(a5))
1422 <         s.nextToken(a3);
1422 >            s.nextToken(a3);
1423 >        else continue; //no tokens on this line
1424 >      GAlnTrimType ttype5=galn_TrimLeft;
1425        a5.upper();
1426        a3.upper();
1427 +      if (a3.is_empty() || a3==a5 || a3=="=") {
1428 +         a3.clear();
1429 +         ttype5=galn_TrimEither;
1430 +         }
1431       #ifdef GDEBUG
1432       //   a5.reverse();
1433       //   a3.reverse();
1434       #endif
1435 <      addAdapter(adapters5, a5);
1436 <      addAdapter(adapters3, a3);
1435 >      addAdaptor(adaptors5, a5, ttype5);
1436 >      addAdaptor(adaptors3, a3, galn_TrimRight);
1437        }
1438     }
1439 <   return adapters5.Count()+adapters3.Count();
1439 >   return adaptors5.Count()+adaptors3.Count();
1440   }
1441  
1442   void setupFiles(FILE*& f_in, FILE*& f_in2, FILE*& f_out, FILE*& f_out2,

Diff Legend

Removed lines
+ Added lines
< Changed lines
> Changed lines