ViewVC Help
View File | Revision Log | Show Annotations | View Changeset | Root Listing
root/gclib/fqtrim/fqtrim.cpp
(Generate patch)
# Line 31 | Line 31
31      (e.g. -5 CGACAGGTTCAGAGTTCTACAGTCCGACGATC)\n\
32   -3  trim the given adapter sequence at the 3' end of each read\n\
33      (e.g. -3 TCGTATGCCGTCTTCTGCTTG)\n\
34 < -A  disable polyA trimming (enabled by default)\n\
35 < -T  enable polyT trimming (disabled by default)\n\
34 > -A  disable polyA/T trimming (enabled by default)\n\
35   -y  minimum length of exact match to adaptor sequence at the proper end (6)\n\
36   -q  trim bases with quality value lower than <minq> (starting at the 3' end)\n\
37   -t  for -q option, maximum trimming at the 3' end is limited to <trim_max_len>\n\
# Line 69 | Line 68
68   bool verbose=false;
69   bool doCollapse=false;
70   bool doDust=false;
71 + bool doPolyTrim=true;
72   bool fastaOutput=false;
73   bool trashReport=false;
74   //bool rawFormat=false;
# Line 93 | Line 93
93   int qv_cvtadd=0; //could be -31 or +31
94  
95   // adaptor matching metrics -- for X-drop ungapped extension
96 + const int match_reward=2;
97 + const int mismatch_penalty=3;
98 + const int Xdrop=8;
99 +
100   const int poly_m_score=2; //match score
101   const int poly_mis_score=-3; //mismatch
102   const int poly_dropoff_score=7;
# Line 100 | Line 104
104  
105   const char *polyA_seed="AAAA";
106   const char *polyT_seed="TTTT";
103 /*
104 struct CAdapters {
105    GStr a5;
106    GStr a3;
107    CAdapters(const char* s5=NULL, const char* s3=NULL):a5(s5),a3(s3) {
108      }
109 };
110 */
107   GVec<GStr> adapters5;
108   GVec<GStr> adapters3;
109  
110 + CGreedyAlignData* gxmem_l=NULL;
111 + CGreedyAlignData* gxmem_r=NULL;
112 +
113   // element in dhash:
114   class FqDupRec {
115   public:
# Line 186 | Line 185
185   void convertPhred(GStr& q);
186  
187   int main(int argc, char * const argv[]) {
188 <  GArgs args(argc, argv, "YQDCVl:d:3:5:m:n:r:p:q:f:t:o:z:a:");
188 >  GArgs args(argc, argv, "YQDCVAl:d:3:5:m:n:r:p:q:f:t:o:z:a:");
189    int e;
190    if ((e=args.isError())>0) {
191        GMessage("%s\nInvalid argument: %s\n", USAGE, argv[e]);
# Line 197 | Line 196
196    convert_phred=(args.getOpt('Q')!=NULL);
197    doCollapse=(args.getOpt('C')!=NULL);
198    doDust=(args.getOpt('D')!=NULL);
199 +  if (args.getOpt('A')) doPolyTrim=false;
200    /*
201    rawFormat=(args.getOpt('R')!=NULL);
202    if (rawFormat) {
# Line 278 | Line 278
278    if (trashReport)
279      openfw(freport, args, 'r');
280    char* infile=NULL;
281 +
282 +  if (adapters5.Count()>0)
283 +    gxmem_l=new CGreedyAlignData(match_reward, mismatch_penalty, Xdrop-2);
284 +  if (adapters3.Count()>0)
285 +    gxmem_r=new CGreedyAlignData(match_reward, mismatch_penalty, Xdrop);
286 +
287    while ((infile=args.nextNonOpt())!=NULL) {
288 +    //for each input file
289      int incounter=0; //counter for input reads
290      int outcounter=0; //counter for output reads
291      int trash_s=0; //too short from the get go
292      int trash_Q=0;
293      int trash_N=0;
294      int trash_D=0;
295 +    int trash_poly=0;
296      int trash_A3=0;
297      int trash_A5=0;
298      s=infile;
# Line 309 | Line 317
317         int a5=0, a3=0, b5=0, b3=0;
318         char tcode=0, tcode2=0;
319         tcode=process_read(seqid, rseq, rqv, a5, a3);
320 <       //if (!doCollapse) {
313 <         if (fq2!=NULL) {
320 >       if (fq2!=NULL) {
321              getFastxRec(*fq2, rseq2, rqv2, seqid2, seqinfo2, infname2);
322              if (seqid.substr(0,seqid.length()-1)!=seqid2.substr(0,seqid2.length()-1)) {
323                 GError("Error: no paired match for read %s vs %s (%s,%s)\n",
# Line 342 | Line 349
349                 int nocounter=0;
350                 writeRead(f_out2, seqid2, seqinfo2, rseq2, rqv2, nocounter);
351                 }
352 <            } //paired read
346 <       // }
352 >            } //pair read
353         if (tcode>1) { //trashed
354 +         #ifdef GDEBUG
355 +         GMessage(" !!!!TRASH => 'N'\n");
356 +         #endif
357            if (tcode=='s') trash_s++;
358 +          else if (tcode=='A' || tcode=='T') trash_poly++;
359              else if (tcode=='Q') trash_Q++;
360                else if (tcode=='N') trash_N++;
361                 else if (tcode=='D') trash_D++;
# Line 358 | Line 368
368              rseq=rseq.substr(a5,a3-a5+1);
369              if (!rqv.is_empty()) rqv=rqv.substr(a5,a3-a5+1);
370              }
371 +         #ifdef GDEBUG
372 +            GMessage("  After trimming:\n");
373 +            GMessage("<==%s\n",rseq.chars());
374 +         #endif
375            writeRead(f_out, seqid, seqinfo, rseq, rqv, outcounter);
376            }
377         } //for each fastq record
# Line 438 | Line 452
452           GMessage("         Trashed by N%%:%9d\n", trash_N);
453         if (trash_Q>0)
454           GMessage("Trashed by low quality:%9d\n", trash_Q);
455 +       if (trash_poly>0)
456 +         GMessage("   Trashed by poly-A/T:%9d\n", trash_poly);
457         if (trash_A5>0)
458           GMessage(" Trashed by 5' adapter:%9d\n", trash_A5);
459         if (trash_A3>0)
# Line 449 | Line 465
465      FWCLOSE(f_out);
466      FWCLOSE(f_out2);
467     } //while each input file
468 <
468 > delete gxmem_l;
469 > delete gxmem_r;
470   //getc(stdin);
471   }
472  
# Line 756 | Line 773
773   };
774  
775   bool trim_poly3(GStr &seq, int &l5, int &l3, const char* poly_seed) {
776 + if (!doPolyTrim) return false;
777   int rlen=seq.length();
778   l5=0;
779   l3=rlen-1;
# Line 795 | Line 813
813        }
814     }
815   ri=maxloc.pos;
816 < if (ri<rlen-3) return false; //no trimming wanted, too far from 3' end
816 > if (ri<rlen-6) return false; //no trimming wanted, too far from 3' end
817   //ri = right boundary for the poly match
818   //extend left
819   loc.set(li, maxloc.score);
# Line 816 | Line 834
834         maxloc=loc;
835         }
836      }
837 < if (maxloc.score>poly_minScore && ri>=rlen-3) {
837 > if ((maxloc.score>poly_minScore && ri>=rlen-3) ||
838 >    (maxloc.score==poly_minScore && ri==rlen-1) ||
839 >    (maxloc.score>(poly_minScore<<1) && ri>=rlen-6)) {
840      l5=li;
841      l3=ri;
842      return true;
# Line 824 | Line 844
844   return false;
845   }
846  
827
847   bool trim_poly5(GStr &seq, int &l5, int &l3, const char* poly_seed) {
848 + if (!doPolyTrim) return false;
849   int rlen=seq.length();
850   l5=0;
851   l3=rlen-1;
# Line 864 | Line 884
884         }
885      }
886   li=maxloc.pos;
887 < if (li>3) return false; //no trimming wanted, too far from 5' end
887 > if (li>5) return false; //no trimming wanted, too far from 5' end
888   //li = right boundary for the poly match
889  
890   //extend right
# Line 887 | Line 907
907        }
908     }
909  
910 < if (maxloc.score>poly_minScore && li<=3) {
910 > if ((maxloc.score==poly_minScore && li==0) ||
911 >     (maxloc.score>poly_minScore && li<2)
912 >     || (maxloc.score>(poly_minScore<<1) && li<6)) {
913      l5=li;
914      l3=ri;
915      return true;
# Line 896 | Line 918
918   }
919  
920   bool trim_adapter3(GStr& seq, int&l5, int &l3) {
921 + if (adapters3.Count()==0) return false;
922   int rlen=seq.length();
923   l5=0;
924   l3=rlen-1;
925 <
926 < return false; //no adapter parts found
925 > bool trimmed=false;
926 > GStr wseq(seq.chars());
927 > int wlen=rlen;
928 > for (int ai=0;ai<adapters3.Count();ai++) {
929 >  if (adapters3[ai].is_empty()) continue;
930 >  int alen=adapters3[ai].length();
931 >  GStr& aseq=adapters3[ai];
932 >  GXAlnInfo* r_bestaln=match_RightEnd(aseq.chars(), alen, wseq.chars(), wlen, gxmem_r, 74);
933 >  if (r_bestaln) {
934 >     trimmed=true;
935 >     //keep unmatched region on the left, if any
936 >     l3-=(wlen-r_bestaln->sl+1);
937 >     delete r_bestaln;
938 >     if (l3<0) l3=0;
939 >     if (l3-l5+1<min_read_len) return true;
940 >     wseq=seq.substr(l5,l3-l5+1);
941 >     wlen=wseq.length();
942 >     }
943 >  }//for each 5' adapter
944 >  return trimmed;
945   }
946  
947   bool trim_adapter5(GStr& seq, int&l5, int &l3) {
948 + if (adapters5.Count()==0) return false;
949   int rlen=seq.length();
950   l5=0;
951   l3=rlen-1;
952   bool trimmed=false;
953 + GStr wseq(seq.chars());
954 + int wlen=rlen;
955   for (int ai=0;ai<adapters5.Count();ai++) {
956    if (adapters5[ai].is_empty()) continue;
957 <  int a5len=adapters5[ai].length();
958 <  GStr& adapter5=adapters5[ai];
959 <
957 >  int alen=adapters5[ai].length();
958 >  GStr& aseq=adapters5[ai];
959 >  GXAlnInfo* l_bestaln=match_LeftEnd(aseq.chars(), alen, wseq.chars(), wlen, gxmem_l, 84);
960 >  if (l_bestaln) {
961 >     trimmed=true;
962 >     l5+=l_bestaln->sr;
963 >     delete l_bestaln;
964 >     if (l5>=rlen) l5=rlen-1;
965 >     if (l3-l5+1<min_read_len) return true;
966 >     wseq=seq.substr(l5,l3-l5+1);
967 >     wlen=wseq.length();
968 >     }
969    }//for each 5' adapter
970    return trimmed;
971   }
# Line 988 | Line 1041
1041   // and a trash code if it was trashed
1042   l5=0;
1043   l3=rseq.length()-1;
1044 + #ifdef GDEBUG
1045 +   GMessage(">%s\n", rname.chars());
1046 +   GMessage("==>%s\n",rseq.chars());
1047 + #endif
1048   if (l3-l5+1<min_read_len) {
1049     return 's';
1050     }
# Line 1023 | Line 1080
1080     w5=0;
1081     w3=wseq.length()-1;
1082     }
1083 < if (adapters3.Count()>0) {
1084 <  if (trim_adapter3(wseq, w5, w3)) {
1083 > char trim_code;
1084 > do {
1085 >  trim_code=0;
1086 >  if (trim_poly5(wseq, w5, w3, polyA_seed)) {
1087 >      trim_code='A';
1088 >      #ifdef GDEBUG
1089 >       GMessage("\t-trimmed poly-A at 5' end\n");
1090 >      #endif
1091 >      }
1092 >  else if (trim_poly5(wseq, w5, w3, polyT_seed)) {
1093 >      trim_code='T';
1094 >      #ifdef GDEBUG
1095 >       GMessage("\t-trimmed poly-T at 5' end\n");
1096 >      #endif
1097 >      }
1098 >  else if (trim_adapter5(wseq, w5, w3)) {
1099 >      trim_code='5';
1100 >      #ifdef GDEBUG
1101 >       GMessage("\t-trimmed adapter at 5' end\n");
1102 >      #endif
1103 >      }
1104 >  if (trim_code) {
1105       int trimlen=wseq.length()-(w3-w5+1);
1106 <     num_trimmed3++;
1107 <     if (trimlen<min_trimmed3)
1108 <         min_trimmed3=trimlen;
1106 >     num_trimmed5++;
1107 >     if (trimlen<min_trimmed5)
1108 >         min_trimmed5=trimlen;
1109       l5+=w5;
1110       l3-=(wseq.length()-1-w3);
1111       if (w3-w5+1<min_read_len) {
1112 <         return '3';
1112 >         return trim_code;
1113           }
1114        //-- keep only the w5..w3 range
1115        wseq=wseq.substr(w5, w3-w5+1);
1116        if (!wqv.is_empty())
1117           wqv=wqv.substr(w5, w3-w5+1);
1118 <      }//some adapter was trimmed
1119 <   } //adapter trimming
1120 < if (adapters5.Count()>0) {
1121 <  if (trim_adapter5(wseq, w5, w3)) {
1118 >      }// trimmed at 5' end
1119 > } while (trim_code);
1120 >
1121 > do {
1122 >  trim_code=0;
1123 >  if (trim_poly3(wseq, w5, w3, polyA_seed)) {
1124 >      trim_code='A';
1125 >      }
1126 >  else if (trim_poly3(wseq, w5, w3, polyT_seed)) {
1127 >      trim_code='T';
1128 >      }
1129 >  else if (trim_adapter3(wseq, w5, w3)) {
1130 >      trim_code='3';
1131 >      }
1132 >  if (trim_code) {
1133       int trimlen=wseq.length()-(w3-w5+1);
1134 <     num_trimmed5++;
1135 <     if (trimlen<min_trimmed5)
1136 <         min_trimmed5=trimlen;
1134 >     num_trimmed3++;
1135 >     if (trimlen<min_trimmed3)
1136 >         min_trimmed3=trimlen;
1137       l5+=w5;
1138       l3-=(wseq.length()-1-w3);
1139       if (w3-w5+1<min_read_len) {
1140 <         return '5';
1140 >         return trim_code;
1141           }
1142        //-- keep only the w5..w3 range
1143        wseq=wseq.substr(w5, w3-w5+1);
1144        if (!wqv.is_empty())
1145           wqv=wqv.substr(w5, w3-w5+1);
1146 <      }//some adapter was trimmed
1147 <   } //adapter trimming
1146 >      }//trimming at 3' end
1147 > } while (trim_code);
1148 >
1149 >
1150   if (doCollapse) {
1151     //keep read for later
1152     FqDupRec* dr=dhash.Find(wseq.chars());
# Line 1115 | Line 1205
1205   void trash_report(char trashcode, GStr& rname, FILE* freport) {
1206   if (freport==NULL || trashcode<=' ') return;
1207   if (trashcode=='3' || trashcode=='5') {
1208 <   fprintf(freport, "%s\tA%c\n",rname.chars(),trashcode);
1208 >   fprintf(freport, "%s\ta%c\n",rname.chars(),trashcode);
1209     }
1210   else {
1211     fprintf(freport, "%s\t%c\n",rname.chars(),trashcode);

Diff Legend

Removed lines
+ Added lines
< Changed lines
> Changed lines