ViewVC Help
View File | Revision Log | Show Annotations | View Changeset | Root Listing
root/gclib/fqtrim/fqtrim.cpp
(Generate patch)
# Line 48 | Line 48
48   -Q  convert quality values to the other Phred qv type\n\
49   -V  verbose processing\n\
50   "
51 <
51 >
52   //-z  for -o option, the output stream(s) will be first piped into the given\n
53   //   <zcmd> command, which must output to stdout (e.g. -z 'bzip2 -9 -c')\n
54  
55 <
55 >
56   // example 3' adapter for miRNAs: TCGTATGCCGTCTTCTGCTTG
57  
58   //For paired reads sequencing:
# Line 62 | Line 62
62   //FILE* f_out2=NULL; //for paired reads
63   //FILE* f_in=NULL; //input fastq (stdin if not provided)
64   //FILE* f_in2=NULL; //for paired reads
65 +
66   FILE* freport=NULL;
67  
68   bool debug=false;
# Line 77 | Line 78
78   int dust_cutoff=16;
79   bool isfasta=false;
80   bool convert_phred=false;
81 < GStr outsuffix; // -o
81 > GStr outsuffix; // -o
82   GStr prefix;
83   GStr zcmd;
84   int num_trimmed5=0;
# Line 104 | Line 105
105  
106   const char *polyA_seed="AAAA";
107   const char *polyT_seed="TTTT";
108 < GVec<GStr> adapters5;
109 < GVec<GStr> adapters3;
108 >
109 > struct CASeqData {
110 >   //positional data for every possible hexamer in an adapter
111 >   GVec<uint16>* pz[64]; //0-based coordinates of all possible hexamers in the adapter sequence
112 >   GVec<uint16>* pzr[64]; //0-based coordinates of all possible hexamers for the reverse complement of the adapter sequence
113 >   GStr seq; //actual adapter sequence data
114 >   GStr seqr; //reverse complement sequence
115 >   CASeqData(bool rev=false):seq(),seqr() {
116 >     for (int i=0;i<63;i++) {
117 >       pz[i]=new GVec<uint16>(1);
118 >       if (rev) pzr[i]=new GVec<uint16>(1);
119 >       }
120 >   }
121 >
122 >   ~CSeqData() {
123 >     for (int i=0;i<63;i++) {
124 >     delete pz[i];
125 >     if (rev) { delete pzr[i]; }
126 >     }
127 >   }
128 > };
129 >
130 > GVec<CASeqData> adapters5;
131 > GVec<CASeqData> adapters3;
132  
133   CGreedyAlignData* gxmem_l=NULL;
134   CGreedyAlignData* gxmem_r=NULL;
# Line 160 | Line 183
183  
184   int loadAdapters(const char* fname);
185  
186 < void setupFiles(FILE*& f_in, FILE*& f_in2, FILE*& f_out, FILE*& f_out2,
187 <                       GStr& s, GStr& infname, GStr& infname2);
186 > void setupFiles(FILE*& f_in, FILE*& f_in2, FILE*& f_out, FILE*& f_out2,
187 >                       GStr& s, GStr& infname, GStr& infname2);
188   // uses outsuffix to generate output file names and open file handles as needed
189 <
189 >
190   void writeRead(FILE* f_out, GStr& rname, GStr& rinfo, GStr& rseq, GStr& rqv, int& outcounter);
191   void trash_report(char trashcode, GStr& rname, FILE* freport);
192  
193 < bool getFastxRec(GLineReader& fq, GStr& rseq, GStr& rqv,
193 > bool getFastxRec(GLineReader& fq, GStr& rseq, GStr& rqv,
194            GStr& rname, GStr& rinfo, GStr& infname);
195  
196 < char process_read(GStr& rname, GStr& rseq, GStr& rqv, int &l5, int &l3);
196 > char process_read(GStr& rname, GStr& rseq, GStr& rqv, int &l5, int &l3);
197   //returns 0 if the read was untouched, 1 if it was trimmed and a trash code if it was trashed
198  
199   bool ntrim(GStr& rseq, int &l5, int &l3); //returns true if any trimming occured
# Line 205 | Line 228
228    */
229    prefix=args.getOpt('n');
230    GStr s=args.getOpt('l');
231 <  if (!s.is_empty())
231 >  if (!s.is_empty())
232       min_read_len=s.asInt();
233    s=args.getOpt('m');
234 <  if (!s.is_empty())
234 >  if (!s.is_empty())
235       max_perc_N=s.asDouble();
236    s=args.getOpt('d');
237    if (!s.is_empty()) {
# Line 234 | Line 257
257          qv_phredtype=64;
258          qv_cvtadd=-31;
259          }
260 <       else
260 >       else
261           GMessage("%s\nInvalid value for -p option (can only be 64 or 33)!\n",USAGE);
262       }
263    s=args.getOpt('f');
# Line 261 | Line 284
284       int minmatch=s.asInt();
285       poly_minScore=minmatch*poly_m_score;
286       }
287 <  
287 >
288    if (args.getOpt('o')!=NULL) outsuffix=args.getOpt('o');
289                           else outsuffix="-";
290    trashReport=  (args.getOpt('r')!=NULL);
# Line 399 | Line 422
422                 }
423              }
424           outcounter++;
425 <         if (qd->count>maxdup_count) {
425 >         if (qd->count>maxdup_count) {
426              maxdup_count=qd->count;
427              maxdup_seq=seq;
428              }
429           if (isfasta) {
430             if (prefix.is_empty()) {
431 <             fprintf(f_out, ">%s_x%d\n%s\n", qd->firstname, qd->count,
431 >             fprintf(f_out, ">%s_x%d\n%s\n", qd->firstname, qd->count,
432                             rseq.chars());
433               }
434             else { //use custom read name
# Line 416 | Line 439
439           else { //fastq format
440            if (convert_phred) convertPhred(qd->qv, qd->len);
441            if (prefix.is_empty()) {
442 <            fprintf(f_out, "@%s_x%d\n%s\n+\n%s\n", qd->firstname, qd->count,
442 >            fprintf(f_out, "@%s_x%d\n%s\n+\n%s\n", qd->firstname, qd->count,
443                             rseq.chars(), qd->qv);
444              }
445            else { //use custom read name
# Line 481 | Line 504
504     const char* seq;
505     bool valid;
506     NData() {
507 +    seqlen=0;
508      NCount=0;
509      end5=0;
510      end3=0;
# Line 511 | Line 535
535       perc_N=(n*100.0)/(end5-end3+1);
536       }
537   };
538 <
538 >
539   static NData feat;
540   int perc_lenN=12; // incremental distance from ends, in percentage of
541            // sequence length, where N-trimming is done (default:12 %) (autolimited to 20)
542 <          
542 >
543   void N_analyze(int l5, int l3, int p5, int p3) {
544   /* assumes feat was filled properly */
545   int old_dif, t5,t3,v;
546   if (l3<l5+2 || p5>p3 ) {
547     feat.end5=l5+1;
548     feat.end3=l3+1;
549 <   return;
549 >   return;
550     }
551  
552   t5=feat.NPos[p5]-l5;
553   t3=l3-feat.NPos[p3];
554   old_dif=p3-p5;
555   v=(int)((((double)(l3-l5))*perc_lenN)/100);
556 < if (v>20) v=20; /* enforce N-search limit for very long reads */
556 > if (v>20) v=20; /* enforce N-search limit for very long reads */
557   if (t5 < v ) {
558     l5=feat.NPos[p5]+1;
559     p5++;
# Line 546 | Line 570
570             feat.end3=l3+1;
571             return;
572             }
573 <    else
573 >    else
574        N_analyze(l5,l3, p5,p3);
575   }
576  
# Line 587 | Line 611
611   feat.init(rseq);
612   l5=feat.end5-1;
613   l3=feat.end3-1;
614 < N_analyze(feat.end5-1, feat.end3-1, 0, feat.NCount-1);
614 > N_analyze(feat.end5-1, feat.end3-1, 0, feat.NCount-1);
615   if (l5==feat.end5-1 && l3==feat.end3-1) {
616      if (feat.perc_N>max_perc_N) {
617             feat.valid=false;
# Line 605 | Line 629
629     return true;
630     }
631   feat.N_calc();
632 <
632 >
633   if (feat.perc_N>max_perc_N) {
634        feat.valid=false;
635        l3=l5+1;
# Line 617 | Line 641
641   //--------------- dust functions ----------------
642   class DNADuster {
643   public:
644 <  int dustword;
645 <  int dustwindow;
646 <  int dustwindow2;
644 >  int dustword;
645 >  int dustwindow;
646 >  int dustwindow2;
647    int dustcutoff;
648    int mv, iv, jv;
649    int counts[32*32*32];
# Line 714 | Line 738
738                      }
739             }
740           }
741 < //return first;
741 > //return first;
742   }
743   };
744  
# Line 732 | Line 756
756   return ncount;
757   }
758  
735 int get3mer_value(const char* s) {
736 return (s[0]<<16)+(s[1]<<8)+s[2];
737 }
738
739 int w3_match(int qv, const char* str, int slen, int start_index=0) {
740 if (start_index>=slen || start_index<0) return -1;
741 for (int i=start_index;i<slen-3;i++) {
742   int rv=get3mer_value(str+i);
743   if (rv==qv) return i;
744   }
745 return -1;
746 }
747
748 int w3_rmatch(int qv, const char* str, int slen, int end_index=-1) {
749 if (end_index>=slen) return -1;
750 if (end_index<0) end_index=slen-1;
751 for (int i=end_index-2;i>=0;i--) {
752   int rv=get3mer_value(str+i);
753   if (rv==qv) return i;
754   }
755 return -1;
756 }
757
759   struct SLocScore {
760    int pos;
761    int score;
# Line 796 | Line 797
797   SLocScore loc(ri, poly_m_score<<2);
798   SLocScore maxloc(loc);
799   //extend right
800 < while (ri<rlen-2) {
800 > while (ri<rlen-1) {
801     ri++;
802     if (seq[ri]==polyChar) {
803                  loc.add(ri,poly_m_score);
# Line 892 | Line 893
893   //extend right
894   loc.set(ri, maxloc.score);
895   maxloc.pos=ri;
896 < while (ri<rlen-2) {
896 > while (ri<rlen-1) {
897     ri++;
898     if (seq[ri]==polyChar) {
899                  loc.add(ri,poly_m_score);
# Line 959 | Line 960
960    if (adapters5[ai].is_empty()) continue;
961    int alen=adapters5[ai].length();
962    GStr& aseq=adapters5[ai];
963 <  GXAlnInfo* l_bestaln=match_LeftEnd(aseq.chars(), alen, wseq.chars(), wlen, gxmem_l, 84);
963 >  GXAlnInfo* l_bestaln=match_LeftEnd(aseq.chars(), alen, adapters5[ai].pz,
964 >                         wseq.chars(), wlen, gxmem_l, 84);
965    if (l_bestaln) {
966       trimmed=true;
967       l5+=l_bestaln->sr;
# Line 973 | Line 975
975    return trimmed;
976   }
977  
978 < //convert qvs to/from phred64 from/to phread33
978 > //convert qvs to/from phred64 from/to phread33
979   void convertPhred(GStr& q) {
980   for (int i=0;i<q.length();i++) q[i]+=qv_cvtadd;
981   }
# Line 982 | Line 984
984   for (int i=0;i<len;i++) q[i]+=qv_cvtadd;
985   }
986  
987 < bool getFastxRec(GLineReader& fq, GStr& rseq, GStr& rqv,
987 > bool getFastxRec(GLineReader& fq, GStr& rseq, GStr& rqv,
988            GStr& rname, GStr& rinfo, GStr& infname) {
989   rseq="";
990   rqv="";
# Line 998 | Line 1000
1000        } //raw qseq format
1001   else { // FASTQ or FASTA */
1002   isfasta=(l[0]=='>');
1003 < if (!isfasta && l[0]!='@') GError("Error: fasta/fastq record marker not found(%s)\n%s\n",
1003 > if (!isfasta && l[0]!='@') GError("Error: fasta/fastq record marker not found(%s)\n%s\n",
1004        infname.chars(), l);
1005   GStr s(l);
1006   rname=&(l[1]);
1007   for (int i=0;i<rname.length();i++)
1008 <    if (rname[i]<=' ') {
1009 <       if (i<rname.length()-2) rinfo=rname.substr(i+1);
1010 <       rname.cut(i);
1011 <       break;
1008 >    if (rname[i]<=' ') {
1009 >       if (i<rname.length()-2) rinfo=rname.substr(i+1);
1010 >       rname.cut(i);
1011 >       break;
1012         }
1013    //now get the sequence
1014 < if ((l=fq.getLine())==NULL)
1014 > if ((l=fq.getLine())==NULL)
1015        GError("Error: unexpected EOF after header for read %s (%s)\n",
1016                     rname.chars(), infname.chars());
1017   rseq=l; //this must be the DNA line
1018   while ((l=fq.getLine())!=NULL) {
1019        //seq can span multiple lines
1020        if (l[0]=='>' || l[0]=='+') {
1021 <           fq.pushBack();
1021 >           fq.pushBack();
1022             break; //
1023             }
1024        rseq+=l;
1025 <      } //check for multi-line seq
1025 >      } //check for multi-line seq
1026   if (!isfasta) { //reading fastq quality values, which can also be multi-line
1027      if ((l=fq.getLine())==NULL)
1028          GError("Error: unexpected EOF after sequence for %s\n", rname.chars());
1029      if (l[0]!='+') GError("Error: fastq qv header marker not detected!\n");
1030 <    if ((l=fq.getLine())==NULL)
1030 >    if ((l=fq.getLine())==NULL)
1031          GError("Error: unexpected EOF after qv header for %s\n", rname.chars());
1032      rqv=l;
1033 <    //if (rqv.length()!=rseq.length())
1033 >    //if (rqv.length()!=rseq.length())
1034      //  GError("Error: qv len != seq len for %s\n", rname.chars());
1035      while (rqv.length()<rseq.length() && ((l=fq.getLine())!=NULL)) {
1036        rqv+=l; //append to qv string
# Line 1056 | Line 1058
1058   #endif
1059  
1060   char process_read(GStr& rname, GStr& rseq, GStr& rqv, int &l5, int &l3) {
1061 < //returns 0 if the read was untouched, 1 if it was just trimmed
1061 > //returns 0 if the read was untouched, 1 if it was just trimmed
1062   // and a trash code if it was trashed
1063   l5=0;
1064   l3=rseq.length()-1;
# Line 1151 | Line 1153
1153       #endif
1154       int trimlen=wseq.length()-(w3-w5+1);
1155       num_trimmed3++;
1156 <     if (trimlen<min_trimmed3)
1156 >     if (trimlen<min_trimmed3)
1157           min_trimmed3=trimlen;
1158       l5+=w5;
1159       l3-=(wseq.length()-1-w3);
# Line 1170 | Line 1172
1172     //keep read for later
1173     FqDupRec* dr=dhash.Find(wseq.chars());
1174     if (dr==NULL) { //new entry
1175 <          //if (prefix.is_empty())
1176 <             dhash.Add(wseq.chars(),
1175 >          //if (prefix.is_empty())
1176 >             dhash.Add(wseq.chars(),
1177                    new FqDupRec(&wqv, rname.chars()));
1178            //else dhash.Add(wseq.chars(), new FqDupRec(wqv.chars(),wqv.length()));
1179           }
# Line 1205 | Line 1207
1207         fprintf(f_out, "%s\n", rseq.chars()); //plain one-line fasta for now
1208         }
1209        else {
1210 <       fprintf(f_out, ">%s%08d\n%s\n", prefix.chars(), outcounter,
1210 >       fprintf(f_out, ">%s%08d\n%s\n", prefix.chars(), outcounter,
1211                            rseq.chars());
1212         }
1213       }
# Line 1216 | Line 1218
1218         fprintf(f_out, "%s\n+\n%s\n", rseq.chars(), rqv.chars());
1219         }
1220        else
1221 <       fprintf(f_out, "@%s_%08d\n%s\n+\n%s\n", prefix.chars(), outcounter,
1221 >       fprintf(f_out, "@%s_%08d\n%s\n+\n%s\n", prefix.chars(), outcounter,
1222                            rseq.chars(),rqv.chars() );
1223       }
1224   }
# Line 1316 | Line 1318
1318      }
1319   }
1320  
1321 + void addAdapter(GVec<CASeqData>& adapters, GStr& seq) {
1322 + //TODO: prepare CASeqData here, and collect hexamers as well
1323 +
1324 + }
1325 +
1326  
1327   int loadAdapters(const char* fname) {
1328    GLineReader lr(fname);
# Line 1333 | Line 1340
1340        #ifdef GDEBUG
1341            //s.reverse();
1342        #endif
1343 <        adapters3.Add(s);
1343 >        addAdapter(adapters3, s);
1344          continue;
1345          }
1346        }
# Line 1349 | Line 1356
1356       //   a5.reverse();
1357       //   a3.reverse();
1358       #endif
1359 <      adapters5.Add(a5);
1360 <      adapters3.Add(a3);
1359 >      addAdapter(adapters5, a5);
1360 >      addAdapter(adapters3, a3);
1361        }
1362     }
1363     return adapters5.Count()+adapters3.Count();
1364   }
1365  
1366 < void setupFiles(FILE*& f_in, FILE*& f_in2, FILE*& f_out, FILE*& f_out2,
1367 <                       GStr& s, GStr& infname, GStr& infname2) {
1366 > void setupFiles(FILE*& f_in, FILE*& f_in2, FILE*& f_out, FILE*& f_out2,
1367 >                       GStr& s, GStr& infname, GStr& infname2) {
1368   // uses outsuffix to generate output file names and open file handles as needed
1369   infname="";
1370   infname2="";
# Line 1385 | Line 1392
1392   s.startTokenize(",:");
1393   s.nextToken(infname);
1394   bool paired=s.nextToken(infname2);
1395 < if (fileExists(infname.chars())==0)
1395 > if (fileExists(infname.chars())==0)
1396      GError("Error: cannot find file %s!\n",infname.chars());
1397   GStr fname(getFileName(infname.chars()));
1398   GStr picmd;
# Line 1407 | Line 1414
1414   if (!paired) return;
1415   if (doCollapse) GError("Error: sorry, -C option cannot be used with paired reads!\n");
1416   // ---- paired reads:-------------
1417 < if (fileExists(infname2.chars())==0)
1417 > if (fileExists(infname2.chars())==0)
1418       GError("Error: cannot find file %s!\n",infname2.chars());
1419   picmd="";
1420   GStr fname2(getFileName(infname2.chars()));

Diff Legend

Removed lines
+ Added lines
< Changed lines
> Changed lines