ViewVC Help
View File | Revision Log | Show Annotations | View Changeset | Root Listing
root/gclib/fqtrim/fqtrim.cpp
(Generate patch)
# Line 4 | Line 4
4   #include "GList.hh"
5   #include <ctype.h>
6   #include "GAlnExtend.h"
7 + #include <inttypes.h>
8  
9   #define USAGE "Usage:\n\
10   fqtrim [{-5 <5adaptor> -3 <3adaptor>|-f <adaptors_file>}] [-a <min_matchlen>]\\\n\
# Line 41 | Line 42
42   -r  write a simple \"trash report\" file listing the discarded reads with a\n\
43      one letter code for the reason why they were trashed\n\
44   -D  apply a low-complexity (dust) filter and discard any read that has over\n\
45 <    50% of its length detected as low complexity\n\
45 >    50%% of its length detected as low complexity\n\
46   -C  collapse duplicate reads and add a prefix with their count to the read\n\
47      name (e.g. for microRNAs)\n\
48   -p  input is phred64/phred33 (use -p64 or -p33)\n\
49   -Q  convert quality values to the other Phred qv type\n\
50 < -V  verbose processing\n\
50 > -V  be verbose when done (print summary counts)\n\
51   "
52  
53   //-z  for -o option, the output stream(s) will be first piped into the given\n
# Line 81 | Line 82
82   GStr outsuffix; // -o
83   GStr prefix;
84   GStr zcmd;
85 < int num_trimmed5=0;
86 < int num_trimmed3=0;
87 < int num_trimmedN=0;
88 < int num_trimmedQ=0;
89 < int min_trimmed5=INT_MAX;
90 < int min_trimmed3=INT_MAX;
85 > char isACGT[256];
86 >
87 > uint num_trimN=0; //reads trimmed by N%
88 > uint num_trimQ=0; //reads trimmed by qv threshold
89 > uint num_trimV=0; //reads trimmed by adapter match
90 > uint num_trimA=0; //reads trimmed by polyA
91 > uint num_trimT=0; //reads trimmed by polyT
92 > uint num_trim5=0; //number of reads trimmed at 5' end
93 > uint num_trim3=0; //number of reads trimmed at 3' end
94 >
95 > uint64 b_totalIn=0; //total number of input bases
96 > uint64 b_totalN=0;  //total number of undetermined bases found in input bases
97 >
98 > uint64 b_trimN=0; //total number of bases trimmed due to N-trim
99 > uint64 b_trimQ=0; //number of bases trimmed due to qv threshold
100 > uint64 b_trimV=0; //total number of bases trimmed due to adaptor matches
101 > uint64 b_trimA=0; //number of bases trimmed due to poly-A tails
102 > uint64 b_trimT=0; //number of bases trimmed due to poly-T tails
103 > uint64 b_trim5=0; //total bases trimmed on the 5' side
104 > uint64 b_trim3=0; //total bases trimmed on the 3' side
105 > //int min_trimmed5=INT_MAX;
106 > //int min_trimmed3=INT_MAX;
107  
108   int qvtrim_qmin=0;
109   int qvtrim_max=0;  //for -q, do not trim at 3'-end more than this number of bases
# Line 285 | Line 302
302         else
303           GMessage("%s\nInvalid value for -p option (can only be 64 or 33)!\n",USAGE);
304       }
305 +  memset((void*)isACGT, 0, 256);
306 +  isACGT['A']=isACGT['a']=isACGT['C']=isACGT['c']=1;
307 +  isACGT['G']=isACGT['g']=isACGT['T']=isACGT['t']=1;
308    s=args.getOpt('f');
309    if (!s.is_empty()) {
310     loadAdaptors(s.chars());
# Line 342 | Line 362
362      int trash_N=0;
363      int trash_D=0;
364      int trash_poly=0;
365 <    int trash_A3=0;
366 <    int trash_A5=0;
365 >    int trash_V=0;
366 >    num_trimN=0;
367 >    num_trimQ=0;
368 >    num_trimV=0;
369 >    num_trimA=0;
370 >    num_trimT=0;
371 >    num_trim5=0;
372 >    num_trim3=0;
373 >
374 >    b_totalIn=0;
375 >    b_totalN=0;
376 >    b_trimN=0;
377 >    b_trimQ=0;
378 >    b_trimV=0;
379 >    b_trimA=0;
380 >    b_trimT=0;
381 >    b_trim5=0;
382 >    b_trim3=0;
383 >
384      s=infile;
385      GStr infname;
386      GStr infname2;
# Line 366 | Line 403
403         int a5=0, a3=0, b5=0, b3=0;
404         char tcode=0, tcode2=0;
405         tcode=process_read(seqid, rseq, rqv, a5, a3);
406 +       if (a5>0) {
407 +          b_trim5+=a5;
408 +          num_trim5++;
409 +          }
410 +       if (a3<rseq.length()-1) {
411 +          b_trim3+=rseq.length()-a3-1;
412 +          num_trim3++;
413 +          }
414         if (fq2!=NULL) {
415              getFastxRec(*fq2, rseq2, rqv2, seqid2, seqinfo2, infname2);
416              if (seqid.substr(0,seqid.length()-1)!=seqid2.substr(0,seqid2.length()-1)) {
# Line 373 | Line 418
418                    seqid.chars(), seqid2.chars(), infname.chars(), infname2.chars());
419                 }
420              tcode2=process_read(seqid2, rseq2, rqv2, b5, b3);
421 +            if (b5>0) {
422 +               b_trim5+=b5;
423 +               num_trim5++;
424 +               }
425 +            if (b3<rseq2.length()-1) {
426 +               b_trim3+=rseq2.length()-b3-1;
427 +               num_trim3++;
428 +               }
429              //decide what to do with this pair and print rseq2 if the pair makes it
430              if (tcode>1 && tcode2<=1) {
431                 //"untrash" rseq
# Line 408 | Line 461
461              else if (tcode=='Q') trash_Q++;
462                else if (tcode=='N') trash_N++;
463                 else if (tcode=='D') trash_D++;
464 <                else if (tcode=='3') trash_A3++;
465 <                 else if (tcode=='5') trash_A5++;
464 >                else if (tcode=='V') trash_V++;
465 >                 //else if (tcode=='5') trash_A5++;
466            if (trashReport) trash_report(tcode, seqid, freport);
467            }
468           else if (!doCollapse) { //write it
# Line 481 | Line 534
534      if (verbose) {
535         if (paired_reads) {
536             GMessage(">Input files : %s , %s\n", infname.chars(), infname2.chars());
537 <           GMessage("Number of input pairs :%9d\n", incounter);
538 <           GMessage("         Output pairs :%9d\n", outcounter);
537 >           GMessage("Number of input pairs :%9u\n", incounter);
538 >           GMessage("         Output pairs :%9u\t(%u trashed)\n", outcounter, incounter-outcounter);
539             }
540           else {
541             GMessage(">Input file : %s\n", infname.chars());
542             GMessage("Number of input reads :%9d\n", incounter);
543 <           GMessage("         Output reads :%9d\n", outcounter);
543 >           GMessage("         Output reads :%9d  (%u trashed)\n", outcounter, incounter-outcounter);
544             }
545 <       GMessage("------------------------------------\n");
546 <       if (num_trimmed5)
547 <          GMessage("           5' trimmed :%9d  (min. trim: %d)\n", num_trimmed5, min_trimmed5);
548 <       if (num_trimmed3)
549 <          GMessage("           3' trimmed :%9d  (min. trim: %d)\n", num_trimmed3, min_trimmed3);
545 >       GMessage("--------------- Read trimming: ------------------\n");
546 >       if (num_trim5)
547 >          GMessage("           5' trimmed :%9u\n", num_trim5);
548 >       if (num_trim3)
549 >          GMessage("           3' trimmed :%9u\n", num_trim3);
550 >       if (num_trimQ)
551 >          GMessage("         q.v. trimmed :%9u\n", num_trimQ);
552 >       if (num_trimN)
553 >          GMessage("            N trimmed :%9u\n", num_trimN);
554 >       if (num_trimT)
555 >          GMessage("       poly-T trimmed :%9u\n", num_trimT);
556 >       if (num_trimA)
557 >          GMessage("       poly-A trimmed :%9u\n", num_trimA);
558 >       if (num_trimV)
559 >          GMessage("      Adapter trimmed :%9u\n", num_trimV);
560         GMessage("------------------------------------\n");
561         if (trash_s>0)
562 <         GMessage("     Trashed by length:%9d\n", trash_s);
562 >         GMessage("Trashed by initial len:%9d\n", trash_s);
563         if (trash_N>0)
564           GMessage("         Trashed by N%%:%9d\n", trash_N);
565         if (trash_Q>0)
566           GMessage("Trashed by low quality:%9d\n", trash_Q);
567         if (trash_poly>0)
568           GMessage("   Trashed by poly-A/T:%9d\n", trash_poly);
569 <       if (trash_A5>0)
570 <         GMessage(" Trashed by 5' adaptor:%9d\n", trash_A5);
571 <       if (trash_A3>0)
572 <         GMessage(" Trashed by 3' adaptor:%9d\n", trash_A3);
569 >       if (trash_V>0)
570 >         GMessage("    Trashed by adaptor:%9d\n", trash_V);
571 >       GMessage("\n--------------- Base counts  --------------------\n");
572 >       GMessage("    Input bases   :%12llu\n", b_totalIn);
573 >       double percN=100.0* ((double)b_totalN/(double)b_totalIn);
574 >       GMessage("          N bases :%12llu (%4.2f%%)\n", b_totalN, percN);
575 >       GMessage("   trimmed from 5':%12llu\n", b_trim5);
576 >       GMessage("   trimmed from 3':%12llu\n", b_trim3);
577 >       GMessage("\n");
578 >       if (b_trimQ)
579 >       GMessage("     q.v. trimmed :%12llu\n", b_trimQ);
580 >       if (b_trimN)
581 >       GMessage("        N trimmed :%12llu\n", b_trimN);
582 >       if (b_trimT)
583 >       GMessage("   poly-T trimmed :%12llu\n", b_trimT);
584 >       if (b_trimA)
585 >       GMessage("   poly-A trimmed :%12llu\n", b_trimA);
586 >       if (b_trimV)
587 >       GMessage("  Adapter trimmed :%12llu\n", b_trimV);
588 >
589         }
590      if (trashReport) {
591            FWCLOSE(freport);
# Line 957 | Line 1036
1036   int wlen=rlen;
1037   GXSeqData seqdata;
1038   int numruns=revCompl ? 2 : 1;
1039 <
1039 > GList<GXAlnInfo> bestalns(true, true, false);
1040   for (int ai=0;ai<adaptors3.Count();ai++) {
1041     for (int r=0;r<numruns;r++) {
1042       if (r) {
# Line 968 | Line 1047
1047              seqdata.update(adaptors3[ai].seq.chars(), adaptors3[ai].seq.length(),
1048                   adaptors3[ai].pz, wseq.chars(), wlen, adaptors3[ai].amlen);
1049          }
1050 <
1051 <  GXAlnInfo* aln=match_adaptor(seqdata, adaptors3[ai].trim_type, gxmem_r, 86);
1052 <  if (aln) {
1053 <     trimmed=true;
1054 <     //keep unmatched region on the left OR right (the longer one)
1055 <     if (aln->sl > wlen-aln->sr) {
1056 <         //keep left side
1057 <         l3-=(wlen-aln->sl+1);
1058 <         if (l3<0) l3=0;
980 <         }
981 <     else { //keep right side
982 <         l5+=aln->sr;
983 <         if (l5>=rlen) l5=rlen-1;
984 <         }
985 <     delete aln;
986 <     if (l3-l5+1<min_read_len) return true;
987 <     wseq=seq.substr(l5,l3-l5+1);
988 <     wlen=wseq.length();
989 <     }
1050 >     GXAlnInfo* aln=match_adaptor(seqdata, adaptors3[ai].trim_type, gxmem_r, 86);
1051 >         if (aln) {
1052 >           if (aln->strong) {
1053 >                   trimmed=true;
1054 >                   bestalns.Add(aln);
1055 >                   break; //will check the rest next time
1056 >                   }
1057 >            else bestalns.Add(aln);
1058 >           }
1059     }//forward and reverse adaptors
1060 +   if (trimmed) break; //will check the rest in the next cycle
1061    }//for each 3' adaptor
1062 <  return trimmed;
1062 > if (bestalns.Count()>0) {
1063 >           GXAlnInfo* aln=bestalns[0];
1064 >           if (aln->sl-1 > wlen-aln->sr) {
1065 >                   //keep left side
1066 >                   l3-=(wlen-aln->sl+1);
1067 >                   if (l3<0) l3=0;
1068 >                   }
1069 >           else { //keep right side
1070 >                   l5+=aln->sr;
1071 >                   if (l5>=rlen) l5=rlen-1;
1072 >                   }
1073 >           //delete aln;
1074 >           //if (l3-l5+1<min_read_len) return true;
1075 >           wseq=seq.substr(l5,l3-l5+1);
1076 >           wlen=wseq.length();
1077 >           return true; //break the loops here to report a good find
1078 >     }
1079 >  return false;
1080   }
1081  
1082   bool trim_adaptor5(GStr& seq, int&l5, int &l3) {
# Line 1144 | Line 1231
1231   GStr wqv(rqv.chars());
1232   int w5=l5;
1233   int w3=l3;
1234 + //count Ns
1235 + b_totalIn+=rseq.length();
1236 + for (int i=0;i<rseq.length();i++) {
1237 + if (isACGT[(int)rseq[i]]==0) b_totalN++;
1238 + }
1239 +
1240   //first do the q-based trimming
1241   if (qvtrim_qmin!=0 && !wqv.is_empty() && qtrim(wqv, w5, w3)) { // qv-threshold trimming
1242 +   b_trimQ+=(w5-l5)+(l3-w3);
1243 +   num_trimQ++;
1244     if (w3-w5+1<min_read_len) {
1245       return 'Q'; //invalid read
1246       }
# Line 1158 | Line 1253
1253     } //qv trimming
1254   // N-trimming on the remaining read seq
1255   if (ntrim(wseq, w5, w3)) {
1256 +   //Note: ntrim resets w5 and w3
1257     //GMessage("before: %s\n",wseq.chars());
1258     //GMessage("after : %s (%d)\n",wseq.substr(w5,w3-w5+1).chars(),w3-w5+1);
1259 +   int trim3=(wseq.length()-1-w3);
1260 +   b_trimN+=w5+trim3;
1261 +   num_trimN++;
1262     l5+=w5;
1263 <   l3-=(wseq.length()-1-w3);
1263 >   l3-=trim3;
1264     if (w3-w5+1<min_read_len) {
1265       return 'N'; //to be trashed
1266       }
# Line 1174 | Line 1273
1273     }
1274   char trim_code;
1275   //clean the more dirty end first - 3'
1276 + bool trimmedA=false;
1277 + bool trimmedT=false;
1278 + bool trimmedV=false;
1279 +
1280   int prev_w5=0;
1281   int prev_w3=0;
1282   bool w3upd=true;
# Line 1182 | Line 1285
1285    trim_code=0;
1286    if (w3upd && trim_poly3(wseq, w5, w3, polyA_seed)) {
1287        trim_code='A';
1288 +      b_trimA+=(w5+(wseq.length()-1-w3));
1289 +      if (!trimmedA) { num_trimA++; trimmedA=true; }
1290        }
1291    else if (w3upd && trim_poly3(wseq, w5, w3, polyT_seed)) {
1292        trim_code='T';
1293 +      b_trimT+=(w5+(wseq.length()-1-w3));
1294 +      if (!trimmedT) { num_trimT++; trimmedT=true; }
1295        }
1296    else if (w5upd && trim_poly5(wseq, w5, w3, polyA_seed)) {
1297        trim_code='A';
1298 +      b_trimA+=(w5+(wseq.length()-1-w3));
1299 +      if (!trimmedA) { num_trimA++; trimmedA=true; }
1300        }
1301    else if (w5upd && trim_poly5(wseq, w5, w3, polyT_seed)) {
1302        trim_code='T';
1303 +      b_trimT+=(w5+(wseq.length()-1-w3));
1304 +      if (!trimmedT) { num_trimT++; trimmedT=true; }
1305        }
1306    else if (trim_adaptor5(wseq, w5, w3)) {
1307 <      trim_code='5';
1307 >      trim_code='V';
1308 >      b_trimV+=(w5+(wseq.length()-1-w3));
1309 >      if (!trimmedV) { num_trimV++; trimmedV=true; }
1310        }
1311    else if (trim_adaptor3(wseq, w5, w3)) {
1312 <      trim_code='3';
1312 >      trim_code='V';
1313 >      b_trimV+=(w5+(wseq.length()-1-w3));
1314 >      if (!trimmedV) { num_trimV++; trimmedV=true; }
1315        }
1316    if (trim_code) {
1317       w3upd=(w3!=prev_w3);
# Line 1207 | Line 1322
1322       GMessage("#### TRIM by '%c' code ( w5-w3 = %d-%d ):\n",trim_code, w5,w3);
1323       showTrim(wseq, w5, w3);
1324     #endif
1325 <     int trimlen=wseq.length()-(w3-w5+1);
1211 <     num_trimmed3++;
1212 <     if (trimlen<min_trimmed3)
1213 <         min_trimmed3=trimlen;
1325 >     //int trimlen=wseq.length()-(w3-w5+1);
1326       l5+=w5;
1327       l3-=(wseq.length()-1-w3);
1328       if (w3-w5+1<min_read_len) {
# Line 1222 | Line 1334
1334           wqv=wqv.substr(w5, w3-w5+1);
1335        }//trimming at 3' end
1336   } while (trim_code);
1225
1337   if (doCollapse) {
1338     //keep read for later
1339     FqDupRec* dr=dhash.Find(wseq.chars());

Diff Legend

Removed lines
+ Added lines
< Changed lines
> Changed lines