ViewVC Help
View File | Revision Log | Show Annotations | View Changeset | Root Listing
root/gclib/fqtrim/fqtrim.cpp
(Generate patch)
# Line 4 | Line 4
4   #include "GList.hh"
5   #include <ctype.h>
6   #include "GAlnExtend.h"
7 + #include <inttypes.h>
8  
9   #define USAGE "Usage:\n\
10 < fqtrim [{-5 <5adaptor> -3 <3adaptor>|-f <adaptors_file>}] [-a <min_matchlen>]\\\n\
10 > fqtrim [{-5 <5adaptor> -3 <3adaptor>|-f <adaptors_file>}] [-a <min_match>]\\\n\
11     [-R] [-q <minq> [-t <trim_max_len>]] [-p {64|33}] [-o <outsuffix>]\\\n\
12     [-l <minlen>] [-C] [-D] [-Q] [-n <rename_prefix>] [-r <discarded.lst>]\\\n\
13 <    <input.fq>[,<input_mates.fq>\n\
13 >   [-a <min_poly>] [-A] <input.fq>[,<input_mates.fq>\n\
14   \n\
15   Trim low quality bases at the 3' end and can trim adaptor sequence(s), filter\n\
16   for low complexity and collapse duplicate reads.\n\
# Line 31 | Line 32
32      (e.g. -5 CGACAGGTTCAGAGTTCTACAGTCCGACGATC)\n\
33   -3  trim the given adaptor sequence at the 3' end of each read\n\
34      (e.g. -3 TCGTATGCCGTCTTCTGCTTG)\n\
35 + -a  minimum exact match to adaptor sequence to trim at the end (6)\n\
36   -A  disable polyA/T trimming (enabled by default)\n\
37 < -y  minimum length of exact match to adaptor sequence at the proper end (6)\n\
37 > -y  minimum length of poly-A/T run to remove (6)\n\
38   -q  trim bases with quality value lower than <minq> (starting at the 3' end)\n\
39   -t  for -q option, maximum trimming at the 3' end is limited to <trim_max_len>\n\
40   -m  maximum percentage of Ns allowed in a read after trimming (default 7)\n\
# Line 41 | Line 43
43   -r  write a simple \"trash report\" file listing the discarded reads with a\n\
44      one letter code for the reason why they were trashed\n\
45   -D  apply a low-complexity (dust) filter and discard any read that has over\n\
46 <    50% of its length detected as low complexity\n\
46 >    50%% of its length detected as low complexity\n\
47   -C  collapse duplicate reads and add a prefix with their count to the read\n\
48      name (e.g. for microRNAs)\n\
49   -p  input is phred64/phred33 (use -p64 or -p33)\n\
50   -Q  convert quality values to the other Phred qv type\n\
51 < -V  verbose processing\n\
51 > -V  be verbose when done (print summary counts)\n\
52   "
53  
54   //-z  for -o option, the output stream(s) will be first piped into the given\n
# Line 81 | Line 83
83   GStr outsuffix; // -o
84   GStr prefix;
85   GStr zcmd;
86 < int num_trimmed5=0;
87 < int num_trimmed3=0;
88 < int num_trimmedN=0;
89 < int num_trimmedQ=0;
90 < int min_trimmed5=INT_MAX;
91 < int min_trimmed3=INT_MAX;
86 > char isACGT[256];
87 >
88 > uint num_trimN=0; //reads trimmed by N%
89 > uint num_trimQ=0; //reads trimmed by qv threshold
90 > uint num_trimV=0; //reads trimmed by adapter match
91 > uint num_trimA=0; //reads trimmed by polyA
92 > uint num_trimT=0; //reads trimmed by polyT
93 > uint num_trim5=0; //number of reads trimmed at 5' end
94 > uint num_trim3=0; //number of reads trimmed at 3' end
95 >
96 > uint64 b_totalIn=0; //total number of input bases
97 > uint64 b_totalN=0;  //total number of undetermined bases found in input bases
98 >
99 > uint64 b_trimN=0; //total number of bases trimmed due to N-trim
100 > uint64 b_trimQ=0; //number of bases trimmed due to qv threshold
101 > uint64 b_trimV=0; //total number of bases trimmed due to adaptor matches
102 > uint64 b_trimA=0; //number of bases trimmed due to poly-A tails
103 > uint64 b_trimT=0; //number of bases trimmed due to poly-T tails
104 > uint64 b_trim5=0; //total bases trimmed on the 5' side
105 > uint64 b_trim3=0; //total bases trimmed on the 3' side
106 > //int min_trimmed5=INT_MAX;
107 > //int min_trimmed3=INT_MAX;
108  
109   int qvtrim_qmin=0;
110   int qvtrim_max=0;  //for -q, do not trim at 3'-end more than this number of bases
# Line 96 | Line 114
114   // adaptor matching metrics -- for X-drop ungapped extension
115   //const int match_reward=2;
116   //const int mismatch_penalty=3;
117 < const int match_reward=2;
118 < const int mismatch_penalty=4;
119 < const int Xdrop=10;
117 > int match_reward=1;
118 > int mismatch_penalty=5;
119 > int Xdrop=16;
120 > int minEndAdapter=6;
121 >
122  
123   const int poly_m_score=2; //match score
124   const int poly_mis_score=-3; //mismatch
# Line 130 | Line 150
150     void update(const char* s) {
151           seq=s;
152           table6mers(seq.chars(), seq.length(), pz);
153 <         amlen=iround(double(seq.length())*0.8);
134 <         if (amlen<12)
135 <                amlen=12;
153 >         amlen=calc_safelen(seq.length());
154           if (!use_reverse) return;
155           //reverse complement
156           seqr=s;
# Line 232 | Line 250
250   void convertPhred(GStr& q);
251  
252   int main(int argc, char * const argv[]) {
253 <  GArgs args(argc, argv, "YQDCRVAl:d:3:5:m:n:r:p:q:f:t:o:z:a:");
253 >  GArgs args(argc, argv, "MATCH=MISMATCH=XDROP=YQDCRVAl:d:3:5:m:n:r:p:q:f:t:o:z:a:y:");
254    int e;
255    if ((e=args.isError())>0) {
256        GMessage("%s\nInvalid argument: %s\n", USAGE, argv[e]);
# Line 271 | Line 289
289    if (!s.is_empty()) {
290       qvtrim_max=s.asInt();
291       }
292 +  s=args.getOpt("MATCH");
293 +  if (!s.is_empty())
294 +            match_reward=s.asInt();
295 +  s=args.getOpt("MISMATCH");
296 +  if (!s.is_empty())
297 +            mismatch_penalty=s.asInt();
298 +  s=args.getOpt("XDROP");
299 +  if (!s.is_empty())
300 +            Xdrop=s.asInt();
301 +
302    s=args.getOpt('p');
303    if (!s.is_empty()) {
304       int v=s.asInt();
# Line 285 | Line 313
313         else
314           GMessage("%s\nInvalid value for -p option (can only be 64 or 33)!\n",USAGE);
315       }
316 +  memset((void*)isACGT, 0, 256);
317 +  isACGT['A']=isACGT['a']=isACGT['C']=isACGT['c']=1;
318 +  isACGT['G']=isACGT['g']=isACGT['T']=isACGT['t']=1;
319    s=args.getOpt('f');
320    if (!s.is_empty()) {
321     loadAdaptors(s.chars());
# Line 307 | Line 338
338    s=args.getOpt('y');
339    if (!s.is_empty()) {
340       int minmatch=s.asInt();
341 <     poly_minScore=minmatch*poly_m_score;
341 >     if (minmatch>2)
342 >        poly_minScore=minmatch*poly_m_score;
343 >     else GMessage("Warning: invalid -y option, ignored.\n");
344 >     }
345 >  s=args.getOpt('a');
346 >  if (!s.is_empty()) {
347 >     int minmatch=s.asInt();
348 >     if (minmatch>2)
349 >        minEndAdapter=minmatch;
350 >     else GMessage("Warning: invalid -a option, ignored.\n");
351       }
352  
353 +
354    if (args.getOpt('o')!=NULL) outsuffix=args.getOpt('o');
355                           else outsuffix="-";
356    trashReport=  (args.getOpt('r')!=NULL);
# Line 342 | Line 383
383      int trash_N=0;
384      int trash_D=0;
385      int trash_poly=0;
386 <    int trash_A3=0;
387 <    int trash_A5=0;
386 >    int trash_V=0;
387 >    num_trimN=0;
388 >    num_trimQ=0;
389 >    num_trimV=0;
390 >    num_trimA=0;
391 >    num_trimT=0;
392 >    num_trim5=0;
393 >    num_trim3=0;
394 >
395 >    b_totalIn=0;
396 >    b_totalN=0;
397 >    b_trimN=0;
398 >    b_trimQ=0;
399 >    b_trimV=0;
400 >    b_trimA=0;
401 >    b_trimT=0;
402 >    b_trim5=0;
403 >    b_trim3=0;
404 >
405      s=infile;
406      GStr infname;
407      GStr infname2;
# Line 366 | Line 424
424         int a5=0, a3=0, b5=0, b3=0;
425         char tcode=0, tcode2=0;
426         tcode=process_read(seqid, rseq, rqv, a5, a3);
427 +       if (a5>0) {
428 +          b_trim5+=a5;
429 +          num_trim5++;
430 +          }
431 +       if (a3<rseq.length()-1) {
432 +          b_trim3+=rseq.length()-a3-1;
433 +          num_trim3++;
434 +          }
435         if (fq2!=NULL) {
436              getFastxRec(*fq2, rseq2, rqv2, seqid2, seqinfo2, infname2);
437              if (seqid.substr(0,seqid.length()-1)!=seqid2.substr(0,seqid2.length()-1)) {
# Line 373 | Line 439
439                    seqid.chars(), seqid2.chars(), infname.chars(), infname2.chars());
440                 }
441              tcode2=process_read(seqid2, rseq2, rqv2, b5, b3);
442 +            if (b5>0) {
443 +               b_trim5+=b5;
444 +               num_trim5++;
445 +               }
446 +            if (b3<rseq2.length()-1) {
447 +               b_trim3+=rseq2.length()-b3-1;
448 +               num_trim3++;
449 +               }
450              //decide what to do with this pair and print rseq2 if the pair makes it
451              if (tcode>1 && tcode2<=1) {
452                 //"untrash" rseq
# Line 408 | Line 482
482              else if (tcode=='Q') trash_Q++;
483                else if (tcode=='N') trash_N++;
484                 else if (tcode=='D') trash_D++;
485 <                else if (tcode=='3') trash_A3++;
486 <                 else if (tcode=='5') trash_A5++;
485 >                else if (tcode=='V') trash_V++;
486 >                 //else if (tcode=='5') trash_A5++;
487            if (trashReport) trash_report(tcode, seqid, freport);
488            }
489           else if (!doCollapse) { //write it
# Line 481 | Line 555
555      if (verbose) {
556         if (paired_reads) {
557             GMessage(">Input files : %s , %s\n", infname.chars(), infname2.chars());
558 <           GMessage("Number of input pairs :%9d\n", incounter);
559 <           GMessage("         Output pairs :%9d\n", outcounter);
558 >           GMessage("Number of input pairs :%9u\n", incounter);
559 >           GMessage("         Output pairs :%9u\t(%u trashed)\n", outcounter, incounter-outcounter);
560             }
561           else {
562             GMessage(">Input file : %s\n", infname.chars());
563             GMessage("Number of input reads :%9d\n", incounter);
564 <           GMessage("         Output reads :%9d\n", outcounter);
564 >           GMessage("         Output reads :%9d  (%u trashed)\n", outcounter, incounter-outcounter);
565             }
566 <       GMessage("------------------------------------\n");
567 <       if (num_trimmed5)
568 <          GMessage("           5' trimmed :%9d  (min. trim: %d)\n", num_trimmed5, min_trimmed5);
569 <       if (num_trimmed3)
570 <          GMessage("           3' trimmed :%9d  (min. trim: %d)\n", num_trimmed3, min_trimmed3);
566 >       GMessage("--------------- Read trimming: ------------------\n");
567 >       if (num_trim5)
568 >          GMessage("           5' trimmed :%9u\n", num_trim5);
569 >       if (num_trim3)
570 >          GMessage("           3' trimmed :%9u\n", num_trim3);
571 >       if (num_trimQ)
572 >          GMessage("         q.v. trimmed :%9u\n", num_trimQ);
573 >       if (num_trimN)
574 >          GMessage("            N trimmed :%9u\n", num_trimN);
575 >       if (num_trimT)
576 >          GMessage("       poly-T trimmed :%9u\n", num_trimT);
577 >       if (num_trimA)
578 >          GMessage("       poly-A trimmed :%9u\n", num_trimA);
579 >       if (num_trimV)
580 >          GMessage("      Adapter trimmed :%9u\n", num_trimV);
581         GMessage("------------------------------------\n");
582         if (trash_s>0)
583 <         GMessage("     Trashed by length:%9d\n", trash_s);
583 >         GMessage("Trashed by initial len:%9d\n", trash_s);
584         if (trash_N>0)
585           GMessage("         Trashed by N%%:%9d\n", trash_N);
586         if (trash_Q>0)
587           GMessage("Trashed by low quality:%9d\n", trash_Q);
588         if (trash_poly>0)
589           GMessage("   Trashed by poly-A/T:%9d\n", trash_poly);
590 <       if (trash_A5>0)
591 <         GMessage(" Trashed by 5' adaptor:%9d\n", trash_A5);
592 <       if (trash_A3>0)
593 <         GMessage(" Trashed by 3' adaptor:%9d\n", trash_A3);
590 >       if (trash_V>0)
591 >         GMessage("    Trashed by adaptor:%9d\n", trash_V);
592 >       GMessage("\n--------------- Base counts  --------------------\n");
593 >       GMessage("    Input bases   :%12llu\n", b_totalIn);
594 >       double percN=100.0* ((double)b_totalN/(double)b_totalIn);
595 >       GMessage("          N bases :%12llu (%4.2f%%)\n", b_totalN, percN);
596 >       GMessage("   trimmed from 5':%12llu\n", b_trim5);
597 >       GMessage("   trimmed from 3':%12llu\n", b_trim3);
598 >       GMessage("\n");
599 >       if (b_trimQ)
600 >       GMessage("     q.v. trimmed :%12llu\n", b_trimQ);
601 >       if (b_trimN)
602 >       GMessage("        N trimmed :%12llu\n", b_trimN);
603 >       if (b_trimT)
604 >       GMessage("   poly-T trimmed :%12llu\n", b_trimT);
605 >       if (b_trimA)
606 >       GMessage("   poly-A trimmed :%12llu\n", b_trimA);
607 >       if (b_trimV)
608 >       GMessage("  Adapter trimmed :%12llu\n", b_trimV);
609 >
610         }
611      if (trashReport) {
612            FWCLOSE(freport);
# Line 968 | Line 1068
1068              seqdata.update(adaptors3[ai].seq.chars(), adaptors3[ai].seq.length(),
1069                   adaptors3[ai].pz, wseq.chars(), wlen, adaptors3[ai].amlen);
1070          }
1071 <     GXAlnInfo* aln=match_adaptor(seqdata, adaptors3[ai].trim_type, gxmem_r, 86);
1071 >     GXAlnInfo* aln=match_adaptor(seqdata, adaptors3[ai].trim_type, minEndAdapter, gxmem_r, 86);
1072           if (aln) {
1073             if (aln->strong) {
1074                     trimmed=true;
# Line 1021 | Line 1121
1121              seqdata.update(adaptors5[ai].seq.chars(), adaptors5[ai].seq.length(),
1122                   adaptors5[ai].pz, wseq.chars(), wlen, adaptors5[ai].amlen);
1123          }
1124 <         GXAlnInfo* aln=match_adaptor(seqdata, adaptors5[ai].trim_type, gxmem_l, 90);
1124 >         GXAlnInfo* aln=match_adaptor(seqdata, adaptors5[ai].trim_type,
1125 >                                                       minEndAdapter, gxmem_l, 90);
1126           if (aln) {
1127             if (aln->strong) {
1128                     trimmed=true;
# Line 1152 | Line 1253
1253   GStr wqv(rqv.chars());
1254   int w5=l5;
1255   int w3=l3;
1256 + //count Ns
1257 + b_totalIn+=rseq.length();
1258 + for (int i=0;i<rseq.length();i++) {
1259 + if (isACGT[(int)rseq[i]]==0) b_totalN++;
1260 + }
1261 +
1262   //first do the q-based trimming
1263   if (qvtrim_qmin!=0 && !wqv.is_empty() && qtrim(wqv, w5, w3)) { // qv-threshold trimming
1264 +   b_trimQ+=(w5-l5)+(l3-w3);
1265 +   num_trimQ++;
1266     if (w3-w5+1<min_read_len) {
1267       return 'Q'; //invalid read
1268       }
# Line 1166 | Line 1275
1275     } //qv trimming
1276   // N-trimming on the remaining read seq
1277   if (ntrim(wseq, w5, w3)) {
1278 +   //Note: ntrim resets w5 and w3
1279     //GMessage("before: %s\n",wseq.chars());
1280     //GMessage("after : %s (%d)\n",wseq.substr(w5,w3-w5+1).chars(),w3-w5+1);
1281 +   int trim3=(wseq.length()-1-w3);
1282 +   b_trimN+=w5+trim3;
1283 +   num_trimN++;
1284     l5+=w5;
1285 <   l3-=(wseq.length()-1-w3);
1285 >   l3-=trim3;
1286     if (w3-w5+1<min_read_len) {
1287       return 'N'; //to be trashed
1288       }
# Line 1182 | Line 1295
1295     }
1296   char trim_code;
1297   //clean the more dirty end first - 3'
1298 + bool trimmedA=false;
1299 + bool trimmedT=false;
1300 + bool trimmedV=false;
1301 +
1302   int prev_w5=0;
1303   int prev_w3=0;
1304   bool w3upd=true;
# Line 1190 | Line 1307
1307    trim_code=0;
1308    if (w3upd && trim_poly3(wseq, w5, w3, polyA_seed)) {
1309        trim_code='A';
1310 +      b_trimA+=(w5+(wseq.length()-1-w3));
1311 +      if (!trimmedA) { num_trimA++; trimmedA=true; }
1312        }
1313    else if (w3upd && trim_poly3(wseq, w5, w3, polyT_seed)) {
1314        trim_code='T';
1315 +      b_trimT+=(w5+(wseq.length()-1-w3));
1316 +      if (!trimmedT) { num_trimT++; trimmedT=true; }
1317        }
1318    else if (w5upd && trim_poly5(wseq, w5, w3, polyA_seed)) {
1319        trim_code='A';
1320 +      b_trimA+=(w5+(wseq.length()-1-w3));
1321 +      if (!trimmedA) { num_trimA++; trimmedA=true; }
1322        }
1323    else if (w5upd && trim_poly5(wseq, w5, w3, polyT_seed)) {
1324        trim_code='T';
1325 +      b_trimT+=(w5+(wseq.length()-1-w3));
1326 +      if (!trimmedT) { num_trimT++; trimmedT=true; }
1327        }
1328    else if (trim_adaptor5(wseq, w5, w3)) {
1329 <      trim_code='5';
1329 >      trim_code='V';
1330 >      b_trimV+=(w5+(wseq.length()-1-w3));
1331 >      if (!trimmedV) { num_trimV++; trimmedV=true; }
1332        }
1333    else if (trim_adaptor3(wseq, w5, w3)) {
1334 <      trim_code='3';
1334 >      trim_code='V';
1335 >      b_trimV+=(w5+(wseq.length()-1-w3));
1336 >      if (!trimmedV) { num_trimV++; trimmedV=true; }
1337        }
1338    if (trim_code) {
1339       w3upd=(w3!=prev_w3);
# Line 1215 | Line 1344
1344       GMessage("#### TRIM by '%c' code ( w5-w3 = %d-%d ):\n",trim_code, w5,w3);
1345       showTrim(wseq, w5, w3);
1346     #endif
1347 <     int trimlen=wseq.length()-(w3-w5+1);
1219 <     num_trimmed3++;
1220 <     if (trimlen<min_trimmed3)
1221 <         min_trimmed3=trimlen;
1347 >     //int trimlen=wseq.length()-(w3-w5+1);
1348       l5+=w5;
1349       l3-=(wseq.length()-1-w3);
1350       if (w3-w5+1<min_read_len) {
# Line 1230 | Line 1356
1356           wqv=wqv.substr(w5, w3-w5+1);
1357        }//trimming at 3' end
1358   } while (trim_code);
1233
1359   if (doCollapse) {
1360     //keep read for later
1361     FqDupRec* dr=dhash.Find(wseq.chars());

Diff Legend

Removed lines
+ Added lines
< Changed lines
> Changed lines