ViewVC Help
View File | Revision Log | Show Annotations | View Changeset | Root Listing
root/gclib/fqtrim/fqtrim.cpp
(Generate patch)
# Line 5 | Line 5
5   #include <ctype.h>
6  
7   #define USAGE "Usage:\n\
8 < fqtrim [-5 <5adapter>] [-3 <3adapter>] [-a <min_matchlen>] [-p {64|33}] [-q <minq> [-t <trim_max>]]\\\n\
9 <   [-n <rename_prefix>] [-o <outsuffix>] [-z <zcmd>] [-r <discarded.lst>]\\\n\
10 <   [-l <minlen>] [-C] [-D] [-Q] <input.fq>[,<input_mates.fq>\n\
8 > fqtrim [{-5 <5adapter> -3 <3adapter>|-f <adapters_file>}] [-a <min_matchlen>]\\\n\
9 >   [-q <minq> [-t <trim_max_len>]] [-p {64|33}] [-o <outsuffix>]\\\n\
10 >   [-l <minlen>] [-C] [-D] [-Q] [-n <rename_prefix>] [-r <discarded.lst>]\\\n\
11 >    <input.fq>[,<input_mates.fq>\n\
12   \n\
13 < Trim low quality bases at the 3' end, optionally trim adapter sequence, filter\n\
14 < for low complexity and collapse duplicate reads\n\
13 > Trim low quality bases at the 3' end and can trim adapter sequence(s), filter\n\
14 > for low complexity and collapse duplicate reads.\n\
15   If read pairs should be trimmed and kept together (i.e. without discarding\n\
16   one read in a pair), the two file names should be given delimited by a comma\n\
17   or a colon character\n\
# Line 19 | Line 20
20   -n  rename all the reads using the <prefix> followed by a read counter;\n\
21      if -C option was given, the suffix \"_x<N>\" is appended, with <N> being\n\
22      the read duplication count\n\
23 < -o  write the trimmed/filtered reads to file(s) named <input>.<outsuffix>\n\
24 <    which will be created in the current (working) directory\n\
23 > -o  unless this parameter is '-', write the trimmed/filtered reads to \n\
24 >    file(s) named <input>.<outsuffix> which will be created in the \n\
25 >    current (working) directory; (writes to stdout if -o- is given);\n\
26 >    a suffix ending with .gz, .gzip or .bz2 will enforce compression\n\
27 > -f  file with adapter sequences to trim, each line having this format:\n\
28 >    <5'-adapter-sequence> <3'-adapter-sequence>\n\
29   -5  trim the given adapter or primer sequence at the 5' end of each read\n\
30      (e.g. -5 CGACAGGTTCAGAGTTCTACAGTCCGACGATC)\n\
31   -3  trim the given adapter sequence at the 3' end of each read\n\
32      (e.g. -3 TCGTATGCCGTCTTCTGCTTG)\n\
33 < -a  minimum bases to match to adaptor sequence (default 5)\n\
33 > -a  minimum length of exact match to adaptor sequence at the proper end (6)\n\
34   -q  trim bases with quality value lower than <minq> (starting at the 3' end)\n\
35 < -t  for -q option, maximum trimming at the 3' end is limited to <trim_max>\n\
35 > -t  for -q option, maximum trimming at the 3' end is limited to <trim_max_len>\n\
36   -m  maximum percentage of Ns allowed in a read after trimming (default 7)\n\
37   -l  minimum \"clean\" length after trimming that a read must have\n\
38      in order to pass the filter (default: 16)\n\
# Line 48 | Line 53
53  
54   // example 3' adapter for miRNAs: TCGTATGCCGTCTTCTGCTTG
55  
56 < //For pair ends sequencing:
56 > //For paired reads sequencing:
57   //3' : ACACTCTTTCCCTACACGACGCTCTTCCGATCT
58   //5' : GATCGGAAGAGCGGTTCAGCAGGAATGCCGAG
59   //FILE* f_out=NULL; //stdout if not provided
# Line 70 | Line 75
75   bool isfasta=false;
76   bool convert_phred=false;
77   GStr outsuffix; // -o
78 < GStr adapter3;
79 < GStr adapter5;
78 > //GStr adapter3;
79 > //GStr adapter5;
80   GStr prefix;
81   GStr zcmd;
82   int num_trimmed5=0;
# Line 89 | Line 94
94   int a3len=0;
95   int a5len=0;
96   // adaptor matching metrics -- for extendMatch() function
97 + const int match_score=2; //match score
98 + const int mismatch_score=-3; //mismatch
99 +
100   const int a_m_score=2; //match score
101   const int a_mis_score=-3; //mismatch
102   const int a_dropoff_score=7;
103 < int a_min_score=10; //an exact match of 5 bases at the proper ends WILL be trimmed
103 > const int poly_dropoff_score=7;
104 > int a_min_score=12; //an exact match of 6 bases at the proper ends WILL be trimmed
105   const int a_min_chain_score=15; //for gapped alignments
106  
107   class CSegChain;
# Line 322 | Line 331
331   //returns 0 if the read was untouched, 1 if it was trimmed and a trash code if it was trashed
332  
333   bool ntrim(GStr& rseq, int &l5, int &l3); //returns true if any trimming occured
334 + bool polytrim(GStr& rseq, int &l5, int &l3); //returns true if any trimming occured
335   bool qtrim(GStr& qvs, int &l5, int &l3); //return true if any trimming occured
336   int dust(GStr& seq);
337   bool trim_adapter3(GStr& seq, int &l5, int &l3); //returns true if any trimming occured
# Line 331 | Line 341
341   void convertPhred(GStr& q);
342  
343   int main(int argc, char * const argv[]) {
344 <  GArgs args(argc, argv, "YQDCVl:d:3:5:m:n:r:p:q:t:o:z:a:");
344 >  GArgs args(argc, argv, "YQDCVl:d:3:5:m:n:r:p:q:f:t:o:z:a:");
345    int e;
346    if ((e=args.isError())>0) {
347        GMessage("%s\nInvalid argument: %s\n", USAGE, argv[e]);
# Line 397 | Line 407
407       int a_minmatch=s.asInt();
408       a_min_score=a_minmatch<<1;
409       }
410 <
410 >  
411    if (args.getOpt('o')!=NULL) outsuffix=args.getOpt('o');
412 +                         else outsuffix="-";
413    trashReport=  (args.getOpt('r')!=NULL);
414    int fcount=args.startNonOpt();
415    if (fcount==0) {
# Line 1018 | Line 1029
1029     }
1030   }
1031   #endif
1032 + const char *pA_seed="AAAA";
1033 + const char *pT_seed="TTTT";
1034 + const int polyA_seed=*(int*)pA_seed;
1035 + const int polyT_seed=*(int*)pT_seed;
1036 +
1037 + bool trim_poly3(GStr &seq, int &l5, int &l3) {
1038 + int rlen=seq.length();
1039 + l5=0;
1040 + l3=rlen-1;
1041 + //assumes N trimming was already done
1042 + //so a poly match should be very close to the end of the read
1043 + //TODO: should we attempt polyA and polyT at the same time, same end?
1044 + // -- find the initial match (seed)
1045 + int rlo=GMAX((rlen-10), 0);
1046 + int si;
1047 + for (si=rlen-5;si>rlo;si--) {
1048 +   if (polyA_seed==*(int*)&(seq.chars()+si)) {
1049 +      break;
1050 +      }
1051 +   }
1052 + if (si<=rlo) return false;
1053 + //seed found, try to extend it to left and right
1054 + int score=4*match_score;
1055 + max_rlo=rlo;
1056 + rlo--;
1057 + while (rlo>=0) {
1058 +    rlo--;
1059 +    score+=
1060 +    
1061 +    }
1062 + }
1063  
1064   bool trim_adapter3(GStr& seq, int&l5, int &l3) {
1065   int rlen=seq.length();
# Line 1611 | Line 1653
1653   f_out2=NULL;
1654   //analyze outsuffix intent
1655   GStr pocmd;
1656 < GStr ox=getFext(outsuffix);
1657 < if (ox.length()>2) ox=ox.substr(0,2);
1658 < if (ox=="gz") pocmd="gzip -9 -c ";
1659 <   else if (ox=="bz") pocmd="bzip2 -9 -c ";
1656 > if (outsuffix=="-") {
1657 >    f_out=stdout;
1658 >    }
1659 >   else {
1660 >    GStr ox=getFext(outsuffix);
1661 >    if (ox.length()>2) ox=ox.substr(0,2);
1662 >    if (ox=="gz") pocmd="gzip -9 -c ";
1663 >        else if (ox=="bz") pocmd="bzip2 -9 -c ";
1664 >    }
1665   if (s=="-") {
1666      f_in=stdin;
1667      infname="stdin";
# Line 1638 | Line 1685
1685     f_in=popen(picmd.chars(), "r");
1686     if (f_in==NULL) GError("Error at popen %s!\n", picmd.chars());
1687     }
1688 + if (f_out==stdout) {
1689 +   if (paired) GError("Error: output suffix required for paired reads\n");
1690 +   return;
1691 +   }
1692   f_out=prepOutFile(infname, pocmd);
1693   if (!paired) return;
1694   if (doCollapse) GError("Error: sorry, -C option cannot be used with paired reads!\n");

Diff Legend

Removed lines
+ Added lines
< Changed lines
> Changed lines