ViewVC Help
View File | Revision Log | Show Annotations | View Changeset | Root Listing
root/gclib/fqtrim/fqtrim.cpp
(Generate patch)
# Line 7 | Line 7
7  
8   #define USAGE "Usage:\n\
9   fqtrim [{-5 <5adapter> -3 <3adapter>|-f <adapters_file>}] [-a <min_matchlen>]\\\n\
10 <   [-q <minq> [-t <trim_max_len>]] [-p {64|33}] [-o <outsuffix>]\\\n\
10 >   [-R] [-q <minq> [-t <trim_max_len>]] [-p {64|33}] [-o <outsuffix>]\\\n\
11     [-l <minlen>] [-C] [-D] [-Q] [-n <rename_prefix>] [-r <discarded.lst>]\\\n\
12      <input.fq>[,<input_mates.fq>\n\
13   \n\
# Line 72 | Line 72
72   bool doPolyTrim=true;
73   bool fastaOutput=false;
74   bool trashReport=false;
75 < //bool rawFormat=false;
75 > bool revCompl=false; //also reverse complement adaptor sequences
76   int min_read_len=16;
77   double max_perc_N=7.0;
78   int dust_cutoff=16;
# Line 108 | Line 108
108  
109   struct CASeqData {
110     //positional data for every possible hexamer in an adapter
111 <   GVec<uint16>* pz[64]; //0-based coordinates of all possible hexamers in the adapter sequence
112 <   GVec<uint16>* pzr[64]; //0-based coordinates of all possible hexamers for the reverse complement of the adapter sequence
111 >   GVec<uint16>* pz[4096]; //0-based coordinates of all possible hexamers in the adapter sequence
112 >   GVec<uint16>* pzr[4096]; //0-based coordinates of all possible hexamers for the reverse complement of the adapter sequence
113     GStr seq; //actual adapter sequence data
114     GStr seqr; //reverse complement sequence
115 +   bool use_reverse;
116     CASeqData(bool rev=false):seq(),seqr() {
117 <     for (int i=0;i<63;i++) {
118 <       pz[i]=new GVec<uint16>(1);
119 <       if (rev) pzr[i]=new GVec<uint16>(1);
120 <       }
121 <   }
117 >     use_reverse=rev;
118 >     for (int i=0;i<4096;i++) {
119 >        pz[i]=NULL;
120 >        pzr[i]=NULL;
121 >        }
122 >     }
123  
124 <   ~CSeqData() {
125 <     for (int i=0;i<63;i++) {
126 <     delete pz[i];
127 <     if (rev) { delete pzr[i]; }
124 >   void update(const char* s) {
125 >   seq=s;
126 >   table6mers(seq.chars(), seq.length(), pz);
127 >   if (!use_reverse) return;
128 >   //reverse complement
129 >   seqr=s;
130 >   int slen=seq.length();
131 >   for (int i=0;i<slen;i++)
132 >     seqr[i]=ntComplement(seq[slen-i-1]);
133 >   table6mers(seqr.chars(), seqr.length(), pzr);
134 >   }
135 >
136 >   ~CASeqData() {
137 >     for (int i=0;i<4096;i++) {
138 >       delete pz[i];
139 >       delete pzr[i];
140       }
141     }
142   };
# Line 181 | Line 195
195  
196   GHash<FqDupRec> dhash; //hash to keep track of duplicates
197  
198 + void addAdapter(GVec<CASeqData>& adapters, GStr& seq);
199   int loadAdapters(const char* fname);
200  
201   void setupFiles(FILE*& f_in, FILE*& f_in2, FILE*& f_out, FILE*& f_out2,
# Line 208 | Line 223
223   void convertPhred(GStr& q);
224  
225   int main(int argc, char * const argv[]) {
226 <  GArgs args(argc, argv, "YQDCVAl:d:3:5:m:n:r:p:q:f:t:o:z:a:");
226 >  GArgs args(argc, argv, "YQDCRVAl:d:3:5:m:n:r:p:q:f:t:o:z:a:");
227    int e;
228    if ((e=args.isError())>0) {
229        GMessage("%s\nInvalid argument: %s\n", USAGE, argv[e]);
# Line 219 | Line 234
234    convert_phred=(args.getOpt('Q')!=NULL);
235    doCollapse=(args.getOpt('C')!=NULL);
236    doDust=(args.getOpt('D')!=NULL);
237 +  revCompl=(args.getOpt('R')!=NULL);
238    if (args.getOpt('A')) doPolyTrim=false;
239    /*
240    rawFormat=(args.getOpt('R')!=NULL);
# Line 270 | Line 286
286      if (fileAdapters)
287        GError("Error: options -5 and -f cannot be used together!\n");
288      s.upper();
289 <    adapters5.Add(s);
289 >    addAdapter(adapters5, s);
290      }
291    s=args.getOpt('3');
292    if (!s.is_empty()) {
293      if (fileAdapters)
294        GError("Error: options -3 and -f cannot be used together!\n");
295        s.upper();
296 <      adapters3.Add(s);
296 >      addAdapter(adapters3, s);
297      }
298    s=args.getOpt('y');
299    if (!s.is_empty()) {
# Line 930 | Line 946
946   GStr wseq(seq.chars());
947   int wlen=rlen;
948   for (int ai=0;ai<adapters3.Count();ai++) {
949 <  if (adapters3[ai].is_empty()) continue;
950 <  int alen=adapters3[ai].length();
951 <  GStr& aseq=adapters3[ai];
952 <  GXAlnInfo* r_bestaln=match_RightEnd(aseq.chars(), alen, wseq.chars(), wlen, gxmem_r, 74);
949 >  //if (adapters3[ai].is_empty()) continue;
950 >  int alen=adapters3[ai].seq.length();
951 >  GStr& aseq=adapters3[ai].seq;
952 >  GXAlnInfo* r_bestaln=match_RightEnd(aseq.chars(), alen, adapters3[ai].pz,
953 >                            wseq.chars(), wlen, gxmem_r, 74);
954    if (r_bestaln) {
955       trimmed=true;
956       //keep unmatched region on the left, if any
# Line 957 | Line 974
974   GStr wseq(seq.chars());
975   int wlen=rlen;
976   for (int ai=0;ai<adapters5.Count();ai++) {
977 <  if (adapters5[ai].is_empty()) continue;
978 <  int alen=adapters5[ai].length();
979 <  GStr& aseq=adapters5[ai];
977 >  //if (adapters5[ai].is_empty()) continue;
978 >  int alen=adapters5[ai].seq.length();
979 >  GStr& aseq=adapters5[ai].seq;
980    GXAlnInfo* l_bestaln=match_LeftEnd(aseq.chars(), alen, adapters5[ai].pz,
981 <                         wseq.chars(), wlen, gxmem_l, 84);
981 >                 wseq.chars(), wlen, gxmem_l, 84);
982    if (l_bestaln) {
983       trimmed=true;
984       l5+=l_bestaln->sr;
# Line 1320 | Line 1337
1337  
1338   void addAdapter(GVec<CASeqData>& adapters, GStr& seq) {
1339   //TODO: prepare CASeqData here, and collect hexamers as well
1340 <
1340 > CASeqData adata(revCompl);
1341 > int idx=adapters.Add(adata);
1342 > if (idx<0) GError("Error: failed to add adaptor!\n");
1343 > adapters[idx].update(seq.chars());
1344   }
1345  
1346  

Diff Legend

Removed lines
+ Added lines
< Changed lines
> Changed lines