ViewVC Help
View File | Revision Log | Show Annotations | View Changeset | Root Listing
root/gclib/fqtrim/fqtrim.cpp
Revision: 187
Committed: Fri Feb 17 05:42:21 2012 UTC (7 years, 8 months ago) by gpertea
File size: 48278 byte(s)
Log Message:
fqtrim

Line User Rev File contents
1 gpertea 4 #include "GArgs.h"
2     #include "GStr.h"
3     #include "GHash.hh"
4     #include "GList.hh"
5 gpertea 13 #include <ctype.h>
6 gpertea 94 #include "GAlnExtend.h"
7 gpertea 187 #include <inttypes.h>
8 gpertea 4
9     #define USAGE "Usage:\n\
10 gpertea 180 fqtrim [{-5 <5adaptor> -3 <3adaptor>|-f <adaptors_file>}] [-a <min_matchlen>]\\\n\
11 gpertea 172 [-R] [-q <minq> [-t <trim_max_len>]] [-p {64|33}] [-o <outsuffix>]\\\n\
12 gpertea 76 [-l <minlen>] [-C] [-D] [-Q] [-n <rename_prefix>] [-r <discarded.lst>]\\\n\
13     <input.fq>[,<input_mates.fq>\n\
14 gpertea 4 \n\
15 gpertea 180 Trim low quality bases at the 3' end and can trim adaptor sequence(s), filter\n\
16 gpertea 76 for low complexity and collapse duplicate reads.\n\
17 gpertea 13 If read pairs should be trimmed and kept together (i.e. without discarding\n\
18     one read in a pair), the two file names should be given delimited by a comma\n\
19     or a colon character\n\
20 gpertea 4 \n\
21     Options:\n\
22     -n rename all the reads using the <prefix> followed by a read counter;\n\
23     if -C option was given, the suffix \"_x<N>\" is appended, with <N> being\n\
24     the read duplication count\n\
25 gpertea 76 -o unless this parameter is '-', write the trimmed/filtered reads to \n\
26     file(s) named <input>.<outsuffix> which will be created in the \n\
27     current (working) directory; (writes to stdout if -o- is given);\n\
28     a suffix ending with .gz, .gzip or .bz2 will enforce compression\n\
29 gpertea 180 -f file with adaptor sequences to trim, each line having this format:\n\
30     <5'-adaptor-sequence> <3'-adaptor-sequence>\n\
31     -5 trim the given adaptor or primer sequence at the 5' end of each read\n\
32 gpertea 4 (e.g. -5 CGACAGGTTCAGAGTTCTACAGTCCGACGATC)\n\
33 gpertea 180 -3 trim the given adaptor sequence at the 3' end of each read\n\
34 gpertea 4 (e.g. -3 TCGTATGCCGTCTTCTGCTTG)\n\
35 gpertea 103 -A disable polyA/T trimming (enabled by default)\n\
36 gpertea 94 -y minimum length of exact match to adaptor sequence at the proper end (6)\n\
37 gpertea 13 -q trim bases with quality value lower than <minq> (starting at the 3' end)\n\
38 gpertea 76 -t for -q option, maximum trimming at the 3' end is limited to <trim_max_len>\n\
39 gpertea 12 -m maximum percentage of Ns allowed in a read after trimming (default 7)\n\
40     -l minimum \"clean\" length after trimming that a read must have\n\
41     in order to pass the filter (default: 16)\n\
42 gpertea 4 -r write a simple \"trash report\" file listing the discarded reads with a\n\
43     one letter code for the reason why they were trashed\n\
44 gpertea 12 -D apply a low-complexity (dust) filter and discard any read that has over\n\
45 gpertea 187 50%% of its length detected as low complexity\n\
46 gpertea 4 -C collapse duplicate reads and add a prefix with their count to the read\n\
47 gpertea 12 name (e.g. for microRNAs)\n\
48     -p input is phred64/phred33 (use -p64 or -p33)\n\
49     -Q convert quality values to the other Phred qv type\n\
50 gpertea 187 -V be verbose when done (print summary counts)\n\
51 gpertea 4 "
52 gpertea 170
53 gpertea 13 //-z for -o option, the output stream(s) will be first piped into the given\n
54     // <zcmd> command, which must output to stdout (e.g. -z 'bzip2 -9 -c')\n
55    
56 gpertea 170
57 gpertea 180 // example 3' adaptor for miRNAs: TCGTATGCCGTCTTCTGCTTG
58 gpertea 12
59 gpertea 76 //For paired reads sequencing:
60 gpertea 12 //3' : ACACTCTTTCCCTACACGACGCTCTTCCGATCT
61     //5' : GATCGGAAGAGCGGTTCAGCAGGAATGCCGAG
62 gpertea 13 //FILE* f_out=NULL; //stdout if not provided
63     //FILE* f_out2=NULL; //for paired reads
64     //FILE* f_in=NULL; //input fastq (stdin if not provided)
65     //FILE* f_in2=NULL; //for paired reads
66 gpertea 170
67 gpertea 4 FILE* freport=NULL;
68 gpertea 13
69 gpertea 4 bool debug=false;
70 gpertea 12 bool verbose=false;
71 gpertea 4 bool doCollapse=false;
72     bool doDust=false;
73 gpertea 103 bool doPolyTrim=true;
74 gpertea 13 bool fastaOutput=false;
75 gpertea 4 bool trashReport=false;
76 gpertea 172 bool revCompl=false; //also reverse complement adaptor sequences
77 gpertea 4 int min_read_len=16;
78 gpertea 12 double max_perc_N=7.0;
79 gpertea 4 int dust_cutoff=16;
80     bool isfasta=false;
81 gpertea 12 bool convert_phred=false;
82 gpertea 170 GStr outsuffix; // -o
83 gpertea 13 GStr prefix;
84     GStr zcmd;
85 gpertea 187 char isACGT[256];
86 gpertea 12
87 gpertea 187 uint num_trimN=0; //reads trimmed by N%
88     uint num_trimQ=0; //reads trimmed by qv threshold
89     uint num_trimV=0; //reads trimmed by adapter match
90     uint num_trimA=0; //reads trimmed by polyA
91     uint num_trimT=0; //reads trimmed by polyT
92     uint num_trim5=0; //number of reads trimmed at 5' end
93     uint num_trim3=0; //number of reads trimmed at 3' end
94    
95     uint64 b_totalIn=0; //total number of input bases
96     uint64 b_totalN=0; //total number of undetermined bases found in input bases
97    
98     uint64 b_trimN=0; //total number of bases trimmed due to N-trim
99     uint64 b_trimQ=0; //number of bases trimmed due to qv threshold
100     uint64 b_trimV=0; //total number of bases trimmed due to adaptor matches
101     uint64 b_trimA=0; //number of bases trimmed due to poly-A tails
102     uint64 b_trimT=0; //number of bases trimmed due to poly-T tails
103     uint64 b_trim5=0; //total bases trimmed on the 5' side
104     uint64 b_trim3=0; //total bases trimmed on the 3' side
105     //int min_trimmed5=INT_MAX;
106     //int min_trimmed3=INT_MAX;
107    
108 gpertea 13 int qvtrim_qmin=0;
109     int qvtrim_max=0; //for -q, do not trim at 3'-end more than this number of bases
110 gpertea 12 int qv_phredtype=0; // could be 64 or 33 (0 means undetermined yet)
111     int qv_cvtadd=0; //could be -31 or +31
112    
113 gpertea 94 // adaptor matching metrics -- for X-drop ungapped extension
114 gpertea 177 //const int match_reward=2;
115     //const int mismatch_penalty=3;
116 gpertea 103 const int match_reward=2;
117 gpertea 177 const int mismatch_penalty=4;
118     const int Xdrop=10;
119 gpertea 103
120 gpertea 94 const int poly_m_score=2; //match score
121     const int poly_mis_score=-3; //mismatch
122 gpertea 88 const int poly_dropoff_score=7;
123 gpertea 94 int poly_minScore=12; //i.e. an exact match of 6 bases at the proper ends WILL be trimmed
124 gpertea 4
125 gpertea 94 const char *polyA_seed="AAAA";
126     const char *polyT_seed="TTTT";
127 gpertea 12
128 gpertea 170 struct CASeqData {
129 gpertea 180 //positional data for every possible hexamer in an adaptor
130     GVec<uint16>* pz[4096]; //0-based coordinates of all possible hexamers in the adaptor sequence
131     GVec<uint16>* pzr[4096]; //0-based coordinates of all possible hexamers for the reverse complement of the adaptor sequence
132     GStr seq; //actual adaptor sequence data
133 gpertea 170 GStr seqr; //reverse complement sequence
134 gpertea 180 int amlen; //fraction of adaptor length matching that's
135 gpertea 175 //enough to consider the alignment
136 gpertea 180 GAlnTrimType trim_type;
137 gpertea 172 bool use_reverse;
138 gpertea 175 CASeqData(bool rev=false):seq(),seqr(),
139     amlen(0), use_reverse(rev) {
140 gpertea 180 trim_type=galn_None; //should be updated later!
141 gpertea 172 for (int i=0;i<4096;i++) {
142     pz[i]=NULL;
143     pzr[i]=NULL;
144     }
145     }
146    
147     void update(const char* s) {
148 gpertea 175 seq=s;
149     table6mers(seq.chars(), seq.length(), pz);
150     amlen=iround(double(seq.length())*0.8);
151     if (amlen<12)
152     amlen=12;
153     if (!use_reverse) return;
154     //reverse complement
155     seqr=s;
156     int slen=seq.length();
157     for (int i=0;i<slen;i++)
158     seqr[i]=ntComplement(seq[slen-i-1]);
159     table6mers(seqr.chars(), seqr.length(), pzr);
160 gpertea 170 }
161    
162 gpertea 172 ~CASeqData() {
163     for (int i=0;i<4096;i++) {
164     delete pz[i];
165     delete pzr[i];
166 gpertea 170 }
167     }
168     };
169    
170 gpertea 180 GVec<CASeqData> adaptors5;
171     GVec<CASeqData> adaptors3;
172 gpertea 170
173 gpertea 103 CGreedyAlignData* gxmem_l=NULL;
174     CGreedyAlignData* gxmem_r=NULL;
175    
176 gpertea 4 // element in dhash:
177     class FqDupRec {
178     public:
179     int count; //how many of these reads are the same
180     int len; //length of qv
181     char* firstname; //optional, only if we want to keep the original read names
182     char* qv;
183     FqDupRec(GStr* qstr=NULL, const char* rname=NULL) {
184     len=0;
185     qv=NULL;
186     firstname=NULL;
187     count=0;
188     if (qstr!=NULL) {
189     qv=Gstrdup(qstr->chars());
190     len=qstr->length();
191     count++;
192     }
193     if (rname!=NULL) firstname=Gstrdup(rname);
194     }
195     ~FqDupRec() {
196     GFREE(qv);
197     GFREE(firstname);
198     }
199     void add(GStr& d) { //collapse another record into this one
200     if (d.length()!=len)
201     GError("Error at FqDupRec::add(): cannot collapse reads with different length!\n");
202     count++;
203     for (int i=0;i<len;i++)
204     qv[i]=(qv[i]+d[i])/2;
205     }
206     };
207    
208     void openfw(FILE* &f, GArgs& args, char opt) {
209     GStr s=args.getOpt(opt);
210     if (!s.is_empty()) {
211     if (s=='-') f=stdout;
212     else {
213 gpertea 13 f=fopen(s.chars(),"w");
214 gpertea 4 if (f==NULL) GError("Error creating file: %s\n", s.chars());
215     }
216     }
217     }
218    
219     #define FWCLOSE(fh) if (fh!=NULL && fh!=stdout) fclose(fh)
220     #define FRCLOSE(fh) if (fh!=NULL && fh!=stdin) fclose(fh)
221    
222     GHash<FqDupRec> dhash; //hash to keep track of duplicates
223    
224 gpertea 180 void addAdaptor(GVec<CASeqData>& adaptors, GStr& seq, GAlnTrimType trim_type);
225     int loadAdaptors(const char* fname);
226 gpertea 94
227 gpertea 170 void setupFiles(FILE*& f_in, FILE*& f_in2, FILE*& f_out, FILE*& f_out2,
228     GStr& s, GStr& infname, GStr& infname2);
229 gpertea 13 // uses outsuffix to generate output file names and open file handles as needed
230 gpertea 170
231 gpertea 13 void writeRead(FILE* f_out, GStr& rname, GStr& rinfo, GStr& rseq, GStr& rqv, int& outcounter);
232     void trash_report(char trashcode, GStr& rname, FILE* freport);
233    
234 gpertea 170 bool getFastxRec(GLineReader& fq, GStr& rseq, GStr& rqv,
235 gpertea 13 GStr& rname, GStr& rinfo, GStr& infname);
236    
237 gpertea 170 char process_read(GStr& rname, GStr& rseq, GStr& rqv, int &l5, int &l3);
238 gpertea 13 //returns 0 if the read was untouched, 1 if it was trimmed and a trash code if it was trashed
239    
240 gpertea 12 bool ntrim(GStr& rseq, int &l5, int &l3); //returns true if any trimming occured
241     bool qtrim(GStr& qvs, int &l5, int &l3); //return true if any trimming occured
242 gpertea 4 int dust(GStr& seq);
243 gpertea 94 bool trim_poly5(GStr &seq, int &l5, int &l3, const char* poly_seed); //returns true if any trimming occured
244     bool trim_poly3(GStr &seq, int &l5, int &l3, const char* poly_seed);
245 gpertea 180 bool trim_adaptor5(GStr& seq, int &l5, int &l3); //returns true if any trimming occured
246     bool trim_adaptor3(GStr& seq, int &l5, int &l3);
247 gpertea 4
248 gpertea 12 void convertPhred(char* q, int len);
249     void convertPhred(GStr& q);
250 gpertea 4
251     int main(int argc, char * const argv[]) {
252 gpertea 172 GArgs args(argc, argv, "YQDCRVAl:d:3:5:m:n:r:p:q:f:t:o:z:a:");
253 gpertea 4 int e;
254     if ((e=args.isError())>0) {
255     GMessage("%s\nInvalid argument: %s\n", USAGE, argv[e]);
256     exit(224);
257     }
258 gpertea 9 debug=(args.getOpt('Y')!=NULL);
259 gpertea 13 verbose=(args.getOpt('V')!=NULL);
260 gpertea 12 convert_phred=(args.getOpt('Q')!=NULL);
261 gpertea 4 doCollapse=(args.getOpt('C')!=NULL);
262     doDust=(args.getOpt('D')!=NULL);
263 gpertea 172 revCompl=(args.getOpt('R')!=NULL);
264 gpertea 103 if (args.getOpt('A')) doPolyTrim=false;
265 gpertea 12 /*
266 gpertea 4 rawFormat=(args.getOpt('R')!=NULL);
267     if (rawFormat) {
268     GError("Sorry, raw qseq format parsing is not implemented yet!\n");
269     }
270 gpertea 12 */
271 gpertea 4 prefix=args.getOpt('n');
272     GStr s=args.getOpt('l');
273 gpertea 170 if (!s.is_empty())
274 gpertea 4 min_read_len=s.asInt();
275 gpertea 12 s=args.getOpt('m');
276 gpertea 170 if (!s.is_empty())
277 gpertea 12 max_perc_N=s.asDouble();
278 gpertea 4 s=args.getOpt('d');
279     if (!s.is_empty()) {
280     dust_cutoff=s.asInt();
281     doDust=true;
282     }
283 gpertea 12 s=args.getOpt('q');
284     if (!s.is_empty()) {
285 gpertea 13 qvtrim_qmin=s.asInt();
286 gpertea 12 }
287 gpertea 13 s=args.getOpt('t');
288     if (!s.is_empty()) {
289     qvtrim_max=s.asInt();
290     }
291 gpertea 12 s=args.getOpt('p');
292     if (!s.is_empty()) {
293     int v=s.asInt();
294     if (v==33) {
295     qv_phredtype=33;
296     qv_cvtadd=31;
297     }
298     else if (v==64) {
299     qv_phredtype=64;
300     qv_cvtadd=-31;
301     }
302 gpertea 170 else
303 gpertea 12 GMessage("%s\nInvalid value for -p option (can only be 64 or 33)!\n",USAGE);
304     }
305 gpertea 187 memset((void*)isACGT, 0, 256);
306     isACGT['A']=isACGT['a']=isACGT['C']=isACGT['c']=1;
307     isACGT['G']=isACGT['g']=isACGT['T']=isACGT['t']=1;
308 gpertea 94 s=args.getOpt('f');
309     if (!s.is_empty()) {
310 gpertea 180 loadAdaptors(s.chars());
311 gpertea 94 }
312 gpertea 180 bool fileAdaptors=adaptors5.Count()+adaptors3.Count();
313 gpertea 94 s=args.getOpt('5');
314     if (!s.is_empty()) {
315 gpertea 180 if (fileAdaptors)
316 gpertea 94 GError("Error: options -5 and -f cannot be used together!\n");
317     s.upper();
318 gpertea 180 addAdaptor(adaptors5, s, galn_TrimLeft);
319 gpertea 4 }
320 gpertea 94 s=args.getOpt('3');
321     if (!s.is_empty()) {
322 gpertea 180 if (fileAdaptors)
323 gpertea 94 GError("Error: options -3 and -f cannot be used together!\n");
324 gpertea 102 s.upper();
325 gpertea 180 addAdaptor(adaptors3, s, galn_TrimRight);
326 gpertea 4 }
327 gpertea 94 s=args.getOpt('y');
328 gpertea 13 if (!s.is_empty()) {
329 gpertea 94 int minmatch=s.asInt();
330     poly_minScore=minmatch*poly_m_score;
331 gpertea 13 }
332 gpertea 170
333 gpertea 13 if (args.getOpt('o')!=NULL) outsuffix=args.getOpt('o');
334 gpertea 76 else outsuffix="-";
335 gpertea 4 trashReport= (args.getOpt('r')!=NULL);
336 gpertea 13 int fcount=args.startNonOpt();
337     if (fcount==0) {
338 gpertea 4 GMessage(USAGE);
339     exit(224);
340     }
341 gpertea 13 if (fcount>1 && doCollapse) {
342     GError("%s Sorry, the -C option only works with a single input.\n", USAGE);
343     }
344     //openfw(f_out, args, 'o');
345     //if (f_out==NULL) f_out=stdout;
346 gpertea 4 if (trashReport)
347     openfw(freport, args, 'r');
348     char* infile=NULL;
349 gpertea 103
350 gpertea 180 if (adaptors5.Count()>0)
351 gpertea 177 //gxmem_l=new CGreedyAlignData(match_reward, mismatch_penalty, Xdrop-2);
352     gxmem_l=new CGreedyAlignData(match_reward, mismatch_penalty, Xdrop);
353 gpertea 180 if (adaptors3.Count()>0)
354 gpertea 103 gxmem_r=new CGreedyAlignData(match_reward, mismatch_penalty, Xdrop);
355    
356 gpertea 4 while ((infile=args.nextNonOpt())!=NULL) {
357 gpertea 103 //for each input file
358 gpertea 13 int incounter=0; //counter for input reads
359     int outcounter=0; //counter for output reads
360     int trash_s=0; //too short from the get go
361     int trash_Q=0;
362     int trash_N=0;
363     int trash_D=0;
364 gpertea 103 int trash_poly=0;
365 gpertea 187 int trash_V=0;
366     num_trimN=0;
367     num_trimQ=0;
368     num_trimV=0;
369     num_trimA=0;
370     num_trimT=0;
371     num_trim5=0;
372     num_trim3=0;
373    
374     b_totalIn=0;
375     b_totalN=0;
376     b_trimN=0;
377     b_trimQ=0;
378     b_trimV=0;
379     b_trimA=0;
380     b_trimT=0;
381     b_trim5=0;
382     b_trim3=0;
383    
384 gpertea 13 s=infile;
385     GStr infname;
386     GStr infname2;
387     FILE* f_in=NULL;
388     FILE* f_in2=NULL;
389     FILE* f_out=NULL;
390     FILE* f_out2=NULL;
391     bool paired_reads=false;
392     setupFiles(f_in, f_in2, f_out, f_out2, s, infname, infname2);
393     GLineReader fq(f_in);
394     GLineReader* fq2=NULL;
395     if (f_in2!=NULL) {
396     fq2=new GLineReader(f_in2);
397     paired_reads=true;
398     }
399     GStr rseq, rqv, seqid, seqinfo;
400     GStr rseq2, rqv2, seqid2, seqinfo2;
401     while (getFastxRec(fq, rseq, rqv, seqid, seqinfo, infname)) {
402     incounter++;
403     int a5=0, a3=0, b5=0, b3=0;
404     char tcode=0, tcode2=0;
405     tcode=process_read(seqid, rseq, rqv, a5, a3);
406 gpertea 187 if (a5>0) {
407     b_trim5+=a5;
408     num_trim5++;
409     }
410     if (a3<rseq.length()-1) {
411     b_trim3+=rseq.length()-a3-1;
412     num_trim3++;
413     }
414 gpertea 103 if (fq2!=NULL) {
415 gpertea 13 getFastxRec(*fq2, rseq2, rqv2, seqid2, seqinfo2, infname2);
416     if (seqid.substr(0,seqid.length()-1)!=seqid2.substr(0,seqid2.length()-1)) {
417     GError("Error: no paired match for read %s vs %s (%s,%s)\n",
418     seqid.chars(), seqid2.chars(), infname.chars(), infname2.chars());
419     }
420     tcode2=process_read(seqid2, rseq2, rqv2, b5, b3);
421 gpertea 187 if (b5>0) {
422     b_trim5+=b5;
423     num_trim5++;
424     }
425     if (b3<rseq2.length()-1) {
426     b_trim3+=rseq2.length()-b3-1;
427     num_trim3++;
428     }
429 gpertea 13 //decide what to do with this pair and print rseq2 if the pair makes it
430     if (tcode>1 && tcode2<=1) {
431     //"untrash" rseq
432     if (a3-a5+1<min_read_len) {
433     a5=1;
434     if (a3<min_read_len) { a3= GMIN(rseq.length()-1, min_read_len+1); }
435 gpertea 4 }
436 gpertea 13 tcode=1;
437     }
438     else if (tcode<=1 && tcode2>1) {
439     //"untrash" rseq2
440     if (b3-b5+1<min_read_len) {
441     b5=1;
442     if (b3<min_read_len) { b3= GMIN((rseq2.length()-1),(min_read_len+1)); }
443     }
444     tcode2=1;
445     }
446     if (tcode<=1) { //trimmed or left intact -- write it!
447     if (tcode>0) {
448     rseq2=rseq2.substr(b5,b3-b5+1);
449     if (!rqv2.is_empty()) rqv2=rqv2.substr(b5,b3-b5+1);
450 gpertea 4 }
451 gpertea 13 int nocounter=0;
452     writeRead(f_out2, seqid2, seqinfo2, rseq2, rqv2, nocounter);
453     }
454 gpertea 103 } //pair read
455 gpertea 13 if (tcode>1) { //trashed
456 gpertea 103 #ifdef GDEBUG
457 gpertea 175 GMessage(" !!!!TRASH code = %c\n",tcode);
458 gpertea 103 #endif
459 gpertea 13 if (tcode=='s') trash_s++;
460 gpertea 103 else if (tcode=='A' || tcode=='T') trash_poly++;
461 gpertea 13 else if (tcode=='Q') trash_Q++;
462     else if (tcode=='N') trash_N++;
463     else if (tcode=='D') trash_D++;
464 gpertea 187 else if (tcode=='V') trash_V++;
465     //else if (tcode=='5') trash_A5++;
466 gpertea 13 if (trashReport) trash_report(tcode, seqid, freport);
467     }
468     else if (!doCollapse) { //write it
469     if (tcode>0) {
470     rseq=rseq.substr(a5,a3-a5+1);
471     if (!rqv.is_empty()) rqv=rqv.substr(a5,a3-a5+1);
472     }
473 gpertea 103 #ifdef GDEBUG
474     GMessage(" After trimming:\n");
475 gpertea 105 GMessage("%s\n",rseq.chars());
476 gpertea 103 #endif
477 gpertea 13 writeRead(f_out, seqid, seqinfo, rseq, rqv, outcounter);
478     }
479     } //for each fastq record
480     delete fq2;
481     FRCLOSE(f_in);
482     FRCLOSE(f_in2);
483     if (doCollapse) {
484     outcounter=0;
485     int maxdup_count=1;
486     char* maxdup_seq=NULL;
487     dhash.startIterate();
488     FqDupRec* qd=NULL;
489     char* seq=NULL;
490     while ((qd=dhash.NextData(seq))!=NULL) {
491     GStr rseq(seq);
492     //do the dusting here
493     if (doDust) {
494     int dustbases=dust(rseq);
495     if (dustbases>(rseq.length()>>1)) {
496     if (trashReport && qd->firstname!=NULL) {
497     fprintf(freport, "%s_x%d\tD\n",qd->firstname, qd->count);
498 gpertea 4 }
499 gpertea 13 trash_D+=qd->count;
500     continue;
501     }
502     }
503     outcounter++;
504 gpertea 170 if (qd->count>maxdup_count) {
505 gpertea 13 maxdup_count=qd->count;
506     maxdup_seq=seq;
507     }
508     if (isfasta) {
509     if (prefix.is_empty()) {
510 gpertea 170 fprintf(f_out, ">%s_x%d\n%s\n", qd->firstname, qd->count,
511 gpertea 13 rseq.chars());
512 gpertea 4 }
513 gpertea 13 else { //use custom read name
514     fprintf(f_out, ">%s%08d_x%d\n%s\n", prefix.chars(), outcounter,
515     qd->count, rseq.chars());
516 gpertea 4 }
517 gpertea 13 }
518     else { //fastq format
519     if (convert_phred) convertPhred(qd->qv, qd->len);
520     if (prefix.is_empty()) {
521 gpertea 170 fprintf(f_out, "@%s_x%d\n%s\n+\n%s\n", qd->firstname, qd->count,
522 gpertea 13 rseq.chars(), qd->qv);
523 gpertea 4 }
524 gpertea 13 else { //use custom read name
525     fprintf(f_out, "@%s%08d_x%d\n%s\n+\n%s\n", prefix.chars(), outcounter,
526     qd->count, rseq.chars(), qd->qv);
527     }
528     }
529     }//for each element of dhash
530     if (maxdup_count>1) {
531     GMessage("Maximum read multiplicity: x %d (read: %s)\n",maxdup_count, maxdup_seq);
532 gpertea 4 }
533 gpertea 13 } //collapse entries
534     if (verbose) {
535     if (paired_reads) {
536     GMessage(">Input files : %s , %s\n", infname.chars(), infname2.chars());
537 gpertea 187 GMessage("Number of input pairs :%9u\n", incounter);
538     GMessage(" Output pairs :%9u\t(%u trashed)\n", outcounter, incounter-outcounter);
539 gpertea 13 }
540     else {
541     GMessage(">Input file : %s\n", infname.chars());
542     GMessage("Number of input reads :%9d\n", incounter);
543 gpertea 187 GMessage(" Output reads :%9d (%u trashed)\n", outcounter, incounter-outcounter);
544 gpertea 13 }
545 gpertea 187 GMessage("--------------- Read trimming: ------------------\n");
546     if (num_trim5)
547     GMessage(" 5' trimmed :%9u\n", num_trim5);
548     if (num_trim3)
549     GMessage(" 3' trimmed :%9u\n", num_trim3);
550     if (num_trimQ)
551     GMessage(" q.v. trimmed :%9u\n", num_trimQ);
552     if (num_trimN)
553     GMessage(" N trimmed :%9u\n", num_trimN);
554     if (num_trimT)
555     GMessage(" poly-T trimmed :%9u\n", num_trimT);
556     if (num_trimA)
557     GMessage(" poly-A trimmed :%9u\n", num_trimA);
558     if (num_trimV)
559     GMessage(" Adapter trimmed :%9u\n", num_trimV);
560 gpertea 13 GMessage("------------------------------------\n");
561     if (trash_s>0)
562 gpertea 187 GMessage("Trashed by initial len:%9d\n", trash_s);
563 gpertea 13 if (trash_N>0)
564     GMessage(" Trashed by N%%:%9d\n", trash_N);
565     if (trash_Q>0)
566     GMessage("Trashed by low quality:%9d\n", trash_Q);
567 gpertea 103 if (trash_poly>0)
568     GMessage(" Trashed by poly-A/T:%9d\n", trash_poly);
569 gpertea 187 if (trash_V>0)
570     GMessage(" Trashed by adaptor:%9d\n", trash_V);
571     GMessage("\n--------------- Base counts --------------------\n");
572     GMessage(" Input bases :%12llu\n", b_totalIn);
573     double percN=100.0* ((double)b_totalN/(double)b_totalIn);
574     GMessage(" N bases :%12llu (%4.2f%%)\n", b_totalN, percN);
575     GMessage(" trimmed from 5':%12llu\n", b_trim5);
576     GMessage(" trimmed from 3':%12llu\n", b_trim3);
577     GMessage("\n");
578     if (b_trimQ)
579     GMessage(" q.v. trimmed :%12llu\n", b_trimQ);
580     if (b_trimN)
581     GMessage(" N trimmed :%12llu\n", b_trimN);
582     if (b_trimT)
583     GMessage(" poly-T trimmed :%12llu\n", b_trimT);
584     if (b_trimA)
585     GMessage(" poly-A trimmed :%12llu\n", b_trimA);
586     if (b_trimV)
587     GMessage(" Adapter trimmed :%12llu\n", b_trimV);
588    
589 gpertea 13 }
590     if (trashReport) {
591     FWCLOSE(freport);
592 gpertea 4 }
593 gpertea 13 FWCLOSE(f_out);
594     FWCLOSE(f_out2);
595     } //while each input file
596 gpertea 103 delete gxmem_l;
597     delete gxmem_r;
598 gpertea 12 //getc(stdin);
599 gpertea 4 }
600    
601     class NData {
602     public:
603     int NPos[1024]; //there should be no reads longer than 1K ?
604     int NCount;
605     int end5;
606     int end3;
607     int seqlen;
608     double perc_N; //percentage of Ns in end5..end3 range only!
609     const char* seq;
610     bool valid;
611     NData() {
612 gpertea 170 seqlen=0;
613 gpertea 4 NCount=0;
614     end5=0;
615     end3=0;
616     seq=NULL;
617     perc_N=0;
618     valid=true;
619     }
620     void init(GStr& rseq) {
621     NCount=0;
622     perc_N=0;
623     seqlen=rseq.length();
624     seq=rseq.chars();
625     end5=1;
626     end3=seqlen;
627     for (int i=0;i<seqlen;i++)
628 gpertea 12 if (seq[i]=='N') {// if (!ichrInStr(rseq[i], "ACGT")
629 gpertea 4 NPos[NCount]=i;
630     NCount++;
631     }
632     perc_N=(NCount*100.0)/seqlen;
633     valid=true;
634     }
635     void N_calc() { //only in the region after trimming
636     int n=0;
637     for (int i=end3-1;i<end5;i++) {
638     if (seq[i]=='N') n++;
639     }
640     perc_N=(n*100.0)/(end5-end3+1);
641     }
642     };
643 gpertea 170
644 gpertea 4 static NData feat;
645     int perc_lenN=12; // incremental distance from ends, in percentage of
646     // sequence length, where N-trimming is done (default:12 %) (autolimited to 20)
647 gpertea 170
648 gpertea 4 void N_analyze(int l5, int l3, int p5, int p3) {
649     /* assumes feat was filled properly */
650     int old_dif, t5,t3,v;
651 gpertea 12 if (l3<l5+2 || p5>p3 ) {
652 gpertea 4 feat.end5=l5+1;
653     feat.end3=l3+1;
654 gpertea 170 return;
655 gpertea 4 }
656    
657     t5=feat.NPos[p5]-l5;
658     t3=l3-feat.NPos[p3];
659     old_dif=p3-p5;
660     v=(int)((((double)(l3-l5))*perc_lenN)/100);
661 gpertea 170 if (v>20) v=20; /* enforce N-search limit for very long reads */
662 gpertea 4 if (t5 < v ) {
663     l5=feat.NPos[p5]+1;
664     p5++;
665     }
666     if (t3 < v) {
667     l3=feat.NPos[p3]-1;
668     p3--;
669     }
670     /* restNs=p3-p5; number of Ns in the new CLR */
671     if (p3-p5==old_dif) { /* no change, return */
672     /* don't trim if this may shorten the read too much */
673     //if (l5-l3<min_read_len) return;
674     feat.end5=l5+1;
675     feat.end3=l3+1;
676     return;
677     }
678 gpertea 170 else
679 gpertea 4 N_analyze(l5,l3, p5,p3);
680     }
681    
682    
683 gpertea 12 bool qtrim(GStr& qvs, int &l5, int &l3) {
684 gpertea 13 if (qvtrim_qmin==0 || qvs.is_empty()) return false;
685 gpertea 12 if (qv_phredtype==0) {
686     //try to guess the Phred type
687     int vmin=256, vmax=0;
688     for (int i=0;i<qvs.length();i++) {
689     if (vmin>qvs[i]) vmin=qvs[i];
690     if (vmax<qvs[i]) vmax=qvs[i];
691     }
692     if (vmin<64) { qv_phredtype=33; qv_cvtadd=31; }
693     if (vmax>95) { qv_phredtype=64; qv_cvtadd=-31; }
694     if (qv_phredtype==0) {
695     GError("Error: couldn't determine Phred type, please use the -p33 or -p64 !\n");
696     }
697 gpertea 13 if (verbose)
698     GMessage("Input reads have Phred-%d quality values.\n", (qv_phredtype==33 ? 33 : 64));
699     } //guessing Phred type
700 gpertea 12 for (l3=qvs.length()-1;l3>2;l3--) {
701 gpertea 13 if (qvs[l3]-qv_phredtype>=qvtrim_qmin && qvs[l3-1]-qv_phredtype>=qvtrim_qmin) break;
702 gpertea 12 }
703     //just in case, check also the 5' the end (?)
704     for (l5=0;l5<qvs.length()-3;l5++) {
705 gpertea 13 if (qvs[l5]-qv_phredtype>=qvtrim_qmin && qvs[l5+1]-qv_phredtype>=qvtrim_qmin) break;
706 gpertea 12 }
707 gpertea 13 if (qvtrim_max>0) {
708     if (qvs.length()-1-l3>qvtrim_max) l3=qvs.length()-1-qvtrim_max;
709     if (l5>qvtrim_max) l5=qvtrim_max;
710     }
711 gpertea 12 return (l5>0 || l3<qvs.length()-1);
712     }
713    
714     bool ntrim(GStr& rseq, int &l5, int &l3) {
715     //count Ns in the sequence, trim N-rich ends
716 gpertea 4 feat.init(rseq);
717     l5=feat.end5-1;
718     l3=feat.end3-1;
719 gpertea 170 N_analyze(feat.end5-1, feat.end3-1, 0, feat.NCount-1);
720 gpertea 12 if (l5==feat.end5-1 && l3==feat.end3-1) {
721     if (feat.perc_N>max_perc_N) {
722     feat.valid=false;
723     l3=l5+1;
724     return true;
725     }
726     else {
727     return false; //nothing changed
728     }
729     }
730 gpertea 4 l5=feat.end5-1;
731     l3=feat.end3-1;
732     if (l3-l5+1<min_read_len) {
733     feat.valid=false;
734     return true;
735     }
736     feat.N_calc();
737 gpertea 170
738 gpertea 12 if (feat.perc_N>max_perc_N) {
739 gpertea 4 feat.valid=false;
740     l3=l5+1;
741     return true;
742     }
743     return true;
744     }
745    
746     //--------------- dust functions ----------------
747     class DNADuster {
748     public:
749 gpertea 170 int dustword;
750     int dustwindow;
751     int dustwindow2;
752 gpertea 4 int dustcutoff;
753     int mv, iv, jv;
754     int counts[32*32*32];
755     int iis[32*32*32];
756     DNADuster(int cutoff=16, int winsize=32, int wordsize=3) {
757     dustword=wordsize;
758     dustwindow=winsize;
759     dustwindow2 = (winsize>>1);
760     dustcutoff=cutoff;
761     mv=0;
762     iv=0;
763     jv=0;
764     }
765     void setWindowSize(int value) {
766     dustwindow = value;
767     dustwindow2 = (dustwindow >> 1);
768     }
769     void setWordSize(int value) {
770     dustword=value;
771     }
772     void wo1(int len, const char* s, int ivv) {
773     int i, ii, j, v, t, n, n1, sum;
774     int js, nis;
775     n = 32 * 32 * 32;
776     n1 = n - 1;
777     nis = 0;
778     i = 0;
779     ii = 0;
780     sum = 0;
781     v = 0;
782     for (j=0; j < len; j++, s++) {
783     ii <<= 5;
784     if (*s<=32) {
785     i=0;
786     continue;
787     }
788     ii |= *s - 'A'; //assume uppercase!
789     ii &= n1;
790     i++;
791     if (i >= dustword) {
792     for (js=0; js < nis && iis[js] != ii; js++) ;
793     if (js == nis) {
794     iis[nis] = ii;
795     counts[ii] = 0;
796     nis++;
797     }
798     if ((t = counts[ii]) > 0) {
799     sum += t;
800     v = 10 * sum / j;
801     if (mv < v) {
802     mv = v;
803     iv = ivv;
804     jv = j;
805     }
806     }
807     counts[ii]++;
808     }
809     }
810     }
811    
812     int wo(int len, const char* s, int* beg, int* end) {
813     int i, l1;
814     l1 = len - dustword + 1;
815     if (l1 < 0) {
816     *beg = 0;
817     *end = len - 1;
818     return 0;
819     }
820     mv = 0;
821     iv = 0;
822     jv = 0;
823     for (i=0; i < l1; i++) {
824     wo1(len-i, s+i, i);
825     }
826     *beg = iv;
827     *end = iv + jv;
828     return mv;
829     }
830    
831     void dust(const char* seq, char* seqmsk, int seqlen, int cutoff=0) { //, maskFunc maskfn) {
832     int i, j, l, a, b, v;
833     if (cutoff==0) cutoff=dustcutoff;
834     a=0;b=0;
835     //GMessage("Dust cutoff=%d\n", cutoff);
836     for (i=0; i < seqlen; i += dustwindow2) {
837     l = (seqlen > i+dustwindow) ? dustwindow : seqlen-i;
838     v = wo(l, seq+i, &a, &b);
839     if (v > cutoff) {
840     //for (j = a; j <= b && j < dustwindow2; j++) {
841     for (j = a; j <= b; j++) {
842     seqmsk[i+j]='N';//could be made lowercase instead
843     }
844     }
845     }
846 gpertea 170 //return first;
847 gpertea 4 }
848     };
849    
850     static DNADuster duster;
851    
852     int dust(GStr& rseq) {
853     char* seq=Gstrdup(rseq.chars());
854     duster.dust(rseq.chars(), seq, rseq.length(), dust_cutoff);
855     //check the number of Ns:
856     int ncount=0;
857     for (int i=0;i<rseq.length();i++) {
858     if (seq[i]=='N') ncount++;
859     }
860     GFREE(seq);
861     return ncount;
862     }
863    
864 gpertea 94 struct SLocScore {
865     int pos;
866     int score;
867     SLocScore(int p=0,int s=0) {
868     pos=p;
869     score=s;
870     }
871     void set(int p, int s) {
872     pos=p;
873     score=s;
874     }
875     void add(int p, int add) {
876     pos=p;
877     score+=add;
878     }
879     };
880 gpertea 12
881 gpertea 94 bool trim_poly3(GStr &seq, int &l5, int &l3, const char* poly_seed) {
882 gpertea 103 if (!doPolyTrim) return false;
883 gpertea 94 int rlen=seq.length();
884     l5=0;
885     l3=rlen-1;
886     int32 seedVal=*(int32*)poly_seed;
887     char polyChar=poly_seed[0];
888     //assumes N trimming was already done
889     //so a poly match should be very close to the end of the read
890     // -- find the initial match (seed)
891 gpertea 108 int lmin=GMAX((rlen-16), 0);
892 gpertea 94 int li;
893     for (li=rlen-4;li>lmin;li--) {
894     if (seedVal==*(int*)&(seq[li])) {
895     break;
896     }
897 gpertea 12 }
898 gpertea 94 if (li<=lmin) return false;
899     //seed found, try to extend it both ways
900     //extend right
901     int ri=li+3;
902     SLocScore loc(ri, poly_m_score<<2);
903     SLocScore maxloc(loc);
904     //extend right
905 gpertea 170 while (ri<rlen-1) {
906 gpertea 94 ri++;
907     if (seq[ri]==polyChar) {
908     loc.add(ri,poly_m_score);
909     }
910     else if (seq[ri]=='N') {
911     loc.add(ri,0);
912     }
913     else { //mismatch
914     loc.add(ri,poly_mis_score);
915     if (maxloc.score-loc.score>poly_dropoff_score) break;
916     }
917     if (maxloc.score<=loc.score) {
918     maxloc=loc;
919     }
920 gpertea 12 }
921 gpertea 94 ri=maxloc.pos;
922 gpertea 103 if (ri<rlen-6) return false; //no trimming wanted, too far from 3' end
923 gpertea 94 //ri = right boundary for the poly match
924     //extend left
925     loc.set(li, maxloc.score);
926     maxloc.pos=li;
927     while (li>0) {
928     li--;
929     if (seq[li]==polyChar) {
930     loc.add(li,poly_m_score);
931     }
932     else if (seq[li]=='N') {
933     loc.add(li,0);
934     }
935     else { //mismatch
936     loc.add(li,poly_mis_score);
937     if (maxloc.score-loc.score>poly_dropoff_score) break;
938     }
939     if (maxloc.score<=loc.score) {
940     maxloc=loc;
941     }
942     }
943 gpertea 105 li=maxloc.pos;
944 gpertea 108 if ((maxloc.score==poly_minScore && ri==rlen-1) ||
945     (maxloc.score>poly_minScore && ri>=rlen-3) ||
946     (maxloc.score>(poly_minScore*3) && ri>=rlen-8)) {
947 gpertea 105 //trimming this li-ri match at 3' end
948     l3=li-1;
949     if (l3<0) l3=0;
950 gpertea 94 return true;
951     }
952     return false;
953 gpertea 12 }
954    
955 gpertea 94 bool trim_poly5(GStr &seq, int &l5, int &l3, const char* poly_seed) {
956 gpertea 103 if (!doPolyTrim) return false;
957 gpertea 88 int rlen=seq.length();
958     l5=0;
959     l3=rlen-1;
960 gpertea 94 int32 seedVal=*(int32*)poly_seed;
961     char polyChar=poly_seed[0];
962 gpertea 88 //assumes N trimming was already done
963     //so a poly match should be very close to the end of the read
964     // -- find the initial match (seed)
965 gpertea 108 int lmax=GMIN(12, rlen-4);//how far from 5' end to look for 4-mer seeds
966 gpertea 94 int li;
967     for (li=0;li<=lmax;li++) {
968     if (seedVal==*(int*)&(seq[li])) {
969 gpertea 88 break;
970     }
971     }
972 gpertea 94 if (li>lmax) return false;
973     //seed found, try to extend it both ways
974     //extend left
975     int ri=li+3; //save rightmost base of the seed
976     SLocScore loc(li, poly_m_score<<2);
977     SLocScore maxloc(loc);
978     while (li>0) {
979     li--;
980     if (seq[li]==polyChar) {
981     loc.add(li,poly_m_score);
982     }
983     else if (seq[li]=='N') {
984     loc.add(li,0);
985     }
986     else { //mismatch
987     loc.add(li,poly_mis_score);
988     if (maxloc.score-loc.score>poly_dropoff_score) break;
989     }
990     if (maxloc.score<=loc.score) {
991     maxloc=loc;
992     }
993 gpertea 88 }
994 gpertea 94 li=maxloc.pos;
995 gpertea 103 if (li>5) return false; //no trimming wanted, too far from 5' end
996 gpertea 94 //li = right boundary for the poly match
997    
998     //extend right
999     loc.set(ri, maxloc.score);
1000     maxloc.pos=ri;
1001 gpertea 170 while (ri<rlen-1) {
1002 gpertea 94 ri++;
1003     if (seq[ri]==polyChar) {
1004     loc.add(ri,poly_m_score);
1005     }
1006     else if (seq[ri]=='N') {
1007     loc.add(ri,0);
1008     }
1009     else { //mismatch
1010     loc.add(ri,poly_mis_score);
1011     if (maxloc.score-loc.score>poly_dropoff_score) break;
1012     }
1013     if (maxloc.score<=loc.score) {
1014     maxloc=loc;
1015     }
1016     }
1017 gpertea 105 ri=maxloc.pos;
1018 gpertea 103 if ((maxloc.score==poly_minScore && li==0) ||
1019     (maxloc.score>poly_minScore && li<2)
1020 gpertea 108 || (maxloc.score>(poly_minScore*3) && li<8)) {
1021 gpertea 105 //adjust l5 to reflect this trimming of 5' end
1022     l5=ri+1;
1023     if (l5>rlen-1) l5=rlen-1;
1024 gpertea 94 return true;
1025     }
1026     return false;
1027 gpertea 88 }
1028    
1029 gpertea 180 bool trim_adaptor3(GStr& seq, int&l5, int &l3) {
1030     if (adaptors3.Count()==0) return false;
1031 gpertea 4 int rlen=seq.length();
1032     l5=0;
1033     l3=rlen-1;
1034 gpertea 103 bool trimmed=false;
1035 gpertea 175 GStr wseq(seq);
1036 gpertea 103 int wlen=rlen;
1037 gpertea 175 GXSeqData seqdata;
1038 gpertea 180 int numruns=revCompl ? 2 : 1;
1039 gpertea 185 GList<GXAlnInfo> bestalns(true, true, false);
1040 gpertea 180 for (int ai=0;ai<adaptors3.Count();ai++) {
1041     for (int r=0;r<numruns;r++) {
1042     if (r) {
1043     seqdata.update(adaptors3[ai].seqr.chars(), adaptors3[ai].seqr.length(),
1044     adaptors3[ai].pzr, wseq.chars(), wlen, adaptors3[ai].amlen);
1045     }
1046     else {
1047     seqdata.update(adaptors3[ai].seq.chars(), adaptors3[ai].seq.length(),
1048     adaptors3[ai].pz, wseq.chars(), wlen, adaptors3[ai].amlen);
1049     }
1050 gpertea 185 GXAlnInfo* aln=match_adaptor(seqdata, adaptors3[ai].trim_type, gxmem_r, 86);
1051     if (aln) {
1052     if (aln->strong) {
1053     trimmed=true;
1054     bestalns.Add(aln);
1055     break; //will check the rest next time
1056     }
1057     else bestalns.Add(aln);
1058     }
1059 gpertea 180 }//forward and reverse adaptors
1060 gpertea 185 if (trimmed) break; //will check the rest in the next cycle
1061 gpertea 180 }//for each 3' adaptor
1062 gpertea 185 if (bestalns.Count()>0) {
1063     GXAlnInfo* aln=bestalns[0];
1064     if (aln->sl-1 > wlen-aln->sr) {
1065     //keep left side
1066     l3-=(wlen-aln->sl+1);
1067     if (l3<0) l3=0;
1068     }
1069     else { //keep right side
1070     l5+=aln->sr;
1071     if (l5>=rlen) l5=rlen-1;
1072     }
1073     //delete aln;
1074     //if (l3-l5+1<min_read_len) return true;
1075     wseq=seq.substr(l5,l3-l5+1);
1076     wlen=wseq.length();
1077     return true; //break the loops here to report a good find
1078     }
1079     return false;
1080 gpertea 4 }
1081    
1082 gpertea 180 bool trim_adaptor5(GStr& seq, int&l5, int &l3) {
1083     if (adaptors5.Count()==0) return false;
1084 gpertea 4 int rlen=seq.length();
1085     l5=0;
1086     l3=rlen-1;
1087 gpertea 102 bool trimmed=false;
1088 gpertea 175 GStr wseq(seq);
1089 gpertea 103 int wlen=rlen;
1090 gpertea 175 GXSeqData seqdata;
1091 gpertea 180 int numruns=revCompl ? 2 : 1;
1092 gpertea 183 GList<GXAlnInfo> bestalns(true, true, false);
1093 gpertea 180 for (int ai=0;ai<adaptors5.Count();ai++) {
1094     for (int r=0;r<numruns;r++) {
1095     if (r) {
1096     seqdata.update(adaptors5[ai].seqr.chars(), adaptors5[ai].seqr.length(),
1097     adaptors5[ai].pzr, wseq.chars(), wlen, adaptors5[ai].amlen);
1098     }
1099     else {
1100     seqdata.update(adaptors5[ai].seq.chars(), adaptors5[ai].seq.length(),
1101     adaptors5[ai].pz, wseq.chars(), wlen, adaptors5[ai].amlen);
1102     }
1103 gpertea 182 GXAlnInfo* aln=match_adaptor(seqdata, adaptors5[ai].trim_type, gxmem_l, 90);
1104     if (aln) {
1105 gpertea 183 if (aln->strong) {
1106     trimmed=true;
1107     bestalns.Add(aln);
1108     break; //will check the rest next time
1109     }
1110     else bestalns.Add(aln);
1111     }
1112     } //forward and reverse?
1113     if (trimmed) break; //will check the rest in the next cycle
1114     }//for each 5' adaptor
1115     if (bestalns.Count()>0) {
1116     GXAlnInfo* aln=bestalns[0];
1117 gpertea 182 if (aln->sl-1 > wlen-aln->sr) {
1118 gpertea 180 //keep left side
1119 gpertea 182 l3-=(wlen-aln->sl+1);
1120 gpertea 180 if (l3<0) l3=0;
1121     }
1122     else { //keep right side
1123 gpertea 182 l5+=aln->sr;
1124 gpertea 180 if (l5>=rlen) l5=rlen-1;
1125     }
1126 gpertea 183 //delete aln;
1127     //if (l3-l5+1<min_read_len) return true;
1128 gpertea 180 wseq=seq.substr(l5,l3-l5+1);
1129     wlen=wseq.length();
1130 gpertea 183 return true; //break the loops here to report a good find
1131     }
1132     return false;
1133 gpertea 98 }
1134 gpertea 4
1135 gpertea 170 //convert qvs to/from phred64 from/to phread33
1136 gpertea 12 void convertPhred(GStr& q) {
1137     for (int i=0;i<q.length();i++) q[i]+=qv_cvtadd;
1138 gpertea 4 }
1139    
1140 gpertea 12 void convertPhred(char* q, int len) {
1141     for (int i=0;i<len;i++) q[i]+=qv_cvtadd;
1142 gpertea 4 }
1143    
1144 gpertea 170 bool getFastxRec(GLineReader& fq, GStr& rseq, GStr& rqv,
1145 gpertea 13 GStr& rname, GStr& rinfo, GStr& infname) {
1146     rseq="";
1147     rqv="";
1148     rname="";
1149     rinfo="";
1150     if (fq.eof()) return false;
1151     char* l=fq.getLine();
1152     while (l!=NULL && (l[0]==0 || isspace(l[0]))) l=fq.getLine(); //ignore empty lines
1153     if (l==NULL) return false;
1154     /* if (rawFormat) {
1155     //TODO: implement raw qseq parsing here
1156     //if (raw type=N) then continue; //skip invalid/bad records
1157     } //raw qseq format
1158     else { // FASTQ or FASTA */
1159     isfasta=(l[0]=='>');
1160 gpertea 170 if (!isfasta && l[0]!='@') GError("Error: fasta/fastq record marker not found(%s)\n%s\n",
1161 gpertea 13 infname.chars(), l);
1162     GStr s(l);
1163     rname=&(l[1]);
1164     for (int i=0;i<rname.length();i++)
1165 gpertea 170 if (rname[i]<=' ') {
1166     if (i<rname.length()-2) rinfo=rname.substr(i+1);
1167     rname.cut(i);
1168     break;
1169 gpertea 13 }
1170     //now get the sequence
1171 gpertea 170 if ((l=fq.getLine())==NULL)
1172 gpertea 13 GError("Error: unexpected EOF after header for read %s (%s)\n",
1173     rname.chars(), infname.chars());
1174     rseq=l; //this must be the DNA line
1175     while ((l=fq.getLine())!=NULL) {
1176     //seq can span multiple lines
1177     if (l[0]=='>' || l[0]=='+') {
1178 gpertea 170 fq.pushBack();
1179 gpertea 13 break; //
1180     }
1181     rseq+=l;
1182 gpertea 170 } //check for multi-line seq
1183 gpertea 13 if (!isfasta) { //reading fastq quality values, which can also be multi-line
1184     if ((l=fq.getLine())==NULL)
1185     GError("Error: unexpected EOF after sequence for %s\n", rname.chars());
1186     if (l[0]!='+') GError("Error: fastq qv header marker not detected!\n");
1187 gpertea 170 if ((l=fq.getLine())==NULL)
1188 gpertea 13 GError("Error: unexpected EOF after qv header for %s\n", rname.chars());
1189     rqv=l;
1190 gpertea 170 //if (rqv.length()!=rseq.length())
1191 gpertea 13 // GError("Error: qv len != seq len for %s\n", rname.chars());
1192     while (rqv.length()<rseq.length() && ((l=fq.getLine())!=NULL)) {
1193     rqv+=l; //append to qv string
1194     }
1195     }// fastq
1196     // } //<-- FASTA or FASTQ
1197 gpertea 94 rseq.upper();
1198 gpertea 13 return true;
1199     }
1200    
1201 gpertea 105 #ifdef GDEBUG
1202     void showTrim(GStr& s, int l5, int l3) {
1203 gpertea 177 if (l5>0 || l3==0) {
1204 gpertea 105 color_bg(c_red);
1205     }
1206     for (int i=0;i<s.length()-1;i++) {
1207     if (i && i==l5) color_resetbg();
1208     fprintf(stderr, "%c", s[i]);
1209 gpertea 177 if (i && i==l3) color_bg(c_red);
1210 gpertea 105 }
1211     fprintf(stderr, "%c", s[s.length()-1]);
1212     color_reset();
1213     fprintf(stderr, "\n");
1214     }
1215     #endif
1216    
1217 gpertea 13 char process_read(GStr& rname, GStr& rseq, GStr& rqv, int &l5, int &l3) {
1218 gpertea 170 //returns 0 if the read was untouched, 1 if it was just trimmed
1219 gpertea 13 // and a trash code if it was trashed
1220     l5=0;
1221     l3=rseq.length()-1;
1222 gpertea 103 #ifdef GDEBUG
1223 gpertea 108 //rseq.reverse();
1224 gpertea 103 GMessage(">%s\n", rname.chars());
1225 gpertea 105 GMessage("%s\n",rseq.chars());
1226 gpertea 103 #endif
1227 gpertea 13 if (l3-l5+1<min_read_len) {
1228     return 's';
1229     }
1230     GStr wseq(rseq.chars());
1231     GStr wqv(rqv.chars());
1232     int w5=l5;
1233     int w3=l3;
1234 gpertea 187 //count Ns
1235     b_totalIn+=rseq.length();
1236     for (int i=0;i<rseq.length();i++) {
1237     if (isACGT[(int)rseq[i]]==0) b_totalN++;
1238     }
1239    
1240 gpertea 13 //first do the q-based trimming
1241     if (qvtrim_qmin!=0 && !wqv.is_empty() && qtrim(wqv, w5, w3)) { // qv-threshold trimming
1242 gpertea 187 b_trimQ+=(w5-l5)+(l3-w3);
1243     num_trimQ++;
1244 gpertea 13 if (w3-w5+1<min_read_len) {
1245     return 'Q'; //invalid read
1246     }
1247     //-- keep only the w5..w3 range
1248     l5=w5;
1249     l3=w3;
1250     wseq=wseq.substr(w5, w3-w5+1);
1251     if (!wqv.is_empty())
1252     wqv=wqv.substr(w5, w3-w5+1);
1253     } //qv trimming
1254     // N-trimming on the remaining read seq
1255     if (ntrim(wseq, w5, w3)) {
1256 gpertea 187 //Note: ntrim resets w5 and w3
1257 gpertea 13 //GMessage("before: %s\n",wseq.chars());
1258     //GMessage("after : %s (%d)\n",wseq.substr(w5,w3-w5+1).chars(),w3-w5+1);
1259 gpertea 187 int trim3=(wseq.length()-1-w3);
1260     b_trimN+=w5+trim3;
1261     num_trimN++;
1262 gpertea 13 l5+=w5;
1263 gpertea 187 l3-=trim3;
1264 gpertea 13 if (w3-w5+1<min_read_len) {
1265     return 'N'; //to be trashed
1266     }
1267     //-- keep only the w5..w3 range
1268     wseq=wseq.substr(w5, w3-w5+1);
1269     if (!wqv.is_empty())
1270     wqv=wqv.substr(w5, w3-w5+1);
1271     w5=0;
1272     w3=wseq.length()-1;
1273     }
1274 gpertea 103 char trim_code;
1275 gpertea 180 //clean the more dirty end first - 3'
1276 gpertea 187 bool trimmedA=false;
1277     bool trimmedT=false;
1278     bool trimmedV=false;
1279    
1280 gpertea 180 int prev_w5=0;
1281     int prev_w3=0;
1282     bool w3upd=true;
1283     bool w5upd=true;
1284 gpertea 103 do {
1285     trim_code=0;
1286 gpertea 180 if (w3upd && trim_poly3(wseq, w5, w3, polyA_seed)) {
1287 gpertea 103 trim_code='A';
1288 gpertea 187 b_trimA+=(w5+(wseq.length()-1-w3));
1289     if (!trimmedA) { num_trimA++; trimmedA=true; }
1290 gpertea 103 }
1291 gpertea 180 else if (w3upd && trim_poly3(wseq, w5, w3, polyT_seed)) {
1292 gpertea 103 trim_code='T';
1293 gpertea 187 b_trimT+=(w5+(wseq.length()-1-w3));
1294     if (!trimmedT) { num_trimT++; trimmedT=true; }
1295 gpertea 103 }
1296 gpertea 180 else if (w5upd && trim_poly5(wseq, w5, w3, polyA_seed)) {
1297 gpertea 103 trim_code='A';
1298 gpertea 187 b_trimA+=(w5+(wseq.length()-1-w3));
1299     if (!trimmedA) { num_trimA++; trimmedA=true; }
1300 gpertea 103 }
1301 gpertea 180 else if (w5upd && trim_poly5(wseq, w5, w3, polyT_seed)) {
1302 gpertea 103 trim_code='T';
1303 gpertea 187 b_trimT+=(w5+(wseq.length()-1-w3));
1304     if (!trimmedT) { num_trimT++; trimmedT=true; }
1305 gpertea 103 }
1306 gpertea 180 else if (trim_adaptor5(wseq, w5, w3)) {
1307 gpertea 187 trim_code='V';
1308     b_trimV+=(w5+(wseq.length()-1-w3));
1309     if (!trimmedV) { num_trimV++; trimmedV=true; }
1310 gpertea 180 }
1311     else if (trim_adaptor3(wseq, w5, w3)) {
1312 gpertea 187 trim_code='V';
1313     b_trimV+=(w5+(wseq.length()-1-w3));
1314     if (!trimmedV) { num_trimV++; trimmedV=true; }
1315 gpertea 103 }
1316     if (trim_code) {
1317 gpertea 180 w3upd=(w3!=prev_w3);
1318     w5upd=(w5!=prev_w5);
1319     if (w3upd) prev_w3=w3;
1320     if (w5upd) prev_w5=w5;
1321     #ifdef GDEBUG
1322 gpertea 105 GMessage("#### TRIM by '%c' code ( w5-w3 = %d-%d ):\n",trim_code, w5,w3);
1323     showTrim(wseq, w5, w3);
1324 gpertea 180 #endif
1325 gpertea 187 //int trimlen=wseq.length()-(w3-w5+1);
1326 gpertea 13 l5+=w5;
1327     l3-=(wseq.length()-1-w3);
1328     if (w3-w5+1<min_read_len) {
1329 gpertea 103 return trim_code;
1330 gpertea 13 }
1331     //-- keep only the w5..w3 range
1332     wseq=wseq.substr(w5, w3-w5+1);
1333     if (!wqv.is_empty())
1334     wqv=wqv.substr(w5, w3-w5+1);
1335 gpertea 103 }//trimming at 3' end
1336     } while (trim_code);
1337 gpertea 13 if (doCollapse) {
1338     //keep read for later
1339     FqDupRec* dr=dhash.Find(wseq.chars());
1340     if (dr==NULL) { //new entry
1341 gpertea 170 //if (prefix.is_empty())
1342     dhash.Add(wseq.chars(),
1343 gpertea 13 new FqDupRec(&wqv, rname.chars()));
1344     //else dhash.Add(wseq.chars(), new FqDupRec(wqv.chars(),wqv.length()));
1345     }
1346     else
1347     dr->add(wqv);
1348     } //collapsing duplicates
1349     else { //not collapsing duplicates
1350     //apply the dust filter now
1351     if (doDust) {
1352     int dustbases=dust(wseq);
1353     if (dustbases>(wseq.length()>>1)) {
1354     return 'D';
1355     }
1356     }
1357     } //not collapsing duplicates
1358     return (l5>0 || l3<rseq.length()-1) ? 1 : 0;
1359     }
1360    
1361     void printHeader(FILE* f_out, char recmarker, GStr& rname, GStr& rinfo) {
1362     //GMessage("printing Header..%c%s\n",recmarker, rname.chars());
1363     if (rinfo.is_empty()) fprintf(f_out, "%c%s\n",recmarker,rname.chars());
1364     else fprintf(f_out, "%c%s %s\n",recmarker, rname.chars(), rinfo.chars());
1365     }
1366    
1367     void writeRead(FILE* f_out, GStr& rname, GStr& rinfo, GStr& rseq, GStr& rqv, int& outcounter) {
1368     outcounter++;
1369     bool asFasta=(rqv.is_empty() || fastaOutput);
1370     if (asFasta) {
1371     if (prefix.is_empty()) {
1372     printHeader(f_out, '>',rname,rinfo);
1373     fprintf(f_out, "%s\n", rseq.chars()); //plain one-line fasta for now
1374     }
1375     else {
1376 gpertea 170 fprintf(f_out, ">%s%08d\n%s\n", prefix.chars(), outcounter,
1377 gpertea 13 rseq.chars());
1378     }
1379     }
1380     else { //fastq
1381     if (convert_phred) convertPhred(rqv);
1382     if (prefix.is_empty()) {
1383     printHeader(f_out, '@', rname, rinfo);
1384     fprintf(f_out, "%s\n+\n%s\n", rseq.chars(), rqv.chars());
1385     }
1386     else
1387 gpertea 170 fprintf(f_out, "@%s_%08d\n%s\n+\n%s\n", prefix.chars(), outcounter,
1388 gpertea 13 rseq.chars(),rqv.chars() );
1389     }
1390     }
1391    
1392     void trash_report(char trashcode, GStr& rname, FILE* freport) {
1393     if (freport==NULL || trashcode<=' ') return;
1394     if (trashcode=='3' || trashcode=='5') {
1395 gpertea 103 fprintf(freport, "%s\ta%c\n",rname.chars(),trashcode);
1396 gpertea 13 }
1397     else {
1398     fprintf(freport, "%s\t%c\n",rname.chars(),trashcode);
1399     }
1400     //tcounter++;
1401     }
1402    
1403     GStr getFext(GStr& s, int* xpos=NULL) {
1404     //s must be a filename without a path
1405     GStr r("");
1406     if (xpos!=NULL) *xpos=0;
1407     if (s.is_empty() || s=="-") return r;
1408     int p=s.rindex('.');
1409     int d=s.rindex('/');
1410     if (p<=0 || p>s.length()-2 || p<s.length()-7 || p<d) return r;
1411     r=s.substr(p+1);
1412     if (xpos!=NULL) *xpos=p+1;
1413     r.lower();
1414     return r;
1415     }
1416    
1417     void baseFileName(GStr& fname) {
1418     //remove all known extensions, like .txt,fq,fastq,fasta,fa)(.gz .gzip .bz2 .bzip2) .
1419     int xpos=0;
1420     GStr fext=getFext(fname, &xpos);
1421     if (xpos<=1) return;
1422     bool extdel=false;
1423     GStr f2;
1424     if (fext=="z" || fext=="zip") {
1425     extdel=true;
1426     }
1427     else if (fext.length()>=2) {
1428     f2=fext.substr(0,2);
1429     extdel=(f2=="gz" || f2=="bz");
1430     }
1431     if (extdel) {
1432     fname.cut(xpos-1);
1433     fext=getFext(fname, &xpos);
1434     if (xpos<=1) return;
1435     }
1436     extdel=false;
1437     if (fext=="f" || fext=="fq" || fext=="txt" || fext=="seq" || fext=="sequence") {
1438     extdel=true;
1439     }
1440     else if (fext.length()>=2) {
1441     extdel=(fext.substr(0,2)=="fa");
1442     }
1443     if (extdel) fname.cut(xpos-1);
1444     GStr fncp(fname);
1445     fncp.lower();
1446     fncp.chomp("seq");
1447     fncp.chomp("sequence");
1448     fncp.trimR("_.");
1449     if (fncp.length()<fname.length()) fname.cut(fncp.length());
1450     }
1451    
1452     FILE* prepOutFile(GStr& infname, GStr& pocmd) {
1453     FILE* f_out=NULL;
1454     GStr fname(getFileName(infname.chars()));
1455     //eliminate known extensions
1456     baseFileName(fname);
1457     if (outsuffix.is_empty() || outsuffix=="-") { return stdout; }
1458     else if (pocmd.is_empty()) {
1459     GStr oname(fname);
1460     oname.append('.');
1461     oname.append(outsuffix);
1462     f_out=fopen(oname.chars(),"w");
1463     if (f_out==NULL) GError("Error: cannot create '%s'\n",oname.chars());
1464     }
1465     else {
1466     GStr oname(">");
1467     oname.append(fname);
1468     oname.append('.');
1469     oname.append(outsuffix);
1470     pocmd.append(oname);
1471     f_out=popen(pocmd.chars(), "w");
1472     if (f_out==NULL) GError("Error: cannot popen '%s'\n",pocmd.chars());
1473     }
1474     return f_out;
1475     }
1476    
1477     void guess_unzip(GStr& fname, GStr& picmd) {
1478     GStr fext=getFext(fname);
1479     if (fext=="gz" || fext=="gzip" || fext=="z") {
1480     picmd="gzip -cd ";
1481     }
1482     else if (fext=="bz2" || fext=="bzip2" || fext=="bz" || fext=="bzip") {
1483     picmd="bzip2 -cd ";
1484     }
1485     }
1486    
1487 gpertea 180 void addAdaptor(GVec<CASeqData>& adaptors, GStr& seq, GAlnTrimType trim_type) {
1488 gpertea 170 //TODO: prepare CASeqData here, and collect hexamers as well
1489 gpertea 180 if (seq.is_empty() || seq=="-" ||
1490     seq=="N/A" || seq==".") return;
1491    
1492 gpertea 172 CASeqData adata(revCompl);
1493 gpertea 180 int idx=adaptors.Add(adata);
1494 gpertea 172 if (idx<0) GError("Error: failed to add adaptor!\n");
1495 gpertea 180 adaptors[idx].trim_type=trim_type;
1496     adaptors[idx].update(seq.chars());
1497 gpertea 170 }
1498    
1499    
1500 gpertea 180 int loadAdaptors(const char* fname) {
1501 gpertea 94 GLineReader lr(fname);
1502     char* l;
1503     while ((l=lr.nextLine())!=NULL) {
1504     if (lr.length()<=3 || l[0]=='#') continue;
1505 gpertea 98 if ( l[0]==' ' || l[0]=='\t' || l[0]==',' ||
1506     l[0]==';'|| l[0]==':' ) {
1507 gpertea 94 int i=1;
1508     while (l[i]!=0 && isspace(l[i])) {
1509     i++;
1510     }
1511     if (l[i]!=0) {
1512 gpertea 102 GStr s(&(l[i]));
1513 gpertea 108 #ifdef GDEBUG
1514     //s.reverse();
1515     #endif
1516 gpertea 180 addAdaptor(adaptors3, s, galn_TrimRight);
1517 gpertea 94 continue;
1518     }
1519     }
1520     else {
1521 gpertea 98 GStr s(l);
1522     s.startTokenize("\t ;,:");
1523     GStr a5,a3;
1524     if (s.nextToken(a5))
1525 gpertea 180 s.nextToken(a3);
1526     else continue; //no tokens on this line
1527     GAlnTrimType ttype5=galn_TrimLeft;
1528 gpertea 102 a5.upper();
1529     a3.upper();
1530 gpertea 180 if (a3.is_empty() || a3==a5 || a3=="=") {
1531     a3.clear();
1532     ttype5=galn_TrimEither;
1533     }
1534 gpertea 108 #ifdef GDEBUG
1535     // a5.reverse();
1536     // a3.reverse();
1537     #endif
1538 gpertea 180 addAdaptor(adaptors5, a5, ttype5);
1539     addAdaptor(adaptors3, a3, galn_TrimRight);
1540 gpertea 94 }
1541     }
1542 gpertea 180 return adaptors5.Count()+adaptors3.Count();
1543 gpertea 94 }
1544    
1545 gpertea 170 void setupFiles(FILE*& f_in, FILE*& f_in2, FILE*& f_out, FILE*& f_out2,
1546     GStr& s, GStr& infname, GStr& infname2) {
1547 gpertea 13 // uses outsuffix to generate output file names and open file handles as needed
1548     infname="";
1549     infname2="";
1550     f_in=NULL;
1551     f_in2=NULL;
1552     f_out=NULL;
1553     f_out2=NULL;
1554     //analyze outsuffix intent
1555     GStr pocmd;
1556 gpertea 76 if (outsuffix=="-") {
1557     f_out=stdout;
1558     }
1559     else {
1560     GStr ox=getFext(outsuffix);
1561     if (ox.length()>2) ox=ox.substr(0,2);
1562     if (ox=="gz") pocmd="gzip -9 -c ";
1563     else if (ox=="bz") pocmd="bzip2 -9 -c ";
1564     }
1565 gpertea 13 if (s=="-") {
1566     f_in=stdin;
1567     infname="stdin";
1568     f_out=prepOutFile(infname, pocmd);
1569     return;
1570     } // streaming from stdin
1571     s.startTokenize(",:");
1572     s.nextToken(infname);
1573     bool paired=s.nextToken(infname2);
1574 gpertea 170 if (fileExists(infname.chars())==0)
1575 gpertea 13 GError("Error: cannot find file %s!\n",infname.chars());
1576     GStr fname(getFileName(infname.chars()));
1577     GStr picmd;
1578     guess_unzip(fname, picmd);
1579     if (picmd.is_empty()) {
1580     f_in=fopen(infname.chars(), "r");
1581     if (f_in==NULL) GError("Error opening file '%s'!\n",infname.chars());
1582     }
1583     else {
1584     picmd.append(infname);
1585     f_in=popen(picmd.chars(), "r");
1586     if (f_in==NULL) GError("Error at popen %s!\n", picmd.chars());
1587     }
1588 gpertea 76 if (f_out==stdout) {
1589     if (paired) GError("Error: output suffix required for paired reads\n");
1590     return;
1591     }
1592 gpertea 13 f_out=prepOutFile(infname, pocmd);
1593     if (!paired) return;
1594     if (doCollapse) GError("Error: sorry, -C option cannot be used with paired reads!\n");
1595     // ---- paired reads:-------------
1596 gpertea 170 if (fileExists(infname2.chars())==0)
1597 gpertea 13 GError("Error: cannot find file %s!\n",infname2.chars());
1598     picmd="";
1599     GStr fname2(getFileName(infname2.chars()));
1600     guess_unzip(fname2, picmd);
1601     if (picmd.is_empty()) {
1602     f_in2=fopen(infname2.chars(), "r");
1603     if (f_in2==NULL) GError("Error opening file '%s'!\n",infname2.chars());
1604     }
1605     else {
1606     picmd.append(infname2);
1607     f_in2=popen(picmd.chars(), "r");
1608     if (f_in2==NULL) GError("Error at popen %s!\n", picmd.chars());
1609     }
1610     f_out2=prepOutFile(infname2, pocmd);
1611     }