ViewVC Help
View File | Revision Log | Show Annotations | View Changeset | Root Listing
root/gclib/fqtrim/fqtrim.cpp
Revision: 189
Committed: Fri Feb 17 23:08:53 2012 UTC (7 years, 8 months ago) by gpertea
File size: 48914 byte(s)
Log Message:
added -a option to control the minimum exact adaptor match to be trimmed at the ends

Line User Rev File contents
1 gpertea 4 #include "GArgs.h"
2     #include "GStr.h"
3     #include "GHash.hh"
4     #include "GList.hh"
5 gpertea 13 #include <ctype.h>
6 gpertea 94 #include "GAlnExtend.h"
7 gpertea 187 #include <inttypes.h>
8 gpertea 4
9     #define USAGE "Usage:\n\
10 gpertea 189 fqtrim [{-5 <5adaptor> -3 <3adaptor>|-f <adaptors_file>}] [-a <min_match>]\\\n\
11 gpertea 172 [-R] [-q <minq> [-t <trim_max_len>]] [-p {64|33}] [-o <outsuffix>]\\\n\
12 gpertea 76 [-l <minlen>] [-C] [-D] [-Q] [-n <rename_prefix>] [-r <discarded.lst>]\\\n\
13 gpertea 189 [-a <min_poly>] [-A] <input.fq>[,<input_mates.fq>\n\
14 gpertea 4 \n\
15 gpertea 180 Trim low quality bases at the 3' end and can trim adaptor sequence(s), filter\n\
16 gpertea 76 for low complexity and collapse duplicate reads.\n\
17 gpertea 13 If read pairs should be trimmed and kept together (i.e. without discarding\n\
18     one read in a pair), the two file names should be given delimited by a comma\n\
19     or a colon character\n\
20 gpertea 4 \n\
21     Options:\n\
22     -n rename all the reads using the <prefix> followed by a read counter;\n\
23     if -C option was given, the suffix \"_x<N>\" is appended, with <N> being\n\
24     the read duplication count\n\
25 gpertea 76 -o unless this parameter is '-', write the trimmed/filtered reads to \n\
26     file(s) named <input>.<outsuffix> which will be created in the \n\
27     current (working) directory; (writes to stdout if -o- is given);\n\
28     a suffix ending with .gz, .gzip or .bz2 will enforce compression\n\
29 gpertea 180 -f file with adaptor sequences to trim, each line having this format:\n\
30     <5'-adaptor-sequence> <3'-adaptor-sequence>\n\
31     -5 trim the given adaptor or primer sequence at the 5' end of each read\n\
32 gpertea 4 (e.g. -5 CGACAGGTTCAGAGTTCTACAGTCCGACGATC)\n\
33 gpertea 180 -3 trim the given adaptor sequence at the 3' end of each read\n\
34 gpertea 4 (e.g. -3 TCGTATGCCGTCTTCTGCTTG)\n\
35 gpertea 189 -a minimum exact match to adaptor sequence to trim at the end (6)\n\
36 gpertea 103 -A disable polyA/T trimming (enabled by default)\n\
37 gpertea 189 -y minimum length of poly-A/T run to remove (6)\n\
38 gpertea 13 -q trim bases with quality value lower than <minq> (starting at the 3' end)\n\
39 gpertea 76 -t for -q option, maximum trimming at the 3' end is limited to <trim_max_len>\n\
40 gpertea 12 -m maximum percentage of Ns allowed in a read after trimming (default 7)\n\
41     -l minimum \"clean\" length after trimming that a read must have\n\
42     in order to pass the filter (default: 16)\n\
43 gpertea 4 -r write a simple \"trash report\" file listing the discarded reads with a\n\
44     one letter code for the reason why they were trashed\n\
45 gpertea 12 -D apply a low-complexity (dust) filter and discard any read that has over\n\
46 gpertea 187 50%% of its length detected as low complexity\n\
47 gpertea 4 -C collapse duplicate reads and add a prefix with their count to the read\n\
48 gpertea 12 name (e.g. for microRNAs)\n\
49     -p input is phred64/phred33 (use -p64 or -p33)\n\
50     -Q convert quality values to the other Phred qv type\n\
51 gpertea 187 -V be verbose when done (print summary counts)\n\
52 gpertea 4 "
53 gpertea 170
54 gpertea 13 //-z for -o option, the output stream(s) will be first piped into the given\n
55     // <zcmd> command, which must output to stdout (e.g. -z 'bzip2 -9 -c')\n
56    
57 gpertea 170
58 gpertea 180 // example 3' adaptor for miRNAs: TCGTATGCCGTCTTCTGCTTG
59 gpertea 12
60 gpertea 76 //For paired reads sequencing:
61 gpertea 12 //3' : ACACTCTTTCCCTACACGACGCTCTTCCGATCT
62     //5' : GATCGGAAGAGCGGTTCAGCAGGAATGCCGAG
63 gpertea 13 //FILE* f_out=NULL; //stdout if not provided
64     //FILE* f_out2=NULL; //for paired reads
65     //FILE* f_in=NULL; //input fastq (stdin if not provided)
66     //FILE* f_in2=NULL; //for paired reads
67 gpertea 170
68 gpertea 4 FILE* freport=NULL;
69 gpertea 13
70 gpertea 4 bool debug=false;
71 gpertea 12 bool verbose=false;
72 gpertea 4 bool doCollapse=false;
73     bool doDust=false;
74 gpertea 103 bool doPolyTrim=true;
75 gpertea 13 bool fastaOutput=false;
76 gpertea 4 bool trashReport=false;
77 gpertea 172 bool revCompl=false; //also reverse complement adaptor sequences
78 gpertea 4 int min_read_len=16;
79 gpertea 12 double max_perc_N=7.0;
80 gpertea 4 int dust_cutoff=16;
81     bool isfasta=false;
82 gpertea 12 bool convert_phred=false;
83 gpertea 170 GStr outsuffix; // -o
84 gpertea 13 GStr prefix;
85     GStr zcmd;
86 gpertea 187 char isACGT[256];
87 gpertea 12
88 gpertea 187 uint num_trimN=0; //reads trimmed by N%
89     uint num_trimQ=0; //reads trimmed by qv threshold
90     uint num_trimV=0; //reads trimmed by adapter match
91     uint num_trimA=0; //reads trimmed by polyA
92     uint num_trimT=0; //reads trimmed by polyT
93     uint num_trim5=0; //number of reads trimmed at 5' end
94     uint num_trim3=0; //number of reads trimmed at 3' end
95    
96     uint64 b_totalIn=0; //total number of input bases
97     uint64 b_totalN=0; //total number of undetermined bases found in input bases
98    
99     uint64 b_trimN=0; //total number of bases trimmed due to N-trim
100     uint64 b_trimQ=0; //number of bases trimmed due to qv threshold
101     uint64 b_trimV=0; //total number of bases trimmed due to adaptor matches
102     uint64 b_trimA=0; //number of bases trimmed due to poly-A tails
103     uint64 b_trimT=0; //number of bases trimmed due to poly-T tails
104     uint64 b_trim5=0; //total bases trimmed on the 5' side
105     uint64 b_trim3=0; //total bases trimmed on the 3' side
106     //int min_trimmed5=INT_MAX;
107     //int min_trimmed3=INT_MAX;
108    
109 gpertea 13 int qvtrim_qmin=0;
110     int qvtrim_max=0; //for -q, do not trim at 3'-end more than this number of bases
111 gpertea 12 int qv_phredtype=0; // could be 64 or 33 (0 means undetermined yet)
112     int qv_cvtadd=0; //could be -31 or +31
113    
114 gpertea 94 // adaptor matching metrics -- for X-drop ungapped extension
115 gpertea 177 //const int match_reward=2;
116     //const int mismatch_penalty=3;
117 gpertea 189 int match_reward=1;
118     int mismatch_penalty=5;
119     int Xdrop=16;
120     int minEndAdapter=6;
121 gpertea 103
122 gpertea 189
123 gpertea 94 const int poly_m_score=2; //match score
124     const int poly_mis_score=-3; //mismatch
125 gpertea 88 const int poly_dropoff_score=7;
126 gpertea 94 int poly_minScore=12; //i.e. an exact match of 6 bases at the proper ends WILL be trimmed
127 gpertea 4
128 gpertea 94 const char *polyA_seed="AAAA";
129     const char *polyT_seed="TTTT";
130 gpertea 12
131 gpertea 170 struct CASeqData {
132 gpertea 180 //positional data for every possible hexamer in an adaptor
133     GVec<uint16>* pz[4096]; //0-based coordinates of all possible hexamers in the adaptor sequence
134     GVec<uint16>* pzr[4096]; //0-based coordinates of all possible hexamers for the reverse complement of the adaptor sequence
135     GStr seq; //actual adaptor sequence data
136 gpertea 170 GStr seqr; //reverse complement sequence
137 gpertea 180 int amlen; //fraction of adaptor length matching that's
138 gpertea 175 //enough to consider the alignment
139 gpertea 180 GAlnTrimType trim_type;
140 gpertea 172 bool use_reverse;
141 gpertea 175 CASeqData(bool rev=false):seq(),seqr(),
142     amlen(0), use_reverse(rev) {
143 gpertea 180 trim_type=galn_None; //should be updated later!
144 gpertea 172 for (int i=0;i<4096;i++) {
145     pz[i]=NULL;
146     pzr[i]=NULL;
147     }
148     }
149    
150     void update(const char* s) {
151 gpertea 175 seq=s;
152     table6mers(seq.chars(), seq.length(), pz);
153 gpertea 189 amlen=calc_safelen(seq.length());
154 gpertea 175 if (!use_reverse) return;
155     //reverse complement
156     seqr=s;
157     int slen=seq.length();
158     for (int i=0;i<slen;i++)
159     seqr[i]=ntComplement(seq[slen-i-1]);
160     table6mers(seqr.chars(), seqr.length(), pzr);
161 gpertea 170 }
162    
163 gpertea 172 ~CASeqData() {
164     for (int i=0;i<4096;i++) {
165     delete pz[i];
166     delete pzr[i];
167 gpertea 170 }
168     }
169     };
170    
171 gpertea 180 GVec<CASeqData> adaptors5;
172     GVec<CASeqData> adaptors3;
173 gpertea 170
174 gpertea 103 CGreedyAlignData* gxmem_l=NULL;
175     CGreedyAlignData* gxmem_r=NULL;
176    
177 gpertea 4 // element in dhash:
178     class FqDupRec {
179     public:
180     int count; //how many of these reads are the same
181     int len; //length of qv
182     char* firstname; //optional, only if we want to keep the original read names
183     char* qv;
184     FqDupRec(GStr* qstr=NULL, const char* rname=NULL) {
185     len=0;
186     qv=NULL;
187     firstname=NULL;
188     count=0;
189     if (qstr!=NULL) {
190     qv=Gstrdup(qstr->chars());
191     len=qstr->length();
192     count++;
193     }
194     if (rname!=NULL) firstname=Gstrdup(rname);
195     }
196     ~FqDupRec() {
197     GFREE(qv);
198     GFREE(firstname);
199     }
200     void add(GStr& d) { //collapse another record into this one
201     if (d.length()!=len)
202     GError("Error at FqDupRec::add(): cannot collapse reads with different length!\n");
203     count++;
204     for (int i=0;i<len;i++)
205     qv[i]=(qv[i]+d[i])/2;
206     }
207     };
208    
209     void openfw(FILE* &f, GArgs& args, char opt) {
210     GStr s=args.getOpt(opt);
211     if (!s.is_empty()) {
212     if (s=='-') f=stdout;
213     else {
214 gpertea 13 f=fopen(s.chars(),"w");
215 gpertea 4 if (f==NULL) GError("Error creating file: %s\n", s.chars());
216     }
217     }
218     }
219    
220     #define FWCLOSE(fh) if (fh!=NULL && fh!=stdout) fclose(fh)
221     #define FRCLOSE(fh) if (fh!=NULL && fh!=stdin) fclose(fh)
222    
223     GHash<FqDupRec> dhash; //hash to keep track of duplicates
224    
225 gpertea 180 void addAdaptor(GVec<CASeqData>& adaptors, GStr& seq, GAlnTrimType trim_type);
226     int loadAdaptors(const char* fname);
227 gpertea 94
228 gpertea 170 void setupFiles(FILE*& f_in, FILE*& f_in2, FILE*& f_out, FILE*& f_out2,
229     GStr& s, GStr& infname, GStr& infname2);
230 gpertea 13 // uses outsuffix to generate output file names and open file handles as needed
231 gpertea 170
232 gpertea 13 void writeRead(FILE* f_out, GStr& rname, GStr& rinfo, GStr& rseq, GStr& rqv, int& outcounter);
233     void trash_report(char trashcode, GStr& rname, FILE* freport);
234    
235 gpertea 170 bool getFastxRec(GLineReader& fq, GStr& rseq, GStr& rqv,
236 gpertea 13 GStr& rname, GStr& rinfo, GStr& infname);
237    
238 gpertea 170 char process_read(GStr& rname, GStr& rseq, GStr& rqv, int &l5, int &l3);
239 gpertea 13 //returns 0 if the read was untouched, 1 if it was trimmed and a trash code if it was trashed
240    
241 gpertea 12 bool ntrim(GStr& rseq, int &l5, int &l3); //returns true if any trimming occured
242     bool qtrim(GStr& qvs, int &l5, int &l3); //return true if any trimming occured
243 gpertea 4 int dust(GStr& seq);
244 gpertea 94 bool trim_poly5(GStr &seq, int &l5, int &l3, const char* poly_seed); //returns true if any trimming occured
245     bool trim_poly3(GStr &seq, int &l5, int &l3, const char* poly_seed);
246 gpertea 180 bool trim_adaptor5(GStr& seq, int &l5, int &l3); //returns true if any trimming occured
247     bool trim_adaptor3(GStr& seq, int &l5, int &l3);
248 gpertea 4
249 gpertea 12 void convertPhred(char* q, int len);
250     void convertPhred(GStr& q);
251 gpertea 4
252     int main(int argc, char * const argv[]) {
253 gpertea 189 GArgs args(argc, argv, "MATCH=MISMATCH=XDROP=YQDCRVAl:d:3:5:m:n:r:p:q:f:t:o:z:a:y:");
254 gpertea 4 int e;
255     if ((e=args.isError())>0) {
256     GMessage("%s\nInvalid argument: %s\n", USAGE, argv[e]);
257     exit(224);
258     }
259 gpertea 9 debug=(args.getOpt('Y')!=NULL);
260 gpertea 13 verbose=(args.getOpt('V')!=NULL);
261 gpertea 12 convert_phred=(args.getOpt('Q')!=NULL);
262 gpertea 4 doCollapse=(args.getOpt('C')!=NULL);
263     doDust=(args.getOpt('D')!=NULL);
264 gpertea 172 revCompl=(args.getOpt('R')!=NULL);
265 gpertea 103 if (args.getOpt('A')) doPolyTrim=false;
266 gpertea 12 /*
267 gpertea 4 rawFormat=(args.getOpt('R')!=NULL);
268     if (rawFormat) {
269     GError("Sorry, raw qseq format parsing is not implemented yet!\n");
270     }
271 gpertea 12 */
272 gpertea 4 prefix=args.getOpt('n');
273     GStr s=args.getOpt('l');
274 gpertea 170 if (!s.is_empty())
275 gpertea 4 min_read_len=s.asInt();
276 gpertea 12 s=args.getOpt('m');
277 gpertea 170 if (!s.is_empty())
278 gpertea 12 max_perc_N=s.asDouble();
279 gpertea 4 s=args.getOpt('d');
280     if (!s.is_empty()) {
281     dust_cutoff=s.asInt();
282     doDust=true;
283     }
284 gpertea 12 s=args.getOpt('q');
285     if (!s.is_empty()) {
286 gpertea 13 qvtrim_qmin=s.asInt();
287 gpertea 12 }
288 gpertea 13 s=args.getOpt('t');
289     if (!s.is_empty()) {
290     qvtrim_max=s.asInt();
291     }
292 gpertea 189 s=args.getOpt("MATCH");
293     if (!s.is_empty())
294     match_reward=s.asInt();
295     s=args.getOpt("MISMATCH");
296     if (!s.is_empty())
297     mismatch_penalty=s.asInt();
298     s=args.getOpt("XDROP");
299     if (!s.is_empty())
300     Xdrop=s.asInt();
301    
302 gpertea 12 s=args.getOpt('p');
303     if (!s.is_empty()) {
304     int v=s.asInt();
305     if (v==33) {
306     qv_phredtype=33;
307     qv_cvtadd=31;
308     }
309     else if (v==64) {
310     qv_phredtype=64;
311     qv_cvtadd=-31;
312     }
313 gpertea 170 else
314 gpertea 12 GMessage("%s\nInvalid value for -p option (can only be 64 or 33)!\n",USAGE);
315     }
316 gpertea 187 memset((void*)isACGT, 0, 256);
317     isACGT['A']=isACGT['a']=isACGT['C']=isACGT['c']=1;
318     isACGT['G']=isACGT['g']=isACGT['T']=isACGT['t']=1;
319 gpertea 94 s=args.getOpt('f');
320     if (!s.is_empty()) {
321 gpertea 180 loadAdaptors(s.chars());
322 gpertea 94 }
323 gpertea 180 bool fileAdaptors=adaptors5.Count()+adaptors3.Count();
324 gpertea 94 s=args.getOpt('5');
325     if (!s.is_empty()) {
326 gpertea 180 if (fileAdaptors)
327 gpertea 94 GError("Error: options -5 and -f cannot be used together!\n");
328     s.upper();
329 gpertea 180 addAdaptor(adaptors5, s, galn_TrimLeft);
330 gpertea 4 }
331 gpertea 94 s=args.getOpt('3');
332     if (!s.is_empty()) {
333 gpertea 180 if (fileAdaptors)
334 gpertea 94 GError("Error: options -3 and -f cannot be used together!\n");
335 gpertea 102 s.upper();
336 gpertea 180 addAdaptor(adaptors3, s, galn_TrimRight);
337 gpertea 4 }
338 gpertea 94 s=args.getOpt('y');
339 gpertea 13 if (!s.is_empty()) {
340 gpertea 94 int minmatch=s.asInt();
341 gpertea 189 if (minmatch>2)
342     poly_minScore=minmatch*poly_m_score;
343     else GMessage("Warning: invalid -y option, ignored.\n");
344 gpertea 13 }
345 gpertea 189 s=args.getOpt('a');
346     if (!s.is_empty()) {
347     int minmatch=s.asInt();
348     if (minmatch>2)
349     minEndAdapter=minmatch;
350     else GMessage("Warning: invalid -a option, ignored.\n");
351     }
352 gpertea 170
353 gpertea 189
354 gpertea 13 if (args.getOpt('o')!=NULL) outsuffix=args.getOpt('o');
355 gpertea 76 else outsuffix="-";
356 gpertea 4 trashReport= (args.getOpt('r')!=NULL);
357 gpertea 13 int fcount=args.startNonOpt();
358     if (fcount==0) {
359 gpertea 4 GMessage(USAGE);
360     exit(224);
361     }
362 gpertea 13 if (fcount>1 && doCollapse) {
363     GError("%s Sorry, the -C option only works with a single input.\n", USAGE);
364     }
365     //openfw(f_out, args, 'o');
366     //if (f_out==NULL) f_out=stdout;
367 gpertea 4 if (trashReport)
368     openfw(freport, args, 'r');
369     char* infile=NULL;
370 gpertea 103
371 gpertea 180 if (adaptors5.Count()>0)
372 gpertea 177 //gxmem_l=new CGreedyAlignData(match_reward, mismatch_penalty, Xdrop-2);
373     gxmem_l=new CGreedyAlignData(match_reward, mismatch_penalty, Xdrop);
374 gpertea 180 if (adaptors3.Count()>0)
375 gpertea 103 gxmem_r=new CGreedyAlignData(match_reward, mismatch_penalty, Xdrop);
376    
377 gpertea 4 while ((infile=args.nextNonOpt())!=NULL) {
378 gpertea 103 //for each input file
379 gpertea 13 int incounter=0; //counter for input reads
380     int outcounter=0; //counter for output reads
381     int trash_s=0; //too short from the get go
382     int trash_Q=0;
383     int trash_N=0;
384     int trash_D=0;
385 gpertea 103 int trash_poly=0;
386 gpertea 187 int trash_V=0;
387     num_trimN=0;
388     num_trimQ=0;
389     num_trimV=0;
390     num_trimA=0;
391     num_trimT=0;
392     num_trim5=0;
393     num_trim3=0;
394    
395     b_totalIn=0;
396     b_totalN=0;
397     b_trimN=0;
398     b_trimQ=0;
399     b_trimV=0;
400     b_trimA=0;
401     b_trimT=0;
402     b_trim5=0;
403     b_trim3=0;
404    
405 gpertea 13 s=infile;
406     GStr infname;
407     GStr infname2;
408     FILE* f_in=NULL;
409     FILE* f_in2=NULL;
410     FILE* f_out=NULL;
411     FILE* f_out2=NULL;
412     bool paired_reads=false;
413     setupFiles(f_in, f_in2, f_out, f_out2, s, infname, infname2);
414     GLineReader fq(f_in);
415     GLineReader* fq2=NULL;
416     if (f_in2!=NULL) {
417     fq2=new GLineReader(f_in2);
418     paired_reads=true;
419     }
420     GStr rseq, rqv, seqid, seqinfo;
421     GStr rseq2, rqv2, seqid2, seqinfo2;
422     while (getFastxRec(fq, rseq, rqv, seqid, seqinfo, infname)) {
423     incounter++;
424     int a5=0, a3=0, b5=0, b3=0;
425     char tcode=0, tcode2=0;
426     tcode=process_read(seqid, rseq, rqv, a5, a3);
427 gpertea 187 if (a5>0) {
428     b_trim5+=a5;
429     num_trim5++;
430     }
431     if (a3<rseq.length()-1) {
432     b_trim3+=rseq.length()-a3-1;
433     num_trim3++;
434     }
435 gpertea 103 if (fq2!=NULL) {
436 gpertea 13 getFastxRec(*fq2, rseq2, rqv2, seqid2, seqinfo2, infname2);
437     if (seqid.substr(0,seqid.length()-1)!=seqid2.substr(0,seqid2.length()-1)) {
438     GError("Error: no paired match for read %s vs %s (%s,%s)\n",
439     seqid.chars(), seqid2.chars(), infname.chars(), infname2.chars());
440     }
441     tcode2=process_read(seqid2, rseq2, rqv2, b5, b3);
442 gpertea 187 if (b5>0) {
443     b_trim5+=b5;
444     num_trim5++;
445     }
446     if (b3<rseq2.length()-1) {
447     b_trim3+=rseq2.length()-b3-1;
448     num_trim3++;
449     }
450 gpertea 13 //decide what to do with this pair and print rseq2 if the pair makes it
451     if (tcode>1 && tcode2<=1) {
452     //"untrash" rseq
453     if (a3-a5+1<min_read_len) {
454     a5=1;
455     if (a3<min_read_len) { a3= GMIN(rseq.length()-1, min_read_len+1); }
456 gpertea 4 }
457 gpertea 13 tcode=1;
458     }
459     else if (tcode<=1 && tcode2>1) {
460     //"untrash" rseq2
461     if (b3-b5+1<min_read_len) {
462     b5=1;
463     if (b3<min_read_len) { b3= GMIN((rseq2.length()-1),(min_read_len+1)); }
464     }
465     tcode2=1;
466     }
467     if (tcode<=1) { //trimmed or left intact -- write it!
468     if (tcode>0) {
469     rseq2=rseq2.substr(b5,b3-b5+1);
470     if (!rqv2.is_empty()) rqv2=rqv2.substr(b5,b3-b5+1);
471 gpertea 4 }
472 gpertea 13 int nocounter=0;
473     writeRead(f_out2, seqid2, seqinfo2, rseq2, rqv2, nocounter);
474     }
475 gpertea 103 } //pair read
476 gpertea 13 if (tcode>1) { //trashed
477 gpertea 103 #ifdef GDEBUG
478 gpertea 175 GMessage(" !!!!TRASH code = %c\n",tcode);
479 gpertea 103 #endif
480 gpertea 13 if (tcode=='s') trash_s++;
481 gpertea 103 else if (tcode=='A' || tcode=='T') trash_poly++;
482 gpertea 13 else if (tcode=='Q') trash_Q++;
483     else if (tcode=='N') trash_N++;
484     else if (tcode=='D') trash_D++;
485 gpertea 187 else if (tcode=='V') trash_V++;
486     //else if (tcode=='5') trash_A5++;
487 gpertea 13 if (trashReport) trash_report(tcode, seqid, freport);
488     }
489     else if (!doCollapse) { //write it
490     if (tcode>0) {
491     rseq=rseq.substr(a5,a3-a5+1);
492     if (!rqv.is_empty()) rqv=rqv.substr(a5,a3-a5+1);
493     }
494 gpertea 103 #ifdef GDEBUG
495     GMessage(" After trimming:\n");
496 gpertea 105 GMessage("%s\n",rseq.chars());
497 gpertea 103 #endif
498 gpertea 13 writeRead(f_out, seqid, seqinfo, rseq, rqv, outcounter);
499     }
500     } //for each fastq record
501     delete fq2;
502     FRCLOSE(f_in);
503     FRCLOSE(f_in2);
504     if (doCollapse) {
505     outcounter=0;
506     int maxdup_count=1;
507     char* maxdup_seq=NULL;
508     dhash.startIterate();
509     FqDupRec* qd=NULL;
510     char* seq=NULL;
511     while ((qd=dhash.NextData(seq))!=NULL) {
512     GStr rseq(seq);
513     //do the dusting here
514     if (doDust) {
515     int dustbases=dust(rseq);
516     if (dustbases>(rseq.length()>>1)) {
517     if (trashReport && qd->firstname!=NULL) {
518     fprintf(freport, "%s_x%d\tD\n",qd->firstname, qd->count);
519 gpertea 4 }
520 gpertea 13 trash_D+=qd->count;
521     continue;
522     }
523     }
524     outcounter++;
525 gpertea 170 if (qd->count>maxdup_count) {
526 gpertea 13 maxdup_count=qd->count;
527     maxdup_seq=seq;
528     }
529     if (isfasta) {
530     if (prefix.is_empty()) {
531 gpertea 170 fprintf(f_out, ">%s_x%d\n%s\n", qd->firstname, qd->count,
532 gpertea 13 rseq.chars());
533 gpertea 4 }
534 gpertea 13 else { //use custom read name
535     fprintf(f_out, ">%s%08d_x%d\n%s\n", prefix.chars(), outcounter,
536     qd->count, rseq.chars());
537 gpertea 4 }
538 gpertea 13 }
539     else { //fastq format
540     if (convert_phred) convertPhred(qd->qv, qd->len);
541     if (prefix.is_empty()) {
542 gpertea 170 fprintf(f_out, "@%s_x%d\n%s\n+\n%s\n", qd->firstname, qd->count,
543 gpertea 13 rseq.chars(), qd->qv);
544 gpertea 4 }
545 gpertea 13 else { //use custom read name
546     fprintf(f_out, "@%s%08d_x%d\n%s\n+\n%s\n", prefix.chars(), outcounter,
547     qd->count, rseq.chars(), qd->qv);
548     }
549     }
550     }//for each element of dhash
551     if (maxdup_count>1) {
552     GMessage("Maximum read multiplicity: x %d (read: %s)\n",maxdup_count, maxdup_seq);
553 gpertea 4 }
554 gpertea 13 } //collapse entries
555     if (verbose) {
556     if (paired_reads) {
557     GMessage(">Input files : %s , %s\n", infname.chars(), infname2.chars());
558 gpertea 187 GMessage("Number of input pairs :%9u\n", incounter);
559     GMessage(" Output pairs :%9u\t(%u trashed)\n", outcounter, incounter-outcounter);
560 gpertea 13 }
561     else {
562     GMessage(">Input file : %s\n", infname.chars());
563     GMessage("Number of input reads :%9d\n", incounter);
564 gpertea 187 GMessage(" Output reads :%9d (%u trashed)\n", outcounter, incounter-outcounter);
565 gpertea 13 }
566 gpertea 187 GMessage("--------------- Read trimming: ------------------\n");
567     if (num_trim5)
568     GMessage(" 5' trimmed :%9u\n", num_trim5);
569     if (num_trim3)
570     GMessage(" 3' trimmed :%9u\n", num_trim3);
571     if (num_trimQ)
572     GMessage(" q.v. trimmed :%9u\n", num_trimQ);
573     if (num_trimN)
574     GMessage(" N trimmed :%9u\n", num_trimN);
575     if (num_trimT)
576     GMessage(" poly-T trimmed :%9u\n", num_trimT);
577     if (num_trimA)
578     GMessage(" poly-A trimmed :%9u\n", num_trimA);
579     if (num_trimV)
580     GMessage(" Adapter trimmed :%9u\n", num_trimV);
581 gpertea 13 GMessage("------------------------------------\n");
582     if (trash_s>0)
583 gpertea 187 GMessage("Trashed by initial len:%9d\n", trash_s);
584 gpertea 13 if (trash_N>0)
585     GMessage(" Trashed by N%%:%9d\n", trash_N);
586     if (trash_Q>0)
587     GMessage("Trashed by low quality:%9d\n", trash_Q);
588 gpertea 103 if (trash_poly>0)
589     GMessage(" Trashed by poly-A/T:%9d\n", trash_poly);
590 gpertea 187 if (trash_V>0)
591     GMessage(" Trashed by adaptor:%9d\n", trash_V);
592     GMessage("\n--------------- Base counts --------------------\n");
593     GMessage(" Input bases :%12llu\n", b_totalIn);
594     double percN=100.0* ((double)b_totalN/(double)b_totalIn);
595     GMessage(" N bases :%12llu (%4.2f%%)\n", b_totalN, percN);
596     GMessage(" trimmed from 5':%12llu\n", b_trim5);
597     GMessage(" trimmed from 3':%12llu\n", b_trim3);
598     GMessage("\n");
599     if (b_trimQ)
600     GMessage(" q.v. trimmed :%12llu\n", b_trimQ);
601     if (b_trimN)
602     GMessage(" N trimmed :%12llu\n", b_trimN);
603     if (b_trimT)
604     GMessage(" poly-T trimmed :%12llu\n", b_trimT);
605     if (b_trimA)
606     GMessage(" poly-A trimmed :%12llu\n", b_trimA);
607     if (b_trimV)
608     GMessage(" Adapter trimmed :%12llu\n", b_trimV);
609    
610 gpertea 13 }
611     if (trashReport) {
612     FWCLOSE(freport);
613 gpertea 4 }
614 gpertea 13 FWCLOSE(f_out);
615     FWCLOSE(f_out2);
616     } //while each input file
617 gpertea 103 delete gxmem_l;
618     delete gxmem_r;
619 gpertea 12 //getc(stdin);
620 gpertea 4 }
621    
622     class NData {
623     public:
624     int NPos[1024]; //there should be no reads longer than 1K ?
625     int NCount;
626     int end5;
627     int end3;
628     int seqlen;
629     double perc_N; //percentage of Ns in end5..end3 range only!
630     const char* seq;
631     bool valid;
632     NData() {
633 gpertea 170 seqlen=0;
634 gpertea 4 NCount=0;
635     end5=0;
636     end3=0;
637     seq=NULL;
638     perc_N=0;
639     valid=true;
640     }
641     void init(GStr& rseq) {
642     NCount=0;
643     perc_N=0;
644     seqlen=rseq.length();
645     seq=rseq.chars();
646     end5=1;
647     end3=seqlen;
648     for (int i=0;i<seqlen;i++)
649 gpertea 12 if (seq[i]=='N') {// if (!ichrInStr(rseq[i], "ACGT")
650 gpertea 4 NPos[NCount]=i;
651     NCount++;
652     }
653     perc_N=(NCount*100.0)/seqlen;
654     valid=true;
655     }
656     void N_calc() { //only in the region after trimming
657     int n=0;
658     for (int i=end3-1;i<end5;i++) {
659     if (seq[i]=='N') n++;
660     }
661     perc_N=(n*100.0)/(end5-end3+1);
662     }
663     };
664 gpertea 170
665 gpertea 4 static NData feat;
666     int perc_lenN=12; // incremental distance from ends, in percentage of
667     // sequence length, where N-trimming is done (default:12 %) (autolimited to 20)
668 gpertea 170
669 gpertea 4 void N_analyze(int l5, int l3, int p5, int p3) {
670     /* assumes feat was filled properly */
671     int old_dif, t5,t3,v;
672 gpertea 12 if (l3<l5+2 || p5>p3 ) {
673 gpertea 4 feat.end5=l5+1;
674     feat.end3=l3+1;
675 gpertea 170 return;
676 gpertea 4 }
677    
678     t5=feat.NPos[p5]-l5;
679     t3=l3-feat.NPos[p3];
680     old_dif=p3-p5;
681     v=(int)((((double)(l3-l5))*perc_lenN)/100);
682 gpertea 170 if (v>20) v=20; /* enforce N-search limit for very long reads */
683 gpertea 4 if (t5 < v ) {
684     l5=feat.NPos[p5]+1;
685     p5++;
686     }
687     if (t3 < v) {
688     l3=feat.NPos[p3]-1;
689     p3--;
690     }
691     /* restNs=p3-p5; number of Ns in the new CLR */
692     if (p3-p5==old_dif) { /* no change, return */
693     /* don't trim if this may shorten the read too much */
694     //if (l5-l3<min_read_len) return;
695     feat.end5=l5+1;
696     feat.end3=l3+1;
697     return;
698     }
699 gpertea 170 else
700 gpertea 4 N_analyze(l5,l3, p5,p3);
701     }
702    
703    
704 gpertea 12 bool qtrim(GStr& qvs, int &l5, int &l3) {
705 gpertea 13 if (qvtrim_qmin==0 || qvs.is_empty()) return false;
706 gpertea 12 if (qv_phredtype==0) {
707     //try to guess the Phred type
708     int vmin=256, vmax=0;
709     for (int i=0;i<qvs.length();i++) {
710     if (vmin>qvs[i]) vmin=qvs[i];
711     if (vmax<qvs[i]) vmax=qvs[i];
712     }
713     if (vmin<64) { qv_phredtype=33; qv_cvtadd=31; }
714     if (vmax>95) { qv_phredtype=64; qv_cvtadd=-31; }
715     if (qv_phredtype==0) {
716     GError("Error: couldn't determine Phred type, please use the -p33 or -p64 !\n");
717     }
718 gpertea 13 if (verbose)
719     GMessage("Input reads have Phred-%d quality values.\n", (qv_phredtype==33 ? 33 : 64));
720     } //guessing Phred type
721 gpertea 12 for (l3=qvs.length()-1;l3>2;l3--) {
722 gpertea 13 if (qvs[l3]-qv_phredtype>=qvtrim_qmin && qvs[l3-1]-qv_phredtype>=qvtrim_qmin) break;
723 gpertea 12 }
724     //just in case, check also the 5' the end (?)
725     for (l5=0;l5<qvs.length()-3;l5++) {
726 gpertea 13 if (qvs[l5]-qv_phredtype>=qvtrim_qmin && qvs[l5+1]-qv_phredtype>=qvtrim_qmin) break;
727 gpertea 12 }
728 gpertea 13 if (qvtrim_max>0) {
729     if (qvs.length()-1-l3>qvtrim_max) l3=qvs.length()-1-qvtrim_max;
730     if (l5>qvtrim_max) l5=qvtrim_max;
731     }
732 gpertea 12 return (l5>0 || l3<qvs.length()-1);
733     }
734    
735     bool ntrim(GStr& rseq, int &l5, int &l3) {
736     //count Ns in the sequence, trim N-rich ends
737 gpertea 4 feat.init(rseq);
738     l5=feat.end5-1;
739     l3=feat.end3-1;
740 gpertea 170 N_analyze(feat.end5-1, feat.end3-1, 0, feat.NCount-1);
741 gpertea 12 if (l5==feat.end5-1 && l3==feat.end3-1) {
742     if (feat.perc_N>max_perc_N) {
743     feat.valid=false;
744     l3=l5+1;
745     return true;
746     }
747     else {
748     return false; //nothing changed
749     }
750     }
751 gpertea 4 l5=feat.end5-1;
752     l3=feat.end3-1;
753     if (l3-l5+1<min_read_len) {
754     feat.valid=false;
755     return true;
756     }
757     feat.N_calc();
758 gpertea 170
759 gpertea 12 if (feat.perc_N>max_perc_N) {
760 gpertea 4 feat.valid=false;
761     l3=l5+1;
762     return true;
763     }
764     return true;
765     }
766    
767     //--------------- dust functions ----------------
768     class DNADuster {
769     public:
770 gpertea 170 int dustword;
771     int dustwindow;
772     int dustwindow2;
773 gpertea 4 int dustcutoff;
774     int mv, iv, jv;
775     int counts[32*32*32];
776     int iis[32*32*32];
777     DNADuster(int cutoff=16, int winsize=32, int wordsize=3) {
778     dustword=wordsize;
779     dustwindow=winsize;
780     dustwindow2 = (winsize>>1);
781     dustcutoff=cutoff;
782     mv=0;
783     iv=0;
784     jv=0;
785     }
786     void setWindowSize(int value) {
787     dustwindow = value;
788     dustwindow2 = (dustwindow >> 1);
789     }
790     void setWordSize(int value) {
791     dustword=value;
792     }
793     void wo1(int len, const char* s, int ivv) {
794     int i, ii, j, v, t, n, n1, sum;
795     int js, nis;
796     n = 32 * 32 * 32;
797     n1 = n - 1;
798     nis = 0;
799     i = 0;
800     ii = 0;
801     sum = 0;
802     v = 0;
803     for (j=0; j < len; j++, s++) {
804     ii <<= 5;
805     if (*s<=32) {
806     i=0;
807     continue;
808     }
809     ii |= *s - 'A'; //assume uppercase!
810     ii &= n1;
811     i++;
812     if (i >= dustword) {
813     for (js=0; js < nis && iis[js] != ii; js++) ;
814     if (js == nis) {
815     iis[nis] = ii;
816     counts[ii] = 0;
817     nis++;
818     }
819     if ((t = counts[ii]) > 0) {
820     sum += t;
821     v = 10 * sum / j;
822     if (mv < v) {
823     mv = v;
824     iv = ivv;
825     jv = j;
826     }
827     }
828     counts[ii]++;
829     }
830     }
831     }
832    
833     int wo(int len, const char* s, int* beg, int* end) {
834     int i, l1;
835     l1 = len - dustword + 1;
836     if (l1 < 0) {
837     *beg = 0;
838     *end = len - 1;
839     return 0;
840     }
841     mv = 0;
842     iv = 0;
843     jv = 0;
844     for (i=0; i < l1; i++) {
845     wo1(len-i, s+i, i);
846     }
847     *beg = iv;
848     *end = iv + jv;
849     return mv;
850     }
851    
852     void dust(const char* seq, char* seqmsk, int seqlen, int cutoff=0) { //, maskFunc maskfn) {
853     int i, j, l, a, b, v;
854     if (cutoff==0) cutoff=dustcutoff;
855     a=0;b=0;
856     //GMessage("Dust cutoff=%d\n", cutoff);
857     for (i=0; i < seqlen; i += dustwindow2) {
858     l = (seqlen > i+dustwindow) ? dustwindow : seqlen-i;
859     v = wo(l, seq+i, &a, &b);
860     if (v > cutoff) {
861     //for (j = a; j <= b && j < dustwindow2; j++) {
862     for (j = a; j <= b; j++) {
863     seqmsk[i+j]='N';//could be made lowercase instead
864     }
865     }
866     }
867 gpertea 170 //return first;
868 gpertea 4 }
869     };
870    
871     static DNADuster duster;
872    
873     int dust(GStr& rseq) {
874     char* seq=Gstrdup(rseq.chars());
875     duster.dust(rseq.chars(), seq, rseq.length(), dust_cutoff);
876     //check the number of Ns:
877     int ncount=0;
878     for (int i=0;i<rseq.length();i++) {
879     if (seq[i]=='N') ncount++;
880     }
881     GFREE(seq);
882     return ncount;
883     }
884    
885 gpertea 94 struct SLocScore {
886     int pos;
887     int score;
888     SLocScore(int p=0,int s=0) {
889     pos=p;
890     score=s;
891     }
892     void set(int p, int s) {
893     pos=p;
894     score=s;
895     }
896     void add(int p, int add) {
897     pos=p;
898     score+=add;
899     }
900     };
901 gpertea 12
902 gpertea 94 bool trim_poly3(GStr &seq, int &l5, int &l3, const char* poly_seed) {
903 gpertea 103 if (!doPolyTrim) return false;
904 gpertea 94 int rlen=seq.length();
905     l5=0;
906     l3=rlen-1;
907     int32 seedVal=*(int32*)poly_seed;
908     char polyChar=poly_seed[0];
909     //assumes N trimming was already done
910     //so a poly match should be very close to the end of the read
911     // -- find the initial match (seed)
912 gpertea 108 int lmin=GMAX((rlen-16), 0);
913 gpertea 94 int li;
914     for (li=rlen-4;li>lmin;li--) {
915     if (seedVal==*(int*)&(seq[li])) {
916     break;
917     }
918 gpertea 12 }
919 gpertea 94 if (li<=lmin) return false;
920     //seed found, try to extend it both ways
921     //extend right
922     int ri=li+3;
923     SLocScore loc(ri, poly_m_score<<2);
924     SLocScore maxloc(loc);
925     //extend right
926 gpertea 170 while (ri<rlen-1) {
927 gpertea 94 ri++;
928     if (seq[ri]==polyChar) {
929     loc.add(ri,poly_m_score);
930     }
931     else if (seq[ri]=='N') {
932     loc.add(ri,0);
933     }
934     else { //mismatch
935     loc.add(ri,poly_mis_score);
936     if (maxloc.score-loc.score>poly_dropoff_score) break;
937     }
938     if (maxloc.score<=loc.score) {
939     maxloc=loc;
940     }
941 gpertea 12 }
942 gpertea 94 ri=maxloc.pos;
943 gpertea 103 if (ri<rlen-6) return false; //no trimming wanted, too far from 3' end
944 gpertea 94 //ri = right boundary for the poly match
945     //extend left
946     loc.set(li, maxloc.score);
947     maxloc.pos=li;
948     while (li>0) {
949     li--;
950     if (seq[li]==polyChar) {
951     loc.add(li,poly_m_score);
952     }
953     else if (seq[li]=='N') {
954     loc.add(li,0);
955     }
956     else { //mismatch
957     loc.add(li,poly_mis_score);
958     if (maxloc.score-loc.score>poly_dropoff_score) break;
959     }
960     if (maxloc.score<=loc.score) {
961     maxloc=loc;
962     }
963     }
964 gpertea 105 li=maxloc.pos;
965 gpertea 108 if ((maxloc.score==poly_minScore && ri==rlen-1) ||
966     (maxloc.score>poly_minScore && ri>=rlen-3) ||
967     (maxloc.score>(poly_minScore*3) && ri>=rlen-8)) {
968 gpertea 105 //trimming this li-ri match at 3' end
969     l3=li-1;
970     if (l3<0) l3=0;
971 gpertea 94 return true;
972     }
973     return false;
974 gpertea 12 }
975    
976 gpertea 94 bool trim_poly5(GStr &seq, int &l5, int &l3, const char* poly_seed) {
977 gpertea 103 if (!doPolyTrim) return false;
978 gpertea 88 int rlen=seq.length();
979     l5=0;
980     l3=rlen-1;
981 gpertea 94 int32 seedVal=*(int32*)poly_seed;
982     char polyChar=poly_seed[0];
983 gpertea 88 //assumes N trimming was already done
984     //so a poly match should be very close to the end of the read
985     // -- find the initial match (seed)
986 gpertea 108 int lmax=GMIN(12, rlen-4);//how far from 5' end to look for 4-mer seeds
987 gpertea 94 int li;
988     for (li=0;li<=lmax;li++) {
989     if (seedVal==*(int*)&(seq[li])) {
990 gpertea 88 break;
991     }
992     }
993 gpertea 94 if (li>lmax) return false;
994     //seed found, try to extend it both ways
995     //extend left
996     int ri=li+3; //save rightmost base of the seed
997     SLocScore loc(li, poly_m_score<<2);
998     SLocScore maxloc(loc);
999     while (li>0) {
1000     li--;
1001     if (seq[li]==polyChar) {
1002     loc.add(li,poly_m_score);
1003     }
1004     else if (seq[li]=='N') {
1005     loc.add(li,0);
1006     }
1007     else { //mismatch
1008     loc.add(li,poly_mis_score);
1009     if (maxloc.score-loc.score>poly_dropoff_score) break;
1010     }
1011     if (maxloc.score<=loc.score) {
1012     maxloc=loc;
1013     }
1014 gpertea 88 }
1015 gpertea 94 li=maxloc.pos;
1016 gpertea 103 if (li>5) return false; //no trimming wanted, too far from 5' end
1017 gpertea 94 //li = right boundary for the poly match
1018    
1019     //extend right
1020     loc.set(ri, maxloc.score);
1021     maxloc.pos=ri;
1022 gpertea 170 while (ri<rlen-1) {
1023 gpertea 94 ri++;
1024     if (seq[ri]==polyChar) {
1025     loc.add(ri,poly_m_score);
1026     }
1027     else if (seq[ri]=='N') {
1028     loc.add(ri,0);
1029     }
1030     else { //mismatch
1031     loc.add(ri,poly_mis_score);
1032     if (maxloc.score-loc.score>poly_dropoff_score) break;
1033     }
1034     if (maxloc.score<=loc.score) {
1035     maxloc=loc;
1036     }
1037     }
1038 gpertea 105 ri=maxloc.pos;
1039 gpertea 103 if ((maxloc.score==poly_minScore && li==0) ||
1040     (maxloc.score>poly_minScore && li<2)
1041 gpertea 108 || (maxloc.score>(poly_minScore*3) && li<8)) {
1042 gpertea 105 //adjust l5 to reflect this trimming of 5' end
1043     l5=ri+1;
1044     if (l5>rlen-1) l5=rlen-1;
1045 gpertea 94 return true;
1046     }
1047     return false;
1048 gpertea 88 }
1049    
1050 gpertea 180 bool trim_adaptor3(GStr& seq, int&l5, int &l3) {
1051     if (adaptors3.Count()==0) return false;
1052 gpertea 4 int rlen=seq.length();
1053     l5=0;
1054     l3=rlen-1;
1055 gpertea 103 bool trimmed=false;
1056 gpertea 175 GStr wseq(seq);
1057 gpertea 103 int wlen=rlen;
1058 gpertea 175 GXSeqData seqdata;
1059 gpertea 180 int numruns=revCompl ? 2 : 1;
1060 gpertea 185 GList<GXAlnInfo> bestalns(true, true, false);
1061 gpertea 180 for (int ai=0;ai<adaptors3.Count();ai++) {
1062     for (int r=0;r<numruns;r++) {
1063     if (r) {
1064     seqdata.update(adaptors3[ai].seqr.chars(), adaptors3[ai].seqr.length(),
1065     adaptors3[ai].pzr, wseq.chars(), wlen, adaptors3[ai].amlen);
1066     }
1067     else {
1068     seqdata.update(adaptors3[ai].seq.chars(), adaptors3[ai].seq.length(),
1069     adaptors3[ai].pz, wseq.chars(), wlen, adaptors3[ai].amlen);
1070     }
1071 gpertea 189 GXAlnInfo* aln=match_adaptor(seqdata, adaptors3[ai].trim_type, minEndAdapter, gxmem_r, 86);
1072 gpertea 185 if (aln) {
1073     if (aln->strong) {
1074     trimmed=true;
1075     bestalns.Add(aln);
1076     break; //will check the rest next time
1077     }
1078     else bestalns.Add(aln);
1079     }
1080 gpertea 180 }//forward and reverse adaptors
1081 gpertea 185 if (trimmed) break; //will check the rest in the next cycle
1082 gpertea 180 }//for each 3' adaptor
1083 gpertea 185 if (bestalns.Count()>0) {
1084     GXAlnInfo* aln=bestalns[0];
1085     if (aln->sl-1 > wlen-aln->sr) {
1086     //keep left side
1087     l3-=(wlen-aln->sl+1);
1088     if (l3<0) l3=0;
1089     }
1090     else { //keep right side
1091     l5+=aln->sr;
1092     if (l5>=rlen) l5=rlen-1;
1093     }
1094     //delete aln;
1095     //if (l3-l5+1<min_read_len) return true;
1096     wseq=seq.substr(l5,l3-l5+1);
1097     wlen=wseq.length();
1098     return true; //break the loops here to report a good find
1099     }
1100     return false;
1101 gpertea 4 }
1102    
1103 gpertea 180 bool trim_adaptor5(GStr& seq, int&l5, int &l3) {
1104     if (adaptors5.Count()==0) return false;
1105 gpertea 4 int rlen=seq.length();
1106     l5=0;
1107     l3=rlen-1;
1108 gpertea 102 bool trimmed=false;
1109 gpertea 175 GStr wseq(seq);
1110 gpertea 103 int wlen=rlen;
1111 gpertea 175 GXSeqData seqdata;
1112 gpertea 180 int numruns=revCompl ? 2 : 1;
1113 gpertea 183 GList<GXAlnInfo> bestalns(true, true, false);
1114 gpertea 180 for (int ai=0;ai<adaptors5.Count();ai++) {
1115     for (int r=0;r<numruns;r++) {
1116     if (r) {
1117     seqdata.update(adaptors5[ai].seqr.chars(), adaptors5[ai].seqr.length(),
1118     adaptors5[ai].pzr, wseq.chars(), wlen, adaptors5[ai].amlen);
1119     }
1120     else {
1121     seqdata.update(adaptors5[ai].seq.chars(), adaptors5[ai].seq.length(),
1122     adaptors5[ai].pz, wseq.chars(), wlen, adaptors5[ai].amlen);
1123     }
1124 gpertea 189 GXAlnInfo* aln=match_adaptor(seqdata, adaptors5[ai].trim_type,
1125     minEndAdapter, gxmem_l, 90);
1126 gpertea 182 if (aln) {
1127 gpertea 183 if (aln->strong) {
1128     trimmed=true;
1129     bestalns.Add(aln);
1130     break; //will check the rest next time
1131     }
1132     else bestalns.Add(aln);
1133     }
1134     } //forward and reverse?
1135     if (trimmed) break; //will check the rest in the next cycle
1136     }//for each 5' adaptor
1137     if (bestalns.Count()>0) {
1138     GXAlnInfo* aln=bestalns[0];
1139 gpertea 182 if (aln->sl-1 > wlen-aln->sr) {
1140 gpertea 180 //keep left side
1141 gpertea 182 l3-=(wlen-aln->sl+1);
1142 gpertea 180 if (l3<0) l3=0;
1143     }
1144     else { //keep right side
1145 gpertea 182 l5+=aln->sr;
1146 gpertea 180 if (l5>=rlen) l5=rlen-1;
1147     }
1148 gpertea 183 //delete aln;
1149     //if (l3-l5+1<min_read_len) return true;
1150 gpertea 180 wseq=seq.substr(l5,l3-l5+1);
1151     wlen=wseq.length();
1152 gpertea 183 return true; //break the loops here to report a good find
1153     }
1154     return false;
1155 gpertea 98 }
1156 gpertea 4
1157 gpertea 170 //convert qvs to/from phred64 from/to phread33
1158 gpertea 12 void convertPhred(GStr& q) {
1159     for (int i=0;i<q.length();i++) q[i]+=qv_cvtadd;
1160 gpertea 4 }
1161    
1162 gpertea 12 void convertPhred(char* q, int len) {
1163     for (int i=0;i<len;i++) q[i]+=qv_cvtadd;
1164 gpertea 4 }
1165    
1166 gpertea 170 bool getFastxRec(GLineReader& fq, GStr& rseq, GStr& rqv,
1167 gpertea 13 GStr& rname, GStr& rinfo, GStr& infname) {
1168     rseq="";
1169     rqv="";
1170     rname="";
1171     rinfo="";
1172     if (fq.eof()) return false;
1173     char* l=fq.getLine();
1174     while (l!=NULL && (l[0]==0 || isspace(l[0]))) l=fq.getLine(); //ignore empty lines
1175     if (l==NULL) return false;
1176     /* if (rawFormat) {
1177     //TODO: implement raw qseq parsing here
1178     //if (raw type=N) then continue; //skip invalid/bad records
1179     } //raw qseq format
1180     else { // FASTQ or FASTA */
1181     isfasta=(l[0]=='>');
1182 gpertea 170 if (!isfasta && l[0]!='@') GError("Error: fasta/fastq record marker not found(%s)\n%s\n",
1183 gpertea 13 infname.chars(), l);
1184     GStr s(l);
1185     rname=&(l[1]);
1186     for (int i=0;i<rname.length();i++)
1187 gpertea 170 if (rname[i]<=' ') {
1188     if (i<rname.length()-2) rinfo=rname.substr(i+1);
1189     rname.cut(i);
1190     break;
1191 gpertea 13 }
1192     //now get the sequence
1193 gpertea 170 if ((l=fq.getLine())==NULL)
1194 gpertea 13 GError("Error: unexpected EOF after header for read %s (%s)\n",
1195     rname.chars(), infname.chars());
1196     rseq=l; //this must be the DNA line
1197     while ((l=fq.getLine())!=NULL) {
1198     //seq can span multiple lines
1199     if (l[0]=='>' || l[0]=='+') {
1200 gpertea 170 fq.pushBack();
1201 gpertea 13 break; //
1202     }
1203     rseq+=l;
1204 gpertea 170 } //check for multi-line seq
1205 gpertea 13 if (!isfasta) { //reading fastq quality values, which can also be multi-line
1206     if ((l=fq.getLine())==NULL)
1207     GError("Error: unexpected EOF after sequence for %s\n", rname.chars());
1208     if (l[0]!='+') GError("Error: fastq qv header marker not detected!\n");
1209 gpertea 170 if ((l=fq.getLine())==NULL)
1210 gpertea 13 GError("Error: unexpected EOF after qv header for %s\n", rname.chars());
1211     rqv=l;
1212 gpertea 170 //if (rqv.length()!=rseq.length())
1213 gpertea 13 // GError("Error: qv len != seq len for %s\n", rname.chars());
1214     while (rqv.length()<rseq.length() && ((l=fq.getLine())!=NULL)) {
1215     rqv+=l; //append to qv string
1216     }
1217     }// fastq
1218     // } //<-- FASTA or FASTQ
1219 gpertea 94 rseq.upper();
1220 gpertea 13 return true;
1221     }
1222    
1223 gpertea 105 #ifdef GDEBUG
1224     void showTrim(GStr& s, int l5, int l3) {
1225 gpertea 177 if (l5>0 || l3==0) {
1226 gpertea 105 color_bg(c_red);
1227     }
1228     for (int i=0;i<s.length()-1;i++) {
1229     if (i && i==l5) color_resetbg();
1230     fprintf(stderr, "%c", s[i]);
1231 gpertea 177 if (i && i==l3) color_bg(c_red);
1232 gpertea 105 }
1233     fprintf(stderr, "%c", s[s.length()-1]);
1234     color_reset();
1235     fprintf(stderr, "\n");
1236     }
1237     #endif
1238    
1239 gpertea 13 char process_read(GStr& rname, GStr& rseq, GStr& rqv, int &l5, int &l3) {
1240 gpertea 170 //returns 0 if the read was untouched, 1 if it was just trimmed
1241 gpertea 13 // and a trash code if it was trashed
1242     l5=0;
1243     l3=rseq.length()-1;
1244 gpertea 103 #ifdef GDEBUG
1245 gpertea 108 //rseq.reverse();
1246 gpertea 103 GMessage(">%s\n", rname.chars());
1247 gpertea 105 GMessage("%s\n",rseq.chars());
1248 gpertea 103 #endif
1249 gpertea 13 if (l3-l5+1<min_read_len) {
1250     return 's';
1251     }
1252     GStr wseq(rseq.chars());
1253     GStr wqv(rqv.chars());
1254     int w5=l5;
1255     int w3=l3;
1256 gpertea 187 //count Ns
1257     b_totalIn+=rseq.length();
1258     for (int i=0;i<rseq.length();i++) {
1259     if (isACGT[(int)rseq[i]]==0) b_totalN++;
1260     }
1261    
1262 gpertea 13 //first do the q-based trimming
1263     if (qvtrim_qmin!=0 && !wqv.is_empty() && qtrim(wqv, w5, w3)) { // qv-threshold trimming
1264 gpertea 187 b_trimQ+=(w5-l5)+(l3-w3);
1265     num_trimQ++;
1266 gpertea 13 if (w3-w5+1<min_read_len) {
1267     return 'Q'; //invalid read
1268     }
1269     //-- keep only the w5..w3 range
1270     l5=w5;
1271     l3=w3;
1272     wseq=wseq.substr(w5, w3-w5+1);
1273     if (!wqv.is_empty())
1274     wqv=wqv.substr(w5, w3-w5+1);
1275     } //qv trimming
1276     // N-trimming on the remaining read seq
1277     if (ntrim(wseq, w5, w3)) {
1278 gpertea 187 //Note: ntrim resets w5 and w3
1279 gpertea 13 //GMessage("before: %s\n",wseq.chars());
1280     //GMessage("after : %s (%d)\n",wseq.substr(w5,w3-w5+1).chars(),w3-w5+1);
1281 gpertea 187 int trim3=(wseq.length()-1-w3);
1282     b_trimN+=w5+trim3;
1283     num_trimN++;
1284 gpertea 13 l5+=w5;
1285 gpertea 187 l3-=trim3;
1286 gpertea 13 if (w3-w5+1<min_read_len) {
1287     return 'N'; //to be trashed
1288     }
1289     //-- keep only the w5..w3 range
1290     wseq=wseq.substr(w5, w3-w5+1);
1291     if (!wqv.is_empty())
1292     wqv=wqv.substr(w5, w3-w5+1);
1293     w5=0;
1294     w3=wseq.length()-1;
1295     }
1296 gpertea 103 char trim_code;
1297 gpertea 180 //clean the more dirty end first - 3'
1298 gpertea 187 bool trimmedA=false;
1299     bool trimmedT=false;
1300     bool trimmedV=false;
1301    
1302 gpertea 180 int prev_w5=0;
1303     int prev_w3=0;
1304     bool w3upd=true;
1305     bool w5upd=true;
1306 gpertea 103 do {
1307     trim_code=0;
1308 gpertea 180 if (w3upd && trim_poly3(wseq, w5, w3, polyA_seed)) {
1309 gpertea 103 trim_code='A';
1310 gpertea 187 b_trimA+=(w5+(wseq.length()-1-w3));
1311     if (!trimmedA) { num_trimA++; trimmedA=true; }
1312 gpertea 103 }
1313 gpertea 180 else if (w3upd && trim_poly3(wseq, w5, w3, polyT_seed)) {
1314 gpertea 103 trim_code='T';
1315 gpertea 187 b_trimT+=(w5+(wseq.length()-1-w3));
1316     if (!trimmedT) { num_trimT++; trimmedT=true; }
1317 gpertea 103 }
1318 gpertea 180 else if (w5upd && trim_poly5(wseq, w5, w3, polyA_seed)) {
1319 gpertea 103 trim_code='A';
1320 gpertea 187 b_trimA+=(w5+(wseq.length()-1-w3));
1321     if (!trimmedA) { num_trimA++; trimmedA=true; }
1322 gpertea 103 }
1323 gpertea 180 else if (w5upd && trim_poly5(wseq, w5, w3, polyT_seed)) {
1324 gpertea 103 trim_code='T';
1325 gpertea 187 b_trimT+=(w5+(wseq.length()-1-w3));
1326     if (!trimmedT) { num_trimT++; trimmedT=true; }
1327 gpertea 103 }
1328 gpertea 180 else if (trim_adaptor5(wseq, w5, w3)) {
1329 gpertea 187 trim_code='V';
1330     b_trimV+=(w5+(wseq.length()-1-w3));
1331     if (!trimmedV) { num_trimV++; trimmedV=true; }
1332 gpertea 180 }
1333     else if (trim_adaptor3(wseq, w5, w3)) {
1334 gpertea 187 trim_code='V';
1335     b_trimV+=(w5+(wseq.length()-1-w3));
1336     if (!trimmedV) { num_trimV++; trimmedV=true; }
1337 gpertea 103 }
1338     if (trim_code) {
1339 gpertea 180 w3upd=(w3!=prev_w3);
1340     w5upd=(w5!=prev_w5);
1341     if (w3upd) prev_w3=w3;
1342     if (w5upd) prev_w5=w5;
1343     #ifdef GDEBUG
1344 gpertea 105 GMessage("#### TRIM by '%c' code ( w5-w3 = %d-%d ):\n",trim_code, w5,w3);
1345     showTrim(wseq, w5, w3);
1346 gpertea 180 #endif
1347 gpertea 187 //int trimlen=wseq.length()-(w3-w5+1);
1348 gpertea 13 l5+=w5;
1349     l3-=(wseq.length()-1-w3);
1350     if (w3-w5+1<min_read_len) {
1351 gpertea 103 return trim_code;
1352 gpertea 13 }
1353     //-- keep only the w5..w3 range
1354     wseq=wseq.substr(w5, w3-w5+1);
1355     if (!wqv.is_empty())
1356     wqv=wqv.substr(w5, w3-w5+1);
1357 gpertea 103 }//trimming at 3' end
1358     } while (trim_code);
1359 gpertea 13 if (doCollapse) {
1360     //keep read for later
1361     FqDupRec* dr=dhash.Find(wseq.chars());
1362     if (dr==NULL) { //new entry
1363 gpertea 170 //if (prefix.is_empty())
1364     dhash.Add(wseq.chars(),
1365 gpertea 13 new FqDupRec(&wqv, rname.chars()));
1366     //else dhash.Add(wseq.chars(), new FqDupRec(wqv.chars(),wqv.length()));
1367     }
1368     else
1369     dr->add(wqv);
1370     } //collapsing duplicates
1371     else { //not collapsing duplicates
1372     //apply the dust filter now
1373     if (doDust) {
1374     int dustbases=dust(wseq);
1375     if (dustbases>(wseq.length()>>1)) {
1376     return 'D';
1377     }
1378     }
1379     } //not collapsing duplicates
1380     return (l5>0 || l3<rseq.length()-1) ? 1 : 0;
1381     }
1382    
1383     void printHeader(FILE* f_out, char recmarker, GStr& rname, GStr& rinfo) {
1384     //GMessage("printing Header..%c%s\n",recmarker, rname.chars());
1385     if (rinfo.is_empty()) fprintf(f_out, "%c%s\n",recmarker,rname.chars());
1386     else fprintf(f_out, "%c%s %s\n",recmarker, rname.chars(), rinfo.chars());
1387     }
1388    
1389     void writeRead(FILE* f_out, GStr& rname, GStr& rinfo, GStr& rseq, GStr& rqv, int& outcounter) {
1390     outcounter++;
1391     bool asFasta=(rqv.is_empty() || fastaOutput);
1392     if (asFasta) {
1393     if (prefix.is_empty()) {
1394     printHeader(f_out, '>',rname,rinfo);
1395     fprintf(f_out, "%s\n", rseq.chars()); //plain one-line fasta for now
1396     }
1397     else {
1398 gpertea 170 fprintf(f_out, ">%s%08d\n%s\n", prefix.chars(), outcounter,
1399 gpertea 13 rseq.chars());
1400     }
1401     }
1402     else { //fastq
1403     if (convert_phred) convertPhred(rqv);
1404     if (prefix.is_empty()) {
1405     printHeader(f_out, '@', rname, rinfo);
1406     fprintf(f_out, "%s\n+\n%s\n", rseq.chars(), rqv.chars());
1407     }
1408     else
1409 gpertea 170 fprintf(f_out, "@%s_%08d\n%s\n+\n%s\n", prefix.chars(), outcounter,
1410 gpertea 13 rseq.chars(),rqv.chars() );
1411     }
1412     }
1413    
1414     void trash_report(char trashcode, GStr& rname, FILE* freport) {
1415     if (freport==NULL || trashcode<=' ') return;
1416     if (trashcode=='3' || trashcode=='5') {
1417 gpertea 103 fprintf(freport, "%s\ta%c\n",rname.chars(),trashcode);
1418 gpertea 13 }
1419     else {
1420     fprintf(freport, "%s\t%c\n",rname.chars(),trashcode);
1421     }
1422     //tcounter++;
1423     }
1424    
1425     GStr getFext(GStr& s, int* xpos=NULL) {
1426     //s must be a filename without a path
1427     GStr r("");
1428     if (xpos!=NULL) *xpos=0;
1429     if (s.is_empty() || s=="-") return r;
1430     int p=s.rindex('.');
1431     int d=s.rindex('/');
1432     if (p<=0 || p>s.length()-2 || p<s.length()-7 || p<d) return r;
1433     r=s.substr(p+1);
1434     if (xpos!=NULL) *xpos=p+1;
1435     r.lower();
1436     return r;
1437     }
1438    
1439     void baseFileName(GStr& fname) {
1440     //remove all known extensions, like .txt,fq,fastq,fasta,fa)(.gz .gzip .bz2 .bzip2) .
1441     int xpos=0;
1442     GStr fext=getFext(fname, &xpos);
1443     if (xpos<=1) return;
1444     bool extdel=false;
1445     GStr f2;
1446     if (fext=="z" || fext=="zip") {
1447     extdel=true;
1448     }
1449     else if (fext.length()>=2) {
1450     f2=fext.substr(0,2);
1451     extdel=(f2=="gz" || f2=="bz");
1452     }
1453     if (extdel) {
1454     fname.cut(xpos-1);
1455     fext=getFext(fname, &xpos);
1456     if (xpos<=1) return;
1457     }
1458     extdel=false;
1459     if (fext=="f" || fext=="fq" || fext=="txt" || fext=="seq" || fext=="sequence") {
1460     extdel=true;
1461     }
1462     else if (fext.length()>=2) {
1463     extdel=(fext.substr(0,2)=="fa");
1464     }
1465     if (extdel) fname.cut(xpos-1);
1466     GStr fncp(fname);
1467     fncp.lower();
1468     fncp.chomp("seq");
1469     fncp.chomp("sequence");
1470     fncp.trimR("_.");
1471     if (fncp.length()<fname.length()) fname.cut(fncp.length());
1472     }
1473    
1474     FILE* prepOutFile(GStr& infname, GStr& pocmd) {
1475     FILE* f_out=NULL;
1476     GStr fname(getFileName(infname.chars()));
1477     //eliminate known extensions
1478     baseFileName(fname);
1479     if (outsuffix.is_empty() || outsuffix=="-") { return stdout; }
1480     else if (pocmd.is_empty()) {
1481     GStr oname(fname);
1482     oname.append('.');
1483     oname.append(outsuffix);
1484     f_out=fopen(oname.chars(),"w");
1485     if (f_out==NULL) GError("Error: cannot create '%s'\n",oname.chars());
1486     }
1487     else {
1488     GStr oname(">");
1489     oname.append(fname);
1490     oname.append('.');
1491     oname.append(outsuffix);
1492     pocmd.append(oname);
1493     f_out=popen(pocmd.chars(), "w");
1494     if (f_out==NULL) GError("Error: cannot popen '%s'\n",pocmd.chars());
1495     }
1496     return f_out;
1497     }
1498    
1499     void guess_unzip(GStr& fname, GStr& picmd) {
1500     GStr fext=getFext(fname);
1501     if (fext=="gz" || fext=="gzip" || fext=="z") {
1502     picmd="gzip -cd ";
1503     }
1504     else if (fext=="bz2" || fext=="bzip2" || fext=="bz" || fext=="bzip") {
1505     picmd="bzip2 -cd ";
1506     }
1507     }
1508    
1509 gpertea 180 void addAdaptor(GVec<CASeqData>& adaptors, GStr& seq, GAlnTrimType trim_type) {
1510 gpertea 170 //TODO: prepare CASeqData here, and collect hexamers as well
1511 gpertea 180 if (seq.is_empty() || seq=="-" ||
1512     seq=="N/A" || seq==".") return;
1513    
1514 gpertea 172 CASeqData adata(revCompl);
1515 gpertea 180 int idx=adaptors.Add(adata);
1516 gpertea 172 if (idx<0) GError("Error: failed to add adaptor!\n");
1517 gpertea 180 adaptors[idx].trim_type=trim_type;
1518     adaptors[idx].update(seq.chars());
1519 gpertea 170 }
1520    
1521    
1522 gpertea 180 int loadAdaptors(const char* fname) {
1523 gpertea 94 GLineReader lr(fname);
1524     char* l;
1525     while ((l=lr.nextLine())!=NULL) {
1526     if (lr.length()<=3 || l[0]=='#') continue;
1527 gpertea 98 if ( l[0]==' ' || l[0]=='\t' || l[0]==',' ||
1528     l[0]==';'|| l[0]==':' ) {
1529 gpertea 94 int i=1;
1530     while (l[i]!=0 && isspace(l[i])) {
1531     i++;
1532     }
1533     if (l[i]!=0) {
1534 gpertea 102 GStr s(&(l[i]));
1535 gpertea 108 #ifdef GDEBUG
1536     //s.reverse();
1537     #endif
1538 gpertea 180 addAdaptor(adaptors3, s, galn_TrimRight);
1539 gpertea 94 continue;
1540     }
1541     }
1542     else {
1543 gpertea 98 GStr s(l);
1544     s.startTokenize("\t ;,:");
1545     GStr a5,a3;
1546     if (s.nextToken(a5))
1547 gpertea 180 s.nextToken(a3);
1548     else continue; //no tokens on this line
1549     GAlnTrimType ttype5=galn_TrimLeft;
1550 gpertea 102 a5.upper();
1551     a3.upper();
1552 gpertea 180 if (a3.is_empty() || a3==a5 || a3=="=") {
1553     a3.clear();
1554     ttype5=galn_TrimEither;
1555     }
1556 gpertea 108 #ifdef GDEBUG
1557     // a5.reverse();
1558     // a3.reverse();
1559     #endif
1560 gpertea 180 addAdaptor(adaptors5, a5, ttype5);
1561     addAdaptor(adaptors3, a3, galn_TrimRight);
1562 gpertea 94 }
1563     }
1564 gpertea 180 return adaptors5.Count()+adaptors3.Count();
1565 gpertea 94 }
1566    
1567 gpertea 170 void setupFiles(FILE*& f_in, FILE*& f_in2, FILE*& f_out, FILE*& f_out2,
1568     GStr& s, GStr& infname, GStr& infname2) {
1569 gpertea 13 // uses outsuffix to generate output file names and open file handles as needed
1570     infname="";
1571     infname2="";
1572     f_in=NULL;
1573     f_in2=NULL;
1574     f_out=NULL;
1575     f_out2=NULL;
1576     //analyze outsuffix intent
1577     GStr pocmd;
1578 gpertea 76 if (outsuffix=="-") {
1579     f_out=stdout;
1580     }
1581     else {
1582     GStr ox=getFext(outsuffix);
1583     if (ox.length()>2) ox=ox.substr(0,2);
1584     if (ox=="gz") pocmd="gzip -9 -c ";
1585     else if (ox=="bz") pocmd="bzip2 -9 -c ";
1586     }
1587 gpertea 13 if (s=="-") {
1588     f_in=stdin;
1589     infname="stdin";
1590     f_out=prepOutFile(infname, pocmd);
1591     return;
1592     } // streaming from stdin
1593     s.startTokenize(",:");
1594     s.nextToken(infname);
1595     bool paired=s.nextToken(infname2);
1596 gpertea 170 if (fileExists(infname.chars())==0)
1597 gpertea 13 GError("Error: cannot find file %s!\n",infname.chars());
1598     GStr fname(getFileName(infname.chars()));
1599     GStr picmd;
1600     guess_unzip(fname, picmd);
1601     if (picmd.is_empty()) {
1602     f_in=fopen(infname.chars(), "r");
1603     if (f_in==NULL) GError("Error opening file '%s'!\n",infname.chars());
1604     }
1605     else {
1606     picmd.append(infname);
1607     f_in=popen(picmd.chars(), "r");
1608     if (f_in==NULL) GError("Error at popen %s!\n", picmd.chars());
1609     }
1610 gpertea 76 if (f_out==stdout) {
1611     if (paired) GError("Error: output suffix required for paired reads\n");
1612     return;
1613     }
1614 gpertea 13 f_out=prepOutFile(infname, pocmd);
1615     if (!paired) return;
1616     if (doCollapse) GError("Error: sorry, -C option cannot be used with paired reads!\n");
1617     // ---- paired reads:-------------
1618 gpertea 170 if (fileExists(infname2.chars())==0)
1619 gpertea 13 GError("Error: cannot find file %s!\n",infname2.chars());
1620     picmd="";
1621     GStr fname2(getFileName(infname2.chars()));
1622     guess_unzip(fname2, picmd);
1623     if (picmd.is_empty()) {
1624     f_in2=fopen(infname2.chars(), "r");
1625     if (f_in2==NULL) GError("Error opening file '%s'!\n",infname2.chars());
1626     }
1627     else {
1628     picmd.append(infname2);
1629     f_in2=popen(picmd.chars(), "r");
1630     if (f_in2==NULL) GError("Error at popen %s!\n", picmd.chars());
1631     }
1632     f_out2=prepOutFile(infname2, pocmd);
1633     }