ViewVC Help
View File | Revision Log | Show Annotations | View Changeset | Root Listing
root/gclib/fqtrim/fqtrim.cpp
Revision: 182
Committed: Thu Feb 16 21:33:27 2012 UTC (7 years, 9 months ago) by gpertea
File size: 44004 byte(s)
Log Message:
Line User Rev File contents
1 gpertea 4 #include "GArgs.h"
2     #include "GStr.h"
3     #include "GHash.hh"
4     #include "GList.hh"
5 gpertea 13 #include <ctype.h>
6 gpertea 94 #include "GAlnExtend.h"
7 gpertea 4
8     #define USAGE "Usage:\n\
9 gpertea 180 fqtrim [{-5 <5adaptor> -3 <3adaptor>|-f <adaptors_file>}] [-a <min_matchlen>]\\\n\
10 gpertea 172 [-R] [-q <minq> [-t <trim_max_len>]] [-p {64|33}] [-o <outsuffix>]\\\n\
11 gpertea 76 [-l <minlen>] [-C] [-D] [-Q] [-n <rename_prefix>] [-r <discarded.lst>]\\\n\
12     <input.fq>[,<input_mates.fq>\n\
13 gpertea 4 \n\
14 gpertea 180 Trim low quality bases at the 3' end and can trim adaptor sequence(s), filter\n\
15 gpertea 76 for low complexity and collapse duplicate reads.\n\
16 gpertea 13 If read pairs should be trimmed and kept together (i.e. without discarding\n\
17     one read in a pair), the two file names should be given delimited by a comma\n\
18     or a colon character\n\
19 gpertea 4 \n\
20     Options:\n\
21     -n rename all the reads using the <prefix> followed by a read counter;\n\
22     if -C option was given, the suffix \"_x<N>\" is appended, with <N> being\n\
23     the read duplication count\n\
24 gpertea 76 -o unless this parameter is '-', write the trimmed/filtered reads to \n\
25     file(s) named <input>.<outsuffix> which will be created in the \n\
26     current (working) directory; (writes to stdout if -o- is given);\n\
27     a suffix ending with .gz, .gzip or .bz2 will enforce compression\n\
28 gpertea 180 -f file with adaptor sequences to trim, each line having this format:\n\
29     <5'-adaptor-sequence> <3'-adaptor-sequence>\n\
30     -5 trim the given adaptor or primer sequence at the 5' end of each read\n\
31 gpertea 4 (e.g. -5 CGACAGGTTCAGAGTTCTACAGTCCGACGATC)\n\
32 gpertea 180 -3 trim the given adaptor sequence at the 3' end of each read\n\
33 gpertea 4 (e.g. -3 TCGTATGCCGTCTTCTGCTTG)\n\
34 gpertea 103 -A disable polyA/T trimming (enabled by default)\n\
35 gpertea 94 -y minimum length of exact match to adaptor sequence at the proper end (6)\n\
36 gpertea 13 -q trim bases with quality value lower than <minq> (starting at the 3' end)\n\
37 gpertea 76 -t for -q option, maximum trimming at the 3' end is limited to <trim_max_len>\n\
38 gpertea 12 -m maximum percentage of Ns allowed in a read after trimming (default 7)\n\
39     -l minimum \"clean\" length after trimming that a read must have\n\
40     in order to pass the filter (default: 16)\n\
41 gpertea 4 -r write a simple \"trash report\" file listing the discarded reads with a\n\
42     one letter code for the reason why they were trashed\n\
43 gpertea 12 -D apply a low-complexity (dust) filter and discard any read that has over\n\
44     50% of its length detected as low complexity\n\
45 gpertea 4 -C collapse duplicate reads and add a prefix with their count to the read\n\
46 gpertea 12 name (e.g. for microRNAs)\n\
47     -p input is phred64/phred33 (use -p64 or -p33)\n\
48     -Q convert quality values to the other Phred qv type\n\
49     -V verbose processing\n\
50 gpertea 4 "
51 gpertea 170
52 gpertea 13 //-z for -o option, the output stream(s) will be first piped into the given\n
53     // <zcmd> command, which must output to stdout (e.g. -z 'bzip2 -9 -c')\n
54    
55 gpertea 170
56 gpertea 180 // example 3' adaptor for miRNAs: TCGTATGCCGTCTTCTGCTTG
57 gpertea 12
58 gpertea 76 //For paired reads sequencing:
59 gpertea 12 //3' : ACACTCTTTCCCTACACGACGCTCTTCCGATCT
60     //5' : GATCGGAAGAGCGGTTCAGCAGGAATGCCGAG
61 gpertea 13 //FILE* f_out=NULL; //stdout if not provided
62     //FILE* f_out2=NULL; //for paired reads
63     //FILE* f_in=NULL; //input fastq (stdin if not provided)
64     //FILE* f_in2=NULL; //for paired reads
65 gpertea 170
66 gpertea 4 FILE* freport=NULL;
67 gpertea 13
68 gpertea 4 bool debug=false;
69 gpertea 12 bool verbose=false;
70 gpertea 4 bool doCollapse=false;
71     bool doDust=false;
72 gpertea 103 bool doPolyTrim=true;
73 gpertea 13 bool fastaOutput=false;
74 gpertea 4 bool trashReport=false;
75 gpertea 172 bool revCompl=false; //also reverse complement adaptor sequences
76 gpertea 4 int min_read_len=16;
77 gpertea 12 double max_perc_N=7.0;
78 gpertea 4 int dust_cutoff=16;
79     bool isfasta=false;
80 gpertea 12 bool convert_phred=false;
81 gpertea 170 GStr outsuffix; // -o
82 gpertea 13 GStr prefix;
83     GStr zcmd;
84     int num_trimmed5=0;
85     int num_trimmed3=0;
86     int num_trimmedN=0;
87     int num_trimmedQ=0;
88     int min_trimmed5=INT_MAX;
89     int min_trimmed3=INT_MAX;
90 gpertea 12
91 gpertea 13 int qvtrim_qmin=0;
92     int qvtrim_max=0; //for -q, do not trim at 3'-end more than this number of bases
93 gpertea 12 int qv_phredtype=0; // could be 64 or 33 (0 means undetermined yet)
94     int qv_cvtadd=0; //could be -31 or +31
95    
96 gpertea 94 // adaptor matching metrics -- for X-drop ungapped extension
97 gpertea 177 //const int match_reward=2;
98     //const int mismatch_penalty=3;
99 gpertea 103 const int match_reward=2;
100 gpertea 177 const int mismatch_penalty=4;
101     const int Xdrop=10;
102 gpertea 103
103 gpertea 94 const int poly_m_score=2; //match score
104     const int poly_mis_score=-3; //mismatch
105 gpertea 88 const int poly_dropoff_score=7;
106 gpertea 94 int poly_minScore=12; //i.e. an exact match of 6 bases at the proper ends WILL be trimmed
107 gpertea 4
108 gpertea 94 const char *polyA_seed="AAAA";
109     const char *polyT_seed="TTTT";
110 gpertea 12
111 gpertea 170 struct CASeqData {
112 gpertea 180 //positional data for every possible hexamer in an adaptor
113     GVec<uint16>* pz[4096]; //0-based coordinates of all possible hexamers in the adaptor sequence
114     GVec<uint16>* pzr[4096]; //0-based coordinates of all possible hexamers for the reverse complement of the adaptor sequence
115     GStr seq; //actual adaptor sequence data
116 gpertea 170 GStr seqr; //reverse complement sequence
117 gpertea 180 int amlen; //fraction of adaptor length matching that's
118 gpertea 175 //enough to consider the alignment
119 gpertea 180 GAlnTrimType trim_type;
120 gpertea 172 bool use_reverse;
121 gpertea 175 CASeqData(bool rev=false):seq(),seqr(),
122     amlen(0), use_reverse(rev) {
123 gpertea 180 trim_type=galn_None; //should be updated later!
124 gpertea 172 for (int i=0;i<4096;i++) {
125     pz[i]=NULL;
126     pzr[i]=NULL;
127     }
128     }
129    
130     void update(const char* s) {
131 gpertea 175 seq=s;
132     table6mers(seq.chars(), seq.length(), pz);
133     amlen=iround(double(seq.length())*0.8);
134     if (amlen<12)
135     amlen=12;
136     if (!use_reverse) return;
137     //reverse complement
138     seqr=s;
139     int slen=seq.length();
140     for (int i=0;i<slen;i++)
141     seqr[i]=ntComplement(seq[slen-i-1]);
142     table6mers(seqr.chars(), seqr.length(), pzr);
143 gpertea 170 }
144    
145 gpertea 172 ~CASeqData() {
146     for (int i=0;i<4096;i++) {
147     delete pz[i];
148     delete pzr[i];
149 gpertea 170 }
150     }
151     };
152    
153 gpertea 180 GVec<CASeqData> adaptors5;
154     GVec<CASeqData> adaptors3;
155 gpertea 170
156 gpertea 103 CGreedyAlignData* gxmem_l=NULL;
157     CGreedyAlignData* gxmem_r=NULL;
158    
159 gpertea 4 // element in dhash:
160     class FqDupRec {
161     public:
162     int count; //how many of these reads are the same
163     int len; //length of qv
164     char* firstname; //optional, only if we want to keep the original read names
165     char* qv;
166     FqDupRec(GStr* qstr=NULL, const char* rname=NULL) {
167     len=0;
168     qv=NULL;
169     firstname=NULL;
170     count=0;
171     if (qstr!=NULL) {
172     qv=Gstrdup(qstr->chars());
173     len=qstr->length();
174     count++;
175     }
176     if (rname!=NULL) firstname=Gstrdup(rname);
177     }
178     ~FqDupRec() {
179     GFREE(qv);
180     GFREE(firstname);
181     }
182     void add(GStr& d) { //collapse another record into this one
183     if (d.length()!=len)
184     GError("Error at FqDupRec::add(): cannot collapse reads with different length!\n");
185     count++;
186     for (int i=0;i<len;i++)
187     qv[i]=(qv[i]+d[i])/2;
188     }
189     };
190    
191     void openfw(FILE* &f, GArgs& args, char opt) {
192     GStr s=args.getOpt(opt);
193     if (!s.is_empty()) {
194     if (s=='-') f=stdout;
195     else {
196 gpertea 13 f=fopen(s.chars(),"w");
197 gpertea 4 if (f==NULL) GError("Error creating file: %s\n", s.chars());
198     }
199     }
200     }
201    
202     #define FWCLOSE(fh) if (fh!=NULL && fh!=stdout) fclose(fh)
203     #define FRCLOSE(fh) if (fh!=NULL && fh!=stdin) fclose(fh)
204    
205     GHash<FqDupRec> dhash; //hash to keep track of duplicates
206    
207 gpertea 180 void addAdaptor(GVec<CASeqData>& adaptors, GStr& seq, GAlnTrimType trim_type);
208     int loadAdaptors(const char* fname);
209 gpertea 94
210 gpertea 170 void setupFiles(FILE*& f_in, FILE*& f_in2, FILE*& f_out, FILE*& f_out2,
211     GStr& s, GStr& infname, GStr& infname2);
212 gpertea 13 // uses outsuffix to generate output file names and open file handles as needed
213 gpertea 170
214 gpertea 13 void writeRead(FILE* f_out, GStr& rname, GStr& rinfo, GStr& rseq, GStr& rqv, int& outcounter);
215     void trash_report(char trashcode, GStr& rname, FILE* freport);
216    
217 gpertea 170 bool getFastxRec(GLineReader& fq, GStr& rseq, GStr& rqv,
218 gpertea 13 GStr& rname, GStr& rinfo, GStr& infname);
219    
220 gpertea 170 char process_read(GStr& rname, GStr& rseq, GStr& rqv, int &l5, int &l3);
221 gpertea 13 //returns 0 if the read was untouched, 1 if it was trimmed and a trash code if it was trashed
222    
223 gpertea 12 bool ntrim(GStr& rseq, int &l5, int &l3); //returns true if any trimming occured
224     bool qtrim(GStr& qvs, int &l5, int &l3); //return true if any trimming occured
225 gpertea 4 int dust(GStr& seq);
226 gpertea 94 bool trim_poly5(GStr &seq, int &l5, int &l3, const char* poly_seed); //returns true if any trimming occured
227     bool trim_poly3(GStr &seq, int &l5, int &l3, const char* poly_seed);
228 gpertea 180 bool trim_adaptor5(GStr& seq, int &l5, int &l3); //returns true if any trimming occured
229     bool trim_adaptor3(GStr& seq, int &l5, int &l3);
230 gpertea 4
231 gpertea 12 void convertPhred(char* q, int len);
232     void convertPhred(GStr& q);
233 gpertea 4
234     int main(int argc, char * const argv[]) {
235 gpertea 172 GArgs args(argc, argv, "YQDCRVAl:d:3:5:m:n:r:p:q:f:t:o:z:a:");
236 gpertea 4 int e;
237     if ((e=args.isError())>0) {
238     GMessage("%s\nInvalid argument: %s\n", USAGE, argv[e]);
239     exit(224);
240     }
241 gpertea 9 debug=(args.getOpt('Y')!=NULL);
242 gpertea 13 verbose=(args.getOpt('V')!=NULL);
243 gpertea 12 convert_phred=(args.getOpt('Q')!=NULL);
244 gpertea 4 doCollapse=(args.getOpt('C')!=NULL);
245     doDust=(args.getOpt('D')!=NULL);
246 gpertea 172 revCompl=(args.getOpt('R')!=NULL);
247 gpertea 103 if (args.getOpt('A')) doPolyTrim=false;
248 gpertea 12 /*
249 gpertea 4 rawFormat=(args.getOpt('R')!=NULL);
250     if (rawFormat) {
251     GError("Sorry, raw qseq format parsing is not implemented yet!\n");
252     }
253 gpertea 12 */
254 gpertea 4 prefix=args.getOpt('n');
255     GStr s=args.getOpt('l');
256 gpertea 170 if (!s.is_empty())
257 gpertea 4 min_read_len=s.asInt();
258 gpertea 12 s=args.getOpt('m');
259 gpertea 170 if (!s.is_empty())
260 gpertea 12 max_perc_N=s.asDouble();
261 gpertea 4 s=args.getOpt('d');
262     if (!s.is_empty()) {
263     dust_cutoff=s.asInt();
264     doDust=true;
265     }
266 gpertea 12 s=args.getOpt('q');
267     if (!s.is_empty()) {
268 gpertea 13 qvtrim_qmin=s.asInt();
269 gpertea 12 }
270 gpertea 13 s=args.getOpt('t');
271     if (!s.is_empty()) {
272     qvtrim_max=s.asInt();
273     }
274 gpertea 12 s=args.getOpt('p');
275     if (!s.is_empty()) {
276     int v=s.asInt();
277     if (v==33) {
278     qv_phredtype=33;
279     qv_cvtadd=31;
280     }
281     else if (v==64) {
282     qv_phredtype=64;
283     qv_cvtadd=-31;
284     }
285 gpertea 170 else
286 gpertea 12 GMessage("%s\nInvalid value for -p option (can only be 64 or 33)!\n",USAGE);
287     }
288 gpertea 94 s=args.getOpt('f');
289     if (!s.is_empty()) {
290 gpertea 180 loadAdaptors(s.chars());
291 gpertea 94 }
292 gpertea 180 bool fileAdaptors=adaptors5.Count()+adaptors3.Count();
293 gpertea 94 s=args.getOpt('5');
294     if (!s.is_empty()) {
295 gpertea 180 if (fileAdaptors)
296 gpertea 94 GError("Error: options -5 and -f cannot be used together!\n");
297     s.upper();
298 gpertea 180 addAdaptor(adaptors5, s, galn_TrimLeft);
299 gpertea 4 }
300 gpertea 94 s=args.getOpt('3');
301     if (!s.is_empty()) {
302 gpertea 180 if (fileAdaptors)
303 gpertea 94 GError("Error: options -3 and -f cannot be used together!\n");
304 gpertea 102 s.upper();
305 gpertea 180 addAdaptor(adaptors3, s, galn_TrimRight);
306 gpertea 4 }
307 gpertea 94 s=args.getOpt('y');
308 gpertea 13 if (!s.is_empty()) {
309 gpertea 94 int minmatch=s.asInt();
310     poly_minScore=minmatch*poly_m_score;
311 gpertea 13 }
312 gpertea 170
313 gpertea 13 if (args.getOpt('o')!=NULL) outsuffix=args.getOpt('o');
314 gpertea 76 else outsuffix="-";
315 gpertea 4 trashReport= (args.getOpt('r')!=NULL);
316 gpertea 13 int fcount=args.startNonOpt();
317     if (fcount==0) {
318 gpertea 4 GMessage(USAGE);
319     exit(224);
320     }
321 gpertea 13 if (fcount>1 && doCollapse) {
322     GError("%s Sorry, the -C option only works with a single input.\n", USAGE);
323     }
324     //openfw(f_out, args, 'o');
325     //if (f_out==NULL) f_out=stdout;
326 gpertea 4 if (trashReport)
327     openfw(freport, args, 'r');
328     char* infile=NULL;
329 gpertea 103
330 gpertea 180 if (adaptors5.Count()>0)
331 gpertea 177 //gxmem_l=new CGreedyAlignData(match_reward, mismatch_penalty, Xdrop-2);
332     gxmem_l=new CGreedyAlignData(match_reward, mismatch_penalty, Xdrop);
333 gpertea 180 if (adaptors3.Count()>0)
334 gpertea 103 gxmem_r=new CGreedyAlignData(match_reward, mismatch_penalty, Xdrop);
335    
336 gpertea 4 while ((infile=args.nextNonOpt())!=NULL) {
337 gpertea 103 //for each input file
338 gpertea 13 int incounter=0; //counter for input reads
339     int outcounter=0; //counter for output reads
340     int trash_s=0; //too short from the get go
341     int trash_Q=0;
342     int trash_N=0;
343     int trash_D=0;
344 gpertea 103 int trash_poly=0;
345 gpertea 13 int trash_A3=0;
346     int trash_A5=0;
347     s=infile;
348     GStr infname;
349     GStr infname2;
350     FILE* f_in=NULL;
351     FILE* f_in2=NULL;
352     FILE* f_out=NULL;
353     FILE* f_out2=NULL;
354     bool paired_reads=false;
355     setupFiles(f_in, f_in2, f_out, f_out2, s, infname, infname2);
356     GLineReader fq(f_in);
357     GLineReader* fq2=NULL;
358     if (f_in2!=NULL) {
359     fq2=new GLineReader(f_in2);
360     paired_reads=true;
361     }
362     GStr rseq, rqv, seqid, seqinfo;
363     GStr rseq2, rqv2, seqid2, seqinfo2;
364     while (getFastxRec(fq, rseq, rqv, seqid, seqinfo, infname)) {
365     incounter++;
366     int a5=0, a3=0, b5=0, b3=0;
367     char tcode=0, tcode2=0;
368     tcode=process_read(seqid, rseq, rqv, a5, a3);
369 gpertea 103 if (fq2!=NULL) {
370 gpertea 13 getFastxRec(*fq2, rseq2, rqv2, seqid2, seqinfo2, infname2);
371     if (seqid.substr(0,seqid.length()-1)!=seqid2.substr(0,seqid2.length()-1)) {
372     GError("Error: no paired match for read %s vs %s (%s,%s)\n",
373     seqid.chars(), seqid2.chars(), infname.chars(), infname2.chars());
374     }
375     tcode2=process_read(seqid2, rseq2, rqv2, b5, b3);
376     //decide what to do with this pair and print rseq2 if the pair makes it
377     if (tcode>1 && tcode2<=1) {
378     //"untrash" rseq
379     if (a3-a5+1<min_read_len) {
380     a5=1;
381     if (a3<min_read_len) { a3= GMIN(rseq.length()-1, min_read_len+1); }
382 gpertea 4 }
383 gpertea 13 tcode=1;
384     }
385     else if (tcode<=1 && tcode2>1) {
386     //"untrash" rseq2
387     if (b3-b5+1<min_read_len) {
388     b5=1;
389     if (b3<min_read_len) { b3= GMIN((rseq2.length()-1),(min_read_len+1)); }
390     }
391     tcode2=1;
392     }
393     if (tcode<=1) { //trimmed or left intact -- write it!
394     if (tcode>0) {
395     rseq2=rseq2.substr(b5,b3-b5+1);
396     if (!rqv2.is_empty()) rqv2=rqv2.substr(b5,b3-b5+1);
397 gpertea 4 }
398 gpertea 13 int nocounter=0;
399     writeRead(f_out2, seqid2, seqinfo2, rseq2, rqv2, nocounter);
400     }
401 gpertea 103 } //pair read
402 gpertea 13 if (tcode>1) { //trashed
403 gpertea 103 #ifdef GDEBUG
404 gpertea 175 GMessage(" !!!!TRASH code = %c\n",tcode);
405 gpertea 103 #endif
406 gpertea 13 if (tcode=='s') trash_s++;
407 gpertea 103 else if (tcode=='A' || tcode=='T') trash_poly++;
408 gpertea 13 else if (tcode=='Q') trash_Q++;
409     else if (tcode=='N') trash_N++;
410     else if (tcode=='D') trash_D++;
411     else if (tcode=='3') trash_A3++;
412     else if (tcode=='5') trash_A5++;
413     if (trashReport) trash_report(tcode, seqid, freport);
414     }
415     else if (!doCollapse) { //write it
416     if (tcode>0) {
417     rseq=rseq.substr(a5,a3-a5+1);
418     if (!rqv.is_empty()) rqv=rqv.substr(a5,a3-a5+1);
419     }
420 gpertea 103 #ifdef GDEBUG
421     GMessage(" After trimming:\n");
422 gpertea 105 GMessage("%s\n",rseq.chars());
423 gpertea 103 #endif
424 gpertea 13 writeRead(f_out, seqid, seqinfo, rseq, rqv, outcounter);
425     }
426     } //for each fastq record
427     delete fq2;
428     FRCLOSE(f_in);
429     FRCLOSE(f_in2);
430     if (doCollapse) {
431     outcounter=0;
432     int maxdup_count=1;
433     char* maxdup_seq=NULL;
434     dhash.startIterate();
435     FqDupRec* qd=NULL;
436     char* seq=NULL;
437     while ((qd=dhash.NextData(seq))!=NULL) {
438     GStr rseq(seq);
439     //do the dusting here
440     if (doDust) {
441     int dustbases=dust(rseq);
442     if (dustbases>(rseq.length()>>1)) {
443     if (trashReport && qd->firstname!=NULL) {
444     fprintf(freport, "%s_x%d\tD\n",qd->firstname, qd->count);
445 gpertea 4 }
446 gpertea 13 trash_D+=qd->count;
447     continue;
448     }
449     }
450     outcounter++;
451 gpertea 170 if (qd->count>maxdup_count) {
452 gpertea 13 maxdup_count=qd->count;
453     maxdup_seq=seq;
454     }
455     if (isfasta) {
456     if (prefix.is_empty()) {
457 gpertea 170 fprintf(f_out, ">%s_x%d\n%s\n", qd->firstname, qd->count,
458 gpertea 13 rseq.chars());
459 gpertea 4 }
460 gpertea 13 else { //use custom read name
461     fprintf(f_out, ">%s%08d_x%d\n%s\n", prefix.chars(), outcounter,
462     qd->count, rseq.chars());
463 gpertea 4 }
464 gpertea 13 }
465     else { //fastq format
466     if (convert_phred) convertPhred(qd->qv, qd->len);
467     if (prefix.is_empty()) {
468 gpertea 170 fprintf(f_out, "@%s_x%d\n%s\n+\n%s\n", qd->firstname, qd->count,
469 gpertea 13 rseq.chars(), qd->qv);
470 gpertea 4 }
471 gpertea 13 else { //use custom read name
472     fprintf(f_out, "@%s%08d_x%d\n%s\n+\n%s\n", prefix.chars(), outcounter,
473     qd->count, rseq.chars(), qd->qv);
474     }
475     }
476     }//for each element of dhash
477     if (maxdup_count>1) {
478     GMessage("Maximum read multiplicity: x %d (read: %s)\n",maxdup_count, maxdup_seq);
479 gpertea 4 }
480 gpertea 13 } //collapse entries
481     if (verbose) {
482     if (paired_reads) {
483     GMessage(">Input files : %s , %s\n", infname.chars(), infname2.chars());
484     GMessage("Number of input pairs :%9d\n", incounter);
485     GMessage(" Output pairs :%9d\n", outcounter);
486     }
487     else {
488     GMessage(">Input file : %s\n", infname.chars());
489     GMessage("Number of input reads :%9d\n", incounter);
490     GMessage(" Output reads :%9d\n", outcounter);
491     }
492     GMessage("------------------------------------\n");
493     if (num_trimmed5)
494     GMessage(" 5' trimmed :%9d (min. trim: %d)\n", num_trimmed5, min_trimmed5);
495     if (num_trimmed3)
496     GMessage(" 3' trimmed :%9d (min. trim: %d)\n", num_trimmed3, min_trimmed3);
497     GMessage("------------------------------------\n");
498     if (trash_s>0)
499     GMessage(" Trashed by length:%9d\n", trash_s);
500     if (trash_N>0)
501     GMessage(" Trashed by N%%:%9d\n", trash_N);
502     if (trash_Q>0)
503     GMessage("Trashed by low quality:%9d\n", trash_Q);
504 gpertea 103 if (trash_poly>0)
505     GMessage(" Trashed by poly-A/T:%9d\n", trash_poly);
506 gpertea 13 if (trash_A5>0)
507 gpertea 180 GMessage(" Trashed by 5' adaptor:%9d\n", trash_A5);
508 gpertea 13 if (trash_A3>0)
509 gpertea 180 GMessage(" Trashed by 3' adaptor:%9d\n", trash_A3);
510 gpertea 13 }
511     if (trashReport) {
512     FWCLOSE(freport);
513 gpertea 4 }
514 gpertea 13 FWCLOSE(f_out);
515     FWCLOSE(f_out2);
516     } //while each input file
517 gpertea 103 delete gxmem_l;
518     delete gxmem_r;
519 gpertea 12 //getc(stdin);
520 gpertea 4 }
521    
522     class NData {
523     public:
524     int NPos[1024]; //there should be no reads longer than 1K ?
525     int NCount;
526     int end5;
527     int end3;
528     int seqlen;
529     double perc_N; //percentage of Ns in end5..end3 range only!
530     const char* seq;
531     bool valid;
532     NData() {
533 gpertea 170 seqlen=0;
534 gpertea 4 NCount=0;
535     end5=0;
536     end3=0;
537     seq=NULL;
538     perc_N=0;
539     valid=true;
540     }
541     void init(GStr& rseq) {
542     NCount=0;
543     perc_N=0;
544     seqlen=rseq.length();
545     seq=rseq.chars();
546     end5=1;
547     end3=seqlen;
548     for (int i=0;i<seqlen;i++)
549 gpertea 12 if (seq[i]=='N') {// if (!ichrInStr(rseq[i], "ACGT")
550 gpertea 4 NPos[NCount]=i;
551     NCount++;
552     }
553     perc_N=(NCount*100.0)/seqlen;
554     valid=true;
555     }
556     void N_calc() { //only in the region after trimming
557     int n=0;
558     for (int i=end3-1;i<end5;i++) {
559     if (seq[i]=='N') n++;
560     }
561     perc_N=(n*100.0)/(end5-end3+1);
562     }
563     };
564 gpertea 170
565 gpertea 4 static NData feat;
566     int perc_lenN=12; // incremental distance from ends, in percentage of
567     // sequence length, where N-trimming is done (default:12 %) (autolimited to 20)
568 gpertea 170
569 gpertea 4 void N_analyze(int l5, int l3, int p5, int p3) {
570     /* assumes feat was filled properly */
571     int old_dif, t5,t3,v;
572 gpertea 12 if (l3<l5+2 || p5>p3 ) {
573 gpertea 4 feat.end5=l5+1;
574     feat.end3=l3+1;
575 gpertea 170 return;
576 gpertea 4 }
577    
578     t5=feat.NPos[p5]-l5;
579     t3=l3-feat.NPos[p3];
580     old_dif=p3-p5;
581     v=(int)((((double)(l3-l5))*perc_lenN)/100);
582 gpertea 170 if (v>20) v=20; /* enforce N-search limit for very long reads */
583 gpertea 4 if (t5 < v ) {
584     l5=feat.NPos[p5]+1;
585     p5++;
586     }
587     if (t3 < v) {
588     l3=feat.NPos[p3]-1;
589     p3--;
590     }
591     /* restNs=p3-p5; number of Ns in the new CLR */
592     if (p3-p5==old_dif) { /* no change, return */
593     /* don't trim if this may shorten the read too much */
594     //if (l5-l3<min_read_len) return;
595     feat.end5=l5+1;
596     feat.end3=l3+1;
597     return;
598     }
599 gpertea 170 else
600 gpertea 4 N_analyze(l5,l3, p5,p3);
601     }
602    
603    
604 gpertea 12 bool qtrim(GStr& qvs, int &l5, int &l3) {
605 gpertea 13 if (qvtrim_qmin==0 || qvs.is_empty()) return false;
606 gpertea 12 if (qv_phredtype==0) {
607     //try to guess the Phred type
608     int vmin=256, vmax=0;
609     for (int i=0;i<qvs.length();i++) {
610     if (vmin>qvs[i]) vmin=qvs[i];
611     if (vmax<qvs[i]) vmax=qvs[i];
612     }
613     if (vmin<64) { qv_phredtype=33; qv_cvtadd=31; }
614     if (vmax>95) { qv_phredtype=64; qv_cvtadd=-31; }
615     if (qv_phredtype==0) {
616     GError("Error: couldn't determine Phred type, please use the -p33 or -p64 !\n");
617     }
618 gpertea 13 if (verbose)
619     GMessage("Input reads have Phred-%d quality values.\n", (qv_phredtype==33 ? 33 : 64));
620     } //guessing Phred type
621 gpertea 12 for (l3=qvs.length()-1;l3>2;l3--) {
622 gpertea 13 if (qvs[l3]-qv_phredtype>=qvtrim_qmin && qvs[l3-1]-qv_phredtype>=qvtrim_qmin) break;
623 gpertea 12 }
624     //just in case, check also the 5' the end (?)
625     for (l5=0;l5<qvs.length()-3;l5++) {
626 gpertea 13 if (qvs[l5]-qv_phredtype>=qvtrim_qmin && qvs[l5+1]-qv_phredtype>=qvtrim_qmin) break;
627 gpertea 12 }
628 gpertea 13 if (qvtrim_max>0) {
629     if (qvs.length()-1-l3>qvtrim_max) l3=qvs.length()-1-qvtrim_max;
630     if (l5>qvtrim_max) l5=qvtrim_max;
631     }
632 gpertea 12 return (l5>0 || l3<qvs.length()-1);
633     }
634    
635     bool ntrim(GStr& rseq, int &l5, int &l3) {
636     //count Ns in the sequence, trim N-rich ends
637 gpertea 4 feat.init(rseq);
638     l5=feat.end5-1;
639     l3=feat.end3-1;
640 gpertea 170 N_analyze(feat.end5-1, feat.end3-1, 0, feat.NCount-1);
641 gpertea 12 if (l5==feat.end5-1 && l3==feat.end3-1) {
642     if (feat.perc_N>max_perc_N) {
643     feat.valid=false;
644     l3=l5+1;
645     return true;
646     }
647     else {
648     return false; //nothing changed
649     }
650     }
651 gpertea 4 l5=feat.end5-1;
652     l3=feat.end3-1;
653     if (l3-l5+1<min_read_len) {
654     feat.valid=false;
655     return true;
656     }
657     feat.N_calc();
658 gpertea 170
659 gpertea 12 if (feat.perc_N>max_perc_N) {
660 gpertea 4 feat.valid=false;
661     l3=l5+1;
662     return true;
663     }
664     return true;
665     }
666    
667     //--------------- dust functions ----------------
668     class DNADuster {
669     public:
670 gpertea 170 int dustword;
671     int dustwindow;
672     int dustwindow2;
673 gpertea 4 int dustcutoff;
674     int mv, iv, jv;
675     int counts[32*32*32];
676     int iis[32*32*32];
677     DNADuster(int cutoff=16, int winsize=32, int wordsize=3) {
678     dustword=wordsize;
679     dustwindow=winsize;
680     dustwindow2 = (winsize>>1);
681     dustcutoff=cutoff;
682     mv=0;
683     iv=0;
684     jv=0;
685     }
686     void setWindowSize(int value) {
687     dustwindow = value;
688     dustwindow2 = (dustwindow >> 1);
689     }
690     void setWordSize(int value) {
691     dustword=value;
692     }
693     void wo1(int len, const char* s, int ivv) {
694     int i, ii, j, v, t, n, n1, sum;
695     int js, nis;
696     n = 32 * 32 * 32;
697     n1 = n - 1;
698     nis = 0;
699     i = 0;
700     ii = 0;
701     sum = 0;
702     v = 0;
703     for (j=0; j < len; j++, s++) {
704     ii <<= 5;
705     if (*s<=32) {
706     i=0;
707     continue;
708     }
709     ii |= *s - 'A'; //assume uppercase!
710     ii &= n1;
711     i++;
712     if (i >= dustword) {
713     for (js=0; js < nis && iis[js] != ii; js++) ;
714     if (js == nis) {
715     iis[nis] = ii;
716     counts[ii] = 0;
717     nis++;
718     }
719     if ((t = counts[ii]) > 0) {
720     sum += t;
721     v = 10 * sum / j;
722     if (mv < v) {
723     mv = v;
724     iv = ivv;
725     jv = j;
726     }
727     }
728     counts[ii]++;
729     }
730     }
731     }
732    
733     int wo(int len, const char* s, int* beg, int* end) {
734     int i, l1;
735     l1 = len - dustword + 1;
736     if (l1 < 0) {
737     *beg = 0;
738     *end = len - 1;
739     return 0;
740     }
741     mv = 0;
742     iv = 0;
743     jv = 0;
744     for (i=0; i < l1; i++) {
745     wo1(len-i, s+i, i);
746     }
747     *beg = iv;
748     *end = iv + jv;
749     return mv;
750     }
751    
752     void dust(const char* seq, char* seqmsk, int seqlen, int cutoff=0) { //, maskFunc maskfn) {
753     int i, j, l, a, b, v;
754     if (cutoff==0) cutoff=dustcutoff;
755     a=0;b=0;
756     //GMessage("Dust cutoff=%d\n", cutoff);
757     for (i=0; i < seqlen; i += dustwindow2) {
758     l = (seqlen > i+dustwindow) ? dustwindow : seqlen-i;
759     v = wo(l, seq+i, &a, &b);
760     if (v > cutoff) {
761     //for (j = a; j <= b && j < dustwindow2; j++) {
762     for (j = a; j <= b; j++) {
763     seqmsk[i+j]='N';//could be made lowercase instead
764     }
765     }
766     }
767 gpertea 170 //return first;
768 gpertea 4 }
769     };
770    
771     static DNADuster duster;
772    
773     int dust(GStr& rseq) {
774     char* seq=Gstrdup(rseq.chars());
775     duster.dust(rseq.chars(), seq, rseq.length(), dust_cutoff);
776     //check the number of Ns:
777     int ncount=0;
778     for (int i=0;i<rseq.length();i++) {
779     if (seq[i]=='N') ncount++;
780     }
781     GFREE(seq);
782     return ncount;
783     }
784    
785 gpertea 94 struct SLocScore {
786     int pos;
787     int score;
788     SLocScore(int p=0,int s=0) {
789     pos=p;
790     score=s;
791     }
792     void set(int p, int s) {
793     pos=p;
794     score=s;
795     }
796     void add(int p, int add) {
797     pos=p;
798     score+=add;
799     }
800     };
801 gpertea 12
802 gpertea 94 bool trim_poly3(GStr &seq, int &l5, int &l3, const char* poly_seed) {
803 gpertea 103 if (!doPolyTrim) return false;
804 gpertea 94 int rlen=seq.length();
805     l5=0;
806     l3=rlen-1;
807     int32 seedVal=*(int32*)poly_seed;
808     char polyChar=poly_seed[0];
809     //assumes N trimming was already done
810     //so a poly match should be very close to the end of the read
811     // -- find the initial match (seed)
812 gpertea 108 int lmin=GMAX((rlen-16), 0);
813 gpertea 94 int li;
814     for (li=rlen-4;li>lmin;li--) {
815     if (seedVal==*(int*)&(seq[li])) {
816     break;
817     }
818 gpertea 12 }
819 gpertea 94 if (li<=lmin) return false;
820     //seed found, try to extend it both ways
821     //extend right
822     int ri=li+3;
823     SLocScore loc(ri, poly_m_score<<2);
824     SLocScore maxloc(loc);
825     //extend right
826 gpertea 170 while (ri<rlen-1) {
827 gpertea 94 ri++;
828     if (seq[ri]==polyChar) {
829     loc.add(ri,poly_m_score);
830     }
831     else if (seq[ri]=='N') {
832     loc.add(ri,0);
833     }
834     else { //mismatch
835     loc.add(ri,poly_mis_score);
836     if (maxloc.score-loc.score>poly_dropoff_score) break;
837     }
838     if (maxloc.score<=loc.score) {
839     maxloc=loc;
840     }
841 gpertea 12 }
842 gpertea 94 ri=maxloc.pos;
843 gpertea 103 if (ri<rlen-6) return false; //no trimming wanted, too far from 3' end
844 gpertea 94 //ri = right boundary for the poly match
845     //extend left
846     loc.set(li, maxloc.score);
847     maxloc.pos=li;
848     while (li>0) {
849     li--;
850     if (seq[li]==polyChar) {
851     loc.add(li,poly_m_score);
852     }
853     else if (seq[li]=='N') {
854     loc.add(li,0);
855     }
856     else { //mismatch
857     loc.add(li,poly_mis_score);
858     if (maxloc.score-loc.score>poly_dropoff_score) break;
859     }
860     if (maxloc.score<=loc.score) {
861     maxloc=loc;
862     }
863     }
864 gpertea 105 li=maxloc.pos;
865 gpertea 108 if ((maxloc.score==poly_minScore && ri==rlen-1) ||
866     (maxloc.score>poly_minScore && ri>=rlen-3) ||
867     (maxloc.score>(poly_minScore*3) && ri>=rlen-8)) {
868 gpertea 105 //trimming this li-ri match at 3' end
869     l3=li-1;
870     if (l3<0) l3=0;
871 gpertea 94 return true;
872     }
873     return false;
874 gpertea 12 }
875    
876 gpertea 94 bool trim_poly5(GStr &seq, int &l5, int &l3, const char* poly_seed) {
877 gpertea 103 if (!doPolyTrim) return false;
878 gpertea 88 int rlen=seq.length();
879     l5=0;
880     l3=rlen-1;
881 gpertea 94 int32 seedVal=*(int32*)poly_seed;
882     char polyChar=poly_seed[0];
883 gpertea 88 //assumes N trimming was already done
884     //so a poly match should be very close to the end of the read
885     // -- find the initial match (seed)
886 gpertea 108 int lmax=GMIN(12, rlen-4);//how far from 5' end to look for 4-mer seeds
887 gpertea 94 int li;
888     for (li=0;li<=lmax;li++) {
889     if (seedVal==*(int*)&(seq[li])) {
890 gpertea 88 break;
891     }
892     }
893 gpertea 94 if (li>lmax) return false;
894     //seed found, try to extend it both ways
895     //extend left
896     int ri=li+3; //save rightmost base of the seed
897     SLocScore loc(li, poly_m_score<<2);
898     SLocScore maxloc(loc);
899     while (li>0) {
900     li--;
901     if (seq[li]==polyChar) {
902     loc.add(li,poly_m_score);
903     }
904     else if (seq[li]=='N') {
905     loc.add(li,0);
906     }
907     else { //mismatch
908     loc.add(li,poly_mis_score);
909     if (maxloc.score-loc.score>poly_dropoff_score) break;
910     }
911     if (maxloc.score<=loc.score) {
912     maxloc=loc;
913     }
914 gpertea 88 }
915 gpertea 94 li=maxloc.pos;
916 gpertea 103 if (li>5) return false; //no trimming wanted, too far from 5' end
917 gpertea 94 //li = right boundary for the poly match
918    
919     //extend right
920     loc.set(ri, maxloc.score);
921     maxloc.pos=ri;
922 gpertea 170 while (ri<rlen-1) {
923 gpertea 94 ri++;
924     if (seq[ri]==polyChar) {
925     loc.add(ri,poly_m_score);
926     }
927     else if (seq[ri]=='N') {
928     loc.add(ri,0);
929     }
930     else { //mismatch
931     loc.add(ri,poly_mis_score);
932     if (maxloc.score-loc.score>poly_dropoff_score) break;
933     }
934     if (maxloc.score<=loc.score) {
935     maxloc=loc;
936     }
937     }
938 gpertea 105 ri=maxloc.pos;
939 gpertea 103 if ((maxloc.score==poly_minScore && li==0) ||
940     (maxloc.score>poly_minScore && li<2)
941 gpertea 108 || (maxloc.score>(poly_minScore*3) && li<8)) {
942 gpertea 105 //adjust l5 to reflect this trimming of 5' end
943     l5=ri+1;
944     if (l5>rlen-1) l5=rlen-1;
945 gpertea 94 return true;
946     }
947     return false;
948 gpertea 88 }
949    
950 gpertea 180 bool trim_adaptor3(GStr& seq, int&l5, int &l3) {
951     if (adaptors3.Count()==0) return false;
952 gpertea 4 int rlen=seq.length();
953     l5=0;
954     l3=rlen-1;
955 gpertea 103 bool trimmed=false;
956 gpertea 175 GStr wseq(seq);
957 gpertea 103 int wlen=rlen;
958 gpertea 175 GXSeqData seqdata;
959 gpertea 180 int numruns=revCompl ? 2 : 1;
960 gpertea 182
961 gpertea 180 for (int ai=0;ai<adaptors3.Count();ai++) {
962     for (int r=0;r<numruns;r++) {
963     if (r) {
964     seqdata.update(adaptors3[ai].seqr.chars(), adaptors3[ai].seqr.length(),
965     adaptors3[ai].pzr, wseq.chars(), wlen, adaptors3[ai].amlen);
966     }
967     else {
968     seqdata.update(adaptors3[ai].seq.chars(), adaptors3[ai].seq.length(),
969     adaptors3[ai].pz, wseq.chars(), wlen, adaptors3[ai].amlen);
970     }
971    
972 gpertea 182 GXAlnInfo* aln=match_adaptor(seqdata, adaptors3[ai].trim_type, gxmem_r, 86);
973     if (aln) {
974 gpertea 103 trimmed=true;
975 gpertea 175 //keep unmatched region on the left OR right (the longer one)
976 gpertea 182 if (aln->sl > wlen-aln->sr) {
977 gpertea 175 //keep left side
978 gpertea 182 l3-=(wlen-aln->sl+1);
979 gpertea 175 if (l3<0) l3=0;
980     }
981     else { //keep right side
982 gpertea 182 l5+=aln->sr;
983 gpertea 175 if (l5>=rlen) l5=rlen-1;
984     }
985 gpertea 182 delete aln;
986 gpertea 103 if (l3-l5+1<min_read_len) return true;
987     wseq=seq.substr(l5,l3-l5+1);
988     wlen=wseq.length();
989     }
990 gpertea 180 }//forward and reverse adaptors
991     }//for each 3' adaptor
992 gpertea 103 return trimmed;
993 gpertea 4 }
994    
995 gpertea 180 bool trim_adaptor5(GStr& seq, int&l5, int &l3) {
996     if (adaptors5.Count()==0) return false;
997 gpertea 4 int rlen=seq.length();
998     l5=0;
999     l3=rlen-1;
1000 gpertea 102 bool trimmed=false;
1001 gpertea 175 GStr wseq(seq);
1002 gpertea 103 int wlen=rlen;
1003 gpertea 175 GXSeqData seqdata;
1004 gpertea 180 int numruns=revCompl ? 2 : 1;
1005     for (int ai=0;ai<adaptors5.Count();ai++) {
1006     for (int r=0;r<numruns;r++) {
1007     if (r) {
1008     seqdata.update(adaptors5[ai].seqr.chars(), adaptors5[ai].seqr.length(),
1009     adaptors5[ai].pzr, wseq.chars(), wlen, adaptors5[ai].amlen);
1010     }
1011     else {
1012     seqdata.update(adaptors5[ai].seq.chars(), adaptors5[ai].seq.length(),
1013     adaptors5[ai].pz, wseq.chars(), wlen, adaptors5[ai].amlen);
1014     }
1015 gpertea 182 GXAlnInfo* aln=match_adaptor(seqdata, adaptors5[ai].trim_type, gxmem_l, 90);
1016     if (aln) {
1017 gpertea 180 trimmed=true;
1018 gpertea 182 if (aln->sl-1 > wlen-aln->sr) {
1019 gpertea 180 //keep left side
1020 gpertea 182 l3-=(wlen-aln->sl+1);
1021 gpertea 180 if (l3<0) l3=0;
1022     }
1023     else { //keep right side
1024 gpertea 182 l5+=aln->sr;
1025 gpertea 180 if (l5>=rlen) l5=rlen-1;
1026     }
1027 gpertea 182 delete aln;
1028 gpertea 180 if (l3-l5+1<min_read_len) return true;
1029     wseq=seq.substr(l5,l3-l5+1);
1030     wlen=wseq.length();
1031     }
1032     } //forward and reverse?
1033     }//for each 5' adaptor
1034 gpertea 102 return trimmed;
1035 gpertea 98 }
1036 gpertea 4
1037 gpertea 170 //convert qvs to/from phred64 from/to phread33
1038 gpertea 12 void convertPhred(GStr& q) {
1039     for (int i=0;i<q.length();i++) q[i]+=qv_cvtadd;
1040 gpertea 4 }
1041    
1042 gpertea 12 void convertPhred(char* q, int len) {
1043     for (int i=0;i<len;i++) q[i]+=qv_cvtadd;
1044 gpertea 4 }
1045    
1046 gpertea 170 bool getFastxRec(GLineReader& fq, GStr& rseq, GStr& rqv,
1047 gpertea 13 GStr& rname, GStr& rinfo, GStr& infname) {
1048     rseq="";
1049     rqv="";
1050     rname="";
1051     rinfo="";
1052     if (fq.eof()) return false;
1053     char* l=fq.getLine();
1054     while (l!=NULL && (l[0]==0 || isspace(l[0]))) l=fq.getLine(); //ignore empty lines
1055     if (l==NULL) return false;
1056     /* if (rawFormat) {
1057     //TODO: implement raw qseq parsing here
1058     //if (raw type=N) then continue; //skip invalid/bad records
1059     } //raw qseq format
1060     else { // FASTQ or FASTA */
1061     isfasta=(l[0]=='>');
1062 gpertea 170 if (!isfasta && l[0]!='@') GError("Error: fasta/fastq record marker not found(%s)\n%s\n",
1063 gpertea 13 infname.chars(), l);
1064     GStr s(l);
1065     rname=&(l[1]);
1066     for (int i=0;i<rname.length();i++)
1067 gpertea 170 if (rname[i]<=' ') {
1068     if (i<rname.length()-2) rinfo=rname.substr(i+1);
1069     rname.cut(i);
1070     break;
1071 gpertea 13 }
1072     //now get the sequence
1073 gpertea 170 if ((l=fq.getLine())==NULL)
1074 gpertea 13 GError("Error: unexpected EOF after header for read %s (%s)\n",
1075     rname.chars(), infname.chars());
1076     rseq=l; //this must be the DNA line
1077     while ((l=fq.getLine())!=NULL) {
1078     //seq can span multiple lines
1079     if (l[0]=='>' || l[0]=='+') {
1080 gpertea 170 fq.pushBack();
1081 gpertea 13 break; //
1082     }
1083     rseq+=l;
1084 gpertea 170 } //check for multi-line seq
1085 gpertea 13 if (!isfasta) { //reading fastq quality values, which can also be multi-line
1086     if ((l=fq.getLine())==NULL)
1087     GError("Error: unexpected EOF after sequence for %s\n", rname.chars());
1088     if (l[0]!='+') GError("Error: fastq qv header marker not detected!\n");
1089 gpertea 170 if ((l=fq.getLine())==NULL)
1090 gpertea 13 GError("Error: unexpected EOF after qv header for %s\n", rname.chars());
1091     rqv=l;
1092 gpertea 170 //if (rqv.length()!=rseq.length())
1093 gpertea 13 // GError("Error: qv len != seq len for %s\n", rname.chars());
1094     while (rqv.length()<rseq.length() && ((l=fq.getLine())!=NULL)) {
1095     rqv+=l; //append to qv string
1096     }
1097     }// fastq
1098     // } //<-- FASTA or FASTQ
1099 gpertea 94 rseq.upper();
1100 gpertea 13 return true;
1101     }
1102    
1103 gpertea 105 #ifdef GDEBUG
1104     void showTrim(GStr& s, int l5, int l3) {
1105 gpertea 177 if (l5>0 || l3==0) {
1106 gpertea 105 color_bg(c_red);
1107     }
1108     for (int i=0;i<s.length()-1;i++) {
1109     if (i && i==l5) color_resetbg();
1110     fprintf(stderr, "%c", s[i]);
1111 gpertea 177 if (i && i==l3) color_bg(c_red);
1112 gpertea 105 }
1113     fprintf(stderr, "%c", s[s.length()-1]);
1114     color_reset();
1115     fprintf(stderr, "\n");
1116     }
1117     #endif
1118    
1119 gpertea 13 char process_read(GStr& rname, GStr& rseq, GStr& rqv, int &l5, int &l3) {
1120 gpertea 170 //returns 0 if the read was untouched, 1 if it was just trimmed
1121 gpertea 13 // and a trash code if it was trashed
1122     l5=0;
1123     l3=rseq.length()-1;
1124 gpertea 103 #ifdef GDEBUG
1125 gpertea 108 //rseq.reverse();
1126 gpertea 103 GMessage(">%s\n", rname.chars());
1127 gpertea 105 GMessage("%s\n",rseq.chars());
1128 gpertea 103 #endif
1129 gpertea 13 if (l3-l5+1<min_read_len) {
1130     return 's';
1131     }
1132     GStr wseq(rseq.chars());
1133     GStr wqv(rqv.chars());
1134     int w5=l5;
1135     int w3=l3;
1136     //first do the q-based trimming
1137     if (qvtrim_qmin!=0 && !wqv.is_empty() && qtrim(wqv, w5, w3)) { // qv-threshold trimming
1138     if (w3-w5+1<min_read_len) {
1139     return 'Q'; //invalid read
1140     }
1141     //-- keep only the w5..w3 range
1142     l5=w5;
1143     l3=w3;
1144     wseq=wseq.substr(w5, w3-w5+1);
1145     if (!wqv.is_empty())
1146     wqv=wqv.substr(w5, w3-w5+1);
1147     } //qv trimming
1148     // N-trimming on the remaining read seq
1149     if (ntrim(wseq, w5, w3)) {
1150     //GMessage("before: %s\n",wseq.chars());
1151     //GMessage("after : %s (%d)\n",wseq.substr(w5,w3-w5+1).chars(),w3-w5+1);
1152     l5+=w5;
1153     l3-=(wseq.length()-1-w3);
1154     if (w3-w5+1<min_read_len) {
1155     return 'N'; //to be trashed
1156     }
1157     //-- keep only the w5..w3 range
1158     wseq=wseq.substr(w5, w3-w5+1);
1159     if (!wqv.is_empty())
1160     wqv=wqv.substr(w5, w3-w5+1);
1161     w5=0;
1162     w3=wseq.length()-1;
1163     }
1164 gpertea 103 char trim_code;
1165 gpertea 180 //clean the more dirty end first - 3'
1166     int prev_w5=0;
1167     int prev_w3=0;
1168     bool w3upd=true;
1169     bool w5upd=true;
1170 gpertea 103 do {
1171     trim_code=0;
1172 gpertea 180 if (w3upd && trim_poly3(wseq, w5, w3, polyA_seed)) {
1173 gpertea 103 trim_code='A';
1174     }
1175 gpertea 180 else if (w3upd && trim_poly3(wseq, w5, w3, polyT_seed)) {
1176 gpertea 103 trim_code='T';
1177     }
1178 gpertea 180 else if (w5upd && trim_poly5(wseq, w5, w3, polyA_seed)) {
1179 gpertea 103 trim_code='A';
1180     }
1181 gpertea 180 else if (w5upd && trim_poly5(wseq, w5, w3, polyT_seed)) {
1182 gpertea 103 trim_code='T';
1183     }
1184 gpertea 180 else if (trim_adaptor5(wseq, w5, w3)) {
1185     trim_code='5';
1186     }
1187     else if (trim_adaptor3(wseq, w5, w3)) {
1188 gpertea 103 trim_code='3';
1189     }
1190     if (trim_code) {
1191 gpertea 180 w3upd=(w3!=prev_w3);
1192     w5upd=(w5!=prev_w5);
1193     if (w3upd) prev_w3=w3;
1194     if (w5upd) prev_w5=w5;
1195     #ifdef GDEBUG
1196 gpertea 105 GMessage("#### TRIM by '%c' code ( w5-w3 = %d-%d ):\n",trim_code, w5,w3);
1197     showTrim(wseq, w5, w3);
1198 gpertea 180 #endif
1199 gpertea 13 int trimlen=wseq.length()-(w3-w5+1);
1200 gpertea 103 num_trimmed3++;
1201 gpertea 170 if (trimlen<min_trimmed3)
1202 gpertea 103 min_trimmed3=trimlen;
1203 gpertea 13 l5+=w5;
1204     l3-=(wseq.length()-1-w3);
1205     if (w3-w5+1<min_read_len) {
1206 gpertea 103 return trim_code;
1207 gpertea 13 }
1208     //-- keep only the w5..w3 range
1209     wseq=wseq.substr(w5, w3-w5+1);
1210     if (!wqv.is_empty())
1211     wqv=wqv.substr(w5, w3-w5+1);
1212 gpertea 103 }//trimming at 3' end
1213     } while (trim_code);
1214    
1215 gpertea 13 if (doCollapse) {
1216     //keep read for later
1217     FqDupRec* dr=dhash.Find(wseq.chars());
1218     if (dr==NULL) { //new entry
1219 gpertea 170 //if (prefix.is_empty())
1220     dhash.Add(wseq.chars(),
1221 gpertea 13 new FqDupRec(&wqv, rname.chars()));
1222     //else dhash.Add(wseq.chars(), new FqDupRec(wqv.chars(),wqv.length()));
1223     }
1224     else
1225     dr->add(wqv);
1226     } //collapsing duplicates
1227     else { //not collapsing duplicates
1228     //apply the dust filter now
1229     if (doDust) {
1230     int dustbases=dust(wseq);
1231     if (dustbases>(wseq.length()>>1)) {
1232     return 'D';
1233     }
1234     }
1235     } //not collapsing duplicates
1236     return (l5>0 || l3<rseq.length()-1) ? 1 : 0;
1237     }
1238    
1239     void printHeader(FILE* f_out, char recmarker, GStr& rname, GStr& rinfo) {
1240     //GMessage("printing Header..%c%s\n",recmarker, rname.chars());
1241     if (rinfo.is_empty()) fprintf(f_out, "%c%s\n",recmarker,rname.chars());
1242     else fprintf(f_out, "%c%s %s\n",recmarker, rname.chars(), rinfo.chars());
1243     }
1244    
1245     void writeRead(FILE* f_out, GStr& rname, GStr& rinfo, GStr& rseq, GStr& rqv, int& outcounter) {
1246     outcounter++;
1247     bool asFasta=(rqv.is_empty() || fastaOutput);
1248     if (asFasta) {
1249     if (prefix.is_empty()) {
1250     printHeader(f_out, '>',rname,rinfo);
1251     fprintf(f_out, "%s\n", rseq.chars()); //plain one-line fasta for now
1252     }
1253     else {
1254 gpertea 170 fprintf(f_out, ">%s%08d\n%s\n", prefix.chars(), outcounter,
1255 gpertea 13 rseq.chars());
1256     }
1257     }
1258     else { //fastq
1259     if (convert_phred) convertPhred(rqv);
1260     if (prefix.is_empty()) {
1261     printHeader(f_out, '@', rname, rinfo);
1262     fprintf(f_out, "%s\n+\n%s\n", rseq.chars(), rqv.chars());
1263     }
1264     else
1265 gpertea 170 fprintf(f_out, "@%s_%08d\n%s\n+\n%s\n", prefix.chars(), outcounter,
1266 gpertea 13 rseq.chars(),rqv.chars() );
1267     }
1268     }
1269    
1270     void trash_report(char trashcode, GStr& rname, FILE* freport) {
1271     if (freport==NULL || trashcode<=' ') return;
1272     if (trashcode=='3' || trashcode=='5') {
1273 gpertea 103 fprintf(freport, "%s\ta%c\n",rname.chars(),trashcode);
1274 gpertea 13 }
1275     else {
1276     fprintf(freport, "%s\t%c\n",rname.chars(),trashcode);
1277     }
1278     //tcounter++;
1279     }
1280    
1281     GStr getFext(GStr& s, int* xpos=NULL) {
1282     //s must be a filename without a path
1283     GStr r("");
1284     if (xpos!=NULL) *xpos=0;
1285     if (s.is_empty() || s=="-") return r;
1286     int p=s.rindex('.');
1287     int d=s.rindex('/');
1288     if (p<=0 || p>s.length()-2 || p<s.length()-7 || p<d) return r;
1289     r=s.substr(p+1);
1290     if (xpos!=NULL) *xpos=p+1;
1291     r.lower();
1292     return r;
1293     }
1294    
1295     void baseFileName(GStr& fname) {
1296     //remove all known extensions, like .txt,fq,fastq,fasta,fa)(.gz .gzip .bz2 .bzip2) .
1297     int xpos=0;
1298     GStr fext=getFext(fname, &xpos);
1299     if (xpos<=1) return;
1300     bool extdel=false;
1301     GStr f2;
1302     if (fext=="z" || fext=="zip") {
1303     extdel=true;
1304     }
1305     else if (fext.length()>=2) {
1306     f2=fext.substr(0,2);
1307     extdel=(f2=="gz" || f2=="bz");
1308     }
1309     if (extdel) {
1310     fname.cut(xpos-1);
1311     fext=getFext(fname, &xpos);
1312     if (xpos<=1) return;
1313     }
1314     extdel=false;
1315     if (fext=="f" || fext=="fq" || fext=="txt" || fext=="seq" || fext=="sequence") {
1316     extdel=true;
1317     }
1318     else if (fext.length()>=2) {
1319     extdel=(fext.substr(0,2)=="fa");
1320     }
1321     if (extdel) fname.cut(xpos-1);
1322     GStr fncp(fname);
1323     fncp.lower();
1324     fncp.chomp("seq");
1325     fncp.chomp("sequence");
1326     fncp.trimR("_.");
1327     if (fncp.length()<fname.length()) fname.cut(fncp.length());
1328     }
1329    
1330     FILE* prepOutFile(GStr& infname, GStr& pocmd) {
1331     FILE* f_out=NULL;
1332     GStr fname(getFileName(infname.chars()));
1333     //eliminate known extensions
1334     baseFileName(fname);
1335     if (outsuffix.is_empty() || outsuffix=="-") { return stdout; }
1336     else if (pocmd.is_empty()) {
1337     GStr oname(fname);
1338     oname.append('.');
1339     oname.append(outsuffix);
1340     f_out=fopen(oname.chars(),"w");
1341     if (f_out==NULL) GError("Error: cannot create '%s'\n",oname.chars());
1342     }
1343     else {
1344     GStr oname(">");
1345     oname.append(fname);
1346     oname.append('.');
1347     oname.append(outsuffix);
1348     pocmd.append(oname);
1349     f_out=popen(pocmd.chars(), "w");
1350     if (f_out==NULL) GError("Error: cannot popen '%s'\n",pocmd.chars());
1351     }
1352     return f_out;
1353     }
1354    
1355     void guess_unzip(GStr& fname, GStr& picmd) {
1356     GStr fext=getFext(fname);
1357     if (fext=="gz" || fext=="gzip" || fext=="z") {
1358     picmd="gzip -cd ";
1359     }
1360     else if (fext=="bz2" || fext=="bzip2" || fext=="bz" || fext=="bzip") {
1361     picmd="bzip2 -cd ";
1362     }
1363     }
1364    
1365 gpertea 180 void addAdaptor(GVec<CASeqData>& adaptors, GStr& seq, GAlnTrimType trim_type) {
1366 gpertea 170 //TODO: prepare CASeqData here, and collect hexamers as well
1367 gpertea 180 if (seq.is_empty() || seq=="-" ||
1368     seq=="N/A" || seq==".") return;
1369    
1370 gpertea 172 CASeqData adata(revCompl);
1371 gpertea 180 int idx=adaptors.Add(adata);
1372 gpertea 172 if (idx<0) GError("Error: failed to add adaptor!\n");
1373 gpertea 180 adaptors[idx].trim_type=trim_type;
1374     adaptors[idx].update(seq.chars());
1375 gpertea 170 }
1376    
1377    
1378 gpertea 180 int loadAdaptors(const char* fname) {
1379 gpertea 94 GLineReader lr(fname);
1380     char* l;
1381     while ((l=lr.nextLine())!=NULL) {
1382     if (lr.length()<=3 || l[0]=='#') continue;
1383 gpertea 98 if ( l[0]==' ' || l[0]=='\t' || l[0]==',' ||
1384     l[0]==';'|| l[0]==':' ) {
1385 gpertea 94 int i=1;
1386     while (l[i]!=0 && isspace(l[i])) {
1387     i++;
1388     }
1389     if (l[i]!=0) {
1390 gpertea 102 GStr s(&(l[i]));
1391 gpertea 108 #ifdef GDEBUG
1392     //s.reverse();
1393     #endif
1394 gpertea 180 addAdaptor(adaptors3, s, galn_TrimRight);
1395 gpertea 94 continue;
1396     }
1397     }
1398     else {
1399 gpertea 98 GStr s(l);
1400     s.startTokenize("\t ;,:");
1401     GStr a5,a3;
1402     if (s.nextToken(a5))
1403 gpertea 180 s.nextToken(a3);
1404     else continue; //no tokens on this line
1405     GAlnTrimType ttype5=galn_TrimLeft;
1406 gpertea 102 a5.upper();
1407     a3.upper();
1408 gpertea 180 if (a3.is_empty() || a3==a5 || a3=="=") {
1409     a3.clear();
1410     ttype5=galn_TrimEither;
1411     }
1412 gpertea 108 #ifdef GDEBUG
1413     // a5.reverse();
1414     // a3.reverse();
1415     #endif
1416 gpertea 180 addAdaptor(adaptors5, a5, ttype5);
1417     addAdaptor(adaptors3, a3, galn_TrimRight);
1418 gpertea 94 }
1419     }
1420 gpertea 180 return adaptors5.Count()+adaptors3.Count();
1421 gpertea 94 }
1422    
1423 gpertea 170 void setupFiles(FILE*& f_in, FILE*& f_in2, FILE*& f_out, FILE*& f_out2,
1424     GStr& s, GStr& infname, GStr& infname2) {
1425 gpertea 13 // uses outsuffix to generate output file names and open file handles as needed
1426     infname="";
1427     infname2="";
1428     f_in=NULL;
1429     f_in2=NULL;
1430     f_out=NULL;
1431     f_out2=NULL;
1432     //analyze outsuffix intent
1433     GStr pocmd;
1434 gpertea 76 if (outsuffix=="-") {
1435     f_out=stdout;
1436     }
1437     else {
1438     GStr ox=getFext(outsuffix);
1439     if (ox.length()>2) ox=ox.substr(0,2);
1440     if (ox=="gz") pocmd="gzip -9 -c ";
1441     else if (ox=="bz") pocmd="bzip2 -9 -c ";
1442     }
1443 gpertea 13 if (s=="-") {
1444     f_in=stdin;
1445     infname="stdin";
1446     f_out=prepOutFile(infname, pocmd);
1447     return;
1448     } // streaming from stdin
1449     s.startTokenize(",:");
1450     s.nextToken(infname);
1451     bool paired=s.nextToken(infname2);
1452 gpertea 170 if (fileExists(infname.chars())==0)
1453 gpertea 13 GError("Error: cannot find file %s!\n",infname.chars());
1454     GStr fname(getFileName(infname.chars()));
1455     GStr picmd;
1456     guess_unzip(fname, picmd);
1457     if (picmd.is_empty()) {
1458     f_in=fopen(infname.chars(), "r");
1459     if (f_in==NULL) GError("Error opening file '%s'!\n",infname.chars());
1460     }
1461     else {
1462     picmd.append(infname);
1463     f_in=popen(picmd.chars(), "r");
1464     if (f_in==NULL) GError("Error at popen %s!\n", picmd.chars());
1465     }
1466 gpertea 76 if (f_out==stdout) {
1467     if (paired) GError("Error: output suffix required for paired reads\n");
1468     return;
1469     }
1470 gpertea 13 f_out=prepOutFile(infname, pocmd);
1471     if (!paired) return;
1472     if (doCollapse) GError("Error: sorry, -C option cannot be used with paired reads!\n");
1473     // ---- paired reads:-------------
1474 gpertea 170 if (fileExists(infname2.chars())==0)
1475 gpertea 13 GError("Error: cannot find file %s!\n",infname2.chars());
1476     picmd="";
1477     GStr fname2(getFileName(infname2.chars()));
1478     guess_unzip(fname2, picmd);
1479     if (picmd.is_empty()) {
1480     f_in2=fopen(infname2.chars(), "r");
1481     if (f_in2==NULL) GError("Error opening file '%s'!\n",infname2.chars());
1482     }
1483     else {
1484     picmd.append(infname2);
1485     f_in2=popen(picmd.chars(), "r");
1486     if (f_in2==NULL) GError("Error at popen %s!\n", picmd.chars());
1487     }
1488     f_out2=prepOutFile(infname2, pocmd);
1489     }