ViewVC Help
View File | Revision Log | Show Annotations | View Changeset | Root Listing
root/gclib/fqtrim/fqtrim.cpp
Revision: 185
Committed: Thu Feb 16 22:05:42 2012 UTC (7 years, 6 months ago) by gpertea
File size: 44641 byte(s)
Log Message:
Line User Rev File contents
1 gpertea 4 #include "GArgs.h"
2     #include "GStr.h"
3     #include "GHash.hh"
4     #include "GList.hh"
5 gpertea 13 #include <ctype.h>
6 gpertea 94 #include "GAlnExtend.h"
7 gpertea 4
8     #define USAGE "Usage:\n\
9 gpertea 180 fqtrim [{-5 <5adaptor> -3 <3adaptor>|-f <adaptors_file>}] [-a <min_matchlen>]\\\n\
10 gpertea 172 [-R] [-q <minq> [-t <trim_max_len>]] [-p {64|33}] [-o <outsuffix>]\\\n\
11 gpertea 76 [-l <minlen>] [-C] [-D] [-Q] [-n <rename_prefix>] [-r <discarded.lst>]\\\n\
12     <input.fq>[,<input_mates.fq>\n\
13 gpertea 4 \n\
14 gpertea 180 Trim low quality bases at the 3' end and can trim adaptor sequence(s), filter\n\
15 gpertea 76 for low complexity and collapse duplicate reads.\n\
16 gpertea 13 If read pairs should be trimmed and kept together (i.e. without discarding\n\
17     one read in a pair), the two file names should be given delimited by a comma\n\
18     or a colon character\n\
19 gpertea 4 \n\
20     Options:\n\
21     -n rename all the reads using the <prefix> followed by a read counter;\n\
22     if -C option was given, the suffix \"_x<N>\" is appended, with <N> being\n\
23     the read duplication count\n\
24 gpertea 76 -o unless this parameter is '-', write the trimmed/filtered reads to \n\
25     file(s) named <input>.<outsuffix> which will be created in the \n\
26     current (working) directory; (writes to stdout if -o- is given);\n\
27     a suffix ending with .gz, .gzip or .bz2 will enforce compression\n\
28 gpertea 180 -f file with adaptor sequences to trim, each line having this format:\n\
29     <5'-adaptor-sequence> <3'-adaptor-sequence>\n\
30     -5 trim the given adaptor or primer sequence at the 5' end of each read\n\
31 gpertea 4 (e.g. -5 CGACAGGTTCAGAGTTCTACAGTCCGACGATC)\n\
32 gpertea 180 -3 trim the given adaptor sequence at the 3' end of each read\n\
33 gpertea 4 (e.g. -3 TCGTATGCCGTCTTCTGCTTG)\n\
34 gpertea 103 -A disable polyA/T trimming (enabled by default)\n\
35 gpertea 94 -y minimum length of exact match to adaptor sequence at the proper end (6)\n\
36 gpertea 13 -q trim bases with quality value lower than <minq> (starting at the 3' end)\n\
37 gpertea 76 -t for -q option, maximum trimming at the 3' end is limited to <trim_max_len>\n\
38 gpertea 12 -m maximum percentage of Ns allowed in a read after trimming (default 7)\n\
39     -l minimum \"clean\" length after trimming that a read must have\n\
40     in order to pass the filter (default: 16)\n\
41 gpertea 4 -r write a simple \"trash report\" file listing the discarded reads with a\n\
42     one letter code for the reason why they were trashed\n\
43 gpertea 12 -D apply a low-complexity (dust) filter and discard any read that has over\n\
44     50% of its length detected as low complexity\n\
45 gpertea 4 -C collapse duplicate reads and add a prefix with their count to the read\n\
46 gpertea 12 name (e.g. for microRNAs)\n\
47     -p input is phred64/phred33 (use -p64 or -p33)\n\
48     -Q convert quality values to the other Phred qv type\n\
49     -V verbose processing\n\
50 gpertea 4 "
51 gpertea 170
52 gpertea 13 //-z for -o option, the output stream(s) will be first piped into the given\n
53     // <zcmd> command, which must output to stdout (e.g. -z 'bzip2 -9 -c')\n
54    
55 gpertea 170
56 gpertea 180 // example 3' adaptor for miRNAs: TCGTATGCCGTCTTCTGCTTG
57 gpertea 12
58 gpertea 76 //For paired reads sequencing:
59 gpertea 12 //3' : ACACTCTTTCCCTACACGACGCTCTTCCGATCT
60     //5' : GATCGGAAGAGCGGTTCAGCAGGAATGCCGAG
61 gpertea 13 //FILE* f_out=NULL; //stdout if not provided
62     //FILE* f_out2=NULL; //for paired reads
63     //FILE* f_in=NULL; //input fastq (stdin if not provided)
64     //FILE* f_in2=NULL; //for paired reads
65 gpertea 170
66 gpertea 4 FILE* freport=NULL;
67 gpertea 13
68 gpertea 4 bool debug=false;
69 gpertea 12 bool verbose=false;
70 gpertea 4 bool doCollapse=false;
71     bool doDust=false;
72 gpertea 103 bool doPolyTrim=true;
73 gpertea 13 bool fastaOutput=false;
74 gpertea 4 bool trashReport=false;
75 gpertea 172 bool revCompl=false; //also reverse complement adaptor sequences
76 gpertea 4 int min_read_len=16;
77 gpertea 12 double max_perc_N=7.0;
78 gpertea 4 int dust_cutoff=16;
79     bool isfasta=false;
80 gpertea 12 bool convert_phred=false;
81 gpertea 170 GStr outsuffix; // -o
82 gpertea 13 GStr prefix;
83     GStr zcmd;
84     int num_trimmed5=0;
85     int num_trimmed3=0;
86     int num_trimmedN=0;
87     int num_trimmedQ=0;
88     int min_trimmed5=INT_MAX;
89     int min_trimmed3=INT_MAX;
90 gpertea 12
91 gpertea 13 int qvtrim_qmin=0;
92     int qvtrim_max=0; //for -q, do not trim at 3'-end more than this number of bases
93 gpertea 12 int qv_phredtype=0; // could be 64 or 33 (0 means undetermined yet)
94     int qv_cvtadd=0; //could be -31 or +31
95    
96 gpertea 94 // adaptor matching metrics -- for X-drop ungapped extension
97 gpertea 177 //const int match_reward=2;
98     //const int mismatch_penalty=3;
99 gpertea 103 const int match_reward=2;
100 gpertea 177 const int mismatch_penalty=4;
101     const int Xdrop=10;
102 gpertea 103
103 gpertea 94 const int poly_m_score=2; //match score
104     const int poly_mis_score=-3; //mismatch
105 gpertea 88 const int poly_dropoff_score=7;
106 gpertea 94 int poly_minScore=12; //i.e. an exact match of 6 bases at the proper ends WILL be trimmed
107 gpertea 4
108 gpertea 94 const char *polyA_seed="AAAA";
109     const char *polyT_seed="TTTT";
110 gpertea 12
111 gpertea 170 struct CASeqData {
112 gpertea 180 //positional data for every possible hexamer in an adaptor
113     GVec<uint16>* pz[4096]; //0-based coordinates of all possible hexamers in the adaptor sequence
114     GVec<uint16>* pzr[4096]; //0-based coordinates of all possible hexamers for the reverse complement of the adaptor sequence
115     GStr seq; //actual adaptor sequence data
116 gpertea 170 GStr seqr; //reverse complement sequence
117 gpertea 180 int amlen; //fraction of adaptor length matching that's
118 gpertea 175 //enough to consider the alignment
119 gpertea 180 GAlnTrimType trim_type;
120 gpertea 172 bool use_reverse;
121 gpertea 175 CASeqData(bool rev=false):seq(),seqr(),
122     amlen(0), use_reverse(rev) {
123 gpertea 180 trim_type=galn_None; //should be updated later!
124 gpertea 172 for (int i=0;i<4096;i++) {
125     pz[i]=NULL;
126     pzr[i]=NULL;
127     }
128     }
129    
130     void update(const char* s) {
131 gpertea 175 seq=s;
132     table6mers(seq.chars(), seq.length(), pz);
133     amlen=iround(double(seq.length())*0.8);
134     if (amlen<12)
135     amlen=12;
136     if (!use_reverse) return;
137     //reverse complement
138     seqr=s;
139     int slen=seq.length();
140     for (int i=0;i<slen;i++)
141     seqr[i]=ntComplement(seq[slen-i-1]);
142     table6mers(seqr.chars(), seqr.length(), pzr);
143 gpertea 170 }
144    
145 gpertea 172 ~CASeqData() {
146     for (int i=0;i<4096;i++) {
147     delete pz[i];
148     delete pzr[i];
149 gpertea 170 }
150     }
151     };
152    
153 gpertea 180 GVec<CASeqData> adaptors5;
154     GVec<CASeqData> adaptors3;
155 gpertea 170
156 gpertea 103 CGreedyAlignData* gxmem_l=NULL;
157     CGreedyAlignData* gxmem_r=NULL;
158    
159 gpertea 4 // element in dhash:
160     class FqDupRec {
161     public:
162     int count; //how many of these reads are the same
163     int len; //length of qv
164     char* firstname; //optional, only if we want to keep the original read names
165     char* qv;
166     FqDupRec(GStr* qstr=NULL, const char* rname=NULL) {
167     len=0;
168     qv=NULL;
169     firstname=NULL;
170     count=0;
171     if (qstr!=NULL) {
172     qv=Gstrdup(qstr->chars());
173     len=qstr->length();
174     count++;
175     }
176     if (rname!=NULL) firstname=Gstrdup(rname);
177     }
178     ~FqDupRec() {
179     GFREE(qv);
180     GFREE(firstname);
181     }
182     void add(GStr& d) { //collapse another record into this one
183     if (d.length()!=len)
184     GError("Error at FqDupRec::add(): cannot collapse reads with different length!\n");
185     count++;
186     for (int i=0;i<len;i++)
187     qv[i]=(qv[i]+d[i])/2;
188     }
189     };
190    
191     void openfw(FILE* &f, GArgs& args, char opt) {
192     GStr s=args.getOpt(opt);
193     if (!s.is_empty()) {
194     if (s=='-') f=stdout;
195     else {
196 gpertea 13 f=fopen(s.chars(),"w");
197 gpertea 4 if (f==NULL) GError("Error creating file: %s\n", s.chars());
198     }
199     }
200     }
201    
202     #define FWCLOSE(fh) if (fh!=NULL && fh!=stdout) fclose(fh)
203     #define FRCLOSE(fh) if (fh!=NULL && fh!=stdin) fclose(fh)
204    
205     GHash<FqDupRec> dhash; //hash to keep track of duplicates
206    
207 gpertea 180 void addAdaptor(GVec<CASeqData>& adaptors, GStr& seq, GAlnTrimType trim_type);
208     int loadAdaptors(const char* fname);
209 gpertea 94
210 gpertea 170 void setupFiles(FILE*& f_in, FILE*& f_in2, FILE*& f_out, FILE*& f_out2,
211     GStr& s, GStr& infname, GStr& infname2);
212 gpertea 13 // uses outsuffix to generate output file names and open file handles as needed
213 gpertea 170
214 gpertea 13 void writeRead(FILE* f_out, GStr& rname, GStr& rinfo, GStr& rseq, GStr& rqv, int& outcounter);
215     void trash_report(char trashcode, GStr& rname, FILE* freport);
216    
217 gpertea 170 bool getFastxRec(GLineReader& fq, GStr& rseq, GStr& rqv,
218 gpertea 13 GStr& rname, GStr& rinfo, GStr& infname);
219    
220 gpertea 170 char process_read(GStr& rname, GStr& rseq, GStr& rqv, int &l5, int &l3);
221 gpertea 13 //returns 0 if the read was untouched, 1 if it was trimmed and a trash code if it was trashed
222    
223 gpertea 12 bool ntrim(GStr& rseq, int &l5, int &l3); //returns true if any trimming occured
224     bool qtrim(GStr& qvs, int &l5, int &l3); //return true if any trimming occured
225 gpertea 4 int dust(GStr& seq);
226 gpertea 94 bool trim_poly5(GStr &seq, int &l5, int &l3, const char* poly_seed); //returns true if any trimming occured
227     bool trim_poly3(GStr &seq, int &l5, int &l3, const char* poly_seed);
228 gpertea 180 bool trim_adaptor5(GStr& seq, int &l5, int &l3); //returns true if any trimming occured
229     bool trim_adaptor3(GStr& seq, int &l5, int &l3);
230 gpertea 4
231 gpertea 12 void convertPhred(char* q, int len);
232     void convertPhred(GStr& q);
233 gpertea 4
234     int main(int argc, char * const argv[]) {
235 gpertea 172 GArgs args(argc, argv, "YQDCRVAl:d:3:5:m:n:r:p:q:f:t:o:z:a:");
236 gpertea 4 int e;
237     if ((e=args.isError())>0) {
238     GMessage("%s\nInvalid argument: %s\n", USAGE, argv[e]);
239     exit(224);
240     }
241 gpertea 9 debug=(args.getOpt('Y')!=NULL);
242 gpertea 13 verbose=(args.getOpt('V')!=NULL);
243 gpertea 12 convert_phred=(args.getOpt('Q')!=NULL);
244 gpertea 4 doCollapse=(args.getOpt('C')!=NULL);
245     doDust=(args.getOpt('D')!=NULL);
246 gpertea 172 revCompl=(args.getOpt('R')!=NULL);
247 gpertea 103 if (args.getOpt('A')) doPolyTrim=false;
248 gpertea 12 /*
249 gpertea 4 rawFormat=(args.getOpt('R')!=NULL);
250     if (rawFormat) {
251     GError("Sorry, raw qseq format parsing is not implemented yet!\n");
252     }
253 gpertea 12 */
254 gpertea 4 prefix=args.getOpt('n');
255     GStr s=args.getOpt('l');
256 gpertea 170 if (!s.is_empty())
257 gpertea 4 min_read_len=s.asInt();
258 gpertea 12 s=args.getOpt('m');
259 gpertea 170 if (!s.is_empty())
260 gpertea 12 max_perc_N=s.asDouble();
261 gpertea 4 s=args.getOpt('d');
262     if (!s.is_empty()) {
263     dust_cutoff=s.asInt();
264     doDust=true;
265     }
266 gpertea 12 s=args.getOpt('q');
267     if (!s.is_empty()) {
268 gpertea 13 qvtrim_qmin=s.asInt();
269 gpertea 12 }
270 gpertea 13 s=args.getOpt('t');
271     if (!s.is_empty()) {
272     qvtrim_max=s.asInt();
273     }
274 gpertea 12 s=args.getOpt('p');
275     if (!s.is_empty()) {
276     int v=s.asInt();
277     if (v==33) {
278     qv_phredtype=33;
279     qv_cvtadd=31;
280     }
281     else if (v==64) {
282     qv_phredtype=64;
283     qv_cvtadd=-31;
284     }
285 gpertea 170 else
286 gpertea 12 GMessage("%s\nInvalid value for -p option (can only be 64 or 33)!\n",USAGE);
287     }
288 gpertea 94 s=args.getOpt('f');
289     if (!s.is_empty()) {
290 gpertea 180 loadAdaptors(s.chars());
291 gpertea 94 }
292 gpertea 180 bool fileAdaptors=adaptors5.Count()+adaptors3.Count();
293 gpertea 94 s=args.getOpt('5');
294     if (!s.is_empty()) {
295 gpertea 180 if (fileAdaptors)
296 gpertea 94 GError("Error: options -5 and -f cannot be used together!\n");
297     s.upper();
298 gpertea 180 addAdaptor(adaptors5, s, galn_TrimLeft);
299 gpertea 4 }
300 gpertea 94 s=args.getOpt('3');
301     if (!s.is_empty()) {
302 gpertea 180 if (fileAdaptors)
303 gpertea 94 GError("Error: options -3 and -f cannot be used together!\n");
304 gpertea 102 s.upper();
305 gpertea 180 addAdaptor(adaptors3, s, galn_TrimRight);
306 gpertea 4 }
307 gpertea 94 s=args.getOpt('y');
308 gpertea 13 if (!s.is_empty()) {
309 gpertea 94 int minmatch=s.asInt();
310     poly_minScore=minmatch*poly_m_score;
311 gpertea 13 }
312 gpertea 170
313 gpertea 13 if (args.getOpt('o')!=NULL) outsuffix=args.getOpt('o');
314 gpertea 76 else outsuffix="-";
315 gpertea 4 trashReport= (args.getOpt('r')!=NULL);
316 gpertea 13 int fcount=args.startNonOpt();
317     if (fcount==0) {
318 gpertea 4 GMessage(USAGE);
319     exit(224);
320     }
321 gpertea 13 if (fcount>1 && doCollapse) {
322     GError("%s Sorry, the -C option only works with a single input.\n", USAGE);
323     }
324     //openfw(f_out, args, 'o');
325     //if (f_out==NULL) f_out=stdout;
326 gpertea 4 if (trashReport)
327     openfw(freport, args, 'r');
328     char* infile=NULL;
329 gpertea 103
330 gpertea 180 if (adaptors5.Count()>0)
331 gpertea 177 //gxmem_l=new CGreedyAlignData(match_reward, mismatch_penalty, Xdrop-2);
332     gxmem_l=new CGreedyAlignData(match_reward, mismatch_penalty, Xdrop);
333 gpertea 180 if (adaptors3.Count()>0)
334 gpertea 103 gxmem_r=new CGreedyAlignData(match_reward, mismatch_penalty, Xdrop);
335    
336 gpertea 4 while ((infile=args.nextNonOpt())!=NULL) {
337 gpertea 103 //for each input file
338 gpertea 13 int incounter=0; //counter for input reads
339     int outcounter=0; //counter for output reads
340     int trash_s=0; //too short from the get go
341     int trash_Q=0;
342     int trash_N=0;
343     int trash_D=0;
344 gpertea 103 int trash_poly=0;
345 gpertea 13 int trash_A3=0;
346     int trash_A5=0;
347     s=infile;
348     GStr infname;
349     GStr infname2;
350     FILE* f_in=NULL;
351     FILE* f_in2=NULL;
352     FILE* f_out=NULL;
353     FILE* f_out2=NULL;
354     bool paired_reads=false;
355     setupFiles(f_in, f_in2, f_out, f_out2, s, infname, infname2);
356     GLineReader fq(f_in);
357     GLineReader* fq2=NULL;
358     if (f_in2!=NULL) {
359     fq2=new GLineReader(f_in2);
360     paired_reads=true;
361     }
362     GStr rseq, rqv, seqid, seqinfo;
363     GStr rseq2, rqv2, seqid2, seqinfo2;
364     while (getFastxRec(fq, rseq, rqv, seqid, seqinfo, infname)) {
365     incounter++;
366     int a5=0, a3=0, b5=0, b3=0;
367     char tcode=0, tcode2=0;
368     tcode=process_read(seqid, rseq, rqv, a5, a3);
369 gpertea 103 if (fq2!=NULL) {
370 gpertea 13 getFastxRec(*fq2, rseq2, rqv2, seqid2, seqinfo2, infname2);
371     if (seqid.substr(0,seqid.length()-1)!=seqid2.substr(0,seqid2.length()-1)) {
372     GError("Error: no paired match for read %s vs %s (%s,%s)\n",
373     seqid.chars(), seqid2.chars(), infname.chars(), infname2.chars());
374     }
375     tcode2=process_read(seqid2, rseq2, rqv2, b5, b3);
376     //decide what to do with this pair and print rseq2 if the pair makes it
377     if (tcode>1 && tcode2<=1) {
378     //"untrash" rseq
379     if (a3-a5+1<min_read_len) {
380     a5=1;
381     if (a3<min_read_len) { a3= GMIN(rseq.length()-1, min_read_len+1); }
382 gpertea 4 }
383 gpertea 13 tcode=1;
384     }
385     else if (tcode<=1 && tcode2>1) {
386     //"untrash" rseq2
387     if (b3-b5+1<min_read_len) {
388     b5=1;
389     if (b3<min_read_len) { b3= GMIN((rseq2.length()-1),(min_read_len+1)); }
390     }
391     tcode2=1;
392     }
393     if (tcode<=1) { //trimmed or left intact -- write it!
394     if (tcode>0) {
395     rseq2=rseq2.substr(b5,b3-b5+1);
396     if (!rqv2.is_empty()) rqv2=rqv2.substr(b5,b3-b5+1);
397 gpertea 4 }
398 gpertea 13 int nocounter=0;
399     writeRead(f_out2, seqid2, seqinfo2, rseq2, rqv2, nocounter);
400     }
401 gpertea 103 } //pair read
402 gpertea 13 if (tcode>1) { //trashed
403 gpertea 103 #ifdef GDEBUG
404 gpertea 175 GMessage(" !!!!TRASH code = %c\n",tcode);
405 gpertea 103 #endif
406 gpertea 13 if (tcode=='s') trash_s++;
407 gpertea 103 else if (tcode=='A' || tcode=='T') trash_poly++;
408 gpertea 13 else if (tcode=='Q') trash_Q++;
409     else if (tcode=='N') trash_N++;
410     else if (tcode=='D') trash_D++;
411     else if (tcode=='3') trash_A3++;
412     else if (tcode=='5') trash_A5++;
413     if (trashReport) trash_report(tcode, seqid, freport);
414     }
415     else if (!doCollapse) { //write it
416     if (tcode>0) {
417     rseq=rseq.substr(a5,a3-a5+1);
418     if (!rqv.is_empty()) rqv=rqv.substr(a5,a3-a5+1);
419     }
420 gpertea 103 #ifdef GDEBUG
421     GMessage(" After trimming:\n");
422 gpertea 105 GMessage("%s\n",rseq.chars());
423 gpertea 103 #endif
424 gpertea 13 writeRead(f_out, seqid, seqinfo, rseq, rqv, outcounter);
425     }
426     } //for each fastq record
427     delete fq2;
428     FRCLOSE(f_in);
429     FRCLOSE(f_in2);
430     if (doCollapse) {
431     outcounter=0;
432     int maxdup_count=1;
433     char* maxdup_seq=NULL;
434     dhash.startIterate();
435     FqDupRec* qd=NULL;
436     char* seq=NULL;
437     while ((qd=dhash.NextData(seq))!=NULL) {
438     GStr rseq(seq);
439     //do the dusting here
440     if (doDust) {
441     int dustbases=dust(rseq);
442     if (dustbases>(rseq.length()>>1)) {
443     if (trashReport && qd->firstname!=NULL) {
444     fprintf(freport, "%s_x%d\tD\n",qd->firstname, qd->count);
445 gpertea 4 }
446 gpertea 13 trash_D+=qd->count;
447     continue;
448     }
449     }
450     outcounter++;
451 gpertea 170 if (qd->count>maxdup_count) {
452 gpertea 13 maxdup_count=qd->count;
453     maxdup_seq=seq;
454     }
455     if (isfasta) {
456     if (prefix.is_empty()) {
457 gpertea 170 fprintf(f_out, ">%s_x%d\n%s\n", qd->firstname, qd->count,
458 gpertea 13 rseq.chars());
459 gpertea 4 }
460 gpertea 13 else { //use custom read name
461     fprintf(f_out, ">%s%08d_x%d\n%s\n", prefix.chars(), outcounter,
462     qd->count, rseq.chars());
463 gpertea 4 }
464 gpertea 13 }
465     else { //fastq format
466     if (convert_phred) convertPhred(qd->qv, qd->len);
467     if (prefix.is_empty()) {
468 gpertea 170 fprintf(f_out, "@%s_x%d\n%s\n+\n%s\n", qd->firstname, qd->count,
469 gpertea 13 rseq.chars(), qd->qv);
470 gpertea 4 }
471 gpertea 13 else { //use custom read name
472     fprintf(f_out, "@%s%08d_x%d\n%s\n+\n%s\n", prefix.chars(), outcounter,
473     qd->count, rseq.chars(), qd->qv);
474     }
475     }
476     }//for each element of dhash
477     if (maxdup_count>1) {
478     GMessage("Maximum read multiplicity: x %d (read: %s)\n",maxdup_count, maxdup_seq);
479 gpertea 4 }
480 gpertea 13 } //collapse entries
481     if (verbose) {
482     if (paired_reads) {
483     GMessage(">Input files : %s , %s\n", infname.chars(), infname2.chars());
484     GMessage("Number of input pairs :%9d\n", incounter);
485     GMessage(" Output pairs :%9d\n", outcounter);
486     }
487     else {
488     GMessage(">Input file : %s\n", infname.chars());
489     GMessage("Number of input reads :%9d\n", incounter);
490     GMessage(" Output reads :%9d\n", outcounter);
491     }
492     GMessage("------------------------------------\n");
493     if (num_trimmed5)
494     GMessage(" 5' trimmed :%9d (min. trim: %d)\n", num_trimmed5, min_trimmed5);
495     if (num_trimmed3)
496     GMessage(" 3' trimmed :%9d (min. trim: %d)\n", num_trimmed3, min_trimmed3);
497     GMessage("------------------------------------\n");
498     if (trash_s>0)
499     GMessage(" Trashed by length:%9d\n", trash_s);
500     if (trash_N>0)
501     GMessage(" Trashed by N%%:%9d\n", trash_N);
502     if (trash_Q>0)
503     GMessage("Trashed by low quality:%9d\n", trash_Q);
504 gpertea 103 if (trash_poly>0)
505     GMessage(" Trashed by poly-A/T:%9d\n", trash_poly);
506 gpertea 13 if (trash_A5>0)
507 gpertea 180 GMessage(" Trashed by 5' adaptor:%9d\n", trash_A5);
508 gpertea 13 if (trash_A3>0)
509 gpertea 180 GMessage(" Trashed by 3' adaptor:%9d\n", trash_A3);
510 gpertea 13 }
511     if (trashReport) {
512     FWCLOSE(freport);
513 gpertea 4 }
514 gpertea 13 FWCLOSE(f_out);
515     FWCLOSE(f_out2);
516     } //while each input file
517 gpertea 103 delete gxmem_l;
518     delete gxmem_r;
519 gpertea 12 //getc(stdin);
520 gpertea 4 }
521    
522     class NData {
523     public:
524     int NPos[1024]; //there should be no reads longer than 1K ?
525     int NCount;
526     int end5;
527     int end3;
528     int seqlen;
529     double perc_N; //percentage of Ns in end5..end3 range only!
530     const char* seq;
531     bool valid;
532     NData() {
533 gpertea 170 seqlen=0;
534 gpertea 4 NCount=0;
535     end5=0;
536     end3=0;
537     seq=NULL;
538     perc_N=0;
539     valid=true;
540     }
541     void init(GStr& rseq) {
542     NCount=0;
543     perc_N=0;
544     seqlen=rseq.length();
545     seq=rseq.chars();
546     end5=1;
547     end3=seqlen;
548     for (int i=0;i<seqlen;i++)
549 gpertea 12 if (seq[i]=='N') {// if (!ichrInStr(rseq[i], "ACGT")
550 gpertea 4 NPos[NCount]=i;
551     NCount++;
552     }
553     perc_N=(NCount*100.0)/seqlen;
554     valid=true;
555     }
556     void N_calc() { //only in the region after trimming
557     int n=0;
558     for (int i=end3-1;i<end5;i++) {
559     if (seq[i]=='N') n++;
560     }
561     perc_N=(n*100.0)/(end5-end3+1);
562     }
563     };
564 gpertea 170
565 gpertea 4 static NData feat;
566     int perc_lenN=12; // incremental distance from ends, in percentage of
567     // sequence length, where N-trimming is done (default:12 %) (autolimited to 20)
568 gpertea 170
569 gpertea 4 void N_analyze(int l5, int l3, int p5, int p3) {
570     /* assumes feat was filled properly */
571     int old_dif, t5,t3,v;
572 gpertea 12 if (l3<l5+2 || p5>p3 ) {
573 gpertea 4 feat.end5=l5+1;
574     feat.end3=l3+1;
575 gpertea 170 return;
576 gpertea 4 }
577    
578     t5=feat.NPos[p5]-l5;
579     t3=l3-feat.NPos[p3];
580     old_dif=p3-p5;
581     v=(int)((((double)(l3-l5))*perc_lenN)/100);
582 gpertea 170 if (v>20) v=20; /* enforce N-search limit for very long reads */
583 gpertea 4 if (t5 < v ) {
584     l5=feat.NPos[p5]+1;
585     p5++;
586     }
587     if (t3 < v) {
588     l3=feat.NPos[p3]-1;
589     p3--;
590     }
591     /* restNs=p3-p5; number of Ns in the new CLR */
592     if (p3-p5==old_dif) { /* no change, return */
593     /* don't trim if this may shorten the read too much */
594     //if (l5-l3<min_read_len) return;
595     feat.end5=l5+1;
596     feat.end3=l3+1;
597     return;
598     }
599 gpertea 170 else
600 gpertea 4 N_analyze(l5,l3, p5,p3);
601     }
602    
603    
604 gpertea 12 bool qtrim(GStr& qvs, int &l5, int &l3) {
605 gpertea 13 if (qvtrim_qmin==0 || qvs.is_empty()) return false;
606 gpertea 12 if (qv_phredtype==0) {
607     //try to guess the Phred type
608     int vmin=256, vmax=0;
609     for (int i=0;i<qvs.length();i++) {
610     if (vmin>qvs[i]) vmin=qvs[i];
611     if (vmax<qvs[i]) vmax=qvs[i];
612     }
613     if (vmin<64) { qv_phredtype=33; qv_cvtadd=31; }
614     if (vmax>95) { qv_phredtype=64; qv_cvtadd=-31; }
615     if (qv_phredtype==0) {
616     GError("Error: couldn't determine Phred type, please use the -p33 or -p64 !\n");
617     }
618 gpertea 13 if (verbose)
619     GMessage("Input reads have Phred-%d quality values.\n", (qv_phredtype==33 ? 33 : 64));
620     } //guessing Phred type
621 gpertea 12 for (l3=qvs.length()-1;l3>2;l3--) {
622 gpertea 13 if (qvs[l3]-qv_phredtype>=qvtrim_qmin && qvs[l3-1]-qv_phredtype>=qvtrim_qmin) break;
623 gpertea 12 }
624     //just in case, check also the 5' the end (?)
625     for (l5=0;l5<qvs.length()-3;l5++) {
626 gpertea 13 if (qvs[l5]-qv_phredtype>=qvtrim_qmin && qvs[l5+1]-qv_phredtype>=qvtrim_qmin) break;
627 gpertea 12 }
628 gpertea 13 if (qvtrim_max>0) {
629     if (qvs.length()-1-l3>qvtrim_max) l3=qvs.length()-1-qvtrim_max;
630     if (l5>qvtrim_max) l5=qvtrim_max;
631     }
632 gpertea 12 return (l5>0 || l3<qvs.length()-1);
633     }
634    
635     bool ntrim(GStr& rseq, int &l5, int &l3) {
636     //count Ns in the sequence, trim N-rich ends
637 gpertea 4 feat.init(rseq);
638     l5=feat.end5-1;
639     l3=feat.end3-1;
640 gpertea 170 N_analyze(feat.end5-1, feat.end3-1, 0, feat.NCount-1);
641 gpertea 12 if (l5==feat.end5-1 && l3==feat.end3-1) {
642     if (feat.perc_N>max_perc_N) {
643     feat.valid=false;
644     l3=l5+1;
645     return true;
646     }
647     else {
648     return false; //nothing changed
649     }
650     }
651 gpertea 4 l5=feat.end5-1;
652     l3=feat.end3-1;
653     if (l3-l5+1<min_read_len) {
654     feat.valid=false;
655     return true;
656     }
657     feat.N_calc();
658 gpertea 170
659 gpertea 12 if (feat.perc_N>max_perc_N) {
660 gpertea 4 feat.valid=false;
661     l3=l5+1;
662     return true;
663     }
664     return true;
665     }
666    
667     //--------------- dust functions ----------------
668     class DNADuster {
669     public:
670 gpertea 170 int dustword;
671     int dustwindow;
672     int dustwindow2;
673 gpertea 4 int dustcutoff;
674     int mv, iv, jv;
675     int counts[32*32*32];
676     int iis[32*32*32];
677     DNADuster(int cutoff=16, int winsize=32, int wordsize=3) {
678     dustword=wordsize;
679     dustwindow=winsize;
680     dustwindow2 = (winsize>>1);
681     dustcutoff=cutoff;
682     mv=0;
683     iv=0;
684     jv=0;
685     }
686     void setWindowSize(int value) {
687     dustwindow = value;
688     dustwindow2 = (dustwindow >> 1);
689     }
690     void setWordSize(int value) {
691     dustword=value;
692     }
693     void wo1(int len, const char* s, int ivv) {
694     int i, ii, j, v, t, n, n1, sum;
695     int js, nis;
696     n = 32 * 32 * 32;
697     n1 = n - 1;
698     nis = 0;
699     i = 0;
700     ii = 0;
701     sum = 0;
702     v = 0;
703     for (j=0; j < len; j++, s++) {
704     ii <<= 5;
705     if (*s<=32) {
706     i=0;
707     continue;
708     }
709     ii |= *s - 'A'; //assume uppercase!
710     ii &= n1;
711     i++;
712     if (i >= dustword) {
713     for (js=0; js < nis && iis[js] != ii; js++) ;
714     if (js == nis) {
715     iis[nis] = ii;
716     counts[ii] = 0;
717     nis++;
718     }
719     if ((t = counts[ii]) > 0) {
720     sum += t;
721     v = 10 * sum / j;
722     if (mv < v) {
723     mv = v;
724     iv = ivv;
725     jv = j;
726     }
727     }
728     counts[ii]++;
729     }
730     }
731     }
732    
733     int wo(int len, const char* s, int* beg, int* end) {
734     int i, l1;
735     l1 = len - dustword + 1;
736     if (l1 < 0) {
737     *beg = 0;
738     *end = len - 1;
739     return 0;
740     }
741     mv = 0;
742     iv = 0;
743     jv = 0;
744     for (i=0; i < l1; i++) {
745     wo1(len-i, s+i, i);
746     }
747     *beg = iv;
748     *end = iv + jv;
749     return mv;
750     }
751    
752     void dust(const char* seq, char* seqmsk, int seqlen, int cutoff=0) { //, maskFunc maskfn) {
753     int i, j, l, a, b, v;
754     if (cutoff==0) cutoff=dustcutoff;
755     a=0;b=0;
756     //GMessage("Dust cutoff=%d\n", cutoff);
757     for (i=0; i < seqlen; i += dustwindow2) {
758     l = (seqlen > i+dustwindow) ? dustwindow : seqlen-i;
759     v = wo(l, seq+i, &a, &b);
760     if (v > cutoff) {
761     //for (j = a; j <= b && j < dustwindow2; j++) {
762     for (j = a; j <= b; j++) {
763     seqmsk[i+j]='N';//could be made lowercase instead
764     }
765     }
766     }
767 gpertea 170 //return first;
768 gpertea 4 }
769     };
770    
771     static DNADuster duster;
772    
773     int dust(GStr& rseq) {
774     char* seq=Gstrdup(rseq.chars());
775     duster.dust(rseq.chars(), seq, rseq.length(), dust_cutoff);
776     //check the number of Ns:
777     int ncount=0;
778     for (int i=0;i<rseq.length();i++) {
779     if (seq[i]=='N') ncount++;
780     }
781     GFREE(seq);
782     return ncount;
783     }
784    
785 gpertea 94 struct SLocScore {
786     int pos;
787     int score;
788     SLocScore(int p=0,int s=0) {
789     pos=p;
790     score=s;
791     }
792     void set(int p, int s) {
793     pos=p;
794     score=s;
795     }
796     void add(int p, int add) {
797     pos=p;
798     score+=add;
799     }
800     };
801 gpertea 12
802 gpertea 94 bool trim_poly3(GStr &seq, int &l5, int &l3, const char* poly_seed) {
803 gpertea 103 if (!doPolyTrim) return false;
804 gpertea 94 int rlen=seq.length();
805     l5=0;
806     l3=rlen-1;
807     int32 seedVal=*(int32*)poly_seed;
808     char polyChar=poly_seed[0];
809     //assumes N trimming was already done
810     //so a poly match should be very close to the end of the read
811     // -- find the initial match (seed)
812 gpertea 108 int lmin=GMAX((rlen-16), 0);
813 gpertea 94 int li;
814     for (li=rlen-4;li>lmin;li--) {
815     if (seedVal==*(int*)&(seq[li])) {
816     break;
817     }
818 gpertea 12 }
819 gpertea 94 if (li<=lmin) return false;
820     //seed found, try to extend it both ways
821     //extend right
822     int ri=li+3;
823     SLocScore loc(ri, poly_m_score<<2);
824     SLocScore maxloc(loc);
825     //extend right
826 gpertea 170 while (ri<rlen-1) {
827 gpertea 94 ri++;
828     if (seq[ri]==polyChar) {
829     loc.add(ri,poly_m_score);
830     }
831     else if (seq[ri]=='N') {
832     loc.add(ri,0);
833     }
834     else { //mismatch
835     loc.add(ri,poly_mis_score);
836     if (maxloc.score-loc.score>poly_dropoff_score) break;
837     }
838     if (maxloc.score<=loc.score) {
839     maxloc=loc;
840     }
841 gpertea 12 }
842 gpertea 94 ri=maxloc.pos;
843 gpertea 103 if (ri<rlen-6) return false; //no trimming wanted, too far from 3' end
844 gpertea 94 //ri = right boundary for the poly match
845     //extend left
846     loc.set(li, maxloc.score);
847     maxloc.pos=li;
848     while (li>0) {
849     li--;
850     if (seq[li]==polyChar) {
851     loc.add(li,poly_m_score);
852     }
853     else if (seq[li]=='N') {
854     loc.add(li,0);
855     }
856     else { //mismatch
857     loc.add(li,poly_mis_score);
858     if (maxloc.score-loc.score>poly_dropoff_score) break;
859     }
860     if (maxloc.score<=loc.score) {
861     maxloc=loc;
862     }
863     }
864 gpertea 105 li=maxloc.pos;
865 gpertea 108 if ((maxloc.score==poly_minScore && ri==rlen-1) ||
866     (maxloc.score>poly_minScore && ri>=rlen-3) ||
867     (maxloc.score>(poly_minScore*3) && ri>=rlen-8)) {
868 gpertea 105 //trimming this li-ri match at 3' end
869     l3=li-1;
870     if (l3<0) l3=0;
871 gpertea 94 return true;
872     }
873     return false;
874 gpertea 12 }
875    
876 gpertea 94 bool trim_poly5(GStr &seq, int &l5, int &l3, const char* poly_seed) {
877 gpertea 103 if (!doPolyTrim) return false;
878 gpertea 88 int rlen=seq.length();
879     l5=0;
880     l3=rlen-1;
881 gpertea 94 int32 seedVal=*(int32*)poly_seed;
882     char polyChar=poly_seed[0];
883 gpertea 88 //assumes N trimming was already done
884     //so a poly match should be very close to the end of the read
885     // -- find the initial match (seed)
886 gpertea 108 int lmax=GMIN(12, rlen-4);//how far from 5' end to look for 4-mer seeds
887 gpertea 94 int li;
888     for (li=0;li<=lmax;li++) {
889     if (seedVal==*(int*)&(seq[li])) {
890 gpertea 88 break;
891     }
892     }
893 gpertea 94 if (li>lmax) return false;
894     //seed found, try to extend it both ways
895     //extend left
896     int ri=li+3; //save rightmost base of the seed
897     SLocScore loc(li, poly_m_score<<2);
898     SLocScore maxloc(loc);
899     while (li>0) {
900     li--;
901     if (seq[li]==polyChar) {
902     loc.add(li,poly_m_score);
903     }
904     else if (seq[li]=='N') {
905     loc.add(li,0);
906     }
907     else { //mismatch
908     loc.add(li,poly_mis_score);
909     if (maxloc.score-loc.score>poly_dropoff_score) break;
910     }
911     if (maxloc.score<=loc.score) {
912     maxloc=loc;
913     }
914 gpertea 88 }
915 gpertea 94 li=maxloc.pos;
916 gpertea 103 if (li>5) return false; //no trimming wanted, too far from 5' end
917 gpertea 94 //li = right boundary for the poly match
918    
919     //extend right
920     loc.set(ri, maxloc.score);
921     maxloc.pos=ri;
922 gpertea 170 while (ri<rlen-1) {
923 gpertea 94 ri++;
924     if (seq[ri]==polyChar) {
925     loc.add(ri,poly_m_score);
926     }
927     else if (seq[ri]=='N') {
928     loc.add(ri,0);
929     }
930     else { //mismatch
931     loc.add(ri,poly_mis_score);
932     if (maxloc.score-loc.score>poly_dropoff_score) break;
933     }
934     if (maxloc.score<=loc.score) {
935     maxloc=loc;
936     }
937     }
938 gpertea 105 ri=maxloc.pos;
939 gpertea 103 if ((maxloc.score==poly_minScore && li==0) ||
940     (maxloc.score>poly_minScore && li<2)
941 gpertea 108 || (maxloc.score>(poly_minScore*3) && li<8)) {
942 gpertea 105 //adjust l5 to reflect this trimming of 5' end
943     l5=ri+1;
944     if (l5>rlen-1) l5=rlen-1;
945 gpertea 94 return true;
946     }
947     return false;
948 gpertea 88 }
949    
950 gpertea 180 bool trim_adaptor3(GStr& seq, int&l5, int &l3) {
951     if (adaptors3.Count()==0) return false;
952 gpertea 4 int rlen=seq.length();
953     l5=0;
954     l3=rlen-1;
955 gpertea 103 bool trimmed=false;
956 gpertea 175 GStr wseq(seq);
957 gpertea 103 int wlen=rlen;
958 gpertea 175 GXSeqData seqdata;
959 gpertea 180 int numruns=revCompl ? 2 : 1;
960 gpertea 185 GList<GXAlnInfo> bestalns(true, true, false);
961 gpertea 180 for (int ai=0;ai<adaptors3.Count();ai++) {
962     for (int r=0;r<numruns;r++) {
963     if (r) {
964     seqdata.update(adaptors3[ai].seqr.chars(), adaptors3[ai].seqr.length(),
965     adaptors3[ai].pzr, wseq.chars(), wlen, adaptors3[ai].amlen);
966     }
967     else {
968     seqdata.update(adaptors3[ai].seq.chars(), adaptors3[ai].seq.length(),
969     adaptors3[ai].pz, wseq.chars(), wlen, adaptors3[ai].amlen);
970     }
971 gpertea 185 GXAlnInfo* aln=match_adaptor(seqdata, adaptors3[ai].trim_type, gxmem_r, 86);
972     if (aln) {
973     if (aln->strong) {
974     trimmed=true;
975     bestalns.Add(aln);
976     break; //will check the rest next time
977     }
978     else bestalns.Add(aln);
979     }
980 gpertea 180 }//forward and reverse adaptors
981 gpertea 185 if (trimmed) break; //will check the rest in the next cycle
982 gpertea 180 }//for each 3' adaptor
983 gpertea 185 if (bestalns.Count()>0) {
984     GXAlnInfo* aln=bestalns[0];
985     if (aln->sl-1 > wlen-aln->sr) {
986     //keep left side
987     l3-=(wlen-aln->sl+1);
988     if (l3<0) l3=0;
989     }
990     else { //keep right side
991     l5+=aln->sr;
992     if (l5>=rlen) l5=rlen-1;
993     }
994     //delete aln;
995     //if (l3-l5+1<min_read_len) return true;
996     wseq=seq.substr(l5,l3-l5+1);
997     wlen=wseq.length();
998     return true; //break the loops here to report a good find
999     }
1000     return false;
1001 gpertea 4 }
1002    
1003 gpertea 180 bool trim_adaptor5(GStr& seq, int&l5, int &l3) {
1004     if (adaptors5.Count()==0) return false;
1005 gpertea 4 int rlen=seq.length();
1006     l5=0;
1007     l3=rlen-1;
1008 gpertea 102 bool trimmed=false;
1009 gpertea 175 GStr wseq(seq);
1010 gpertea 103 int wlen=rlen;
1011 gpertea 175 GXSeqData seqdata;
1012 gpertea 180 int numruns=revCompl ? 2 : 1;
1013 gpertea 183 GList<GXAlnInfo> bestalns(true, true, false);
1014 gpertea 180 for (int ai=0;ai<adaptors5.Count();ai++) {
1015     for (int r=0;r<numruns;r++) {
1016     if (r) {
1017     seqdata.update(adaptors5[ai].seqr.chars(), adaptors5[ai].seqr.length(),
1018     adaptors5[ai].pzr, wseq.chars(), wlen, adaptors5[ai].amlen);
1019     }
1020     else {
1021     seqdata.update(adaptors5[ai].seq.chars(), adaptors5[ai].seq.length(),
1022     adaptors5[ai].pz, wseq.chars(), wlen, adaptors5[ai].amlen);
1023     }
1024 gpertea 182 GXAlnInfo* aln=match_adaptor(seqdata, adaptors5[ai].trim_type, gxmem_l, 90);
1025     if (aln) {
1026 gpertea 183 if (aln->strong) {
1027     trimmed=true;
1028     bestalns.Add(aln);
1029     break; //will check the rest next time
1030     }
1031     else bestalns.Add(aln);
1032     }
1033     } //forward and reverse?
1034     if (trimmed) break; //will check the rest in the next cycle
1035     }//for each 5' adaptor
1036     if (bestalns.Count()>0) {
1037     GXAlnInfo* aln=bestalns[0];
1038 gpertea 182 if (aln->sl-1 > wlen-aln->sr) {
1039 gpertea 180 //keep left side
1040 gpertea 182 l3-=(wlen-aln->sl+1);
1041 gpertea 180 if (l3<0) l3=0;
1042     }
1043     else { //keep right side
1044 gpertea 182 l5+=aln->sr;
1045 gpertea 180 if (l5>=rlen) l5=rlen-1;
1046     }
1047 gpertea 183 //delete aln;
1048     //if (l3-l5+1<min_read_len) return true;
1049 gpertea 180 wseq=seq.substr(l5,l3-l5+1);
1050     wlen=wseq.length();
1051 gpertea 183 return true; //break the loops here to report a good find
1052     }
1053     return false;
1054 gpertea 98 }
1055 gpertea 4
1056 gpertea 170 //convert qvs to/from phred64 from/to phread33
1057 gpertea 12 void convertPhred(GStr& q) {
1058     for (int i=0;i<q.length();i++) q[i]+=qv_cvtadd;
1059 gpertea 4 }
1060    
1061 gpertea 12 void convertPhred(char* q, int len) {
1062     for (int i=0;i<len;i++) q[i]+=qv_cvtadd;
1063 gpertea 4 }
1064    
1065 gpertea 170 bool getFastxRec(GLineReader& fq, GStr& rseq, GStr& rqv,
1066 gpertea 13 GStr& rname, GStr& rinfo, GStr& infname) {
1067     rseq="";
1068     rqv="";
1069     rname="";
1070     rinfo="";
1071     if (fq.eof()) return false;
1072     char* l=fq.getLine();
1073     while (l!=NULL && (l[0]==0 || isspace(l[0]))) l=fq.getLine(); //ignore empty lines
1074     if (l==NULL) return false;
1075     /* if (rawFormat) {
1076     //TODO: implement raw qseq parsing here
1077     //if (raw type=N) then continue; //skip invalid/bad records
1078     } //raw qseq format
1079     else { // FASTQ or FASTA */
1080     isfasta=(l[0]=='>');
1081 gpertea 170 if (!isfasta && l[0]!='@') GError("Error: fasta/fastq record marker not found(%s)\n%s\n",
1082 gpertea 13 infname.chars(), l);
1083     GStr s(l);
1084     rname=&(l[1]);
1085     for (int i=0;i<rname.length();i++)
1086 gpertea 170 if (rname[i]<=' ') {
1087     if (i<rname.length()-2) rinfo=rname.substr(i+1);
1088     rname.cut(i);
1089     break;
1090 gpertea 13 }
1091     //now get the sequence
1092 gpertea 170 if ((l=fq.getLine())==NULL)
1093 gpertea 13 GError("Error: unexpected EOF after header for read %s (%s)\n",
1094     rname.chars(), infname.chars());
1095     rseq=l; //this must be the DNA line
1096     while ((l=fq.getLine())!=NULL) {
1097     //seq can span multiple lines
1098     if (l[0]=='>' || l[0]=='+') {
1099 gpertea 170 fq.pushBack();
1100 gpertea 13 break; //
1101     }
1102     rseq+=l;
1103 gpertea 170 } //check for multi-line seq
1104 gpertea 13 if (!isfasta) { //reading fastq quality values, which can also be multi-line
1105     if ((l=fq.getLine())==NULL)
1106     GError("Error: unexpected EOF after sequence for %s\n", rname.chars());
1107     if (l[0]!='+') GError("Error: fastq qv header marker not detected!\n");
1108 gpertea 170 if ((l=fq.getLine())==NULL)
1109 gpertea 13 GError("Error: unexpected EOF after qv header for %s\n", rname.chars());
1110     rqv=l;
1111 gpertea 170 //if (rqv.length()!=rseq.length())
1112 gpertea 13 // GError("Error: qv len != seq len for %s\n", rname.chars());
1113     while (rqv.length()<rseq.length() && ((l=fq.getLine())!=NULL)) {
1114     rqv+=l; //append to qv string
1115     }
1116     }// fastq
1117     // } //<-- FASTA or FASTQ
1118 gpertea 94 rseq.upper();
1119 gpertea 13 return true;
1120     }
1121    
1122 gpertea 105 #ifdef GDEBUG
1123     void showTrim(GStr& s, int l5, int l3) {
1124 gpertea 177 if (l5>0 || l3==0) {
1125 gpertea 105 color_bg(c_red);
1126     }
1127     for (int i=0;i<s.length()-1;i++) {
1128     if (i && i==l5) color_resetbg();
1129     fprintf(stderr, "%c", s[i]);
1130 gpertea 177 if (i && i==l3) color_bg(c_red);
1131 gpertea 105 }
1132     fprintf(stderr, "%c", s[s.length()-1]);
1133     color_reset();
1134     fprintf(stderr, "\n");
1135     }
1136     #endif
1137    
1138 gpertea 13 char process_read(GStr& rname, GStr& rseq, GStr& rqv, int &l5, int &l3) {
1139 gpertea 170 //returns 0 if the read was untouched, 1 if it was just trimmed
1140 gpertea 13 // and a trash code if it was trashed
1141     l5=0;
1142     l3=rseq.length()-1;
1143 gpertea 103 #ifdef GDEBUG
1144 gpertea 108 //rseq.reverse();
1145 gpertea 103 GMessage(">%s\n", rname.chars());
1146 gpertea 105 GMessage("%s\n",rseq.chars());
1147 gpertea 103 #endif
1148 gpertea 13 if (l3-l5+1<min_read_len) {
1149     return 's';
1150     }
1151     GStr wseq(rseq.chars());
1152     GStr wqv(rqv.chars());
1153     int w5=l5;
1154     int w3=l3;
1155     //first do the q-based trimming
1156     if (qvtrim_qmin!=0 && !wqv.is_empty() && qtrim(wqv, w5, w3)) { // qv-threshold trimming
1157     if (w3-w5+1<min_read_len) {
1158     return 'Q'; //invalid read
1159     }
1160     //-- keep only the w5..w3 range
1161     l5=w5;
1162     l3=w3;
1163     wseq=wseq.substr(w5, w3-w5+1);
1164     if (!wqv.is_empty())
1165     wqv=wqv.substr(w5, w3-w5+1);
1166     } //qv trimming
1167     // N-trimming on the remaining read seq
1168     if (ntrim(wseq, w5, w3)) {
1169     //GMessage("before: %s\n",wseq.chars());
1170     //GMessage("after : %s (%d)\n",wseq.substr(w5,w3-w5+1).chars(),w3-w5+1);
1171     l5+=w5;
1172     l3-=(wseq.length()-1-w3);
1173     if (w3-w5+1<min_read_len) {
1174     return 'N'; //to be trashed
1175     }
1176     //-- keep only the w5..w3 range
1177     wseq=wseq.substr(w5, w3-w5+1);
1178     if (!wqv.is_empty())
1179     wqv=wqv.substr(w5, w3-w5+1);
1180     w5=0;
1181     w3=wseq.length()-1;
1182     }
1183 gpertea 103 char trim_code;
1184 gpertea 180 //clean the more dirty end first - 3'
1185     int prev_w5=0;
1186     int prev_w3=0;
1187     bool w3upd=true;
1188     bool w5upd=true;
1189 gpertea 103 do {
1190     trim_code=0;
1191 gpertea 180 if (w3upd && trim_poly3(wseq, w5, w3, polyA_seed)) {
1192 gpertea 103 trim_code='A';
1193     }
1194 gpertea 180 else if (w3upd && trim_poly3(wseq, w5, w3, polyT_seed)) {
1195 gpertea 103 trim_code='T';
1196     }
1197 gpertea 180 else if (w5upd && trim_poly5(wseq, w5, w3, polyA_seed)) {
1198 gpertea 103 trim_code='A';
1199     }
1200 gpertea 180 else if (w5upd && trim_poly5(wseq, w5, w3, polyT_seed)) {
1201 gpertea 103 trim_code='T';
1202     }
1203 gpertea 180 else if (trim_adaptor5(wseq, w5, w3)) {
1204     trim_code='5';
1205     }
1206     else if (trim_adaptor3(wseq, w5, w3)) {
1207 gpertea 103 trim_code='3';
1208     }
1209     if (trim_code) {
1210 gpertea 180 w3upd=(w3!=prev_w3);
1211     w5upd=(w5!=prev_w5);
1212     if (w3upd) prev_w3=w3;
1213     if (w5upd) prev_w5=w5;
1214     #ifdef GDEBUG
1215 gpertea 105 GMessage("#### TRIM by '%c' code ( w5-w3 = %d-%d ):\n",trim_code, w5,w3);
1216     showTrim(wseq, w5, w3);
1217 gpertea 180 #endif
1218 gpertea 13 int trimlen=wseq.length()-(w3-w5+1);
1219 gpertea 103 num_trimmed3++;
1220 gpertea 170 if (trimlen<min_trimmed3)
1221 gpertea 103 min_trimmed3=trimlen;
1222 gpertea 13 l5+=w5;
1223     l3-=(wseq.length()-1-w3);
1224     if (w3-w5+1<min_read_len) {
1225 gpertea 103 return trim_code;
1226 gpertea 13 }
1227     //-- keep only the w5..w3 range
1228     wseq=wseq.substr(w5, w3-w5+1);
1229     if (!wqv.is_empty())
1230     wqv=wqv.substr(w5, w3-w5+1);
1231 gpertea 103 }//trimming at 3' end
1232     } while (trim_code);
1233    
1234 gpertea 13 if (doCollapse) {
1235     //keep read for later
1236     FqDupRec* dr=dhash.Find(wseq.chars());
1237     if (dr==NULL) { //new entry
1238 gpertea 170 //if (prefix.is_empty())
1239     dhash.Add(wseq.chars(),
1240 gpertea 13 new FqDupRec(&wqv, rname.chars()));
1241     //else dhash.Add(wseq.chars(), new FqDupRec(wqv.chars(),wqv.length()));
1242     }
1243     else
1244     dr->add(wqv);
1245     } //collapsing duplicates
1246     else { //not collapsing duplicates
1247     //apply the dust filter now
1248     if (doDust) {
1249     int dustbases=dust(wseq);
1250     if (dustbases>(wseq.length()>>1)) {
1251     return 'D';
1252     }
1253     }
1254     } //not collapsing duplicates
1255     return (l5>0 || l3<rseq.length()-1) ? 1 : 0;
1256     }
1257    
1258     void printHeader(FILE* f_out, char recmarker, GStr& rname, GStr& rinfo) {
1259     //GMessage("printing Header..%c%s\n",recmarker, rname.chars());
1260     if (rinfo.is_empty()) fprintf(f_out, "%c%s\n",recmarker,rname.chars());
1261     else fprintf(f_out, "%c%s %s\n",recmarker, rname.chars(), rinfo.chars());
1262     }
1263    
1264     void writeRead(FILE* f_out, GStr& rname, GStr& rinfo, GStr& rseq, GStr& rqv, int& outcounter) {
1265     outcounter++;
1266     bool asFasta=(rqv.is_empty() || fastaOutput);
1267     if (asFasta) {
1268     if (prefix.is_empty()) {
1269     printHeader(f_out, '>',rname,rinfo);
1270     fprintf(f_out, "%s\n", rseq.chars()); //plain one-line fasta for now
1271     }
1272     else {
1273 gpertea 170 fprintf(f_out, ">%s%08d\n%s\n", prefix.chars(), outcounter,
1274 gpertea 13 rseq.chars());
1275     }
1276     }
1277     else { //fastq
1278     if (convert_phred) convertPhred(rqv);
1279     if (prefix.is_empty()) {
1280     printHeader(f_out, '@', rname, rinfo);
1281     fprintf(f_out, "%s\n+\n%s\n", rseq.chars(), rqv.chars());
1282     }
1283     else
1284 gpertea 170 fprintf(f_out, "@%s_%08d\n%s\n+\n%s\n", prefix.chars(), outcounter,
1285 gpertea 13 rseq.chars(),rqv.chars() );
1286     }
1287     }
1288    
1289     void trash_report(char trashcode, GStr& rname, FILE* freport) {
1290     if (freport==NULL || trashcode<=' ') return;
1291     if (trashcode=='3' || trashcode=='5') {
1292 gpertea 103 fprintf(freport, "%s\ta%c\n",rname.chars(),trashcode);
1293 gpertea 13 }
1294     else {
1295     fprintf(freport, "%s\t%c\n",rname.chars(),trashcode);
1296     }
1297     //tcounter++;
1298     }
1299    
1300     GStr getFext(GStr& s, int* xpos=NULL) {
1301     //s must be a filename without a path
1302     GStr r("");
1303     if (xpos!=NULL) *xpos=0;
1304     if (s.is_empty() || s=="-") return r;
1305     int p=s.rindex('.');
1306     int d=s.rindex('/');
1307     if (p<=0 || p>s.length()-2 || p<s.length()-7 || p<d) return r;
1308     r=s.substr(p+1);
1309     if (xpos!=NULL) *xpos=p+1;
1310     r.lower();
1311     return r;
1312     }
1313    
1314     void baseFileName(GStr& fname) {
1315     //remove all known extensions, like .txt,fq,fastq,fasta,fa)(.gz .gzip .bz2 .bzip2) .
1316     int xpos=0;
1317     GStr fext=getFext(fname, &xpos);
1318     if (xpos<=1) return;
1319     bool extdel=false;
1320     GStr f2;
1321     if (fext=="z" || fext=="zip") {
1322     extdel=true;
1323     }
1324     else if (fext.length()>=2) {
1325     f2=fext.substr(0,2);
1326     extdel=(f2=="gz" || f2=="bz");
1327     }
1328     if (extdel) {
1329     fname.cut(xpos-1);
1330     fext=getFext(fname, &xpos);
1331     if (xpos<=1) return;
1332     }
1333     extdel=false;
1334     if (fext=="f" || fext=="fq" || fext=="txt" || fext=="seq" || fext=="sequence") {
1335     extdel=true;
1336     }
1337     else if (fext.length()>=2) {
1338     extdel=(fext.substr(0,2)=="fa");
1339     }
1340     if (extdel) fname.cut(xpos-1);
1341     GStr fncp(fname);
1342     fncp.lower();
1343     fncp.chomp("seq");
1344     fncp.chomp("sequence");
1345     fncp.trimR("_.");
1346     if (fncp.length()<fname.length()) fname.cut(fncp.length());
1347     }
1348    
1349     FILE* prepOutFile(GStr& infname, GStr& pocmd) {
1350     FILE* f_out=NULL;
1351     GStr fname(getFileName(infname.chars()));
1352     //eliminate known extensions
1353     baseFileName(fname);
1354     if (outsuffix.is_empty() || outsuffix=="-") { return stdout; }
1355     else if (pocmd.is_empty()) {
1356     GStr oname(fname);
1357     oname.append('.');
1358     oname.append(outsuffix);
1359     f_out=fopen(oname.chars(),"w");
1360     if (f_out==NULL) GError("Error: cannot create '%s'\n",oname.chars());
1361     }
1362     else {
1363     GStr oname(">");
1364     oname.append(fname);
1365     oname.append('.');
1366     oname.append(outsuffix);
1367     pocmd.append(oname);
1368     f_out=popen(pocmd.chars(), "w");
1369     if (f_out==NULL) GError("Error: cannot popen '%s'\n",pocmd.chars());
1370     }
1371     return f_out;
1372     }
1373    
1374     void guess_unzip(GStr& fname, GStr& picmd) {
1375     GStr fext=getFext(fname);
1376     if (fext=="gz" || fext=="gzip" || fext=="z") {
1377     picmd="gzip -cd ";
1378     }
1379     else if (fext=="bz2" || fext=="bzip2" || fext=="bz" || fext=="bzip") {
1380     picmd="bzip2 -cd ";
1381     }
1382     }
1383    
1384 gpertea 180 void addAdaptor(GVec<CASeqData>& adaptors, GStr& seq, GAlnTrimType trim_type) {
1385 gpertea 170 //TODO: prepare CASeqData here, and collect hexamers as well
1386 gpertea 180 if (seq.is_empty() || seq=="-" ||
1387     seq=="N/A" || seq==".") return;
1388    
1389 gpertea 172 CASeqData adata(revCompl);
1390 gpertea 180 int idx=adaptors.Add(adata);
1391 gpertea 172 if (idx<0) GError("Error: failed to add adaptor!\n");
1392 gpertea 180 adaptors[idx].trim_type=trim_type;
1393     adaptors[idx].update(seq.chars());
1394 gpertea 170 }
1395    
1396    
1397 gpertea 180 int loadAdaptors(const char* fname) {
1398 gpertea 94 GLineReader lr(fname);
1399     char* l;
1400     while ((l=lr.nextLine())!=NULL) {
1401     if (lr.length()<=3 || l[0]=='#') continue;
1402 gpertea 98 if ( l[0]==' ' || l[0]=='\t' || l[0]==',' ||
1403     l[0]==';'|| l[0]==':' ) {
1404 gpertea 94 int i=1;
1405     while (l[i]!=0 && isspace(l[i])) {
1406     i++;
1407     }
1408     if (l[i]!=0) {
1409 gpertea 102 GStr s(&(l[i]));
1410 gpertea 108 #ifdef GDEBUG
1411     //s.reverse();
1412     #endif
1413 gpertea 180 addAdaptor(adaptors3, s, galn_TrimRight);
1414 gpertea 94 continue;
1415     }
1416     }
1417     else {
1418 gpertea 98 GStr s(l);
1419     s.startTokenize("\t ;,:");
1420     GStr a5,a3;
1421     if (s.nextToken(a5))
1422 gpertea 180 s.nextToken(a3);
1423     else continue; //no tokens on this line
1424     GAlnTrimType ttype5=galn_TrimLeft;
1425 gpertea 102 a5.upper();
1426     a3.upper();
1427 gpertea 180 if (a3.is_empty() || a3==a5 || a3=="=") {
1428     a3.clear();
1429     ttype5=galn_TrimEither;
1430     }
1431 gpertea 108 #ifdef GDEBUG
1432     // a5.reverse();
1433     // a3.reverse();
1434     #endif
1435 gpertea 180 addAdaptor(adaptors5, a5, ttype5);
1436     addAdaptor(adaptors3, a3, galn_TrimRight);
1437 gpertea 94 }
1438     }
1439 gpertea 180 return adaptors5.Count()+adaptors3.Count();
1440 gpertea 94 }
1441    
1442 gpertea 170 void setupFiles(FILE*& f_in, FILE*& f_in2, FILE*& f_out, FILE*& f_out2,
1443     GStr& s, GStr& infname, GStr& infname2) {
1444 gpertea 13 // uses outsuffix to generate output file names and open file handles as needed
1445     infname="";
1446     infname2="";
1447     f_in=NULL;
1448     f_in2=NULL;
1449     f_out=NULL;
1450     f_out2=NULL;
1451     //analyze outsuffix intent
1452     GStr pocmd;
1453 gpertea 76 if (outsuffix=="-") {
1454     f_out=stdout;
1455     }
1456     else {
1457     GStr ox=getFext(outsuffix);
1458     if (ox.length()>2) ox=ox.substr(0,2);
1459     if (ox=="gz") pocmd="gzip -9 -c ";
1460     else if (ox=="bz") pocmd="bzip2 -9 -c ";
1461     }
1462 gpertea 13 if (s=="-") {
1463     f_in=stdin;
1464     infname="stdin";
1465     f_out=prepOutFile(infname, pocmd);
1466     return;
1467     } // streaming from stdin
1468     s.startTokenize(",:");
1469     s.nextToken(infname);
1470     bool paired=s.nextToken(infname2);
1471 gpertea 170 if (fileExists(infname.chars())==0)
1472 gpertea 13 GError("Error: cannot find file %s!\n",infname.chars());
1473     GStr fname(getFileName(infname.chars()));
1474     GStr picmd;
1475     guess_unzip(fname, picmd);
1476     if (picmd.is_empty()) {
1477     f_in=fopen(infname.chars(), "r");
1478     if (f_in==NULL) GError("Error opening file '%s'!\n",infname.chars());
1479     }
1480     else {
1481     picmd.append(infname);
1482     f_in=popen(picmd.chars(), "r");
1483     if (f_in==NULL) GError("Error at popen %s!\n", picmd.chars());
1484     }
1485 gpertea 76 if (f_out==stdout) {
1486     if (paired) GError("Error: output suffix required for paired reads\n");
1487     return;
1488     }
1489 gpertea 13 f_out=prepOutFile(infname, pocmd);
1490     if (!paired) return;
1491     if (doCollapse) GError("Error: sorry, -C option cannot be used with paired reads!\n");
1492     // ---- paired reads:-------------
1493 gpertea 170 if (fileExists(infname2.chars())==0)
1494 gpertea 13 GError("Error: cannot find file %s!\n",infname2.chars());
1495     picmd="";
1496     GStr fname2(getFileName(infname2.chars()));
1497     guess_unzip(fname2, picmd);
1498     if (picmd.is_empty()) {
1499     f_in2=fopen(infname2.chars(), "r");
1500     if (f_in2==NULL) GError("Error opening file '%s'!\n",infname2.chars());
1501     }
1502     else {
1503     picmd.append(infname2);
1504     f_in2=popen(picmd.chars(), "r");
1505     if (f_in2==NULL) GError("Error at popen %s!\n", picmd.chars());
1506     }
1507     f_out2=prepOutFile(infname2, pocmd);
1508     }