ViewVC Help
View File | Revision Log | Show Annotations | View Changeset | Root Listing
root/gclib/fqtrim/fqtrim.cpp
Revision: 183
Committed: Thu Feb 16 21:59:33 2012 UTC (7 years, 7 months ago) by gpertea
File size: 44373 byte(s)
Log Message:
works

Line User Rev File contents
1 gpertea 4 #include "GArgs.h"
2     #include "GStr.h"
3     #include "GHash.hh"
4     #include "GList.hh"
5 gpertea 13 #include <ctype.h>
6 gpertea 94 #include "GAlnExtend.h"
7 gpertea 4
8     #define USAGE "Usage:\n\
9 gpertea 180 fqtrim [{-5 <5adaptor> -3 <3adaptor>|-f <adaptors_file>}] [-a <min_matchlen>]\\\n\
10 gpertea 172 [-R] [-q <minq> [-t <trim_max_len>]] [-p {64|33}] [-o <outsuffix>]\\\n\
11 gpertea 76 [-l <minlen>] [-C] [-D] [-Q] [-n <rename_prefix>] [-r <discarded.lst>]\\\n\
12     <input.fq>[,<input_mates.fq>\n\
13 gpertea 4 \n\
14 gpertea 180 Trim low quality bases at the 3' end and can trim adaptor sequence(s), filter\n\
15 gpertea 76 for low complexity and collapse duplicate reads.\n\
16 gpertea 13 If read pairs should be trimmed and kept together (i.e. without discarding\n\
17     one read in a pair), the two file names should be given delimited by a comma\n\
18     or a colon character\n\
19 gpertea 4 \n\
20     Options:\n\
21     -n rename all the reads using the <prefix> followed by a read counter;\n\
22     if -C option was given, the suffix \"_x<N>\" is appended, with <N> being\n\
23     the read duplication count\n\
24 gpertea 76 -o unless this parameter is '-', write the trimmed/filtered reads to \n\
25     file(s) named <input>.<outsuffix> which will be created in the \n\
26     current (working) directory; (writes to stdout if -o- is given);\n\
27     a suffix ending with .gz, .gzip or .bz2 will enforce compression\n\
28 gpertea 180 -f file with adaptor sequences to trim, each line having this format:\n\
29     <5'-adaptor-sequence> <3'-adaptor-sequence>\n\
30     -5 trim the given adaptor or primer sequence at the 5' end of each read\n\
31 gpertea 4 (e.g. -5 CGACAGGTTCAGAGTTCTACAGTCCGACGATC)\n\
32 gpertea 180 -3 trim the given adaptor sequence at the 3' end of each read\n\
33 gpertea 4 (e.g. -3 TCGTATGCCGTCTTCTGCTTG)\n\
34 gpertea 103 -A disable polyA/T trimming (enabled by default)\n\
35 gpertea 94 -y minimum length of exact match to adaptor sequence at the proper end (6)\n\
36 gpertea 13 -q trim bases with quality value lower than <minq> (starting at the 3' end)\n\
37 gpertea 76 -t for -q option, maximum trimming at the 3' end is limited to <trim_max_len>\n\
38 gpertea 12 -m maximum percentage of Ns allowed in a read after trimming (default 7)\n\
39     -l minimum \"clean\" length after trimming that a read must have\n\
40     in order to pass the filter (default: 16)\n\
41 gpertea 4 -r write a simple \"trash report\" file listing the discarded reads with a\n\
42     one letter code for the reason why they were trashed\n\
43 gpertea 12 -D apply a low-complexity (dust) filter and discard any read that has over\n\
44     50% of its length detected as low complexity\n\
45 gpertea 4 -C collapse duplicate reads and add a prefix with their count to the read\n\
46 gpertea 12 name (e.g. for microRNAs)\n\
47     -p input is phred64/phred33 (use -p64 or -p33)\n\
48     -Q convert quality values to the other Phred qv type\n\
49     -V verbose processing\n\
50 gpertea 4 "
51 gpertea 170
52 gpertea 13 //-z for -o option, the output stream(s) will be first piped into the given\n
53     // <zcmd> command, which must output to stdout (e.g. -z 'bzip2 -9 -c')\n
54    
55 gpertea 170
56 gpertea 180 // example 3' adaptor for miRNAs: TCGTATGCCGTCTTCTGCTTG
57 gpertea 12
58 gpertea 76 //For paired reads sequencing:
59 gpertea 12 //3' : ACACTCTTTCCCTACACGACGCTCTTCCGATCT
60     //5' : GATCGGAAGAGCGGTTCAGCAGGAATGCCGAG
61 gpertea 13 //FILE* f_out=NULL; //stdout if not provided
62     //FILE* f_out2=NULL; //for paired reads
63     //FILE* f_in=NULL; //input fastq (stdin if not provided)
64     //FILE* f_in2=NULL; //for paired reads
65 gpertea 170
66 gpertea 4 FILE* freport=NULL;
67 gpertea 13
68 gpertea 4 bool debug=false;
69 gpertea 12 bool verbose=false;
70 gpertea 4 bool doCollapse=false;
71     bool doDust=false;
72 gpertea 103 bool doPolyTrim=true;
73 gpertea 13 bool fastaOutput=false;
74 gpertea 4 bool trashReport=false;
75 gpertea 172 bool revCompl=false; //also reverse complement adaptor sequences
76 gpertea 4 int min_read_len=16;
77 gpertea 12 double max_perc_N=7.0;
78 gpertea 4 int dust_cutoff=16;
79     bool isfasta=false;
80 gpertea 12 bool convert_phred=false;
81 gpertea 170 GStr outsuffix; // -o
82 gpertea 13 GStr prefix;
83     GStr zcmd;
84     int num_trimmed5=0;
85     int num_trimmed3=0;
86     int num_trimmedN=0;
87     int num_trimmedQ=0;
88     int min_trimmed5=INT_MAX;
89     int min_trimmed3=INT_MAX;
90 gpertea 12
91 gpertea 13 int qvtrim_qmin=0;
92     int qvtrim_max=0; //for -q, do not trim at 3'-end more than this number of bases
93 gpertea 12 int qv_phredtype=0; // could be 64 or 33 (0 means undetermined yet)
94     int qv_cvtadd=0; //could be -31 or +31
95    
96 gpertea 94 // adaptor matching metrics -- for X-drop ungapped extension
97 gpertea 177 //const int match_reward=2;
98     //const int mismatch_penalty=3;
99 gpertea 103 const int match_reward=2;
100 gpertea 177 const int mismatch_penalty=4;
101     const int Xdrop=10;
102 gpertea 103
103 gpertea 94 const int poly_m_score=2; //match score
104     const int poly_mis_score=-3; //mismatch
105 gpertea 88 const int poly_dropoff_score=7;
106 gpertea 94 int poly_minScore=12; //i.e. an exact match of 6 bases at the proper ends WILL be trimmed
107 gpertea 4
108 gpertea 94 const char *polyA_seed="AAAA";
109     const char *polyT_seed="TTTT";
110 gpertea 12
111 gpertea 170 struct CASeqData {
112 gpertea 180 //positional data for every possible hexamer in an adaptor
113     GVec<uint16>* pz[4096]; //0-based coordinates of all possible hexamers in the adaptor sequence
114     GVec<uint16>* pzr[4096]; //0-based coordinates of all possible hexamers for the reverse complement of the adaptor sequence
115     GStr seq; //actual adaptor sequence data
116 gpertea 170 GStr seqr; //reverse complement sequence
117 gpertea 180 int amlen; //fraction of adaptor length matching that's
118 gpertea 175 //enough to consider the alignment
119 gpertea 180 GAlnTrimType trim_type;
120 gpertea 172 bool use_reverse;
121 gpertea 175 CASeqData(bool rev=false):seq(),seqr(),
122     amlen(0), use_reverse(rev) {
123 gpertea 180 trim_type=galn_None; //should be updated later!
124 gpertea 172 for (int i=0;i<4096;i++) {
125     pz[i]=NULL;
126     pzr[i]=NULL;
127     }
128     }
129    
130     void update(const char* s) {
131 gpertea 175 seq=s;
132     table6mers(seq.chars(), seq.length(), pz);
133     amlen=iround(double(seq.length())*0.8);
134     if (amlen<12)
135     amlen=12;
136     if (!use_reverse) return;
137     //reverse complement
138     seqr=s;
139     int slen=seq.length();
140     for (int i=0;i<slen;i++)
141     seqr[i]=ntComplement(seq[slen-i-1]);
142     table6mers(seqr.chars(), seqr.length(), pzr);
143 gpertea 170 }
144    
145 gpertea 172 ~CASeqData() {
146     for (int i=0;i<4096;i++) {
147     delete pz[i];
148     delete pzr[i];
149 gpertea 170 }
150     }
151     };
152    
153 gpertea 180 GVec<CASeqData> adaptors5;
154     GVec<CASeqData> adaptors3;
155 gpertea 170
156 gpertea 103 CGreedyAlignData* gxmem_l=NULL;
157     CGreedyAlignData* gxmem_r=NULL;
158    
159 gpertea 4 // element in dhash:
160     class FqDupRec {
161     public:
162     int count; //how many of these reads are the same
163     int len; //length of qv
164     char* firstname; //optional, only if we want to keep the original read names
165     char* qv;
166     FqDupRec(GStr* qstr=NULL, const char* rname=NULL) {
167     len=0;
168     qv=NULL;
169     firstname=NULL;
170     count=0;
171     if (qstr!=NULL) {
172     qv=Gstrdup(qstr->chars());
173     len=qstr->length();
174     count++;
175     }
176     if (rname!=NULL) firstname=Gstrdup(rname);
177     }
178     ~FqDupRec() {
179     GFREE(qv);
180     GFREE(firstname);
181     }
182     void add(GStr& d) { //collapse another record into this one
183     if (d.length()!=len)
184     GError("Error at FqDupRec::add(): cannot collapse reads with different length!\n");
185     count++;
186     for (int i=0;i<len;i++)
187     qv[i]=(qv[i]+d[i])/2;
188     }
189     };
190    
191     void openfw(FILE* &f, GArgs& args, char opt) {
192     GStr s=args.getOpt(opt);
193     if (!s.is_empty()) {
194     if (s=='-') f=stdout;
195     else {
196 gpertea 13 f=fopen(s.chars(),"w");
197 gpertea 4 if (f==NULL) GError("Error creating file: %s\n", s.chars());
198     }
199     }
200     }
201    
202     #define FWCLOSE(fh) if (fh!=NULL && fh!=stdout) fclose(fh)
203     #define FRCLOSE(fh) if (fh!=NULL && fh!=stdin) fclose(fh)
204    
205     GHash<FqDupRec> dhash; //hash to keep track of duplicates
206    
207 gpertea 180 void addAdaptor(GVec<CASeqData>& adaptors, GStr& seq, GAlnTrimType trim_type);
208     int loadAdaptors(const char* fname);
209 gpertea 94
210 gpertea 170 void setupFiles(FILE*& f_in, FILE*& f_in2, FILE*& f_out, FILE*& f_out2,
211     GStr& s, GStr& infname, GStr& infname2);
212 gpertea 13 // uses outsuffix to generate output file names and open file handles as needed
213 gpertea 170
214 gpertea 13 void writeRead(FILE* f_out, GStr& rname, GStr& rinfo, GStr& rseq, GStr& rqv, int& outcounter);
215     void trash_report(char trashcode, GStr& rname, FILE* freport);
216    
217 gpertea 170 bool getFastxRec(GLineReader& fq, GStr& rseq, GStr& rqv,
218 gpertea 13 GStr& rname, GStr& rinfo, GStr& infname);
219    
220 gpertea 170 char process_read(GStr& rname, GStr& rseq, GStr& rqv, int &l5, int &l3);
221 gpertea 13 //returns 0 if the read was untouched, 1 if it was trimmed and a trash code if it was trashed
222    
223 gpertea 12 bool ntrim(GStr& rseq, int &l5, int &l3); //returns true if any trimming occured
224     bool qtrim(GStr& qvs, int &l5, int &l3); //return true if any trimming occured
225 gpertea 4 int dust(GStr& seq);
226 gpertea 94 bool trim_poly5(GStr &seq, int &l5, int &l3, const char* poly_seed); //returns true if any trimming occured
227     bool trim_poly3(GStr &seq, int &l5, int &l3, const char* poly_seed);
228 gpertea 180 bool trim_adaptor5(GStr& seq, int &l5, int &l3); //returns true if any trimming occured
229     bool trim_adaptor3(GStr& seq, int &l5, int &l3);
230 gpertea 4
231 gpertea 12 void convertPhred(char* q, int len);
232     void convertPhred(GStr& q);
233 gpertea 4
234     int main(int argc, char * const argv[]) {
235 gpertea 172 GArgs args(argc, argv, "YQDCRVAl:d:3:5:m:n:r:p:q:f:t:o:z:a:");
236 gpertea 4 int e;
237     if ((e=args.isError())>0) {
238     GMessage("%s\nInvalid argument: %s\n", USAGE, argv[e]);
239     exit(224);
240     }
241 gpertea 9 debug=(args.getOpt('Y')!=NULL);
242 gpertea 13 verbose=(args.getOpt('V')!=NULL);
243 gpertea 12 convert_phred=(args.getOpt('Q')!=NULL);
244 gpertea 4 doCollapse=(args.getOpt('C')!=NULL);
245     doDust=(args.getOpt('D')!=NULL);
246 gpertea 172 revCompl=(args.getOpt('R')!=NULL);
247 gpertea 103 if (args.getOpt('A')) doPolyTrim=false;
248 gpertea 12 /*
249 gpertea 4 rawFormat=(args.getOpt('R')!=NULL);
250     if (rawFormat) {
251     GError("Sorry, raw qseq format parsing is not implemented yet!\n");
252     }
253 gpertea 12 */
254 gpertea 4 prefix=args.getOpt('n');
255     GStr s=args.getOpt('l');
256 gpertea 170 if (!s.is_empty())
257 gpertea 4 min_read_len=s.asInt();
258 gpertea 12 s=args.getOpt('m');
259 gpertea 170 if (!s.is_empty())
260 gpertea 12 max_perc_N=s.asDouble();
261 gpertea 4 s=args.getOpt('d');
262     if (!s.is_empty()) {
263     dust_cutoff=s.asInt();
264     doDust=true;
265     }
266 gpertea 12 s=args.getOpt('q');
267     if (!s.is_empty()) {
268 gpertea 13 qvtrim_qmin=s.asInt();
269 gpertea 12 }
270 gpertea 13 s=args.getOpt('t');
271     if (!s.is_empty()) {
272     qvtrim_max=s.asInt();
273     }
274 gpertea 12 s=args.getOpt('p');
275     if (!s.is_empty()) {
276     int v=s.asInt();
277     if (v==33) {
278     qv_phredtype=33;
279     qv_cvtadd=31;
280     }
281     else if (v==64) {
282     qv_phredtype=64;
283     qv_cvtadd=-31;
284     }
285 gpertea 170 else
286 gpertea 12 GMessage("%s\nInvalid value for -p option (can only be 64 or 33)!\n",USAGE);
287     }
288 gpertea 94 s=args.getOpt('f');
289     if (!s.is_empty()) {
290 gpertea 180 loadAdaptors(s.chars());
291 gpertea 94 }
292 gpertea 180 bool fileAdaptors=adaptors5.Count()+adaptors3.Count();
293 gpertea 94 s=args.getOpt('5');
294     if (!s.is_empty()) {
295 gpertea 180 if (fileAdaptors)
296 gpertea 94 GError("Error: options -5 and -f cannot be used together!\n");
297     s.upper();
298 gpertea 180 addAdaptor(adaptors5, s, galn_TrimLeft);
299 gpertea 4 }
300 gpertea 94 s=args.getOpt('3');
301     if (!s.is_empty()) {
302 gpertea 180 if (fileAdaptors)
303 gpertea 94 GError("Error: options -3 and -f cannot be used together!\n");
304 gpertea 102 s.upper();
305 gpertea 180 addAdaptor(adaptors3, s, galn_TrimRight);
306 gpertea 4 }
307 gpertea 94 s=args.getOpt('y');
308 gpertea 13 if (!s.is_empty()) {
309 gpertea 94 int minmatch=s.asInt();
310     poly_minScore=minmatch*poly_m_score;
311 gpertea 13 }
312 gpertea 170
313 gpertea 13 if (args.getOpt('o')!=NULL) outsuffix=args.getOpt('o');
314 gpertea 76 else outsuffix="-";
315 gpertea 4 trashReport= (args.getOpt('r')!=NULL);
316 gpertea 13 int fcount=args.startNonOpt();
317     if (fcount==0) {
318 gpertea 4 GMessage(USAGE);
319     exit(224);
320     }
321 gpertea 13 if (fcount>1 && doCollapse) {
322     GError("%s Sorry, the -C option only works with a single input.\n", USAGE);
323     }
324     //openfw(f_out, args, 'o');
325     //if (f_out==NULL) f_out=stdout;
326 gpertea 4 if (trashReport)
327     openfw(freport, args, 'r');
328     char* infile=NULL;
329 gpertea 103
330 gpertea 180 if (adaptors5.Count()>0)
331 gpertea 177 //gxmem_l=new CGreedyAlignData(match_reward, mismatch_penalty, Xdrop-2);
332     gxmem_l=new CGreedyAlignData(match_reward, mismatch_penalty, Xdrop);
333 gpertea 180 if (adaptors3.Count()>0)
334 gpertea 103 gxmem_r=new CGreedyAlignData(match_reward, mismatch_penalty, Xdrop);
335    
336 gpertea 4 while ((infile=args.nextNonOpt())!=NULL) {
337 gpertea 103 //for each input file
338 gpertea 13 int incounter=0; //counter for input reads
339     int outcounter=0; //counter for output reads
340     int trash_s=0; //too short from the get go
341     int trash_Q=0;
342     int trash_N=0;
343     int trash_D=0;
344 gpertea 103 int trash_poly=0;
345 gpertea 13 int trash_A3=0;
346     int trash_A5=0;
347     s=infile;
348     GStr infname;
349     GStr infname2;
350     FILE* f_in=NULL;
351     FILE* f_in2=NULL;
352     FILE* f_out=NULL;
353     FILE* f_out2=NULL;
354     bool paired_reads=false;
355     setupFiles(f_in, f_in2, f_out, f_out2, s, infname, infname2);
356     GLineReader fq(f_in);
357     GLineReader* fq2=NULL;
358     if (f_in2!=NULL) {
359     fq2=new GLineReader(f_in2);
360     paired_reads=true;
361     }
362     GStr rseq, rqv, seqid, seqinfo;
363     GStr rseq2, rqv2, seqid2, seqinfo2;
364     while (getFastxRec(fq, rseq, rqv, seqid, seqinfo, infname)) {
365     incounter++;
366     int a5=0, a3=0, b5=0, b3=0;
367     char tcode=0, tcode2=0;
368     tcode=process_read(seqid, rseq, rqv, a5, a3);
369 gpertea 103 if (fq2!=NULL) {
370 gpertea 13 getFastxRec(*fq2, rseq2, rqv2, seqid2, seqinfo2, infname2);
371     if (seqid.substr(0,seqid.length()-1)!=seqid2.substr(0,seqid2.length()-1)) {
372     GError("Error: no paired match for read %s vs %s (%s,%s)\n",
373     seqid.chars(), seqid2.chars(), infname.chars(), infname2.chars());
374     }
375     tcode2=process_read(seqid2, rseq2, rqv2, b5, b3);
376     //decide what to do with this pair and print rseq2 if the pair makes it
377     if (tcode>1 && tcode2<=1) {
378     //"untrash" rseq
379     if (a3-a5+1<min_read_len) {
380     a5=1;
381     if (a3<min_read_len) { a3= GMIN(rseq.length()-1, min_read_len+1); }
382 gpertea 4 }
383 gpertea 13 tcode=1;
384     }
385     else if (tcode<=1 && tcode2>1) {
386     //"untrash" rseq2
387     if (b3-b5+1<min_read_len) {
388     b5=1;
389     if (b3<min_read_len) { b3= GMIN((rseq2.length()-1),(min_read_len+1)); }
390     }
391     tcode2=1;
392     }
393     if (tcode<=1) { //trimmed or left intact -- write it!
394     if (tcode>0) {
395     rseq2=rseq2.substr(b5,b3-b5+1);
396     if (!rqv2.is_empty()) rqv2=rqv2.substr(b5,b3-b5+1);
397 gpertea 4 }
398 gpertea 13 int nocounter=0;
399     writeRead(f_out2, seqid2, seqinfo2, rseq2, rqv2, nocounter);
400     }
401 gpertea 103 } //pair read
402 gpertea 13 if (tcode>1) { //trashed
403 gpertea 103 #ifdef GDEBUG
404 gpertea 175 GMessage(" !!!!TRASH code = %c\n",tcode);
405 gpertea 103 #endif
406 gpertea 13 if (tcode=='s') trash_s++;
407 gpertea 103 else if (tcode=='A' || tcode=='T') trash_poly++;
408 gpertea 13 else if (tcode=='Q') trash_Q++;
409     else if (tcode=='N') trash_N++;
410     else if (tcode=='D') trash_D++;
411     else if (tcode=='3') trash_A3++;
412     else if (tcode=='5') trash_A5++;
413     if (trashReport) trash_report(tcode, seqid, freport);
414     }
415     else if (!doCollapse) { //write it
416     if (tcode>0) {
417     rseq=rseq.substr(a5,a3-a5+1);
418     if (!rqv.is_empty()) rqv=rqv.substr(a5,a3-a5+1);
419     }
420 gpertea 103 #ifdef GDEBUG
421     GMessage(" After trimming:\n");
422 gpertea 105 GMessage("%s\n",rseq.chars());
423 gpertea 103 #endif
424 gpertea 13 writeRead(f_out, seqid, seqinfo, rseq, rqv, outcounter);
425     }
426     } //for each fastq record
427     delete fq2;
428     FRCLOSE(f_in);
429     FRCLOSE(f_in2);
430     if (doCollapse) {
431     outcounter=0;
432     int maxdup_count=1;
433     char* maxdup_seq=NULL;
434     dhash.startIterate();
435     FqDupRec* qd=NULL;
436     char* seq=NULL;
437     while ((qd=dhash.NextData(seq))!=NULL) {
438     GStr rseq(seq);
439     //do the dusting here
440     if (doDust) {
441     int dustbases=dust(rseq);
442     if (dustbases>(rseq.length()>>1)) {
443     if (trashReport && qd->firstname!=NULL) {
444     fprintf(freport, "%s_x%d\tD\n",qd->firstname, qd->count);
445 gpertea 4 }
446 gpertea 13 trash_D+=qd->count;
447     continue;
448     }
449     }
450     outcounter++;
451 gpertea 170 if (qd->count>maxdup_count) {
452 gpertea 13 maxdup_count=qd->count;
453     maxdup_seq=seq;
454     }
455     if (isfasta) {
456     if (prefix.is_empty()) {
457 gpertea 170 fprintf(f_out, ">%s_x%d\n%s\n", qd->firstname, qd->count,
458 gpertea 13 rseq.chars());
459 gpertea 4 }
460 gpertea 13 else { //use custom read name
461     fprintf(f_out, ">%s%08d_x%d\n%s\n", prefix.chars(), outcounter,
462     qd->count, rseq.chars());
463 gpertea 4 }
464 gpertea 13 }
465     else { //fastq format
466     if (convert_phred) convertPhred(qd->qv, qd->len);
467     if (prefix.is_empty()) {
468 gpertea 170 fprintf(f_out, "@%s_x%d\n%s\n+\n%s\n", qd->firstname, qd->count,
469 gpertea 13 rseq.chars(), qd->qv);
470 gpertea 4 }
471 gpertea 13 else { //use custom read name
472     fprintf(f_out, "@%s%08d_x%d\n%s\n+\n%s\n", prefix.chars(), outcounter,
473     qd->count, rseq.chars(), qd->qv);
474     }
475     }
476     }//for each element of dhash
477     if (maxdup_count>1) {
478     GMessage("Maximum read multiplicity: x %d (read: %s)\n",maxdup_count, maxdup_seq);
479 gpertea 4 }
480 gpertea 13 } //collapse entries
481     if (verbose) {
482     if (paired_reads) {
483     GMessage(">Input files : %s , %s\n", infname.chars(), infname2.chars());
484     GMessage("Number of input pairs :%9d\n", incounter);
485     GMessage(" Output pairs :%9d\n", outcounter);
486     }
487     else {
488     GMessage(">Input file : %s\n", infname.chars());
489     GMessage("Number of input reads :%9d\n", incounter);
490     GMessage(" Output reads :%9d\n", outcounter);
491     }
492     GMessage("------------------------------------\n");
493     if (num_trimmed5)
494     GMessage(" 5' trimmed :%9d (min. trim: %d)\n", num_trimmed5, min_trimmed5);
495     if (num_trimmed3)
496     GMessage(" 3' trimmed :%9d (min. trim: %d)\n", num_trimmed3, min_trimmed3);
497     GMessage("------------------------------------\n");
498     if (trash_s>0)
499     GMessage(" Trashed by length:%9d\n", trash_s);
500     if (trash_N>0)
501     GMessage(" Trashed by N%%:%9d\n", trash_N);
502     if (trash_Q>0)
503     GMessage("Trashed by low quality:%9d\n", trash_Q);
504 gpertea 103 if (trash_poly>0)
505     GMessage(" Trashed by poly-A/T:%9d\n", trash_poly);
506 gpertea 13 if (trash_A5>0)
507 gpertea 180 GMessage(" Trashed by 5' adaptor:%9d\n", trash_A5);
508 gpertea 13 if (trash_A3>0)
509 gpertea 180 GMessage(" Trashed by 3' adaptor:%9d\n", trash_A3);
510 gpertea 13 }
511     if (trashReport) {
512     FWCLOSE(freport);
513 gpertea 4 }
514 gpertea 13 FWCLOSE(f_out);
515     FWCLOSE(f_out2);
516     } //while each input file
517 gpertea 103 delete gxmem_l;
518     delete gxmem_r;
519 gpertea 12 //getc(stdin);
520 gpertea 4 }
521    
522     class NData {
523     public:
524     int NPos[1024]; //there should be no reads longer than 1K ?
525     int NCount;
526     int end5;
527     int end3;
528     int seqlen;
529     double perc_N; //percentage of Ns in end5..end3 range only!
530     const char* seq;
531     bool valid;
532     NData() {
533 gpertea 170 seqlen=0;
534 gpertea 4 NCount=0;
535     end5=0;
536     end3=0;
537     seq=NULL;
538     perc_N=0;
539     valid=true;
540     }
541     void init(GStr& rseq) {
542     NCount=0;
543     perc_N=0;
544     seqlen=rseq.length();
545     seq=rseq.chars();
546     end5=1;
547     end3=seqlen;
548     for (int i=0;i<seqlen;i++)
549 gpertea 12 if (seq[i]=='N') {// if (!ichrInStr(rseq[i], "ACGT")
550 gpertea 4 NPos[NCount]=i;
551     NCount++;
552     }
553     perc_N=(NCount*100.0)/seqlen;
554     valid=true;
555     }
556     void N_calc() { //only in the region after trimming
557     int n=0;
558     for (int i=end3-1;i<end5;i++) {
559     if (seq[i]=='N') n++;
560     }
561     perc_N=(n*100.0)/(end5-end3+1);
562     }
563     };
564 gpertea 170
565 gpertea 4 static NData feat;
566     int perc_lenN=12; // incremental distance from ends, in percentage of
567     // sequence length, where N-trimming is done (default:12 %) (autolimited to 20)
568 gpertea 170
569 gpertea 4 void N_analyze(int l5, int l3, int p5, int p3) {
570     /* assumes feat was filled properly */
571     int old_dif, t5,t3,v;
572 gpertea 12 if (l3<l5+2 || p5>p3 ) {
573 gpertea 4 feat.end5=l5+1;
574     feat.end3=l3+1;
575 gpertea 170 return;
576 gpertea 4 }
577    
578     t5=feat.NPos[p5]-l5;
579     t3=l3-feat.NPos[p3];
580     old_dif=p3-p5;
581     v=(int)((((double)(l3-l5))*perc_lenN)/100);
582 gpertea 170 if (v>20) v=20; /* enforce N-search limit for very long reads */
583 gpertea 4 if (t5 < v ) {
584     l5=feat.NPos[p5]+1;
585     p5++;
586     }
587     if (t3 < v) {
588     l3=feat.NPos[p3]-1;
589     p3--;
590     }
591     /* restNs=p3-p5; number of Ns in the new CLR */
592     if (p3-p5==old_dif) { /* no change, return */
593     /* don't trim if this may shorten the read too much */
594     //if (l5-l3<min_read_len) return;
595     feat.end5=l5+1;
596     feat.end3=l3+1;
597     return;
598     }
599 gpertea 170 else
600 gpertea 4 N_analyze(l5,l3, p5,p3);
601     }
602    
603    
604 gpertea 12 bool qtrim(GStr& qvs, int &l5, int &l3) {
605 gpertea 13 if (qvtrim_qmin==0 || qvs.is_empty()) return false;
606 gpertea 12 if (qv_phredtype==0) {
607     //try to guess the Phred type
608     int vmin=256, vmax=0;
609     for (int i=0;i<qvs.length();i++) {
610     if (vmin>qvs[i]) vmin=qvs[i];
611     if (vmax<qvs[i]) vmax=qvs[i];
612     }
613     if (vmin<64) { qv_phredtype=33; qv_cvtadd=31; }
614     if (vmax>95) { qv_phredtype=64; qv_cvtadd=-31; }
615     if (qv_phredtype==0) {
616     GError("Error: couldn't determine Phred type, please use the -p33 or -p64 !\n");
617     }
618 gpertea 13 if (verbose)
619     GMessage("Input reads have Phred-%d quality values.\n", (qv_phredtype==33 ? 33 : 64));
620     } //guessing Phred type
621 gpertea 12 for (l3=qvs.length()-1;l3>2;l3--) {
622 gpertea 13 if (qvs[l3]-qv_phredtype>=qvtrim_qmin && qvs[l3-1]-qv_phredtype>=qvtrim_qmin) break;
623 gpertea 12 }
624     //just in case, check also the 5' the end (?)
625     for (l5=0;l5<qvs.length()-3;l5++) {
626 gpertea 13 if (qvs[l5]-qv_phredtype>=qvtrim_qmin && qvs[l5+1]-qv_phredtype>=qvtrim_qmin) break;
627 gpertea 12 }
628 gpertea 13 if (qvtrim_max>0) {
629     if (qvs.length()-1-l3>qvtrim_max) l3=qvs.length()-1-qvtrim_max;
630     if (l5>qvtrim_max) l5=qvtrim_max;
631     }
632 gpertea 12 return (l5>0 || l3<qvs.length()-1);
633     }
634    
635     bool ntrim(GStr& rseq, int &l5, int &l3) {
636     //count Ns in the sequence, trim N-rich ends
637 gpertea 4 feat.init(rseq);
638     l5=feat.end5-1;
639     l3=feat.end3-1;
640 gpertea 170 N_analyze(feat.end5-1, feat.end3-1, 0, feat.NCount-1);
641 gpertea 12 if (l5==feat.end5-1 && l3==feat.end3-1) {
642     if (feat.perc_N>max_perc_N) {
643     feat.valid=false;
644     l3=l5+1;
645     return true;
646     }
647     else {
648     return false; //nothing changed
649     }
650     }
651 gpertea 4 l5=feat.end5-1;
652     l3=feat.end3-1;
653     if (l3-l5+1<min_read_len) {
654     feat.valid=false;
655     return true;
656     }
657     feat.N_calc();
658 gpertea 170
659 gpertea 12 if (feat.perc_N>max_perc_N) {
660 gpertea 4 feat.valid=false;
661     l3=l5+1;
662     return true;
663     }
664     return true;
665     }
666    
667     //--------------- dust functions ----------------
668     class DNADuster {
669     public:
670 gpertea 170 int dustword;
671     int dustwindow;
672     int dustwindow2;
673 gpertea 4 int dustcutoff;
674     int mv, iv, jv;
675     int counts[32*32*32];
676     int iis[32*32*32];
677     DNADuster(int cutoff=16, int winsize=32, int wordsize=3) {
678     dustword=wordsize;
679     dustwindow=winsize;
680     dustwindow2 = (winsize>>1);
681     dustcutoff=cutoff;
682     mv=0;
683     iv=0;
684     jv=0;
685     }
686     void setWindowSize(int value) {
687     dustwindow = value;
688     dustwindow2 = (dustwindow >> 1);
689     }
690     void setWordSize(int value) {
691     dustword=value;
692     }
693     void wo1(int len, const char* s, int ivv) {
694     int i, ii, j, v, t, n, n1, sum;
695     int js, nis;
696     n = 32 * 32 * 32;
697     n1 = n - 1;
698     nis = 0;
699     i = 0;
700     ii = 0;
701     sum = 0;
702     v = 0;
703     for (j=0; j < len; j++, s++) {
704     ii <<= 5;
705     if (*s<=32) {
706     i=0;
707     continue;
708     }
709     ii |= *s - 'A'; //assume uppercase!
710     ii &= n1;
711     i++;
712     if (i >= dustword) {
713     for (js=0; js < nis && iis[js] != ii; js++) ;
714     if (js == nis) {
715     iis[nis] = ii;
716     counts[ii] = 0;
717     nis++;
718     }
719     if ((t = counts[ii]) > 0) {
720     sum += t;
721     v = 10 * sum / j;
722     if (mv < v) {
723     mv = v;
724     iv = ivv;
725     jv = j;
726     }
727     }
728     counts[ii]++;
729     }
730     }
731     }
732    
733     int wo(int len, const char* s, int* beg, int* end) {
734     int i, l1;
735     l1 = len - dustword + 1;
736     if (l1 < 0) {
737     *beg = 0;
738     *end = len - 1;
739     return 0;
740     }
741     mv = 0;
742     iv = 0;
743     jv = 0;
744     for (i=0; i < l1; i++) {
745     wo1(len-i, s+i, i);
746     }
747     *beg = iv;
748     *end = iv + jv;
749     return mv;
750     }
751    
752     void dust(const char* seq, char* seqmsk, int seqlen, int cutoff=0) { //, maskFunc maskfn) {
753     int i, j, l, a, b, v;
754     if (cutoff==0) cutoff=dustcutoff;
755     a=0;b=0;
756     //GMessage("Dust cutoff=%d\n", cutoff);
757     for (i=0; i < seqlen; i += dustwindow2) {
758     l = (seqlen > i+dustwindow) ? dustwindow : seqlen-i;
759     v = wo(l, seq+i, &a, &b);
760     if (v > cutoff) {
761     //for (j = a; j <= b && j < dustwindow2; j++) {
762     for (j = a; j <= b; j++) {
763     seqmsk[i+j]='N';//could be made lowercase instead
764     }
765     }
766     }
767 gpertea 170 //return first;
768 gpertea 4 }
769     };
770    
771     static DNADuster duster;
772    
773     int dust(GStr& rseq) {
774     char* seq=Gstrdup(rseq.chars());
775     duster.dust(rseq.chars(), seq, rseq.length(), dust_cutoff);
776     //check the number of Ns:
777     int ncount=0;
778     for (int i=0;i<rseq.length();i++) {
779     if (seq[i]=='N') ncount++;
780     }
781     GFREE(seq);
782     return ncount;
783     }
784    
785 gpertea 94 struct SLocScore {
786     int pos;
787     int score;
788     SLocScore(int p=0,int s=0) {
789     pos=p;
790     score=s;
791     }
792     void set(int p, int s) {
793     pos=p;
794     score=s;
795     }
796     void add(int p, int add) {
797     pos=p;
798     score+=add;
799     }
800     };
801 gpertea 12
802 gpertea 94 bool trim_poly3(GStr &seq, int &l5, int &l3, const char* poly_seed) {
803 gpertea 103 if (!doPolyTrim) return false;
804 gpertea 94 int rlen=seq.length();
805     l5=0;
806     l3=rlen-1;
807     int32 seedVal=*(int32*)poly_seed;
808     char polyChar=poly_seed[0];
809     //assumes N trimming was already done
810     //so a poly match should be very close to the end of the read
811     // -- find the initial match (seed)
812 gpertea 108 int lmin=GMAX((rlen-16), 0);
813 gpertea 94 int li;
814     for (li=rlen-4;li>lmin;li--) {
815     if (seedVal==*(int*)&(seq[li])) {
816     break;
817     }
818 gpertea 12 }
819 gpertea 94 if (li<=lmin) return false;
820     //seed found, try to extend it both ways
821     //extend right
822     int ri=li+3;
823     SLocScore loc(ri, poly_m_score<<2);
824     SLocScore maxloc(loc);
825     //extend right
826 gpertea 170 while (ri<rlen-1) {
827 gpertea 94 ri++;
828     if (seq[ri]==polyChar) {
829     loc.add(ri,poly_m_score);
830     }
831     else if (seq[ri]=='N') {
832     loc.add(ri,0);
833     }
834     else { //mismatch
835     loc.add(ri,poly_mis_score);
836     if (maxloc.score-loc.score>poly_dropoff_score) break;
837     }
838     if (maxloc.score<=loc.score) {
839     maxloc=loc;
840     }
841 gpertea 12 }
842 gpertea 94 ri=maxloc.pos;
843 gpertea 103 if (ri<rlen-6) return false; //no trimming wanted, too far from 3' end
844 gpertea 94 //ri = right boundary for the poly match
845     //extend left
846     loc.set(li, maxloc.score);
847     maxloc.pos=li;
848     while (li>0) {
849     li--;
850     if (seq[li]==polyChar) {
851     loc.add(li,poly_m_score);
852     }
853     else if (seq[li]=='N') {
854     loc.add(li,0);
855     }
856     else { //mismatch
857     loc.add(li,poly_mis_score);
858     if (maxloc.score-loc.score>poly_dropoff_score) break;
859     }
860     if (maxloc.score<=loc.score) {
861     maxloc=loc;
862     }
863     }
864 gpertea 105 li=maxloc.pos;
865 gpertea 108 if ((maxloc.score==poly_minScore && ri==rlen-1) ||
866     (maxloc.score>poly_minScore && ri>=rlen-3) ||
867     (maxloc.score>(poly_minScore*3) && ri>=rlen-8)) {
868 gpertea 105 //trimming this li-ri match at 3' end
869     l3=li-1;
870     if (l3<0) l3=0;
871 gpertea 94 return true;
872     }
873     return false;
874 gpertea 12 }
875    
876 gpertea 94 bool trim_poly5(GStr &seq, int &l5, int &l3, const char* poly_seed) {
877 gpertea 103 if (!doPolyTrim) return false;
878 gpertea 88 int rlen=seq.length();
879     l5=0;
880     l3=rlen-1;
881 gpertea 94 int32 seedVal=*(int32*)poly_seed;
882     char polyChar=poly_seed[0];
883 gpertea 88 //assumes N trimming was already done
884     //so a poly match should be very close to the end of the read
885     // -- find the initial match (seed)
886 gpertea 108 int lmax=GMIN(12, rlen-4);//how far from 5' end to look for 4-mer seeds
887 gpertea 94 int li;
888     for (li=0;li<=lmax;li++) {
889     if (seedVal==*(int*)&(seq[li])) {
890 gpertea 88 break;
891     }
892     }
893 gpertea 94 if (li>lmax) return false;
894     //seed found, try to extend it both ways
895     //extend left
896     int ri=li+3; //save rightmost base of the seed
897     SLocScore loc(li, poly_m_score<<2);
898     SLocScore maxloc(loc);
899     while (li>0) {
900     li--;
901     if (seq[li]==polyChar) {
902     loc.add(li,poly_m_score);
903     }
904     else if (seq[li]=='N') {
905     loc.add(li,0);
906     }
907     else { //mismatch
908     loc.add(li,poly_mis_score);
909     if (maxloc.score-loc.score>poly_dropoff_score) break;
910     }
911     if (maxloc.score<=loc.score) {
912     maxloc=loc;
913     }
914 gpertea 88 }
915 gpertea 94 li=maxloc.pos;
916 gpertea 103 if (li>5) return false; //no trimming wanted, too far from 5' end
917 gpertea 94 //li = right boundary for the poly match
918    
919     //extend right
920     loc.set(ri, maxloc.score);
921     maxloc.pos=ri;
922 gpertea 170 while (ri<rlen-1) {
923 gpertea 94 ri++;
924     if (seq[ri]==polyChar) {
925     loc.add(ri,poly_m_score);
926     }
927     else if (seq[ri]=='N') {
928     loc.add(ri,0);
929     }
930     else { //mismatch
931     loc.add(ri,poly_mis_score);
932     if (maxloc.score-loc.score>poly_dropoff_score) break;
933     }
934     if (maxloc.score<=loc.score) {
935     maxloc=loc;
936     }
937     }
938 gpertea 105 ri=maxloc.pos;
939 gpertea 103 if ((maxloc.score==poly_minScore && li==0) ||
940     (maxloc.score>poly_minScore && li<2)
941 gpertea 108 || (maxloc.score>(poly_minScore*3) && li<8)) {
942 gpertea 105 //adjust l5 to reflect this trimming of 5' end
943     l5=ri+1;
944     if (l5>rlen-1) l5=rlen-1;
945 gpertea 94 return true;
946     }
947     return false;
948 gpertea 88 }
949    
950 gpertea 180 bool trim_adaptor3(GStr& seq, int&l5, int &l3) {
951     if (adaptors3.Count()==0) return false;
952 gpertea 4 int rlen=seq.length();
953     l5=0;
954     l3=rlen-1;
955 gpertea 103 bool trimmed=false;
956 gpertea 175 GStr wseq(seq);
957 gpertea 103 int wlen=rlen;
958 gpertea 175 GXSeqData seqdata;
959 gpertea 180 int numruns=revCompl ? 2 : 1;
960 gpertea 182
961 gpertea 180 for (int ai=0;ai<adaptors3.Count();ai++) {
962     for (int r=0;r<numruns;r++) {
963     if (r) {
964     seqdata.update(adaptors3[ai].seqr.chars(), adaptors3[ai].seqr.length(),
965     adaptors3[ai].pzr, wseq.chars(), wlen, adaptors3[ai].amlen);
966     }
967     else {
968     seqdata.update(adaptors3[ai].seq.chars(), adaptors3[ai].seq.length(),
969     adaptors3[ai].pz, wseq.chars(), wlen, adaptors3[ai].amlen);
970     }
971    
972 gpertea 182 GXAlnInfo* aln=match_adaptor(seqdata, adaptors3[ai].trim_type, gxmem_r, 86);
973     if (aln) {
974 gpertea 103 trimmed=true;
975 gpertea 175 //keep unmatched region on the left OR right (the longer one)
976 gpertea 182 if (aln->sl > wlen-aln->sr) {
977 gpertea 175 //keep left side
978 gpertea 182 l3-=(wlen-aln->sl+1);
979 gpertea 175 if (l3<0) l3=0;
980     }
981     else { //keep right side
982 gpertea 182 l5+=aln->sr;
983 gpertea 175 if (l5>=rlen) l5=rlen-1;
984     }
985 gpertea 182 delete aln;
986 gpertea 103 if (l3-l5+1<min_read_len) return true;
987     wseq=seq.substr(l5,l3-l5+1);
988     wlen=wseq.length();
989     }
990 gpertea 180 }//forward and reverse adaptors
991     }//for each 3' adaptor
992 gpertea 103 return trimmed;
993 gpertea 4 }
994    
995 gpertea 180 bool trim_adaptor5(GStr& seq, int&l5, int &l3) {
996     if (adaptors5.Count()==0) return false;
997 gpertea 4 int rlen=seq.length();
998     l5=0;
999     l3=rlen-1;
1000 gpertea 102 bool trimmed=false;
1001 gpertea 175 GStr wseq(seq);
1002 gpertea 103 int wlen=rlen;
1003 gpertea 175 GXSeqData seqdata;
1004 gpertea 180 int numruns=revCompl ? 2 : 1;
1005 gpertea 183 GList<GXAlnInfo> bestalns(true, true, false);
1006 gpertea 180 for (int ai=0;ai<adaptors5.Count();ai++) {
1007     for (int r=0;r<numruns;r++) {
1008     if (r) {
1009     seqdata.update(adaptors5[ai].seqr.chars(), adaptors5[ai].seqr.length(),
1010     adaptors5[ai].pzr, wseq.chars(), wlen, adaptors5[ai].amlen);
1011     }
1012     else {
1013     seqdata.update(adaptors5[ai].seq.chars(), adaptors5[ai].seq.length(),
1014     adaptors5[ai].pz, wseq.chars(), wlen, adaptors5[ai].amlen);
1015     }
1016 gpertea 182 GXAlnInfo* aln=match_adaptor(seqdata, adaptors5[ai].trim_type, gxmem_l, 90);
1017     if (aln) {
1018 gpertea 183 if (aln->strong) {
1019     trimmed=true;
1020     bestalns.Add(aln);
1021     break; //will check the rest next time
1022     }
1023     else bestalns.Add(aln);
1024     }
1025     } //forward and reverse?
1026     if (trimmed) break; //will check the rest in the next cycle
1027     }//for each 5' adaptor
1028     if (bestalns.Count()>0) {
1029     GXAlnInfo* aln=bestalns[0];
1030 gpertea 182 if (aln->sl-1 > wlen-aln->sr) {
1031 gpertea 180 //keep left side
1032 gpertea 182 l3-=(wlen-aln->sl+1);
1033 gpertea 180 if (l3<0) l3=0;
1034     }
1035     else { //keep right side
1036 gpertea 182 l5+=aln->sr;
1037 gpertea 180 if (l5>=rlen) l5=rlen-1;
1038     }
1039 gpertea 183 //delete aln;
1040     //if (l3-l5+1<min_read_len) return true;
1041 gpertea 180 wseq=seq.substr(l5,l3-l5+1);
1042     wlen=wseq.length();
1043 gpertea 183 return true; //break the loops here to report a good find
1044     }
1045     return false;
1046 gpertea 98 }
1047 gpertea 4
1048 gpertea 170 //convert qvs to/from phred64 from/to phread33
1049 gpertea 12 void convertPhred(GStr& q) {
1050     for (int i=0;i<q.length();i++) q[i]+=qv_cvtadd;
1051 gpertea 4 }
1052    
1053 gpertea 12 void convertPhred(char* q, int len) {
1054     for (int i=0;i<len;i++) q[i]+=qv_cvtadd;
1055 gpertea 4 }
1056    
1057 gpertea 170 bool getFastxRec(GLineReader& fq, GStr& rseq, GStr& rqv,
1058 gpertea 13 GStr& rname, GStr& rinfo, GStr& infname) {
1059     rseq="";
1060     rqv="";
1061     rname="";
1062     rinfo="";
1063     if (fq.eof()) return false;
1064     char* l=fq.getLine();
1065     while (l!=NULL && (l[0]==0 || isspace(l[0]))) l=fq.getLine(); //ignore empty lines
1066     if (l==NULL) return false;
1067     /* if (rawFormat) {
1068     //TODO: implement raw qseq parsing here
1069     //if (raw type=N) then continue; //skip invalid/bad records
1070     } //raw qseq format
1071     else { // FASTQ or FASTA */
1072     isfasta=(l[0]=='>');
1073 gpertea 170 if (!isfasta && l[0]!='@') GError("Error: fasta/fastq record marker not found(%s)\n%s\n",
1074 gpertea 13 infname.chars(), l);
1075     GStr s(l);
1076     rname=&(l[1]);
1077     for (int i=0;i<rname.length();i++)
1078 gpertea 170 if (rname[i]<=' ') {
1079     if (i<rname.length()-2) rinfo=rname.substr(i+1);
1080     rname.cut(i);
1081     break;
1082 gpertea 13 }
1083     //now get the sequence
1084 gpertea 170 if ((l=fq.getLine())==NULL)
1085 gpertea 13 GError("Error: unexpected EOF after header for read %s (%s)\n",
1086     rname.chars(), infname.chars());
1087     rseq=l; //this must be the DNA line
1088     while ((l=fq.getLine())!=NULL) {
1089     //seq can span multiple lines
1090     if (l[0]=='>' || l[0]=='+') {
1091 gpertea 170 fq.pushBack();
1092 gpertea 13 break; //
1093     }
1094     rseq+=l;
1095 gpertea 170 } //check for multi-line seq
1096 gpertea 13 if (!isfasta) { //reading fastq quality values, which can also be multi-line
1097     if ((l=fq.getLine())==NULL)
1098     GError("Error: unexpected EOF after sequence for %s\n", rname.chars());
1099     if (l[0]!='+') GError("Error: fastq qv header marker not detected!\n");
1100 gpertea 170 if ((l=fq.getLine())==NULL)
1101 gpertea 13 GError("Error: unexpected EOF after qv header for %s\n", rname.chars());
1102     rqv=l;
1103 gpertea 170 //if (rqv.length()!=rseq.length())
1104 gpertea 13 // GError("Error: qv len != seq len for %s\n", rname.chars());
1105     while (rqv.length()<rseq.length() && ((l=fq.getLine())!=NULL)) {
1106     rqv+=l; //append to qv string
1107     }
1108     }// fastq
1109     // } //<-- FASTA or FASTQ
1110 gpertea 94 rseq.upper();
1111 gpertea 13 return true;
1112     }
1113    
1114 gpertea 105 #ifdef GDEBUG
1115     void showTrim(GStr& s, int l5, int l3) {
1116 gpertea 177 if (l5>0 || l3==0) {
1117 gpertea 105 color_bg(c_red);
1118     }
1119     for (int i=0;i<s.length()-1;i++) {
1120     if (i && i==l5) color_resetbg();
1121     fprintf(stderr, "%c", s[i]);
1122 gpertea 177 if (i && i==l3) color_bg(c_red);
1123 gpertea 105 }
1124     fprintf(stderr, "%c", s[s.length()-1]);
1125     color_reset();
1126     fprintf(stderr, "\n");
1127     }
1128     #endif
1129    
1130 gpertea 13 char process_read(GStr& rname, GStr& rseq, GStr& rqv, int &l5, int &l3) {
1131 gpertea 170 //returns 0 if the read was untouched, 1 if it was just trimmed
1132 gpertea 13 // and a trash code if it was trashed
1133     l5=0;
1134     l3=rseq.length()-1;
1135 gpertea 103 #ifdef GDEBUG
1136 gpertea 108 //rseq.reverse();
1137 gpertea 103 GMessage(">%s\n", rname.chars());
1138 gpertea 105 GMessage("%s\n",rseq.chars());
1139 gpertea 103 #endif
1140 gpertea 13 if (l3-l5+1<min_read_len) {
1141     return 's';
1142     }
1143     GStr wseq(rseq.chars());
1144     GStr wqv(rqv.chars());
1145     int w5=l5;
1146     int w3=l3;
1147     //first do the q-based trimming
1148     if (qvtrim_qmin!=0 && !wqv.is_empty() && qtrim(wqv, w5, w3)) { // qv-threshold trimming
1149     if (w3-w5+1<min_read_len) {
1150     return 'Q'; //invalid read
1151     }
1152     //-- keep only the w5..w3 range
1153     l5=w5;
1154     l3=w3;
1155     wseq=wseq.substr(w5, w3-w5+1);
1156     if (!wqv.is_empty())
1157     wqv=wqv.substr(w5, w3-w5+1);
1158     } //qv trimming
1159     // N-trimming on the remaining read seq
1160     if (ntrim(wseq, w5, w3)) {
1161     //GMessage("before: %s\n",wseq.chars());
1162     //GMessage("after : %s (%d)\n",wseq.substr(w5,w3-w5+1).chars(),w3-w5+1);
1163     l5+=w5;
1164     l3-=(wseq.length()-1-w3);
1165     if (w3-w5+1<min_read_len) {
1166     return 'N'; //to be trashed
1167     }
1168     //-- keep only the w5..w3 range
1169     wseq=wseq.substr(w5, w3-w5+1);
1170     if (!wqv.is_empty())
1171     wqv=wqv.substr(w5, w3-w5+1);
1172     w5=0;
1173     w3=wseq.length()-1;
1174     }
1175 gpertea 103 char trim_code;
1176 gpertea 180 //clean the more dirty end first - 3'
1177     int prev_w5=0;
1178     int prev_w3=0;
1179     bool w3upd=true;
1180     bool w5upd=true;
1181 gpertea 103 do {
1182     trim_code=0;
1183 gpertea 180 if (w3upd && trim_poly3(wseq, w5, w3, polyA_seed)) {
1184 gpertea 103 trim_code='A';
1185     }
1186 gpertea 180 else if (w3upd && trim_poly3(wseq, w5, w3, polyT_seed)) {
1187 gpertea 103 trim_code='T';
1188     }
1189 gpertea 180 else if (w5upd && trim_poly5(wseq, w5, w3, polyA_seed)) {
1190 gpertea 103 trim_code='A';
1191     }
1192 gpertea 180 else if (w5upd && trim_poly5(wseq, w5, w3, polyT_seed)) {
1193 gpertea 103 trim_code='T';
1194     }
1195 gpertea 180 else if (trim_adaptor5(wseq, w5, w3)) {
1196     trim_code='5';
1197     }
1198     else if (trim_adaptor3(wseq, w5, w3)) {
1199 gpertea 103 trim_code='3';
1200     }
1201     if (trim_code) {
1202 gpertea 180 w3upd=(w3!=prev_w3);
1203     w5upd=(w5!=prev_w5);
1204     if (w3upd) prev_w3=w3;
1205     if (w5upd) prev_w5=w5;
1206     #ifdef GDEBUG
1207 gpertea 105 GMessage("#### TRIM by '%c' code ( w5-w3 = %d-%d ):\n",trim_code, w5,w3);
1208     showTrim(wseq, w5, w3);
1209 gpertea 180 #endif
1210 gpertea 13 int trimlen=wseq.length()-(w3-w5+1);
1211 gpertea 103 num_trimmed3++;
1212 gpertea 170 if (trimlen<min_trimmed3)
1213 gpertea 103 min_trimmed3=trimlen;
1214 gpertea 13 l5+=w5;
1215     l3-=(wseq.length()-1-w3);
1216     if (w3-w5+1<min_read_len) {
1217 gpertea 103 return trim_code;
1218 gpertea 13 }
1219     //-- keep only the w5..w3 range
1220     wseq=wseq.substr(w5, w3-w5+1);
1221     if (!wqv.is_empty())
1222     wqv=wqv.substr(w5, w3-w5+1);
1223 gpertea 103 }//trimming at 3' end
1224     } while (trim_code);
1225    
1226 gpertea 13 if (doCollapse) {
1227     //keep read for later
1228     FqDupRec* dr=dhash.Find(wseq.chars());
1229     if (dr==NULL) { //new entry
1230 gpertea 170 //if (prefix.is_empty())
1231     dhash.Add(wseq.chars(),
1232 gpertea 13 new FqDupRec(&wqv, rname.chars()));
1233     //else dhash.Add(wseq.chars(), new FqDupRec(wqv.chars(),wqv.length()));
1234     }
1235     else
1236     dr->add(wqv);
1237     } //collapsing duplicates
1238     else { //not collapsing duplicates
1239     //apply the dust filter now
1240     if (doDust) {
1241     int dustbases=dust(wseq);
1242     if (dustbases>(wseq.length()>>1)) {
1243     return 'D';
1244     }
1245     }
1246     } //not collapsing duplicates
1247     return (l5>0 || l3<rseq.length()-1) ? 1 : 0;
1248     }
1249    
1250     void printHeader(FILE* f_out, char recmarker, GStr& rname, GStr& rinfo) {
1251     //GMessage("printing Header..%c%s\n",recmarker, rname.chars());
1252     if (rinfo.is_empty()) fprintf(f_out, "%c%s\n",recmarker,rname.chars());
1253     else fprintf(f_out, "%c%s %s\n",recmarker, rname.chars(), rinfo.chars());
1254     }
1255    
1256     void writeRead(FILE* f_out, GStr& rname, GStr& rinfo, GStr& rseq, GStr& rqv, int& outcounter) {
1257     outcounter++;
1258     bool asFasta=(rqv.is_empty() || fastaOutput);
1259     if (asFasta) {
1260     if (prefix.is_empty()) {
1261     printHeader(f_out, '>',rname,rinfo);
1262     fprintf(f_out, "%s\n", rseq.chars()); //plain one-line fasta for now
1263     }
1264     else {
1265 gpertea 170 fprintf(f_out, ">%s%08d\n%s\n", prefix.chars(), outcounter,
1266 gpertea 13 rseq.chars());
1267     }
1268     }
1269     else { //fastq
1270     if (convert_phred) convertPhred(rqv);
1271     if (prefix.is_empty()) {
1272     printHeader(f_out, '@', rname, rinfo);
1273     fprintf(f_out, "%s\n+\n%s\n", rseq.chars(), rqv.chars());
1274     }
1275     else
1276 gpertea 170 fprintf(f_out, "@%s_%08d\n%s\n+\n%s\n", prefix.chars(), outcounter,
1277 gpertea 13 rseq.chars(),rqv.chars() );
1278     }
1279     }
1280    
1281     void trash_report(char trashcode, GStr& rname, FILE* freport) {
1282     if (freport==NULL || trashcode<=' ') return;
1283     if (trashcode=='3' || trashcode=='5') {
1284 gpertea 103 fprintf(freport, "%s\ta%c\n",rname.chars(),trashcode);
1285 gpertea 13 }
1286     else {
1287     fprintf(freport, "%s\t%c\n",rname.chars(),trashcode);
1288     }
1289     //tcounter++;
1290     }
1291    
1292     GStr getFext(GStr& s, int* xpos=NULL) {
1293     //s must be a filename without a path
1294     GStr r("");
1295     if (xpos!=NULL) *xpos=0;
1296     if (s.is_empty() || s=="-") return r;
1297     int p=s.rindex('.');
1298     int d=s.rindex('/');
1299     if (p<=0 || p>s.length()-2 || p<s.length()-7 || p<d) return r;
1300     r=s.substr(p+1);
1301     if (xpos!=NULL) *xpos=p+1;
1302     r.lower();
1303     return r;
1304     }
1305    
1306     void baseFileName(GStr& fname) {
1307     //remove all known extensions, like .txt,fq,fastq,fasta,fa)(.gz .gzip .bz2 .bzip2) .
1308     int xpos=0;
1309     GStr fext=getFext(fname, &xpos);
1310     if (xpos<=1) return;
1311     bool extdel=false;
1312     GStr f2;
1313     if (fext=="z" || fext=="zip") {
1314     extdel=true;
1315     }
1316     else if (fext.length()>=2) {
1317     f2=fext.substr(0,2);
1318     extdel=(f2=="gz" || f2=="bz");
1319     }
1320     if (extdel) {
1321     fname.cut(xpos-1);
1322     fext=getFext(fname, &xpos);
1323     if (xpos<=1) return;
1324     }
1325     extdel=false;
1326     if (fext=="f" || fext=="fq" || fext=="txt" || fext=="seq" || fext=="sequence") {
1327     extdel=true;
1328     }
1329     else if (fext.length()>=2) {
1330     extdel=(fext.substr(0,2)=="fa");
1331     }
1332     if (extdel) fname.cut(xpos-1);
1333     GStr fncp(fname);
1334     fncp.lower();
1335     fncp.chomp("seq");
1336     fncp.chomp("sequence");
1337     fncp.trimR("_.");
1338     if (fncp.length()<fname.length()) fname.cut(fncp.length());
1339     }
1340    
1341     FILE* prepOutFile(GStr& infname, GStr& pocmd) {
1342     FILE* f_out=NULL;
1343     GStr fname(getFileName(infname.chars()));
1344     //eliminate known extensions
1345     baseFileName(fname);
1346     if (outsuffix.is_empty() || outsuffix=="-") { return stdout; }
1347     else if (pocmd.is_empty()) {
1348     GStr oname(fname);
1349     oname.append('.');
1350     oname.append(outsuffix);
1351     f_out=fopen(oname.chars(),"w");
1352     if (f_out==NULL) GError("Error: cannot create '%s'\n",oname.chars());
1353     }
1354     else {
1355     GStr oname(">");
1356     oname.append(fname);
1357     oname.append('.');
1358     oname.append(outsuffix);
1359     pocmd.append(oname);
1360     f_out=popen(pocmd.chars(), "w");
1361     if (f_out==NULL) GError("Error: cannot popen '%s'\n",pocmd.chars());
1362     }
1363     return f_out;
1364     }
1365    
1366     void guess_unzip(GStr& fname, GStr& picmd) {
1367     GStr fext=getFext(fname);
1368     if (fext=="gz" || fext=="gzip" || fext=="z") {
1369     picmd="gzip -cd ";
1370     }
1371     else if (fext=="bz2" || fext=="bzip2" || fext=="bz" || fext=="bzip") {
1372     picmd="bzip2 -cd ";
1373     }
1374     }
1375    
1376 gpertea 180 void addAdaptor(GVec<CASeqData>& adaptors, GStr& seq, GAlnTrimType trim_type) {
1377 gpertea 170 //TODO: prepare CASeqData here, and collect hexamers as well
1378 gpertea 180 if (seq.is_empty() || seq=="-" ||
1379     seq=="N/A" || seq==".") return;
1380    
1381 gpertea 172 CASeqData adata(revCompl);
1382 gpertea 180 int idx=adaptors.Add(adata);
1383 gpertea 172 if (idx<0) GError("Error: failed to add adaptor!\n");
1384 gpertea 180 adaptors[idx].trim_type=trim_type;
1385     adaptors[idx].update(seq.chars());
1386 gpertea 170 }
1387    
1388    
1389 gpertea 180 int loadAdaptors(const char* fname) {
1390 gpertea 94 GLineReader lr(fname);
1391     char* l;
1392     while ((l=lr.nextLine())!=NULL) {
1393     if (lr.length()<=3 || l[0]=='#') continue;
1394 gpertea 98 if ( l[0]==' ' || l[0]=='\t' || l[0]==',' ||
1395     l[0]==';'|| l[0]==':' ) {
1396 gpertea 94 int i=1;
1397     while (l[i]!=0 && isspace(l[i])) {
1398     i++;
1399     }
1400     if (l[i]!=0) {
1401 gpertea 102 GStr s(&(l[i]));
1402 gpertea 108 #ifdef GDEBUG
1403     //s.reverse();
1404     #endif
1405 gpertea 180 addAdaptor(adaptors3, s, galn_TrimRight);
1406 gpertea 94 continue;
1407     }
1408     }
1409     else {
1410 gpertea 98 GStr s(l);
1411     s.startTokenize("\t ;,:");
1412     GStr a5,a3;
1413     if (s.nextToken(a5))
1414 gpertea 180 s.nextToken(a3);
1415     else continue; //no tokens on this line
1416     GAlnTrimType ttype5=galn_TrimLeft;
1417 gpertea 102 a5.upper();
1418     a3.upper();
1419 gpertea 180 if (a3.is_empty() || a3==a5 || a3=="=") {
1420     a3.clear();
1421     ttype5=galn_TrimEither;
1422     }
1423 gpertea 108 #ifdef GDEBUG
1424     // a5.reverse();
1425     // a3.reverse();
1426     #endif
1427 gpertea 180 addAdaptor(adaptors5, a5, ttype5);
1428     addAdaptor(adaptors3, a3, galn_TrimRight);
1429 gpertea 94 }
1430     }
1431 gpertea 180 return adaptors5.Count()+adaptors3.Count();
1432 gpertea 94 }
1433    
1434 gpertea 170 void setupFiles(FILE*& f_in, FILE*& f_in2, FILE*& f_out, FILE*& f_out2,
1435     GStr& s, GStr& infname, GStr& infname2) {
1436 gpertea 13 // uses outsuffix to generate output file names and open file handles as needed
1437     infname="";
1438     infname2="";
1439     f_in=NULL;
1440     f_in2=NULL;
1441     f_out=NULL;
1442     f_out2=NULL;
1443     //analyze outsuffix intent
1444     GStr pocmd;
1445 gpertea 76 if (outsuffix=="-") {
1446     f_out=stdout;
1447     }
1448     else {
1449     GStr ox=getFext(outsuffix);
1450     if (ox.length()>2) ox=ox.substr(0,2);
1451     if (ox=="gz") pocmd="gzip -9 -c ";
1452     else if (ox=="bz") pocmd="bzip2 -9 -c ";
1453     }
1454 gpertea 13 if (s=="-") {
1455     f_in=stdin;
1456     infname="stdin";
1457     f_out=prepOutFile(infname, pocmd);
1458     return;
1459     } // streaming from stdin
1460     s.startTokenize(",:");
1461     s.nextToken(infname);
1462     bool paired=s.nextToken(infname2);
1463 gpertea 170 if (fileExists(infname.chars())==0)
1464 gpertea 13 GError("Error: cannot find file %s!\n",infname.chars());
1465     GStr fname(getFileName(infname.chars()));
1466     GStr picmd;
1467     guess_unzip(fname, picmd);
1468     if (picmd.is_empty()) {
1469     f_in=fopen(infname.chars(), "r");
1470     if (f_in==NULL) GError("Error opening file '%s'!\n",infname.chars());
1471     }
1472     else {
1473     picmd.append(infname);
1474     f_in=popen(picmd.chars(), "r");
1475     if (f_in==NULL) GError("Error at popen %s!\n", picmd.chars());
1476     }
1477 gpertea 76 if (f_out==stdout) {
1478     if (paired) GError("Error: output suffix required for paired reads\n");
1479     return;
1480     }
1481 gpertea 13 f_out=prepOutFile(infname, pocmd);
1482     if (!paired) return;
1483     if (doCollapse) GError("Error: sorry, -C option cannot be used with paired reads!\n");
1484     // ---- paired reads:-------------
1485 gpertea 170 if (fileExists(infname2.chars())==0)
1486 gpertea 13 GError("Error: cannot find file %s!\n",infname2.chars());
1487     picmd="";
1488     GStr fname2(getFileName(infname2.chars()));
1489     guess_unzip(fname2, picmd);
1490     if (picmd.is_empty()) {
1491     f_in2=fopen(infname2.chars(), "r");
1492     if (f_in2==NULL) GError("Error opening file '%s'!\n",infname2.chars());
1493     }
1494     else {
1495     picmd.append(infname2);
1496     f_in2=popen(picmd.chars(), "r");
1497     if (f_in2==NULL) GError("Error at popen %s!\n", picmd.chars());
1498     }
1499     f_out2=prepOutFile(infname2, pocmd);
1500     }