ViewVC Help
View File | Revision Log | Show Annotations | View Changeset | Root Listing
root/gclib/fqtrim/fqtrim.cpp
Revision: 180
Committed: Thu Feb 16 21:24:04 2012 UTC (7 years, 6 months ago) by gpertea
File size: 44059 byte(s)
Log Message:
works probably

Line User Rev File contents
1 gpertea 4 #include "GArgs.h"
2     #include "GStr.h"
3     #include "GHash.hh"
4     #include "GList.hh"
5 gpertea 13 #include <ctype.h>
6 gpertea 94 #include "GAlnExtend.h"
7 gpertea 4
8     #define USAGE "Usage:\n\
9 gpertea 180 fqtrim [{-5 <5adaptor> -3 <3adaptor>|-f <adaptors_file>}] [-a <min_matchlen>]\\\n\
10 gpertea 172 [-R] [-q <minq> [-t <trim_max_len>]] [-p {64|33}] [-o <outsuffix>]\\\n\
11 gpertea 76 [-l <minlen>] [-C] [-D] [-Q] [-n <rename_prefix>] [-r <discarded.lst>]\\\n\
12     <input.fq>[,<input_mates.fq>\n\
13 gpertea 4 \n\
14 gpertea 180 Trim low quality bases at the 3' end and can trim adaptor sequence(s), filter\n\
15 gpertea 76 for low complexity and collapse duplicate reads.\n\
16 gpertea 13 If read pairs should be trimmed and kept together (i.e. without discarding\n\
17     one read in a pair), the two file names should be given delimited by a comma\n\
18     or a colon character\n\
19 gpertea 4 \n\
20     Options:\n\
21     -n rename all the reads using the <prefix> followed by a read counter;\n\
22     if -C option was given, the suffix \"_x<N>\" is appended, with <N> being\n\
23     the read duplication count\n\
24 gpertea 76 -o unless this parameter is '-', write the trimmed/filtered reads to \n\
25     file(s) named <input>.<outsuffix> which will be created in the \n\
26     current (working) directory; (writes to stdout if -o- is given);\n\
27     a suffix ending with .gz, .gzip or .bz2 will enforce compression\n\
28 gpertea 180 -f file with adaptor sequences to trim, each line having this format:\n\
29     <5'-adaptor-sequence> <3'-adaptor-sequence>\n\
30     -5 trim the given adaptor or primer sequence at the 5' end of each read\n\
31 gpertea 4 (e.g. -5 CGACAGGTTCAGAGTTCTACAGTCCGACGATC)\n\
32 gpertea 180 -3 trim the given adaptor sequence at the 3' end of each read\n\
33 gpertea 4 (e.g. -3 TCGTATGCCGTCTTCTGCTTG)\n\
34 gpertea 103 -A disable polyA/T trimming (enabled by default)\n\
35 gpertea 94 -y minimum length of exact match to adaptor sequence at the proper end (6)\n\
36 gpertea 13 -q trim bases with quality value lower than <minq> (starting at the 3' end)\n\
37 gpertea 76 -t for -q option, maximum trimming at the 3' end is limited to <trim_max_len>\n\
38 gpertea 12 -m maximum percentage of Ns allowed in a read after trimming (default 7)\n\
39     -l minimum \"clean\" length after trimming that a read must have\n\
40     in order to pass the filter (default: 16)\n\
41 gpertea 4 -r write a simple \"trash report\" file listing the discarded reads with a\n\
42     one letter code for the reason why they were trashed\n\
43 gpertea 12 -D apply a low-complexity (dust) filter and discard any read that has over\n\
44     50% of its length detected as low complexity\n\
45 gpertea 4 -C collapse duplicate reads and add a prefix with their count to the read\n\
46 gpertea 12 name (e.g. for microRNAs)\n\
47     -p input is phred64/phred33 (use -p64 or -p33)\n\
48     -Q convert quality values to the other Phred qv type\n\
49     -V verbose processing\n\
50 gpertea 4 "
51 gpertea 170
52 gpertea 13 //-z for -o option, the output stream(s) will be first piped into the given\n
53     // <zcmd> command, which must output to stdout (e.g. -z 'bzip2 -9 -c')\n
54    
55 gpertea 170
56 gpertea 180 // example 3' adaptor for miRNAs: TCGTATGCCGTCTTCTGCTTG
57 gpertea 12
58 gpertea 76 //For paired reads sequencing:
59 gpertea 12 //3' : ACACTCTTTCCCTACACGACGCTCTTCCGATCT
60     //5' : GATCGGAAGAGCGGTTCAGCAGGAATGCCGAG
61 gpertea 13 //FILE* f_out=NULL; //stdout if not provided
62     //FILE* f_out2=NULL; //for paired reads
63     //FILE* f_in=NULL; //input fastq (stdin if not provided)
64     //FILE* f_in2=NULL; //for paired reads
65 gpertea 170
66 gpertea 4 FILE* freport=NULL;
67 gpertea 13
68 gpertea 4 bool debug=false;
69 gpertea 12 bool verbose=false;
70 gpertea 4 bool doCollapse=false;
71     bool doDust=false;
72 gpertea 103 bool doPolyTrim=true;
73 gpertea 13 bool fastaOutput=false;
74 gpertea 4 bool trashReport=false;
75 gpertea 172 bool revCompl=false; //also reverse complement adaptor sequences
76 gpertea 4 int min_read_len=16;
77 gpertea 12 double max_perc_N=7.0;
78 gpertea 4 int dust_cutoff=16;
79     bool isfasta=false;
80 gpertea 12 bool convert_phred=false;
81 gpertea 170 GStr outsuffix; // -o
82 gpertea 13 GStr prefix;
83     GStr zcmd;
84     int num_trimmed5=0;
85     int num_trimmed3=0;
86     int num_trimmedN=0;
87     int num_trimmedQ=0;
88     int min_trimmed5=INT_MAX;
89     int min_trimmed3=INT_MAX;
90 gpertea 12
91 gpertea 13 int qvtrim_qmin=0;
92     int qvtrim_max=0; //for -q, do not trim at 3'-end more than this number of bases
93 gpertea 12 int qv_phredtype=0; // could be 64 or 33 (0 means undetermined yet)
94     int qv_cvtadd=0; //could be -31 or +31
95    
96 gpertea 94 // adaptor matching metrics -- for X-drop ungapped extension
97 gpertea 177 //const int match_reward=2;
98     //const int mismatch_penalty=3;
99 gpertea 103 const int match_reward=2;
100 gpertea 177 const int mismatch_penalty=4;
101     const int Xdrop=10;
102 gpertea 103
103 gpertea 94 const int poly_m_score=2; //match score
104     const int poly_mis_score=-3; //mismatch
105 gpertea 88 const int poly_dropoff_score=7;
106 gpertea 94 int poly_minScore=12; //i.e. an exact match of 6 bases at the proper ends WILL be trimmed
107 gpertea 4
108 gpertea 94 const char *polyA_seed="AAAA";
109     const char *polyT_seed="TTTT";
110 gpertea 12
111 gpertea 170 struct CASeqData {
112 gpertea 180 //positional data for every possible hexamer in an adaptor
113     GVec<uint16>* pz[4096]; //0-based coordinates of all possible hexamers in the adaptor sequence
114     GVec<uint16>* pzr[4096]; //0-based coordinates of all possible hexamers for the reverse complement of the adaptor sequence
115     GStr seq; //actual adaptor sequence data
116 gpertea 170 GStr seqr; //reverse complement sequence
117 gpertea 180 int amlen; //fraction of adaptor length matching that's
118 gpertea 175 //enough to consider the alignment
119 gpertea 180 GAlnTrimType trim_type;
120 gpertea 172 bool use_reverse;
121 gpertea 175 CASeqData(bool rev=false):seq(),seqr(),
122     amlen(0), use_reverse(rev) {
123 gpertea 180 trim_type=galn_None; //should be updated later!
124 gpertea 172 for (int i=0;i<4096;i++) {
125     pz[i]=NULL;
126     pzr[i]=NULL;
127     }
128     }
129    
130     void update(const char* s) {
131 gpertea 175 seq=s;
132     table6mers(seq.chars(), seq.length(), pz);
133     amlen=iround(double(seq.length())*0.8);
134     if (amlen<12)
135     amlen=12;
136     if (!use_reverse) return;
137     //reverse complement
138     seqr=s;
139     int slen=seq.length();
140     for (int i=0;i<slen;i++)
141     seqr[i]=ntComplement(seq[slen-i-1]);
142     table6mers(seqr.chars(), seqr.length(), pzr);
143 gpertea 170 }
144    
145 gpertea 172 ~CASeqData() {
146     for (int i=0;i<4096;i++) {
147     delete pz[i];
148     delete pzr[i];
149 gpertea 170 }
150     }
151     };
152    
153 gpertea 180 GVec<CASeqData> adaptors5;
154     GVec<CASeqData> adaptors3;
155 gpertea 170
156 gpertea 103 CGreedyAlignData* gxmem_l=NULL;
157     CGreedyAlignData* gxmem_r=NULL;
158    
159 gpertea 4 // element in dhash:
160     class FqDupRec {
161     public:
162     int count; //how many of these reads are the same
163     int len; //length of qv
164     char* firstname; //optional, only if we want to keep the original read names
165     char* qv;
166     FqDupRec(GStr* qstr=NULL, const char* rname=NULL) {
167     len=0;
168     qv=NULL;
169     firstname=NULL;
170     count=0;
171     if (qstr!=NULL) {
172     qv=Gstrdup(qstr->chars());
173     len=qstr->length();
174     count++;
175     }
176     if (rname!=NULL) firstname=Gstrdup(rname);
177     }
178     ~FqDupRec() {
179     GFREE(qv);
180     GFREE(firstname);
181     }
182     void add(GStr& d) { //collapse another record into this one
183     if (d.length()!=len)
184     GError("Error at FqDupRec::add(): cannot collapse reads with different length!\n");
185     count++;
186     for (int i=0;i<len;i++)
187     qv[i]=(qv[i]+d[i])/2;
188     }
189     };
190    
191     void openfw(FILE* &f, GArgs& args, char opt) {
192     GStr s=args.getOpt(opt);
193     if (!s.is_empty()) {
194     if (s=='-') f=stdout;
195     else {
196 gpertea 13 f=fopen(s.chars(),"w");
197 gpertea 4 if (f==NULL) GError("Error creating file: %s\n", s.chars());
198     }
199     }
200     }
201    
202     #define FWCLOSE(fh) if (fh!=NULL && fh!=stdout) fclose(fh)
203     #define FRCLOSE(fh) if (fh!=NULL && fh!=stdin) fclose(fh)
204    
205     GHash<FqDupRec> dhash; //hash to keep track of duplicates
206    
207 gpertea 180 void addAdaptor(GVec<CASeqData>& adaptors, GStr& seq, GAlnTrimType trim_type);
208     int loadAdaptors(const char* fname);
209 gpertea 94
210 gpertea 170 void setupFiles(FILE*& f_in, FILE*& f_in2, FILE*& f_out, FILE*& f_out2,
211     GStr& s, GStr& infname, GStr& infname2);
212 gpertea 13 // uses outsuffix to generate output file names and open file handles as needed
213 gpertea 170
214 gpertea 13 void writeRead(FILE* f_out, GStr& rname, GStr& rinfo, GStr& rseq, GStr& rqv, int& outcounter);
215     void trash_report(char trashcode, GStr& rname, FILE* freport);
216    
217 gpertea 170 bool getFastxRec(GLineReader& fq, GStr& rseq, GStr& rqv,
218 gpertea 13 GStr& rname, GStr& rinfo, GStr& infname);
219    
220 gpertea 170 char process_read(GStr& rname, GStr& rseq, GStr& rqv, int &l5, int &l3);
221 gpertea 13 //returns 0 if the read was untouched, 1 if it was trimmed and a trash code if it was trashed
222    
223 gpertea 12 bool ntrim(GStr& rseq, int &l5, int &l3); //returns true if any trimming occured
224     bool qtrim(GStr& qvs, int &l5, int &l3); //return true if any trimming occured
225 gpertea 4 int dust(GStr& seq);
226 gpertea 94 bool trim_poly5(GStr &seq, int &l5, int &l3, const char* poly_seed); //returns true if any trimming occured
227     bool trim_poly3(GStr &seq, int &l5, int &l3, const char* poly_seed);
228 gpertea 180 bool trim_adaptor5(GStr& seq, int &l5, int &l3); //returns true if any trimming occured
229     bool trim_adaptor3(GStr& seq, int &l5, int &l3);
230 gpertea 4
231 gpertea 12 void convertPhred(char* q, int len);
232     void convertPhred(GStr& q);
233 gpertea 4
234     int main(int argc, char * const argv[]) {
235 gpertea 172 GArgs args(argc, argv, "YQDCRVAl:d:3:5:m:n:r:p:q:f:t:o:z:a:");
236 gpertea 4 int e;
237     if ((e=args.isError())>0) {
238     GMessage("%s\nInvalid argument: %s\n", USAGE, argv[e]);
239     exit(224);
240     }
241 gpertea 9 debug=(args.getOpt('Y')!=NULL);
242 gpertea 13 verbose=(args.getOpt('V')!=NULL);
243 gpertea 12 convert_phred=(args.getOpt('Q')!=NULL);
244 gpertea 4 doCollapse=(args.getOpt('C')!=NULL);
245     doDust=(args.getOpt('D')!=NULL);
246 gpertea 172 revCompl=(args.getOpt('R')!=NULL);
247 gpertea 103 if (args.getOpt('A')) doPolyTrim=false;
248 gpertea 12 /*
249 gpertea 4 rawFormat=(args.getOpt('R')!=NULL);
250     if (rawFormat) {
251     GError("Sorry, raw qseq format parsing is not implemented yet!\n");
252     }
253 gpertea 12 */
254 gpertea 4 prefix=args.getOpt('n');
255     GStr s=args.getOpt('l');
256 gpertea 170 if (!s.is_empty())
257 gpertea 4 min_read_len=s.asInt();
258 gpertea 12 s=args.getOpt('m');
259 gpertea 170 if (!s.is_empty())
260 gpertea 12 max_perc_N=s.asDouble();
261 gpertea 4 s=args.getOpt('d');
262     if (!s.is_empty()) {
263     dust_cutoff=s.asInt();
264     doDust=true;
265     }
266 gpertea 12 s=args.getOpt('q');
267     if (!s.is_empty()) {
268 gpertea 13 qvtrim_qmin=s.asInt();
269 gpertea 12 }
270 gpertea 13 s=args.getOpt('t');
271     if (!s.is_empty()) {
272     qvtrim_max=s.asInt();
273     }
274 gpertea 12 s=args.getOpt('p');
275     if (!s.is_empty()) {
276     int v=s.asInt();
277     if (v==33) {
278     qv_phredtype=33;
279     qv_cvtadd=31;
280     }
281     else if (v==64) {
282     qv_phredtype=64;
283     qv_cvtadd=-31;
284     }
285 gpertea 170 else
286 gpertea 12 GMessage("%s\nInvalid value for -p option (can only be 64 or 33)!\n",USAGE);
287     }
288 gpertea 94 s=args.getOpt('f');
289     if (!s.is_empty()) {
290 gpertea 180 loadAdaptors(s.chars());
291 gpertea 94 }
292 gpertea 180 bool fileAdaptors=adaptors5.Count()+adaptors3.Count();
293 gpertea 94 s=args.getOpt('5');
294     if (!s.is_empty()) {
295 gpertea 180 if (fileAdaptors)
296 gpertea 94 GError("Error: options -5 and -f cannot be used together!\n");
297     s.upper();
298 gpertea 180 addAdaptor(adaptors5, s, galn_TrimLeft);
299 gpertea 4 }
300 gpertea 94 s=args.getOpt('3');
301     if (!s.is_empty()) {
302 gpertea 180 if (fileAdaptors)
303 gpertea 94 GError("Error: options -3 and -f cannot be used together!\n");
304 gpertea 102 s.upper();
305 gpertea 180 addAdaptor(adaptors3, s, galn_TrimRight);
306 gpertea 4 }
307 gpertea 94 s=args.getOpt('y');
308 gpertea 13 if (!s.is_empty()) {
309 gpertea 94 int minmatch=s.asInt();
310     poly_minScore=minmatch*poly_m_score;
311 gpertea 13 }
312 gpertea 170
313 gpertea 13 if (args.getOpt('o')!=NULL) outsuffix=args.getOpt('o');
314 gpertea 76 else outsuffix="-";
315 gpertea 4 trashReport= (args.getOpt('r')!=NULL);
316 gpertea 13 int fcount=args.startNonOpt();
317     if (fcount==0) {
318 gpertea 4 GMessage(USAGE);
319     exit(224);
320     }
321 gpertea 13 if (fcount>1 && doCollapse) {
322     GError("%s Sorry, the -C option only works with a single input.\n", USAGE);
323     }
324     //openfw(f_out, args, 'o');
325     //if (f_out==NULL) f_out=stdout;
326 gpertea 4 if (trashReport)
327     openfw(freport, args, 'r');
328     char* infile=NULL;
329 gpertea 103
330 gpertea 180 if (adaptors5.Count()>0)
331 gpertea 177 //gxmem_l=new CGreedyAlignData(match_reward, mismatch_penalty, Xdrop-2);
332     gxmem_l=new CGreedyAlignData(match_reward, mismatch_penalty, Xdrop);
333 gpertea 180 if (adaptors3.Count()>0)
334 gpertea 103 gxmem_r=new CGreedyAlignData(match_reward, mismatch_penalty, Xdrop);
335    
336 gpertea 4 while ((infile=args.nextNonOpt())!=NULL) {
337 gpertea 103 //for each input file
338 gpertea 13 int incounter=0; //counter for input reads
339     int outcounter=0; //counter for output reads
340     int trash_s=0; //too short from the get go
341     int trash_Q=0;
342     int trash_N=0;
343     int trash_D=0;
344 gpertea 103 int trash_poly=0;
345 gpertea 13 int trash_A3=0;
346     int trash_A5=0;
347     s=infile;
348     GStr infname;
349     GStr infname2;
350     FILE* f_in=NULL;
351     FILE* f_in2=NULL;
352     FILE* f_out=NULL;
353     FILE* f_out2=NULL;
354     bool paired_reads=false;
355     setupFiles(f_in, f_in2, f_out, f_out2, s, infname, infname2);
356     GLineReader fq(f_in);
357     GLineReader* fq2=NULL;
358     if (f_in2!=NULL) {
359     fq2=new GLineReader(f_in2);
360     paired_reads=true;
361     }
362     GStr rseq, rqv, seqid, seqinfo;
363     GStr rseq2, rqv2, seqid2, seqinfo2;
364     while (getFastxRec(fq, rseq, rqv, seqid, seqinfo, infname)) {
365     incounter++;
366     int a5=0, a3=0, b5=0, b3=0;
367     char tcode=0, tcode2=0;
368     tcode=process_read(seqid, rseq, rqv, a5, a3);
369 gpertea 103 if (fq2!=NULL) {
370 gpertea 13 getFastxRec(*fq2, rseq2, rqv2, seqid2, seqinfo2, infname2);
371     if (seqid.substr(0,seqid.length()-1)!=seqid2.substr(0,seqid2.length()-1)) {
372     GError("Error: no paired match for read %s vs %s (%s,%s)\n",
373     seqid.chars(), seqid2.chars(), infname.chars(), infname2.chars());
374     }
375     tcode2=process_read(seqid2, rseq2, rqv2, b5, b3);
376     //decide what to do with this pair and print rseq2 if the pair makes it
377     if (tcode>1 && tcode2<=1) {
378     //"untrash" rseq
379     if (a3-a5+1<min_read_len) {
380     a5=1;
381     if (a3<min_read_len) { a3= GMIN(rseq.length()-1, min_read_len+1); }
382 gpertea 4 }
383 gpertea 13 tcode=1;
384     }
385     else if (tcode<=1 && tcode2>1) {
386     //"untrash" rseq2
387     if (b3-b5+1<min_read_len) {
388     b5=1;
389     if (b3<min_read_len) { b3= GMIN((rseq2.length()-1),(min_read_len+1)); }
390     }
391     tcode2=1;
392     }
393     if (tcode<=1) { //trimmed or left intact -- write it!
394     if (tcode>0) {
395     rseq2=rseq2.substr(b5,b3-b5+1);
396     if (!rqv2.is_empty()) rqv2=rqv2.substr(b5,b3-b5+1);
397 gpertea 4 }
398 gpertea 13 int nocounter=0;
399     writeRead(f_out2, seqid2, seqinfo2, rseq2, rqv2, nocounter);
400     }
401 gpertea 103 } //pair read
402 gpertea 13 if (tcode>1) { //trashed
403 gpertea 103 #ifdef GDEBUG
404 gpertea 175 GMessage(" !!!!TRASH code = %c\n",tcode);
405 gpertea 103 #endif
406 gpertea 13 if (tcode=='s') trash_s++;
407 gpertea 103 else if (tcode=='A' || tcode=='T') trash_poly++;
408 gpertea 13 else if (tcode=='Q') trash_Q++;
409     else if (tcode=='N') trash_N++;
410     else if (tcode=='D') trash_D++;
411     else if (tcode=='3') trash_A3++;
412     else if (tcode=='5') trash_A5++;
413     if (trashReport) trash_report(tcode, seqid, freport);
414     }
415     else if (!doCollapse) { //write it
416     if (tcode>0) {
417     rseq=rseq.substr(a5,a3-a5+1);
418     if (!rqv.is_empty()) rqv=rqv.substr(a5,a3-a5+1);
419     }
420 gpertea 103 #ifdef GDEBUG
421     GMessage(" After trimming:\n");
422 gpertea 105 GMessage("%s\n",rseq.chars());
423 gpertea 103 #endif
424 gpertea 13 writeRead(f_out, seqid, seqinfo, rseq, rqv, outcounter);
425     }
426     } //for each fastq record
427     delete fq2;
428     FRCLOSE(f_in);
429     FRCLOSE(f_in2);
430     if (doCollapse) {
431     outcounter=0;
432     int maxdup_count=1;
433     char* maxdup_seq=NULL;
434     dhash.startIterate();
435     FqDupRec* qd=NULL;
436     char* seq=NULL;
437     while ((qd=dhash.NextData(seq))!=NULL) {
438     GStr rseq(seq);
439     //do the dusting here
440     if (doDust) {
441     int dustbases=dust(rseq);
442     if (dustbases>(rseq.length()>>1)) {
443     if (trashReport && qd->firstname!=NULL) {
444     fprintf(freport, "%s_x%d\tD\n",qd->firstname, qd->count);
445 gpertea 4 }
446 gpertea 13 trash_D+=qd->count;
447     continue;
448     }
449     }
450     outcounter++;
451 gpertea 170 if (qd->count>maxdup_count) {
452 gpertea 13 maxdup_count=qd->count;
453     maxdup_seq=seq;
454     }
455     if (isfasta) {
456     if (prefix.is_empty()) {
457 gpertea 170 fprintf(f_out, ">%s_x%d\n%s\n", qd->firstname, qd->count,
458 gpertea 13 rseq.chars());
459 gpertea 4 }
460 gpertea 13 else { //use custom read name
461     fprintf(f_out, ">%s%08d_x%d\n%s\n", prefix.chars(), outcounter,
462     qd->count, rseq.chars());
463 gpertea 4 }
464 gpertea 13 }
465     else { //fastq format
466     if (convert_phred) convertPhred(qd->qv, qd->len);
467     if (prefix.is_empty()) {
468 gpertea 170 fprintf(f_out, "@%s_x%d\n%s\n+\n%s\n", qd->firstname, qd->count,
469 gpertea 13 rseq.chars(), qd->qv);
470 gpertea 4 }
471 gpertea 13 else { //use custom read name
472     fprintf(f_out, "@%s%08d_x%d\n%s\n+\n%s\n", prefix.chars(), outcounter,
473     qd->count, rseq.chars(), qd->qv);
474     }
475     }
476     }//for each element of dhash
477     if (maxdup_count>1) {
478     GMessage("Maximum read multiplicity: x %d (read: %s)\n",maxdup_count, maxdup_seq);
479 gpertea 4 }
480 gpertea 13 } //collapse entries
481     if (verbose) {
482     if (paired_reads) {
483     GMessage(">Input files : %s , %s\n", infname.chars(), infname2.chars());
484     GMessage("Number of input pairs :%9d\n", incounter);
485     GMessage(" Output pairs :%9d\n", outcounter);
486     }
487     else {
488     GMessage(">Input file : %s\n", infname.chars());
489     GMessage("Number of input reads :%9d\n", incounter);
490     GMessage(" Output reads :%9d\n", outcounter);
491     }
492     GMessage("------------------------------------\n");
493     if (num_trimmed5)
494     GMessage(" 5' trimmed :%9d (min. trim: %d)\n", num_trimmed5, min_trimmed5);
495     if (num_trimmed3)
496     GMessage(" 3' trimmed :%9d (min. trim: %d)\n", num_trimmed3, min_trimmed3);
497     GMessage("------------------------------------\n");
498     if (trash_s>0)
499     GMessage(" Trashed by length:%9d\n", trash_s);
500     if (trash_N>0)
501     GMessage(" Trashed by N%%:%9d\n", trash_N);
502     if (trash_Q>0)
503     GMessage("Trashed by low quality:%9d\n", trash_Q);
504 gpertea 103 if (trash_poly>0)
505     GMessage(" Trashed by poly-A/T:%9d\n", trash_poly);
506 gpertea 13 if (trash_A5>0)
507 gpertea 180 GMessage(" Trashed by 5' adaptor:%9d\n", trash_A5);
508 gpertea 13 if (trash_A3>0)
509 gpertea 180 GMessage(" Trashed by 3' adaptor:%9d\n", trash_A3);
510 gpertea 13 }
511     if (trashReport) {
512     FWCLOSE(freport);
513 gpertea 4 }
514 gpertea 13 FWCLOSE(f_out);
515     FWCLOSE(f_out2);
516     } //while each input file
517 gpertea 103 delete gxmem_l;
518     delete gxmem_r;
519 gpertea 12 //getc(stdin);
520 gpertea 4 }
521    
522     class NData {
523     public:
524     int NPos[1024]; //there should be no reads longer than 1K ?
525     int NCount;
526     int end5;
527     int end3;
528     int seqlen;
529     double perc_N; //percentage of Ns in end5..end3 range only!
530     const char* seq;
531     bool valid;
532     NData() {
533 gpertea 170 seqlen=0;
534 gpertea 4 NCount=0;
535     end5=0;
536     end3=0;
537     seq=NULL;
538     perc_N=0;
539     valid=true;
540     }
541     void init(GStr& rseq) {
542     NCount=0;
543     perc_N=0;
544     seqlen=rseq.length();
545     seq=rseq.chars();
546     end5=1;
547     end3=seqlen;
548     for (int i=0;i<seqlen;i++)
549 gpertea 12 if (seq[i]=='N') {// if (!ichrInStr(rseq[i], "ACGT")
550 gpertea 4 NPos[NCount]=i;
551     NCount++;
552     }
553     perc_N=(NCount*100.0)/seqlen;
554     valid=true;
555     }
556     void N_calc() { //only in the region after trimming
557     int n=0;
558     for (int i=end3-1;i<end5;i++) {
559     if (seq[i]=='N') n++;
560     }
561     perc_N=(n*100.0)/(end5-end3+1);
562     }
563     };
564 gpertea 170
565 gpertea 4 static NData feat;
566     int perc_lenN=12; // incremental distance from ends, in percentage of
567     // sequence length, where N-trimming is done (default:12 %) (autolimited to 20)
568 gpertea 170
569 gpertea 4 void N_analyze(int l5, int l3, int p5, int p3) {
570     /* assumes feat was filled properly */
571     int old_dif, t5,t3,v;
572 gpertea 12 if (l3<l5+2 || p5>p3 ) {
573 gpertea 4 feat.end5=l5+1;
574     feat.end3=l3+1;
575 gpertea 170 return;
576 gpertea 4 }
577    
578     t5=feat.NPos[p5]-l5;
579     t3=l3-feat.NPos[p3];
580     old_dif=p3-p5;
581     v=(int)((((double)(l3-l5))*perc_lenN)/100);
582 gpertea 170 if (v>20) v=20; /* enforce N-search limit for very long reads */
583 gpertea 4 if (t5 < v ) {
584     l5=feat.NPos[p5]+1;
585     p5++;
586     }
587     if (t3 < v) {
588     l3=feat.NPos[p3]-1;
589     p3--;
590     }
591     /* restNs=p3-p5; number of Ns in the new CLR */
592     if (p3-p5==old_dif) { /* no change, return */
593     /* don't trim if this may shorten the read too much */
594     //if (l5-l3<min_read_len) return;
595     feat.end5=l5+1;
596     feat.end3=l3+1;
597     return;
598     }
599 gpertea 170 else
600 gpertea 4 N_analyze(l5,l3, p5,p3);
601     }
602    
603    
604 gpertea 12 bool qtrim(GStr& qvs, int &l5, int &l3) {
605 gpertea 13 if (qvtrim_qmin==0 || qvs.is_empty()) return false;
606 gpertea 12 if (qv_phredtype==0) {
607     //try to guess the Phred type
608     int vmin=256, vmax=0;
609     for (int i=0;i<qvs.length();i++) {
610     if (vmin>qvs[i]) vmin=qvs[i];
611     if (vmax<qvs[i]) vmax=qvs[i];
612     }
613     if (vmin<64) { qv_phredtype=33; qv_cvtadd=31; }
614     if (vmax>95) { qv_phredtype=64; qv_cvtadd=-31; }
615     if (qv_phredtype==0) {
616     GError("Error: couldn't determine Phred type, please use the -p33 or -p64 !\n");
617     }
618 gpertea 13 if (verbose)
619     GMessage("Input reads have Phred-%d quality values.\n", (qv_phredtype==33 ? 33 : 64));
620     } //guessing Phred type
621 gpertea 12 for (l3=qvs.length()-1;l3>2;l3--) {
622 gpertea 13 if (qvs[l3]-qv_phredtype>=qvtrim_qmin && qvs[l3-1]-qv_phredtype>=qvtrim_qmin) break;
623 gpertea 12 }
624     //just in case, check also the 5' the end (?)
625     for (l5=0;l5<qvs.length()-3;l5++) {
626 gpertea 13 if (qvs[l5]-qv_phredtype>=qvtrim_qmin && qvs[l5+1]-qv_phredtype>=qvtrim_qmin) break;
627 gpertea 12 }
628 gpertea 13 if (qvtrim_max>0) {
629     if (qvs.length()-1-l3>qvtrim_max) l3=qvs.length()-1-qvtrim_max;
630     if (l5>qvtrim_max) l5=qvtrim_max;
631     }
632 gpertea 12 return (l5>0 || l3<qvs.length()-1);
633     }
634    
635     bool ntrim(GStr& rseq, int &l5, int &l3) {
636     //count Ns in the sequence, trim N-rich ends
637 gpertea 4 feat.init(rseq);
638     l5=feat.end5-1;
639     l3=feat.end3-1;
640 gpertea 170 N_analyze(feat.end5-1, feat.end3-1, 0, feat.NCount-1);
641 gpertea 12 if (l5==feat.end5-1 && l3==feat.end3-1) {
642     if (feat.perc_N>max_perc_N) {
643     feat.valid=false;
644     l3=l5+1;
645     return true;
646     }
647     else {
648     return false; //nothing changed
649     }
650     }
651 gpertea 4 l5=feat.end5-1;
652     l3=feat.end3-1;
653     if (l3-l5+1<min_read_len) {
654     feat.valid=false;
655     return true;
656     }
657     feat.N_calc();
658 gpertea 170
659 gpertea 12 if (feat.perc_N>max_perc_N) {
660 gpertea 4 feat.valid=false;
661     l3=l5+1;
662     return true;
663     }
664     return true;
665     }
666    
667     //--------------- dust functions ----------------
668     class DNADuster {
669     public:
670 gpertea 170 int dustword;
671     int dustwindow;
672     int dustwindow2;
673 gpertea 4 int dustcutoff;
674     int mv, iv, jv;
675     int counts[32*32*32];
676     int iis[32*32*32];
677     DNADuster(int cutoff=16, int winsize=32, int wordsize=3) {
678     dustword=wordsize;
679     dustwindow=winsize;
680     dustwindow2 = (winsize>>1);
681     dustcutoff=cutoff;
682     mv=0;
683     iv=0;
684     jv=0;
685     }
686     void setWindowSize(int value) {
687     dustwindow = value;
688     dustwindow2 = (dustwindow >> 1);
689     }
690     void setWordSize(int value) {
691     dustword=value;
692     }
693     void wo1(int len, const char* s, int ivv) {
694     int i, ii, j, v, t, n, n1, sum;
695     int js, nis;
696     n = 32 * 32 * 32;
697     n1 = n - 1;
698     nis = 0;
699     i = 0;
700     ii = 0;
701     sum = 0;
702     v = 0;
703     for (j=0; j < len; j++, s++) {
704     ii <<= 5;
705     if (*s<=32) {
706     i=0;
707     continue;
708     }
709     ii |= *s - 'A'; //assume uppercase!
710     ii &= n1;
711     i++;
712     if (i >= dustword) {
713     for (js=0; js < nis && iis[js] != ii; js++) ;
714     if (js == nis) {
715     iis[nis] = ii;
716     counts[ii] = 0;
717     nis++;
718     }
719     if ((t = counts[ii]) > 0) {
720     sum += t;
721     v = 10 * sum / j;
722     if (mv < v) {
723     mv = v;
724     iv = ivv;
725     jv = j;
726     }
727     }
728     counts[ii]++;
729     }
730     }
731     }
732    
733     int wo(int len, const char* s, int* beg, int* end) {
734     int i, l1;
735     l1 = len - dustword + 1;
736     if (l1 < 0) {
737     *beg = 0;
738     *end = len - 1;
739     return 0;
740     }
741     mv = 0;
742     iv = 0;
743     jv = 0;
744     for (i=0; i < l1; i++) {
745     wo1(len-i, s+i, i);
746     }
747     *beg = iv;
748     *end = iv + jv;
749     return mv;
750     }
751    
752     void dust(const char* seq, char* seqmsk, int seqlen, int cutoff=0) { //, maskFunc maskfn) {
753     int i, j, l, a, b, v;
754     if (cutoff==0) cutoff=dustcutoff;
755     a=0;b=0;
756     //GMessage("Dust cutoff=%d\n", cutoff);
757     for (i=0; i < seqlen; i += dustwindow2) {
758     l = (seqlen > i+dustwindow) ? dustwindow : seqlen-i;
759     v = wo(l, seq+i, &a, &b);
760     if (v > cutoff) {
761     //for (j = a; j <= b && j < dustwindow2; j++) {
762     for (j = a; j <= b; j++) {
763     seqmsk[i+j]='N';//could be made lowercase instead
764     }
765     }
766     }
767 gpertea 170 //return first;
768 gpertea 4 }
769     };
770    
771     static DNADuster duster;
772    
773     int dust(GStr& rseq) {
774     char* seq=Gstrdup(rseq.chars());
775     duster.dust(rseq.chars(), seq, rseq.length(), dust_cutoff);
776     //check the number of Ns:
777     int ncount=0;
778     for (int i=0;i<rseq.length();i++) {
779     if (seq[i]=='N') ncount++;
780     }
781     GFREE(seq);
782     return ncount;
783     }
784    
785 gpertea 94 struct SLocScore {
786     int pos;
787     int score;
788     SLocScore(int p=0,int s=0) {
789     pos=p;
790     score=s;
791     }
792     void set(int p, int s) {
793     pos=p;
794     score=s;
795     }
796     void add(int p, int add) {
797     pos=p;
798     score+=add;
799     }
800     };
801 gpertea 12
802 gpertea 94 bool trim_poly3(GStr &seq, int &l5, int &l3, const char* poly_seed) {
803 gpertea 103 if (!doPolyTrim) return false;
804 gpertea 94 int rlen=seq.length();
805     l5=0;
806     l3=rlen-1;
807     int32 seedVal=*(int32*)poly_seed;
808     char polyChar=poly_seed[0];
809     //assumes N trimming was already done
810     //so a poly match should be very close to the end of the read
811     // -- find the initial match (seed)
812 gpertea 108 int lmin=GMAX((rlen-16), 0);
813 gpertea 94 int li;
814     for (li=rlen-4;li>lmin;li--) {
815     if (seedVal==*(int*)&(seq[li])) {
816     break;
817     }
818 gpertea 12 }
819 gpertea 94 if (li<=lmin) return false;
820     //seed found, try to extend it both ways
821     //extend right
822     int ri=li+3;
823     SLocScore loc(ri, poly_m_score<<2);
824     SLocScore maxloc(loc);
825     //extend right
826 gpertea 170 while (ri<rlen-1) {
827 gpertea 94 ri++;
828     if (seq[ri]==polyChar) {
829     loc.add(ri,poly_m_score);
830     }
831     else if (seq[ri]=='N') {
832     loc.add(ri,0);
833     }
834     else { //mismatch
835     loc.add(ri,poly_mis_score);
836     if (maxloc.score-loc.score>poly_dropoff_score) break;
837     }
838     if (maxloc.score<=loc.score) {
839     maxloc=loc;
840     }
841 gpertea 12 }
842 gpertea 94 ri=maxloc.pos;
843 gpertea 103 if (ri<rlen-6) return false; //no trimming wanted, too far from 3' end
844 gpertea 94 //ri = right boundary for the poly match
845     //extend left
846     loc.set(li, maxloc.score);
847     maxloc.pos=li;
848     while (li>0) {
849     li--;
850     if (seq[li]==polyChar) {
851     loc.add(li,poly_m_score);
852     }
853     else if (seq[li]=='N') {
854     loc.add(li,0);
855     }
856     else { //mismatch
857     loc.add(li,poly_mis_score);
858     if (maxloc.score-loc.score>poly_dropoff_score) break;
859     }
860     if (maxloc.score<=loc.score) {
861     maxloc=loc;
862     }
863     }
864 gpertea 105 li=maxloc.pos;
865 gpertea 108 if ((maxloc.score==poly_minScore && ri==rlen-1) ||
866     (maxloc.score>poly_minScore && ri>=rlen-3) ||
867     (maxloc.score>(poly_minScore*3) && ri>=rlen-8)) {
868 gpertea 105 //trimming this li-ri match at 3' end
869     l3=li-1;
870     if (l3<0) l3=0;
871 gpertea 94 return true;
872     }
873     return false;
874 gpertea 12 }
875    
876 gpertea 94 bool trim_poly5(GStr &seq, int &l5, int &l3, const char* poly_seed) {
877 gpertea 103 if (!doPolyTrim) return false;
878 gpertea 88 int rlen=seq.length();
879     l5=0;
880     l3=rlen-1;
881 gpertea 94 int32 seedVal=*(int32*)poly_seed;
882     char polyChar=poly_seed[0];
883 gpertea 88 //assumes N trimming was already done
884     //so a poly match should be very close to the end of the read
885     // -- find the initial match (seed)
886 gpertea 108 int lmax=GMIN(12, rlen-4);//how far from 5' end to look for 4-mer seeds
887 gpertea 94 int li;
888     for (li=0;li<=lmax;li++) {
889     if (seedVal==*(int*)&(seq[li])) {
890 gpertea 88 break;
891     }
892     }
893 gpertea 94 if (li>lmax) return false;
894     //seed found, try to extend it both ways
895     //extend left
896     int ri=li+3; //save rightmost base of the seed
897     SLocScore loc(li, poly_m_score<<2);
898     SLocScore maxloc(loc);
899     while (li>0) {
900     li--;
901     if (seq[li]==polyChar) {
902     loc.add(li,poly_m_score);
903     }
904     else if (seq[li]=='N') {
905     loc.add(li,0);
906     }
907     else { //mismatch
908     loc.add(li,poly_mis_score);
909     if (maxloc.score-loc.score>poly_dropoff_score) break;
910     }
911     if (maxloc.score<=loc.score) {
912     maxloc=loc;
913     }
914 gpertea 88 }
915 gpertea 94 li=maxloc.pos;
916 gpertea 103 if (li>5) return false; //no trimming wanted, too far from 5' end
917 gpertea 94 //li = right boundary for the poly match
918    
919     //extend right
920     loc.set(ri, maxloc.score);
921     maxloc.pos=ri;
922 gpertea 170 while (ri<rlen-1) {
923 gpertea 94 ri++;
924     if (seq[ri]==polyChar) {
925     loc.add(ri,poly_m_score);
926     }
927     else if (seq[ri]=='N') {
928     loc.add(ri,0);
929     }
930     else { //mismatch
931     loc.add(ri,poly_mis_score);
932     if (maxloc.score-loc.score>poly_dropoff_score) break;
933     }
934     if (maxloc.score<=loc.score) {
935     maxloc=loc;
936     }
937     }
938 gpertea 105 ri=maxloc.pos;
939 gpertea 103 if ((maxloc.score==poly_minScore && li==0) ||
940     (maxloc.score>poly_minScore && li<2)
941 gpertea 108 || (maxloc.score>(poly_minScore*3) && li<8)) {
942 gpertea 105 //adjust l5 to reflect this trimming of 5' end
943     l5=ri+1;
944     if (l5>rlen-1) l5=rlen-1;
945 gpertea 94 return true;
946     }
947     return false;
948 gpertea 88 }
949    
950 gpertea 180 bool trim_adaptor3(GStr& seq, int&l5, int &l3) {
951     if (adaptors3.Count()==0) return false;
952 gpertea 4 int rlen=seq.length();
953     l5=0;
954     l3=rlen-1;
955 gpertea 103 bool trimmed=false;
956 gpertea 175 GStr wseq(seq);
957 gpertea 103 int wlen=rlen;
958 gpertea 175 GXSeqData seqdata;
959 gpertea 180 int numruns=revCompl ? 2 : 1;
960     for (int ai=0;ai<adaptors3.Count();ai++) {
961     for (int r=0;r<numruns;r++) {
962     if (r) {
963     seqdata.update(adaptors3[ai].seqr.chars(), adaptors3[ai].seqr.length(),
964     adaptors3[ai].pzr, wseq.chars(), wlen, adaptors3[ai].amlen);
965     }
966     else {
967     seqdata.update(adaptors3[ai].seq.chars(), adaptors3[ai].seq.length(),
968     adaptors3[ai].pz, wseq.chars(), wlen, adaptors3[ai].amlen);
969     }
970    
971     GXAlnInfo* bestaln=match_adaptor(seqdata, adaptors3[ai].trim_type, gxmem_r, 86);
972 gpertea 175 if (bestaln) {
973 gpertea 103 trimmed=true;
974 gpertea 175 //keep unmatched region on the left OR right (the longer one)
975     if (bestaln->sl > wlen-bestaln->sr) {
976     //keep left side
977     l3-=(wlen-bestaln->sl+1);
978     if (l3<0) l3=0;
979     }
980     else { //keep right side
981     l5+=bestaln->sr;
982     if (l5>=rlen) l5=rlen-1;
983     }
984     delete bestaln;
985 gpertea 103 if (l3-l5+1<min_read_len) return true;
986     wseq=seq.substr(l5,l3-l5+1);
987     wlen=wseq.length();
988     }
989 gpertea 180 }//forward and reverse adaptors
990     }//for each 3' adaptor
991 gpertea 103 return trimmed;
992 gpertea 4 }
993    
994 gpertea 180 bool trim_adaptor5(GStr& seq, int&l5, int &l3) {
995     if (adaptors5.Count()==0) return false;
996 gpertea 4 int rlen=seq.length();
997     l5=0;
998     l3=rlen-1;
999 gpertea 102 bool trimmed=false;
1000 gpertea 175 GStr wseq(seq);
1001 gpertea 103 int wlen=rlen;
1002 gpertea 175 GXSeqData seqdata;
1003 gpertea 180 int numruns=revCompl ? 2 : 1;
1004     for (int ai=0;ai<adaptors5.Count();ai++) {
1005     for (int r=0;r<numruns;r++) {
1006     if (r) {
1007     seqdata.update(adaptors5[ai].seqr.chars(), adaptors5[ai].seqr.length(),
1008     adaptors5[ai].pzr, wseq.chars(), wlen, adaptors5[ai].amlen);
1009     }
1010     else {
1011     seqdata.update(adaptors5[ai].seq.chars(), adaptors5[ai].seq.length(),
1012     adaptors5[ai].pz, wseq.chars(), wlen, adaptors5[ai].amlen);
1013     }
1014     GXAlnInfo* bestaln=match_adaptor(seqdata, adaptors5[ai].trim_type, gxmem_l, 90);
1015     if (bestaln) {
1016     trimmed=true;
1017     if (bestaln->sl-1 > wlen-bestaln->sr) {
1018     //keep left side
1019     l3-=(wlen-bestaln->sl+1);
1020     if (l3<0) l3=0;
1021     }
1022     else { //keep right side
1023     l5+=bestaln->sr;
1024     if (l5>=rlen) l5=rlen-1;
1025     }
1026     delete bestaln;
1027     if (l3-l5+1<min_read_len) return true;
1028     wseq=seq.substr(l5,l3-l5+1);
1029     wlen=wseq.length();
1030     }
1031     } //forward and reverse?
1032     }//for each 5' adaptor
1033 gpertea 102 return trimmed;
1034 gpertea 98 }
1035 gpertea 4
1036 gpertea 170 //convert qvs to/from phred64 from/to phread33
1037 gpertea 12 void convertPhred(GStr& q) {
1038     for (int i=0;i<q.length();i++) q[i]+=qv_cvtadd;
1039 gpertea 4 }
1040    
1041 gpertea 12 void convertPhred(char* q, int len) {
1042     for (int i=0;i<len;i++) q[i]+=qv_cvtadd;
1043 gpertea 4 }
1044    
1045 gpertea 170 bool getFastxRec(GLineReader& fq, GStr& rseq, GStr& rqv,
1046 gpertea 13 GStr& rname, GStr& rinfo, GStr& infname) {
1047     rseq="";
1048     rqv="";
1049     rname="";
1050     rinfo="";
1051     if (fq.eof()) return false;
1052     char* l=fq.getLine();
1053     while (l!=NULL && (l[0]==0 || isspace(l[0]))) l=fq.getLine(); //ignore empty lines
1054     if (l==NULL) return false;
1055     /* if (rawFormat) {
1056     //TODO: implement raw qseq parsing here
1057     //if (raw type=N) then continue; //skip invalid/bad records
1058     } //raw qseq format
1059     else { // FASTQ or FASTA */
1060     isfasta=(l[0]=='>');
1061 gpertea 170 if (!isfasta && l[0]!='@') GError("Error: fasta/fastq record marker not found(%s)\n%s\n",
1062 gpertea 13 infname.chars(), l);
1063     GStr s(l);
1064     rname=&(l[1]);
1065     for (int i=0;i<rname.length();i++)
1066 gpertea 170 if (rname[i]<=' ') {
1067     if (i<rname.length()-2) rinfo=rname.substr(i+1);
1068     rname.cut(i);
1069     break;
1070 gpertea 13 }
1071     //now get the sequence
1072 gpertea 170 if ((l=fq.getLine())==NULL)
1073 gpertea 13 GError("Error: unexpected EOF after header for read %s (%s)\n",
1074     rname.chars(), infname.chars());
1075     rseq=l; //this must be the DNA line
1076     while ((l=fq.getLine())!=NULL) {
1077     //seq can span multiple lines
1078     if (l[0]=='>' || l[0]=='+') {
1079 gpertea 170 fq.pushBack();
1080 gpertea 13 break; //
1081     }
1082     rseq+=l;
1083 gpertea 170 } //check for multi-line seq
1084 gpertea 13 if (!isfasta) { //reading fastq quality values, which can also be multi-line
1085     if ((l=fq.getLine())==NULL)
1086     GError("Error: unexpected EOF after sequence for %s\n", rname.chars());
1087     if (l[0]!='+') GError("Error: fastq qv header marker not detected!\n");
1088 gpertea 170 if ((l=fq.getLine())==NULL)
1089 gpertea 13 GError("Error: unexpected EOF after qv header for %s\n", rname.chars());
1090     rqv=l;
1091 gpertea 170 //if (rqv.length()!=rseq.length())
1092 gpertea 13 // GError("Error: qv len != seq len for %s\n", rname.chars());
1093     while (rqv.length()<rseq.length() && ((l=fq.getLine())!=NULL)) {
1094     rqv+=l; //append to qv string
1095     }
1096     }// fastq
1097     // } //<-- FASTA or FASTQ
1098 gpertea 94 rseq.upper();
1099 gpertea 13 return true;
1100     }
1101    
1102 gpertea 105 #ifdef GDEBUG
1103     void showTrim(GStr& s, int l5, int l3) {
1104 gpertea 177 if (l5>0 || l3==0) {
1105 gpertea 105 color_bg(c_red);
1106     }
1107     for (int i=0;i<s.length()-1;i++) {
1108     if (i && i==l5) color_resetbg();
1109     fprintf(stderr, "%c", s[i]);
1110 gpertea 177 if (i && i==l3) color_bg(c_red);
1111 gpertea 105 }
1112     fprintf(stderr, "%c", s[s.length()-1]);
1113     color_reset();
1114     fprintf(stderr, "\n");
1115     }
1116     #endif
1117    
1118 gpertea 13 char process_read(GStr& rname, GStr& rseq, GStr& rqv, int &l5, int &l3) {
1119 gpertea 170 //returns 0 if the read was untouched, 1 if it was just trimmed
1120 gpertea 13 // and a trash code if it was trashed
1121     l5=0;
1122     l3=rseq.length()-1;
1123 gpertea 103 #ifdef GDEBUG
1124 gpertea 108 //rseq.reverse();
1125 gpertea 103 GMessage(">%s\n", rname.chars());
1126 gpertea 105 GMessage("%s\n",rseq.chars());
1127 gpertea 103 #endif
1128 gpertea 13 if (l3-l5+1<min_read_len) {
1129     return 's';
1130     }
1131     GStr wseq(rseq.chars());
1132     GStr wqv(rqv.chars());
1133     int w5=l5;
1134     int w3=l3;
1135     //first do the q-based trimming
1136     if (qvtrim_qmin!=0 && !wqv.is_empty() && qtrim(wqv, w5, w3)) { // qv-threshold trimming
1137     if (w3-w5+1<min_read_len) {
1138     return 'Q'; //invalid read
1139     }
1140     //-- keep only the w5..w3 range
1141     l5=w5;
1142     l3=w3;
1143     wseq=wseq.substr(w5, w3-w5+1);
1144     if (!wqv.is_empty())
1145     wqv=wqv.substr(w5, w3-w5+1);
1146     } //qv trimming
1147     // N-trimming on the remaining read seq
1148     if (ntrim(wseq, w5, w3)) {
1149     //GMessage("before: %s\n",wseq.chars());
1150     //GMessage("after : %s (%d)\n",wseq.substr(w5,w3-w5+1).chars(),w3-w5+1);
1151     l5+=w5;
1152     l3-=(wseq.length()-1-w3);
1153     if (w3-w5+1<min_read_len) {
1154     return 'N'; //to be trashed
1155     }
1156     //-- keep only the w5..w3 range
1157     wseq=wseq.substr(w5, w3-w5+1);
1158     if (!wqv.is_empty())
1159     wqv=wqv.substr(w5, w3-w5+1);
1160     w5=0;
1161     w3=wseq.length()-1;
1162     }
1163 gpertea 103 char trim_code;
1164 gpertea 180 //clean the more dirty end first - 3'
1165     int prev_w5=0;
1166     int prev_w3=0;
1167     bool w3upd=true;
1168     bool w5upd=true;
1169 gpertea 103 do {
1170     trim_code=0;
1171 gpertea 180 if (w3upd && trim_poly3(wseq, w5, w3, polyA_seed)) {
1172 gpertea 103 trim_code='A';
1173     }
1174 gpertea 180 else if (w3upd && trim_poly3(wseq, w5, w3, polyT_seed)) {
1175 gpertea 103 trim_code='T';
1176     }
1177 gpertea 180 else if (w5upd && trim_poly5(wseq, w5, w3, polyA_seed)) {
1178 gpertea 103 trim_code='A';
1179     }
1180 gpertea 180 else if (w5upd && trim_poly5(wseq, w5, w3, polyT_seed)) {
1181 gpertea 103 trim_code='T';
1182     }
1183 gpertea 180 else if (trim_adaptor5(wseq, w5, w3)) {
1184     trim_code='5';
1185     }
1186     else if (trim_adaptor3(wseq, w5, w3)) {
1187 gpertea 103 trim_code='3';
1188     }
1189     if (trim_code) {
1190 gpertea 180 w3upd=(w3!=prev_w3);
1191     w5upd=(w5!=prev_w5);
1192     if (w3upd) prev_w3=w3;
1193     if (w5upd) prev_w5=w5;
1194     #ifdef GDEBUG
1195 gpertea 105 GMessage("#### TRIM by '%c' code ( w5-w3 = %d-%d ):\n",trim_code, w5,w3);
1196     showTrim(wseq, w5, w3);
1197 gpertea 180 #endif
1198 gpertea 13 int trimlen=wseq.length()-(w3-w5+1);
1199 gpertea 103 num_trimmed3++;
1200 gpertea 170 if (trimlen<min_trimmed3)
1201 gpertea 103 min_trimmed3=trimlen;
1202 gpertea 13 l5+=w5;
1203     l3-=(wseq.length()-1-w3);
1204     if (w3-w5+1<min_read_len) {
1205 gpertea 103 return trim_code;
1206 gpertea 13 }
1207     //-- keep only the w5..w3 range
1208     wseq=wseq.substr(w5, w3-w5+1);
1209     if (!wqv.is_empty())
1210     wqv=wqv.substr(w5, w3-w5+1);
1211 gpertea 103 }//trimming at 3' end
1212     } while (trim_code);
1213    
1214 gpertea 13 if (doCollapse) {
1215     //keep read for later
1216     FqDupRec* dr=dhash.Find(wseq.chars());
1217     if (dr==NULL) { //new entry
1218 gpertea 170 //if (prefix.is_empty())
1219     dhash.Add(wseq.chars(),
1220 gpertea 13 new FqDupRec(&wqv, rname.chars()));
1221     //else dhash.Add(wseq.chars(), new FqDupRec(wqv.chars(),wqv.length()));
1222     }
1223     else
1224     dr->add(wqv);
1225     } //collapsing duplicates
1226     else { //not collapsing duplicates
1227     //apply the dust filter now
1228     if (doDust) {
1229     int dustbases=dust(wseq);
1230     if (dustbases>(wseq.length()>>1)) {
1231     return 'D';
1232     }
1233     }
1234     } //not collapsing duplicates
1235     return (l5>0 || l3<rseq.length()-1) ? 1 : 0;
1236     }
1237    
1238     void printHeader(FILE* f_out, char recmarker, GStr& rname, GStr& rinfo) {
1239     //GMessage("printing Header..%c%s\n",recmarker, rname.chars());
1240     if (rinfo.is_empty()) fprintf(f_out, "%c%s\n",recmarker,rname.chars());
1241     else fprintf(f_out, "%c%s %s\n",recmarker, rname.chars(), rinfo.chars());
1242     }
1243    
1244     void writeRead(FILE* f_out, GStr& rname, GStr& rinfo, GStr& rseq, GStr& rqv, int& outcounter) {
1245     outcounter++;
1246     bool asFasta=(rqv.is_empty() || fastaOutput);
1247     if (asFasta) {
1248     if (prefix.is_empty()) {
1249     printHeader(f_out, '>',rname,rinfo);
1250     fprintf(f_out, "%s\n", rseq.chars()); //plain one-line fasta for now
1251     }
1252     else {
1253 gpertea 170 fprintf(f_out, ">%s%08d\n%s\n", prefix.chars(), outcounter,
1254 gpertea 13 rseq.chars());
1255     }
1256     }
1257     else { //fastq
1258     if (convert_phred) convertPhred(rqv);
1259     if (prefix.is_empty()) {
1260     printHeader(f_out, '@', rname, rinfo);
1261     fprintf(f_out, "%s\n+\n%s\n", rseq.chars(), rqv.chars());
1262     }
1263     else
1264 gpertea 170 fprintf(f_out, "@%s_%08d\n%s\n+\n%s\n", prefix.chars(), outcounter,
1265 gpertea 13 rseq.chars(),rqv.chars() );
1266     }
1267     }
1268    
1269     void trash_report(char trashcode, GStr& rname, FILE* freport) {
1270     if (freport==NULL || trashcode<=' ') return;
1271     if (trashcode=='3' || trashcode=='5') {
1272 gpertea 103 fprintf(freport, "%s\ta%c\n",rname.chars(),trashcode);
1273 gpertea 13 }
1274     else {
1275     fprintf(freport, "%s\t%c\n",rname.chars(),trashcode);
1276     }
1277     //tcounter++;
1278     }
1279    
1280     GStr getFext(GStr& s, int* xpos=NULL) {
1281     //s must be a filename without a path
1282     GStr r("");
1283     if (xpos!=NULL) *xpos=0;
1284     if (s.is_empty() || s=="-") return r;
1285     int p=s.rindex('.');
1286     int d=s.rindex('/');
1287     if (p<=0 || p>s.length()-2 || p<s.length()-7 || p<d) return r;
1288     r=s.substr(p+1);
1289     if (xpos!=NULL) *xpos=p+1;
1290     r.lower();
1291     return r;
1292     }
1293    
1294     void baseFileName(GStr& fname) {
1295     //remove all known extensions, like .txt,fq,fastq,fasta,fa)(.gz .gzip .bz2 .bzip2) .
1296     int xpos=0;
1297     GStr fext=getFext(fname, &xpos);
1298     if (xpos<=1) return;
1299     bool extdel=false;
1300     GStr f2;
1301     if (fext=="z" || fext=="zip") {
1302     extdel=true;
1303     }
1304     else if (fext.length()>=2) {
1305     f2=fext.substr(0,2);
1306     extdel=(f2=="gz" || f2=="bz");
1307     }
1308     if (extdel) {
1309     fname.cut(xpos-1);
1310     fext=getFext(fname, &xpos);
1311     if (xpos<=1) return;
1312     }
1313     extdel=false;
1314     if (fext=="f" || fext=="fq" || fext=="txt" || fext=="seq" || fext=="sequence") {
1315     extdel=true;
1316     }
1317     else if (fext.length()>=2) {
1318     extdel=(fext.substr(0,2)=="fa");
1319     }
1320     if (extdel) fname.cut(xpos-1);
1321     GStr fncp(fname);
1322     fncp.lower();
1323     fncp.chomp("seq");
1324     fncp.chomp("sequence");
1325     fncp.trimR("_.");
1326     if (fncp.length()<fname.length()) fname.cut(fncp.length());
1327     }
1328    
1329     FILE* prepOutFile(GStr& infname, GStr& pocmd) {
1330     FILE* f_out=NULL;
1331     GStr fname(getFileName(infname.chars()));
1332     //eliminate known extensions
1333     baseFileName(fname);
1334     if (outsuffix.is_empty() || outsuffix=="-") { return stdout; }
1335     else if (pocmd.is_empty()) {
1336     GStr oname(fname);
1337     oname.append('.');
1338     oname.append(outsuffix);
1339     f_out=fopen(oname.chars(),"w");
1340     if (f_out==NULL) GError("Error: cannot create '%s'\n",oname.chars());
1341     }
1342     else {
1343     GStr oname(">");
1344     oname.append(fname);
1345     oname.append('.');
1346     oname.append(outsuffix);
1347     pocmd.append(oname);
1348     f_out=popen(pocmd.chars(), "w");
1349     if (f_out==NULL) GError("Error: cannot popen '%s'\n",pocmd.chars());
1350     }
1351     return f_out;
1352     }
1353    
1354     void guess_unzip(GStr& fname, GStr& picmd) {
1355     GStr fext=getFext(fname);
1356     if (fext=="gz" || fext=="gzip" || fext=="z") {
1357     picmd="gzip -cd ";
1358     }
1359     else if (fext=="bz2" || fext=="bzip2" || fext=="bz" || fext=="bzip") {
1360     picmd="bzip2 -cd ";
1361     }
1362     }
1363    
1364 gpertea 180 void addAdaptor(GVec<CASeqData>& adaptors, GStr& seq, GAlnTrimType trim_type) {
1365 gpertea 170 //TODO: prepare CASeqData here, and collect hexamers as well
1366 gpertea 180 if (seq.is_empty() || seq=="-" ||
1367     seq=="N/A" || seq==".") return;
1368    
1369 gpertea 172 CASeqData adata(revCompl);
1370 gpertea 180 int idx=adaptors.Add(adata);
1371 gpertea 172 if (idx<0) GError("Error: failed to add adaptor!\n");
1372 gpertea 180 adaptors[idx].trim_type=trim_type;
1373     adaptors[idx].update(seq.chars());
1374 gpertea 170 }
1375    
1376    
1377 gpertea 180 int loadAdaptors(const char* fname) {
1378 gpertea 94 GLineReader lr(fname);
1379     char* l;
1380     while ((l=lr.nextLine())!=NULL) {
1381     if (lr.length()<=3 || l[0]=='#') continue;
1382 gpertea 98 if ( l[0]==' ' || l[0]=='\t' || l[0]==',' ||
1383     l[0]==';'|| l[0]==':' ) {
1384 gpertea 94 int i=1;
1385     while (l[i]!=0 && isspace(l[i])) {
1386     i++;
1387     }
1388     if (l[i]!=0) {
1389 gpertea 102 GStr s(&(l[i]));
1390 gpertea 108 #ifdef GDEBUG
1391     //s.reverse();
1392     #endif
1393 gpertea 180 addAdaptor(adaptors3, s, galn_TrimRight);
1394 gpertea 94 continue;
1395     }
1396     }
1397     else {
1398 gpertea 98 GStr s(l);
1399     s.startTokenize("\t ;,:");
1400     GStr a5,a3;
1401     if (s.nextToken(a5))
1402 gpertea 180 s.nextToken(a3);
1403     else continue; //no tokens on this line
1404     GAlnTrimType ttype5=galn_TrimLeft;
1405 gpertea 102 a5.upper();
1406     a3.upper();
1407 gpertea 180 if (a3.is_empty() || a3==a5 || a3=="=") {
1408     a3.clear();
1409     ttype5=galn_TrimEither;
1410     }
1411 gpertea 108 #ifdef GDEBUG
1412     // a5.reverse();
1413     // a3.reverse();
1414     #endif
1415 gpertea 180 addAdaptor(adaptors5, a5, ttype5);
1416     addAdaptor(adaptors3, a3, galn_TrimRight);
1417 gpertea 94 }
1418     }
1419 gpertea 180 return adaptors5.Count()+adaptors3.Count();
1420 gpertea 94 }
1421    
1422 gpertea 170 void setupFiles(FILE*& f_in, FILE*& f_in2, FILE*& f_out, FILE*& f_out2,
1423     GStr& s, GStr& infname, GStr& infname2) {
1424 gpertea 13 // uses outsuffix to generate output file names and open file handles as needed
1425     infname="";
1426     infname2="";
1427     f_in=NULL;
1428     f_in2=NULL;
1429     f_out=NULL;
1430     f_out2=NULL;
1431     //analyze outsuffix intent
1432     GStr pocmd;
1433 gpertea 76 if (outsuffix=="-") {
1434     f_out=stdout;
1435     }
1436     else {
1437     GStr ox=getFext(outsuffix);
1438     if (ox.length()>2) ox=ox.substr(0,2);
1439     if (ox=="gz") pocmd="gzip -9 -c ";
1440     else if (ox=="bz") pocmd="bzip2 -9 -c ";
1441     }
1442 gpertea 13 if (s=="-") {
1443     f_in=stdin;
1444     infname="stdin";
1445     f_out=prepOutFile(infname, pocmd);
1446     return;
1447     } // streaming from stdin
1448     s.startTokenize(",:");
1449     s.nextToken(infname);
1450     bool paired=s.nextToken(infname2);
1451 gpertea 170 if (fileExists(infname.chars())==0)
1452 gpertea 13 GError("Error: cannot find file %s!\n",infname.chars());
1453     GStr fname(getFileName(infname.chars()));
1454     GStr picmd;
1455     guess_unzip(fname, picmd);
1456     if (picmd.is_empty()) {
1457     f_in=fopen(infname.chars(), "r");
1458     if (f_in==NULL) GError("Error opening file '%s'!\n",infname.chars());
1459     }
1460     else {
1461     picmd.append(infname);
1462     f_in=popen(picmd.chars(), "r");
1463     if (f_in==NULL) GError("Error at popen %s!\n", picmd.chars());
1464     }
1465 gpertea 76 if (f_out==stdout) {
1466     if (paired) GError("Error: output suffix required for paired reads\n");
1467     return;
1468     }
1469 gpertea 13 f_out=prepOutFile(infname, pocmd);
1470     if (!paired) return;
1471     if (doCollapse) GError("Error: sorry, -C option cannot be used with paired reads!\n");
1472     // ---- paired reads:-------------
1473 gpertea 170 if (fileExists(infname2.chars())==0)
1474 gpertea 13 GError("Error: cannot find file %s!\n",infname2.chars());
1475     picmd="";
1476     GStr fname2(getFileName(infname2.chars()));
1477     guess_unzip(fname2, picmd);
1478     if (picmd.is_empty()) {
1479     f_in2=fopen(infname2.chars(), "r");
1480     if (f_in2==NULL) GError("Error opening file '%s'!\n",infname2.chars());
1481     }
1482     else {
1483     picmd.append(infname2);
1484     f_in2=popen(picmd.chars(), "r");
1485     if (f_in2==NULL) GError("Error at popen %s!\n", picmd.chars());
1486     }
1487     f_out2=prepOutFile(infname2, pocmd);
1488     }