ViewVC Help
View File | Revision Log | Show Annotations | View Changeset | Root Listing
root/gclib/fqtrim/fqtrim.cpp
Revision: 170
Committed: Tue Feb 14 22:35:51 2012 UTC (7 years, 6 months ago) by gpertea
File size: 42010 byte(s)
Log Message:
wip fqtrim

Line User Rev File contents
1 gpertea 4 #include "GArgs.h"
2     #include "GStr.h"
3     #include "GHash.hh"
4     #include "GList.hh"
5 gpertea 13 #include <ctype.h>
6 gpertea 94 #include "GAlnExtend.h"
7 gpertea 4
8     #define USAGE "Usage:\n\
9 gpertea 76 fqtrim [{-5 <5adapter> -3 <3adapter>|-f <adapters_file>}] [-a <min_matchlen>]\\\n\
10     [-q <minq> [-t <trim_max_len>]] [-p {64|33}] [-o <outsuffix>]\\\n\
11     [-l <minlen>] [-C] [-D] [-Q] [-n <rename_prefix>] [-r <discarded.lst>]\\\n\
12     <input.fq>[,<input_mates.fq>\n\
13 gpertea 4 \n\
14 gpertea 76 Trim low quality bases at the 3' end and can trim adapter sequence(s), filter\n\
15     for low complexity and collapse duplicate reads.\n\
16 gpertea 13 If read pairs should be trimmed and kept together (i.e. without discarding\n\
17     one read in a pair), the two file names should be given delimited by a comma\n\
18     or a colon character\n\
19 gpertea 4 \n\
20     Options:\n\
21     -n rename all the reads using the <prefix> followed by a read counter;\n\
22     if -C option was given, the suffix \"_x<N>\" is appended, with <N> being\n\
23     the read duplication count\n\
24 gpertea 76 -o unless this parameter is '-', write the trimmed/filtered reads to \n\
25     file(s) named <input>.<outsuffix> which will be created in the \n\
26     current (working) directory; (writes to stdout if -o- is given);\n\
27     a suffix ending with .gz, .gzip or .bz2 will enforce compression\n\
28     -f file with adapter sequences to trim, each line having this format:\n\
29     <5'-adapter-sequence> <3'-adapter-sequence>\n\
30 gpertea 4 -5 trim the given adapter or primer sequence at the 5' end of each read\n\
31     (e.g. -5 CGACAGGTTCAGAGTTCTACAGTCCGACGATC)\n\
32     -3 trim the given adapter sequence at the 3' end of each read\n\
33     (e.g. -3 TCGTATGCCGTCTTCTGCTTG)\n\
34 gpertea 103 -A disable polyA/T trimming (enabled by default)\n\
35 gpertea 94 -y minimum length of exact match to adaptor sequence at the proper end (6)\n\
36 gpertea 13 -q trim bases with quality value lower than <minq> (starting at the 3' end)\n\
37 gpertea 76 -t for -q option, maximum trimming at the 3' end is limited to <trim_max_len>\n\
38 gpertea 12 -m maximum percentage of Ns allowed in a read after trimming (default 7)\n\
39     -l minimum \"clean\" length after trimming that a read must have\n\
40     in order to pass the filter (default: 16)\n\
41 gpertea 4 -r write a simple \"trash report\" file listing the discarded reads with a\n\
42     one letter code for the reason why they were trashed\n\
43 gpertea 12 -D apply a low-complexity (dust) filter and discard any read that has over\n\
44     50% of its length detected as low complexity\n\
45 gpertea 4 -C collapse duplicate reads and add a prefix with their count to the read\n\
46 gpertea 12 name (e.g. for microRNAs)\n\
47     -p input is phred64/phred33 (use -p64 or -p33)\n\
48     -Q convert quality values to the other Phred qv type\n\
49     -V verbose processing\n\
50 gpertea 4 "
51 gpertea 170
52 gpertea 13 //-z for -o option, the output stream(s) will be first piped into the given\n
53     // <zcmd> command, which must output to stdout (e.g. -z 'bzip2 -9 -c')\n
54    
55 gpertea 170
56 gpertea 4 // example 3' adapter for miRNAs: TCGTATGCCGTCTTCTGCTTG
57 gpertea 12
58 gpertea 76 //For paired reads sequencing:
59 gpertea 12 //3' : ACACTCTTTCCCTACACGACGCTCTTCCGATCT
60     //5' : GATCGGAAGAGCGGTTCAGCAGGAATGCCGAG
61 gpertea 13 //FILE* f_out=NULL; //stdout if not provided
62     //FILE* f_out2=NULL; //for paired reads
63     //FILE* f_in=NULL; //input fastq (stdin if not provided)
64     //FILE* f_in2=NULL; //for paired reads
65 gpertea 170
66 gpertea 4 FILE* freport=NULL;
67 gpertea 13
68 gpertea 4 bool debug=false;
69 gpertea 12 bool verbose=false;
70 gpertea 4 bool doCollapse=false;
71     bool doDust=false;
72 gpertea 103 bool doPolyTrim=true;
73 gpertea 13 bool fastaOutput=false;
74 gpertea 4 bool trashReport=false;
75 gpertea 12 //bool rawFormat=false;
76 gpertea 4 int min_read_len=16;
77 gpertea 12 double max_perc_N=7.0;
78 gpertea 4 int dust_cutoff=16;
79     bool isfasta=false;
80 gpertea 12 bool convert_phred=false;
81 gpertea 170 GStr outsuffix; // -o
82 gpertea 13 GStr prefix;
83     GStr zcmd;
84     int num_trimmed5=0;
85     int num_trimmed3=0;
86     int num_trimmedN=0;
87     int num_trimmedQ=0;
88     int min_trimmed5=INT_MAX;
89     int min_trimmed3=INT_MAX;
90 gpertea 12
91 gpertea 13 int qvtrim_qmin=0;
92     int qvtrim_max=0; //for -q, do not trim at 3'-end more than this number of bases
93 gpertea 12 int qv_phredtype=0; // could be 64 or 33 (0 means undetermined yet)
94     int qv_cvtadd=0; //could be -31 or +31
95    
96 gpertea 94 // adaptor matching metrics -- for X-drop ungapped extension
97 gpertea 103 const int match_reward=2;
98     const int mismatch_penalty=3;
99     const int Xdrop=8;
100    
101 gpertea 94 const int poly_m_score=2; //match score
102     const int poly_mis_score=-3; //mismatch
103 gpertea 88 const int poly_dropoff_score=7;
104 gpertea 94 int poly_minScore=12; //i.e. an exact match of 6 bases at the proper ends WILL be trimmed
105 gpertea 4
106 gpertea 94 const char *polyA_seed="AAAA";
107     const char *polyT_seed="TTTT";
108 gpertea 12
109 gpertea 170 struct CASeqData {
110     //positional data for every possible hexamer in an adapter
111     GVec<uint16>* pz[64]; //0-based coordinates of all possible hexamers in the adapter sequence
112     GVec<uint16>* pzr[64]; //0-based coordinates of all possible hexamers for the reverse complement of the adapter sequence
113     GStr seq; //actual adapter sequence data
114     GStr seqr; //reverse complement sequence
115     CASeqData(bool rev=false):seq(),seqr() {
116     for (int i=0;i<63;i++) {
117     pz[i]=new GVec<uint16>(1);
118     if (rev) pzr[i]=new GVec<uint16>(1);
119     }
120     }
121    
122     ~CSeqData() {
123     for (int i=0;i<63;i++) {
124     delete pz[i];
125     if (rev) { delete pzr[i]; }
126     }
127     }
128     };
129    
130     GVec<CASeqData> adapters5;
131     GVec<CASeqData> adapters3;
132    
133 gpertea 103 CGreedyAlignData* gxmem_l=NULL;
134     CGreedyAlignData* gxmem_r=NULL;
135    
136 gpertea 4 // element in dhash:
137     class FqDupRec {
138     public:
139     int count; //how many of these reads are the same
140     int len; //length of qv
141     char* firstname; //optional, only if we want to keep the original read names
142     char* qv;
143     FqDupRec(GStr* qstr=NULL, const char* rname=NULL) {
144     len=0;
145     qv=NULL;
146     firstname=NULL;
147     count=0;
148     if (qstr!=NULL) {
149     qv=Gstrdup(qstr->chars());
150     len=qstr->length();
151     count++;
152     }
153     if (rname!=NULL) firstname=Gstrdup(rname);
154     }
155     ~FqDupRec() {
156     GFREE(qv);
157     GFREE(firstname);
158     }
159     void add(GStr& d) { //collapse another record into this one
160     if (d.length()!=len)
161     GError("Error at FqDupRec::add(): cannot collapse reads with different length!\n");
162     count++;
163     for (int i=0;i<len;i++)
164     qv[i]=(qv[i]+d[i])/2;
165     }
166     };
167    
168     void openfw(FILE* &f, GArgs& args, char opt) {
169     GStr s=args.getOpt(opt);
170     if (!s.is_empty()) {
171     if (s=='-') f=stdout;
172     else {
173 gpertea 13 f=fopen(s.chars(),"w");
174 gpertea 4 if (f==NULL) GError("Error creating file: %s\n", s.chars());
175     }
176     }
177     }
178    
179     #define FWCLOSE(fh) if (fh!=NULL && fh!=stdout) fclose(fh)
180     #define FRCLOSE(fh) if (fh!=NULL && fh!=stdin) fclose(fh)
181    
182     GHash<FqDupRec> dhash; //hash to keep track of duplicates
183    
184 gpertea 94 int loadAdapters(const char* fname);
185    
186 gpertea 170 void setupFiles(FILE*& f_in, FILE*& f_in2, FILE*& f_out, FILE*& f_out2,
187     GStr& s, GStr& infname, GStr& infname2);
188 gpertea 13 // uses outsuffix to generate output file names and open file handles as needed
189 gpertea 170
190 gpertea 13 void writeRead(FILE* f_out, GStr& rname, GStr& rinfo, GStr& rseq, GStr& rqv, int& outcounter);
191     void trash_report(char trashcode, GStr& rname, FILE* freport);
192    
193 gpertea 170 bool getFastxRec(GLineReader& fq, GStr& rseq, GStr& rqv,
194 gpertea 13 GStr& rname, GStr& rinfo, GStr& infname);
195    
196 gpertea 170 char process_read(GStr& rname, GStr& rseq, GStr& rqv, int &l5, int &l3);
197 gpertea 13 //returns 0 if the read was untouched, 1 if it was trimmed and a trash code if it was trashed
198    
199 gpertea 12 bool ntrim(GStr& rseq, int &l5, int &l3); //returns true if any trimming occured
200     bool qtrim(GStr& qvs, int &l5, int &l3); //return true if any trimming occured
201 gpertea 4 int dust(GStr& seq);
202 gpertea 94 bool trim_poly5(GStr &seq, int &l5, int &l3, const char* poly_seed); //returns true if any trimming occured
203     bool trim_poly3(GStr &seq, int &l5, int &l3, const char* poly_seed);
204 gpertea 4 bool trim_adapter5(GStr& seq, int &l5, int &l3); //returns true if any trimming occured
205 gpertea 94 bool trim_adapter3(GStr& seq, int &l5, int &l3);
206 gpertea 4
207 gpertea 12 void convertPhred(char* q, int len);
208     void convertPhred(GStr& q);
209 gpertea 4
210     int main(int argc, char * const argv[]) {
211 gpertea 103 GArgs args(argc, argv, "YQDCVAl:d:3:5:m:n:r:p:q:f:t:o:z:a:");
212 gpertea 4 int e;
213     if ((e=args.isError())>0) {
214     GMessage("%s\nInvalid argument: %s\n", USAGE, argv[e]);
215     exit(224);
216     }
217 gpertea 9 debug=(args.getOpt('Y')!=NULL);
218 gpertea 13 verbose=(args.getOpt('V')!=NULL);
219 gpertea 12 convert_phred=(args.getOpt('Q')!=NULL);
220 gpertea 4 doCollapse=(args.getOpt('C')!=NULL);
221     doDust=(args.getOpt('D')!=NULL);
222 gpertea 103 if (args.getOpt('A')) doPolyTrim=false;
223 gpertea 12 /*
224 gpertea 4 rawFormat=(args.getOpt('R')!=NULL);
225     if (rawFormat) {
226     GError("Sorry, raw qseq format parsing is not implemented yet!\n");
227     }
228 gpertea 12 */
229 gpertea 4 prefix=args.getOpt('n');
230     GStr s=args.getOpt('l');
231 gpertea 170 if (!s.is_empty())
232 gpertea 4 min_read_len=s.asInt();
233 gpertea 12 s=args.getOpt('m');
234 gpertea 170 if (!s.is_empty())
235 gpertea 12 max_perc_N=s.asDouble();
236 gpertea 4 s=args.getOpt('d');
237     if (!s.is_empty()) {
238     dust_cutoff=s.asInt();
239     doDust=true;
240     }
241 gpertea 12 s=args.getOpt('q');
242     if (!s.is_empty()) {
243 gpertea 13 qvtrim_qmin=s.asInt();
244 gpertea 12 }
245 gpertea 13 s=args.getOpt('t');
246     if (!s.is_empty()) {
247     qvtrim_max=s.asInt();
248     }
249 gpertea 12 s=args.getOpt('p');
250     if (!s.is_empty()) {
251     int v=s.asInt();
252     if (v==33) {
253     qv_phredtype=33;
254     qv_cvtadd=31;
255     }
256     else if (v==64) {
257     qv_phredtype=64;
258     qv_cvtadd=-31;
259     }
260 gpertea 170 else
261 gpertea 12 GMessage("%s\nInvalid value for -p option (can only be 64 or 33)!\n",USAGE);
262     }
263 gpertea 94 s=args.getOpt('f');
264     if (!s.is_empty()) {
265     loadAdapters(s.chars());
266     }
267 gpertea 102 bool fileAdapters=adapters5.Count()+adapters3.Count();
268 gpertea 94 s=args.getOpt('5');
269     if (!s.is_empty()) {
270     if (fileAdapters)
271     GError("Error: options -5 and -f cannot be used together!\n");
272     s.upper();
273 gpertea 102 adapters5.Add(s);
274 gpertea 4 }
275 gpertea 94 s=args.getOpt('3');
276     if (!s.is_empty()) {
277     if (fileAdapters)
278     GError("Error: options -3 and -f cannot be used together!\n");
279 gpertea 102 s.upper();
280     adapters3.Add(s);
281 gpertea 4 }
282 gpertea 94 s=args.getOpt('y');
283 gpertea 13 if (!s.is_empty()) {
284 gpertea 94 int minmatch=s.asInt();
285     poly_minScore=minmatch*poly_m_score;
286 gpertea 13 }
287 gpertea 170
288 gpertea 13 if (args.getOpt('o')!=NULL) outsuffix=args.getOpt('o');
289 gpertea 76 else outsuffix="-";
290 gpertea 4 trashReport= (args.getOpt('r')!=NULL);
291 gpertea 13 int fcount=args.startNonOpt();
292     if (fcount==0) {
293 gpertea 4 GMessage(USAGE);
294     exit(224);
295     }
296 gpertea 13 if (fcount>1 && doCollapse) {
297     GError("%s Sorry, the -C option only works with a single input.\n", USAGE);
298     }
299     //openfw(f_out, args, 'o');
300     //if (f_out==NULL) f_out=stdout;
301 gpertea 4 if (trashReport)
302     openfw(freport, args, 'r');
303     char* infile=NULL;
304 gpertea 103
305     if (adapters5.Count()>0)
306     gxmem_l=new CGreedyAlignData(match_reward, mismatch_penalty, Xdrop-2);
307     if (adapters3.Count()>0)
308     gxmem_r=new CGreedyAlignData(match_reward, mismatch_penalty, Xdrop);
309    
310 gpertea 4 while ((infile=args.nextNonOpt())!=NULL) {
311 gpertea 103 //for each input file
312 gpertea 13 int incounter=0; //counter for input reads
313     int outcounter=0; //counter for output reads
314     int trash_s=0; //too short from the get go
315     int trash_Q=0;
316     int trash_N=0;
317     int trash_D=0;
318 gpertea 103 int trash_poly=0;
319 gpertea 13 int trash_A3=0;
320     int trash_A5=0;
321     s=infile;
322     GStr infname;
323     GStr infname2;
324     FILE* f_in=NULL;
325     FILE* f_in2=NULL;
326     FILE* f_out=NULL;
327     FILE* f_out2=NULL;
328     bool paired_reads=false;
329     setupFiles(f_in, f_in2, f_out, f_out2, s, infname, infname2);
330     GLineReader fq(f_in);
331     GLineReader* fq2=NULL;
332     if (f_in2!=NULL) {
333     fq2=new GLineReader(f_in2);
334     paired_reads=true;
335     }
336     GStr rseq, rqv, seqid, seqinfo;
337     GStr rseq2, rqv2, seqid2, seqinfo2;
338     while (getFastxRec(fq, rseq, rqv, seqid, seqinfo, infname)) {
339     incounter++;
340     int a5=0, a3=0, b5=0, b3=0;
341     char tcode=0, tcode2=0;
342     tcode=process_read(seqid, rseq, rqv, a5, a3);
343 gpertea 103 if (fq2!=NULL) {
344 gpertea 13 getFastxRec(*fq2, rseq2, rqv2, seqid2, seqinfo2, infname2);
345     if (seqid.substr(0,seqid.length()-1)!=seqid2.substr(0,seqid2.length()-1)) {
346     GError("Error: no paired match for read %s vs %s (%s,%s)\n",
347     seqid.chars(), seqid2.chars(), infname.chars(), infname2.chars());
348     }
349     tcode2=process_read(seqid2, rseq2, rqv2, b5, b3);
350     //decide what to do with this pair and print rseq2 if the pair makes it
351     if (tcode>1 && tcode2<=1) {
352     //"untrash" rseq
353     if (a3-a5+1<min_read_len) {
354     a5=1;
355     if (a3<min_read_len) { a3= GMIN(rseq.length()-1, min_read_len+1); }
356 gpertea 4 }
357 gpertea 13 tcode=1;
358     }
359     else if (tcode<=1 && tcode2>1) {
360     //"untrash" rseq2
361     if (b3-b5+1<min_read_len) {
362     b5=1;
363     if (b3<min_read_len) { b3= GMIN((rseq2.length()-1),(min_read_len+1)); }
364     }
365     tcode2=1;
366     }
367     if (tcode<=1) { //trimmed or left intact -- write it!
368     if (tcode>0) {
369     rseq2=rseq2.substr(b5,b3-b5+1);
370     if (!rqv2.is_empty()) rqv2=rqv2.substr(b5,b3-b5+1);
371 gpertea 4 }
372 gpertea 13 int nocounter=0;
373     writeRead(f_out2, seqid2, seqinfo2, rseq2, rqv2, nocounter);
374     }
375 gpertea 103 } //pair read
376 gpertea 13 if (tcode>1) { //trashed
377 gpertea 103 #ifdef GDEBUG
378     GMessage(" !!!!TRASH => 'N'\n");
379     #endif
380 gpertea 13 if (tcode=='s') trash_s++;
381 gpertea 103 else if (tcode=='A' || tcode=='T') trash_poly++;
382 gpertea 13 else if (tcode=='Q') trash_Q++;
383     else if (tcode=='N') trash_N++;
384     else if (tcode=='D') trash_D++;
385     else if (tcode=='3') trash_A3++;
386     else if (tcode=='5') trash_A5++;
387     if (trashReport) trash_report(tcode, seqid, freport);
388     }
389     else if (!doCollapse) { //write it
390     if (tcode>0) {
391     rseq=rseq.substr(a5,a3-a5+1);
392     if (!rqv.is_empty()) rqv=rqv.substr(a5,a3-a5+1);
393     }
394 gpertea 103 #ifdef GDEBUG
395     GMessage(" After trimming:\n");
396 gpertea 105 GMessage("%s\n",rseq.chars());
397 gpertea 103 #endif
398 gpertea 13 writeRead(f_out, seqid, seqinfo, rseq, rqv, outcounter);
399     }
400     } //for each fastq record
401     delete fq2;
402     FRCLOSE(f_in);
403     FRCLOSE(f_in2);
404     if (doCollapse) {
405     outcounter=0;
406     int maxdup_count=1;
407     char* maxdup_seq=NULL;
408     dhash.startIterate();
409     FqDupRec* qd=NULL;
410     char* seq=NULL;
411     while ((qd=dhash.NextData(seq))!=NULL) {
412     GStr rseq(seq);
413     //do the dusting here
414     if (doDust) {
415     int dustbases=dust(rseq);
416     if (dustbases>(rseq.length()>>1)) {
417     if (trashReport && qd->firstname!=NULL) {
418     fprintf(freport, "%s_x%d\tD\n",qd->firstname, qd->count);
419 gpertea 4 }
420 gpertea 13 trash_D+=qd->count;
421     continue;
422     }
423     }
424     outcounter++;
425 gpertea 170 if (qd->count>maxdup_count) {
426 gpertea 13 maxdup_count=qd->count;
427     maxdup_seq=seq;
428     }
429     if (isfasta) {
430     if (prefix.is_empty()) {
431 gpertea 170 fprintf(f_out, ">%s_x%d\n%s\n", qd->firstname, qd->count,
432 gpertea 13 rseq.chars());
433 gpertea 4 }
434 gpertea 13 else { //use custom read name
435     fprintf(f_out, ">%s%08d_x%d\n%s\n", prefix.chars(), outcounter,
436     qd->count, rseq.chars());
437 gpertea 4 }
438 gpertea 13 }
439     else { //fastq format
440     if (convert_phred) convertPhred(qd->qv, qd->len);
441     if (prefix.is_empty()) {
442 gpertea 170 fprintf(f_out, "@%s_x%d\n%s\n+\n%s\n", qd->firstname, qd->count,
443 gpertea 13 rseq.chars(), qd->qv);
444 gpertea 4 }
445 gpertea 13 else { //use custom read name
446     fprintf(f_out, "@%s%08d_x%d\n%s\n+\n%s\n", prefix.chars(), outcounter,
447     qd->count, rseq.chars(), qd->qv);
448     }
449     }
450     }//for each element of dhash
451     if (maxdup_count>1) {
452     GMessage("Maximum read multiplicity: x %d (read: %s)\n",maxdup_count, maxdup_seq);
453 gpertea 4 }
454 gpertea 13 } //collapse entries
455     if (verbose) {
456     if (paired_reads) {
457     GMessage(">Input files : %s , %s\n", infname.chars(), infname2.chars());
458     GMessage("Number of input pairs :%9d\n", incounter);
459     GMessage(" Output pairs :%9d\n", outcounter);
460     }
461     else {
462     GMessage(">Input file : %s\n", infname.chars());
463     GMessage("Number of input reads :%9d\n", incounter);
464     GMessage(" Output reads :%9d\n", outcounter);
465     }
466     GMessage("------------------------------------\n");
467     if (num_trimmed5)
468     GMessage(" 5' trimmed :%9d (min. trim: %d)\n", num_trimmed5, min_trimmed5);
469     if (num_trimmed3)
470     GMessage(" 3' trimmed :%9d (min. trim: %d)\n", num_trimmed3, min_trimmed3);
471     GMessage("------------------------------------\n");
472     if (trash_s>0)
473     GMessage(" Trashed by length:%9d\n", trash_s);
474     if (trash_N>0)
475     GMessage(" Trashed by N%%:%9d\n", trash_N);
476     if (trash_Q>0)
477     GMessage("Trashed by low quality:%9d\n", trash_Q);
478 gpertea 103 if (trash_poly>0)
479     GMessage(" Trashed by poly-A/T:%9d\n", trash_poly);
480 gpertea 13 if (trash_A5>0)
481     GMessage(" Trashed by 5' adapter:%9d\n", trash_A5);
482     if (trash_A3>0)
483     GMessage(" Trashed by 3' adapter:%9d\n", trash_A3);
484     }
485     if (trashReport) {
486     FWCLOSE(freport);
487 gpertea 4 }
488 gpertea 13 FWCLOSE(f_out);
489     FWCLOSE(f_out2);
490     } //while each input file
491 gpertea 103 delete gxmem_l;
492     delete gxmem_r;
493 gpertea 12 //getc(stdin);
494 gpertea 4 }
495    
496     class NData {
497     public:
498     int NPos[1024]; //there should be no reads longer than 1K ?
499     int NCount;
500     int end5;
501     int end3;
502     int seqlen;
503     double perc_N; //percentage of Ns in end5..end3 range only!
504     const char* seq;
505     bool valid;
506     NData() {
507 gpertea 170 seqlen=0;
508 gpertea 4 NCount=0;
509     end5=0;
510     end3=0;
511     seq=NULL;
512     perc_N=0;
513     valid=true;
514     }
515     void init(GStr& rseq) {
516     NCount=0;
517     perc_N=0;
518     seqlen=rseq.length();
519     seq=rseq.chars();
520     end5=1;
521     end3=seqlen;
522     for (int i=0;i<seqlen;i++)
523 gpertea 12 if (seq[i]=='N') {// if (!ichrInStr(rseq[i], "ACGT")
524 gpertea 4 NPos[NCount]=i;
525     NCount++;
526     }
527     perc_N=(NCount*100.0)/seqlen;
528     valid=true;
529     }
530     void N_calc() { //only in the region after trimming
531     int n=0;
532     for (int i=end3-1;i<end5;i++) {
533     if (seq[i]=='N') n++;
534     }
535     perc_N=(n*100.0)/(end5-end3+1);
536     }
537     };
538 gpertea 170
539 gpertea 4 static NData feat;
540     int perc_lenN=12; // incremental distance from ends, in percentage of
541     // sequence length, where N-trimming is done (default:12 %) (autolimited to 20)
542 gpertea 170
543 gpertea 4 void N_analyze(int l5, int l3, int p5, int p3) {
544     /* assumes feat was filled properly */
545     int old_dif, t5,t3,v;
546 gpertea 12 if (l3<l5+2 || p5>p3 ) {
547 gpertea 4 feat.end5=l5+1;
548     feat.end3=l3+1;
549 gpertea 170 return;
550 gpertea 4 }
551    
552     t5=feat.NPos[p5]-l5;
553     t3=l3-feat.NPos[p3];
554     old_dif=p3-p5;
555     v=(int)((((double)(l3-l5))*perc_lenN)/100);
556 gpertea 170 if (v>20) v=20; /* enforce N-search limit for very long reads */
557 gpertea 4 if (t5 < v ) {
558     l5=feat.NPos[p5]+1;
559     p5++;
560     }
561     if (t3 < v) {
562     l3=feat.NPos[p3]-1;
563     p3--;
564     }
565     /* restNs=p3-p5; number of Ns in the new CLR */
566     if (p3-p5==old_dif) { /* no change, return */
567     /* don't trim if this may shorten the read too much */
568     //if (l5-l3<min_read_len) return;
569     feat.end5=l5+1;
570     feat.end3=l3+1;
571     return;
572     }
573 gpertea 170 else
574 gpertea 4 N_analyze(l5,l3, p5,p3);
575     }
576    
577    
578 gpertea 12 bool qtrim(GStr& qvs, int &l5, int &l3) {
579 gpertea 13 if (qvtrim_qmin==0 || qvs.is_empty()) return false;
580 gpertea 12 if (qv_phredtype==0) {
581     //try to guess the Phred type
582     int vmin=256, vmax=0;
583     for (int i=0;i<qvs.length();i++) {
584     if (vmin>qvs[i]) vmin=qvs[i];
585     if (vmax<qvs[i]) vmax=qvs[i];
586     }
587     if (vmin<64) { qv_phredtype=33; qv_cvtadd=31; }
588     if (vmax>95) { qv_phredtype=64; qv_cvtadd=-31; }
589     if (qv_phredtype==0) {
590     GError("Error: couldn't determine Phred type, please use the -p33 or -p64 !\n");
591     }
592 gpertea 13 if (verbose)
593     GMessage("Input reads have Phred-%d quality values.\n", (qv_phredtype==33 ? 33 : 64));
594     } //guessing Phred type
595 gpertea 12 for (l3=qvs.length()-1;l3>2;l3--) {
596 gpertea 13 if (qvs[l3]-qv_phredtype>=qvtrim_qmin && qvs[l3-1]-qv_phredtype>=qvtrim_qmin) break;
597 gpertea 12 }
598     //just in case, check also the 5' the end (?)
599     for (l5=0;l5<qvs.length()-3;l5++) {
600 gpertea 13 if (qvs[l5]-qv_phredtype>=qvtrim_qmin && qvs[l5+1]-qv_phredtype>=qvtrim_qmin) break;
601 gpertea 12 }
602 gpertea 13 if (qvtrim_max>0) {
603     if (qvs.length()-1-l3>qvtrim_max) l3=qvs.length()-1-qvtrim_max;
604     if (l5>qvtrim_max) l5=qvtrim_max;
605     }
606 gpertea 12 return (l5>0 || l3<qvs.length()-1);
607     }
608    
609     bool ntrim(GStr& rseq, int &l5, int &l3) {
610     //count Ns in the sequence, trim N-rich ends
611 gpertea 4 feat.init(rseq);
612     l5=feat.end5-1;
613     l3=feat.end3-1;
614 gpertea 170 N_analyze(feat.end5-1, feat.end3-1, 0, feat.NCount-1);
615 gpertea 12 if (l5==feat.end5-1 && l3==feat.end3-1) {
616     if (feat.perc_N>max_perc_N) {
617     feat.valid=false;
618     l3=l5+1;
619     return true;
620     }
621     else {
622     return false; //nothing changed
623     }
624     }
625 gpertea 4 l5=feat.end5-1;
626     l3=feat.end3-1;
627     if (l3-l5+1<min_read_len) {
628     feat.valid=false;
629     return true;
630     }
631     feat.N_calc();
632 gpertea 170
633 gpertea 12 if (feat.perc_N>max_perc_N) {
634 gpertea 4 feat.valid=false;
635     l3=l5+1;
636     return true;
637     }
638     return true;
639     }
640    
641     //--------------- dust functions ----------------
642     class DNADuster {
643     public:
644 gpertea 170 int dustword;
645     int dustwindow;
646     int dustwindow2;
647 gpertea 4 int dustcutoff;
648     int mv, iv, jv;
649     int counts[32*32*32];
650     int iis[32*32*32];
651     DNADuster(int cutoff=16, int winsize=32, int wordsize=3) {
652     dustword=wordsize;
653     dustwindow=winsize;
654     dustwindow2 = (winsize>>1);
655     dustcutoff=cutoff;
656     mv=0;
657     iv=0;
658     jv=0;
659     }
660     void setWindowSize(int value) {
661     dustwindow = value;
662     dustwindow2 = (dustwindow >> 1);
663     }
664     void setWordSize(int value) {
665     dustword=value;
666     }
667     void wo1(int len, const char* s, int ivv) {
668     int i, ii, j, v, t, n, n1, sum;
669     int js, nis;
670     n = 32 * 32 * 32;
671     n1 = n - 1;
672     nis = 0;
673     i = 0;
674     ii = 0;
675     sum = 0;
676     v = 0;
677     for (j=0; j < len; j++, s++) {
678     ii <<= 5;
679     if (*s<=32) {
680     i=0;
681     continue;
682     }
683     ii |= *s - 'A'; //assume uppercase!
684     ii &= n1;
685     i++;
686     if (i >= dustword) {
687     for (js=0; js < nis && iis[js] != ii; js++) ;
688     if (js == nis) {
689     iis[nis] = ii;
690     counts[ii] = 0;
691     nis++;
692     }
693     if ((t = counts[ii]) > 0) {
694     sum += t;
695     v = 10 * sum / j;
696     if (mv < v) {
697     mv = v;
698     iv = ivv;
699     jv = j;
700     }
701     }
702     counts[ii]++;
703     }
704     }
705     }
706    
707     int wo(int len, const char* s, int* beg, int* end) {
708     int i, l1;
709     l1 = len - dustword + 1;
710     if (l1 < 0) {
711     *beg = 0;
712     *end = len - 1;
713     return 0;
714     }
715     mv = 0;
716     iv = 0;
717     jv = 0;
718     for (i=0; i < l1; i++) {
719     wo1(len-i, s+i, i);
720     }
721     *beg = iv;
722     *end = iv + jv;
723     return mv;
724     }
725    
726     void dust(const char* seq, char* seqmsk, int seqlen, int cutoff=0) { //, maskFunc maskfn) {
727     int i, j, l, a, b, v;
728     if (cutoff==0) cutoff=dustcutoff;
729     a=0;b=0;
730     //GMessage("Dust cutoff=%d\n", cutoff);
731     for (i=0; i < seqlen; i += dustwindow2) {
732     l = (seqlen > i+dustwindow) ? dustwindow : seqlen-i;
733     v = wo(l, seq+i, &a, &b);
734     if (v > cutoff) {
735     //for (j = a; j <= b && j < dustwindow2; j++) {
736     for (j = a; j <= b; j++) {
737     seqmsk[i+j]='N';//could be made lowercase instead
738     }
739     }
740     }
741 gpertea 170 //return first;
742 gpertea 4 }
743     };
744    
745     static DNADuster duster;
746    
747     int dust(GStr& rseq) {
748     char* seq=Gstrdup(rseq.chars());
749     duster.dust(rseq.chars(), seq, rseq.length(), dust_cutoff);
750     //check the number of Ns:
751     int ncount=0;
752     for (int i=0;i<rseq.length();i++) {
753     if (seq[i]=='N') ncount++;
754     }
755     GFREE(seq);
756     return ncount;
757     }
758    
759 gpertea 94 struct SLocScore {
760     int pos;
761     int score;
762     SLocScore(int p=0,int s=0) {
763     pos=p;
764     score=s;
765     }
766     void set(int p, int s) {
767     pos=p;
768     score=s;
769     }
770     void add(int p, int add) {
771     pos=p;
772     score+=add;
773     }
774     };
775 gpertea 12
776 gpertea 94 bool trim_poly3(GStr &seq, int &l5, int &l3, const char* poly_seed) {
777 gpertea 103 if (!doPolyTrim) return false;
778 gpertea 94 int rlen=seq.length();
779     l5=0;
780     l3=rlen-1;
781     int32 seedVal=*(int32*)poly_seed;
782     char polyChar=poly_seed[0];
783     //assumes N trimming was already done
784     //so a poly match should be very close to the end of the read
785     // -- find the initial match (seed)
786 gpertea 108 int lmin=GMAX((rlen-16), 0);
787 gpertea 94 int li;
788     for (li=rlen-4;li>lmin;li--) {
789     if (seedVal==*(int*)&(seq[li])) {
790     break;
791     }
792 gpertea 12 }
793 gpertea 94 if (li<=lmin) return false;
794     //seed found, try to extend it both ways
795     //extend right
796     int ri=li+3;
797     SLocScore loc(ri, poly_m_score<<2);
798     SLocScore maxloc(loc);
799     //extend right
800 gpertea 170 while (ri<rlen-1) {
801 gpertea 94 ri++;
802     if (seq[ri]==polyChar) {
803     loc.add(ri,poly_m_score);
804     }
805     else if (seq[ri]=='N') {
806     loc.add(ri,0);
807     }
808     else { //mismatch
809     loc.add(ri,poly_mis_score);
810     if (maxloc.score-loc.score>poly_dropoff_score) break;
811     }
812     if (maxloc.score<=loc.score) {
813     maxloc=loc;
814     }
815 gpertea 12 }
816 gpertea 94 ri=maxloc.pos;
817 gpertea 103 if (ri<rlen-6) return false; //no trimming wanted, too far from 3' end
818 gpertea 94 //ri = right boundary for the poly match
819     //extend left
820     loc.set(li, maxloc.score);
821     maxloc.pos=li;
822     while (li>0) {
823     li--;
824     if (seq[li]==polyChar) {
825     loc.add(li,poly_m_score);
826     }
827     else if (seq[li]=='N') {
828     loc.add(li,0);
829     }
830     else { //mismatch
831     loc.add(li,poly_mis_score);
832     if (maxloc.score-loc.score>poly_dropoff_score) break;
833     }
834     if (maxloc.score<=loc.score) {
835     maxloc=loc;
836     }
837     }
838 gpertea 105 li=maxloc.pos;
839 gpertea 108 if ((maxloc.score==poly_minScore && ri==rlen-1) ||
840     (maxloc.score>poly_minScore && ri>=rlen-3) ||
841     (maxloc.score>(poly_minScore*3) && ri>=rlen-8)) {
842 gpertea 105 //trimming this li-ri match at 3' end
843     l3=li-1;
844     if (l3<0) l3=0;
845 gpertea 94 return true;
846     }
847     return false;
848 gpertea 12 }
849    
850 gpertea 94 bool trim_poly5(GStr &seq, int &l5, int &l3, const char* poly_seed) {
851 gpertea 103 if (!doPolyTrim) return false;
852 gpertea 88 int rlen=seq.length();
853     l5=0;
854     l3=rlen-1;
855 gpertea 94 int32 seedVal=*(int32*)poly_seed;
856     char polyChar=poly_seed[0];
857 gpertea 88 //assumes N trimming was already done
858     //so a poly match should be very close to the end of the read
859     // -- find the initial match (seed)
860 gpertea 108 int lmax=GMIN(12, rlen-4);//how far from 5' end to look for 4-mer seeds
861 gpertea 94 int li;
862     for (li=0;li<=lmax;li++) {
863     if (seedVal==*(int*)&(seq[li])) {
864 gpertea 88 break;
865     }
866     }
867 gpertea 94 if (li>lmax) return false;
868     //seed found, try to extend it both ways
869     //extend left
870     int ri=li+3; //save rightmost base of the seed
871     SLocScore loc(li, poly_m_score<<2);
872     SLocScore maxloc(loc);
873     while (li>0) {
874     li--;
875     if (seq[li]==polyChar) {
876     loc.add(li,poly_m_score);
877     }
878     else if (seq[li]=='N') {
879     loc.add(li,0);
880     }
881     else { //mismatch
882     loc.add(li,poly_mis_score);
883     if (maxloc.score-loc.score>poly_dropoff_score) break;
884     }
885     if (maxloc.score<=loc.score) {
886     maxloc=loc;
887     }
888 gpertea 88 }
889 gpertea 94 li=maxloc.pos;
890 gpertea 103 if (li>5) return false; //no trimming wanted, too far from 5' end
891 gpertea 94 //li = right boundary for the poly match
892    
893     //extend right
894     loc.set(ri, maxloc.score);
895     maxloc.pos=ri;
896 gpertea 170 while (ri<rlen-1) {
897 gpertea 94 ri++;
898     if (seq[ri]==polyChar) {
899     loc.add(ri,poly_m_score);
900     }
901     else if (seq[ri]=='N') {
902     loc.add(ri,0);
903     }
904     else { //mismatch
905     loc.add(ri,poly_mis_score);
906     if (maxloc.score-loc.score>poly_dropoff_score) break;
907     }
908     if (maxloc.score<=loc.score) {
909     maxloc=loc;
910     }
911     }
912 gpertea 105 ri=maxloc.pos;
913 gpertea 103 if ((maxloc.score==poly_minScore && li==0) ||
914     (maxloc.score>poly_minScore && li<2)
915 gpertea 108 || (maxloc.score>(poly_minScore*3) && li<8)) {
916 gpertea 105 //adjust l5 to reflect this trimming of 5' end
917     l5=ri+1;
918     if (l5>rlen-1) l5=rlen-1;
919 gpertea 94 return true;
920     }
921     return false;
922 gpertea 88 }
923    
924 gpertea 4 bool trim_adapter3(GStr& seq, int&l5, int &l3) {
925 gpertea 103 if (adapters3.Count()==0) return false;
926 gpertea 4 int rlen=seq.length();
927     l5=0;
928     l3=rlen-1;
929 gpertea 103 bool trimmed=false;
930     GStr wseq(seq.chars());
931     int wlen=rlen;
932     for (int ai=0;ai<adapters3.Count();ai++) {
933     if (adapters3[ai].is_empty()) continue;
934     int alen=adapters3[ai].length();
935     GStr& aseq=adapters3[ai];
936     GXAlnInfo* r_bestaln=match_RightEnd(aseq.chars(), alen, wseq.chars(), wlen, gxmem_r, 74);
937     if (r_bestaln) {
938     trimmed=true;
939     //keep unmatched region on the left, if any
940     l3-=(wlen-r_bestaln->sl+1);
941     delete r_bestaln;
942     if (l3<0) l3=0;
943     if (l3-l5+1<min_read_len) return true;
944     wseq=seq.substr(l5,l3-l5+1);
945     wlen=wseq.length();
946     }
947     }//for each 5' adapter
948     return trimmed;
949 gpertea 4 }
950    
951     bool trim_adapter5(GStr& seq, int&l5, int &l3) {
952 gpertea 103 if (adapters5.Count()==0) return false;
953 gpertea 4 int rlen=seq.length();
954     l5=0;
955     l3=rlen-1;
956 gpertea 102 bool trimmed=false;
957 gpertea 103 GStr wseq(seq.chars());
958     int wlen=rlen;
959 gpertea 102 for (int ai=0;ai<adapters5.Count();ai++) {
960     if (adapters5[ai].is_empty()) continue;
961 gpertea 103 int alen=adapters5[ai].length();
962     GStr& aseq=adapters5[ai];
963 gpertea 170 GXAlnInfo* l_bestaln=match_LeftEnd(aseq.chars(), alen, adapters5[ai].pz,
964     wseq.chars(), wlen, gxmem_l, 84);
965 gpertea 103 if (l_bestaln) {
966     trimmed=true;
967     l5+=l_bestaln->sr;
968     delete l_bestaln;
969     if (l5>=rlen) l5=rlen-1;
970     if (l3-l5+1<min_read_len) return true;
971     wseq=seq.substr(l5,l3-l5+1);
972     wlen=wseq.length();
973     }
974 gpertea 102 }//for each 5' adapter
975     return trimmed;
976 gpertea 98 }
977 gpertea 4
978 gpertea 170 //convert qvs to/from phred64 from/to phread33
979 gpertea 12 void convertPhred(GStr& q) {
980     for (int i=0;i<q.length();i++) q[i]+=qv_cvtadd;
981 gpertea 4 }
982    
983 gpertea 12 void convertPhred(char* q, int len) {
984     for (int i=0;i<len;i++) q[i]+=qv_cvtadd;
985 gpertea 4 }
986    
987 gpertea 170 bool getFastxRec(GLineReader& fq, GStr& rseq, GStr& rqv,
988 gpertea 13 GStr& rname, GStr& rinfo, GStr& infname) {
989     rseq="";
990     rqv="";
991     rname="";
992     rinfo="";
993     if (fq.eof()) return false;
994     char* l=fq.getLine();
995     while (l!=NULL && (l[0]==0 || isspace(l[0]))) l=fq.getLine(); //ignore empty lines
996     if (l==NULL) return false;
997     /* if (rawFormat) {
998     //TODO: implement raw qseq parsing here
999     //if (raw type=N) then continue; //skip invalid/bad records
1000     } //raw qseq format
1001     else { // FASTQ or FASTA */
1002     isfasta=(l[0]=='>');
1003 gpertea 170 if (!isfasta && l[0]!='@') GError("Error: fasta/fastq record marker not found(%s)\n%s\n",
1004 gpertea 13 infname.chars(), l);
1005     GStr s(l);
1006     rname=&(l[1]);
1007     for (int i=0;i<rname.length();i++)
1008 gpertea 170 if (rname[i]<=' ') {
1009     if (i<rname.length()-2) rinfo=rname.substr(i+1);
1010     rname.cut(i);
1011     break;
1012 gpertea 13 }
1013     //now get the sequence
1014 gpertea 170 if ((l=fq.getLine())==NULL)
1015 gpertea 13 GError("Error: unexpected EOF after header for read %s (%s)\n",
1016     rname.chars(), infname.chars());
1017     rseq=l; //this must be the DNA line
1018     while ((l=fq.getLine())!=NULL) {
1019     //seq can span multiple lines
1020     if (l[0]=='>' || l[0]=='+') {
1021 gpertea 170 fq.pushBack();
1022 gpertea 13 break; //
1023     }
1024     rseq+=l;
1025 gpertea 170 } //check for multi-line seq
1026 gpertea 13 if (!isfasta) { //reading fastq quality values, which can also be multi-line
1027     if ((l=fq.getLine())==NULL)
1028     GError("Error: unexpected EOF after sequence for %s\n", rname.chars());
1029     if (l[0]!='+') GError("Error: fastq qv header marker not detected!\n");
1030 gpertea 170 if ((l=fq.getLine())==NULL)
1031 gpertea 13 GError("Error: unexpected EOF after qv header for %s\n", rname.chars());
1032     rqv=l;
1033 gpertea 170 //if (rqv.length()!=rseq.length())
1034 gpertea 13 // GError("Error: qv len != seq len for %s\n", rname.chars());
1035     while (rqv.length()<rseq.length() && ((l=fq.getLine())!=NULL)) {
1036     rqv+=l; //append to qv string
1037     }
1038     }// fastq
1039     // } //<-- FASTA or FASTQ
1040 gpertea 94 rseq.upper();
1041 gpertea 13 return true;
1042     }
1043    
1044 gpertea 105 #ifdef GDEBUG
1045     void showTrim(GStr& s, int l5, int l3) {
1046     if (l5>0) {
1047     color_bg(c_red);
1048     }
1049     for (int i=0;i<s.length()-1;i++) {
1050     if (i && i==l5) color_resetbg();
1051     fprintf(stderr, "%c", s[i]);
1052     if (i==l3) color_bg(c_red);
1053     }
1054     fprintf(stderr, "%c", s[s.length()-1]);
1055     color_reset();
1056     fprintf(stderr, "\n");
1057     }
1058     #endif
1059    
1060 gpertea 13 char process_read(GStr& rname, GStr& rseq, GStr& rqv, int &l5, int &l3) {
1061 gpertea 170 //returns 0 if the read was untouched, 1 if it was just trimmed
1062 gpertea 13 // and a trash code if it was trashed
1063     l5=0;
1064     l3=rseq.length()-1;
1065 gpertea 103 #ifdef GDEBUG
1066 gpertea 108 //rseq.reverse();
1067 gpertea 103 GMessage(">%s\n", rname.chars());
1068 gpertea 105 GMessage("%s\n",rseq.chars());
1069 gpertea 103 #endif
1070 gpertea 13 if (l3-l5+1<min_read_len) {
1071     return 's';
1072     }
1073     GStr wseq(rseq.chars());
1074     GStr wqv(rqv.chars());
1075     int w5=l5;
1076     int w3=l3;
1077     //first do the q-based trimming
1078     if (qvtrim_qmin!=0 && !wqv.is_empty() && qtrim(wqv, w5, w3)) { // qv-threshold trimming
1079     if (w3-w5+1<min_read_len) {
1080     return 'Q'; //invalid read
1081     }
1082     //-- keep only the w5..w3 range
1083     l5=w5;
1084     l3=w3;
1085     wseq=wseq.substr(w5, w3-w5+1);
1086     if (!wqv.is_empty())
1087     wqv=wqv.substr(w5, w3-w5+1);
1088     } //qv trimming
1089     // N-trimming on the remaining read seq
1090     if (ntrim(wseq, w5, w3)) {
1091     //GMessage("before: %s\n",wseq.chars());
1092     //GMessage("after : %s (%d)\n",wseq.substr(w5,w3-w5+1).chars(),w3-w5+1);
1093     l5+=w5;
1094     l3-=(wseq.length()-1-w3);
1095     if (w3-w5+1<min_read_len) {
1096     return 'N'; //to be trashed
1097     }
1098     //-- keep only the w5..w3 range
1099     wseq=wseq.substr(w5, w3-w5+1);
1100     if (!wqv.is_empty())
1101     wqv=wqv.substr(w5, w3-w5+1);
1102     w5=0;
1103     w3=wseq.length()-1;
1104     }
1105 gpertea 103 char trim_code;
1106     do {
1107     trim_code=0;
1108     if (trim_poly5(wseq, w5, w3, polyA_seed)) {
1109     trim_code='A';
1110     }
1111     else if (trim_poly5(wseq, w5, w3, polyT_seed)) {
1112     trim_code='T';
1113     }
1114     else if (trim_adapter5(wseq, w5, w3)) {
1115     trim_code='5';
1116     }
1117     if (trim_code) {
1118 gpertea 105 #ifdef GDEBUG
1119     GMessage("#### TRIM by '%c' code ( w5-w3 = %d-%d ):\n",trim_code, w5,w3);
1120     showTrim(wseq, w5, w3);
1121     #endif
1122 gpertea 13 int trimlen=wseq.length()-(w3-w5+1);
1123 gpertea 103 num_trimmed5++;
1124     if (trimlen<min_trimmed5)
1125     min_trimmed5=trimlen;
1126 gpertea 13 l5+=w5;
1127     l3-=(wseq.length()-1-w3);
1128     if (w3-w5+1<min_read_len) {
1129 gpertea 103 return trim_code;
1130 gpertea 13 }
1131     //-- keep only the w5..w3 range
1132     wseq=wseq.substr(w5, w3-w5+1);
1133     if (!wqv.is_empty())
1134     wqv=wqv.substr(w5, w3-w5+1);
1135 gpertea 103 }// trimmed at 5' end
1136     } while (trim_code);
1137    
1138     do {
1139     trim_code=0;
1140     if (trim_poly3(wseq, w5, w3, polyA_seed)) {
1141     trim_code='A';
1142     }
1143     else if (trim_poly3(wseq, w5, w3, polyT_seed)) {
1144     trim_code='T';
1145     }
1146     else if (trim_adapter3(wseq, w5, w3)) {
1147     trim_code='3';
1148     }
1149     if (trim_code) {
1150 gpertea 105 #ifdef GDEBUG
1151     GMessage("#### TRIM by '%c' code ( w5-w3 = %d-%d ):\n",trim_code, w5,w3);
1152     showTrim(wseq, w5, w3);
1153     #endif
1154 gpertea 13 int trimlen=wseq.length()-(w3-w5+1);
1155 gpertea 103 num_trimmed3++;
1156 gpertea 170 if (trimlen<min_trimmed3)
1157 gpertea 103 min_trimmed3=trimlen;
1158 gpertea 13 l5+=w5;
1159     l3-=(wseq.length()-1-w3);
1160     if (w3-w5+1<min_read_len) {
1161 gpertea 103 return trim_code;
1162 gpertea 13 }
1163     //-- keep only the w5..w3 range
1164     wseq=wseq.substr(w5, w3-w5+1);
1165     if (!wqv.is_empty())
1166     wqv=wqv.substr(w5, w3-w5+1);
1167 gpertea 103 }//trimming at 3' end
1168     } while (trim_code);
1169    
1170    
1171 gpertea 13 if (doCollapse) {
1172     //keep read for later
1173     FqDupRec* dr=dhash.Find(wseq.chars());
1174     if (dr==NULL) { //new entry
1175 gpertea 170 //if (prefix.is_empty())
1176     dhash.Add(wseq.chars(),
1177 gpertea 13 new FqDupRec(&wqv, rname.chars()));
1178     //else dhash.Add(wseq.chars(), new FqDupRec(wqv.chars(),wqv.length()));
1179     }
1180     else
1181     dr->add(wqv);
1182     } //collapsing duplicates
1183     else { //not collapsing duplicates
1184     //apply the dust filter now
1185     if (doDust) {
1186     int dustbases=dust(wseq);
1187     if (dustbases>(wseq.length()>>1)) {
1188     return 'D';
1189     }
1190     }
1191     } //not collapsing duplicates
1192     return (l5>0 || l3<rseq.length()-1) ? 1 : 0;
1193     }
1194    
1195     void printHeader(FILE* f_out, char recmarker, GStr& rname, GStr& rinfo) {
1196     //GMessage("printing Header..%c%s\n",recmarker, rname.chars());
1197     if (rinfo.is_empty()) fprintf(f_out, "%c%s\n",recmarker,rname.chars());
1198     else fprintf(f_out, "%c%s %s\n",recmarker, rname.chars(), rinfo.chars());
1199     }
1200    
1201     void writeRead(FILE* f_out, GStr& rname, GStr& rinfo, GStr& rseq, GStr& rqv, int& outcounter) {
1202     outcounter++;
1203     bool asFasta=(rqv.is_empty() || fastaOutput);
1204     if (asFasta) {
1205     if (prefix.is_empty()) {
1206     printHeader(f_out, '>',rname,rinfo);
1207     fprintf(f_out, "%s\n", rseq.chars()); //plain one-line fasta for now
1208     }
1209     else {
1210 gpertea 170 fprintf(f_out, ">%s%08d\n%s\n", prefix.chars(), outcounter,
1211 gpertea 13 rseq.chars());
1212     }
1213     }
1214     else { //fastq
1215     if (convert_phred) convertPhred(rqv);
1216     if (prefix.is_empty()) {
1217     printHeader(f_out, '@', rname, rinfo);
1218     fprintf(f_out, "%s\n+\n%s\n", rseq.chars(), rqv.chars());
1219     }
1220     else
1221 gpertea 170 fprintf(f_out, "@%s_%08d\n%s\n+\n%s\n", prefix.chars(), outcounter,
1222 gpertea 13 rseq.chars(),rqv.chars() );
1223     }
1224     }
1225    
1226     void trash_report(char trashcode, GStr& rname, FILE* freport) {
1227     if (freport==NULL || trashcode<=' ') return;
1228     if (trashcode=='3' || trashcode=='5') {
1229 gpertea 103 fprintf(freport, "%s\ta%c\n",rname.chars(),trashcode);
1230 gpertea 13 }
1231     else {
1232     fprintf(freport, "%s\t%c\n",rname.chars(),trashcode);
1233     }
1234     //tcounter++;
1235     }
1236    
1237     GStr getFext(GStr& s, int* xpos=NULL) {
1238     //s must be a filename without a path
1239     GStr r("");
1240     if (xpos!=NULL) *xpos=0;
1241     if (s.is_empty() || s=="-") return r;
1242     int p=s.rindex('.');
1243     int d=s.rindex('/');
1244     if (p<=0 || p>s.length()-2 || p<s.length()-7 || p<d) return r;
1245     r=s.substr(p+1);
1246     if (xpos!=NULL) *xpos=p+1;
1247     r.lower();
1248     return r;
1249     }
1250    
1251     void baseFileName(GStr& fname) {
1252     //remove all known extensions, like .txt,fq,fastq,fasta,fa)(.gz .gzip .bz2 .bzip2) .
1253     int xpos=0;
1254     GStr fext=getFext(fname, &xpos);
1255     if (xpos<=1) return;
1256     bool extdel=false;
1257     GStr f2;
1258     if (fext=="z" || fext=="zip") {
1259     extdel=true;
1260     }
1261     else if (fext.length()>=2) {
1262     f2=fext.substr(0,2);
1263     extdel=(f2=="gz" || f2=="bz");
1264     }
1265     if (extdel) {
1266     fname.cut(xpos-1);
1267     fext=getFext(fname, &xpos);
1268     if (xpos<=1) return;
1269     }
1270     extdel=false;
1271     if (fext=="f" || fext=="fq" || fext=="txt" || fext=="seq" || fext=="sequence") {
1272     extdel=true;
1273     }
1274     else if (fext.length()>=2) {
1275     extdel=(fext.substr(0,2)=="fa");
1276     }
1277     if (extdel) fname.cut(xpos-1);
1278     GStr fncp(fname);
1279     fncp.lower();
1280     fncp.chomp("seq");
1281     fncp.chomp("sequence");
1282     fncp.trimR("_.");
1283     if (fncp.length()<fname.length()) fname.cut(fncp.length());
1284     }
1285    
1286     FILE* prepOutFile(GStr& infname, GStr& pocmd) {
1287     FILE* f_out=NULL;
1288     GStr fname(getFileName(infname.chars()));
1289     //eliminate known extensions
1290     baseFileName(fname);
1291     if (outsuffix.is_empty() || outsuffix=="-") { return stdout; }
1292     else if (pocmd.is_empty()) {
1293     GStr oname(fname);
1294     oname.append('.');
1295     oname.append(outsuffix);
1296     f_out=fopen(oname.chars(),"w");
1297     if (f_out==NULL) GError("Error: cannot create '%s'\n",oname.chars());
1298     }
1299     else {
1300     GStr oname(">");
1301     oname.append(fname);
1302     oname.append('.');
1303     oname.append(outsuffix);
1304     pocmd.append(oname);
1305     f_out=popen(pocmd.chars(), "w");
1306     if (f_out==NULL) GError("Error: cannot popen '%s'\n",pocmd.chars());
1307     }
1308     return f_out;
1309     }
1310    
1311     void guess_unzip(GStr& fname, GStr& picmd) {
1312     GStr fext=getFext(fname);
1313     if (fext=="gz" || fext=="gzip" || fext=="z") {
1314     picmd="gzip -cd ";
1315     }
1316     else if (fext=="bz2" || fext=="bzip2" || fext=="bz" || fext=="bzip") {
1317     picmd="bzip2 -cd ";
1318     }
1319     }
1320    
1321 gpertea 170 void addAdapter(GVec<CASeqData>& adapters, GStr& seq) {
1322     //TODO: prepare CASeqData here, and collect hexamers as well
1323 gpertea 94
1324 gpertea 170 }
1325    
1326    
1327 gpertea 94 int loadAdapters(const char* fname) {
1328     GLineReader lr(fname);
1329     char* l;
1330     while ((l=lr.nextLine())!=NULL) {
1331     if (lr.length()<=3 || l[0]=='#') continue;
1332 gpertea 98 if ( l[0]==' ' || l[0]=='\t' || l[0]==',' ||
1333     l[0]==';'|| l[0]==':' ) {
1334 gpertea 94 int i=1;
1335     while (l[i]!=0 && isspace(l[i])) {
1336     i++;
1337     }
1338     if (l[i]!=0) {
1339 gpertea 102 GStr s(&(l[i]));
1340 gpertea 108 #ifdef GDEBUG
1341     //s.reverse();
1342     #endif
1343 gpertea 170 addAdapter(adapters3, s);
1344 gpertea 94 continue;
1345     }
1346     }
1347     else {
1348 gpertea 98 GStr s(l);
1349     s.startTokenize("\t ;,:");
1350     GStr a5,a3;
1351     if (s.nextToken(a5))
1352     s.nextToken(a3);
1353 gpertea 102 a5.upper();
1354     a3.upper();
1355 gpertea 108 #ifdef GDEBUG
1356     // a5.reverse();
1357     // a3.reverse();
1358     #endif
1359 gpertea 170 addAdapter(adapters5, a5);
1360     addAdapter(adapters3, a3);
1361 gpertea 94 }
1362     }
1363 gpertea 102 return adapters5.Count()+adapters3.Count();
1364 gpertea 94 }
1365    
1366 gpertea 170 void setupFiles(FILE*& f_in, FILE*& f_in2, FILE*& f_out, FILE*& f_out2,
1367     GStr& s, GStr& infname, GStr& infname2) {
1368 gpertea 13 // uses outsuffix to generate output file names and open file handles as needed
1369     infname="";
1370     infname2="";
1371     f_in=NULL;
1372     f_in2=NULL;
1373     f_out=NULL;
1374     f_out2=NULL;
1375     //analyze outsuffix intent
1376     GStr pocmd;
1377 gpertea 76 if (outsuffix=="-") {
1378     f_out=stdout;
1379     }
1380     else {
1381     GStr ox=getFext(outsuffix);
1382     if (ox.length()>2) ox=ox.substr(0,2);
1383     if (ox=="gz") pocmd="gzip -9 -c ";
1384     else if (ox=="bz") pocmd="bzip2 -9 -c ";
1385     }
1386 gpertea 13 if (s=="-") {
1387     f_in=stdin;
1388     infname="stdin";
1389     f_out=prepOutFile(infname, pocmd);
1390     return;
1391     } // streaming from stdin
1392     s.startTokenize(",:");
1393     s.nextToken(infname);
1394     bool paired=s.nextToken(infname2);
1395 gpertea 170 if (fileExists(infname.chars())==0)
1396 gpertea 13 GError("Error: cannot find file %s!\n",infname.chars());
1397     GStr fname(getFileName(infname.chars()));
1398     GStr picmd;
1399     guess_unzip(fname, picmd);
1400     if (picmd.is_empty()) {
1401     f_in=fopen(infname.chars(), "r");
1402     if (f_in==NULL) GError("Error opening file '%s'!\n",infname.chars());
1403     }
1404     else {
1405     picmd.append(infname);
1406     f_in=popen(picmd.chars(), "r");
1407     if (f_in==NULL) GError("Error at popen %s!\n", picmd.chars());
1408     }
1409 gpertea 76 if (f_out==stdout) {
1410     if (paired) GError("Error: output suffix required for paired reads\n");
1411     return;
1412     }
1413 gpertea 13 f_out=prepOutFile(infname, pocmd);
1414     if (!paired) return;
1415     if (doCollapse) GError("Error: sorry, -C option cannot be used with paired reads!\n");
1416     // ---- paired reads:-------------
1417 gpertea 170 if (fileExists(infname2.chars())==0)
1418 gpertea 13 GError("Error: cannot find file %s!\n",infname2.chars());
1419     picmd="";
1420     GStr fname2(getFileName(infname2.chars()));
1421     guess_unzip(fname2, picmd);
1422     if (picmd.is_empty()) {
1423     f_in2=fopen(infname2.chars(), "r");
1424     if (f_in2==NULL) GError("Error opening file '%s'!\n",infname2.chars());
1425     }
1426     else {
1427     picmd.append(infname2);
1428     f_in2=popen(picmd.chars(), "r");
1429     if (f_in2==NULL) GError("Error at popen %s!\n", picmd.chars());
1430     }
1431     f_out2=prepOutFile(infname2, pocmd);
1432     }