ViewVC Help
View File | Revision Log | Show Annotations | View Changeset | Root Listing
root/gclib/fqtrim/fqtrim.cpp
Revision: 103
Committed: Mon Oct 10 19:33:39 2011 UTC (7 years, 10 months ago) by gpertea
File size: 41090 byte(s)
Log Message:
wip

Line User Rev File contents
1 gpertea 4 #include "GArgs.h"
2     #include "GStr.h"
3     #include "GHash.hh"
4     #include "GList.hh"
5 gpertea 13 #include <ctype.h>
6 gpertea 94 #include "GAlnExtend.h"
7 gpertea 4
8     #define USAGE "Usage:\n\
9 gpertea 76 fqtrim [{-5 <5adapter> -3 <3adapter>|-f <adapters_file>}] [-a <min_matchlen>]\\\n\
10     [-q <minq> [-t <trim_max_len>]] [-p {64|33}] [-o <outsuffix>]\\\n\
11     [-l <minlen>] [-C] [-D] [-Q] [-n <rename_prefix>] [-r <discarded.lst>]\\\n\
12     <input.fq>[,<input_mates.fq>\n\
13 gpertea 4 \n\
14 gpertea 76 Trim low quality bases at the 3' end and can trim adapter sequence(s), filter\n\
15     for low complexity and collapse duplicate reads.\n\
16 gpertea 13 If read pairs should be trimmed and kept together (i.e. without discarding\n\
17     one read in a pair), the two file names should be given delimited by a comma\n\
18     or a colon character\n\
19 gpertea 4 \n\
20     Options:\n\
21     -n rename all the reads using the <prefix> followed by a read counter;\n\
22     if -C option was given, the suffix \"_x<N>\" is appended, with <N> being\n\
23     the read duplication count\n\
24 gpertea 76 -o unless this parameter is '-', write the trimmed/filtered reads to \n\
25     file(s) named <input>.<outsuffix> which will be created in the \n\
26     current (working) directory; (writes to stdout if -o- is given);\n\
27     a suffix ending with .gz, .gzip or .bz2 will enforce compression\n\
28     -f file with adapter sequences to trim, each line having this format:\n\
29     <5'-adapter-sequence> <3'-adapter-sequence>\n\
30 gpertea 4 -5 trim the given adapter or primer sequence at the 5' end of each read\n\
31     (e.g. -5 CGACAGGTTCAGAGTTCTACAGTCCGACGATC)\n\
32     -3 trim the given adapter sequence at the 3' end of each read\n\
33     (e.g. -3 TCGTATGCCGTCTTCTGCTTG)\n\
34 gpertea 103 -A disable polyA/T trimming (enabled by default)\n\
35 gpertea 94 -y minimum length of exact match to adaptor sequence at the proper end (6)\n\
36 gpertea 13 -q trim bases with quality value lower than <minq> (starting at the 3' end)\n\
37 gpertea 76 -t for -q option, maximum trimming at the 3' end is limited to <trim_max_len>\n\
38 gpertea 12 -m maximum percentage of Ns allowed in a read after trimming (default 7)\n\
39     -l minimum \"clean\" length after trimming that a read must have\n\
40     in order to pass the filter (default: 16)\n\
41 gpertea 4 -r write a simple \"trash report\" file listing the discarded reads with a\n\
42     one letter code for the reason why they were trashed\n\
43 gpertea 12 -D apply a low-complexity (dust) filter and discard any read that has over\n\
44     50% of its length detected as low complexity\n\
45 gpertea 4 -C collapse duplicate reads and add a prefix with their count to the read\n\
46 gpertea 12 name (e.g. for microRNAs)\n\
47     -p input is phred64/phred33 (use -p64 or -p33)\n\
48     -Q convert quality values to the other Phred qv type\n\
49     -V verbose processing\n\
50 gpertea 4 "
51 gpertea 13
52     //-z for -o option, the output stream(s) will be first piped into the given\n
53     // <zcmd> command, which must output to stdout (e.g. -z 'bzip2 -9 -c')\n
54    
55    
56 gpertea 4 // example 3' adapter for miRNAs: TCGTATGCCGTCTTCTGCTTG
57 gpertea 12
58 gpertea 76 //For paired reads sequencing:
59 gpertea 12 //3' : ACACTCTTTCCCTACACGACGCTCTTCCGATCT
60     //5' : GATCGGAAGAGCGGTTCAGCAGGAATGCCGAG
61 gpertea 13 //FILE* f_out=NULL; //stdout if not provided
62     //FILE* f_out2=NULL; //for paired reads
63     //FILE* f_in=NULL; //input fastq (stdin if not provided)
64     //FILE* f_in2=NULL; //for paired reads
65 gpertea 4 FILE* freport=NULL;
66 gpertea 13
67 gpertea 4 bool debug=false;
68 gpertea 12 bool verbose=false;
69 gpertea 4 bool doCollapse=false;
70     bool doDust=false;
71 gpertea 103 bool doPolyTrim=true;
72 gpertea 13 bool fastaOutput=false;
73 gpertea 4 bool trashReport=false;
74 gpertea 12 //bool rawFormat=false;
75 gpertea 4 int min_read_len=16;
76 gpertea 12 double max_perc_N=7.0;
77 gpertea 4 int dust_cutoff=16;
78     bool isfasta=false;
79 gpertea 12 bool convert_phred=false;
80 gpertea 13 GStr outsuffix; // -o
81     GStr prefix;
82     GStr zcmd;
83     int num_trimmed5=0;
84     int num_trimmed3=0;
85     int num_trimmedN=0;
86     int num_trimmedQ=0;
87     int min_trimmed5=INT_MAX;
88     int min_trimmed3=INT_MAX;
89 gpertea 12
90 gpertea 13 int qvtrim_qmin=0;
91     int qvtrim_max=0; //for -q, do not trim at 3'-end more than this number of bases
92 gpertea 12 int qv_phredtype=0; // could be 64 or 33 (0 means undetermined yet)
93     int qv_cvtadd=0; //could be -31 or +31
94    
95 gpertea 94 // adaptor matching metrics -- for X-drop ungapped extension
96 gpertea 103 const int match_reward=2;
97     const int mismatch_penalty=3;
98     const int Xdrop=8;
99    
100 gpertea 94 const int poly_m_score=2; //match score
101     const int poly_mis_score=-3; //mismatch
102 gpertea 88 const int poly_dropoff_score=7;
103 gpertea 94 int poly_minScore=12; //i.e. an exact match of 6 bases at the proper ends WILL be trimmed
104 gpertea 4
105 gpertea 94 const char *polyA_seed="AAAA";
106     const char *polyT_seed="TTTT";
107 gpertea 102 GVec<GStr> adapters5;
108     GVec<GStr> adapters3;
109 gpertea 12
110 gpertea 103 CGreedyAlignData* gxmem_l=NULL;
111     CGreedyAlignData* gxmem_r=NULL;
112    
113 gpertea 4 // element in dhash:
114     class FqDupRec {
115     public:
116     int count; //how many of these reads are the same
117     int len; //length of qv
118     char* firstname; //optional, only if we want to keep the original read names
119     char* qv;
120     FqDupRec(GStr* qstr=NULL, const char* rname=NULL) {
121     len=0;
122     qv=NULL;
123     firstname=NULL;
124     count=0;
125     if (qstr!=NULL) {
126     qv=Gstrdup(qstr->chars());
127     len=qstr->length();
128     count++;
129     }
130     if (rname!=NULL) firstname=Gstrdup(rname);
131     }
132     ~FqDupRec() {
133     GFREE(qv);
134     GFREE(firstname);
135     }
136     void add(GStr& d) { //collapse another record into this one
137     if (d.length()!=len)
138     GError("Error at FqDupRec::add(): cannot collapse reads with different length!\n");
139     count++;
140     for (int i=0;i<len;i++)
141     qv[i]=(qv[i]+d[i])/2;
142     }
143     };
144    
145     void openfw(FILE* &f, GArgs& args, char opt) {
146     GStr s=args.getOpt(opt);
147     if (!s.is_empty()) {
148     if (s=='-') f=stdout;
149     else {
150 gpertea 13 f=fopen(s.chars(),"w");
151 gpertea 4 if (f==NULL) GError("Error creating file: %s\n", s.chars());
152     }
153     }
154     }
155    
156     #define FWCLOSE(fh) if (fh!=NULL && fh!=stdout) fclose(fh)
157     #define FRCLOSE(fh) if (fh!=NULL && fh!=stdin) fclose(fh)
158    
159     GHash<FqDupRec> dhash; //hash to keep track of duplicates
160    
161 gpertea 94 int loadAdapters(const char* fname);
162    
163 gpertea 13 void setupFiles(FILE*& f_in, FILE*& f_in2, FILE*& f_out, FILE*& f_out2,
164     GStr& s, GStr& infname, GStr& infname2);
165     // uses outsuffix to generate output file names and open file handles as needed
166    
167     void writeRead(FILE* f_out, GStr& rname, GStr& rinfo, GStr& rseq, GStr& rqv, int& outcounter);
168     void trash_report(char trashcode, GStr& rname, FILE* freport);
169    
170     bool getFastxRec(GLineReader& fq, GStr& rseq, GStr& rqv,
171     GStr& rname, GStr& rinfo, GStr& infname);
172    
173     char process_read(GStr& rname, GStr& rseq, GStr& rqv, int &l5, int &l3);
174     //returns 0 if the read was untouched, 1 if it was trimmed and a trash code if it was trashed
175    
176 gpertea 12 bool ntrim(GStr& rseq, int &l5, int &l3); //returns true if any trimming occured
177     bool qtrim(GStr& qvs, int &l5, int &l3); //return true if any trimming occured
178 gpertea 4 int dust(GStr& seq);
179 gpertea 94 bool trim_poly5(GStr &seq, int &l5, int &l3, const char* poly_seed); //returns true if any trimming occured
180     bool trim_poly3(GStr &seq, int &l5, int &l3, const char* poly_seed);
181 gpertea 4 bool trim_adapter5(GStr& seq, int &l5, int &l3); //returns true if any trimming occured
182 gpertea 94 bool trim_adapter3(GStr& seq, int &l5, int &l3);
183 gpertea 4
184 gpertea 12 void convertPhred(char* q, int len);
185     void convertPhred(GStr& q);
186 gpertea 4
187     int main(int argc, char * const argv[]) {
188 gpertea 103 GArgs args(argc, argv, "YQDCVAl:d:3:5:m:n:r:p:q:f:t:o:z:a:");
189 gpertea 4 int e;
190     if ((e=args.isError())>0) {
191     GMessage("%s\nInvalid argument: %s\n", USAGE, argv[e]);
192     exit(224);
193     }
194 gpertea 9 debug=(args.getOpt('Y')!=NULL);
195 gpertea 13 verbose=(args.getOpt('V')!=NULL);
196 gpertea 12 convert_phred=(args.getOpt('Q')!=NULL);
197 gpertea 4 doCollapse=(args.getOpt('C')!=NULL);
198     doDust=(args.getOpt('D')!=NULL);
199 gpertea 103 if (args.getOpt('A')) doPolyTrim=false;
200 gpertea 12 /*
201 gpertea 4 rawFormat=(args.getOpt('R')!=NULL);
202     if (rawFormat) {
203     GError("Sorry, raw qseq format parsing is not implemented yet!\n");
204     }
205 gpertea 12 */
206 gpertea 4 prefix=args.getOpt('n');
207     GStr s=args.getOpt('l');
208     if (!s.is_empty())
209     min_read_len=s.asInt();
210 gpertea 12 s=args.getOpt('m');
211     if (!s.is_empty())
212     max_perc_N=s.asDouble();
213 gpertea 4 s=args.getOpt('d');
214     if (!s.is_empty()) {
215     dust_cutoff=s.asInt();
216     doDust=true;
217     }
218 gpertea 12 s=args.getOpt('q');
219     if (!s.is_empty()) {
220 gpertea 13 qvtrim_qmin=s.asInt();
221 gpertea 12 }
222 gpertea 13 s=args.getOpt('t');
223     if (!s.is_empty()) {
224     qvtrim_max=s.asInt();
225     }
226 gpertea 12 s=args.getOpt('p');
227     if (!s.is_empty()) {
228     int v=s.asInt();
229     if (v==33) {
230     qv_phredtype=33;
231     qv_cvtadd=31;
232     }
233     else if (v==64) {
234     qv_phredtype=64;
235     qv_cvtadd=-31;
236     }
237     else
238     GMessage("%s\nInvalid value for -p option (can only be 64 or 33)!\n",USAGE);
239     }
240 gpertea 94 s=args.getOpt('f');
241     if (!s.is_empty()) {
242     loadAdapters(s.chars());
243     }
244 gpertea 102 bool fileAdapters=adapters5.Count()+adapters3.Count();
245 gpertea 94 s=args.getOpt('5');
246     if (!s.is_empty()) {
247     if (fileAdapters)
248     GError("Error: options -5 and -f cannot be used together!\n");
249     s.upper();
250 gpertea 102 adapters5.Add(s);
251 gpertea 4 }
252 gpertea 94 s=args.getOpt('3');
253     if (!s.is_empty()) {
254     if (fileAdapters)
255     GError("Error: options -3 and -f cannot be used together!\n");
256 gpertea 102 s.upper();
257     adapters3.Add(s);
258 gpertea 4 }
259 gpertea 94 s=args.getOpt('y');
260 gpertea 13 if (!s.is_empty()) {
261 gpertea 94 int minmatch=s.asInt();
262     poly_minScore=minmatch*poly_m_score;
263 gpertea 13 }
264 gpertea 76
265 gpertea 13 if (args.getOpt('o')!=NULL) outsuffix=args.getOpt('o');
266 gpertea 76 else outsuffix="-";
267 gpertea 4 trashReport= (args.getOpt('r')!=NULL);
268 gpertea 13 int fcount=args.startNonOpt();
269     if (fcount==0) {
270 gpertea 4 GMessage(USAGE);
271     exit(224);
272     }
273 gpertea 13 if (fcount>1 && doCollapse) {
274     GError("%s Sorry, the -C option only works with a single input.\n", USAGE);
275     }
276     //openfw(f_out, args, 'o');
277     //if (f_out==NULL) f_out=stdout;
278 gpertea 4 if (trashReport)
279     openfw(freport, args, 'r');
280     char* infile=NULL;
281 gpertea 103
282     if (adapters5.Count()>0)
283     gxmem_l=new CGreedyAlignData(match_reward, mismatch_penalty, Xdrop-2);
284     if (adapters3.Count()>0)
285     gxmem_r=new CGreedyAlignData(match_reward, mismatch_penalty, Xdrop);
286    
287 gpertea 4 while ((infile=args.nextNonOpt())!=NULL) {
288 gpertea 103 //for each input file
289 gpertea 13 int incounter=0; //counter for input reads
290     int outcounter=0; //counter for output reads
291     int trash_s=0; //too short from the get go
292     int trash_Q=0;
293     int trash_N=0;
294     int trash_D=0;
295 gpertea 103 int trash_poly=0;
296 gpertea 13 int trash_A3=0;
297     int trash_A5=0;
298     s=infile;
299     GStr infname;
300     GStr infname2;
301     FILE* f_in=NULL;
302     FILE* f_in2=NULL;
303     FILE* f_out=NULL;
304     FILE* f_out2=NULL;
305     bool paired_reads=false;
306     setupFiles(f_in, f_in2, f_out, f_out2, s, infname, infname2);
307     GLineReader fq(f_in);
308     GLineReader* fq2=NULL;
309     if (f_in2!=NULL) {
310     fq2=new GLineReader(f_in2);
311     paired_reads=true;
312     }
313     GStr rseq, rqv, seqid, seqinfo;
314     GStr rseq2, rqv2, seqid2, seqinfo2;
315     while (getFastxRec(fq, rseq, rqv, seqid, seqinfo, infname)) {
316     incounter++;
317     int a5=0, a3=0, b5=0, b3=0;
318     char tcode=0, tcode2=0;
319     tcode=process_read(seqid, rseq, rqv, a5, a3);
320 gpertea 103 if (fq2!=NULL) {
321 gpertea 13 getFastxRec(*fq2, rseq2, rqv2, seqid2, seqinfo2, infname2);
322     if (seqid.substr(0,seqid.length()-1)!=seqid2.substr(0,seqid2.length()-1)) {
323     GError("Error: no paired match for read %s vs %s (%s,%s)\n",
324     seqid.chars(), seqid2.chars(), infname.chars(), infname2.chars());
325     }
326     tcode2=process_read(seqid2, rseq2, rqv2, b5, b3);
327     //decide what to do with this pair and print rseq2 if the pair makes it
328     if (tcode>1 && tcode2<=1) {
329     //"untrash" rseq
330     if (a3-a5+1<min_read_len) {
331     a5=1;
332     if (a3<min_read_len) { a3= GMIN(rseq.length()-1, min_read_len+1); }
333 gpertea 4 }
334 gpertea 13 tcode=1;
335     }
336     else if (tcode<=1 && tcode2>1) {
337     //"untrash" rseq2
338     if (b3-b5+1<min_read_len) {
339     b5=1;
340     if (b3<min_read_len) { b3= GMIN((rseq2.length()-1),(min_read_len+1)); }
341     }
342     tcode2=1;
343     }
344     if (tcode<=1) { //trimmed or left intact -- write it!
345     if (tcode>0) {
346     rseq2=rseq2.substr(b5,b3-b5+1);
347     if (!rqv2.is_empty()) rqv2=rqv2.substr(b5,b3-b5+1);
348 gpertea 4 }
349 gpertea 13 int nocounter=0;
350     writeRead(f_out2, seqid2, seqinfo2, rseq2, rqv2, nocounter);
351     }
352 gpertea 103 } //pair read
353 gpertea 13 if (tcode>1) { //trashed
354 gpertea 103 #ifdef GDEBUG
355     GMessage(" !!!!TRASH => 'N'\n");
356     #endif
357 gpertea 13 if (tcode=='s') trash_s++;
358 gpertea 103 else if (tcode=='A' || tcode=='T') trash_poly++;
359 gpertea 13 else if (tcode=='Q') trash_Q++;
360     else if (tcode=='N') trash_N++;
361     else if (tcode=='D') trash_D++;
362     else if (tcode=='3') trash_A3++;
363     else if (tcode=='5') trash_A5++;
364     if (trashReport) trash_report(tcode, seqid, freport);
365     }
366     else if (!doCollapse) { //write it
367     if (tcode>0) {
368     rseq=rseq.substr(a5,a3-a5+1);
369     if (!rqv.is_empty()) rqv=rqv.substr(a5,a3-a5+1);
370     }
371 gpertea 103 #ifdef GDEBUG
372     GMessage(" After trimming:\n");
373     GMessage("<==%s\n",rseq.chars());
374     #endif
375 gpertea 13 writeRead(f_out, seqid, seqinfo, rseq, rqv, outcounter);
376     }
377     } //for each fastq record
378     delete fq2;
379     FRCLOSE(f_in);
380     FRCLOSE(f_in2);
381     if (doCollapse) {
382     outcounter=0;
383     int maxdup_count=1;
384     char* maxdup_seq=NULL;
385     dhash.startIterate();
386     FqDupRec* qd=NULL;
387     char* seq=NULL;
388     while ((qd=dhash.NextData(seq))!=NULL) {
389     GStr rseq(seq);
390     //do the dusting here
391     if (doDust) {
392     int dustbases=dust(rseq);
393     if (dustbases>(rseq.length()>>1)) {
394     if (trashReport && qd->firstname!=NULL) {
395     fprintf(freport, "%s_x%d\tD\n",qd->firstname, qd->count);
396 gpertea 4 }
397 gpertea 13 trash_D+=qd->count;
398     continue;
399     }
400     }
401     outcounter++;
402     if (qd->count>maxdup_count) {
403     maxdup_count=qd->count;
404     maxdup_seq=seq;
405     }
406     if (isfasta) {
407     if (prefix.is_empty()) {
408     fprintf(f_out, ">%s_x%d\n%s\n", qd->firstname, qd->count,
409     rseq.chars());
410 gpertea 4 }
411 gpertea 13 else { //use custom read name
412     fprintf(f_out, ">%s%08d_x%d\n%s\n", prefix.chars(), outcounter,
413     qd->count, rseq.chars());
414 gpertea 4 }
415 gpertea 13 }
416     else { //fastq format
417     if (convert_phred) convertPhred(qd->qv, qd->len);
418     if (prefix.is_empty()) {
419     fprintf(f_out, "@%s_x%d\n%s\n+\n%s\n", qd->firstname, qd->count,
420     rseq.chars(), qd->qv);
421 gpertea 4 }
422 gpertea 13 else { //use custom read name
423     fprintf(f_out, "@%s%08d_x%d\n%s\n+\n%s\n", prefix.chars(), outcounter,
424     qd->count, rseq.chars(), qd->qv);
425     }
426     }
427     }//for each element of dhash
428     if (maxdup_count>1) {
429     GMessage("Maximum read multiplicity: x %d (read: %s)\n",maxdup_count, maxdup_seq);
430 gpertea 4 }
431 gpertea 13 } //collapse entries
432     if (verbose) {
433     if (paired_reads) {
434     GMessage(">Input files : %s , %s\n", infname.chars(), infname2.chars());
435     GMessage("Number of input pairs :%9d\n", incounter);
436     GMessage(" Output pairs :%9d\n", outcounter);
437     }
438     else {
439     GMessage(">Input file : %s\n", infname.chars());
440     GMessage("Number of input reads :%9d\n", incounter);
441     GMessage(" Output reads :%9d\n", outcounter);
442     }
443     GMessage("------------------------------------\n");
444     if (num_trimmed5)
445     GMessage(" 5' trimmed :%9d (min. trim: %d)\n", num_trimmed5, min_trimmed5);
446     if (num_trimmed3)
447     GMessage(" 3' trimmed :%9d (min. trim: %d)\n", num_trimmed3, min_trimmed3);
448     GMessage("------------------------------------\n");
449     if (trash_s>0)
450     GMessage(" Trashed by length:%9d\n", trash_s);
451     if (trash_N>0)
452     GMessage(" Trashed by N%%:%9d\n", trash_N);
453     if (trash_Q>0)
454     GMessage("Trashed by low quality:%9d\n", trash_Q);
455 gpertea 103 if (trash_poly>0)
456     GMessage(" Trashed by poly-A/T:%9d\n", trash_poly);
457 gpertea 13 if (trash_A5>0)
458     GMessage(" Trashed by 5' adapter:%9d\n", trash_A5);
459     if (trash_A3>0)
460     GMessage(" Trashed by 3' adapter:%9d\n", trash_A3);
461     }
462     if (trashReport) {
463     FWCLOSE(freport);
464 gpertea 4 }
465 gpertea 13 FWCLOSE(f_out);
466     FWCLOSE(f_out2);
467     } //while each input file
468 gpertea 103 delete gxmem_l;
469     delete gxmem_r;
470 gpertea 12 //getc(stdin);
471 gpertea 4 }
472    
473     class NData {
474     public:
475     int NPos[1024]; //there should be no reads longer than 1K ?
476     int NCount;
477     int end5;
478     int end3;
479     int seqlen;
480     double perc_N; //percentage of Ns in end5..end3 range only!
481     const char* seq;
482     bool valid;
483     NData() {
484     NCount=0;
485     end5=0;
486     end3=0;
487     seq=NULL;
488     perc_N=0;
489     valid=true;
490     }
491     void init(GStr& rseq) {
492     NCount=0;
493     perc_N=0;
494     seqlen=rseq.length();
495     seq=rseq.chars();
496     end5=1;
497     end3=seqlen;
498     for (int i=0;i<seqlen;i++)
499 gpertea 12 if (seq[i]=='N') {// if (!ichrInStr(rseq[i], "ACGT")
500 gpertea 4 NPos[NCount]=i;
501     NCount++;
502     }
503     perc_N=(NCount*100.0)/seqlen;
504     valid=true;
505     }
506     void N_calc() { //only in the region after trimming
507     int n=0;
508     for (int i=end3-1;i<end5;i++) {
509     if (seq[i]=='N') n++;
510     }
511     perc_N=(n*100.0)/(end5-end3+1);
512     }
513     };
514    
515     static NData feat;
516     int perc_lenN=12; // incremental distance from ends, in percentage of
517     // sequence length, where N-trimming is done (default:12 %) (autolimited to 20)
518    
519     void N_analyze(int l5, int l3, int p5, int p3) {
520     /* assumes feat was filled properly */
521     int old_dif, t5,t3,v;
522 gpertea 12 if (l3<l5+2 || p5>p3 ) {
523 gpertea 4 feat.end5=l5+1;
524     feat.end3=l3+1;
525     return;
526     }
527    
528     t5=feat.NPos[p5]-l5;
529     t3=l3-feat.NPos[p3];
530     old_dif=p3-p5;
531     v=(int)((((double)(l3-l5))*perc_lenN)/100);
532     if (v>20) v=20; /* enforce N-search limit for very long reads */
533     if (t5 < v ) {
534     l5=feat.NPos[p5]+1;
535     p5++;
536     }
537     if (t3 < v) {
538     l3=feat.NPos[p3]-1;
539     p3--;
540     }
541     /* restNs=p3-p5; number of Ns in the new CLR */
542     if (p3-p5==old_dif) { /* no change, return */
543     /* don't trim if this may shorten the read too much */
544     //if (l5-l3<min_read_len) return;
545     feat.end5=l5+1;
546     feat.end3=l3+1;
547     return;
548     }
549     else
550     N_analyze(l5,l3, p5,p3);
551     }
552    
553    
554 gpertea 12 bool qtrim(GStr& qvs, int &l5, int &l3) {
555 gpertea 13 if (qvtrim_qmin==0 || qvs.is_empty()) return false;
556 gpertea 12 if (qv_phredtype==0) {
557     //try to guess the Phred type
558     int vmin=256, vmax=0;
559     for (int i=0;i<qvs.length();i++) {
560     if (vmin>qvs[i]) vmin=qvs[i];
561     if (vmax<qvs[i]) vmax=qvs[i];
562     }
563     if (vmin<64) { qv_phredtype=33; qv_cvtadd=31; }
564     if (vmax>95) { qv_phredtype=64; qv_cvtadd=-31; }
565     if (qv_phredtype==0) {
566     GError("Error: couldn't determine Phred type, please use the -p33 or -p64 !\n");
567     }
568 gpertea 13 if (verbose)
569     GMessage("Input reads have Phred-%d quality values.\n", (qv_phredtype==33 ? 33 : 64));
570     } //guessing Phred type
571 gpertea 12 for (l3=qvs.length()-1;l3>2;l3--) {
572 gpertea 13 if (qvs[l3]-qv_phredtype>=qvtrim_qmin && qvs[l3-1]-qv_phredtype>=qvtrim_qmin) break;
573 gpertea 12 }
574     //just in case, check also the 5' the end (?)
575     for (l5=0;l5<qvs.length()-3;l5++) {
576 gpertea 13 if (qvs[l5]-qv_phredtype>=qvtrim_qmin && qvs[l5+1]-qv_phredtype>=qvtrim_qmin) break;
577 gpertea 12 }
578 gpertea 13 if (qvtrim_max>0) {
579     if (qvs.length()-1-l3>qvtrim_max) l3=qvs.length()-1-qvtrim_max;
580     if (l5>qvtrim_max) l5=qvtrim_max;
581     }
582 gpertea 12 return (l5>0 || l3<qvs.length()-1);
583     }
584    
585     bool ntrim(GStr& rseq, int &l5, int &l3) {
586     //count Ns in the sequence, trim N-rich ends
587 gpertea 4 feat.init(rseq);
588     l5=feat.end5-1;
589     l3=feat.end3-1;
590     N_analyze(feat.end5-1, feat.end3-1, 0, feat.NCount-1);
591 gpertea 12 if (l5==feat.end5-1 && l3==feat.end3-1) {
592     if (feat.perc_N>max_perc_N) {
593     feat.valid=false;
594     l3=l5+1;
595     return true;
596     }
597     else {
598     return false; //nothing changed
599     }
600     }
601 gpertea 4 l5=feat.end5-1;
602     l3=feat.end3-1;
603     if (l3-l5+1<min_read_len) {
604     feat.valid=false;
605     return true;
606     }
607     feat.N_calc();
608 gpertea 12
609     if (feat.perc_N>max_perc_N) {
610 gpertea 4 feat.valid=false;
611     l3=l5+1;
612     return true;
613     }
614     return true;
615     }
616    
617     //--------------- dust functions ----------------
618     class DNADuster {
619     public:
620     int dustword;
621     int dustwindow;
622     int dustwindow2;
623     int dustcutoff;
624     int mv, iv, jv;
625     int counts[32*32*32];
626     int iis[32*32*32];
627     DNADuster(int cutoff=16, int winsize=32, int wordsize=3) {
628     dustword=wordsize;
629     dustwindow=winsize;
630     dustwindow2 = (winsize>>1);
631     dustcutoff=cutoff;
632     mv=0;
633     iv=0;
634     jv=0;
635     }
636     void setWindowSize(int value) {
637     dustwindow = value;
638     dustwindow2 = (dustwindow >> 1);
639     }
640     void setWordSize(int value) {
641     dustword=value;
642     }
643     void wo1(int len, const char* s, int ivv) {
644     int i, ii, j, v, t, n, n1, sum;
645     int js, nis;
646     n = 32 * 32 * 32;
647     n1 = n - 1;
648     nis = 0;
649     i = 0;
650     ii = 0;
651     sum = 0;
652     v = 0;
653     for (j=0; j < len; j++, s++) {
654     ii <<= 5;
655     if (*s<=32) {
656     i=0;
657     continue;
658     }
659     ii |= *s - 'A'; //assume uppercase!
660     ii &= n1;
661     i++;
662     if (i >= dustword) {
663     for (js=0; js < nis && iis[js] != ii; js++) ;
664     if (js == nis) {
665     iis[nis] = ii;
666     counts[ii] = 0;
667     nis++;
668     }
669     if ((t = counts[ii]) > 0) {
670     sum += t;
671     v = 10 * sum / j;
672     if (mv < v) {
673     mv = v;
674     iv = ivv;
675     jv = j;
676     }
677     }
678     counts[ii]++;
679     }
680     }
681     }
682    
683     int wo(int len, const char* s, int* beg, int* end) {
684     int i, l1;
685     l1 = len - dustword + 1;
686     if (l1 < 0) {
687     *beg = 0;
688     *end = len - 1;
689     return 0;
690     }
691     mv = 0;
692     iv = 0;
693     jv = 0;
694     for (i=0; i < l1; i++) {
695     wo1(len-i, s+i, i);
696     }
697     *beg = iv;
698     *end = iv + jv;
699     return mv;
700     }
701    
702     void dust(const char* seq, char* seqmsk, int seqlen, int cutoff=0) { //, maskFunc maskfn) {
703     int i, j, l, a, b, v;
704     if (cutoff==0) cutoff=dustcutoff;
705     a=0;b=0;
706     //GMessage("Dust cutoff=%d\n", cutoff);
707     for (i=0; i < seqlen; i += dustwindow2) {
708     l = (seqlen > i+dustwindow) ? dustwindow : seqlen-i;
709     v = wo(l, seq+i, &a, &b);
710     if (v > cutoff) {
711     //for (j = a; j <= b && j < dustwindow2; j++) {
712     for (j = a; j <= b; j++) {
713     seqmsk[i+j]='N';//could be made lowercase instead
714     }
715     }
716     }
717     //return first;
718     }
719     };
720    
721     static DNADuster duster;
722    
723     int dust(GStr& rseq) {
724     char* seq=Gstrdup(rseq.chars());
725     duster.dust(rseq.chars(), seq, rseq.length(), dust_cutoff);
726     //check the number of Ns:
727     int ncount=0;
728     for (int i=0;i<rseq.length();i++) {
729     if (seq[i]=='N') ncount++;
730     }
731     GFREE(seq);
732     return ncount;
733     }
734    
735 gpertea 12 int get3mer_value(const char* s) {
736     return (s[0]<<16)+(s[1]<<8)+s[2];
737     }
738    
739     int w3_match(int qv, const char* str, int slen, int start_index=0) {
740     if (start_index>=slen || start_index<0) return -1;
741     for (int i=start_index;i<slen-3;i++) {
742     int rv=get3mer_value(str+i);
743     if (rv==qv) return i;
744     }
745     return -1;
746     }
747    
748     int w3_rmatch(int qv, const char* str, int slen, int end_index=-1) {
749     if (end_index>=slen) return -1;
750     if (end_index<0) end_index=slen-1;
751     for (int i=end_index-2;i>=0;i--) {
752     int rv=get3mer_value(str+i);
753     if (rv==qv) return i;
754     }
755     return -1;
756     }
757    
758 gpertea 94 struct SLocScore {
759     int pos;
760     int score;
761     SLocScore(int p=0,int s=0) {
762     pos=p;
763     score=s;
764     }
765     void set(int p, int s) {
766     pos=p;
767     score=s;
768     }
769     void add(int p, int add) {
770     pos=p;
771     score+=add;
772     }
773     };
774 gpertea 12
775 gpertea 94 bool trim_poly3(GStr &seq, int &l5, int &l3, const char* poly_seed) {
776 gpertea 103 if (!doPolyTrim) return false;
777 gpertea 94 int rlen=seq.length();
778     l5=0;
779     l3=rlen-1;
780     int32 seedVal=*(int32*)poly_seed;
781     char polyChar=poly_seed[0];
782     //assumes N trimming was already done
783     //so a poly match should be very close to the end of the read
784     // -- find the initial match (seed)
785     int lmin=GMAX((rlen-12), 0);
786     int li;
787     for (li=rlen-4;li>lmin;li--) {
788     if (seedVal==*(int*)&(seq[li])) {
789     break;
790     }
791 gpertea 12 }
792 gpertea 94 if (li<=lmin) return false;
793     //seed found, try to extend it both ways
794     //extend right
795     int ri=li+3;
796     SLocScore loc(ri, poly_m_score<<2);
797     SLocScore maxloc(loc);
798     //extend right
799     while (ri<rlen-2) {
800     ri++;
801     if (seq[ri]==polyChar) {
802     loc.add(ri,poly_m_score);
803     }
804     else if (seq[ri]=='N') {
805     loc.add(ri,0);
806     }
807     else { //mismatch
808     loc.add(ri,poly_mis_score);
809     if (maxloc.score-loc.score>poly_dropoff_score) break;
810     }
811     if (maxloc.score<=loc.score) {
812     maxloc=loc;
813     }
814 gpertea 12 }
815 gpertea 94 ri=maxloc.pos;
816 gpertea 103 if (ri<rlen-6) return false; //no trimming wanted, too far from 3' end
817 gpertea 94 //ri = right boundary for the poly match
818     //extend left
819     loc.set(li, maxloc.score);
820     maxloc.pos=li;
821     while (li>0) {
822     li--;
823     if (seq[li]==polyChar) {
824     loc.add(li,poly_m_score);
825     }
826     else if (seq[li]=='N') {
827     loc.add(li,0);
828     }
829     else { //mismatch
830     loc.add(li,poly_mis_score);
831     if (maxloc.score-loc.score>poly_dropoff_score) break;
832     }
833     if (maxloc.score<=loc.score) {
834     maxloc=loc;
835     }
836     }
837 gpertea 103 if ((maxloc.score>poly_minScore && ri>=rlen-3) ||
838     (maxloc.score==poly_minScore && ri==rlen-1) ||
839     (maxloc.score>(poly_minScore<<1) && ri>=rlen-6)) {
840 gpertea 94 l5=li;
841     l3=ri;
842     return true;
843     }
844     return false;
845 gpertea 12 }
846    
847 gpertea 94 bool trim_poly5(GStr &seq, int &l5, int &l3, const char* poly_seed) {
848 gpertea 103 if (!doPolyTrim) return false;
849 gpertea 88 int rlen=seq.length();
850     l5=0;
851     l3=rlen-1;
852 gpertea 94 int32 seedVal=*(int32*)poly_seed;
853     char polyChar=poly_seed[0];
854 gpertea 88 //assumes N trimming was already done
855     //so a poly match should be very close to the end of the read
856     // -- find the initial match (seed)
857 gpertea 94 int lmax=GMIN(8, rlen-4);//how far from 5' end to look for 4-mer seeds
858     int li;
859     for (li=0;li<=lmax;li++) {
860     if (seedVal==*(int*)&(seq[li])) {
861 gpertea 88 break;
862     }
863     }
864 gpertea 94 if (li>lmax) return false;
865     //seed found, try to extend it both ways
866     //extend left
867     int ri=li+3; //save rightmost base of the seed
868     SLocScore loc(li, poly_m_score<<2);
869     SLocScore maxloc(loc);
870     while (li>0) {
871     li--;
872     if (seq[li]==polyChar) {
873     loc.add(li,poly_m_score);
874     }
875     else if (seq[li]=='N') {
876     loc.add(li,0);
877     }
878     else { //mismatch
879     loc.add(li,poly_mis_score);
880     if (maxloc.score-loc.score>poly_dropoff_score) break;
881     }
882     if (maxloc.score<=loc.score) {
883     maxloc=loc;
884     }
885 gpertea 88 }
886 gpertea 94 li=maxloc.pos;
887 gpertea 103 if (li>5) return false; //no trimming wanted, too far from 5' end
888 gpertea 94 //li = right boundary for the poly match
889    
890     //extend right
891     loc.set(ri, maxloc.score);
892     maxloc.pos=ri;
893     while (ri<rlen-2) {
894     ri++;
895     if (seq[ri]==polyChar) {
896     loc.add(ri,poly_m_score);
897     }
898     else if (seq[ri]=='N') {
899     loc.add(ri,0);
900     }
901     else { //mismatch
902     loc.add(ri,poly_mis_score);
903     if (maxloc.score-loc.score>poly_dropoff_score) break;
904     }
905     if (maxloc.score<=loc.score) {
906     maxloc=loc;
907     }
908     }
909    
910 gpertea 103 if ((maxloc.score==poly_minScore && li==0) ||
911     (maxloc.score>poly_minScore && li<2)
912     || (maxloc.score>(poly_minScore<<1) && li<6)) {
913 gpertea 94 l5=li;
914     l3=ri;
915     return true;
916     }
917     return false;
918 gpertea 88 }
919    
920 gpertea 4 bool trim_adapter3(GStr& seq, int&l5, int &l3) {
921 gpertea 103 if (adapters3.Count()==0) return false;
922 gpertea 4 int rlen=seq.length();
923     l5=0;
924     l3=rlen-1;
925 gpertea 103 bool trimmed=false;
926     GStr wseq(seq.chars());
927     int wlen=rlen;
928     for (int ai=0;ai<adapters3.Count();ai++) {
929     if (adapters3[ai].is_empty()) continue;
930     int alen=adapters3[ai].length();
931     GStr& aseq=adapters3[ai];
932     GXAlnInfo* r_bestaln=match_RightEnd(aseq.chars(), alen, wseq.chars(), wlen, gxmem_r, 74);
933     if (r_bestaln) {
934     trimmed=true;
935     //keep unmatched region on the left, if any
936     l3-=(wlen-r_bestaln->sl+1);
937     delete r_bestaln;
938     if (l3<0) l3=0;
939     if (l3-l5+1<min_read_len) return true;
940     wseq=seq.substr(l5,l3-l5+1);
941     wlen=wseq.length();
942     }
943     }//for each 5' adapter
944     return trimmed;
945 gpertea 4 }
946    
947     bool trim_adapter5(GStr& seq, int&l5, int &l3) {
948 gpertea 103 if (adapters5.Count()==0) return false;
949 gpertea 4 int rlen=seq.length();
950     l5=0;
951     l3=rlen-1;
952 gpertea 102 bool trimmed=false;
953 gpertea 103 GStr wseq(seq.chars());
954     int wlen=rlen;
955 gpertea 102 for (int ai=0;ai<adapters5.Count();ai++) {
956     if (adapters5[ai].is_empty()) continue;
957 gpertea 103 int alen=adapters5[ai].length();
958     GStr& aseq=adapters5[ai];
959     GXAlnInfo* l_bestaln=match_LeftEnd(aseq.chars(), alen, wseq.chars(), wlen, gxmem_l, 84);
960     if (l_bestaln) {
961     trimmed=true;
962     l5+=l_bestaln->sr;
963     delete l_bestaln;
964     if (l5>=rlen) l5=rlen-1;
965     if (l3-l5+1<min_read_len) return true;
966     wseq=seq.substr(l5,l3-l5+1);
967     wlen=wseq.length();
968     }
969 gpertea 102 }//for each 5' adapter
970     return trimmed;
971 gpertea 98 }
972 gpertea 4
973 gpertea 12 //convert qvs to/from phred64 from/to phread33
974     void convertPhred(GStr& q) {
975     for (int i=0;i<q.length();i++) q[i]+=qv_cvtadd;
976 gpertea 4 }
977    
978 gpertea 12 void convertPhred(char* q, int len) {
979     for (int i=0;i<len;i++) q[i]+=qv_cvtadd;
980 gpertea 4 }
981    
982 gpertea 13 bool getFastxRec(GLineReader& fq, GStr& rseq, GStr& rqv,
983     GStr& rname, GStr& rinfo, GStr& infname) {
984     rseq="";
985     rqv="";
986     rname="";
987     rinfo="";
988     if (fq.eof()) return false;
989     char* l=fq.getLine();
990     while (l!=NULL && (l[0]==0 || isspace(l[0]))) l=fq.getLine(); //ignore empty lines
991     if (l==NULL) return false;
992     /* if (rawFormat) {
993     //TODO: implement raw qseq parsing here
994     //if (raw type=N) then continue; //skip invalid/bad records
995     } //raw qseq format
996     else { // FASTQ or FASTA */
997     isfasta=(l[0]=='>');
998     if (!isfasta && l[0]!='@') GError("Error: fasta/fastq record marker not found(%s)\n%s\n",
999     infname.chars(), l);
1000     GStr s(l);
1001     rname=&(l[1]);
1002     for (int i=0;i<rname.length();i++)
1003     if (rname[i]<=' ') {
1004     if (i<rname.length()-2) rinfo=rname.substr(i+1);
1005     rname.cut(i);
1006     break;
1007     }
1008     //now get the sequence
1009     if ((l=fq.getLine())==NULL)
1010     GError("Error: unexpected EOF after header for read %s (%s)\n",
1011     rname.chars(), infname.chars());
1012     rseq=l; //this must be the DNA line
1013     while ((l=fq.getLine())!=NULL) {
1014     //seq can span multiple lines
1015     if (l[0]=='>' || l[0]=='+') {
1016     fq.pushBack();
1017     break; //
1018     }
1019     rseq+=l;
1020     } //check for multi-line seq
1021     if (!isfasta) { //reading fastq quality values, which can also be multi-line
1022     if ((l=fq.getLine())==NULL)
1023     GError("Error: unexpected EOF after sequence for %s\n", rname.chars());
1024     if (l[0]!='+') GError("Error: fastq qv header marker not detected!\n");
1025     if ((l=fq.getLine())==NULL)
1026     GError("Error: unexpected EOF after qv header for %s\n", rname.chars());
1027     rqv=l;
1028     //if (rqv.length()!=rseq.length())
1029     // GError("Error: qv len != seq len for %s\n", rname.chars());
1030     while (rqv.length()<rseq.length() && ((l=fq.getLine())!=NULL)) {
1031     rqv+=l; //append to qv string
1032     }
1033     }// fastq
1034     // } //<-- FASTA or FASTQ
1035 gpertea 94 rseq.upper();
1036 gpertea 13 return true;
1037     }
1038    
1039     char process_read(GStr& rname, GStr& rseq, GStr& rqv, int &l5, int &l3) {
1040     //returns 0 if the read was untouched, 1 if it was just trimmed
1041     // and a trash code if it was trashed
1042     l5=0;
1043     l3=rseq.length()-1;
1044 gpertea 103 #ifdef GDEBUG
1045     GMessage(">%s\n", rname.chars());
1046     GMessage("==>%s\n",rseq.chars());
1047     #endif
1048 gpertea 13 if (l3-l5+1<min_read_len) {
1049     return 's';
1050     }
1051     GStr wseq(rseq.chars());
1052     GStr wqv(rqv.chars());
1053     int w5=l5;
1054     int w3=l3;
1055     //first do the q-based trimming
1056     if (qvtrim_qmin!=0 && !wqv.is_empty() && qtrim(wqv, w5, w3)) { // qv-threshold trimming
1057     if (w3-w5+1<min_read_len) {
1058     return 'Q'; //invalid read
1059     }
1060     //-- keep only the w5..w3 range
1061     l5=w5;
1062     l3=w3;
1063     wseq=wseq.substr(w5, w3-w5+1);
1064     if (!wqv.is_empty())
1065     wqv=wqv.substr(w5, w3-w5+1);
1066     } //qv trimming
1067     // N-trimming on the remaining read seq
1068     if (ntrim(wseq, w5, w3)) {
1069     //GMessage("before: %s\n",wseq.chars());
1070     //GMessage("after : %s (%d)\n",wseq.substr(w5,w3-w5+1).chars(),w3-w5+1);
1071     l5+=w5;
1072     l3-=(wseq.length()-1-w3);
1073     if (w3-w5+1<min_read_len) {
1074     return 'N'; //to be trashed
1075     }
1076     //-- keep only the w5..w3 range
1077     wseq=wseq.substr(w5, w3-w5+1);
1078     if (!wqv.is_empty())
1079     wqv=wqv.substr(w5, w3-w5+1);
1080     w5=0;
1081     w3=wseq.length()-1;
1082     }
1083 gpertea 103 char trim_code;
1084     do {
1085     trim_code=0;
1086     if (trim_poly5(wseq, w5, w3, polyA_seed)) {
1087     trim_code='A';
1088     #ifdef GDEBUG
1089     GMessage("\t-trimmed poly-A at 5' end\n");
1090     #endif
1091     }
1092     else if (trim_poly5(wseq, w5, w3, polyT_seed)) {
1093     trim_code='T';
1094     #ifdef GDEBUG
1095     GMessage("\t-trimmed poly-T at 5' end\n");
1096     #endif
1097     }
1098     else if (trim_adapter5(wseq, w5, w3)) {
1099     trim_code='5';
1100     #ifdef GDEBUG
1101     GMessage("\t-trimmed adapter at 5' end\n");
1102     #endif
1103     }
1104     if (trim_code) {
1105 gpertea 13 int trimlen=wseq.length()-(w3-w5+1);
1106 gpertea 103 num_trimmed5++;
1107     if (trimlen<min_trimmed5)
1108     min_trimmed5=trimlen;
1109 gpertea 13 l5+=w5;
1110     l3-=(wseq.length()-1-w3);
1111     if (w3-w5+1<min_read_len) {
1112 gpertea 103 return trim_code;
1113 gpertea 13 }
1114     //-- keep only the w5..w3 range
1115     wseq=wseq.substr(w5, w3-w5+1);
1116     if (!wqv.is_empty())
1117     wqv=wqv.substr(w5, w3-w5+1);
1118 gpertea 103 }// trimmed at 5' end
1119     } while (trim_code);
1120    
1121     do {
1122     trim_code=0;
1123     if (trim_poly3(wseq, w5, w3, polyA_seed)) {
1124     trim_code='A';
1125     }
1126     else if (trim_poly3(wseq, w5, w3, polyT_seed)) {
1127     trim_code='T';
1128     }
1129     else if (trim_adapter3(wseq, w5, w3)) {
1130     trim_code='3';
1131     }
1132     if (trim_code) {
1133 gpertea 13 int trimlen=wseq.length()-(w3-w5+1);
1134 gpertea 103 num_trimmed3++;
1135     if (trimlen<min_trimmed3)
1136     min_trimmed3=trimlen;
1137 gpertea 13 l5+=w5;
1138     l3-=(wseq.length()-1-w3);
1139     if (w3-w5+1<min_read_len) {
1140 gpertea 103 return trim_code;
1141 gpertea 13 }
1142     //-- keep only the w5..w3 range
1143     wseq=wseq.substr(w5, w3-w5+1);
1144     if (!wqv.is_empty())
1145     wqv=wqv.substr(w5, w3-w5+1);
1146 gpertea 103 }//trimming at 3' end
1147     } while (trim_code);
1148    
1149    
1150 gpertea 13 if (doCollapse) {
1151     //keep read for later
1152     FqDupRec* dr=dhash.Find(wseq.chars());
1153     if (dr==NULL) { //new entry
1154     //if (prefix.is_empty())
1155     dhash.Add(wseq.chars(),
1156     new FqDupRec(&wqv, rname.chars()));
1157     //else dhash.Add(wseq.chars(), new FqDupRec(wqv.chars(),wqv.length()));
1158     }
1159     else
1160     dr->add(wqv);
1161     } //collapsing duplicates
1162     else { //not collapsing duplicates
1163     //apply the dust filter now
1164     if (doDust) {
1165     int dustbases=dust(wseq);
1166     if (dustbases>(wseq.length()>>1)) {
1167     return 'D';
1168     }
1169     }
1170     } //not collapsing duplicates
1171     return (l5>0 || l3<rseq.length()-1) ? 1 : 0;
1172     }
1173    
1174     void printHeader(FILE* f_out, char recmarker, GStr& rname, GStr& rinfo) {
1175     //GMessage("printing Header..%c%s\n",recmarker, rname.chars());
1176     if (rinfo.is_empty()) fprintf(f_out, "%c%s\n",recmarker,rname.chars());
1177     else fprintf(f_out, "%c%s %s\n",recmarker, rname.chars(), rinfo.chars());
1178     }
1179    
1180     void writeRead(FILE* f_out, GStr& rname, GStr& rinfo, GStr& rseq, GStr& rqv, int& outcounter) {
1181     outcounter++;
1182     bool asFasta=(rqv.is_empty() || fastaOutput);
1183     if (asFasta) {
1184     if (prefix.is_empty()) {
1185     printHeader(f_out, '>',rname,rinfo);
1186     fprintf(f_out, "%s\n", rseq.chars()); //plain one-line fasta for now
1187     }
1188     else {
1189     fprintf(f_out, ">%s%08d\n%s\n", prefix.chars(), outcounter,
1190     rseq.chars());
1191     }
1192     }
1193     else { //fastq
1194     if (convert_phred) convertPhred(rqv);
1195     if (prefix.is_empty()) {
1196     printHeader(f_out, '@', rname, rinfo);
1197     fprintf(f_out, "%s\n+\n%s\n", rseq.chars(), rqv.chars());
1198     }
1199     else
1200     fprintf(f_out, "@%s_%08d\n%s\n+\n%s\n", prefix.chars(), outcounter,
1201     rseq.chars(),rqv.chars() );
1202     }
1203     }
1204    
1205     void trash_report(char trashcode, GStr& rname, FILE* freport) {
1206     if (freport==NULL || trashcode<=' ') return;
1207     if (trashcode=='3' || trashcode=='5') {
1208 gpertea 103 fprintf(freport, "%s\ta%c\n",rname.chars(),trashcode);
1209 gpertea 13 }
1210     else {
1211     fprintf(freport, "%s\t%c\n",rname.chars(),trashcode);
1212     }
1213     //tcounter++;
1214     }
1215    
1216     GStr getFext(GStr& s, int* xpos=NULL) {
1217     //s must be a filename without a path
1218     GStr r("");
1219     if (xpos!=NULL) *xpos=0;
1220     if (s.is_empty() || s=="-") return r;
1221     int p=s.rindex('.');
1222     int d=s.rindex('/');
1223     if (p<=0 || p>s.length()-2 || p<s.length()-7 || p<d) return r;
1224     r=s.substr(p+1);
1225     if (xpos!=NULL) *xpos=p+1;
1226     r.lower();
1227     return r;
1228     }
1229    
1230     void baseFileName(GStr& fname) {
1231     //remove all known extensions, like .txt,fq,fastq,fasta,fa)(.gz .gzip .bz2 .bzip2) .
1232     int xpos=0;
1233     GStr fext=getFext(fname, &xpos);
1234     if (xpos<=1) return;
1235     bool extdel=false;
1236     GStr f2;
1237     if (fext=="z" || fext=="zip") {
1238     extdel=true;
1239     }
1240     else if (fext.length()>=2) {
1241     f2=fext.substr(0,2);
1242     extdel=(f2=="gz" || f2=="bz");
1243     }
1244     if (extdel) {
1245     fname.cut(xpos-1);
1246     fext=getFext(fname, &xpos);
1247     if (xpos<=1) return;
1248     }
1249     extdel=false;
1250     if (fext=="f" || fext=="fq" || fext=="txt" || fext=="seq" || fext=="sequence") {
1251     extdel=true;
1252     }
1253     else if (fext.length()>=2) {
1254     extdel=(fext.substr(0,2)=="fa");
1255     }
1256     if (extdel) fname.cut(xpos-1);
1257     GStr fncp(fname);
1258     fncp.lower();
1259     fncp.chomp("seq");
1260     fncp.chomp("sequence");
1261     fncp.trimR("_.");
1262     if (fncp.length()<fname.length()) fname.cut(fncp.length());
1263     }
1264    
1265     FILE* prepOutFile(GStr& infname, GStr& pocmd) {
1266     FILE* f_out=NULL;
1267     GStr fname(getFileName(infname.chars()));
1268     //eliminate known extensions
1269     baseFileName(fname);
1270     if (outsuffix.is_empty() || outsuffix=="-") { return stdout; }
1271     else if (pocmd.is_empty()) {
1272     GStr oname(fname);
1273     oname.append('.');
1274     oname.append(outsuffix);
1275     f_out=fopen(oname.chars(),"w");
1276     if (f_out==NULL) GError("Error: cannot create '%s'\n",oname.chars());
1277     }
1278     else {
1279     GStr oname(">");
1280     oname.append(fname);
1281     oname.append('.');
1282     oname.append(outsuffix);
1283     pocmd.append(oname);
1284     f_out=popen(pocmd.chars(), "w");
1285     if (f_out==NULL) GError("Error: cannot popen '%s'\n",pocmd.chars());
1286     }
1287     return f_out;
1288     }
1289    
1290     void guess_unzip(GStr& fname, GStr& picmd) {
1291     GStr fext=getFext(fname);
1292     if (fext=="gz" || fext=="gzip" || fext=="z") {
1293     picmd="gzip -cd ";
1294     }
1295     else if (fext=="bz2" || fext=="bzip2" || fext=="bz" || fext=="bzip") {
1296     picmd="bzip2 -cd ";
1297     }
1298     }
1299    
1300 gpertea 94
1301     int loadAdapters(const char* fname) {
1302     GLineReader lr(fname);
1303     char* l;
1304     while ((l=lr.nextLine())!=NULL) {
1305     if (lr.length()<=3 || l[0]=='#') continue;
1306 gpertea 98 if ( l[0]==' ' || l[0]=='\t' || l[0]==',' ||
1307     l[0]==';'|| l[0]==':' ) {
1308 gpertea 94 int i=1;
1309     while (l[i]!=0 && isspace(l[i])) {
1310     i++;
1311     }
1312     if (l[i]!=0) {
1313 gpertea 102 GStr s(&(l[i]));
1314     adapters3.Add(s);
1315 gpertea 94 continue;
1316     }
1317     }
1318     else {
1319 gpertea 98 GStr s(l);
1320     s.startTokenize("\t ;,:");
1321     GStr a5,a3;
1322     if (s.nextToken(a5))
1323     s.nextToken(a3);
1324 gpertea 102 a5.upper();
1325     a3.upper();
1326     adapters5.Add(a5);
1327     adapters3.Add(a3);
1328 gpertea 94 }
1329     }
1330 gpertea 102 return adapters5.Count()+adapters3.Count();
1331 gpertea 94 }
1332    
1333 gpertea 13 void setupFiles(FILE*& f_in, FILE*& f_in2, FILE*& f_out, FILE*& f_out2,
1334     GStr& s, GStr& infname, GStr& infname2) {
1335     // uses outsuffix to generate output file names and open file handles as needed
1336     infname="";
1337     infname2="";
1338     f_in=NULL;
1339     f_in2=NULL;
1340     f_out=NULL;
1341     f_out2=NULL;
1342     //analyze outsuffix intent
1343     GStr pocmd;
1344 gpertea 76 if (outsuffix=="-") {
1345     f_out=stdout;
1346     }
1347     else {
1348     GStr ox=getFext(outsuffix);
1349     if (ox.length()>2) ox=ox.substr(0,2);
1350     if (ox=="gz") pocmd="gzip -9 -c ";
1351     else if (ox=="bz") pocmd="bzip2 -9 -c ";
1352     }
1353 gpertea 13 if (s=="-") {
1354     f_in=stdin;
1355     infname="stdin";
1356     f_out=prepOutFile(infname, pocmd);
1357     return;
1358     } // streaming from stdin
1359     s.startTokenize(",:");
1360     s.nextToken(infname);
1361     bool paired=s.nextToken(infname2);
1362     if (fileExists(infname.chars())==0)
1363     GError("Error: cannot find file %s!\n",infname.chars());
1364     GStr fname(getFileName(infname.chars()));
1365     GStr picmd;
1366     guess_unzip(fname, picmd);
1367     if (picmd.is_empty()) {
1368     f_in=fopen(infname.chars(), "r");
1369     if (f_in==NULL) GError("Error opening file '%s'!\n",infname.chars());
1370     }
1371     else {
1372     picmd.append(infname);
1373     f_in=popen(picmd.chars(), "r");
1374     if (f_in==NULL) GError("Error at popen %s!\n", picmd.chars());
1375     }
1376 gpertea 76 if (f_out==stdout) {
1377     if (paired) GError("Error: output suffix required for paired reads\n");
1378     return;
1379     }
1380 gpertea 13 f_out=prepOutFile(infname, pocmd);
1381     if (!paired) return;
1382     if (doCollapse) GError("Error: sorry, -C option cannot be used with paired reads!\n");
1383     // ---- paired reads:-------------
1384     if (fileExists(infname2.chars())==0)
1385     GError("Error: cannot find file %s!\n",infname2.chars());
1386     picmd="";
1387     GStr fname2(getFileName(infname2.chars()));
1388     guess_unzip(fname2, picmd);
1389     if (picmd.is_empty()) {
1390     f_in2=fopen(infname2.chars(), "r");
1391     if (f_in2==NULL) GError("Error opening file '%s'!\n",infname2.chars());
1392     }
1393     else {
1394     picmd.append(infname2);
1395     f_in2=popen(picmd.chars(), "r");
1396     if (f_in2==NULL) GError("Error at popen %s!\n", picmd.chars());
1397     }
1398     f_out2=prepOutFile(infname2, pocmd);
1399     }