ViewVC Help
View File | Revision Log | Show Annotations | View Changeset | Root Listing
root/gclib/fqtrim/fqtrim.cpp
Revision: 102
Committed: Fri Oct 7 21:44:59 2011 UTC (8 years ago) by gpertea
File size: 38427 byte(s)
Log Message:
wip

Line User Rev File contents
1 gpertea 4 #include "GArgs.h"
2     #include "GStr.h"
3     #include "GHash.hh"
4     #include "GList.hh"
5 gpertea 13 #include <ctype.h>
6 gpertea 94 #include "GAlnExtend.h"
7 gpertea 4
8     #define USAGE "Usage:\n\
9 gpertea 76 fqtrim [{-5 <5adapter> -3 <3adapter>|-f <adapters_file>}] [-a <min_matchlen>]\\\n\
10     [-q <minq> [-t <trim_max_len>]] [-p {64|33}] [-o <outsuffix>]\\\n\
11     [-l <minlen>] [-C] [-D] [-Q] [-n <rename_prefix>] [-r <discarded.lst>]\\\n\
12     <input.fq>[,<input_mates.fq>\n\
13 gpertea 4 \n\
14 gpertea 76 Trim low quality bases at the 3' end and can trim adapter sequence(s), filter\n\
15     for low complexity and collapse duplicate reads.\n\
16 gpertea 13 If read pairs should be trimmed and kept together (i.e. without discarding\n\
17     one read in a pair), the two file names should be given delimited by a comma\n\
18     or a colon character\n\
19 gpertea 4 \n\
20     Options:\n\
21     -n rename all the reads using the <prefix> followed by a read counter;\n\
22     if -C option was given, the suffix \"_x<N>\" is appended, with <N> being\n\
23     the read duplication count\n\
24 gpertea 76 -o unless this parameter is '-', write the trimmed/filtered reads to \n\
25     file(s) named <input>.<outsuffix> which will be created in the \n\
26     current (working) directory; (writes to stdout if -o- is given);\n\
27     a suffix ending with .gz, .gzip or .bz2 will enforce compression\n\
28     -f file with adapter sequences to trim, each line having this format:\n\
29     <5'-adapter-sequence> <3'-adapter-sequence>\n\
30 gpertea 4 -5 trim the given adapter or primer sequence at the 5' end of each read\n\
31     (e.g. -5 CGACAGGTTCAGAGTTCTACAGTCCGACGATC)\n\
32     -3 trim the given adapter sequence at the 3' end of each read\n\
33     (e.g. -3 TCGTATGCCGTCTTCTGCTTG)\n\
34 gpertea 94 -A disable polyA trimming (enabled by default)\n\
35     -T enable polyT trimming (disabled by default)\n\
36     -y minimum length of exact match to adaptor sequence at the proper end (6)\n\
37 gpertea 13 -q trim bases with quality value lower than <minq> (starting at the 3' end)\n\
38 gpertea 76 -t for -q option, maximum trimming at the 3' end is limited to <trim_max_len>\n\
39 gpertea 12 -m maximum percentage of Ns allowed in a read after trimming (default 7)\n\
40     -l minimum \"clean\" length after trimming that a read must have\n\
41     in order to pass the filter (default: 16)\n\
42 gpertea 4 -r write a simple \"trash report\" file listing the discarded reads with a\n\
43     one letter code for the reason why they were trashed\n\
44 gpertea 12 -D apply a low-complexity (dust) filter and discard any read that has over\n\
45     50% of its length detected as low complexity\n\
46 gpertea 4 -C collapse duplicate reads and add a prefix with their count to the read\n\
47 gpertea 12 name (e.g. for microRNAs)\n\
48     -p input is phred64/phred33 (use -p64 or -p33)\n\
49     -Q convert quality values to the other Phred qv type\n\
50     -V verbose processing\n\
51 gpertea 4 "
52 gpertea 13
53     //-z for -o option, the output stream(s) will be first piped into the given\n
54     // <zcmd> command, which must output to stdout (e.g. -z 'bzip2 -9 -c')\n
55    
56    
57 gpertea 4 // example 3' adapter for miRNAs: TCGTATGCCGTCTTCTGCTTG
58 gpertea 12
59 gpertea 76 //For paired reads sequencing:
60 gpertea 12 //3' : ACACTCTTTCCCTACACGACGCTCTTCCGATCT
61     //5' : GATCGGAAGAGCGGTTCAGCAGGAATGCCGAG
62 gpertea 13 //FILE* f_out=NULL; //stdout if not provided
63     //FILE* f_out2=NULL; //for paired reads
64     //FILE* f_in=NULL; //input fastq (stdin if not provided)
65     //FILE* f_in2=NULL; //for paired reads
66 gpertea 4 FILE* freport=NULL;
67 gpertea 13
68 gpertea 4 bool debug=false;
69 gpertea 12 bool verbose=false;
70 gpertea 4 bool doCollapse=false;
71     bool doDust=false;
72 gpertea 13 bool fastaOutput=false;
73 gpertea 4 bool trashReport=false;
74 gpertea 12 //bool rawFormat=false;
75 gpertea 4 int min_read_len=16;
76 gpertea 12 double max_perc_N=7.0;
77 gpertea 4 int dust_cutoff=16;
78     bool isfasta=false;
79 gpertea 12 bool convert_phred=false;
80 gpertea 13 GStr outsuffix; // -o
81     GStr prefix;
82     GStr zcmd;
83     int num_trimmed5=0;
84     int num_trimmed3=0;
85     int num_trimmedN=0;
86     int num_trimmedQ=0;
87     int min_trimmed5=INT_MAX;
88     int min_trimmed3=INT_MAX;
89 gpertea 12
90 gpertea 13 int qvtrim_qmin=0;
91     int qvtrim_max=0; //for -q, do not trim at 3'-end more than this number of bases
92 gpertea 12 int qv_phredtype=0; // could be 64 or 33 (0 means undetermined yet)
93     int qv_cvtadd=0; //could be -31 or +31
94    
95 gpertea 94 // adaptor matching metrics -- for X-drop ungapped extension
96     const int poly_m_score=2; //match score
97     const int poly_mis_score=-3; //mismatch
98 gpertea 88 const int poly_dropoff_score=7;
99 gpertea 94 int poly_minScore=12; //i.e. an exact match of 6 bases at the proper ends WILL be trimmed
100 gpertea 4
101 gpertea 94 const char *polyA_seed="AAAA";
102     const char *polyT_seed="TTTT";
103 gpertea 102 /*
104 gpertea 94 struct CAdapters {
105     GStr a5;
106     GStr a3;
107     CAdapters(const char* s5=NULL, const char* s3=NULL):a5(s5),a3(s3) {
108 gpertea 12 }
109 gpertea 94 };
110 gpertea 102 */
111     GVec<GStr> adapters5;
112     GVec<GStr> adapters3;
113 gpertea 12
114 gpertea 4 // element in dhash:
115     class FqDupRec {
116     public:
117     int count; //how many of these reads are the same
118     int len; //length of qv
119     char* firstname; //optional, only if we want to keep the original read names
120     char* qv;
121     FqDupRec(GStr* qstr=NULL, const char* rname=NULL) {
122     len=0;
123     qv=NULL;
124     firstname=NULL;
125     count=0;
126     if (qstr!=NULL) {
127     qv=Gstrdup(qstr->chars());
128     len=qstr->length();
129     count++;
130     }
131     if (rname!=NULL) firstname=Gstrdup(rname);
132     }
133     ~FqDupRec() {
134     GFREE(qv);
135     GFREE(firstname);
136     }
137     void add(GStr& d) { //collapse another record into this one
138     if (d.length()!=len)
139     GError("Error at FqDupRec::add(): cannot collapse reads with different length!\n");
140     count++;
141     for (int i=0;i<len;i++)
142     qv[i]=(qv[i]+d[i])/2;
143     }
144     };
145    
146     void openfw(FILE* &f, GArgs& args, char opt) {
147     GStr s=args.getOpt(opt);
148     if (!s.is_empty()) {
149     if (s=='-') f=stdout;
150     else {
151 gpertea 13 f=fopen(s.chars(),"w");
152 gpertea 4 if (f==NULL) GError("Error creating file: %s\n", s.chars());
153     }
154     }
155     }
156    
157     #define FWCLOSE(fh) if (fh!=NULL && fh!=stdout) fclose(fh)
158     #define FRCLOSE(fh) if (fh!=NULL && fh!=stdin) fclose(fh)
159    
160     GHash<FqDupRec> dhash; //hash to keep track of duplicates
161    
162 gpertea 94 int loadAdapters(const char* fname);
163    
164 gpertea 13 void setupFiles(FILE*& f_in, FILE*& f_in2, FILE*& f_out, FILE*& f_out2,
165     GStr& s, GStr& infname, GStr& infname2);
166     // uses outsuffix to generate output file names and open file handles as needed
167    
168     void writeRead(FILE* f_out, GStr& rname, GStr& rinfo, GStr& rseq, GStr& rqv, int& outcounter);
169     void trash_report(char trashcode, GStr& rname, FILE* freport);
170    
171     bool getFastxRec(GLineReader& fq, GStr& rseq, GStr& rqv,
172     GStr& rname, GStr& rinfo, GStr& infname);
173    
174     char process_read(GStr& rname, GStr& rseq, GStr& rqv, int &l5, int &l3);
175     //returns 0 if the read was untouched, 1 if it was trimmed and a trash code if it was trashed
176    
177 gpertea 12 bool ntrim(GStr& rseq, int &l5, int &l3); //returns true if any trimming occured
178     bool qtrim(GStr& qvs, int &l5, int &l3); //return true if any trimming occured
179 gpertea 4 int dust(GStr& seq);
180 gpertea 94 bool trim_poly5(GStr &seq, int &l5, int &l3, const char* poly_seed); //returns true if any trimming occured
181     bool trim_poly3(GStr &seq, int &l5, int &l3, const char* poly_seed);
182 gpertea 4 bool trim_adapter5(GStr& seq, int &l5, int &l3); //returns true if any trimming occured
183 gpertea 94 bool trim_adapter3(GStr& seq, int &l5, int &l3);
184 gpertea 4
185 gpertea 12 void convertPhred(char* q, int len);
186     void convertPhred(GStr& q);
187 gpertea 4
188     int main(int argc, char * const argv[]) {
189 gpertea 76 GArgs args(argc, argv, "YQDCVl:d:3:5:m:n:r:p:q:f:t:o:z:a:");
190 gpertea 4 int e;
191     if ((e=args.isError())>0) {
192     GMessage("%s\nInvalid argument: %s\n", USAGE, argv[e]);
193     exit(224);
194     }
195 gpertea 9 debug=(args.getOpt('Y')!=NULL);
196 gpertea 13 verbose=(args.getOpt('V')!=NULL);
197 gpertea 12 convert_phred=(args.getOpt('Q')!=NULL);
198 gpertea 4 doCollapse=(args.getOpt('C')!=NULL);
199     doDust=(args.getOpt('D')!=NULL);
200 gpertea 12 /*
201 gpertea 4 rawFormat=(args.getOpt('R')!=NULL);
202     if (rawFormat) {
203     GError("Sorry, raw qseq format parsing is not implemented yet!\n");
204     }
205 gpertea 12 */
206 gpertea 4 prefix=args.getOpt('n');
207     GStr s=args.getOpt('l');
208     if (!s.is_empty())
209     min_read_len=s.asInt();
210 gpertea 12 s=args.getOpt('m');
211     if (!s.is_empty())
212     max_perc_N=s.asDouble();
213 gpertea 4 s=args.getOpt('d');
214     if (!s.is_empty()) {
215     dust_cutoff=s.asInt();
216     doDust=true;
217     }
218 gpertea 12 s=args.getOpt('q');
219     if (!s.is_empty()) {
220 gpertea 13 qvtrim_qmin=s.asInt();
221 gpertea 12 }
222 gpertea 13 s=args.getOpt('t');
223     if (!s.is_empty()) {
224     qvtrim_max=s.asInt();
225     }
226 gpertea 12 s=args.getOpt('p');
227     if (!s.is_empty()) {
228     int v=s.asInt();
229     if (v==33) {
230     qv_phredtype=33;
231     qv_cvtadd=31;
232     }
233     else if (v==64) {
234     qv_phredtype=64;
235     qv_cvtadd=-31;
236     }
237     else
238     GMessage("%s\nInvalid value for -p option (can only be 64 or 33)!\n",USAGE);
239     }
240 gpertea 94 s=args.getOpt('f');
241     if (!s.is_empty()) {
242     loadAdapters(s.chars());
243     }
244 gpertea 102 bool fileAdapters=adapters5.Count()+adapters3.Count();
245 gpertea 94 s=args.getOpt('5');
246     if (!s.is_empty()) {
247     if (fileAdapters)
248     GError("Error: options -5 and -f cannot be used together!\n");
249     s.upper();
250 gpertea 102 adapters5.Add(s);
251 gpertea 4 }
252 gpertea 94 s=args.getOpt('3');
253     if (!s.is_empty()) {
254     if (fileAdapters)
255     GError("Error: options -3 and -f cannot be used together!\n");
256 gpertea 102 s.upper();
257     adapters3.Add(s);
258 gpertea 4 }
259 gpertea 94 s=args.getOpt('y');
260 gpertea 13 if (!s.is_empty()) {
261 gpertea 94 int minmatch=s.asInt();
262     poly_minScore=minmatch*poly_m_score;
263 gpertea 13 }
264 gpertea 76
265 gpertea 13 if (args.getOpt('o')!=NULL) outsuffix=args.getOpt('o');
266 gpertea 76 else outsuffix="-";
267 gpertea 4 trashReport= (args.getOpt('r')!=NULL);
268 gpertea 13 int fcount=args.startNonOpt();
269     if (fcount==0) {
270 gpertea 4 GMessage(USAGE);
271     exit(224);
272     }
273 gpertea 13 if (fcount>1 && doCollapse) {
274     GError("%s Sorry, the -C option only works with a single input.\n", USAGE);
275     }
276     //openfw(f_out, args, 'o');
277     //if (f_out==NULL) f_out=stdout;
278 gpertea 4 if (trashReport)
279     openfw(freport, args, 'r');
280     char* infile=NULL;
281     while ((infile=args.nextNonOpt())!=NULL) {
282 gpertea 13 int incounter=0; //counter for input reads
283     int outcounter=0; //counter for output reads
284     int trash_s=0; //too short from the get go
285     int trash_Q=0;
286     int trash_N=0;
287     int trash_D=0;
288     int trash_A3=0;
289     int trash_A5=0;
290     s=infile;
291     GStr infname;
292     GStr infname2;
293     FILE* f_in=NULL;
294     FILE* f_in2=NULL;
295     FILE* f_out=NULL;
296     FILE* f_out2=NULL;
297     bool paired_reads=false;
298     setupFiles(f_in, f_in2, f_out, f_out2, s, infname, infname2);
299     GLineReader fq(f_in);
300     GLineReader* fq2=NULL;
301     if (f_in2!=NULL) {
302     fq2=new GLineReader(f_in2);
303     paired_reads=true;
304     }
305     GStr rseq, rqv, seqid, seqinfo;
306     GStr rseq2, rqv2, seqid2, seqinfo2;
307     while (getFastxRec(fq, rseq, rqv, seqid, seqinfo, infname)) {
308     incounter++;
309     int a5=0, a3=0, b5=0, b3=0;
310     char tcode=0, tcode2=0;
311     tcode=process_read(seqid, rseq, rqv, a5, a3);
312     //if (!doCollapse) {
313     if (fq2!=NULL) {
314     getFastxRec(*fq2, rseq2, rqv2, seqid2, seqinfo2, infname2);
315     if (seqid.substr(0,seqid.length()-1)!=seqid2.substr(0,seqid2.length()-1)) {
316     GError("Error: no paired match for read %s vs %s (%s,%s)\n",
317     seqid.chars(), seqid2.chars(), infname.chars(), infname2.chars());
318     }
319     tcode2=process_read(seqid2, rseq2, rqv2, b5, b3);
320     //decide what to do with this pair and print rseq2 if the pair makes it
321     if (tcode>1 && tcode2<=1) {
322     //"untrash" rseq
323     if (a3-a5+1<min_read_len) {
324     a5=1;
325     if (a3<min_read_len) { a3= GMIN(rseq.length()-1, min_read_len+1); }
326 gpertea 4 }
327 gpertea 13 tcode=1;
328     }
329     else if (tcode<=1 && tcode2>1) {
330     //"untrash" rseq2
331     if (b3-b5+1<min_read_len) {
332     b5=1;
333     if (b3<min_read_len) { b3= GMIN((rseq2.length()-1),(min_read_len+1)); }
334     }
335     tcode2=1;
336     }
337     if (tcode<=1) { //trimmed or left intact -- write it!
338     if (tcode>0) {
339     rseq2=rseq2.substr(b5,b3-b5+1);
340     if (!rqv2.is_empty()) rqv2=rqv2.substr(b5,b3-b5+1);
341 gpertea 4 }
342 gpertea 13 int nocounter=0;
343     writeRead(f_out2, seqid2, seqinfo2, rseq2, rqv2, nocounter);
344     }
345     } //paired read
346     // }
347     if (tcode>1) { //trashed
348     if (tcode=='s') trash_s++;
349     else if (tcode=='Q') trash_Q++;
350     else if (tcode=='N') trash_N++;
351     else if (tcode=='D') trash_D++;
352     else if (tcode=='3') trash_A3++;
353     else if (tcode=='5') trash_A5++;
354     if (trashReport) trash_report(tcode, seqid, freport);
355     }
356     else if (!doCollapse) { //write it
357     if (tcode>0) {
358     rseq=rseq.substr(a5,a3-a5+1);
359     if (!rqv.is_empty()) rqv=rqv.substr(a5,a3-a5+1);
360     }
361     writeRead(f_out, seqid, seqinfo, rseq, rqv, outcounter);
362     }
363     } //for each fastq record
364     delete fq2;
365     FRCLOSE(f_in);
366     FRCLOSE(f_in2);
367     if (doCollapse) {
368     outcounter=0;
369     int maxdup_count=1;
370     char* maxdup_seq=NULL;
371     dhash.startIterate();
372     FqDupRec* qd=NULL;
373     char* seq=NULL;
374     while ((qd=dhash.NextData(seq))!=NULL) {
375     GStr rseq(seq);
376     //do the dusting here
377     if (doDust) {
378     int dustbases=dust(rseq);
379     if (dustbases>(rseq.length()>>1)) {
380     if (trashReport && qd->firstname!=NULL) {
381     fprintf(freport, "%s_x%d\tD\n",qd->firstname, qd->count);
382 gpertea 4 }
383 gpertea 13 trash_D+=qd->count;
384     continue;
385     }
386     }
387     outcounter++;
388     if (qd->count>maxdup_count) {
389     maxdup_count=qd->count;
390     maxdup_seq=seq;
391     }
392     if (isfasta) {
393     if (prefix.is_empty()) {
394     fprintf(f_out, ">%s_x%d\n%s\n", qd->firstname, qd->count,
395     rseq.chars());
396 gpertea 4 }
397 gpertea 13 else { //use custom read name
398     fprintf(f_out, ">%s%08d_x%d\n%s\n", prefix.chars(), outcounter,
399     qd->count, rseq.chars());
400 gpertea 4 }
401 gpertea 13 }
402     else { //fastq format
403     if (convert_phred) convertPhred(qd->qv, qd->len);
404     if (prefix.is_empty()) {
405     fprintf(f_out, "@%s_x%d\n%s\n+\n%s\n", qd->firstname, qd->count,
406     rseq.chars(), qd->qv);
407 gpertea 4 }
408 gpertea 13 else { //use custom read name
409     fprintf(f_out, "@%s%08d_x%d\n%s\n+\n%s\n", prefix.chars(), outcounter,
410     qd->count, rseq.chars(), qd->qv);
411     }
412     }
413     }//for each element of dhash
414     if (maxdup_count>1) {
415     GMessage("Maximum read multiplicity: x %d (read: %s)\n",maxdup_count, maxdup_seq);
416 gpertea 4 }
417 gpertea 13 } //collapse entries
418     if (verbose) {
419     if (paired_reads) {
420     GMessage(">Input files : %s , %s\n", infname.chars(), infname2.chars());
421     GMessage("Number of input pairs :%9d\n", incounter);
422     GMessage(" Output pairs :%9d\n", outcounter);
423     }
424     else {
425     GMessage(">Input file : %s\n", infname.chars());
426     GMessage("Number of input reads :%9d\n", incounter);
427     GMessage(" Output reads :%9d\n", outcounter);
428     }
429     GMessage("------------------------------------\n");
430     if (num_trimmed5)
431     GMessage(" 5' trimmed :%9d (min. trim: %d)\n", num_trimmed5, min_trimmed5);
432     if (num_trimmed3)
433     GMessage(" 3' trimmed :%9d (min. trim: %d)\n", num_trimmed3, min_trimmed3);
434     GMessage("------------------------------------\n");
435     if (trash_s>0)
436     GMessage(" Trashed by length:%9d\n", trash_s);
437     if (trash_N>0)
438     GMessage(" Trashed by N%%:%9d\n", trash_N);
439     if (trash_Q>0)
440     GMessage("Trashed by low quality:%9d\n", trash_Q);
441     if (trash_A5>0)
442     GMessage(" Trashed by 5' adapter:%9d\n", trash_A5);
443     if (trash_A3>0)
444     GMessage(" Trashed by 3' adapter:%9d\n", trash_A3);
445     }
446     if (trashReport) {
447     FWCLOSE(freport);
448 gpertea 4 }
449 gpertea 13 FWCLOSE(f_out);
450     FWCLOSE(f_out2);
451     } //while each input file
452 gpertea 4
453 gpertea 12 //getc(stdin);
454 gpertea 4 }
455    
456     class NData {
457     public:
458     int NPos[1024]; //there should be no reads longer than 1K ?
459     int NCount;
460     int end5;
461     int end3;
462     int seqlen;
463     double perc_N; //percentage of Ns in end5..end3 range only!
464     const char* seq;
465     bool valid;
466     NData() {
467     NCount=0;
468     end5=0;
469     end3=0;
470     seq=NULL;
471     perc_N=0;
472     valid=true;
473     }
474     void init(GStr& rseq) {
475     NCount=0;
476     perc_N=0;
477     seqlen=rseq.length();
478     seq=rseq.chars();
479     end5=1;
480     end3=seqlen;
481     for (int i=0;i<seqlen;i++)
482 gpertea 12 if (seq[i]=='N') {// if (!ichrInStr(rseq[i], "ACGT")
483 gpertea 4 NPos[NCount]=i;
484     NCount++;
485     }
486     perc_N=(NCount*100.0)/seqlen;
487     valid=true;
488     }
489     void N_calc() { //only in the region after trimming
490     int n=0;
491     for (int i=end3-1;i<end5;i++) {
492     if (seq[i]=='N') n++;
493     }
494     perc_N=(n*100.0)/(end5-end3+1);
495     }
496     };
497    
498     static NData feat;
499     int perc_lenN=12; // incremental distance from ends, in percentage of
500     // sequence length, where N-trimming is done (default:12 %) (autolimited to 20)
501    
502     void N_analyze(int l5, int l3, int p5, int p3) {
503     /* assumes feat was filled properly */
504     int old_dif, t5,t3,v;
505 gpertea 12 if (l3<l5+2 || p5>p3 ) {
506 gpertea 4 feat.end5=l5+1;
507     feat.end3=l3+1;
508     return;
509     }
510    
511     t5=feat.NPos[p5]-l5;
512     t3=l3-feat.NPos[p3];
513     old_dif=p3-p5;
514     v=(int)((((double)(l3-l5))*perc_lenN)/100);
515     if (v>20) v=20; /* enforce N-search limit for very long reads */
516     if (t5 < v ) {
517     l5=feat.NPos[p5]+1;
518     p5++;
519     }
520     if (t3 < v) {
521     l3=feat.NPos[p3]-1;
522     p3--;
523     }
524     /* restNs=p3-p5; number of Ns in the new CLR */
525     if (p3-p5==old_dif) { /* no change, return */
526     /* don't trim if this may shorten the read too much */
527     //if (l5-l3<min_read_len) return;
528     feat.end5=l5+1;
529     feat.end3=l3+1;
530     return;
531     }
532     else
533     N_analyze(l5,l3, p5,p3);
534     }
535    
536    
537 gpertea 12 bool qtrim(GStr& qvs, int &l5, int &l3) {
538 gpertea 13 if (qvtrim_qmin==0 || qvs.is_empty()) return false;
539 gpertea 12 if (qv_phredtype==0) {
540     //try to guess the Phred type
541     int vmin=256, vmax=0;
542     for (int i=0;i<qvs.length();i++) {
543     if (vmin>qvs[i]) vmin=qvs[i];
544     if (vmax<qvs[i]) vmax=qvs[i];
545     }
546     if (vmin<64) { qv_phredtype=33; qv_cvtadd=31; }
547     if (vmax>95) { qv_phredtype=64; qv_cvtadd=-31; }
548     if (qv_phredtype==0) {
549     GError("Error: couldn't determine Phred type, please use the -p33 or -p64 !\n");
550     }
551 gpertea 13 if (verbose)
552     GMessage("Input reads have Phred-%d quality values.\n", (qv_phredtype==33 ? 33 : 64));
553     } //guessing Phred type
554 gpertea 12 for (l3=qvs.length()-1;l3>2;l3--) {
555 gpertea 13 if (qvs[l3]-qv_phredtype>=qvtrim_qmin && qvs[l3-1]-qv_phredtype>=qvtrim_qmin) break;
556 gpertea 12 }
557     //just in case, check also the 5' the end (?)
558     for (l5=0;l5<qvs.length()-3;l5++) {
559 gpertea 13 if (qvs[l5]-qv_phredtype>=qvtrim_qmin && qvs[l5+1]-qv_phredtype>=qvtrim_qmin) break;
560 gpertea 12 }
561 gpertea 13 if (qvtrim_max>0) {
562     if (qvs.length()-1-l3>qvtrim_max) l3=qvs.length()-1-qvtrim_max;
563     if (l5>qvtrim_max) l5=qvtrim_max;
564     }
565 gpertea 12 return (l5>0 || l3<qvs.length()-1);
566     }
567    
568     bool ntrim(GStr& rseq, int &l5, int &l3) {
569     //count Ns in the sequence, trim N-rich ends
570 gpertea 4 feat.init(rseq);
571     l5=feat.end5-1;
572     l3=feat.end3-1;
573     N_analyze(feat.end5-1, feat.end3-1, 0, feat.NCount-1);
574 gpertea 12 if (l5==feat.end5-1 && l3==feat.end3-1) {
575     if (feat.perc_N>max_perc_N) {
576     feat.valid=false;
577     l3=l5+1;
578     return true;
579     }
580     else {
581     return false; //nothing changed
582     }
583     }
584 gpertea 4 l5=feat.end5-1;
585     l3=feat.end3-1;
586     if (l3-l5+1<min_read_len) {
587     feat.valid=false;
588     return true;
589     }
590     feat.N_calc();
591 gpertea 12
592     if (feat.perc_N>max_perc_N) {
593 gpertea 4 feat.valid=false;
594     l3=l5+1;
595     return true;
596     }
597     return true;
598     }
599    
600     //--------------- dust functions ----------------
601     class DNADuster {
602     public:
603     int dustword;
604     int dustwindow;
605     int dustwindow2;
606     int dustcutoff;
607     int mv, iv, jv;
608     int counts[32*32*32];
609     int iis[32*32*32];
610     DNADuster(int cutoff=16, int winsize=32, int wordsize=3) {
611     dustword=wordsize;
612     dustwindow=winsize;
613     dustwindow2 = (winsize>>1);
614     dustcutoff=cutoff;
615     mv=0;
616     iv=0;
617     jv=0;
618     }
619     void setWindowSize(int value) {
620     dustwindow = value;
621     dustwindow2 = (dustwindow >> 1);
622     }
623     void setWordSize(int value) {
624     dustword=value;
625     }
626     void wo1(int len, const char* s, int ivv) {
627     int i, ii, j, v, t, n, n1, sum;
628     int js, nis;
629     n = 32 * 32 * 32;
630     n1 = n - 1;
631     nis = 0;
632     i = 0;
633     ii = 0;
634     sum = 0;
635     v = 0;
636     for (j=0; j < len; j++, s++) {
637     ii <<= 5;
638     if (*s<=32) {
639     i=0;
640     continue;
641     }
642     ii |= *s - 'A'; //assume uppercase!
643     ii &= n1;
644     i++;
645     if (i >= dustword) {
646     for (js=0; js < nis && iis[js] != ii; js++) ;
647     if (js == nis) {
648     iis[nis] = ii;
649     counts[ii] = 0;
650     nis++;
651     }
652     if ((t = counts[ii]) > 0) {
653     sum += t;
654     v = 10 * sum / j;
655     if (mv < v) {
656     mv = v;
657     iv = ivv;
658     jv = j;
659     }
660     }
661     counts[ii]++;
662     }
663     }
664     }
665    
666     int wo(int len, const char* s, int* beg, int* end) {
667     int i, l1;
668     l1 = len - dustword + 1;
669     if (l1 < 0) {
670     *beg = 0;
671     *end = len - 1;
672     return 0;
673     }
674     mv = 0;
675     iv = 0;
676     jv = 0;
677     for (i=0; i < l1; i++) {
678     wo1(len-i, s+i, i);
679     }
680     *beg = iv;
681     *end = iv + jv;
682     return mv;
683     }
684    
685     void dust(const char* seq, char* seqmsk, int seqlen, int cutoff=0) { //, maskFunc maskfn) {
686     int i, j, l, a, b, v;
687     if (cutoff==0) cutoff=dustcutoff;
688     a=0;b=0;
689     //GMessage("Dust cutoff=%d\n", cutoff);
690     for (i=0; i < seqlen; i += dustwindow2) {
691     l = (seqlen > i+dustwindow) ? dustwindow : seqlen-i;
692     v = wo(l, seq+i, &a, &b);
693     if (v > cutoff) {
694     //for (j = a; j <= b && j < dustwindow2; j++) {
695     for (j = a; j <= b; j++) {
696     seqmsk[i+j]='N';//could be made lowercase instead
697     }
698     }
699     }
700     //return first;
701     }
702     };
703    
704     static DNADuster duster;
705    
706     int dust(GStr& rseq) {
707     char* seq=Gstrdup(rseq.chars());
708     duster.dust(rseq.chars(), seq, rseq.length(), dust_cutoff);
709     //check the number of Ns:
710     int ncount=0;
711     for (int i=0;i<rseq.length();i++) {
712     if (seq[i]=='N') ncount++;
713     }
714     GFREE(seq);
715     return ncount;
716     }
717    
718 gpertea 12 int get3mer_value(const char* s) {
719     return (s[0]<<16)+(s[1]<<8)+s[2];
720     }
721    
722     int w3_match(int qv, const char* str, int slen, int start_index=0) {
723     if (start_index>=slen || start_index<0) return -1;
724     for (int i=start_index;i<slen-3;i++) {
725     int rv=get3mer_value(str+i);
726     if (rv==qv) return i;
727     }
728     return -1;
729     }
730    
731     int w3_rmatch(int qv, const char* str, int slen, int end_index=-1) {
732     if (end_index>=slen) return -1;
733     if (end_index<0) end_index=slen-1;
734     for (int i=end_index-2;i>=0;i--) {
735     int rv=get3mer_value(str+i);
736     if (rv==qv) return i;
737     }
738     return -1;
739     }
740    
741 gpertea 94 struct SLocScore {
742     int pos;
743     int score;
744     SLocScore(int p=0,int s=0) {
745     pos=p;
746     score=s;
747     }
748     void set(int p, int s) {
749     pos=p;
750     score=s;
751     }
752     void add(int p, int add) {
753     pos=p;
754     score+=add;
755     }
756     };
757 gpertea 12
758 gpertea 94 bool trim_poly3(GStr &seq, int &l5, int &l3, const char* poly_seed) {
759     int rlen=seq.length();
760     l5=0;
761     l3=rlen-1;
762     int32 seedVal=*(int32*)poly_seed;
763     char polyChar=poly_seed[0];
764     //assumes N trimming was already done
765     //so a poly match should be very close to the end of the read
766     // -- find the initial match (seed)
767     int lmin=GMAX((rlen-12), 0);
768     int li;
769     for (li=rlen-4;li>lmin;li--) {
770     if (seedVal==*(int*)&(seq[li])) {
771     break;
772     }
773 gpertea 12 }
774 gpertea 94 if (li<=lmin) return false;
775     //seed found, try to extend it both ways
776     //extend right
777     int ri=li+3;
778     SLocScore loc(ri, poly_m_score<<2);
779     SLocScore maxloc(loc);
780     //extend right
781     while (ri<rlen-2) {
782     ri++;
783     if (seq[ri]==polyChar) {
784     loc.add(ri,poly_m_score);
785     }
786     else if (seq[ri]=='N') {
787     loc.add(ri,0);
788     }
789     else { //mismatch
790     loc.add(ri,poly_mis_score);
791     if (maxloc.score-loc.score>poly_dropoff_score) break;
792     }
793     if (maxloc.score<=loc.score) {
794     maxloc=loc;
795     }
796 gpertea 12 }
797 gpertea 94 ri=maxloc.pos;
798     if (ri<rlen-3) return false; //no trimming wanted, too far from 3' end
799     //ri = right boundary for the poly match
800     //extend left
801     loc.set(li, maxloc.score);
802     maxloc.pos=li;
803     while (li>0) {
804     li--;
805     if (seq[li]==polyChar) {
806     loc.add(li,poly_m_score);
807     }
808     else if (seq[li]=='N') {
809     loc.add(li,0);
810     }
811     else { //mismatch
812     loc.add(li,poly_mis_score);
813     if (maxloc.score-loc.score>poly_dropoff_score) break;
814     }
815     if (maxloc.score<=loc.score) {
816     maxloc=loc;
817     }
818     }
819     if (maxloc.score>poly_minScore && ri>=rlen-3) {
820     l5=li;
821     l3=ri;
822     return true;
823     }
824     return false;
825 gpertea 12 }
826    
827 gpertea 94
828     bool trim_poly5(GStr &seq, int &l5, int &l3, const char* poly_seed) {
829 gpertea 88 int rlen=seq.length();
830     l5=0;
831     l3=rlen-1;
832 gpertea 94 int32 seedVal=*(int32*)poly_seed;
833     char polyChar=poly_seed[0];
834 gpertea 88 //assumes N trimming was already done
835     //so a poly match should be very close to the end of the read
836     // -- find the initial match (seed)
837 gpertea 94 int lmax=GMIN(8, rlen-4);//how far from 5' end to look for 4-mer seeds
838     int li;
839     for (li=0;li<=lmax;li++) {
840     if (seedVal==*(int*)&(seq[li])) {
841 gpertea 88 break;
842     }
843     }
844 gpertea 94 if (li>lmax) return false;
845     //seed found, try to extend it both ways
846     //extend left
847     int ri=li+3; //save rightmost base of the seed
848     SLocScore loc(li, poly_m_score<<2);
849     SLocScore maxloc(loc);
850     while (li>0) {
851     li--;
852     if (seq[li]==polyChar) {
853     loc.add(li,poly_m_score);
854     }
855     else if (seq[li]=='N') {
856     loc.add(li,0);
857     }
858     else { //mismatch
859     loc.add(li,poly_mis_score);
860     if (maxloc.score-loc.score>poly_dropoff_score) break;
861     }
862     if (maxloc.score<=loc.score) {
863     maxloc=loc;
864     }
865 gpertea 88 }
866 gpertea 94 li=maxloc.pos;
867     if (li>3) return false; //no trimming wanted, too far from 5' end
868     //li = right boundary for the poly match
869    
870     //extend right
871     loc.set(ri, maxloc.score);
872     maxloc.pos=ri;
873     while (ri<rlen-2) {
874     ri++;
875     if (seq[ri]==polyChar) {
876     loc.add(ri,poly_m_score);
877     }
878     else if (seq[ri]=='N') {
879     loc.add(ri,0);
880     }
881     else { //mismatch
882     loc.add(ri,poly_mis_score);
883     if (maxloc.score-loc.score>poly_dropoff_score) break;
884     }
885     if (maxloc.score<=loc.score) {
886     maxloc=loc;
887     }
888     }
889    
890     if (maxloc.score>poly_minScore && li<=3) {
891     l5=li;
892     l3=ri;
893     return true;
894     }
895     return false;
896 gpertea 88 }
897    
898 gpertea 4 bool trim_adapter3(GStr& seq, int&l5, int &l3) {
899     int rlen=seq.length();
900     l5=0;
901     l3=rlen-1;
902 gpertea 12
903 gpertea 4 return false; //no adapter parts found
904     }
905    
906     bool trim_adapter5(GStr& seq, int&l5, int &l3) {
907     int rlen=seq.length();
908     l5=0;
909     l3=rlen-1;
910 gpertea 102 bool trimmed=false;
911     for (int ai=0;ai<adapters5.Count();ai++) {
912     if (adapters5[ai].is_empty()) continue;
913     int a5len=adapters5[ai].length();
914     GStr& adapter5=adapters5[ai];
915 gpertea 12
916 gpertea 102 }//for each 5' adapter
917     return trimmed;
918 gpertea 98 }
919 gpertea 4
920 gpertea 12 //convert qvs to/from phred64 from/to phread33
921     void convertPhred(GStr& q) {
922     for (int i=0;i<q.length();i++) q[i]+=qv_cvtadd;
923 gpertea 4 }
924    
925 gpertea 12 void convertPhred(char* q, int len) {
926     for (int i=0;i<len;i++) q[i]+=qv_cvtadd;
927 gpertea 4 }
928    
929 gpertea 13 bool getFastxRec(GLineReader& fq, GStr& rseq, GStr& rqv,
930     GStr& rname, GStr& rinfo, GStr& infname) {
931     rseq="";
932     rqv="";
933     rname="";
934     rinfo="";
935     if (fq.eof()) return false;
936     char* l=fq.getLine();
937     while (l!=NULL && (l[0]==0 || isspace(l[0]))) l=fq.getLine(); //ignore empty lines
938     if (l==NULL) return false;
939     /* if (rawFormat) {
940     //TODO: implement raw qseq parsing here
941     //if (raw type=N) then continue; //skip invalid/bad records
942     } //raw qseq format
943     else { // FASTQ or FASTA */
944     isfasta=(l[0]=='>');
945     if (!isfasta && l[0]!='@') GError("Error: fasta/fastq record marker not found(%s)\n%s\n",
946     infname.chars(), l);
947     GStr s(l);
948     rname=&(l[1]);
949     for (int i=0;i<rname.length();i++)
950     if (rname[i]<=' ') {
951     if (i<rname.length()-2) rinfo=rname.substr(i+1);
952     rname.cut(i);
953     break;
954     }
955     //now get the sequence
956     if ((l=fq.getLine())==NULL)
957     GError("Error: unexpected EOF after header for read %s (%s)\n",
958     rname.chars(), infname.chars());
959     rseq=l; //this must be the DNA line
960     while ((l=fq.getLine())!=NULL) {
961     //seq can span multiple lines
962     if (l[0]=='>' || l[0]=='+') {
963     fq.pushBack();
964     break; //
965     }
966     rseq+=l;
967     } //check for multi-line seq
968     if (!isfasta) { //reading fastq quality values, which can also be multi-line
969     if ((l=fq.getLine())==NULL)
970     GError("Error: unexpected EOF after sequence for %s\n", rname.chars());
971     if (l[0]!='+') GError("Error: fastq qv header marker not detected!\n");
972     if ((l=fq.getLine())==NULL)
973     GError("Error: unexpected EOF after qv header for %s\n", rname.chars());
974     rqv=l;
975     //if (rqv.length()!=rseq.length())
976     // GError("Error: qv len != seq len for %s\n", rname.chars());
977     while (rqv.length()<rseq.length() && ((l=fq.getLine())!=NULL)) {
978     rqv+=l; //append to qv string
979     }
980     }// fastq
981     // } //<-- FASTA or FASTQ
982 gpertea 94 rseq.upper();
983 gpertea 13 return true;
984     }
985    
986     char process_read(GStr& rname, GStr& rseq, GStr& rqv, int &l5, int &l3) {
987     //returns 0 if the read was untouched, 1 if it was just trimmed
988     // and a trash code if it was trashed
989     l5=0;
990     l3=rseq.length()-1;
991     if (l3-l5+1<min_read_len) {
992     return 's';
993     }
994     GStr wseq(rseq.chars());
995     GStr wqv(rqv.chars());
996     int w5=l5;
997     int w3=l3;
998     //first do the q-based trimming
999     if (qvtrim_qmin!=0 && !wqv.is_empty() && qtrim(wqv, w5, w3)) { // qv-threshold trimming
1000     if (w3-w5+1<min_read_len) {
1001     return 'Q'; //invalid read
1002     }
1003     //-- keep only the w5..w3 range
1004     l5=w5;
1005     l3=w3;
1006     wseq=wseq.substr(w5, w3-w5+1);
1007     if (!wqv.is_empty())
1008     wqv=wqv.substr(w5, w3-w5+1);
1009     } //qv trimming
1010     // N-trimming on the remaining read seq
1011     if (ntrim(wseq, w5, w3)) {
1012     //GMessage("before: %s\n",wseq.chars());
1013     //GMessage("after : %s (%d)\n",wseq.substr(w5,w3-w5+1).chars(),w3-w5+1);
1014     l5+=w5;
1015     l3-=(wseq.length()-1-w3);
1016     if (w3-w5+1<min_read_len) {
1017     return 'N'; //to be trashed
1018     }
1019     //-- keep only the w5..w3 range
1020     wseq=wseq.substr(w5, w3-w5+1);
1021     if (!wqv.is_empty())
1022     wqv=wqv.substr(w5, w3-w5+1);
1023     w5=0;
1024     w3=wseq.length()-1;
1025     }
1026 gpertea 102 if (adapters3.Count()>0) {
1027 gpertea 13 if (trim_adapter3(wseq, w5, w3)) {
1028     int trimlen=wseq.length()-(w3-w5+1);
1029     num_trimmed3++;
1030     if (trimlen<min_trimmed3)
1031     min_trimmed3=trimlen;
1032     l5+=w5;
1033     l3-=(wseq.length()-1-w3);
1034     if (w3-w5+1<min_read_len) {
1035     return '3';
1036     }
1037     //-- keep only the w5..w3 range
1038     wseq=wseq.substr(w5, w3-w5+1);
1039     if (!wqv.is_empty())
1040     wqv=wqv.substr(w5, w3-w5+1);
1041     }//some adapter was trimmed
1042     } //adapter trimming
1043 gpertea 102 if (adapters5.Count()>0) {
1044 gpertea 13 if (trim_adapter5(wseq, w5, w3)) {
1045     int trimlen=wseq.length()-(w3-w5+1);
1046     num_trimmed5++;
1047     if (trimlen<min_trimmed5)
1048     min_trimmed5=trimlen;
1049     l5+=w5;
1050     l3-=(wseq.length()-1-w3);
1051     if (w3-w5+1<min_read_len) {
1052     return '5';
1053     }
1054     //-- keep only the w5..w3 range
1055     wseq=wseq.substr(w5, w3-w5+1);
1056     if (!wqv.is_empty())
1057     wqv=wqv.substr(w5, w3-w5+1);
1058     }//some adapter was trimmed
1059     } //adapter trimming
1060     if (doCollapse) {
1061     //keep read for later
1062     FqDupRec* dr=dhash.Find(wseq.chars());
1063     if (dr==NULL) { //new entry
1064     //if (prefix.is_empty())
1065     dhash.Add(wseq.chars(),
1066     new FqDupRec(&wqv, rname.chars()));
1067     //else dhash.Add(wseq.chars(), new FqDupRec(wqv.chars(),wqv.length()));
1068     }
1069     else
1070     dr->add(wqv);
1071     } //collapsing duplicates
1072     else { //not collapsing duplicates
1073     //apply the dust filter now
1074     if (doDust) {
1075     int dustbases=dust(wseq);
1076     if (dustbases>(wseq.length()>>1)) {
1077     return 'D';
1078     }
1079     }
1080     } //not collapsing duplicates
1081     return (l5>0 || l3<rseq.length()-1) ? 1 : 0;
1082     }
1083    
1084     void printHeader(FILE* f_out, char recmarker, GStr& rname, GStr& rinfo) {
1085     //GMessage("printing Header..%c%s\n",recmarker, rname.chars());
1086     if (rinfo.is_empty()) fprintf(f_out, "%c%s\n",recmarker,rname.chars());
1087     else fprintf(f_out, "%c%s %s\n",recmarker, rname.chars(), rinfo.chars());
1088     }
1089    
1090     void writeRead(FILE* f_out, GStr& rname, GStr& rinfo, GStr& rseq, GStr& rqv, int& outcounter) {
1091     outcounter++;
1092     bool asFasta=(rqv.is_empty() || fastaOutput);
1093     if (asFasta) {
1094     if (prefix.is_empty()) {
1095     printHeader(f_out, '>',rname,rinfo);
1096     fprintf(f_out, "%s\n", rseq.chars()); //plain one-line fasta for now
1097     }
1098     else {
1099     fprintf(f_out, ">%s%08d\n%s\n", prefix.chars(), outcounter,
1100     rseq.chars());
1101     }
1102     }
1103     else { //fastq
1104     if (convert_phred) convertPhred(rqv);
1105     if (prefix.is_empty()) {
1106     printHeader(f_out, '@', rname, rinfo);
1107     fprintf(f_out, "%s\n+\n%s\n", rseq.chars(), rqv.chars());
1108     }
1109     else
1110     fprintf(f_out, "@%s_%08d\n%s\n+\n%s\n", prefix.chars(), outcounter,
1111     rseq.chars(),rqv.chars() );
1112     }
1113     }
1114    
1115     void trash_report(char trashcode, GStr& rname, FILE* freport) {
1116     if (freport==NULL || trashcode<=' ') return;
1117     if (trashcode=='3' || trashcode=='5') {
1118     fprintf(freport, "%s\tA%c\n",rname.chars(),trashcode);
1119     }
1120     else {
1121     fprintf(freport, "%s\t%c\n",rname.chars(),trashcode);
1122     }
1123     //tcounter++;
1124     }
1125    
1126     GStr getFext(GStr& s, int* xpos=NULL) {
1127     //s must be a filename without a path
1128     GStr r("");
1129     if (xpos!=NULL) *xpos=0;
1130     if (s.is_empty() || s=="-") return r;
1131     int p=s.rindex('.');
1132     int d=s.rindex('/');
1133     if (p<=0 || p>s.length()-2 || p<s.length()-7 || p<d) return r;
1134     r=s.substr(p+1);
1135     if (xpos!=NULL) *xpos=p+1;
1136     r.lower();
1137     return r;
1138     }
1139    
1140     void baseFileName(GStr& fname) {
1141     //remove all known extensions, like .txt,fq,fastq,fasta,fa)(.gz .gzip .bz2 .bzip2) .
1142     int xpos=0;
1143     GStr fext=getFext(fname, &xpos);
1144     if (xpos<=1) return;
1145     bool extdel=false;
1146     GStr f2;
1147     if (fext=="z" || fext=="zip") {
1148     extdel=true;
1149     }
1150     else if (fext.length()>=2) {
1151     f2=fext.substr(0,2);
1152     extdel=(f2=="gz" || f2=="bz");
1153     }
1154     if (extdel) {
1155     fname.cut(xpos-1);
1156     fext=getFext(fname, &xpos);
1157     if (xpos<=1) return;
1158     }
1159     extdel=false;
1160     if (fext=="f" || fext=="fq" || fext=="txt" || fext=="seq" || fext=="sequence") {
1161     extdel=true;
1162     }
1163     else if (fext.length()>=2) {
1164     extdel=(fext.substr(0,2)=="fa");
1165     }
1166     if (extdel) fname.cut(xpos-1);
1167     GStr fncp(fname);
1168     fncp.lower();
1169     fncp.chomp("seq");
1170     fncp.chomp("sequence");
1171     fncp.trimR("_.");
1172     if (fncp.length()<fname.length()) fname.cut(fncp.length());
1173     }
1174    
1175     FILE* prepOutFile(GStr& infname, GStr& pocmd) {
1176     FILE* f_out=NULL;
1177     GStr fname(getFileName(infname.chars()));
1178     //eliminate known extensions
1179     baseFileName(fname);
1180     if (outsuffix.is_empty() || outsuffix=="-") { return stdout; }
1181     else if (pocmd.is_empty()) {
1182     GStr oname(fname);
1183     oname.append('.');
1184     oname.append(outsuffix);
1185     f_out=fopen(oname.chars(),"w");
1186     if (f_out==NULL) GError("Error: cannot create '%s'\n",oname.chars());
1187     }
1188     else {
1189     GStr oname(">");
1190     oname.append(fname);
1191     oname.append('.');
1192     oname.append(outsuffix);
1193     pocmd.append(oname);
1194     f_out=popen(pocmd.chars(), "w");
1195     if (f_out==NULL) GError("Error: cannot popen '%s'\n",pocmd.chars());
1196     }
1197     return f_out;
1198     }
1199    
1200     void guess_unzip(GStr& fname, GStr& picmd) {
1201     GStr fext=getFext(fname);
1202     if (fext=="gz" || fext=="gzip" || fext=="z") {
1203     picmd="gzip -cd ";
1204     }
1205     else if (fext=="bz2" || fext=="bzip2" || fext=="bz" || fext=="bzip") {
1206     picmd="bzip2 -cd ";
1207     }
1208     }
1209    
1210 gpertea 94
1211     int loadAdapters(const char* fname) {
1212     GLineReader lr(fname);
1213     char* l;
1214     while ((l=lr.nextLine())!=NULL) {
1215     if (lr.length()<=3 || l[0]=='#') continue;
1216 gpertea 98 if ( l[0]==' ' || l[0]=='\t' || l[0]==',' ||
1217     l[0]==';'|| l[0]==':' ) {
1218 gpertea 94 int i=1;
1219     while (l[i]!=0 && isspace(l[i])) {
1220     i++;
1221     }
1222     if (l[i]!=0) {
1223 gpertea 102 GStr s(&(l[i]));
1224     adapters3.Add(s);
1225 gpertea 94 continue;
1226     }
1227     }
1228     else {
1229 gpertea 98 GStr s(l);
1230     s.startTokenize("\t ;,:");
1231     GStr a5,a3;
1232     if (s.nextToken(a5))
1233     s.nextToken(a3);
1234 gpertea 102 a5.upper();
1235     a3.upper();
1236     adapters5.Add(a5);
1237     adapters3.Add(a3);
1238 gpertea 94 }
1239     }
1240 gpertea 102 return adapters5.Count()+adapters3.Count();
1241 gpertea 94 }
1242    
1243 gpertea 13 void setupFiles(FILE*& f_in, FILE*& f_in2, FILE*& f_out, FILE*& f_out2,
1244     GStr& s, GStr& infname, GStr& infname2) {
1245     // uses outsuffix to generate output file names and open file handles as needed
1246     infname="";
1247     infname2="";
1248     f_in=NULL;
1249     f_in2=NULL;
1250     f_out=NULL;
1251     f_out2=NULL;
1252     //analyze outsuffix intent
1253     GStr pocmd;
1254 gpertea 76 if (outsuffix=="-") {
1255     f_out=stdout;
1256     }
1257     else {
1258     GStr ox=getFext(outsuffix);
1259     if (ox.length()>2) ox=ox.substr(0,2);
1260     if (ox=="gz") pocmd="gzip -9 -c ";
1261     else if (ox=="bz") pocmd="bzip2 -9 -c ";
1262     }
1263 gpertea 13 if (s=="-") {
1264     f_in=stdin;
1265     infname="stdin";
1266     f_out=prepOutFile(infname, pocmd);
1267     return;
1268     } // streaming from stdin
1269     s.startTokenize(",:");
1270     s.nextToken(infname);
1271     bool paired=s.nextToken(infname2);
1272     if (fileExists(infname.chars())==0)
1273     GError("Error: cannot find file %s!\n",infname.chars());
1274     GStr fname(getFileName(infname.chars()));
1275     GStr picmd;
1276     guess_unzip(fname, picmd);
1277     if (picmd.is_empty()) {
1278     f_in=fopen(infname.chars(), "r");
1279     if (f_in==NULL) GError("Error opening file '%s'!\n",infname.chars());
1280     }
1281     else {
1282     picmd.append(infname);
1283     f_in=popen(picmd.chars(), "r");
1284     if (f_in==NULL) GError("Error at popen %s!\n", picmd.chars());
1285     }
1286 gpertea 76 if (f_out==stdout) {
1287     if (paired) GError("Error: output suffix required for paired reads\n");
1288     return;
1289     }
1290 gpertea 13 f_out=prepOutFile(infname, pocmd);
1291     if (!paired) return;
1292     if (doCollapse) GError("Error: sorry, -C option cannot be used with paired reads!\n");
1293     // ---- paired reads:-------------
1294     if (fileExists(infname2.chars())==0)
1295     GError("Error: cannot find file %s!\n",infname2.chars());
1296     picmd="";
1297     GStr fname2(getFileName(infname2.chars()));
1298     guess_unzip(fname2, picmd);
1299     if (picmd.is_empty()) {
1300     f_in2=fopen(infname2.chars(), "r");
1301     if (f_in2==NULL) GError("Error opening file '%s'!\n",infname2.chars());
1302     }
1303     else {
1304     picmd.append(infname2);
1305     f_in2=popen(picmd.chars(), "r");
1306     if (f_in2==NULL) GError("Error at popen %s!\n", picmd.chars());
1307     }
1308     f_out2=prepOutFile(infname2, pocmd);
1309     }