ViewVC Help
View File | Revision Log | Show Annotations | View Changeset | Root Listing
root/gclib/fqtrim/fqtrim.cpp
Revision: 103
Committed: Mon Oct 10 19:33:39 2011 UTC (8 years ago) by gpertea
File size: 41090 byte(s)
Log Message:
wip

Line File contents
1 #include "GArgs.h"
2 #include "GStr.h"
3 #include "GHash.hh"
4 #include "GList.hh"
5 #include <ctype.h>
6 #include "GAlnExtend.h"
7
8 #define USAGE "Usage:\n\
9 fqtrim [{-5 <5adapter> -3 <3adapter>|-f <adapters_file>}] [-a <min_matchlen>]\\\n\
10 [-q <minq> [-t <trim_max_len>]] [-p {64|33}] [-o <outsuffix>]\\\n\
11 [-l <minlen>] [-C] [-D] [-Q] [-n <rename_prefix>] [-r <discarded.lst>]\\\n\
12 <input.fq>[,<input_mates.fq>\n\
13 \n\
14 Trim low quality bases at the 3' end and can trim adapter sequence(s), filter\n\
15 for low complexity and collapse duplicate reads.\n\
16 If read pairs should be trimmed and kept together (i.e. without discarding\n\
17 one read in a pair), the two file names should be given delimited by a comma\n\
18 or a colon character\n\
19 \n\
20 Options:\n\
21 -n rename all the reads using the <prefix> followed by a read counter;\n\
22 if -C option was given, the suffix \"_x<N>\" is appended, with <N> being\n\
23 the read duplication count\n\
24 -o unless this parameter is '-', write the trimmed/filtered reads to \n\
25 file(s) named <input>.<outsuffix> which will be created in the \n\
26 current (working) directory; (writes to stdout if -o- is given);\n\
27 a suffix ending with .gz, .gzip or .bz2 will enforce compression\n\
28 -f file with adapter sequences to trim, each line having this format:\n\
29 <5'-adapter-sequence> <3'-adapter-sequence>\n\
30 -5 trim the given adapter or primer sequence at the 5' end of each read\n\
31 (e.g. -5 CGACAGGTTCAGAGTTCTACAGTCCGACGATC)\n\
32 -3 trim the given adapter sequence at the 3' end of each read\n\
33 (e.g. -3 TCGTATGCCGTCTTCTGCTTG)\n\
34 -A disable polyA/T trimming (enabled by default)\n\
35 -y minimum length of exact match to adaptor sequence at the proper end (6)\n\
36 -q trim bases with quality value lower than <minq> (starting at the 3' end)\n\
37 -t for -q option, maximum trimming at the 3' end is limited to <trim_max_len>\n\
38 -m maximum percentage of Ns allowed in a read after trimming (default 7)\n\
39 -l minimum \"clean\" length after trimming that a read must have\n\
40 in order to pass the filter (default: 16)\n\
41 -r write a simple \"trash report\" file listing the discarded reads with a\n\
42 one letter code for the reason why they were trashed\n\
43 -D apply a low-complexity (dust) filter and discard any read that has over\n\
44 50% of its length detected as low complexity\n\
45 -C collapse duplicate reads and add a prefix with their count to the read\n\
46 name (e.g. for microRNAs)\n\
47 -p input is phred64/phred33 (use -p64 or -p33)\n\
48 -Q convert quality values to the other Phred qv type\n\
49 -V verbose processing\n\
50 "
51
52 //-z for -o option, the output stream(s) will be first piped into the given\n
53 // <zcmd> command, which must output to stdout (e.g. -z 'bzip2 -9 -c')\n
54
55
56 // example 3' adapter for miRNAs: TCGTATGCCGTCTTCTGCTTG
57
58 //For paired reads sequencing:
59 //3' : ACACTCTTTCCCTACACGACGCTCTTCCGATCT
60 //5' : GATCGGAAGAGCGGTTCAGCAGGAATGCCGAG
61 //FILE* f_out=NULL; //stdout if not provided
62 //FILE* f_out2=NULL; //for paired reads
63 //FILE* f_in=NULL; //input fastq (stdin if not provided)
64 //FILE* f_in2=NULL; //for paired reads
65 FILE* freport=NULL;
66
67 bool debug=false;
68 bool verbose=false;
69 bool doCollapse=false;
70 bool doDust=false;
71 bool doPolyTrim=true;
72 bool fastaOutput=false;
73 bool trashReport=false;
74 //bool rawFormat=false;
75 int min_read_len=16;
76 double max_perc_N=7.0;
77 int dust_cutoff=16;
78 bool isfasta=false;
79 bool convert_phred=false;
80 GStr outsuffix; // -o
81 GStr prefix;
82 GStr zcmd;
83 int num_trimmed5=0;
84 int num_trimmed3=0;
85 int num_trimmedN=0;
86 int num_trimmedQ=0;
87 int min_trimmed5=INT_MAX;
88 int min_trimmed3=INT_MAX;
89
90 int qvtrim_qmin=0;
91 int qvtrim_max=0; //for -q, do not trim at 3'-end more than this number of bases
92 int qv_phredtype=0; // could be 64 or 33 (0 means undetermined yet)
93 int qv_cvtadd=0; //could be -31 or +31
94
95 // adaptor matching metrics -- for X-drop ungapped extension
96 const int match_reward=2;
97 const int mismatch_penalty=3;
98 const int Xdrop=8;
99
100 const int poly_m_score=2; //match score
101 const int poly_mis_score=-3; //mismatch
102 const int poly_dropoff_score=7;
103 int poly_minScore=12; //i.e. an exact match of 6 bases at the proper ends WILL be trimmed
104
105 const char *polyA_seed="AAAA";
106 const char *polyT_seed="TTTT";
107 GVec<GStr> adapters5;
108 GVec<GStr> adapters3;
109
110 CGreedyAlignData* gxmem_l=NULL;
111 CGreedyAlignData* gxmem_r=NULL;
112
113 // element in dhash:
114 class FqDupRec {
115 public:
116 int count; //how many of these reads are the same
117 int len; //length of qv
118 char* firstname; //optional, only if we want to keep the original read names
119 char* qv;
120 FqDupRec(GStr* qstr=NULL, const char* rname=NULL) {
121 len=0;
122 qv=NULL;
123 firstname=NULL;
124 count=0;
125 if (qstr!=NULL) {
126 qv=Gstrdup(qstr->chars());
127 len=qstr->length();
128 count++;
129 }
130 if (rname!=NULL) firstname=Gstrdup(rname);
131 }
132 ~FqDupRec() {
133 GFREE(qv);
134 GFREE(firstname);
135 }
136 void add(GStr& d) { //collapse another record into this one
137 if (d.length()!=len)
138 GError("Error at FqDupRec::add(): cannot collapse reads with different length!\n");
139 count++;
140 for (int i=0;i<len;i++)
141 qv[i]=(qv[i]+d[i])/2;
142 }
143 };
144
145 void openfw(FILE* &f, GArgs& args, char opt) {
146 GStr s=args.getOpt(opt);
147 if (!s.is_empty()) {
148 if (s=='-') f=stdout;
149 else {
150 f=fopen(s.chars(),"w");
151 if (f==NULL) GError("Error creating file: %s\n", s.chars());
152 }
153 }
154 }
155
156 #define FWCLOSE(fh) if (fh!=NULL && fh!=stdout) fclose(fh)
157 #define FRCLOSE(fh) if (fh!=NULL && fh!=stdin) fclose(fh)
158
159 GHash<FqDupRec> dhash; //hash to keep track of duplicates
160
161 int loadAdapters(const char* fname);
162
163 void setupFiles(FILE*& f_in, FILE*& f_in2, FILE*& f_out, FILE*& f_out2,
164 GStr& s, GStr& infname, GStr& infname2);
165 // uses outsuffix to generate output file names and open file handles as needed
166
167 void writeRead(FILE* f_out, GStr& rname, GStr& rinfo, GStr& rseq, GStr& rqv, int& outcounter);
168 void trash_report(char trashcode, GStr& rname, FILE* freport);
169
170 bool getFastxRec(GLineReader& fq, GStr& rseq, GStr& rqv,
171 GStr& rname, GStr& rinfo, GStr& infname);
172
173 char process_read(GStr& rname, GStr& rseq, GStr& rqv, int &l5, int &l3);
174 //returns 0 if the read was untouched, 1 if it was trimmed and a trash code if it was trashed
175
176 bool ntrim(GStr& rseq, int &l5, int &l3); //returns true if any trimming occured
177 bool qtrim(GStr& qvs, int &l5, int &l3); //return true if any trimming occured
178 int dust(GStr& seq);
179 bool trim_poly5(GStr &seq, int &l5, int &l3, const char* poly_seed); //returns true if any trimming occured
180 bool trim_poly3(GStr &seq, int &l5, int &l3, const char* poly_seed);
181 bool trim_adapter5(GStr& seq, int &l5, int &l3); //returns true if any trimming occured
182 bool trim_adapter3(GStr& seq, int &l5, int &l3);
183
184 void convertPhred(char* q, int len);
185 void convertPhred(GStr& q);
186
187 int main(int argc, char * const argv[]) {
188 GArgs args(argc, argv, "YQDCVAl:d:3:5:m:n:r:p:q:f:t:o:z:a:");
189 int e;
190 if ((e=args.isError())>0) {
191 GMessage("%s\nInvalid argument: %s\n", USAGE, argv[e]);
192 exit(224);
193 }
194 debug=(args.getOpt('Y')!=NULL);
195 verbose=(args.getOpt('V')!=NULL);
196 convert_phred=(args.getOpt('Q')!=NULL);
197 doCollapse=(args.getOpt('C')!=NULL);
198 doDust=(args.getOpt('D')!=NULL);
199 if (args.getOpt('A')) doPolyTrim=false;
200 /*
201 rawFormat=(args.getOpt('R')!=NULL);
202 if (rawFormat) {
203 GError("Sorry, raw qseq format parsing is not implemented yet!\n");
204 }
205 */
206 prefix=args.getOpt('n');
207 GStr s=args.getOpt('l');
208 if (!s.is_empty())
209 min_read_len=s.asInt();
210 s=args.getOpt('m');
211 if (!s.is_empty())
212 max_perc_N=s.asDouble();
213 s=args.getOpt('d');
214 if (!s.is_empty()) {
215 dust_cutoff=s.asInt();
216 doDust=true;
217 }
218 s=args.getOpt('q');
219 if (!s.is_empty()) {
220 qvtrim_qmin=s.asInt();
221 }
222 s=args.getOpt('t');
223 if (!s.is_empty()) {
224 qvtrim_max=s.asInt();
225 }
226 s=args.getOpt('p');
227 if (!s.is_empty()) {
228 int v=s.asInt();
229 if (v==33) {
230 qv_phredtype=33;
231 qv_cvtadd=31;
232 }
233 else if (v==64) {
234 qv_phredtype=64;
235 qv_cvtadd=-31;
236 }
237 else
238 GMessage("%s\nInvalid value for -p option (can only be 64 or 33)!\n",USAGE);
239 }
240 s=args.getOpt('f');
241 if (!s.is_empty()) {
242 loadAdapters(s.chars());
243 }
244 bool fileAdapters=adapters5.Count()+adapters3.Count();
245 s=args.getOpt('5');
246 if (!s.is_empty()) {
247 if (fileAdapters)
248 GError("Error: options -5 and -f cannot be used together!\n");
249 s.upper();
250 adapters5.Add(s);
251 }
252 s=args.getOpt('3');
253 if (!s.is_empty()) {
254 if (fileAdapters)
255 GError("Error: options -3 and -f cannot be used together!\n");
256 s.upper();
257 adapters3.Add(s);
258 }
259 s=args.getOpt('y');
260 if (!s.is_empty()) {
261 int minmatch=s.asInt();
262 poly_minScore=minmatch*poly_m_score;
263 }
264
265 if (args.getOpt('o')!=NULL) outsuffix=args.getOpt('o');
266 else outsuffix="-";
267 trashReport= (args.getOpt('r')!=NULL);
268 int fcount=args.startNonOpt();
269 if (fcount==0) {
270 GMessage(USAGE);
271 exit(224);
272 }
273 if (fcount>1 && doCollapse) {
274 GError("%s Sorry, the -C option only works with a single input.\n", USAGE);
275 }
276 //openfw(f_out, args, 'o');
277 //if (f_out==NULL) f_out=stdout;
278 if (trashReport)
279 openfw(freport, args, 'r');
280 char* infile=NULL;
281
282 if (adapters5.Count()>0)
283 gxmem_l=new CGreedyAlignData(match_reward, mismatch_penalty, Xdrop-2);
284 if (adapters3.Count()>0)
285 gxmem_r=new CGreedyAlignData(match_reward, mismatch_penalty, Xdrop);
286
287 while ((infile=args.nextNonOpt())!=NULL) {
288 //for each input file
289 int incounter=0; //counter for input reads
290 int outcounter=0; //counter for output reads
291 int trash_s=0; //too short from the get go
292 int trash_Q=0;
293 int trash_N=0;
294 int trash_D=0;
295 int trash_poly=0;
296 int trash_A3=0;
297 int trash_A5=0;
298 s=infile;
299 GStr infname;
300 GStr infname2;
301 FILE* f_in=NULL;
302 FILE* f_in2=NULL;
303 FILE* f_out=NULL;
304 FILE* f_out2=NULL;
305 bool paired_reads=false;
306 setupFiles(f_in, f_in2, f_out, f_out2, s, infname, infname2);
307 GLineReader fq(f_in);
308 GLineReader* fq2=NULL;
309 if (f_in2!=NULL) {
310 fq2=new GLineReader(f_in2);
311 paired_reads=true;
312 }
313 GStr rseq, rqv, seqid, seqinfo;
314 GStr rseq2, rqv2, seqid2, seqinfo2;
315 while (getFastxRec(fq, rseq, rqv, seqid, seqinfo, infname)) {
316 incounter++;
317 int a5=0, a3=0, b5=0, b3=0;
318 char tcode=0, tcode2=0;
319 tcode=process_read(seqid, rseq, rqv, a5, a3);
320 if (fq2!=NULL) {
321 getFastxRec(*fq2, rseq2, rqv2, seqid2, seqinfo2, infname2);
322 if (seqid.substr(0,seqid.length()-1)!=seqid2.substr(0,seqid2.length()-1)) {
323 GError("Error: no paired match for read %s vs %s (%s,%s)\n",
324 seqid.chars(), seqid2.chars(), infname.chars(), infname2.chars());
325 }
326 tcode2=process_read(seqid2, rseq2, rqv2, b5, b3);
327 //decide what to do with this pair and print rseq2 if the pair makes it
328 if (tcode>1 && tcode2<=1) {
329 //"untrash" rseq
330 if (a3-a5+1<min_read_len) {
331 a5=1;
332 if (a3<min_read_len) { a3= GMIN(rseq.length()-1, min_read_len+1); }
333 }
334 tcode=1;
335 }
336 else if (tcode<=1 && tcode2>1) {
337 //"untrash" rseq2
338 if (b3-b5+1<min_read_len) {
339 b5=1;
340 if (b3<min_read_len) { b3= GMIN((rseq2.length()-1),(min_read_len+1)); }
341 }
342 tcode2=1;
343 }
344 if (tcode<=1) { //trimmed or left intact -- write it!
345 if (tcode>0) {
346 rseq2=rseq2.substr(b5,b3-b5+1);
347 if (!rqv2.is_empty()) rqv2=rqv2.substr(b5,b3-b5+1);
348 }
349 int nocounter=0;
350 writeRead(f_out2, seqid2, seqinfo2, rseq2, rqv2, nocounter);
351 }
352 } //pair read
353 if (tcode>1) { //trashed
354 #ifdef GDEBUG
355 GMessage(" !!!!TRASH => 'N'\n");
356 #endif
357 if (tcode=='s') trash_s++;
358 else if (tcode=='A' || tcode=='T') trash_poly++;
359 else if (tcode=='Q') trash_Q++;
360 else if (tcode=='N') trash_N++;
361 else if (tcode=='D') trash_D++;
362 else if (tcode=='3') trash_A3++;
363 else if (tcode=='5') trash_A5++;
364 if (trashReport) trash_report(tcode, seqid, freport);
365 }
366 else if (!doCollapse) { //write it
367 if (tcode>0) {
368 rseq=rseq.substr(a5,a3-a5+1);
369 if (!rqv.is_empty()) rqv=rqv.substr(a5,a3-a5+1);
370 }
371 #ifdef GDEBUG
372 GMessage(" After trimming:\n");
373 GMessage("<==%s\n",rseq.chars());
374 #endif
375 writeRead(f_out, seqid, seqinfo, rseq, rqv, outcounter);
376 }
377 } //for each fastq record
378 delete fq2;
379 FRCLOSE(f_in);
380 FRCLOSE(f_in2);
381 if (doCollapse) {
382 outcounter=0;
383 int maxdup_count=1;
384 char* maxdup_seq=NULL;
385 dhash.startIterate();
386 FqDupRec* qd=NULL;
387 char* seq=NULL;
388 while ((qd=dhash.NextData(seq))!=NULL) {
389 GStr rseq(seq);
390 //do the dusting here
391 if (doDust) {
392 int dustbases=dust(rseq);
393 if (dustbases>(rseq.length()>>1)) {
394 if (trashReport && qd->firstname!=NULL) {
395 fprintf(freport, "%s_x%d\tD\n",qd->firstname, qd->count);
396 }
397 trash_D+=qd->count;
398 continue;
399 }
400 }
401 outcounter++;
402 if (qd->count>maxdup_count) {
403 maxdup_count=qd->count;
404 maxdup_seq=seq;
405 }
406 if (isfasta) {
407 if (prefix.is_empty()) {
408 fprintf(f_out, ">%s_x%d\n%s\n", qd->firstname, qd->count,
409 rseq.chars());
410 }
411 else { //use custom read name
412 fprintf(f_out, ">%s%08d_x%d\n%s\n", prefix.chars(), outcounter,
413 qd->count, rseq.chars());
414 }
415 }
416 else { //fastq format
417 if (convert_phred) convertPhred(qd->qv, qd->len);
418 if (prefix.is_empty()) {
419 fprintf(f_out, "@%s_x%d\n%s\n+\n%s\n", qd->firstname, qd->count,
420 rseq.chars(), qd->qv);
421 }
422 else { //use custom read name
423 fprintf(f_out, "@%s%08d_x%d\n%s\n+\n%s\n", prefix.chars(), outcounter,
424 qd->count, rseq.chars(), qd->qv);
425 }
426 }
427 }//for each element of dhash
428 if (maxdup_count>1) {
429 GMessage("Maximum read multiplicity: x %d (read: %s)\n",maxdup_count, maxdup_seq);
430 }
431 } //collapse entries
432 if (verbose) {
433 if (paired_reads) {
434 GMessage(">Input files : %s , %s\n", infname.chars(), infname2.chars());
435 GMessage("Number of input pairs :%9d\n", incounter);
436 GMessage(" Output pairs :%9d\n", outcounter);
437 }
438 else {
439 GMessage(">Input file : %s\n", infname.chars());
440 GMessage("Number of input reads :%9d\n", incounter);
441 GMessage(" Output reads :%9d\n", outcounter);
442 }
443 GMessage("------------------------------------\n");
444 if (num_trimmed5)
445 GMessage(" 5' trimmed :%9d (min. trim: %d)\n", num_trimmed5, min_trimmed5);
446 if (num_trimmed3)
447 GMessage(" 3' trimmed :%9d (min. trim: %d)\n", num_trimmed3, min_trimmed3);
448 GMessage("------------------------------------\n");
449 if (trash_s>0)
450 GMessage(" Trashed by length:%9d\n", trash_s);
451 if (trash_N>0)
452 GMessage(" Trashed by N%%:%9d\n", trash_N);
453 if (trash_Q>0)
454 GMessage("Trashed by low quality:%9d\n", trash_Q);
455 if (trash_poly>0)
456 GMessage(" Trashed by poly-A/T:%9d\n", trash_poly);
457 if (trash_A5>0)
458 GMessage(" Trashed by 5' adapter:%9d\n", trash_A5);
459 if (trash_A3>0)
460 GMessage(" Trashed by 3' adapter:%9d\n", trash_A3);
461 }
462 if (trashReport) {
463 FWCLOSE(freport);
464 }
465 FWCLOSE(f_out);
466 FWCLOSE(f_out2);
467 } //while each input file
468 delete gxmem_l;
469 delete gxmem_r;
470 //getc(stdin);
471 }
472
473 class NData {
474 public:
475 int NPos[1024]; //there should be no reads longer than 1K ?
476 int NCount;
477 int end5;
478 int end3;
479 int seqlen;
480 double perc_N; //percentage of Ns in end5..end3 range only!
481 const char* seq;
482 bool valid;
483 NData() {
484 NCount=0;
485 end5=0;
486 end3=0;
487 seq=NULL;
488 perc_N=0;
489 valid=true;
490 }
491 void init(GStr& rseq) {
492 NCount=0;
493 perc_N=0;
494 seqlen=rseq.length();
495 seq=rseq.chars();
496 end5=1;
497 end3=seqlen;
498 for (int i=0;i<seqlen;i++)
499 if (seq[i]=='N') {// if (!ichrInStr(rseq[i], "ACGT")
500 NPos[NCount]=i;
501 NCount++;
502 }
503 perc_N=(NCount*100.0)/seqlen;
504 valid=true;
505 }
506 void N_calc() { //only in the region after trimming
507 int n=0;
508 for (int i=end3-1;i<end5;i++) {
509 if (seq[i]=='N') n++;
510 }
511 perc_N=(n*100.0)/(end5-end3+1);
512 }
513 };
514
515 static NData feat;
516 int perc_lenN=12; // incremental distance from ends, in percentage of
517 // sequence length, where N-trimming is done (default:12 %) (autolimited to 20)
518
519 void N_analyze(int l5, int l3, int p5, int p3) {
520 /* assumes feat was filled properly */
521 int old_dif, t5,t3,v;
522 if (l3<l5+2 || p5>p3 ) {
523 feat.end5=l5+1;
524 feat.end3=l3+1;
525 return;
526 }
527
528 t5=feat.NPos[p5]-l5;
529 t3=l3-feat.NPos[p3];
530 old_dif=p3-p5;
531 v=(int)((((double)(l3-l5))*perc_lenN)/100);
532 if (v>20) v=20; /* enforce N-search limit for very long reads */
533 if (t5 < v ) {
534 l5=feat.NPos[p5]+1;
535 p5++;
536 }
537 if (t3 < v) {
538 l3=feat.NPos[p3]-1;
539 p3--;
540 }
541 /* restNs=p3-p5; number of Ns in the new CLR */
542 if (p3-p5==old_dif) { /* no change, return */
543 /* don't trim if this may shorten the read too much */
544 //if (l5-l3<min_read_len) return;
545 feat.end5=l5+1;
546 feat.end3=l3+1;
547 return;
548 }
549 else
550 N_analyze(l5,l3, p5,p3);
551 }
552
553
554 bool qtrim(GStr& qvs, int &l5, int &l3) {
555 if (qvtrim_qmin==0 || qvs.is_empty()) return false;
556 if (qv_phredtype==0) {
557 //try to guess the Phred type
558 int vmin=256, vmax=0;
559 for (int i=0;i<qvs.length();i++) {
560 if (vmin>qvs[i]) vmin=qvs[i];
561 if (vmax<qvs[i]) vmax=qvs[i];
562 }
563 if (vmin<64) { qv_phredtype=33; qv_cvtadd=31; }
564 if (vmax>95) { qv_phredtype=64; qv_cvtadd=-31; }
565 if (qv_phredtype==0) {
566 GError("Error: couldn't determine Phred type, please use the -p33 or -p64 !\n");
567 }
568 if (verbose)
569 GMessage("Input reads have Phred-%d quality values.\n", (qv_phredtype==33 ? 33 : 64));
570 } //guessing Phred type
571 for (l3=qvs.length()-1;l3>2;l3--) {
572 if (qvs[l3]-qv_phredtype>=qvtrim_qmin && qvs[l3-1]-qv_phredtype>=qvtrim_qmin) break;
573 }
574 //just in case, check also the 5' the end (?)
575 for (l5=0;l5<qvs.length()-3;l5++) {
576 if (qvs[l5]-qv_phredtype>=qvtrim_qmin && qvs[l5+1]-qv_phredtype>=qvtrim_qmin) break;
577 }
578 if (qvtrim_max>0) {
579 if (qvs.length()-1-l3>qvtrim_max) l3=qvs.length()-1-qvtrim_max;
580 if (l5>qvtrim_max) l5=qvtrim_max;
581 }
582 return (l5>0 || l3<qvs.length()-1);
583 }
584
585 bool ntrim(GStr& rseq, int &l5, int &l3) {
586 //count Ns in the sequence, trim N-rich ends
587 feat.init(rseq);
588 l5=feat.end5-1;
589 l3=feat.end3-1;
590 N_analyze(feat.end5-1, feat.end3-1, 0, feat.NCount-1);
591 if (l5==feat.end5-1 && l3==feat.end3-1) {
592 if (feat.perc_N>max_perc_N) {
593 feat.valid=false;
594 l3=l5+1;
595 return true;
596 }
597 else {
598 return false; //nothing changed
599 }
600 }
601 l5=feat.end5-1;
602 l3=feat.end3-1;
603 if (l3-l5+1<min_read_len) {
604 feat.valid=false;
605 return true;
606 }
607 feat.N_calc();
608
609 if (feat.perc_N>max_perc_N) {
610 feat.valid=false;
611 l3=l5+1;
612 return true;
613 }
614 return true;
615 }
616
617 //--------------- dust functions ----------------
618 class DNADuster {
619 public:
620 int dustword;
621 int dustwindow;
622 int dustwindow2;
623 int dustcutoff;
624 int mv, iv, jv;
625 int counts[32*32*32];
626 int iis[32*32*32];
627 DNADuster(int cutoff=16, int winsize=32, int wordsize=3) {
628 dustword=wordsize;
629 dustwindow=winsize;
630 dustwindow2 = (winsize>>1);
631 dustcutoff=cutoff;
632 mv=0;
633 iv=0;
634 jv=0;
635 }
636 void setWindowSize(int value) {
637 dustwindow = value;
638 dustwindow2 = (dustwindow >> 1);
639 }
640 void setWordSize(int value) {
641 dustword=value;
642 }
643 void wo1(int len, const char* s, int ivv) {
644 int i, ii, j, v, t, n, n1, sum;
645 int js, nis;
646 n = 32 * 32 * 32;
647 n1 = n - 1;
648 nis = 0;
649 i = 0;
650 ii = 0;
651 sum = 0;
652 v = 0;
653 for (j=0; j < len; j++, s++) {
654 ii <<= 5;
655 if (*s<=32) {
656 i=0;
657 continue;
658 }
659 ii |= *s - 'A'; //assume uppercase!
660 ii &= n1;
661 i++;
662 if (i >= dustword) {
663 for (js=0; js < nis && iis[js] != ii; js++) ;
664 if (js == nis) {
665 iis[nis] = ii;
666 counts[ii] = 0;
667 nis++;
668 }
669 if ((t = counts[ii]) > 0) {
670 sum += t;
671 v = 10 * sum / j;
672 if (mv < v) {
673 mv = v;
674 iv = ivv;
675 jv = j;
676 }
677 }
678 counts[ii]++;
679 }
680 }
681 }
682
683 int wo(int len, const char* s, int* beg, int* end) {
684 int i, l1;
685 l1 = len - dustword + 1;
686 if (l1 < 0) {
687 *beg = 0;
688 *end = len - 1;
689 return 0;
690 }
691 mv = 0;
692 iv = 0;
693 jv = 0;
694 for (i=0; i < l1; i++) {
695 wo1(len-i, s+i, i);
696 }
697 *beg = iv;
698 *end = iv + jv;
699 return mv;
700 }
701
702 void dust(const char* seq, char* seqmsk, int seqlen, int cutoff=0) { //, maskFunc maskfn) {
703 int i, j, l, a, b, v;
704 if (cutoff==0) cutoff=dustcutoff;
705 a=0;b=0;
706 //GMessage("Dust cutoff=%d\n", cutoff);
707 for (i=0; i < seqlen; i += dustwindow2) {
708 l = (seqlen > i+dustwindow) ? dustwindow : seqlen-i;
709 v = wo(l, seq+i, &a, &b);
710 if (v > cutoff) {
711 //for (j = a; j <= b && j < dustwindow2; j++) {
712 for (j = a; j <= b; j++) {
713 seqmsk[i+j]='N';//could be made lowercase instead
714 }
715 }
716 }
717 //return first;
718 }
719 };
720
721 static DNADuster duster;
722
723 int dust(GStr& rseq) {
724 char* seq=Gstrdup(rseq.chars());
725 duster.dust(rseq.chars(), seq, rseq.length(), dust_cutoff);
726 //check the number of Ns:
727 int ncount=0;
728 for (int i=0;i<rseq.length();i++) {
729 if (seq[i]=='N') ncount++;
730 }
731 GFREE(seq);
732 return ncount;
733 }
734
735 int get3mer_value(const char* s) {
736 return (s[0]<<16)+(s[1]<<8)+s[2];
737 }
738
739 int w3_match(int qv, const char* str, int slen, int start_index=0) {
740 if (start_index>=slen || start_index<0) return -1;
741 for (int i=start_index;i<slen-3;i++) {
742 int rv=get3mer_value(str+i);
743 if (rv==qv) return i;
744 }
745 return -1;
746 }
747
748 int w3_rmatch(int qv, const char* str, int slen, int end_index=-1) {
749 if (end_index>=slen) return -1;
750 if (end_index<0) end_index=slen-1;
751 for (int i=end_index-2;i>=0;i--) {
752 int rv=get3mer_value(str+i);
753 if (rv==qv) return i;
754 }
755 return -1;
756 }
757
758 struct SLocScore {
759 int pos;
760 int score;
761 SLocScore(int p=0,int s=0) {
762 pos=p;
763 score=s;
764 }
765 void set(int p, int s) {
766 pos=p;
767 score=s;
768 }
769 void add(int p, int add) {
770 pos=p;
771 score+=add;
772 }
773 };
774
775 bool trim_poly3(GStr &seq, int &l5, int &l3, const char* poly_seed) {
776 if (!doPolyTrim) return false;
777 int rlen=seq.length();
778 l5=0;
779 l3=rlen-1;
780 int32 seedVal=*(int32*)poly_seed;
781 char polyChar=poly_seed[0];
782 //assumes N trimming was already done
783 //so a poly match should be very close to the end of the read
784 // -- find the initial match (seed)
785 int lmin=GMAX((rlen-12), 0);
786 int li;
787 for (li=rlen-4;li>lmin;li--) {
788 if (seedVal==*(int*)&(seq[li])) {
789 break;
790 }
791 }
792 if (li<=lmin) return false;
793 //seed found, try to extend it both ways
794 //extend right
795 int ri=li+3;
796 SLocScore loc(ri, poly_m_score<<2);
797 SLocScore maxloc(loc);
798 //extend right
799 while (ri<rlen-2) {
800 ri++;
801 if (seq[ri]==polyChar) {
802 loc.add(ri,poly_m_score);
803 }
804 else if (seq[ri]=='N') {
805 loc.add(ri,0);
806 }
807 else { //mismatch
808 loc.add(ri,poly_mis_score);
809 if (maxloc.score-loc.score>poly_dropoff_score) break;
810 }
811 if (maxloc.score<=loc.score) {
812 maxloc=loc;
813 }
814 }
815 ri=maxloc.pos;
816 if (ri<rlen-6) return false; //no trimming wanted, too far from 3' end
817 //ri = right boundary for the poly match
818 //extend left
819 loc.set(li, maxloc.score);
820 maxloc.pos=li;
821 while (li>0) {
822 li--;
823 if (seq[li]==polyChar) {
824 loc.add(li,poly_m_score);
825 }
826 else if (seq[li]=='N') {
827 loc.add(li,0);
828 }
829 else { //mismatch
830 loc.add(li,poly_mis_score);
831 if (maxloc.score-loc.score>poly_dropoff_score) break;
832 }
833 if (maxloc.score<=loc.score) {
834 maxloc=loc;
835 }
836 }
837 if ((maxloc.score>poly_minScore && ri>=rlen-3) ||
838 (maxloc.score==poly_minScore && ri==rlen-1) ||
839 (maxloc.score>(poly_minScore<<1) && ri>=rlen-6)) {
840 l5=li;
841 l3=ri;
842 return true;
843 }
844 return false;
845 }
846
847 bool trim_poly5(GStr &seq, int &l5, int &l3, const char* poly_seed) {
848 if (!doPolyTrim) return false;
849 int rlen=seq.length();
850 l5=0;
851 l3=rlen-1;
852 int32 seedVal=*(int32*)poly_seed;
853 char polyChar=poly_seed[0];
854 //assumes N trimming was already done
855 //so a poly match should be very close to the end of the read
856 // -- find the initial match (seed)
857 int lmax=GMIN(8, rlen-4);//how far from 5' end to look for 4-mer seeds
858 int li;
859 for (li=0;li<=lmax;li++) {
860 if (seedVal==*(int*)&(seq[li])) {
861 break;
862 }
863 }
864 if (li>lmax) return false;
865 //seed found, try to extend it both ways
866 //extend left
867 int ri=li+3; //save rightmost base of the seed
868 SLocScore loc(li, poly_m_score<<2);
869 SLocScore maxloc(loc);
870 while (li>0) {
871 li--;
872 if (seq[li]==polyChar) {
873 loc.add(li,poly_m_score);
874 }
875 else if (seq[li]=='N') {
876 loc.add(li,0);
877 }
878 else { //mismatch
879 loc.add(li,poly_mis_score);
880 if (maxloc.score-loc.score>poly_dropoff_score) break;
881 }
882 if (maxloc.score<=loc.score) {
883 maxloc=loc;
884 }
885 }
886 li=maxloc.pos;
887 if (li>5) return false; //no trimming wanted, too far from 5' end
888 //li = right boundary for the poly match
889
890 //extend right
891 loc.set(ri, maxloc.score);
892 maxloc.pos=ri;
893 while (ri<rlen-2) {
894 ri++;
895 if (seq[ri]==polyChar) {
896 loc.add(ri,poly_m_score);
897 }
898 else if (seq[ri]=='N') {
899 loc.add(ri,0);
900 }
901 else { //mismatch
902 loc.add(ri,poly_mis_score);
903 if (maxloc.score-loc.score>poly_dropoff_score) break;
904 }
905 if (maxloc.score<=loc.score) {
906 maxloc=loc;
907 }
908 }
909
910 if ((maxloc.score==poly_minScore && li==0) ||
911 (maxloc.score>poly_minScore && li<2)
912 || (maxloc.score>(poly_minScore<<1) && li<6)) {
913 l5=li;
914 l3=ri;
915 return true;
916 }
917 return false;
918 }
919
920 bool trim_adapter3(GStr& seq, int&l5, int &l3) {
921 if (adapters3.Count()==0) return false;
922 int rlen=seq.length();
923 l5=0;
924 l3=rlen-1;
925 bool trimmed=false;
926 GStr wseq(seq.chars());
927 int wlen=rlen;
928 for (int ai=0;ai<adapters3.Count();ai++) {
929 if (adapters3[ai].is_empty()) continue;
930 int alen=adapters3[ai].length();
931 GStr& aseq=adapters3[ai];
932 GXAlnInfo* r_bestaln=match_RightEnd(aseq.chars(), alen, wseq.chars(), wlen, gxmem_r, 74);
933 if (r_bestaln) {
934 trimmed=true;
935 //keep unmatched region on the left, if any
936 l3-=(wlen-r_bestaln->sl+1);
937 delete r_bestaln;
938 if (l3<0) l3=0;
939 if (l3-l5+1<min_read_len) return true;
940 wseq=seq.substr(l5,l3-l5+1);
941 wlen=wseq.length();
942 }
943 }//for each 5' adapter
944 return trimmed;
945 }
946
947 bool trim_adapter5(GStr& seq, int&l5, int &l3) {
948 if (adapters5.Count()==0) return false;
949 int rlen=seq.length();
950 l5=0;
951 l3=rlen-1;
952 bool trimmed=false;
953 GStr wseq(seq.chars());
954 int wlen=rlen;
955 for (int ai=0;ai<adapters5.Count();ai++) {
956 if (adapters5[ai].is_empty()) continue;
957 int alen=adapters5[ai].length();
958 GStr& aseq=adapters5[ai];
959 GXAlnInfo* l_bestaln=match_LeftEnd(aseq.chars(), alen, wseq.chars(), wlen, gxmem_l, 84);
960 if (l_bestaln) {
961 trimmed=true;
962 l5+=l_bestaln->sr;
963 delete l_bestaln;
964 if (l5>=rlen) l5=rlen-1;
965 if (l3-l5+1<min_read_len) return true;
966 wseq=seq.substr(l5,l3-l5+1);
967 wlen=wseq.length();
968 }
969 }//for each 5' adapter
970 return trimmed;
971 }
972
973 //convert qvs to/from phred64 from/to phread33
974 void convertPhred(GStr& q) {
975 for (int i=0;i<q.length();i++) q[i]+=qv_cvtadd;
976 }
977
978 void convertPhred(char* q, int len) {
979 for (int i=0;i<len;i++) q[i]+=qv_cvtadd;
980 }
981
982 bool getFastxRec(GLineReader& fq, GStr& rseq, GStr& rqv,
983 GStr& rname, GStr& rinfo, GStr& infname) {
984 rseq="";
985 rqv="";
986 rname="";
987 rinfo="";
988 if (fq.eof()) return false;
989 char* l=fq.getLine();
990 while (l!=NULL && (l[0]==0 || isspace(l[0]))) l=fq.getLine(); //ignore empty lines
991 if (l==NULL) return false;
992 /* if (rawFormat) {
993 //TODO: implement raw qseq parsing here
994 //if (raw type=N) then continue; //skip invalid/bad records
995 } //raw qseq format
996 else { // FASTQ or FASTA */
997 isfasta=(l[0]=='>');
998 if (!isfasta && l[0]!='@') GError("Error: fasta/fastq record marker not found(%s)\n%s\n",
999 infname.chars(), l);
1000 GStr s(l);
1001 rname=&(l[1]);
1002 for (int i=0;i<rname.length();i++)
1003 if (rname[i]<=' ') {
1004 if (i<rname.length()-2) rinfo=rname.substr(i+1);
1005 rname.cut(i);
1006 break;
1007 }
1008 //now get the sequence
1009 if ((l=fq.getLine())==NULL)
1010 GError("Error: unexpected EOF after header for read %s (%s)\n",
1011 rname.chars(), infname.chars());
1012 rseq=l; //this must be the DNA line
1013 while ((l=fq.getLine())!=NULL) {
1014 //seq can span multiple lines
1015 if (l[0]=='>' || l[0]=='+') {
1016 fq.pushBack();
1017 break; //
1018 }
1019 rseq+=l;
1020 } //check for multi-line seq
1021 if (!isfasta) { //reading fastq quality values, which can also be multi-line
1022 if ((l=fq.getLine())==NULL)
1023 GError("Error: unexpected EOF after sequence for %s\n", rname.chars());
1024 if (l[0]!='+') GError("Error: fastq qv header marker not detected!\n");
1025 if ((l=fq.getLine())==NULL)
1026 GError("Error: unexpected EOF after qv header for %s\n", rname.chars());
1027 rqv=l;
1028 //if (rqv.length()!=rseq.length())
1029 // GError("Error: qv len != seq len for %s\n", rname.chars());
1030 while (rqv.length()<rseq.length() && ((l=fq.getLine())!=NULL)) {
1031 rqv+=l; //append to qv string
1032 }
1033 }// fastq
1034 // } //<-- FASTA or FASTQ
1035 rseq.upper();
1036 return true;
1037 }
1038
1039 char process_read(GStr& rname, GStr& rseq, GStr& rqv, int &l5, int &l3) {
1040 //returns 0 if the read was untouched, 1 if it was just trimmed
1041 // and a trash code if it was trashed
1042 l5=0;
1043 l3=rseq.length()-1;
1044 #ifdef GDEBUG
1045 GMessage(">%s\n", rname.chars());
1046 GMessage("==>%s\n",rseq.chars());
1047 #endif
1048 if (l3-l5+1<min_read_len) {
1049 return 's';
1050 }
1051 GStr wseq(rseq.chars());
1052 GStr wqv(rqv.chars());
1053 int w5=l5;
1054 int w3=l3;
1055 //first do the q-based trimming
1056 if (qvtrim_qmin!=0 && !wqv.is_empty() && qtrim(wqv, w5, w3)) { // qv-threshold trimming
1057 if (w3-w5+1<min_read_len) {
1058 return 'Q'; //invalid read
1059 }
1060 //-- keep only the w5..w3 range
1061 l5=w5;
1062 l3=w3;
1063 wseq=wseq.substr(w5, w3-w5+1);
1064 if (!wqv.is_empty())
1065 wqv=wqv.substr(w5, w3-w5+1);
1066 } //qv trimming
1067 // N-trimming on the remaining read seq
1068 if (ntrim(wseq, w5, w3)) {
1069 //GMessage("before: %s\n",wseq.chars());
1070 //GMessage("after : %s (%d)\n",wseq.substr(w5,w3-w5+1).chars(),w3-w5+1);
1071 l5+=w5;
1072 l3-=(wseq.length()-1-w3);
1073 if (w3-w5+1<min_read_len) {
1074 return 'N'; //to be trashed
1075 }
1076 //-- keep only the w5..w3 range
1077 wseq=wseq.substr(w5, w3-w5+1);
1078 if (!wqv.is_empty())
1079 wqv=wqv.substr(w5, w3-w5+1);
1080 w5=0;
1081 w3=wseq.length()-1;
1082 }
1083 char trim_code;
1084 do {
1085 trim_code=0;
1086 if (trim_poly5(wseq, w5, w3, polyA_seed)) {
1087 trim_code='A';
1088 #ifdef GDEBUG
1089 GMessage("\t-trimmed poly-A at 5' end\n");
1090 #endif
1091 }
1092 else if (trim_poly5(wseq, w5, w3, polyT_seed)) {
1093 trim_code='T';
1094 #ifdef GDEBUG
1095 GMessage("\t-trimmed poly-T at 5' end\n");
1096 #endif
1097 }
1098 else if (trim_adapter5(wseq, w5, w3)) {
1099 trim_code='5';
1100 #ifdef GDEBUG
1101 GMessage("\t-trimmed adapter at 5' end\n");
1102 #endif
1103 }
1104 if (trim_code) {
1105 int trimlen=wseq.length()-(w3-w5+1);
1106 num_trimmed5++;
1107 if (trimlen<min_trimmed5)
1108 min_trimmed5=trimlen;
1109 l5+=w5;
1110 l3-=(wseq.length()-1-w3);
1111 if (w3-w5+1<min_read_len) {
1112 return trim_code;
1113 }
1114 //-- keep only the w5..w3 range
1115 wseq=wseq.substr(w5, w3-w5+1);
1116 if (!wqv.is_empty())
1117 wqv=wqv.substr(w5, w3-w5+1);
1118 }// trimmed at 5' end
1119 } while (trim_code);
1120
1121 do {
1122 trim_code=0;
1123 if (trim_poly3(wseq, w5, w3, polyA_seed)) {
1124 trim_code='A';
1125 }
1126 else if (trim_poly3(wseq, w5, w3, polyT_seed)) {
1127 trim_code='T';
1128 }
1129 else if (trim_adapter3(wseq, w5, w3)) {
1130 trim_code='3';
1131 }
1132 if (trim_code) {
1133 int trimlen=wseq.length()-(w3-w5+1);
1134 num_trimmed3++;
1135 if (trimlen<min_trimmed3)
1136 min_trimmed3=trimlen;
1137 l5+=w5;
1138 l3-=(wseq.length()-1-w3);
1139 if (w3-w5+1<min_read_len) {
1140 return trim_code;
1141 }
1142 //-- keep only the w5..w3 range
1143 wseq=wseq.substr(w5, w3-w5+1);
1144 if (!wqv.is_empty())
1145 wqv=wqv.substr(w5, w3-w5+1);
1146 }//trimming at 3' end
1147 } while (trim_code);
1148
1149
1150 if (doCollapse) {
1151 //keep read for later
1152 FqDupRec* dr=dhash.Find(wseq.chars());
1153 if (dr==NULL) { //new entry
1154 //if (prefix.is_empty())
1155 dhash.Add(wseq.chars(),
1156 new FqDupRec(&wqv, rname.chars()));
1157 //else dhash.Add(wseq.chars(), new FqDupRec(wqv.chars(),wqv.length()));
1158 }
1159 else
1160 dr->add(wqv);
1161 } //collapsing duplicates
1162 else { //not collapsing duplicates
1163 //apply the dust filter now
1164 if (doDust) {
1165 int dustbases=dust(wseq);
1166 if (dustbases>(wseq.length()>>1)) {
1167 return 'D';
1168 }
1169 }
1170 } //not collapsing duplicates
1171 return (l5>0 || l3<rseq.length()-1) ? 1 : 0;
1172 }
1173
1174 void printHeader(FILE* f_out, char recmarker, GStr& rname, GStr& rinfo) {
1175 //GMessage("printing Header..%c%s\n",recmarker, rname.chars());
1176 if (rinfo.is_empty()) fprintf(f_out, "%c%s\n",recmarker,rname.chars());
1177 else fprintf(f_out, "%c%s %s\n",recmarker, rname.chars(), rinfo.chars());
1178 }
1179
1180 void writeRead(FILE* f_out, GStr& rname, GStr& rinfo, GStr& rseq, GStr& rqv, int& outcounter) {
1181 outcounter++;
1182 bool asFasta=(rqv.is_empty() || fastaOutput);
1183 if (asFasta) {
1184 if (prefix.is_empty()) {
1185 printHeader(f_out, '>',rname,rinfo);
1186 fprintf(f_out, "%s\n", rseq.chars()); //plain one-line fasta for now
1187 }
1188 else {
1189 fprintf(f_out, ">%s%08d\n%s\n", prefix.chars(), outcounter,
1190 rseq.chars());
1191 }
1192 }
1193 else { //fastq
1194 if (convert_phred) convertPhred(rqv);
1195 if (prefix.is_empty()) {
1196 printHeader(f_out, '@', rname, rinfo);
1197 fprintf(f_out, "%s\n+\n%s\n", rseq.chars(), rqv.chars());
1198 }
1199 else
1200 fprintf(f_out, "@%s_%08d\n%s\n+\n%s\n", prefix.chars(), outcounter,
1201 rseq.chars(),rqv.chars() );
1202 }
1203 }
1204
1205 void trash_report(char trashcode, GStr& rname, FILE* freport) {
1206 if (freport==NULL || trashcode<=' ') return;
1207 if (trashcode=='3' || trashcode=='5') {
1208 fprintf(freport, "%s\ta%c\n",rname.chars(),trashcode);
1209 }
1210 else {
1211 fprintf(freport, "%s\t%c\n",rname.chars(),trashcode);
1212 }
1213 //tcounter++;
1214 }
1215
1216 GStr getFext(GStr& s, int* xpos=NULL) {
1217 //s must be a filename without a path
1218 GStr r("");
1219 if (xpos!=NULL) *xpos=0;
1220 if (s.is_empty() || s=="-") return r;
1221 int p=s.rindex('.');
1222 int d=s.rindex('/');
1223 if (p<=0 || p>s.length()-2 || p<s.length()-7 || p<d) return r;
1224 r=s.substr(p+1);
1225 if (xpos!=NULL) *xpos=p+1;
1226 r.lower();
1227 return r;
1228 }
1229
1230 void baseFileName(GStr& fname) {
1231 //remove all known extensions, like .txt,fq,fastq,fasta,fa)(.gz .gzip .bz2 .bzip2) .
1232 int xpos=0;
1233 GStr fext=getFext(fname, &xpos);
1234 if (xpos<=1) return;
1235 bool extdel=false;
1236 GStr f2;
1237 if (fext=="z" || fext=="zip") {
1238 extdel=true;
1239 }
1240 else if (fext.length()>=2) {
1241 f2=fext.substr(0,2);
1242 extdel=(f2=="gz" || f2=="bz");
1243 }
1244 if (extdel) {
1245 fname.cut(xpos-1);
1246 fext=getFext(fname, &xpos);
1247 if (xpos<=1) return;
1248 }
1249 extdel=false;
1250 if (fext=="f" || fext=="fq" || fext=="txt" || fext=="seq" || fext=="sequence") {
1251 extdel=true;
1252 }
1253 else if (fext.length()>=2) {
1254 extdel=(fext.substr(0,2)=="fa");
1255 }
1256 if (extdel) fname.cut(xpos-1);
1257 GStr fncp(fname);
1258 fncp.lower();
1259 fncp.chomp("seq");
1260 fncp.chomp("sequence");
1261 fncp.trimR("_.");
1262 if (fncp.length()<fname.length()) fname.cut(fncp.length());
1263 }
1264
1265 FILE* prepOutFile(GStr& infname, GStr& pocmd) {
1266 FILE* f_out=NULL;
1267 GStr fname(getFileName(infname.chars()));
1268 //eliminate known extensions
1269 baseFileName(fname);
1270 if (outsuffix.is_empty() || outsuffix=="-") { return stdout; }
1271 else if (pocmd.is_empty()) {
1272 GStr oname(fname);
1273 oname.append('.');
1274 oname.append(outsuffix);
1275 f_out=fopen(oname.chars(),"w");
1276 if (f_out==NULL) GError("Error: cannot create '%s'\n",oname.chars());
1277 }
1278 else {
1279 GStr oname(">");
1280 oname.append(fname);
1281 oname.append('.');
1282 oname.append(outsuffix);
1283 pocmd.append(oname);
1284 f_out=popen(pocmd.chars(), "w");
1285 if (f_out==NULL) GError("Error: cannot popen '%s'\n",pocmd.chars());
1286 }
1287 return f_out;
1288 }
1289
1290 void guess_unzip(GStr& fname, GStr& picmd) {
1291 GStr fext=getFext(fname);
1292 if (fext=="gz" || fext=="gzip" || fext=="z") {
1293 picmd="gzip -cd ";
1294 }
1295 else if (fext=="bz2" || fext=="bzip2" || fext=="bz" || fext=="bzip") {
1296 picmd="bzip2 -cd ";
1297 }
1298 }
1299
1300
1301 int loadAdapters(const char* fname) {
1302 GLineReader lr(fname);
1303 char* l;
1304 while ((l=lr.nextLine())!=NULL) {
1305 if (lr.length()<=3 || l[0]=='#') continue;
1306 if ( l[0]==' ' || l[0]=='\t' || l[0]==',' ||
1307 l[0]==';'|| l[0]==':' ) {
1308 int i=1;
1309 while (l[i]!=0 && isspace(l[i])) {
1310 i++;
1311 }
1312 if (l[i]!=0) {
1313 GStr s(&(l[i]));
1314 adapters3.Add(s);
1315 continue;
1316 }
1317 }
1318 else {
1319 GStr s(l);
1320 s.startTokenize("\t ;,:");
1321 GStr a5,a3;
1322 if (s.nextToken(a5))
1323 s.nextToken(a3);
1324 a5.upper();
1325 a3.upper();
1326 adapters5.Add(a5);
1327 adapters3.Add(a3);
1328 }
1329 }
1330 return adapters5.Count()+adapters3.Count();
1331 }
1332
1333 void setupFiles(FILE*& f_in, FILE*& f_in2, FILE*& f_out, FILE*& f_out2,
1334 GStr& s, GStr& infname, GStr& infname2) {
1335 // uses outsuffix to generate output file names and open file handles as needed
1336 infname="";
1337 infname2="";
1338 f_in=NULL;
1339 f_in2=NULL;
1340 f_out=NULL;
1341 f_out2=NULL;
1342 //analyze outsuffix intent
1343 GStr pocmd;
1344 if (outsuffix=="-") {
1345 f_out=stdout;
1346 }
1347 else {
1348 GStr ox=getFext(outsuffix);
1349 if (ox.length()>2) ox=ox.substr(0,2);
1350 if (ox=="gz") pocmd="gzip -9 -c ";
1351 else if (ox=="bz") pocmd="bzip2 -9 -c ";
1352 }
1353 if (s=="-") {
1354 f_in=stdin;
1355 infname="stdin";
1356 f_out=prepOutFile(infname, pocmd);
1357 return;
1358 } // streaming from stdin
1359 s.startTokenize(",:");
1360 s.nextToken(infname);
1361 bool paired=s.nextToken(infname2);
1362 if (fileExists(infname.chars())==0)
1363 GError("Error: cannot find file %s!\n",infname.chars());
1364 GStr fname(getFileName(infname.chars()));
1365 GStr picmd;
1366 guess_unzip(fname, picmd);
1367 if (picmd.is_empty()) {
1368 f_in=fopen(infname.chars(), "r");
1369 if (f_in==NULL) GError("Error opening file '%s'!\n",infname.chars());
1370 }
1371 else {
1372 picmd.append(infname);
1373 f_in=popen(picmd.chars(), "r");
1374 if (f_in==NULL) GError("Error at popen %s!\n", picmd.chars());
1375 }
1376 if (f_out==stdout) {
1377 if (paired) GError("Error: output suffix required for paired reads\n");
1378 return;
1379 }
1380 f_out=prepOutFile(infname, pocmd);
1381 if (!paired) return;
1382 if (doCollapse) GError("Error: sorry, -C option cannot be used with paired reads!\n");
1383 // ---- paired reads:-------------
1384 if (fileExists(infname2.chars())==0)
1385 GError("Error: cannot find file %s!\n",infname2.chars());
1386 picmd="";
1387 GStr fname2(getFileName(infname2.chars()));
1388 guess_unzip(fname2, picmd);
1389 if (picmd.is_empty()) {
1390 f_in2=fopen(infname2.chars(), "r");
1391 if (f_in2==NULL) GError("Error opening file '%s'!\n",infname2.chars());
1392 }
1393 else {
1394 picmd.append(infname2);
1395 f_in2=popen(picmd.chars(), "r");
1396 if (f_in2==NULL) GError("Error at popen %s!\n", picmd.chars());
1397 }
1398 f_out2=prepOutFile(infname2, pocmd);
1399 }