2 |
|
#include "GStr.h" |
3 |
|
#include "GHash.hh" |
4 |
|
#include "GList.hh" |
5 |
+ |
#include <ctype.h> |
6 |
|
|
7 |
|
#define USAGE "Usage:\n\ |
8 |
< |
fqtrim [-5 <5'adapter>] [-3 <3'adapter>] [-l <minlen>] [-q <minqv>] [-C] [-D]\\\n\ |
9 |
< |
[-p {64|33}] [-n <rename_prefix>] [-o <trimmed.fq>] [-r <discarded.lst>]\\\n\ |
10 |
< |
[-Q] <input.fq>\n\ |
8 |
> |
fqtrim [-5 <5adapter>] [-3 <3adapter>] [-a <min_matchlen>] [-p {64|33}] [-q <minq> [-t <trim_max>]]\\\n\ |
9 |
> |
[-n <rename_prefix>] [-o <outsuffix>] [-z <zcmd>] [-r <discarded.lst>]\\\n\ |
10 |
> |
[-l <minlen>] [-C] [-D] [-Q] <input.fq>[,<input_mates.fq>\n\ |
11 |
|
\n\ |
12 |
< |
Trim low quality bases at 3' end, optionally trim adapter sequence, filter\n\ |
12 |
> |
Trim low quality bases at the 3' end, optionally trim adapter sequence, filter\n\ |
13 |
|
for low complexity and collapse duplicate reads\n\ |
14 |
+ |
If read pairs should be trimmed and kept together (i.e. without discarding\n\ |
15 |
+ |
one read in a pair), the two file names should be given delimited by a comma\n\ |
16 |
+ |
or a colon character\n\ |
17 |
|
\n\ |
18 |
|
Options:\n\ |
19 |
|
-n rename all the reads using the <prefix> followed by a read counter;\n\ |
20 |
|
if -C option was given, the suffix \"_x<N>\" is appended, with <N> being\n\ |
21 |
|
the read duplication count\n\ |
22 |
< |
-o write the trimmed/filtered fastq into <trimmed.fq>(instead of stdout)\n\ |
22 |
> |
-o write the trimmed/filtered reads to file(s) named <input>.<outsuffix>\n\ |
23 |
> |
which will be created in the current (working) directory\n\ |
24 |
|
-5 trim the given adapter or primer sequence at the 5' end of each read\n\ |
25 |
|
(e.g. -5 CGACAGGTTCAGAGTTCTACAGTCCGACGATC)\n\ |
26 |
|
-3 trim the given adapter sequence at the 3' end of each read\n\ |
27 |
|
(e.g. -3 TCGTATGCCGTCTTCTGCTTG)\n\ |
28 |
< |
-q trim bases with quality value lower than <minq> (starting the 3' end)\n\ |
28 |
> |
-a minimum bases to match to adaptor sequence (default 5)\n\ |
29 |
> |
-q trim bases with quality value lower than <minq> (starting at the 3' end)\n\ |
30 |
> |
-t for -q option, maximum trimming at the 3' end is limited to <trim_max>\n\ |
31 |
|
-m maximum percentage of Ns allowed in a read after trimming (default 7)\n\ |
32 |
|
-l minimum \"clean\" length after trimming that a read must have\n\ |
33 |
|
in order to pass the filter (default: 16)\n\ |
41 |
|
-Q convert quality values to the other Phred qv type\n\ |
42 |
|
-V verbose processing\n\ |
43 |
|
" |
44 |
+ |
|
45 |
+ |
//-z for -o option, the output stream(s) will be first piped into the given\n |
46 |
+ |
// <zcmd> command, which must output to stdout (e.g. -z 'bzip2 -9 -c')\n |
47 |
+ |
|
48 |
+ |
|
49 |
|
// example 3' adapter for miRNAs: TCGTATGCCGTCTTCTGCTTG |
50 |
|
|
51 |
|
//For pair ends sequencing: |
52 |
|
//3' : ACACTCTTTCCCTACACGACGCTCTTCCGATCT |
53 |
|
//5' : GATCGGAAGAGCGGTTCAGCAGGAATGCCGAG |
54 |
< |
FILE* f_out=NULL; //stdout if not provided |
55 |
< |
FILE* f_in=NULL; //input fastq (stdin if not provided) |
54 |
> |
//FILE* f_out=NULL; //stdout if not provided |
55 |
> |
//FILE* f_out2=NULL; //for paired reads |
56 |
> |
//FILE* f_in=NULL; //input fastq (stdin if not provided) |
57 |
> |
//FILE* f_in2=NULL; //for paired reads |
58 |
|
FILE* freport=NULL; |
59 |
+ |
|
60 |
|
bool debug=false; |
61 |
|
bool verbose=false; |
62 |
|
bool doCollapse=false; |
63 |
|
bool doDust=false; |
64 |
+ |
bool fastaOutput=false; |
65 |
|
bool trashReport=false; |
66 |
|
//bool rawFormat=false; |
67 |
|
int min_read_len=16; |
69 |
|
int dust_cutoff=16; |
70 |
|
bool isfasta=false; |
71 |
|
bool convert_phred=false; |
72 |
< |
GStr prefix; |
72 |
> |
GStr outsuffix; // -o |
73 |
|
GStr adapter3; |
74 |
|
GStr adapter5; |
75 |
+ |
GStr prefix; |
76 |
+ |
GStr zcmd; |
77 |
+ |
int num_trimmed5=0; |
78 |
+ |
int num_trimmed3=0; |
79 |
+ |
int num_trimmedN=0; |
80 |
+ |
int num_trimmedQ=0; |
81 |
+ |
int min_trimmed5=INT_MAX; |
82 |
+ |
int min_trimmed3=INT_MAX; |
83 |
|
|
84 |
< |
int qvtrim_min=0; |
85 |
< |
|
84 |
> |
int qvtrim_qmin=0; |
85 |
> |
int qvtrim_max=0; //for -q, do not trim at 3'-end more than this number of bases |
86 |
|
int qv_phredtype=0; // could be 64 or 33 (0 means undetermined yet) |
87 |
|
int qv_cvtadd=0; //could be -31 or +31 |
88 |
|
|
92 |
|
const int a_m_score=2; //match score |
93 |
|
const int a_mis_score=-3; //mismatch |
94 |
|
const int a_dropoff_score=7; |
95 |
< |
const int a_min_score=8; //an exact match of 4 bases at the proper ends WILL be trimmed |
95 |
> |
int a_min_score=10; //an exact match of 5 bases at the proper ends WILL be trimmed |
96 |
|
const int a_min_chain_score=15; //for gapped alignments |
97 |
|
|
98 |
|
class CSegChain; |
297 |
|
if (!s.is_empty()) { |
298 |
|
if (s=='-') f=stdout; |
299 |
|
else { |
300 |
< |
f=fopen(s,"w"); |
300 |
> |
f=fopen(s.chars(),"w"); |
301 |
|
if (f==NULL) GError("Error creating file: %s\n", s.chars()); |
302 |
|
} |
303 |
|
} |
308 |
|
|
309 |
|
GHash<FqDupRec> dhash; //hash to keep track of duplicates |
310 |
|
|
311 |
+ |
void setupFiles(FILE*& f_in, FILE*& f_in2, FILE*& f_out, FILE*& f_out2, |
312 |
+ |
GStr& s, GStr& infname, GStr& infname2); |
313 |
+ |
// uses outsuffix to generate output file names and open file handles as needed |
314 |
+ |
|
315 |
+ |
void writeRead(FILE* f_out, GStr& rname, GStr& rinfo, GStr& rseq, GStr& rqv, int& outcounter); |
316 |
+ |
void trash_report(char trashcode, GStr& rname, FILE* freport); |
317 |
+ |
|
318 |
+ |
bool getFastxRec(GLineReader& fq, GStr& rseq, GStr& rqv, |
319 |
+ |
GStr& rname, GStr& rinfo, GStr& infname); |
320 |
+ |
|
321 |
+ |
char process_read(GStr& rname, GStr& rseq, GStr& rqv, int &l5, int &l3); |
322 |
+ |
//returns 0 if the read was untouched, 1 if it was trimmed and a trash code if it was trashed |
323 |
+ |
|
324 |
|
bool ntrim(GStr& rseq, int &l5, int &l3); //returns true if any trimming occured |
325 |
|
bool qtrim(GStr& qvs, int &l5, int &l3); //return true if any trimming occured |
326 |
|
int dust(GStr& seq); |
331 |
|
void convertPhred(GStr& q); |
332 |
|
|
333 |
|
int main(int argc, char * const argv[]) { |
334 |
< |
GArgs args(argc, argv, "YQDCVl:d:3:5:m:n:r:p:q:o:"); |
334 |
> |
GArgs args(argc, argv, "YQDCVl:d:3:5:m:n:r:p:q:t:o:z:a:"); |
335 |
|
int e; |
299 |
– |
int icounter=0; //counter for input reads |
300 |
– |
int outcounter=0; //counter for output reads |
336 |
|
if ((e=args.isError())>0) { |
337 |
|
GMessage("%s\nInvalid argument: %s\n", USAGE, argv[e]); |
338 |
|
exit(224); |
339 |
|
} |
340 |
|
debug=(args.getOpt('Y')!=NULL); |
341 |
< |
debug=(args.getOpt('V')!=NULL); |
341 |
> |
verbose=(args.getOpt('V')!=NULL); |
342 |
|
convert_phred=(args.getOpt('Q')!=NULL); |
343 |
|
doCollapse=(args.getOpt('C')!=NULL); |
344 |
|
doDust=(args.getOpt('D')!=NULL); |
362 |
|
} |
363 |
|
s=args.getOpt('q'); |
364 |
|
if (!s.is_empty()) { |
365 |
< |
qvtrim_min=s.asInt(); |
365 |
> |
qvtrim_qmin=s.asInt(); |
366 |
> |
} |
367 |
> |
s=args.getOpt('t'); |
368 |
> |
if (!s.is_empty()) { |
369 |
> |
qvtrim_max=s.asInt(); |
370 |
|
} |
371 |
|
s=args.getOpt('p'); |
372 |
|
if (!s.is_empty()) { |
392 |
|
adapter5.upper(); |
393 |
|
a5len=adapter5.length(); |
394 |
|
} |
395 |
+ |
s=args.getOpt('a'); |
396 |
+ |
if (!s.is_empty()) { |
397 |
+ |
int a_minmatch=s.asInt(); |
398 |
+ |
a_min_score=a_minmatch<<1; |
399 |
+ |
} |
400 |
+ |
|
401 |
+ |
if (args.getOpt('o')!=NULL) outsuffix=args.getOpt('o'); |
402 |
|
trashReport= (args.getOpt('r')!=NULL); |
403 |
< |
if (args.startNonOpt()==0) { |
403 |
> |
int fcount=args.startNonOpt(); |
404 |
> |
if (fcount==0) { |
405 |
|
GMessage(USAGE); |
406 |
|
exit(224); |
407 |
|
} |
408 |
< |
|
409 |
< |
openfw(f_out, args, 'o'); |
410 |
< |
if (f_out==NULL) f_out=stdout; |
408 |
> |
if (fcount>1 && doCollapse) { |
409 |
> |
GError("%s Sorry, the -C option only works with a single input.\n", USAGE); |
410 |
> |
} |
411 |
> |
//openfw(f_out, args, 'o'); |
412 |
> |
//if (f_out==NULL) f_out=stdout; |
413 |
|
if (trashReport) |
414 |
|
openfw(freport, args, 'r'); |
415 |
|
char* infile=NULL; |
416 |
|
while ((infile=args.nextNonOpt())!=NULL) { |
417 |
< |
GStr infname(infile); |
418 |
< |
if (strcmp(infile,"-")==0) { |
419 |
< |
f_in=stdin; infname="stdin"; } |
420 |
< |
else { |
421 |
< |
f_in=fopen(infile,"r"); |
422 |
< |
if (f_in==NULL) |
423 |
< |
GError("Cannot open input file %s!\n",infile); |
424 |
< |
} |
425 |
< |
GLineReader fq(f_in); |
426 |
< |
char* l=NULL; |
427 |
< |
while ((l=fq.getLine())!=NULL) { |
428 |
< |
GStr rname; //current read name |
429 |
< |
GStr rseq; //current read sequence |
430 |
< |
GStr rqv; //current read quality values |
431 |
< |
GStr s; |
432 |
< |
/* if (rawFormat) { |
433 |
< |
//TODO: implement qseq parsing here |
434 |
< |
//if (raw type=N) then continue; //skip invalid/bad records |
435 |
< |
|
436 |
< |
} //raw qseq format |
437 |
< |
else { // FASTQ or FASTA */ |
438 |
< |
isfasta=(l[0]=='>'); |
439 |
< |
if (!isfasta && l[0]!='@') GError("Error: fastq record marker not detected!\n"); |
440 |
< |
s=l; |
441 |
< |
rname=&(l[1]); |
442 |
< |
icounter++; |
443 |
< |
for (int i=0;i<rname.length();i++) |
444 |
< |
if (rname[i]<=' ') { rname.cut(i); break; } |
445 |
< |
//now get the sequence |
446 |
< |
if ((l=fq.getLine())==NULL) |
447 |
< |
GError("Error: unexpected EOF after header for %s\n",rname.chars()); |
448 |
< |
rseq=l; //this must be the DNA line |
449 |
< |
while ((l=fq.getLine())!=NULL) { |
450 |
< |
//seq can span multiple lines |
451 |
< |
if (l[0]=='>' || l[0]=='+') { |
452 |
< |
fq.pushBack(); |
453 |
< |
break; // |
417 |
> |
int incounter=0; //counter for input reads |
418 |
> |
int outcounter=0; //counter for output reads |
419 |
> |
int trash_s=0; //too short from the get go |
420 |
> |
int trash_Q=0; |
421 |
> |
int trash_N=0; |
422 |
> |
int trash_D=0; |
423 |
> |
int trash_A3=0; |
424 |
> |
int trash_A5=0; |
425 |
> |
s=infile; |
426 |
> |
GStr infname; |
427 |
> |
GStr infname2; |
428 |
> |
FILE* f_in=NULL; |
429 |
> |
FILE* f_in2=NULL; |
430 |
> |
FILE* f_out=NULL; |
431 |
> |
FILE* f_out2=NULL; |
432 |
> |
bool paired_reads=false; |
433 |
> |
setupFiles(f_in, f_in2, f_out, f_out2, s, infname, infname2); |
434 |
> |
GLineReader fq(f_in); |
435 |
> |
GLineReader* fq2=NULL; |
436 |
> |
if (f_in2!=NULL) { |
437 |
> |
fq2=new GLineReader(f_in2); |
438 |
> |
paired_reads=true; |
439 |
> |
} |
440 |
> |
GStr rseq, rqv, seqid, seqinfo; |
441 |
> |
GStr rseq2, rqv2, seqid2, seqinfo2; |
442 |
> |
while (getFastxRec(fq, rseq, rqv, seqid, seqinfo, infname)) { |
443 |
> |
incounter++; |
444 |
> |
int a5=0, a3=0, b5=0, b3=0; |
445 |
> |
char tcode=0, tcode2=0; |
446 |
> |
tcode=process_read(seqid, rseq, rqv, a5, a3); |
447 |
> |
//if (!doCollapse) { |
448 |
> |
if (fq2!=NULL) { |
449 |
> |
getFastxRec(*fq2, rseq2, rqv2, seqid2, seqinfo2, infname2); |
450 |
> |
if (seqid.substr(0,seqid.length()-1)!=seqid2.substr(0,seqid2.length()-1)) { |
451 |
> |
GError("Error: no paired match for read %s vs %s (%s,%s)\n", |
452 |
> |
seqid.chars(), seqid2.chars(), infname.chars(), infname2.chars()); |
453 |
> |
} |
454 |
> |
tcode2=process_read(seqid2, rseq2, rqv2, b5, b3); |
455 |
> |
//decide what to do with this pair and print rseq2 if the pair makes it |
456 |
> |
if (tcode>1 && tcode2<=1) { |
457 |
> |
//"untrash" rseq |
458 |
> |
if (a3-a5+1<min_read_len) { |
459 |
> |
a5=1; |
460 |
> |
if (a3<min_read_len) { a3= GMIN(rseq.length()-1, min_read_len+1); } |
461 |
|
} |
462 |
< |
rseq+=l; |
463 |
< |
} //check for multi-line seq |
464 |
< |
if (!isfasta) { //reading fastq quality values, which can also be multi-line |
465 |
< |
if ((l=fq.getLine())==NULL) |
466 |
< |
GError("Error: unexpected EOF after sequence for %s\n", rname.chars()); |
467 |
< |
if (l[0]!='+') GError("Error: fastq qv header marker not detected!\n"); |
468 |
< |
if ((l=fq.getLine())==NULL) |
469 |
< |
GError("Error: unexpected EOF after qv header for %s\n", rname.chars()); |
470 |
< |
rqv=l; |
471 |
< |
//if (rqv.length()!=rseq.length()) |
472 |
< |
// GError("Error: qv len != seq len for %s\n", rname.chars()); |
473 |
< |
while (rqv.length()<rseq.length() && ((l=fq.getLine())!=NULL)) { |
474 |
< |
rqv+=l; //append to qv string |
475 |
< |
} |
420 |
< |
}// fastq |
421 |
< |
// } //<-- FASTA or FASTQ |
422 |
< |
rseq.upper(); |
423 |
< |
int l5=0; |
424 |
< |
int l3=rseq.length()-1; |
425 |
< |
if (l3-l5+1<min_read_len) { |
426 |
< |
if (trashReport) { |
427 |
< |
fprintf(freport, "%s\ts\t%s\n",rname.chars(), rseq.chars()); |
428 |
< |
} |
429 |
< |
continue; |
430 |
< |
} |
431 |
< |
if (ntrim(rseq, l5, l3)) { // N-trimming |
432 |
< |
//GMessage("before: %s\n",rseq.chars()); |
433 |
< |
//GMessage("after : %s (%d)\n",rseq.substr(l5,l3-l5+1).chars(),l3-l5+1); |
434 |
< |
if (l3-l5+1<min_read_len) { |
435 |
< |
if (trashReport) { |
436 |
< |
fprintf(freport, "%s\tN\t%s\n", rname.chars(), rseq.chars()); |
437 |
< |
} |
438 |
< |
continue; //invalid read |
439 |
< |
} |
440 |
< |
//-- keep only the l5..l3 range |
441 |
< |
rseq=rseq.substr(l5, l3-l5+1); |
442 |
< |
if (!rqv.is_empty()) |
443 |
< |
rqv=rqv.substr(l5, l3-l5+1); |
444 |
< |
l5=0; |
445 |
< |
l3=rseq.length()-1; |
446 |
< |
} |
447 |
< |
if (qvtrim_min!=0 && !rqv.is_empty() && qtrim(rqv, l5, l3)) { // qv-threshold trimming |
448 |
< |
if (l3-l5+1<min_read_len) { |
449 |
< |
if (trashReport) { |
450 |
< |
fprintf(freport, "%s\tQ\t%s\n", rname.chars(), rseq.chars()); |
451 |
< |
} |
452 |
< |
continue; //invalid read |
453 |
< |
} |
454 |
< |
//-- keep only the l5..l3 range |
455 |
< |
rseq=rseq.substr(l5, l3-l5+1); |
456 |
< |
if (!rqv.is_empty()) |
457 |
< |
rqv=rqv.substr(l5, l3-l5+1); |
458 |
< |
} //qv trimming |
459 |
< |
if (a3len>0) { |
460 |
< |
if (trim_adapter3(rseq, l5, l3)) { |
461 |
< |
if (l3-l5+1<min_read_len) { |
462 |
< |
if (trashReport) { |
463 |
< |
fprintf(freport, "%s\tA3\t%s\n",rname.chars(), rseq.chars()); |
464 |
< |
} |
465 |
< |
continue; |
466 |
< |
} |
467 |
< |
//-- keep only the l5..l3 range |
468 |
< |
rseq=rseq.substr(l5, l3-l5+1); |
469 |
< |
if (!rqv.is_empty()) |
470 |
< |
rqv=rqv.substr(l5, l3-l5+1); |
471 |
< |
}//some adapter was trimmed |
472 |
< |
} //adapter trimming |
473 |
< |
if (a5len>0) { |
474 |
< |
if (trim_adapter5(rseq, l5, l3)) { |
475 |
< |
if (l3-l5+1<min_read_len) { |
476 |
< |
if (trashReport) { |
477 |
< |
fprintf(freport, "%s\tA5\t%s\n",rname.chars(), rseq.chars()); |
478 |
< |
} |
479 |
< |
continue; |
462 |
> |
tcode=1; |
463 |
> |
} |
464 |
> |
else if (tcode<=1 && tcode2>1) { |
465 |
> |
//"untrash" rseq2 |
466 |
> |
if (b3-b5+1<min_read_len) { |
467 |
> |
b5=1; |
468 |
> |
if (b3<min_read_len) { b3= GMIN((rseq2.length()-1),(min_read_len+1)); } |
469 |
> |
} |
470 |
> |
tcode2=1; |
471 |
> |
} |
472 |
> |
if (tcode<=1) { //trimmed or left intact -- write it! |
473 |
> |
if (tcode>0) { |
474 |
> |
rseq2=rseq2.substr(b5,b3-b5+1); |
475 |
> |
if (!rqv2.is_empty()) rqv2=rqv2.substr(b5,b3-b5+1); |
476 |
|
} |
477 |
< |
//-- keep only the l5..l3 range |
478 |
< |
rseq=rseq.substr(l5, l3-l5+1); |
479 |
< |
if (!rqv.is_empty()) |
480 |
< |
rqv=rqv.substr(l5, l3-l5+1); |
481 |
< |
}//some adapter was trimmed |
482 |
< |
} //adapter trimming |
483 |
< |
if (doCollapse) { |
484 |
< |
//keep read for later |
485 |
< |
FqDupRec* dr=dhash.Find(rseq.chars()); |
486 |
< |
if (dr==NULL) { //new entry |
487 |
< |
//if (prefix.is_empty()) |
488 |
< |
dhash.Add(rseq.chars(), |
489 |
< |
new FqDupRec(&rqv, rname.chars())); |
490 |
< |
//else dhash.Add(rseq.chars(), new FqDupRec(rqv.chars(),rqv.length())); |
477 |
> |
int nocounter=0; |
478 |
> |
writeRead(f_out2, seqid2, seqinfo2, rseq2, rqv2, nocounter); |
479 |
> |
} |
480 |
> |
} //paired read |
481 |
> |
// } |
482 |
> |
if (tcode>1) { //trashed |
483 |
> |
if (tcode=='s') trash_s++; |
484 |
> |
else if (tcode=='Q') trash_Q++; |
485 |
> |
else if (tcode=='N') trash_N++; |
486 |
> |
else if (tcode=='D') trash_D++; |
487 |
> |
else if (tcode=='3') trash_A3++; |
488 |
> |
else if (tcode=='5') trash_A5++; |
489 |
> |
if (trashReport) trash_report(tcode, seqid, freport); |
490 |
> |
} |
491 |
> |
else if (!doCollapse) { //write it |
492 |
> |
if (tcode>0) { |
493 |
> |
rseq=rseq.substr(a5,a3-a5+1); |
494 |
> |
if (!rqv.is_empty()) rqv=rqv.substr(a5,a3-a5+1); |
495 |
> |
} |
496 |
> |
writeRead(f_out, seqid, seqinfo, rseq, rqv, outcounter); |
497 |
> |
} |
498 |
> |
} //for each fastq record |
499 |
> |
delete fq2; |
500 |
> |
FRCLOSE(f_in); |
501 |
> |
FRCLOSE(f_in2); |
502 |
> |
if (doCollapse) { |
503 |
> |
outcounter=0; |
504 |
> |
int maxdup_count=1; |
505 |
> |
char* maxdup_seq=NULL; |
506 |
> |
dhash.startIterate(); |
507 |
> |
FqDupRec* qd=NULL; |
508 |
> |
char* seq=NULL; |
509 |
> |
while ((qd=dhash.NextData(seq))!=NULL) { |
510 |
> |
GStr rseq(seq); |
511 |
> |
//do the dusting here |
512 |
> |
if (doDust) { |
513 |
> |
int dustbases=dust(rseq); |
514 |
> |
if (dustbases>(rseq.length()>>1)) { |
515 |
> |
if (trashReport && qd->firstname!=NULL) { |
516 |
> |
fprintf(freport, "%s_x%d\tD\n",qd->firstname, qd->count); |
517 |
|
} |
518 |
< |
else |
519 |
< |
dr->add(rqv); |
520 |
< |
} //collapsing duplicates |
521 |
< |
else { //not collapsing duplicates |
522 |
< |
//do the dust filter now |
523 |
< |
if (doDust) { |
524 |
< |
int dustbases=dust(rseq); |
525 |
< |
if (dustbases>(rseq.length()>>1)) { |
526 |
< |
if (trashReport) { |
527 |
< |
fprintf(freport, "%s\tD\t%s\n",rname.chars(),rseq.chars()); |
528 |
< |
} |
529 |
< |
continue; |
530 |
< |
} |
509 |
< |
} |
510 |
< |
//print this record here |
511 |
< |
outcounter++; |
512 |
< |
if (isfasta) { |
513 |
< |
if (prefix.is_empty()) |
514 |
< |
fprintf(f_out, ">%s\n%s\n", rname.chars(), rseq.chars()); |
515 |
< |
else |
516 |
< |
fprintf(f_out, ">%s%08d\n%s\n", prefix.chars(), outcounter, |
517 |
< |
rseq.chars()); |
518 |
> |
trash_D+=qd->count; |
519 |
> |
continue; |
520 |
> |
} |
521 |
> |
} |
522 |
> |
outcounter++; |
523 |
> |
if (qd->count>maxdup_count) { |
524 |
> |
maxdup_count=qd->count; |
525 |
> |
maxdup_seq=seq; |
526 |
> |
} |
527 |
> |
if (isfasta) { |
528 |
> |
if (prefix.is_empty()) { |
529 |
> |
fprintf(f_out, ">%s_x%d\n%s\n", qd->firstname, qd->count, |
530 |
> |
rseq.chars()); |
531 |
|
} |
532 |
< |
else { //fastq |
533 |
< |
if (convert_phred) convertPhred(rqv); |
534 |
< |
if (prefix.is_empty()) |
522 |
< |
fprintf(f_out, "@%s\n%s\n+\n%s\n", rname.chars(), rseq.chars(),rqv.chars()); |
523 |
< |
else |
524 |
< |
fprintf(f_out, "@%s_%08d\n%s\n+\n%s\n", prefix.chars(), outcounter, |
525 |
< |
rseq.chars(),rqv.chars() ); |
532 |
> |
else { //use custom read name |
533 |
> |
fprintf(f_out, ">%s%08d_x%d\n%s\n", prefix.chars(), outcounter, |
534 |
> |
qd->count, rseq.chars()); |
535 |
|
} |
536 |
< |
} //not collapsing duplicates |
537 |
< |
} //for each fastq record |
538 |
< |
} //while each input file |
539 |
< |
FRCLOSE(f_in); |
540 |
< |
if (doCollapse) { |
541 |
< |
outcounter=0; |
533 |
< |
int maxdup_count=1; |
534 |
< |
char* maxdup_seq=NULL; |
535 |
< |
dhash.startIterate(); |
536 |
< |
FqDupRec* qd=NULL; |
537 |
< |
char* seq=NULL; |
538 |
< |
while ((qd=dhash.NextData(seq))!=NULL) { |
539 |
< |
GStr rseq(seq); |
540 |
< |
//do the dusting here |
541 |
< |
if (doDust) { |
542 |
< |
int dustbases=dust(rseq); |
543 |
< |
if (dustbases>(rseq.length()>>1)) { |
544 |
< |
if (trashReport && qd->firstname!=NULL) { |
545 |
< |
fprintf(freport, "%s_x%d\tD\t%s\n",qd->firstname, qd->count,seq); |
546 |
< |
} |
547 |
< |
continue; |
536 |
> |
} |
537 |
> |
else { //fastq format |
538 |
> |
if (convert_phred) convertPhred(qd->qv, qd->len); |
539 |
> |
if (prefix.is_empty()) { |
540 |
> |
fprintf(f_out, "@%s_x%d\n%s\n+\n%s\n", qd->firstname, qd->count, |
541 |
> |
rseq.chars(), qd->qv); |
542 |
|
} |
543 |
+ |
else { //use custom read name |
544 |
+ |
fprintf(f_out, "@%s%08d_x%d\n%s\n+\n%s\n", prefix.chars(), outcounter, |
545 |
+ |
qd->count, rseq.chars(), qd->qv); |
546 |
+ |
} |
547 |
+ |
} |
548 |
+ |
}//for each element of dhash |
549 |
+ |
if (maxdup_count>1) { |
550 |
+ |
GMessage("Maximum read multiplicity: x %d (read: %s)\n",maxdup_count, maxdup_seq); |
551 |
|
} |
552 |
< |
outcounter++; |
553 |
< |
if (qd->count>maxdup_count) { |
554 |
< |
maxdup_count=qd->count; |
555 |
< |
maxdup_seq=seq; |
556 |
< |
} |
557 |
< |
if (isfasta) { |
558 |
< |
if (prefix.is_empty()) { |
559 |
< |
fprintf(f_out, ">%s_x%d\n%s\n", qd->firstname, qd->count, |
560 |
< |
rseq.chars()); |
561 |
< |
} |
562 |
< |
else { //use custom read name |
563 |
< |
fprintf(f_out, ">%s%08d_x%d\n%s\n", prefix.chars(), outcounter, |
564 |
< |
qd->count, rseq.chars()); |
552 |
> |
} //collapse entries |
553 |
> |
if (verbose) { |
554 |
> |
if (paired_reads) { |
555 |
> |
GMessage(">Input files : %s , %s\n", infname.chars(), infname2.chars()); |
556 |
> |
GMessage("Number of input pairs :%9d\n", incounter); |
557 |
> |
GMessage(" Output pairs :%9d\n", outcounter); |
558 |
> |
} |
559 |
> |
else { |
560 |
> |
GMessage(">Input file : %s\n", infname.chars()); |
561 |
> |
GMessage("Number of input reads :%9d\n", incounter); |
562 |
> |
GMessage(" Output reads :%9d\n", outcounter); |
563 |
> |
} |
564 |
> |
GMessage("------------------------------------\n"); |
565 |
> |
if (num_trimmed5) |
566 |
> |
GMessage(" 5' trimmed :%9d (min. trim: %d)\n", num_trimmed5, min_trimmed5); |
567 |
> |
if (num_trimmed3) |
568 |
> |
GMessage(" 3' trimmed :%9d (min. trim: %d)\n", num_trimmed3, min_trimmed3); |
569 |
> |
GMessage("------------------------------------\n"); |
570 |
> |
if (trash_s>0) |
571 |
> |
GMessage(" Trashed by length:%9d\n", trash_s); |
572 |
> |
if (trash_N>0) |
573 |
> |
GMessage(" Trashed by N%%:%9d\n", trash_N); |
574 |
> |
if (trash_Q>0) |
575 |
> |
GMessage("Trashed by low quality:%9d\n", trash_Q); |
576 |
> |
if (trash_A5>0) |
577 |
> |
GMessage(" Trashed by 5' adapter:%9d\n", trash_A5); |
578 |
> |
if (trash_A3>0) |
579 |
> |
GMessage(" Trashed by 3' adapter:%9d\n", trash_A3); |
580 |
> |
} |
581 |
> |
if (trashReport) { |
582 |
> |
FWCLOSE(freport); |
583 |
|
} |
584 |
< |
} |
585 |
< |
else { //fastq format |
586 |
< |
if (convert_phred) convertPhred(qd->qv, qd->len); |
567 |
< |
if (prefix.is_empty()) { |
568 |
< |
fprintf(f_out, "@%s_x%d\n%s\n+\n%s\n", qd->firstname, qd->count, |
569 |
< |
rseq.chars(), qd->qv); |
570 |
< |
} |
571 |
< |
else { //use custom read name |
572 |
< |
fprintf(f_out, "@%s%08d_x%d\n%s\n+\n%s\n", prefix.chars(), outcounter, |
573 |
< |
qd->count, rseq.chars(), qd->qv); |
574 |
< |
} |
575 |
< |
} |
576 |
< |
}//for each element of dhash |
577 |
< |
if (maxdup_count>1) { |
578 |
< |
GMessage("Maximum read multiplicity: x %d (read: %s)\n",maxdup_count, maxdup_seq); |
579 |
< |
} |
580 |
< |
} //report collapsed dhash entries |
581 |
< |
GMessage("Number of input reads: %9d\n", icounter); |
582 |
< |
GMessage(" Output records: %9d\n", outcounter); |
583 |
< |
if (trashReport) { |
584 |
< |
FWCLOSE(freport); |
585 |
< |
} |
584 |
> |
FWCLOSE(f_out); |
585 |
> |
FWCLOSE(f_out2); |
586 |
> |
} //while each input file |
587 |
|
|
587 |
– |
FWCLOSE(f_out); |
588 |
|
//getc(stdin); |
589 |
|
} |
590 |
|
|
670 |
|
|
671 |
|
|
672 |
|
bool qtrim(GStr& qvs, int &l5, int &l3) { |
673 |
< |
if (qvtrim_min==0 || qvs.is_empty()) return false; |
673 |
> |
if (qvtrim_qmin==0 || qvs.is_empty()) return false; |
674 |
|
if (qv_phredtype==0) { |
675 |
|
//try to guess the Phred type |
676 |
|
int vmin=256, vmax=0; |
683 |
|
if (qv_phredtype==0) { |
684 |
|
GError("Error: couldn't determine Phred type, please use the -p33 or -p64 !\n"); |
685 |
|
} |
686 |
< |
} //guessing the Phred type |
686 |
> |
if (verbose) |
687 |
> |
GMessage("Input reads have Phred-%d quality values.\n", (qv_phredtype==33 ? 33 : 64)); |
688 |
> |
} //guessing Phred type |
689 |
|
for (l3=qvs.length()-1;l3>2;l3--) { |
690 |
< |
if (qvs[l3]-qv_phredtype>=qvtrim_min && qvs[l3-1]-qv_phredtype>=qvtrim_min) break; |
690 |
> |
if (qvs[l3]-qv_phredtype>=qvtrim_qmin && qvs[l3-1]-qv_phredtype>=qvtrim_qmin) break; |
691 |
|
} |
692 |
|
//just in case, check also the 5' the end (?) |
693 |
|
for (l5=0;l5<qvs.length()-3;l5++) { |
694 |
< |
if (qvs[l5]-qv_phredtype>=qvtrim_min && qvs[l5+1]-qv_phredtype>=qvtrim_min) break; |
694 |
> |
if (qvs[l5]-qv_phredtype>=qvtrim_qmin && qvs[l5+1]-qv_phredtype>=qvtrim_qmin) break; |
695 |
> |
} |
696 |
> |
if (qvtrim_max>0) { |
697 |
> |
if (qvs.length()-1-l3>qvtrim_max) l3=qvs.length()-1-qvtrim_max; |
698 |
> |
if (l5>qvtrim_max) l5=qvtrim_max; |
699 |
|
} |
700 |
|
return (l5>0 || l3<qvs.length()-1); |
701 |
|
} |
1319 |
|
for (int i=0;i<len;i++) q[i]+=qv_cvtadd; |
1320 |
|
} |
1321 |
|
|
1322 |
+ |
bool getFastxRec(GLineReader& fq, GStr& rseq, GStr& rqv, |
1323 |
+ |
GStr& rname, GStr& rinfo, GStr& infname) { |
1324 |
+ |
rseq=""; |
1325 |
+ |
rqv=""; |
1326 |
+ |
rname=""; |
1327 |
+ |
rinfo=""; |
1328 |
+ |
if (fq.eof()) return false; |
1329 |
+ |
char* l=fq.getLine(); |
1330 |
+ |
while (l!=NULL && (l[0]==0 || isspace(l[0]))) l=fq.getLine(); //ignore empty lines |
1331 |
+ |
if (l==NULL) return false; |
1332 |
+ |
/* if (rawFormat) { |
1333 |
+ |
//TODO: implement raw qseq parsing here |
1334 |
+ |
//if (raw type=N) then continue; //skip invalid/bad records |
1335 |
+ |
} //raw qseq format |
1336 |
+ |
else { // FASTQ or FASTA */ |
1337 |
+ |
isfasta=(l[0]=='>'); |
1338 |
+ |
if (!isfasta && l[0]!='@') GError("Error: fasta/fastq record marker not found(%s)\n%s\n", |
1339 |
+ |
infname.chars(), l); |
1340 |
+ |
GStr s(l); |
1341 |
+ |
rname=&(l[1]); |
1342 |
+ |
for (int i=0;i<rname.length();i++) |
1343 |
+ |
if (rname[i]<=' ') { |
1344 |
+ |
if (i<rname.length()-2) rinfo=rname.substr(i+1); |
1345 |
+ |
rname.cut(i); |
1346 |
+ |
break; |
1347 |
+ |
} |
1348 |
+ |
//now get the sequence |
1349 |
+ |
if ((l=fq.getLine())==NULL) |
1350 |
+ |
GError("Error: unexpected EOF after header for read %s (%s)\n", |
1351 |
+ |
rname.chars(), infname.chars()); |
1352 |
+ |
rseq=l; //this must be the DNA line |
1353 |
+ |
while ((l=fq.getLine())!=NULL) { |
1354 |
+ |
//seq can span multiple lines |
1355 |
+ |
if (l[0]=='>' || l[0]=='+') { |
1356 |
+ |
fq.pushBack(); |
1357 |
+ |
break; // |
1358 |
+ |
} |
1359 |
+ |
rseq+=l; |
1360 |
+ |
} //check for multi-line seq |
1361 |
+ |
if (!isfasta) { //reading fastq quality values, which can also be multi-line |
1362 |
+ |
if ((l=fq.getLine())==NULL) |
1363 |
+ |
GError("Error: unexpected EOF after sequence for %s\n", rname.chars()); |
1364 |
+ |
if (l[0]!='+') GError("Error: fastq qv header marker not detected!\n"); |
1365 |
+ |
if ((l=fq.getLine())==NULL) |
1366 |
+ |
GError("Error: unexpected EOF after qv header for %s\n", rname.chars()); |
1367 |
+ |
rqv=l; |
1368 |
+ |
//if (rqv.length()!=rseq.length()) |
1369 |
+ |
// GError("Error: qv len != seq len for %s\n", rname.chars()); |
1370 |
+ |
while (rqv.length()<rseq.length() && ((l=fq.getLine())!=NULL)) { |
1371 |
+ |
rqv+=l; //append to qv string |
1372 |
+ |
} |
1373 |
+ |
}// fastq |
1374 |
+ |
// } //<-- FASTA or FASTQ |
1375 |
+ |
rseq.upper(); //TODO: what if we care about masking? |
1376 |
+ |
return true; |
1377 |
+ |
} |
1378 |
+ |
|
1379 |
+ |
char process_read(GStr& rname, GStr& rseq, GStr& rqv, int &l5, int &l3) { |
1380 |
+ |
//returns 0 if the read was untouched, 1 if it was just trimmed |
1381 |
+ |
// and a trash code if it was trashed |
1382 |
+ |
l5=0; |
1383 |
+ |
l3=rseq.length()-1; |
1384 |
+ |
if (l3-l5+1<min_read_len) { |
1385 |
+ |
return 's'; |
1386 |
+ |
} |
1387 |
+ |
GStr wseq(rseq.chars()); |
1388 |
+ |
GStr wqv(rqv.chars()); |
1389 |
+ |
int w5=l5; |
1390 |
+ |
int w3=l3; |
1391 |
+ |
//first do the q-based trimming |
1392 |
+ |
if (qvtrim_qmin!=0 && !wqv.is_empty() && qtrim(wqv, w5, w3)) { // qv-threshold trimming |
1393 |
+ |
if (w3-w5+1<min_read_len) { |
1394 |
+ |
return 'Q'; //invalid read |
1395 |
+ |
} |
1396 |
+ |
//-- keep only the w5..w3 range |
1397 |
+ |
l5=w5; |
1398 |
+ |
l3=w3; |
1399 |
+ |
wseq=wseq.substr(w5, w3-w5+1); |
1400 |
+ |
if (!wqv.is_empty()) |
1401 |
+ |
wqv=wqv.substr(w5, w3-w5+1); |
1402 |
+ |
} //qv trimming |
1403 |
+ |
// N-trimming on the remaining read seq |
1404 |
+ |
if (ntrim(wseq, w5, w3)) { |
1405 |
+ |
//GMessage("before: %s\n",wseq.chars()); |
1406 |
+ |
//GMessage("after : %s (%d)\n",wseq.substr(w5,w3-w5+1).chars(),w3-w5+1); |
1407 |
+ |
l5+=w5; |
1408 |
+ |
l3-=(wseq.length()-1-w3); |
1409 |
+ |
if (w3-w5+1<min_read_len) { |
1410 |
+ |
return 'N'; //to be trashed |
1411 |
+ |
} |
1412 |
+ |
//-- keep only the w5..w3 range |
1413 |
+ |
wseq=wseq.substr(w5, w3-w5+1); |
1414 |
+ |
if (!wqv.is_empty()) |
1415 |
+ |
wqv=wqv.substr(w5, w3-w5+1); |
1416 |
+ |
w5=0; |
1417 |
+ |
w3=wseq.length()-1; |
1418 |
+ |
} |
1419 |
+ |
if (a3len>0) { |
1420 |
+ |
if (trim_adapter3(wseq, w5, w3)) { |
1421 |
+ |
int trimlen=wseq.length()-(w3-w5+1); |
1422 |
+ |
num_trimmed3++; |
1423 |
+ |
if (trimlen<min_trimmed3) |
1424 |
+ |
min_trimmed3=trimlen; |
1425 |
+ |
l5+=w5; |
1426 |
+ |
l3-=(wseq.length()-1-w3); |
1427 |
+ |
if (w3-w5+1<min_read_len) { |
1428 |
+ |
return '3'; |
1429 |
+ |
} |
1430 |
+ |
//-- keep only the w5..w3 range |
1431 |
+ |
wseq=wseq.substr(w5, w3-w5+1); |
1432 |
+ |
if (!wqv.is_empty()) |
1433 |
+ |
wqv=wqv.substr(w5, w3-w5+1); |
1434 |
+ |
}//some adapter was trimmed |
1435 |
+ |
} //adapter trimming |
1436 |
+ |
if (a5len>0) { |
1437 |
+ |
if (trim_adapter5(wseq, w5, w3)) { |
1438 |
+ |
int trimlen=wseq.length()-(w3-w5+1); |
1439 |
+ |
num_trimmed5++; |
1440 |
+ |
if (trimlen<min_trimmed5) |
1441 |
+ |
min_trimmed5=trimlen; |
1442 |
+ |
l5+=w5; |
1443 |
+ |
l3-=(wseq.length()-1-w3); |
1444 |
+ |
if (w3-w5+1<min_read_len) { |
1445 |
+ |
return '5'; |
1446 |
+ |
} |
1447 |
+ |
//-- keep only the w5..w3 range |
1448 |
+ |
wseq=wseq.substr(w5, w3-w5+1); |
1449 |
+ |
if (!wqv.is_empty()) |
1450 |
+ |
wqv=wqv.substr(w5, w3-w5+1); |
1451 |
+ |
}//some adapter was trimmed |
1452 |
+ |
} //adapter trimming |
1453 |
+ |
if (doCollapse) { |
1454 |
+ |
//keep read for later |
1455 |
+ |
FqDupRec* dr=dhash.Find(wseq.chars()); |
1456 |
+ |
if (dr==NULL) { //new entry |
1457 |
+ |
//if (prefix.is_empty()) |
1458 |
+ |
dhash.Add(wseq.chars(), |
1459 |
+ |
new FqDupRec(&wqv, rname.chars())); |
1460 |
+ |
//else dhash.Add(wseq.chars(), new FqDupRec(wqv.chars(),wqv.length())); |
1461 |
+ |
} |
1462 |
+ |
else |
1463 |
+ |
dr->add(wqv); |
1464 |
+ |
} //collapsing duplicates |
1465 |
+ |
else { //not collapsing duplicates |
1466 |
+ |
//apply the dust filter now |
1467 |
+ |
if (doDust) { |
1468 |
+ |
int dustbases=dust(wseq); |
1469 |
+ |
if (dustbases>(wseq.length()>>1)) { |
1470 |
+ |
return 'D'; |
1471 |
+ |
} |
1472 |
+ |
} |
1473 |
+ |
} //not collapsing duplicates |
1474 |
+ |
return (l5>0 || l3<rseq.length()-1) ? 1 : 0; |
1475 |
+ |
} |
1476 |
+ |
|
1477 |
+ |
void printHeader(FILE* f_out, char recmarker, GStr& rname, GStr& rinfo) { |
1478 |
+ |
//GMessage("printing Header..%c%s\n",recmarker, rname.chars()); |
1479 |
+ |
if (rinfo.is_empty()) fprintf(f_out, "%c%s\n",recmarker,rname.chars()); |
1480 |
+ |
else fprintf(f_out, "%c%s %s\n",recmarker, rname.chars(), rinfo.chars()); |
1481 |
+ |
} |
1482 |
+ |
|
1483 |
+ |
void writeRead(FILE* f_out, GStr& rname, GStr& rinfo, GStr& rseq, GStr& rqv, int& outcounter) { |
1484 |
+ |
outcounter++; |
1485 |
+ |
bool asFasta=(rqv.is_empty() || fastaOutput); |
1486 |
+ |
if (asFasta) { |
1487 |
+ |
if (prefix.is_empty()) { |
1488 |
+ |
printHeader(f_out, '>',rname,rinfo); |
1489 |
+ |
fprintf(f_out, "%s\n", rseq.chars()); //plain one-line fasta for now |
1490 |
+ |
} |
1491 |
+ |
else { |
1492 |
+ |
fprintf(f_out, ">%s%08d\n%s\n", prefix.chars(), outcounter, |
1493 |
+ |
rseq.chars()); |
1494 |
+ |
} |
1495 |
+ |
} |
1496 |
+ |
else { //fastq |
1497 |
+ |
if (convert_phred) convertPhred(rqv); |
1498 |
+ |
if (prefix.is_empty()) { |
1499 |
+ |
printHeader(f_out, '@', rname, rinfo); |
1500 |
+ |
fprintf(f_out, "%s\n+\n%s\n", rseq.chars(), rqv.chars()); |
1501 |
+ |
} |
1502 |
+ |
else |
1503 |
+ |
fprintf(f_out, "@%s_%08d\n%s\n+\n%s\n", prefix.chars(), outcounter, |
1504 |
+ |
rseq.chars(),rqv.chars() ); |
1505 |
+ |
} |
1506 |
+ |
} |
1507 |
+ |
|
1508 |
+ |
void trash_report(char trashcode, GStr& rname, FILE* freport) { |
1509 |
+ |
if (freport==NULL || trashcode<=' ') return; |
1510 |
+ |
if (trashcode=='3' || trashcode=='5') { |
1511 |
+ |
fprintf(freport, "%s\tA%c\n",rname.chars(),trashcode); |
1512 |
+ |
} |
1513 |
+ |
else { |
1514 |
+ |
fprintf(freport, "%s\t%c\n",rname.chars(),trashcode); |
1515 |
+ |
} |
1516 |
+ |
//tcounter++; |
1517 |
+ |
} |
1518 |
+ |
|
1519 |
+ |
GStr getFext(GStr& s, int* xpos=NULL) { |
1520 |
+ |
//s must be a filename without a path |
1521 |
+ |
GStr r(""); |
1522 |
+ |
if (xpos!=NULL) *xpos=0; |
1523 |
+ |
if (s.is_empty() || s=="-") return r; |
1524 |
+ |
int p=s.rindex('.'); |
1525 |
+ |
int d=s.rindex('/'); |
1526 |
+ |
if (p<=0 || p>s.length()-2 || p<s.length()-7 || p<d) return r; |
1527 |
+ |
r=s.substr(p+1); |
1528 |
+ |
if (xpos!=NULL) *xpos=p+1; |
1529 |
+ |
r.lower(); |
1530 |
+ |
return r; |
1531 |
+ |
} |
1532 |
+ |
|
1533 |
+ |
void baseFileName(GStr& fname) { |
1534 |
+ |
//remove all known extensions, like .txt,fq,fastq,fasta,fa)(.gz .gzip .bz2 .bzip2) . |
1535 |
+ |
int xpos=0; |
1536 |
+ |
GStr fext=getFext(fname, &xpos); |
1537 |
+ |
if (xpos<=1) return; |
1538 |
+ |
bool extdel=false; |
1539 |
+ |
GStr f2; |
1540 |
+ |
if (fext=="z" || fext=="zip") { |
1541 |
+ |
extdel=true; |
1542 |
+ |
} |
1543 |
+ |
else if (fext.length()>=2) { |
1544 |
+ |
f2=fext.substr(0,2); |
1545 |
+ |
extdel=(f2=="gz" || f2=="bz"); |
1546 |
+ |
} |
1547 |
+ |
if (extdel) { |
1548 |
+ |
fname.cut(xpos-1); |
1549 |
+ |
fext=getFext(fname, &xpos); |
1550 |
+ |
if (xpos<=1) return; |
1551 |
+ |
} |
1552 |
+ |
extdel=false; |
1553 |
+ |
if (fext=="f" || fext=="fq" || fext=="txt" || fext=="seq" || fext=="sequence") { |
1554 |
+ |
extdel=true; |
1555 |
+ |
} |
1556 |
+ |
else if (fext.length()>=2) { |
1557 |
+ |
extdel=(fext.substr(0,2)=="fa"); |
1558 |
+ |
} |
1559 |
+ |
if (extdel) fname.cut(xpos-1); |
1560 |
+ |
GStr fncp(fname); |
1561 |
+ |
fncp.lower(); |
1562 |
+ |
fncp.chomp("seq"); |
1563 |
+ |
fncp.chomp("sequence"); |
1564 |
+ |
fncp.trimR("_."); |
1565 |
+ |
if (fncp.length()<fname.length()) fname.cut(fncp.length()); |
1566 |
+ |
} |
1567 |
+ |
|
1568 |
+ |
FILE* prepOutFile(GStr& infname, GStr& pocmd) { |
1569 |
+ |
FILE* f_out=NULL; |
1570 |
+ |
GStr fname(getFileName(infname.chars())); |
1571 |
+ |
//eliminate known extensions |
1572 |
+ |
baseFileName(fname); |
1573 |
+ |
if (outsuffix.is_empty() || outsuffix=="-") { return stdout; } |
1574 |
+ |
else if (pocmd.is_empty()) { |
1575 |
+ |
GStr oname(fname); |
1576 |
+ |
oname.append('.'); |
1577 |
+ |
oname.append(outsuffix); |
1578 |
+ |
f_out=fopen(oname.chars(),"w"); |
1579 |
+ |
if (f_out==NULL) GError("Error: cannot create '%s'\n",oname.chars()); |
1580 |
+ |
} |
1581 |
+ |
else { |
1582 |
+ |
GStr oname(">"); |
1583 |
+ |
oname.append(fname); |
1584 |
+ |
oname.append('.'); |
1585 |
+ |
oname.append(outsuffix); |
1586 |
+ |
pocmd.append(oname); |
1587 |
+ |
f_out=popen(pocmd.chars(), "w"); |
1588 |
+ |
if (f_out==NULL) GError("Error: cannot popen '%s'\n",pocmd.chars()); |
1589 |
+ |
} |
1590 |
+ |
return f_out; |
1591 |
+ |
} |
1592 |
+ |
|
1593 |
+ |
void guess_unzip(GStr& fname, GStr& picmd) { |
1594 |
+ |
GStr fext=getFext(fname); |
1595 |
+ |
if (fext=="gz" || fext=="gzip" || fext=="z") { |
1596 |
+ |
picmd="gzip -cd "; |
1597 |
+ |
} |
1598 |
+ |
else if (fext=="bz2" || fext=="bzip2" || fext=="bz" || fext=="bzip") { |
1599 |
+ |
picmd="bzip2 -cd "; |
1600 |
+ |
} |
1601 |
+ |
} |
1602 |
+ |
|
1603 |
+ |
void setupFiles(FILE*& f_in, FILE*& f_in2, FILE*& f_out, FILE*& f_out2, |
1604 |
+ |
GStr& s, GStr& infname, GStr& infname2) { |
1605 |
+ |
// uses outsuffix to generate output file names and open file handles as needed |
1606 |
+ |
infname=""; |
1607 |
+ |
infname2=""; |
1608 |
+ |
f_in=NULL; |
1609 |
+ |
f_in2=NULL; |
1610 |
+ |
f_out=NULL; |
1611 |
+ |
f_out2=NULL; |
1612 |
+ |
//analyze outsuffix intent |
1613 |
+ |
GStr pocmd; |
1614 |
+ |
GStr ox=getFext(outsuffix); |
1615 |
+ |
if (ox.length()>2) ox=ox.substr(0,2); |
1616 |
+ |
if (ox=="gz") pocmd="gzip -9 -c "; |
1617 |
+ |
else if (ox=="bz") pocmd="bzip2 -9 -c "; |
1618 |
+ |
if (s=="-") { |
1619 |
+ |
f_in=stdin; |
1620 |
+ |
infname="stdin"; |
1621 |
+ |
f_out=prepOutFile(infname, pocmd); |
1622 |
+ |
return; |
1623 |
+ |
} // streaming from stdin |
1624 |
+ |
s.startTokenize(",:"); |
1625 |
+ |
s.nextToken(infname); |
1626 |
+ |
bool paired=s.nextToken(infname2); |
1627 |
+ |
if (fileExists(infname.chars())==0) |
1628 |
+ |
GError("Error: cannot find file %s!\n",infname.chars()); |
1629 |
+ |
GStr fname(getFileName(infname.chars())); |
1630 |
+ |
GStr picmd; |
1631 |
+ |
guess_unzip(fname, picmd); |
1632 |
+ |
if (picmd.is_empty()) { |
1633 |
+ |
f_in=fopen(infname.chars(), "r"); |
1634 |
+ |
if (f_in==NULL) GError("Error opening file '%s'!\n",infname.chars()); |
1635 |
+ |
} |
1636 |
+ |
else { |
1637 |
+ |
picmd.append(infname); |
1638 |
+ |
f_in=popen(picmd.chars(), "r"); |
1639 |
+ |
if (f_in==NULL) GError("Error at popen %s!\n", picmd.chars()); |
1640 |
+ |
} |
1641 |
+ |
f_out=prepOutFile(infname, pocmd); |
1642 |
+ |
if (!paired) return; |
1643 |
+ |
if (doCollapse) GError("Error: sorry, -C option cannot be used with paired reads!\n"); |
1644 |
+ |
// ---- paired reads:------------- |
1645 |
+ |
if (fileExists(infname2.chars())==0) |
1646 |
+ |
GError("Error: cannot find file %s!\n",infname2.chars()); |
1647 |
+ |
picmd=""; |
1648 |
+ |
GStr fname2(getFileName(infname2.chars())); |
1649 |
+ |
guess_unzip(fname2, picmd); |
1650 |
+ |
if (picmd.is_empty()) { |
1651 |
+ |
f_in2=fopen(infname2.chars(), "r"); |
1652 |
+ |
if (f_in2==NULL) GError("Error opening file '%s'!\n",infname2.chars()); |
1653 |
+ |
} |
1654 |
+ |
else { |
1655 |
+ |
picmd.append(infname2); |
1656 |
+ |
f_in2=popen(picmd.chars(), "r"); |
1657 |
+ |
if (f_in2==NULL) GError("Error at popen %s!\n", picmd.chars()); |
1658 |
+ |
} |
1659 |
+ |
f_out2=prepOutFile(infname2, pocmd); |
1660 |
+ |
} |