ViewVC Help
View File | Revision Log | Show Annotations | View Changeset | Root Listing
root/gclib/fqtrim/fqtrim.cpp
Revision: 102
Committed: Fri Oct 7 21:44:59 2011 UTC (8 years, 1 month ago) by gpertea
File size: 38427 byte(s)
Log Message:
wip

Line File contents
1 #include "GArgs.h"
2 #include "GStr.h"
3 #include "GHash.hh"
4 #include "GList.hh"
5 #include <ctype.h>
6 #include "GAlnExtend.h"
7
8 #define USAGE "Usage:\n\
9 fqtrim [{-5 <5adapter> -3 <3adapter>|-f <adapters_file>}] [-a <min_matchlen>]\\\n\
10 [-q <minq> [-t <trim_max_len>]] [-p {64|33}] [-o <outsuffix>]\\\n\
11 [-l <minlen>] [-C] [-D] [-Q] [-n <rename_prefix>] [-r <discarded.lst>]\\\n\
12 <input.fq>[,<input_mates.fq>\n\
13 \n\
14 Trim low quality bases at the 3' end and can trim adapter sequence(s), filter\n\
15 for low complexity and collapse duplicate reads.\n\
16 If read pairs should be trimmed and kept together (i.e. without discarding\n\
17 one read in a pair), the two file names should be given delimited by a comma\n\
18 or a colon character\n\
19 \n\
20 Options:\n\
21 -n rename all the reads using the <prefix> followed by a read counter;\n\
22 if -C option was given, the suffix \"_x<N>\" is appended, with <N> being\n\
23 the read duplication count\n\
24 -o unless this parameter is '-', write the trimmed/filtered reads to \n\
25 file(s) named <input>.<outsuffix> which will be created in the \n\
26 current (working) directory; (writes to stdout if -o- is given);\n\
27 a suffix ending with .gz, .gzip or .bz2 will enforce compression\n\
28 -f file with adapter sequences to trim, each line having this format:\n\
29 <5'-adapter-sequence> <3'-adapter-sequence>\n\
30 -5 trim the given adapter or primer sequence at the 5' end of each read\n\
31 (e.g. -5 CGACAGGTTCAGAGTTCTACAGTCCGACGATC)\n\
32 -3 trim the given adapter sequence at the 3' end of each read\n\
33 (e.g. -3 TCGTATGCCGTCTTCTGCTTG)\n\
34 -A disable polyA trimming (enabled by default)\n\
35 -T enable polyT trimming (disabled by default)\n\
36 -y minimum length of exact match to adaptor sequence at the proper end (6)\n\
37 -q trim bases with quality value lower than <minq> (starting at the 3' end)\n\
38 -t for -q option, maximum trimming at the 3' end is limited to <trim_max_len>\n\
39 -m maximum percentage of Ns allowed in a read after trimming (default 7)\n\
40 -l minimum \"clean\" length after trimming that a read must have\n\
41 in order to pass the filter (default: 16)\n\
42 -r write a simple \"trash report\" file listing the discarded reads with a\n\
43 one letter code for the reason why they were trashed\n\
44 -D apply a low-complexity (dust) filter and discard any read that has over\n\
45 50% of its length detected as low complexity\n\
46 -C collapse duplicate reads and add a prefix with their count to the read\n\
47 name (e.g. for microRNAs)\n\
48 -p input is phred64/phred33 (use -p64 or -p33)\n\
49 -Q convert quality values to the other Phred qv type\n\
50 -V verbose processing\n\
51 "
52
53 //-z for -o option, the output stream(s) will be first piped into the given\n
54 // <zcmd> command, which must output to stdout (e.g. -z 'bzip2 -9 -c')\n
55
56
57 // example 3' adapter for miRNAs: TCGTATGCCGTCTTCTGCTTG
58
59 //For paired reads sequencing:
60 //3' : ACACTCTTTCCCTACACGACGCTCTTCCGATCT
61 //5' : GATCGGAAGAGCGGTTCAGCAGGAATGCCGAG
62 //FILE* f_out=NULL; //stdout if not provided
63 //FILE* f_out2=NULL; //for paired reads
64 //FILE* f_in=NULL; //input fastq (stdin if not provided)
65 //FILE* f_in2=NULL; //for paired reads
66 FILE* freport=NULL;
67
68 bool debug=false;
69 bool verbose=false;
70 bool doCollapse=false;
71 bool doDust=false;
72 bool fastaOutput=false;
73 bool trashReport=false;
74 //bool rawFormat=false;
75 int min_read_len=16;
76 double max_perc_N=7.0;
77 int dust_cutoff=16;
78 bool isfasta=false;
79 bool convert_phred=false;
80 GStr outsuffix; // -o
81 GStr prefix;
82 GStr zcmd;
83 int num_trimmed5=0;
84 int num_trimmed3=0;
85 int num_trimmedN=0;
86 int num_trimmedQ=0;
87 int min_trimmed5=INT_MAX;
88 int min_trimmed3=INT_MAX;
89
90 int qvtrim_qmin=0;
91 int qvtrim_max=0; //for -q, do not trim at 3'-end more than this number of bases
92 int qv_phredtype=0; // could be 64 or 33 (0 means undetermined yet)
93 int qv_cvtadd=0; //could be -31 or +31
94
95 // adaptor matching metrics -- for X-drop ungapped extension
96 const int poly_m_score=2; //match score
97 const int poly_mis_score=-3; //mismatch
98 const int poly_dropoff_score=7;
99 int poly_minScore=12; //i.e. an exact match of 6 bases at the proper ends WILL be trimmed
100
101 const char *polyA_seed="AAAA";
102 const char *polyT_seed="TTTT";
103 /*
104 struct CAdapters {
105 GStr a5;
106 GStr a3;
107 CAdapters(const char* s5=NULL, const char* s3=NULL):a5(s5),a3(s3) {
108 }
109 };
110 */
111 GVec<GStr> adapters5;
112 GVec<GStr> adapters3;
113
114 // element in dhash:
115 class FqDupRec {
116 public:
117 int count; //how many of these reads are the same
118 int len; //length of qv
119 char* firstname; //optional, only if we want to keep the original read names
120 char* qv;
121 FqDupRec(GStr* qstr=NULL, const char* rname=NULL) {
122 len=0;
123 qv=NULL;
124 firstname=NULL;
125 count=0;
126 if (qstr!=NULL) {
127 qv=Gstrdup(qstr->chars());
128 len=qstr->length();
129 count++;
130 }
131 if (rname!=NULL) firstname=Gstrdup(rname);
132 }
133 ~FqDupRec() {
134 GFREE(qv);
135 GFREE(firstname);
136 }
137 void add(GStr& d) { //collapse another record into this one
138 if (d.length()!=len)
139 GError("Error at FqDupRec::add(): cannot collapse reads with different length!\n");
140 count++;
141 for (int i=0;i<len;i++)
142 qv[i]=(qv[i]+d[i])/2;
143 }
144 };
145
146 void openfw(FILE* &f, GArgs& args, char opt) {
147 GStr s=args.getOpt(opt);
148 if (!s.is_empty()) {
149 if (s=='-') f=stdout;
150 else {
151 f=fopen(s.chars(),"w");
152 if (f==NULL) GError("Error creating file: %s\n", s.chars());
153 }
154 }
155 }
156
157 #define FWCLOSE(fh) if (fh!=NULL && fh!=stdout) fclose(fh)
158 #define FRCLOSE(fh) if (fh!=NULL && fh!=stdin) fclose(fh)
159
160 GHash<FqDupRec> dhash; //hash to keep track of duplicates
161
162 int loadAdapters(const char* fname);
163
164 void setupFiles(FILE*& f_in, FILE*& f_in2, FILE*& f_out, FILE*& f_out2,
165 GStr& s, GStr& infname, GStr& infname2);
166 // uses outsuffix to generate output file names and open file handles as needed
167
168 void writeRead(FILE* f_out, GStr& rname, GStr& rinfo, GStr& rseq, GStr& rqv, int& outcounter);
169 void trash_report(char trashcode, GStr& rname, FILE* freport);
170
171 bool getFastxRec(GLineReader& fq, GStr& rseq, GStr& rqv,
172 GStr& rname, GStr& rinfo, GStr& infname);
173
174 char process_read(GStr& rname, GStr& rseq, GStr& rqv, int &l5, int &l3);
175 //returns 0 if the read was untouched, 1 if it was trimmed and a trash code if it was trashed
176
177 bool ntrim(GStr& rseq, int &l5, int &l3); //returns true if any trimming occured
178 bool qtrim(GStr& qvs, int &l5, int &l3); //return true if any trimming occured
179 int dust(GStr& seq);
180 bool trim_poly5(GStr &seq, int &l5, int &l3, const char* poly_seed); //returns true if any trimming occured
181 bool trim_poly3(GStr &seq, int &l5, int &l3, const char* poly_seed);
182 bool trim_adapter5(GStr& seq, int &l5, int &l3); //returns true if any trimming occured
183 bool trim_adapter3(GStr& seq, int &l5, int &l3);
184
185 void convertPhred(char* q, int len);
186 void convertPhred(GStr& q);
187
188 int main(int argc, char * const argv[]) {
189 GArgs args(argc, argv, "YQDCVl:d:3:5:m:n:r:p:q:f:t:o:z:a:");
190 int e;
191 if ((e=args.isError())>0) {
192 GMessage("%s\nInvalid argument: %s\n", USAGE, argv[e]);
193 exit(224);
194 }
195 debug=(args.getOpt('Y')!=NULL);
196 verbose=(args.getOpt('V')!=NULL);
197 convert_phred=(args.getOpt('Q')!=NULL);
198 doCollapse=(args.getOpt('C')!=NULL);
199 doDust=(args.getOpt('D')!=NULL);
200 /*
201 rawFormat=(args.getOpt('R')!=NULL);
202 if (rawFormat) {
203 GError("Sorry, raw qseq format parsing is not implemented yet!\n");
204 }
205 */
206 prefix=args.getOpt('n');
207 GStr s=args.getOpt('l');
208 if (!s.is_empty())
209 min_read_len=s.asInt();
210 s=args.getOpt('m');
211 if (!s.is_empty())
212 max_perc_N=s.asDouble();
213 s=args.getOpt('d');
214 if (!s.is_empty()) {
215 dust_cutoff=s.asInt();
216 doDust=true;
217 }
218 s=args.getOpt('q');
219 if (!s.is_empty()) {
220 qvtrim_qmin=s.asInt();
221 }
222 s=args.getOpt('t');
223 if (!s.is_empty()) {
224 qvtrim_max=s.asInt();
225 }
226 s=args.getOpt('p');
227 if (!s.is_empty()) {
228 int v=s.asInt();
229 if (v==33) {
230 qv_phredtype=33;
231 qv_cvtadd=31;
232 }
233 else if (v==64) {
234 qv_phredtype=64;
235 qv_cvtadd=-31;
236 }
237 else
238 GMessage("%s\nInvalid value for -p option (can only be 64 or 33)!\n",USAGE);
239 }
240 s=args.getOpt('f');
241 if (!s.is_empty()) {
242 loadAdapters(s.chars());
243 }
244 bool fileAdapters=adapters5.Count()+adapters3.Count();
245 s=args.getOpt('5');
246 if (!s.is_empty()) {
247 if (fileAdapters)
248 GError("Error: options -5 and -f cannot be used together!\n");
249 s.upper();
250 adapters5.Add(s);
251 }
252 s=args.getOpt('3');
253 if (!s.is_empty()) {
254 if (fileAdapters)
255 GError("Error: options -3 and -f cannot be used together!\n");
256 s.upper();
257 adapters3.Add(s);
258 }
259 s=args.getOpt('y');
260 if (!s.is_empty()) {
261 int minmatch=s.asInt();
262 poly_minScore=minmatch*poly_m_score;
263 }
264
265 if (args.getOpt('o')!=NULL) outsuffix=args.getOpt('o');
266 else outsuffix="-";
267 trashReport= (args.getOpt('r')!=NULL);
268 int fcount=args.startNonOpt();
269 if (fcount==0) {
270 GMessage(USAGE);
271 exit(224);
272 }
273 if (fcount>1 && doCollapse) {
274 GError("%s Sorry, the -C option only works with a single input.\n", USAGE);
275 }
276 //openfw(f_out, args, 'o');
277 //if (f_out==NULL) f_out=stdout;
278 if (trashReport)
279 openfw(freport, args, 'r');
280 char* infile=NULL;
281 while ((infile=args.nextNonOpt())!=NULL) {
282 int incounter=0; //counter for input reads
283 int outcounter=0; //counter for output reads
284 int trash_s=0; //too short from the get go
285 int trash_Q=0;
286 int trash_N=0;
287 int trash_D=0;
288 int trash_A3=0;
289 int trash_A5=0;
290 s=infile;
291 GStr infname;
292 GStr infname2;
293 FILE* f_in=NULL;
294 FILE* f_in2=NULL;
295 FILE* f_out=NULL;
296 FILE* f_out2=NULL;
297 bool paired_reads=false;
298 setupFiles(f_in, f_in2, f_out, f_out2, s, infname, infname2);
299 GLineReader fq(f_in);
300 GLineReader* fq2=NULL;
301 if (f_in2!=NULL) {
302 fq2=new GLineReader(f_in2);
303 paired_reads=true;
304 }
305 GStr rseq, rqv, seqid, seqinfo;
306 GStr rseq2, rqv2, seqid2, seqinfo2;
307 while (getFastxRec(fq, rseq, rqv, seqid, seqinfo, infname)) {
308 incounter++;
309 int a5=0, a3=0, b5=0, b3=0;
310 char tcode=0, tcode2=0;
311 tcode=process_read(seqid, rseq, rqv, a5, a3);
312 //if (!doCollapse) {
313 if (fq2!=NULL) {
314 getFastxRec(*fq2, rseq2, rqv2, seqid2, seqinfo2, infname2);
315 if (seqid.substr(0,seqid.length()-1)!=seqid2.substr(0,seqid2.length()-1)) {
316 GError("Error: no paired match for read %s vs %s (%s,%s)\n",
317 seqid.chars(), seqid2.chars(), infname.chars(), infname2.chars());
318 }
319 tcode2=process_read(seqid2, rseq2, rqv2, b5, b3);
320 //decide what to do with this pair and print rseq2 if the pair makes it
321 if (tcode>1 && tcode2<=1) {
322 //"untrash" rseq
323 if (a3-a5+1<min_read_len) {
324 a5=1;
325 if (a3<min_read_len) { a3= GMIN(rseq.length()-1, min_read_len+1); }
326 }
327 tcode=1;
328 }
329 else if (tcode<=1 && tcode2>1) {
330 //"untrash" rseq2
331 if (b3-b5+1<min_read_len) {
332 b5=1;
333 if (b3<min_read_len) { b3= GMIN((rseq2.length()-1),(min_read_len+1)); }
334 }
335 tcode2=1;
336 }
337 if (tcode<=1) { //trimmed or left intact -- write it!
338 if (tcode>0) {
339 rseq2=rseq2.substr(b5,b3-b5+1);
340 if (!rqv2.is_empty()) rqv2=rqv2.substr(b5,b3-b5+1);
341 }
342 int nocounter=0;
343 writeRead(f_out2, seqid2, seqinfo2, rseq2, rqv2, nocounter);
344 }
345 } //paired read
346 // }
347 if (tcode>1) { //trashed
348 if (tcode=='s') trash_s++;
349 else if (tcode=='Q') trash_Q++;
350 else if (tcode=='N') trash_N++;
351 else if (tcode=='D') trash_D++;
352 else if (tcode=='3') trash_A3++;
353 else if (tcode=='5') trash_A5++;
354 if (trashReport) trash_report(tcode, seqid, freport);
355 }
356 else if (!doCollapse) { //write it
357 if (tcode>0) {
358 rseq=rseq.substr(a5,a3-a5+1);
359 if (!rqv.is_empty()) rqv=rqv.substr(a5,a3-a5+1);
360 }
361 writeRead(f_out, seqid, seqinfo, rseq, rqv, outcounter);
362 }
363 } //for each fastq record
364 delete fq2;
365 FRCLOSE(f_in);
366 FRCLOSE(f_in2);
367 if (doCollapse) {
368 outcounter=0;
369 int maxdup_count=1;
370 char* maxdup_seq=NULL;
371 dhash.startIterate();
372 FqDupRec* qd=NULL;
373 char* seq=NULL;
374 while ((qd=dhash.NextData(seq))!=NULL) {
375 GStr rseq(seq);
376 //do the dusting here
377 if (doDust) {
378 int dustbases=dust(rseq);
379 if (dustbases>(rseq.length()>>1)) {
380 if (trashReport && qd->firstname!=NULL) {
381 fprintf(freport, "%s_x%d\tD\n",qd->firstname, qd->count);
382 }
383 trash_D+=qd->count;
384 continue;
385 }
386 }
387 outcounter++;
388 if (qd->count>maxdup_count) {
389 maxdup_count=qd->count;
390 maxdup_seq=seq;
391 }
392 if (isfasta) {
393 if (prefix.is_empty()) {
394 fprintf(f_out, ">%s_x%d\n%s\n", qd->firstname, qd->count,
395 rseq.chars());
396 }
397 else { //use custom read name
398 fprintf(f_out, ">%s%08d_x%d\n%s\n", prefix.chars(), outcounter,
399 qd->count, rseq.chars());
400 }
401 }
402 else { //fastq format
403 if (convert_phred) convertPhred(qd->qv, qd->len);
404 if (prefix.is_empty()) {
405 fprintf(f_out, "@%s_x%d\n%s\n+\n%s\n", qd->firstname, qd->count,
406 rseq.chars(), qd->qv);
407 }
408 else { //use custom read name
409 fprintf(f_out, "@%s%08d_x%d\n%s\n+\n%s\n", prefix.chars(), outcounter,
410 qd->count, rseq.chars(), qd->qv);
411 }
412 }
413 }//for each element of dhash
414 if (maxdup_count>1) {
415 GMessage("Maximum read multiplicity: x %d (read: %s)\n",maxdup_count, maxdup_seq);
416 }
417 } //collapse entries
418 if (verbose) {
419 if (paired_reads) {
420 GMessage(">Input files : %s , %s\n", infname.chars(), infname2.chars());
421 GMessage("Number of input pairs :%9d\n", incounter);
422 GMessage(" Output pairs :%9d\n", outcounter);
423 }
424 else {
425 GMessage(">Input file : %s\n", infname.chars());
426 GMessage("Number of input reads :%9d\n", incounter);
427 GMessage(" Output reads :%9d\n", outcounter);
428 }
429 GMessage("------------------------------------\n");
430 if (num_trimmed5)
431 GMessage(" 5' trimmed :%9d (min. trim: %d)\n", num_trimmed5, min_trimmed5);
432 if (num_trimmed3)
433 GMessage(" 3' trimmed :%9d (min. trim: %d)\n", num_trimmed3, min_trimmed3);
434 GMessage("------------------------------------\n");
435 if (trash_s>0)
436 GMessage(" Trashed by length:%9d\n", trash_s);
437 if (trash_N>0)
438 GMessage(" Trashed by N%%:%9d\n", trash_N);
439 if (trash_Q>0)
440 GMessage("Trashed by low quality:%9d\n", trash_Q);
441 if (trash_A5>0)
442 GMessage(" Trashed by 5' adapter:%9d\n", trash_A5);
443 if (trash_A3>0)
444 GMessage(" Trashed by 3' adapter:%9d\n", trash_A3);
445 }
446 if (trashReport) {
447 FWCLOSE(freport);
448 }
449 FWCLOSE(f_out);
450 FWCLOSE(f_out2);
451 } //while each input file
452
453 //getc(stdin);
454 }
455
456 class NData {
457 public:
458 int NPos[1024]; //there should be no reads longer than 1K ?
459 int NCount;
460 int end5;
461 int end3;
462 int seqlen;
463 double perc_N; //percentage of Ns in end5..end3 range only!
464 const char* seq;
465 bool valid;
466 NData() {
467 NCount=0;
468 end5=0;
469 end3=0;
470 seq=NULL;
471 perc_N=0;
472 valid=true;
473 }
474 void init(GStr& rseq) {
475 NCount=0;
476 perc_N=0;
477 seqlen=rseq.length();
478 seq=rseq.chars();
479 end5=1;
480 end3=seqlen;
481 for (int i=0;i<seqlen;i++)
482 if (seq[i]=='N') {// if (!ichrInStr(rseq[i], "ACGT")
483 NPos[NCount]=i;
484 NCount++;
485 }
486 perc_N=(NCount*100.0)/seqlen;
487 valid=true;
488 }
489 void N_calc() { //only in the region after trimming
490 int n=0;
491 for (int i=end3-1;i<end5;i++) {
492 if (seq[i]=='N') n++;
493 }
494 perc_N=(n*100.0)/(end5-end3+1);
495 }
496 };
497
498 static NData feat;
499 int perc_lenN=12; // incremental distance from ends, in percentage of
500 // sequence length, where N-trimming is done (default:12 %) (autolimited to 20)
501
502 void N_analyze(int l5, int l3, int p5, int p3) {
503 /* assumes feat was filled properly */
504 int old_dif, t5,t3,v;
505 if (l3<l5+2 || p5>p3 ) {
506 feat.end5=l5+1;
507 feat.end3=l3+1;
508 return;
509 }
510
511 t5=feat.NPos[p5]-l5;
512 t3=l3-feat.NPos[p3];
513 old_dif=p3-p5;
514 v=(int)((((double)(l3-l5))*perc_lenN)/100);
515 if (v>20) v=20; /* enforce N-search limit for very long reads */
516 if (t5 < v ) {
517 l5=feat.NPos[p5]+1;
518 p5++;
519 }
520 if (t3 < v) {
521 l3=feat.NPos[p3]-1;
522 p3--;
523 }
524 /* restNs=p3-p5; number of Ns in the new CLR */
525 if (p3-p5==old_dif) { /* no change, return */
526 /* don't trim if this may shorten the read too much */
527 //if (l5-l3<min_read_len) return;
528 feat.end5=l5+1;
529 feat.end3=l3+1;
530 return;
531 }
532 else
533 N_analyze(l5,l3, p5,p3);
534 }
535
536
537 bool qtrim(GStr& qvs, int &l5, int &l3) {
538 if (qvtrim_qmin==0 || qvs.is_empty()) return false;
539 if (qv_phredtype==0) {
540 //try to guess the Phred type
541 int vmin=256, vmax=0;
542 for (int i=0;i<qvs.length();i++) {
543 if (vmin>qvs[i]) vmin=qvs[i];
544 if (vmax<qvs[i]) vmax=qvs[i];
545 }
546 if (vmin<64) { qv_phredtype=33; qv_cvtadd=31; }
547 if (vmax>95) { qv_phredtype=64; qv_cvtadd=-31; }
548 if (qv_phredtype==0) {
549 GError("Error: couldn't determine Phred type, please use the -p33 or -p64 !\n");
550 }
551 if (verbose)
552 GMessage("Input reads have Phred-%d quality values.\n", (qv_phredtype==33 ? 33 : 64));
553 } //guessing Phred type
554 for (l3=qvs.length()-1;l3>2;l3--) {
555 if (qvs[l3]-qv_phredtype>=qvtrim_qmin && qvs[l3-1]-qv_phredtype>=qvtrim_qmin) break;
556 }
557 //just in case, check also the 5' the end (?)
558 for (l5=0;l5<qvs.length()-3;l5++) {
559 if (qvs[l5]-qv_phredtype>=qvtrim_qmin && qvs[l5+1]-qv_phredtype>=qvtrim_qmin) break;
560 }
561 if (qvtrim_max>0) {
562 if (qvs.length()-1-l3>qvtrim_max) l3=qvs.length()-1-qvtrim_max;
563 if (l5>qvtrim_max) l5=qvtrim_max;
564 }
565 return (l5>0 || l3<qvs.length()-1);
566 }
567
568 bool ntrim(GStr& rseq, int &l5, int &l3) {
569 //count Ns in the sequence, trim N-rich ends
570 feat.init(rseq);
571 l5=feat.end5-1;
572 l3=feat.end3-1;
573 N_analyze(feat.end5-1, feat.end3-1, 0, feat.NCount-1);
574 if (l5==feat.end5-1 && l3==feat.end3-1) {
575 if (feat.perc_N>max_perc_N) {
576 feat.valid=false;
577 l3=l5+1;
578 return true;
579 }
580 else {
581 return false; //nothing changed
582 }
583 }
584 l5=feat.end5-1;
585 l3=feat.end3-1;
586 if (l3-l5+1<min_read_len) {
587 feat.valid=false;
588 return true;
589 }
590 feat.N_calc();
591
592 if (feat.perc_N>max_perc_N) {
593 feat.valid=false;
594 l3=l5+1;
595 return true;
596 }
597 return true;
598 }
599
600 //--------------- dust functions ----------------
601 class DNADuster {
602 public:
603 int dustword;
604 int dustwindow;
605 int dustwindow2;
606 int dustcutoff;
607 int mv, iv, jv;
608 int counts[32*32*32];
609 int iis[32*32*32];
610 DNADuster(int cutoff=16, int winsize=32, int wordsize=3) {
611 dustword=wordsize;
612 dustwindow=winsize;
613 dustwindow2 = (winsize>>1);
614 dustcutoff=cutoff;
615 mv=0;
616 iv=0;
617 jv=0;
618 }
619 void setWindowSize(int value) {
620 dustwindow = value;
621 dustwindow2 = (dustwindow >> 1);
622 }
623 void setWordSize(int value) {
624 dustword=value;
625 }
626 void wo1(int len, const char* s, int ivv) {
627 int i, ii, j, v, t, n, n1, sum;
628 int js, nis;
629 n = 32 * 32 * 32;
630 n1 = n - 1;
631 nis = 0;
632 i = 0;
633 ii = 0;
634 sum = 0;
635 v = 0;
636 for (j=0; j < len; j++, s++) {
637 ii <<= 5;
638 if (*s<=32) {
639 i=0;
640 continue;
641 }
642 ii |= *s - 'A'; //assume uppercase!
643 ii &= n1;
644 i++;
645 if (i >= dustword) {
646 for (js=0; js < nis && iis[js] != ii; js++) ;
647 if (js == nis) {
648 iis[nis] = ii;
649 counts[ii] = 0;
650 nis++;
651 }
652 if ((t = counts[ii]) > 0) {
653 sum += t;
654 v = 10 * sum / j;
655 if (mv < v) {
656 mv = v;
657 iv = ivv;
658 jv = j;
659 }
660 }
661 counts[ii]++;
662 }
663 }
664 }
665
666 int wo(int len, const char* s, int* beg, int* end) {
667 int i, l1;
668 l1 = len - dustword + 1;
669 if (l1 < 0) {
670 *beg = 0;
671 *end = len - 1;
672 return 0;
673 }
674 mv = 0;
675 iv = 0;
676 jv = 0;
677 for (i=0; i < l1; i++) {
678 wo1(len-i, s+i, i);
679 }
680 *beg = iv;
681 *end = iv + jv;
682 return mv;
683 }
684
685 void dust(const char* seq, char* seqmsk, int seqlen, int cutoff=0) { //, maskFunc maskfn) {
686 int i, j, l, a, b, v;
687 if (cutoff==0) cutoff=dustcutoff;
688 a=0;b=0;
689 //GMessage("Dust cutoff=%d\n", cutoff);
690 for (i=0; i < seqlen; i += dustwindow2) {
691 l = (seqlen > i+dustwindow) ? dustwindow : seqlen-i;
692 v = wo(l, seq+i, &a, &b);
693 if (v > cutoff) {
694 //for (j = a; j <= b && j < dustwindow2; j++) {
695 for (j = a; j <= b; j++) {
696 seqmsk[i+j]='N';//could be made lowercase instead
697 }
698 }
699 }
700 //return first;
701 }
702 };
703
704 static DNADuster duster;
705
706 int dust(GStr& rseq) {
707 char* seq=Gstrdup(rseq.chars());
708 duster.dust(rseq.chars(), seq, rseq.length(), dust_cutoff);
709 //check the number of Ns:
710 int ncount=0;
711 for (int i=0;i<rseq.length();i++) {
712 if (seq[i]=='N') ncount++;
713 }
714 GFREE(seq);
715 return ncount;
716 }
717
718 int get3mer_value(const char* s) {
719 return (s[0]<<16)+(s[1]<<8)+s[2];
720 }
721
722 int w3_match(int qv, const char* str, int slen, int start_index=0) {
723 if (start_index>=slen || start_index<0) return -1;
724 for (int i=start_index;i<slen-3;i++) {
725 int rv=get3mer_value(str+i);
726 if (rv==qv) return i;
727 }
728 return -1;
729 }
730
731 int w3_rmatch(int qv, const char* str, int slen, int end_index=-1) {
732 if (end_index>=slen) return -1;
733 if (end_index<0) end_index=slen-1;
734 for (int i=end_index-2;i>=0;i--) {
735 int rv=get3mer_value(str+i);
736 if (rv==qv) return i;
737 }
738 return -1;
739 }
740
741 struct SLocScore {
742 int pos;
743 int score;
744 SLocScore(int p=0,int s=0) {
745 pos=p;
746 score=s;
747 }
748 void set(int p, int s) {
749 pos=p;
750 score=s;
751 }
752 void add(int p, int add) {
753 pos=p;
754 score+=add;
755 }
756 };
757
758 bool trim_poly3(GStr &seq, int &l5, int &l3, const char* poly_seed) {
759 int rlen=seq.length();
760 l5=0;
761 l3=rlen-1;
762 int32 seedVal=*(int32*)poly_seed;
763 char polyChar=poly_seed[0];
764 //assumes N trimming was already done
765 //so a poly match should be very close to the end of the read
766 // -- find the initial match (seed)
767 int lmin=GMAX((rlen-12), 0);
768 int li;
769 for (li=rlen-4;li>lmin;li--) {
770 if (seedVal==*(int*)&(seq[li])) {
771 break;
772 }
773 }
774 if (li<=lmin) return false;
775 //seed found, try to extend it both ways
776 //extend right
777 int ri=li+3;
778 SLocScore loc(ri, poly_m_score<<2);
779 SLocScore maxloc(loc);
780 //extend right
781 while (ri<rlen-2) {
782 ri++;
783 if (seq[ri]==polyChar) {
784 loc.add(ri,poly_m_score);
785 }
786 else if (seq[ri]=='N') {
787 loc.add(ri,0);
788 }
789 else { //mismatch
790 loc.add(ri,poly_mis_score);
791 if (maxloc.score-loc.score>poly_dropoff_score) break;
792 }
793 if (maxloc.score<=loc.score) {
794 maxloc=loc;
795 }
796 }
797 ri=maxloc.pos;
798 if (ri<rlen-3) return false; //no trimming wanted, too far from 3' end
799 //ri = right boundary for the poly match
800 //extend left
801 loc.set(li, maxloc.score);
802 maxloc.pos=li;
803 while (li>0) {
804 li--;
805 if (seq[li]==polyChar) {
806 loc.add(li,poly_m_score);
807 }
808 else if (seq[li]=='N') {
809 loc.add(li,0);
810 }
811 else { //mismatch
812 loc.add(li,poly_mis_score);
813 if (maxloc.score-loc.score>poly_dropoff_score) break;
814 }
815 if (maxloc.score<=loc.score) {
816 maxloc=loc;
817 }
818 }
819 if (maxloc.score>poly_minScore && ri>=rlen-3) {
820 l5=li;
821 l3=ri;
822 return true;
823 }
824 return false;
825 }
826
827
828 bool trim_poly5(GStr &seq, int &l5, int &l3, const char* poly_seed) {
829 int rlen=seq.length();
830 l5=0;
831 l3=rlen-1;
832 int32 seedVal=*(int32*)poly_seed;
833 char polyChar=poly_seed[0];
834 //assumes N trimming was already done
835 //so a poly match should be very close to the end of the read
836 // -- find the initial match (seed)
837 int lmax=GMIN(8, rlen-4);//how far from 5' end to look for 4-mer seeds
838 int li;
839 for (li=0;li<=lmax;li++) {
840 if (seedVal==*(int*)&(seq[li])) {
841 break;
842 }
843 }
844 if (li>lmax) return false;
845 //seed found, try to extend it both ways
846 //extend left
847 int ri=li+3; //save rightmost base of the seed
848 SLocScore loc(li, poly_m_score<<2);
849 SLocScore maxloc(loc);
850 while (li>0) {
851 li--;
852 if (seq[li]==polyChar) {
853 loc.add(li,poly_m_score);
854 }
855 else if (seq[li]=='N') {
856 loc.add(li,0);
857 }
858 else { //mismatch
859 loc.add(li,poly_mis_score);
860 if (maxloc.score-loc.score>poly_dropoff_score) break;
861 }
862 if (maxloc.score<=loc.score) {
863 maxloc=loc;
864 }
865 }
866 li=maxloc.pos;
867 if (li>3) return false; //no trimming wanted, too far from 5' end
868 //li = right boundary for the poly match
869
870 //extend right
871 loc.set(ri, maxloc.score);
872 maxloc.pos=ri;
873 while (ri<rlen-2) {
874 ri++;
875 if (seq[ri]==polyChar) {
876 loc.add(ri,poly_m_score);
877 }
878 else if (seq[ri]=='N') {
879 loc.add(ri,0);
880 }
881 else { //mismatch
882 loc.add(ri,poly_mis_score);
883 if (maxloc.score-loc.score>poly_dropoff_score) break;
884 }
885 if (maxloc.score<=loc.score) {
886 maxloc=loc;
887 }
888 }
889
890 if (maxloc.score>poly_minScore && li<=3) {
891 l5=li;
892 l3=ri;
893 return true;
894 }
895 return false;
896 }
897
898 bool trim_adapter3(GStr& seq, int&l5, int &l3) {
899 int rlen=seq.length();
900 l5=0;
901 l3=rlen-1;
902
903 return false; //no adapter parts found
904 }
905
906 bool trim_adapter5(GStr& seq, int&l5, int &l3) {
907 int rlen=seq.length();
908 l5=0;
909 l3=rlen-1;
910 bool trimmed=false;
911 for (int ai=0;ai<adapters5.Count();ai++) {
912 if (adapters5[ai].is_empty()) continue;
913 int a5len=adapters5[ai].length();
914 GStr& adapter5=adapters5[ai];
915
916 }//for each 5' adapter
917 return trimmed;
918 }
919
920 //convert qvs to/from phred64 from/to phread33
921 void convertPhred(GStr& q) {
922 for (int i=0;i<q.length();i++) q[i]+=qv_cvtadd;
923 }
924
925 void convertPhred(char* q, int len) {
926 for (int i=0;i<len;i++) q[i]+=qv_cvtadd;
927 }
928
929 bool getFastxRec(GLineReader& fq, GStr& rseq, GStr& rqv,
930 GStr& rname, GStr& rinfo, GStr& infname) {
931 rseq="";
932 rqv="";
933 rname="";
934 rinfo="";
935 if (fq.eof()) return false;
936 char* l=fq.getLine();
937 while (l!=NULL && (l[0]==0 || isspace(l[0]))) l=fq.getLine(); //ignore empty lines
938 if (l==NULL) return false;
939 /* if (rawFormat) {
940 //TODO: implement raw qseq parsing here
941 //if (raw type=N) then continue; //skip invalid/bad records
942 } //raw qseq format
943 else { // FASTQ or FASTA */
944 isfasta=(l[0]=='>');
945 if (!isfasta && l[0]!='@') GError("Error: fasta/fastq record marker not found(%s)\n%s\n",
946 infname.chars(), l);
947 GStr s(l);
948 rname=&(l[1]);
949 for (int i=0;i<rname.length();i++)
950 if (rname[i]<=' ') {
951 if (i<rname.length()-2) rinfo=rname.substr(i+1);
952 rname.cut(i);
953 break;
954 }
955 //now get the sequence
956 if ((l=fq.getLine())==NULL)
957 GError("Error: unexpected EOF after header for read %s (%s)\n",
958 rname.chars(), infname.chars());
959 rseq=l; //this must be the DNA line
960 while ((l=fq.getLine())!=NULL) {
961 //seq can span multiple lines
962 if (l[0]=='>' || l[0]=='+') {
963 fq.pushBack();
964 break; //
965 }
966 rseq+=l;
967 } //check for multi-line seq
968 if (!isfasta) { //reading fastq quality values, which can also be multi-line
969 if ((l=fq.getLine())==NULL)
970 GError("Error: unexpected EOF after sequence for %s\n", rname.chars());
971 if (l[0]!='+') GError("Error: fastq qv header marker not detected!\n");
972 if ((l=fq.getLine())==NULL)
973 GError("Error: unexpected EOF after qv header for %s\n", rname.chars());
974 rqv=l;
975 //if (rqv.length()!=rseq.length())
976 // GError("Error: qv len != seq len for %s\n", rname.chars());
977 while (rqv.length()<rseq.length() && ((l=fq.getLine())!=NULL)) {
978 rqv+=l; //append to qv string
979 }
980 }// fastq
981 // } //<-- FASTA or FASTQ
982 rseq.upper();
983 return true;
984 }
985
986 char process_read(GStr& rname, GStr& rseq, GStr& rqv, int &l5, int &l3) {
987 //returns 0 if the read was untouched, 1 if it was just trimmed
988 // and a trash code if it was trashed
989 l5=0;
990 l3=rseq.length()-1;
991 if (l3-l5+1<min_read_len) {
992 return 's';
993 }
994 GStr wseq(rseq.chars());
995 GStr wqv(rqv.chars());
996 int w5=l5;
997 int w3=l3;
998 //first do the q-based trimming
999 if (qvtrim_qmin!=0 && !wqv.is_empty() && qtrim(wqv, w5, w3)) { // qv-threshold trimming
1000 if (w3-w5+1<min_read_len) {
1001 return 'Q'; //invalid read
1002 }
1003 //-- keep only the w5..w3 range
1004 l5=w5;
1005 l3=w3;
1006 wseq=wseq.substr(w5, w3-w5+1);
1007 if (!wqv.is_empty())
1008 wqv=wqv.substr(w5, w3-w5+1);
1009 } //qv trimming
1010 // N-trimming on the remaining read seq
1011 if (ntrim(wseq, w5, w3)) {
1012 //GMessage("before: %s\n",wseq.chars());
1013 //GMessage("after : %s (%d)\n",wseq.substr(w5,w3-w5+1).chars(),w3-w5+1);
1014 l5+=w5;
1015 l3-=(wseq.length()-1-w3);
1016 if (w3-w5+1<min_read_len) {
1017 return 'N'; //to be trashed
1018 }
1019 //-- keep only the w5..w3 range
1020 wseq=wseq.substr(w5, w3-w5+1);
1021 if (!wqv.is_empty())
1022 wqv=wqv.substr(w5, w3-w5+1);
1023 w5=0;
1024 w3=wseq.length()-1;
1025 }
1026 if (adapters3.Count()>0) {
1027 if (trim_adapter3(wseq, w5, w3)) {
1028 int trimlen=wseq.length()-(w3-w5+1);
1029 num_trimmed3++;
1030 if (trimlen<min_trimmed3)
1031 min_trimmed3=trimlen;
1032 l5+=w5;
1033 l3-=(wseq.length()-1-w3);
1034 if (w3-w5+1<min_read_len) {
1035 return '3';
1036 }
1037 //-- keep only the w5..w3 range
1038 wseq=wseq.substr(w5, w3-w5+1);
1039 if (!wqv.is_empty())
1040 wqv=wqv.substr(w5, w3-w5+1);
1041 }//some adapter was trimmed
1042 } //adapter trimming
1043 if (adapters5.Count()>0) {
1044 if (trim_adapter5(wseq, w5, w3)) {
1045 int trimlen=wseq.length()-(w3-w5+1);
1046 num_trimmed5++;
1047 if (trimlen<min_trimmed5)
1048 min_trimmed5=trimlen;
1049 l5+=w5;
1050 l3-=(wseq.length()-1-w3);
1051 if (w3-w5+1<min_read_len) {
1052 return '5';
1053 }
1054 //-- keep only the w5..w3 range
1055 wseq=wseq.substr(w5, w3-w5+1);
1056 if (!wqv.is_empty())
1057 wqv=wqv.substr(w5, w3-w5+1);
1058 }//some adapter was trimmed
1059 } //adapter trimming
1060 if (doCollapse) {
1061 //keep read for later
1062 FqDupRec* dr=dhash.Find(wseq.chars());
1063 if (dr==NULL) { //new entry
1064 //if (prefix.is_empty())
1065 dhash.Add(wseq.chars(),
1066 new FqDupRec(&wqv, rname.chars()));
1067 //else dhash.Add(wseq.chars(), new FqDupRec(wqv.chars(),wqv.length()));
1068 }
1069 else
1070 dr->add(wqv);
1071 } //collapsing duplicates
1072 else { //not collapsing duplicates
1073 //apply the dust filter now
1074 if (doDust) {
1075 int dustbases=dust(wseq);
1076 if (dustbases>(wseq.length()>>1)) {
1077 return 'D';
1078 }
1079 }
1080 } //not collapsing duplicates
1081 return (l5>0 || l3<rseq.length()-1) ? 1 : 0;
1082 }
1083
1084 void printHeader(FILE* f_out, char recmarker, GStr& rname, GStr& rinfo) {
1085 //GMessage("printing Header..%c%s\n",recmarker, rname.chars());
1086 if (rinfo.is_empty()) fprintf(f_out, "%c%s\n",recmarker,rname.chars());
1087 else fprintf(f_out, "%c%s %s\n",recmarker, rname.chars(), rinfo.chars());
1088 }
1089
1090 void writeRead(FILE* f_out, GStr& rname, GStr& rinfo, GStr& rseq, GStr& rqv, int& outcounter) {
1091 outcounter++;
1092 bool asFasta=(rqv.is_empty() || fastaOutput);
1093 if (asFasta) {
1094 if (prefix.is_empty()) {
1095 printHeader(f_out, '>',rname,rinfo);
1096 fprintf(f_out, "%s\n", rseq.chars()); //plain one-line fasta for now
1097 }
1098 else {
1099 fprintf(f_out, ">%s%08d\n%s\n", prefix.chars(), outcounter,
1100 rseq.chars());
1101 }
1102 }
1103 else { //fastq
1104 if (convert_phred) convertPhred(rqv);
1105 if (prefix.is_empty()) {
1106 printHeader(f_out, '@', rname, rinfo);
1107 fprintf(f_out, "%s\n+\n%s\n", rseq.chars(), rqv.chars());
1108 }
1109 else
1110 fprintf(f_out, "@%s_%08d\n%s\n+\n%s\n", prefix.chars(), outcounter,
1111 rseq.chars(),rqv.chars() );
1112 }
1113 }
1114
1115 void trash_report(char trashcode, GStr& rname, FILE* freport) {
1116 if (freport==NULL || trashcode<=' ') return;
1117 if (trashcode=='3' || trashcode=='5') {
1118 fprintf(freport, "%s\tA%c\n",rname.chars(),trashcode);
1119 }
1120 else {
1121 fprintf(freport, "%s\t%c\n",rname.chars(),trashcode);
1122 }
1123 //tcounter++;
1124 }
1125
1126 GStr getFext(GStr& s, int* xpos=NULL) {
1127 //s must be a filename without a path
1128 GStr r("");
1129 if (xpos!=NULL) *xpos=0;
1130 if (s.is_empty() || s=="-") return r;
1131 int p=s.rindex('.');
1132 int d=s.rindex('/');
1133 if (p<=0 || p>s.length()-2 || p<s.length()-7 || p<d) return r;
1134 r=s.substr(p+1);
1135 if (xpos!=NULL) *xpos=p+1;
1136 r.lower();
1137 return r;
1138 }
1139
1140 void baseFileName(GStr& fname) {
1141 //remove all known extensions, like .txt,fq,fastq,fasta,fa)(.gz .gzip .bz2 .bzip2) .
1142 int xpos=0;
1143 GStr fext=getFext(fname, &xpos);
1144 if (xpos<=1) return;
1145 bool extdel=false;
1146 GStr f2;
1147 if (fext=="z" || fext=="zip") {
1148 extdel=true;
1149 }
1150 else if (fext.length()>=2) {
1151 f2=fext.substr(0,2);
1152 extdel=(f2=="gz" || f2=="bz");
1153 }
1154 if (extdel) {
1155 fname.cut(xpos-1);
1156 fext=getFext(fname, &xpos);
1157 if (xpos<=1) return;
1158 }
1159 extdel=false;
1160 if (fext=="f" || fext=="fq" || fext=="txt" || fext=="seq" || fext=="sequence") {
1161 extdel=true;
1162 }
1163 else if (fext.length()>=2) {
1164 extdel=(fext.substr(0,2)=="fa");
1165 }
1166 if (extdel) fname.cut(xpos-1);
1167 GStr fncp(fname);
1168 fncp.lower();
1169 fncp.chomp("seq");
1170 fncp.chomp("sequence");
1171 fncp.trimR("_.");
1172 if (fncp.length()<fname.length()) fname.cut(fncp.length());
1173 }
1174
1175 FILE* prepOutFile(GStr& infname, GStr& pocmd) {
1176 FILE* f_out=NULL;
1177 GStr fname(getFileName(infname.chars()));
1178 //eliminate known extensions
1179 baseFileName(fname);
1180 if (outsuffix.is_empty() || outsuffix=="-") { return stdout; }
1181 else if (pocmd.is_empty()) {
1182 GStr oname(fname);
1183 oname.append('.');
1184 oname.append(outsuffix);
1185 f_out=fopen(oname.chars(),"w");
1186 if (f_out==NULL) GError("Error: cannot create '%s'\n",oname.chars());
1187 }
1188 else {
1189 GStr oname(">");
1190 oname.append(fname);
1191 oname.append('.');
1192 oname.append(outsuffix);
1193 pocmd.append(oname);
1194 f_out=popen(pocmd.chars(), "w");
1195 if (f_out==NULL) GError("Error: cannot popen '%s'\n",pocmd.chars());
1196 }
1197 return f_out;
1198 }
1199
1200 void guess_unzip(GStr& fname, GStr& picmd) {
1201 GStr fext=getFext(fname);
1202 if (fext=="gz" || fext=="gzip" || fext=="z") {
1203 picmd="gzip -cd ";
1204 }
1205 else if (fext=="bz2" || fext=="bzip2" || fext=="bz" || fext=="bzip") {
1206 picmd="bzip2 -cd ";
1207 }
1208 }
1209
1210
1211 int loadAdapters(const char* fname) {
1212 GLineReader lr(fname);
1213 char* l;
1214 while ((l=lr.nextLine())!=NULL) {
1215 if (lr.length()<=3 || l[0]=='#') continue;
1216 if ( l[0]==' ' || l[0]=='\t' || l[0]==',' ||
1217 l[0]==';'|| l[0]==':' ) {
1218 int i=1;
1219 while (l[i]!=0 && isspace(l[i])) {
1220 i++;
1221 }
1222 if (l[i]!=0) {
1223 GStr s(&(l[i]));
1224 adapters3.Add(s);
1225 continue;
1226 }
1227 }
1228 else {
1229 GStr s(l);
1230 s.startTokenize("\t ;,:");
1231 GStr a5,a3;
1232 if (s.nextToken(a5))
1233 s.nextToken(a3);
1234 a5.upper();
1235 a3.upper();
1236 adapters5.Add(a5);
1237 adapters3.Add(a3);
1238 }
1239 }
1240 return adapters5.Count()+adapters3.Count();
1241 }
1242
1243 void setupFiles(FILE*& f_in, FILE*& f_in2, FILE*& f_out, FILE*& f_out2,
1244 GStr& s, GStr& infname, GStr& infname2) {
1245 // uses outsuffix to generate output file names and open file handles as needed
1246 infname="";
1247 infname2="";
1248 f_in=NULL;
1249 f_in2=NULL;
1250 f_out=NULL;
1251 f_out2=NULL;
1252 //analyze outsuffix intent
1253 GStr pocmd;
1254 if (outsuffix=="-") {
1255 f_out=stdout;
1256 }
1257 else {
1258 GStr ox=getFext(outsuffix);
1259 if (ox.length()>2) ox=ox.substr(0,2);
1260 if (ox=="gz") pocmd="gzip -9 -c ";
1261 else if (ox=="bz") pocmd="bzip2 -9 -c ";
1262 }
1263 if (s=="-") {
1264 f_in=stdin;
1265 infname="stdin";
1266 f_out=prepOutFile(infname, pocmd);
1267 return;
1268 } // streaming from stdin
1269 s.startTokenize(",:");
1270 s.nextToken(infname);
1271 bool paired=s.nextToken(infname2);
1272 if (fileExists(infname.chars())==0)
1273 GError("Error: cannot find file %s!\n",infname.chars());
1274 GStr fname(getFileName(infname.chars()));
1275 GStr picmd;
1276 guess_unzip(fname, picmd);
1277 if (picmd.is_empty()) {
1278 f_in=fopen(infname.chars(), "r");
1279 if (f_in==NULL) GError("Error opening file '%s'!\n",infname.chars());
1280 }
1281 else {
1282 picmd.append(infname);
1283 f_in=popen(picmd.chars(), "r");
1284 if (f_in==NULL) GError("Error at popen %s!\n", picmd.chars());
1285 }
1286 if (f_out==stdout) {
1287 if (paired) GError("Error: output suffix required for paired reads\n");
1288 return;
1289 }
1290 f_out=prepOutFile(infname, pocmd);
1291 if (!paired) return;
1292 if (doCollapse) GError("Error: sorry, -C option cannot be used with paired reads!\n");
1293 // ---- paired reads:-------------
1294 if (fileExists(infname2.chars())==0)
1295 GError("Error: cannot find file %s!\n",infname2.chars());
1296 picmd="";
1297 GStr fname2(getFileName(infname2.chars()));
1298 guess_unzip(fname2, picmd);
1299 if (picmd.is_empty()) {
1300 f_in2=fopen(infname2.chars(), "r");
1301 if (f_in2==NULL) GError("Error opening file '%s'!\n",infname2.chars());
1302 }
1303 else {
1304 picmd.append(infname2);
1305 f_in2=popen(picmd.chars(), "r");
1306 if (f_in2==NULL) GError("Error at popen %s!\n", picmd.chars());
1307 }
1308 f_out2=prepOutFile(infname2, pocmd);
1309 }