ViewVC Help
View File | Revision Log | Show Annotations | View Changeset | Root Listing
root/gclib/fqtrim/fqtrim.cpp
Revision: 105
Committed: Mon Oct 10 21:44:55 2011 UTC (7 years, 10 months ago) by gpertea
File size: 41600 byte(s)
Log Message:
wip - works?

Line File contents
1 #include "GArgs.h"
2 #include "GStr.h"
3 #include "GHash.hh"
4 #include "GList.hh"
5 #include <ctype.h>
6 #include "GAlnExtend.h"
7
8 #define USAGE "Usage:\n\
9 fqtrim [{-5 <5adapter> -3 <3adapter>|-f <adapters_file>}] [-a <min_matchlen>]\\\n\
10 [-q <minq> [-t <trim_max_len>]] [-p {64|33}] [-o <outsuffix>]\\\n\
11 [-l <minlen>] [-C] [-D] [-Q] [-n <rename_prefix>] [-r <discarded.lst>]\\\n\
12 <input.fq>[,<input_mates.fq>\n\
13 \n\
14 Trim low quality bases at the 3' end and can trim adapter sequence(s), filter\n\
15 for low complexity and collapse duplicate reads.\n\
16 If read pairs should be trimmed and kept together (i.e. without discarding\n\
17 one read in a pair), the two file names should be given delimited by a comma\n\
18 or a colon character\n\
19 \n\
20 Options:\n\
21 -n rename all the reads using the <prefix> followed by a read counter;\n\
22 if -C option was given, the suffix \"_x<N>\" is appended, with <N> being\n\
23 the read duplication count\n\
24 -o unless this parameter is '-', write the trimmed/filtered reads to \n\
25 file(s) named <input>.<outsuffix> which will be created in the \n\
26 current (working) directory; (writes to stdout if -o- is given);\n\
27 a suffix ending with .gz, .gzip or .bz2 will enforce compression\n\
28 -f file with adapter sequences to trim, each line having this format:\n\
29 <5'-adapter-sequence> <3'-adapter-sequence>\n\
30 -5 trim the given adapter or primer sequence at the 5' end of each read\n\
31 (e.g. -5 CGACAGGTTCAGAGTTCTACAGTCCGACGATC)\n\
32 -3 trim the given adapter sequence at the 3' end of each read\n\
33 (e.g. -3 TCGTATGCCGTCTTCTGCTTG)\n\
34 -A disable polyA/T trimming (enabled by default)\n\
35 -y minimum length of exact match to adaptor sequence at the proper end (6)\n\
36 -q trim bases with quality value lower than <minq> (starting at the 3' end)\n\
37 -t for -q option, maximum trimming at the 3' end is limited to <trim_max_len>\n\
38 -m maximum percentage of Ns allowed in a read after trimming (default 7)\n\
39 -l minimum \"clean\" length after trimming that a read must have\n\
40 in order to pass the filter (default: 16)\n\
41 -r write a simple \"trash report\" file listing the discarded reads with a\n\
42 one letter code for the reason why they were trashed\n\
43 -D apply a low-complexity (dust) filter and discard any read that has over\n\
44 50% of its length detected as low complexity\n\
45 -C collapse duplicate reads and add a prefix with their count to the read\n\
46 name (e.g. for microRNAs)\n\
47 -p input is phred64/phred33 (use -p64 or -p33)\n\
48 -Q convert quality values to the other Phred qv type\n\
49 -V verbose processing\n\
50 "
51
52 //-z for -o option, the output stream(s) will be first piped into the given\n
53 // <zcmd> command, which must output to stdout (e.g. -z 'bzip2 -9 -c')\n
54
55
56 // example 3' adapter for miRNAs: TCGTATGCCGTCTTCTGCTTG
57
58 //For paired reads sequencing:
59 //3' : ACACTCTTTCCCTACACGACGCTCTTCCGATCT
60 //5' : GATCGGAAGAGCGGTTCAGCAGGAATGCCGAG
61 //FILE* f_out=NULL; //stdout if not provided
62 //FILE* f_out2=NULL; //for paired reads
63 //FILE* f_in=NULL; //input fastq (stdin if not provided)
64 //FILE* f_in2=NULL; //for paired reads
65 FILE* freport=NULL;
66
67 bool debug=false;
68 bool verbose=false;
69 bool doCollapse=false;
70 bool doDust=false;
71 bool doPolyTrim=true;
72 bool fastaOutput=false;
73 bool trashReport=false;
74 //bool rawFormat=false;
75 int min_read_len=16;
76 double max_perc_N=7.0;
77 int dust_cutoff=16;
78 bool isfasta=false;
79 bool convert_phred=false;
80 GStr outsuffix; // -o
81 GStr prefix;
82 GStr zcmd;
83 int num_trimmed5=0;
84 int num_trimmed3=0;
85 int num_trimmedN=0;
86 int num_trimmedQ=0;
87 int min_trimmed5=INT_MAX;
88 int min_trimmed3=INT_MAX;
89
90 int qvtrim_qmin=0;
91 int qvtrim_max=0; //for -q, do not trim at 3'-end more than this number of bases
92 int qv_phredtype=0; // could be 64 or 33 (0 means undetermined yet)
93 int qv_cvtadd=0; //could be -31 or +31
94
95 // adaptor matching metrics -- for X-drop ungapped extension
96 const int match_reward=2;
97 const int mismatch_penalty=3;
98 const int Xdrop=8;
99
100 const int poly_m_score=2; //match score
101 const int poly_mis_score=-3; //mismatch
102 const int poly_dropoff_score=7;
103 int poly_minScore=12; //i.e. an exact match of 6 bases at the proper ends WILL be trimmed
104
105 const char *polyA_seed="AAAA";
106 const char *polyT_seed="TTTT";
107 GVec<GStr> adapters5;
108 GVec<GStr> adapters3;
109
110 CGreedyAlignData* gxmem_l=NULL;
111 CGreedyAlignData* gxmem_r=NULL;
112
113 // element in dhash:
114 class FqDupRec {
115 public:
116 int count; //how many of these reads are the same
117 int len; //length of qv
118 char* firstname; //optional, only if we want to keep the original read names
119 char* qv;
120 FqDupRec(GStr* qstr=NULL, const char* rname=NULL) {
121 len=0;
122 qv=NULL;
123 firstname=NULL;
124 count=0;
125 if (qstr!=NULL) {
126 qv=Gstrdup(qstr->chars());
127 len=qstr->length();
128 count++;
129 }
130 if (rname!=NULL) firstname=Gstrdup(rname);
131 }
132 ~FqDupRec() {
133 GFREE(qv);
134 GFREE(firstname);
135 }
136 void add(GStr& d) { //collapse another record into this one
137 if (d.length()!=len)
138 GError("Error at FqDupRec::add(): cannot collapse reads with different length!\n");
139 count++;
140 for (int i=0;i<len;i++)
141 qv[i]=(qv[i]+d[i])/2;
142 }
143 };
144
145 void openfw(FILE* &f, GArgs& args, char opt) {
146 GStr s=args.getOpt(opt);
147 if (!s.is_empty()) {
148 if (s=='-') f=stdout;
149 else {
150 f=fopen(s.chars(),"w");
151 if (f==NULL) GError("Error creating file: %s\n", s.chars());
152 }
153 }
154 }
155
156 #define FWCLOSE(fh) if (fh!=NULL && fh!=stdout) fclose(fh)
157 #define FRCLOSE(fh) if (fh!=NULL && fh!=stdin) fclose(fh)
158
159 GHash<FqDupRec> dhash; //hash to keep track of duplicates
160
161 int loadAdapters(const char* fname);
162
163 void setupFiles(FILE*& f_in, FILE*& f_in2, FILE*& f_out, FILE*& f_out2,
164 GStr& s, GStr& infname, GStr& infname2);
165 // uses outsuffix to generate output file names and open file handles as needed
166
167 void writeRead(FILE* f_out, GStr& rname, GStr& rinfo, GStr& rseq, GStr& rqv, int& outcounter);
168 void trash_report(char trashcode, GStr& rname, FILE* freport);
169
170 bool getFastxRec(GLineReader& fq, GStr& rseq, GStr& rqv,
171 GStr& rname, GStr& rinfo, GStr& infname);
172
173 char process_read(GStr& rname, GStr& rseq, GStr& rqv, int &l5, int &l3);
174 //returns 0 if the read was untouched, 1 if it was trimmed and a trash code if it was trashed
175
176 bool ntrim(GStr& rseq, int &l5, int &l3); //returns true if any trimming occured
177 bool qtrim(GStr& qvs, int &l5, int &l3); //return true if any trimming occured
178 int dust(GStr& seq);
179 bool trim_poly5(GStr &seq, int &l5, int &l3, const char* poly_seed); //returns true if any trimming occured
180 bool trim_poly3(GStr &seq, int &l5, int &l3, const char* poly_seed);
181 bool trim_adapter5(GStr& seq, int &l5, int &l3); //returns true if any trimming occured
182 bool trim_adapter3(GStr& seq, int &l5, int &l3);
183
184 void convertPhred(char* q, int len);
185 void convertPhred(GStr& q);
186
187 int main(int argc, char * const argv[]) {
188 GArgs args(argc, argv, "YQDCVAl:d:3:5:m:n:r:p:q:f:t:o:z:a:");
189 int e;
190 if ((e=args.isError())>0) {
191 GMessage("%s\nInvalid argument: %s\n", USAGE, argv[e]);
192 exit(224);
193 }
194 debug=(args.getOpt('Y')!=NULL);
195 verbose=(args.getOpt('V')!=NULL);
196 convert_phred=(args.getOpt('Q')!=NULL);
197 doCollapse=(args.getOpt('C')!=NULL);
198 doDust=(args.getOpt('D')!=NULL);
199 if (args.getOpt('A')) doPolyTrim=false;
200 /*
201 rawFormat=(args.getOpt('R')!=NULL);
202 if (rawFormat) {
203 GError("Sorry, raw qseq format parsing is not implemented yet!\n");
204 }
205 */
206 prefix=args.getOpt('n');
207 GStr s=args.getOpt('l');
208 if (!s.is_empty())
209 min_read_len=s.asInt();
210 s=args.getOpt('m');
211 if (!s.is_empty())
212 max_perc_N=s.asDouble();
213 s=args.getOpt('d');
214 if (!s.is_empty()) {
215 dust_cutoff=s.asInt();
216 doDust=true;
217 }
218 s=args.getOpt('q');
219 if (!s.is_empty()) {
220 qvtrim_qmin=s.asInt();
221 }
222 s=args.getOpt('t');
223 if (!s.is_empty()) {
224 qvtrim_max=s.asInt();
225 }
226 s=args.getOpt('p');
227 if (!s.is_empty()) {
228 int v=s.asInt();
229 if (v==33) {
230 qv_phredtype=33;
231 qv_cvtadd=31;
232 }
233 else if (v==64) {
234 qv_phredtype=64;
235 qv_cvtadd=-31;
236 }
237 else
238 GMessage("%s\nInvalid value for -p option (can only be 64 or 33)!\n",USAGE);
239 }
240 s=args.getOpt('f');
241 if (!s.is_empty()) {
242 loadAdapters(s.chars());
243 }
244 bool fileAdapters=adapters5.Count()+adapters3.Count();
245 s=args.getOpt('5');
246 if (!s.is_empty()) {
247 if (fileAdapters)
248 GError("Error: options -5 and -f cannot be used together!\n");
249 s.upper();
250 adapters5.Add(s);
251 }
252 s=args.getOpt('3');
253 if (!s.is_empty()) {
254 if (fileAdapters)
255 GError("Error: options -3 and -f cannot be used together!\n");
256 s.upper();
257 adapters3.Add(s);
258 }
259 s=args.getOpt('y');
260 if (!s.is_empty()) {
261 int minmatch=s.asInt();
262 poly_minScore=minmatch*poly_m_score;
263 }
264
265 if (args.getOpt('o')!=NULL) outsuffix=args.getOpt('o');
266 else outsuffix="-";
267 trashReport= (args.getOpt('r')!=NULL);
268 int fcount=args.startNonOpt();
269 if (fcount==0) {
270 GMessage(USAGE);
271 exit(224);
272 }
273 if (fcount>1 && doCollapse) {
274 GError("%s Sorry, the -C option only works with a single input.\n", USAGE);
275 }
276 //openfw(f_out, args, 'o');
277 //if (f_out==NULL) f_out=stdout;
278 if (trashReport)
279 openfw(freport, args, 'r');
280 char* infile=NULL;
281
282 if (adapters5.Count()>0)
283 gxmem_l=new CGreedyAlignData(match_reward, mismatch_penalty, Xdrop-2);
284 if (adapters3.Count()>0)
285 gxmem_r=new CGreedyAlignData(match_reward, mismatch_penalty, Xdrop);
286
287 while ((infile=args.nextNonOpt())!=NULL) {
288 //for each input file
289 int incounter=0; //counter for input reads
290 int outcounter=0; //counter for output reads
291 int trash_s=0; //too short from the get go
292 int trash_Q=0;
293 int trash_N=0;
294 int trash_D=0;
295 int trash_poly=0;
296 int trash_A3=0;
297 int trash_A5=0;
298 s=infile;
299 GStr infname;
300 GStr infname2;
301 FILE* f_in=NULL;
302 FILE* f_in2=NULL;
303 FILE* f_out=NULL;
304 FILE* f_out2=NULL;
305 bool paired_reads=false;
306 setupFiles(f_in, f_in2, f_out, f_out2, s, infname, infname2);
307 GLineReader fq(f_in);
308 GLineReader* fq2=NULL;
309 if (f_in2!=NULL) {
310 fq2=new GLineReader(f_in2);
311 paired_reads=true;
312 }
313 GStr rseq, rqv, seqid, seqinfo;
314 GStr rseq2, rqv2, seqid2, seqinfo2;
315 while (getFastxRec(fq, rseq, rqv, seqid, seqinfo, infname)) {
316 incounter++;
317 int a5=0, a3=0, b5=0, b3=0;
318 char tcode=0, tcode2=0;
319 tcode=process_read(seqid, rseq, rqv, a5, a3);
320 if (fq2!=NULL) {
321 getFastxRec(*fq2, rseq2, rqv2, seqid2, seqinfo2, infname2);
322 if (seqid.substr(0,seqid.length()-1)!=seqid2.substr(0,seqid2.length()-1)) {
323 GError("Error: no paired match for read %s vs %s (%s,%s)\n",
324 seqid.chars(), seqid2.chars(), infname.chars(), infname2.chars());
325 }
326 tcode2=process_read(seqid2, rseq2, rqv2, b5, b3);
327 //decide what to do with this pair and print rseq2 if the pair makes it
328 if (tcode>1 && tcode2<=1) {
329 //"untrash" rseq
330 if (a3-a5+1<min_read_len) {
331 a5=1;
332 if (a3<min_read_len) { a3= GMIN(rseq.length()-1, min_read_len+1); }
333 }
334 tcode=1;
335 }
336 else if (tcode<=1 && tcode2>1) {
337 //"untrash" rseq2
338 if (b3-b5+1<min_read_len) {
339 b5=1;
340 if (b3<min_read_len) { b3= GMIN((rseq2.length()-1),(min_read_len+1)); }
341 }
342 tcode2=1;
343 }
344 if (tcode<=1) { //trimmed or left intact -- write it!
345 if (tcode>0) {
346 rseq2=rseq2.substr(b5,b3-b5+1);
347 if (!rqv2.is_empty()) rqv2=rqv2.substr(b5,b3-b5+1);
348 }
349 int nocounter=0;
350 writeRead(f_out2, seqid2, seqinfo2, rseq2, rqv2, nocounter);
351 }
352 } //pair read
353 if (tcode>1) { //trashed
354 #ifdef GDEBUG
355 GMessage(" !!!!TRASH => 'N'\n");
356 #endif
357 if (tcode=='s') trash_s++;
358 else if (tcode=='A' || tcode=='T') trash_poly++;
359 else if (tcode=='Q') trash_Q++;
360 else if (tcode=='N') trash_N++;
361 else if (tcode=='D') trash_D++;
362 else if (tcode=='3') trash_A3++;
363 else if (tcode=='5') trash_A5++;
364 if (trashReport) trash_report(tcode, seqid, freport);
365 }
366 else if (!doCollapse) { //write it
367 if (tcode>0) {
368 rseq=rseq.substr(a5,a3-a5+1);
369 if (!rqv.is_empty()) rqv=rqv.substr(a5,a3-a5+1);
370 }
371 #ifdef GDEBUG
372 GMessage(" After trimming:\n");
373 GMessage("%s\n",rseq.chars());
374 #endif
375 writeRead(f_out, seqid, seqinfo, rseq, rqv, outcounter);
376 }
377 } //for each fastq record
378 delete fq2;
379 FRCLOSE(f_in);
380 FRCLOSE(f_in2);
381 if (doCollapse) {
382 outcounter=0;
383 int maxdup_count=1;
384 char* maxdup_seq=NULL;
385 dhash.startIterate();
386 FqDupRec* qd=NULL;
387 char* seq=NULL;
388 while ((qd=dhash.NextData(seq))!=NULL) {
389 GStr rseq(seq);
390 //do the dusting here
391 if (doDust) {
392 int dustbases=dust(rseq);
393 if (dustbases>(rseq.length()>>1)) {
394 if (trashReport && qd->firstname!=NULL) {
395 fprintf(freport, "%s_x%d\tD\n",qd->firstname, qd->count);
396 }
397 trash_D+=qd->count;
398 continue;
399 }
400 }
401 outcounter++;
402 if (qd->count>maxdup_count) {
403 maxdup_count=qd->count;
404 maxdup_seq=seq;
405 }
406 if (isfasta) {
407 if (prefix.is_empty()) {
408 fprintf(f_out, ">%s_x%d\n%s\n", qd->firstname, qd->count,
409 rseq.chars());
410 }
411 else { //use custom read name
412 fprintf(f_out, ">%s%08d_x%d\n%s\n", prefix.chars(), outcounter,
413 qd->count, rseq.chars());
414 }
415 }
416 else { //fastq format
417 if (convert_phred) convertPhred(qd->qv, qd->len);
418 if (prefix.is_empty()) {
419 fprintf(f_out, "@%s_x%d\n%s\n+\n%s\n", qd->firstname, qd->count,
420 rseq.chars(), qd->qv);
421 }
422 else { //use custom read name
423 fprintf(f_out, "@%s%08d_x%d\n%s\n+\n%s\n", prefix.chars(), outcounter,
424 qd->count, rseq.chars(), qd->qv);
425 }
426 }
427 }//for each element of dhash
428 if (maxdup_count>1) {
429 GMessage("Maximum read multiplicity: x %d (read: %s)\n",maxdup_count, maxdup_seq);
430 }
431 } //collapse entries
432 if (verbose) {
433 if (paired_reads) {
434 GMessage(">Input files : %s , %s\n", infname.chars(), infname2.chars());
435 GMessage("Number of input pairs :%9d\n", incounter);
436 GMessage(" Output pairs :%9d\n", outcounter);
437 }
438 else {
439 GMessage(">Input file : %s\n", infname.chars());
440 GMessage("Number of input reads :%9d\n", incounter);
441 GMessage(" Output reads :%9d\n", outcounter);
442 }
443 GMessage("------------------------------------\n");
444 if (num_trimmed5)
445 GMessage(" 5' trimmed :%9d (min. trim: %d)\n", num_trimmed5, min_trimmed5);
446 if (num_trimmed3)
447 GMessage(" 3' trimmed :%9d (min. trim: %d)\n", num_trimmed3, min_trimmed3);
448 GMessage("------------------------------------\n");
449 if (trash_s>0)
450 GMessage(" Trashed by length:%9d\n", trash_s);
451 if (trash_N>0)
452 GMessage(" Trashed by N%%:%9d\n", trash_N);
453 if (trash_Q>0)
454 GMessage("Trashed by low quality:%9d\n", trash_Q);
455 if (trash_poly>0)
456 GMessage(" Trashed by poly-A/T:%9d\n", trash_poly);
457 if (trash_A5>0)
458 GMessage(" Trashed by 5' adapter:%9d\n", trash_A5);
459 if (trash_A3>0)
460 GMessage(" Trashed by 3' adapter:%9d\n", trash_A3);
461 }
462 if (trashReport) {
463 FWCLOSE(freport);
464 }
465 FWCLOSE(f_out);
466 FWCLOSE(f_out2);
467 } //while each input file
468 delete gxmem_l;
469 delete gxmem_r;
470 //getc(stdin);
471 }
472
473 class NData {
474 public:
475 int NPos[1024]; //there should be no reads longer than 1K ?
476 int NCount;
477 int end5;
478 int end3;
479 int seqlen;
480 double perc_N; //percentage of Ns in end5..end3 range only!
481 const char* seq;
482 bool valid;
483 NData() {
484 NCount=0;
485 end5=0;
486 end3=0;
487 seq=NULL;
488 perc_N=0;
489 valid=true;
490 }
491 void init(GStr& rseq) {
492 NCount=0;
493 perc_N=0;
494 seqlen=rseq.length();
495 seq=rseq.chars();
496 end5=1;
497 end3=seqlen;
498 for (int i=0;i<seqlen;i++)
499 if (seq[i]=='N') {// if (!ichrInStr(rseq[i], "ACGT")
500 NPos[NCount]=i;
501 NCount++;
502 }
503 perc_N=(NCount*100.0)/seqlen;
504 valid=true;
505 }
506 void N_calc() { //only in the region after trimming
507 int n=0;
508 for (int i=end3-1;i<end5;i++) {
509 if (seq[i]=='N') n++;
510 }
511 perc_N=(n*100.0)/(end5-end3+1);
512 }
513 };
514
515 static NData feat;
516 int perc_lenN=12; // incremental distance from ends, in percentage of
517 // sequence length, where N-trimming is done (default:12 %) (autolimited to 20)
518
519 void N_analyze(int l5, int l3, int p5, int p3) {
520 /* assumes feat was filled properly */
521 int old_dif, t5,t3,v;
522 if (l3<l5+2 || p5>p3 ) {
523 feat.end5=l5+1;
524 feat.end3=l3+1;
525 return;
526 }
527
528 t5=feat.NPos[p5]-l5;
529 t3=l3-feat.NPos[p3];
530 old_dif=p3-p5;
531 v=(int)((((double)(l3-l5))*perc_lenN)/100);
532 if (v>20) v=20; /* enforce N-search limit for very long reads */
533 if (t5 < v ) {
534 l5=feat.NPos[p5]+1;
535 p5++;
536 }
537 if (t3 < v) {
538 l3=feat.NPos[p3]-1;
539 p3--;
540 }
541 /* restNs=p3-p5; number of Ns in the new CLR */
542 if (p3-p5==old_dif) { /* no change, return */
543 /* don't trim if this may shorten the read too much */
544 //if (l5-l3<min_read_len) return;
545 feat.end5=l5+1;
546 feat.end3=l3+1;
547 return;
548 }
549 else
550 N_analyze(l5,l3, p5,p3);
551 }
552
553
554 bool qtrim(GStr& qvs, int &l5, int &l3) {
555 if (qvtrim_qmin==0 || qvs.is_empty()) return false;
556 if (qv_phredtype==0) {
557 //try to guess the Phred type
558 int vmin=256, vmax=0;
559 for (int i=0;i<qvs.length();i++) {
560 if (vmin>qvs[i]) vmin=qvs[i];
561 if (vmax<qvs[i]) vmax=qvs[i];
562 }
563 if (vmin<64) { qv_phredtype=33; qv_cvtadd=31; }
564 if (vmax>95) { qv_phredtype=64; qv_cvtadd=-31; }
565 if (qv_phredtype==0) {
566 GError("Error: couldn't determine Phred type, please use the -p33 or -p64 !\n");
567 }
568 if (verbose)
569 GMessage("Input reads have Phred-%d quality values.\n", (qv_phredtype==33 ? 33 : 64));
570 } //guessing Phred type
571 for (l3=qvs.length()-1;l3>2;l3--) {
572 if (qvs[l3]-qv_phredtype>=qvtrim_qmin && qvs[l3-1]-qv_phredtype>=qvtrim_qmin) break;
573 }
574 //just in case, check also the 5' the end (?)
575 for (l5=0;l5<qvs.length()-3;l5++) {
576 if (qvs[l5]-qv_phredtype>=qvtrim_qmin && qvs[l5+1]-qv_phredtype>=qvtrim_qmin) break;
577 }
578 if (qvtrim_max>0) {
579 if (qvs.length()-1-l3>qvtrim_max) l3=qvs.length()-1-qvtrim_max;
580 if (l5>qvtrim_max) l5=qvtrim_max;
581 }
582 return (l5>0 || l3<qvs.length()-1);
583 }
584
585 bool ntrim(GStr& rseq, int &l5, int &l3) {
586 //count Ns in the sequence, trim N-rich ends
587 feat.init(rseq);
588 l5=feat.end5-1;
589 l3=feat.end3-1;
590 N_analyze(feat.end5-1, feat.end3-1, 0, feat.NCount-1);
591 if (l5==feat.end5-1 && l3==feat.end3-1) {
592 if (feat.perc_N>max_perc_N) {
593 feat.valid=false;
594 l3=l5+1;
595 return true;
596 }
597 else {
598 return false; //nothing changed
599 }
600 }
601 l5=feat.end5-1;
602 l3=feat.end3-1;
603 if (l3-l5+1<min_read_len) {
604 feat.valid=false;
605 return true;
606 }
607 feat.N_calc();
608
609 if (feat.perc_N>max_perc_N) {
610 feat.valid=false;
611 l3=l5+1;
612 return true;
613 }
614 return true;
615 }
616
617 //--------------- dust functions ----------------
618 class DNADuster {
619 public:
620 int dustword;
621 int dustwindow;
622 int dustwindow2;
623 int dustcutoff;
624 int mv, iv, jv;
625 int counts[32*32*32];
626 int iis[32*32*32];
627 DNADuster(int cutoff=16, int winsize=32, int wordsize=3) {
628 dustword=wordsize;
629 dustwindow=winsize;
630 dustwindow2 = (winsize>>1);
631 dustcutoff=cutoff;
632 mv=0;
633 iv=0;
634 jv=0;
635 }
636 void setWindowSize(int value) {
637 dustwindow = value;
638 dustwindow2 = (dustwindow >> 1);
639 }
640 void setWordSize(int value) {
641 dustword=value;
642 }
643 void wo1(int len, const char* s, int ivv) {
644 int i, ii, j, v, t, n, n1, sum;
645 int js, nis;
646 n = 32 * 32 * 32;
647 n1 = n - 1;
648 nis = 0;
649 i = 0;
650 ii = 0;
651 sum = 0;
652 v = 0;
653 for (j=0; j < len; j++, s++) {
654 ii <<= 5;
655 if (*s<=32) {
656 i=0;
657 continue;
658 }
659 ii |= *s - 'A'; //assume uppercase!
660 ii &= n1;
661 i++;
662 if (i >= dustword) {
663 for (js=0; js < nis && iis[js] != ii; js++) ;
664 if (js == nis) {
665 iis[nis] = ii;
666 counts[ii] = 0;
667 nis++;
668 }
669 if ((t = counts[ii]) > 0) {
670 sum += t;
671 v = 10 * sum / j;
672 if (mv < v) {
673 mv = v;
674 iv = ivv;
675 jv = j;
676 }
677 }
678 counts[ii]++;
679 }
680 }
681 }
682
683 int wo(int len, const char* s, int* beg, int* end) {
684 int i, l1;
685 l1 = len - dustword + 1;
686 if (l1 < 0) {
687 *beg = 0;
688 *end = len - 1;
689 return 0;
690 }
691 mv = 0;
692 iv = 0;
693 jv = 0;
694 for (i=0; i < l1; i++) {
695 wo1(len-i, s+i, i);
696 }
697 *beg = iv;
698 *end = iv + jv;
699 return mv;
700 }
701
702 void dust(const char* seq, char* seqmsk, int seqlen, int cutoff=0) { //, maskFunc maskfn) {
703 int i, j, l, a, b, v;
704 if (cutoff==0) cutoff=dustcutoff;
705 a=0;b=0;
706 //GMessage("Dust cutoff=%d\n", cutoff);
707 for (i=0; i < seqlen; i += dustwindow2) {
708 l = (seqlen > i+dustwindow) ? dustwindow : seqlen-i;
709 v = wo(l, seq+i, &a, &b);
710 if (v > cutoff) {
711 //for (j = a; j <= b && j < dustwindow2; j++) {
712 for (j = a; j <= b; j++) {
713 seqmsk[i+j]='N';//could be made lowercase instead
714 }
715 }
716 }
717 //return first;
718 }
719 };
720
721 static DNADuster duster;
722
723 int dust(GStr& rseq) {
724 char* seq=Gstrdup(rseq.chars());
725 duster.dust(rseq.chars(), seq, rseq.length(), dust_cutoff);
726 //check the number of Ns:
727 int ncount=0;
728 for (int i=0;i<rseq.length();i++) {
729 if (seq[i]=='N') ncount++;
730 }
731 GFREE(seq);
732 return ncount;
733 }
734
735 int get3mer_value(const char* s) {
736 return (s[0]<<16)+(s[1]<<8)+s[2];
737 }
738
739 int w3_match(int qv, const char* str, int slen, int start_index=0) {
740 if (start_index>=slen || start_index<0) return -1;
741 for (int i=start_index;i<slen-3;i++) {
742 int rv=get3mer_value(str+i);
743 if (rv==qv) return i;
744 }
745 return -1;
746 }
747
748 int w3_rmatch(int qv, const char* str, int slen, int end_index=-1) {
749 if (end_index>=slen) return -1;
750 if (end_index<0) end_index=slen-1;
751 for (int i=end_index-2;i>=0;i--) {
752 int rv=get3mer_value(str+i);
753 if (rv==qv) return i;
754 }
755 return -1;
756 }
757
758 struct SLocScore {
759 int pos;
760 int score;
761 SLocScore(int p=0,int s=0) {
762 pos=p;
763 score=s;
764 }
765 void set(int p, int s) {
766 pos=p;
767 score=s;
768 }
769 void add(int p, int add) {
770 pos=p;
771 score+=add;
772 }
773 };
774
775 bool trim_poly3(GStr &seq, int &l5, int &l3, const char* poly_seed) {
776 if (!doPolyTrim) return false;
777 int rlen=seq.length();
778 l5=0;
779 l3=rlen-1;
780 int32 seedVal=*(int32*)poly_seed;
781 char polyChar=poly_seed[0];
782 //assumes N trimming was already done
783 //so a poly match should be very close to the end of the read
784 // -- find the initial match (seed)
785 int lmin=GMAX((rlen-12), 0);
786 int li;
787 for (li=rlen-4;li>lmin;li--) {
788 if (seedVal==*(int*)&(seq[li])) {
789 break;
790 }
791 }
792 if (li<=lmin) return false;
793 //seed found, try to extend it both ways
794 //extend right
795 int ri=li+3;
796 SLocScore loc(ri, poly_m_score<<2);
797 SLocScore maxloc(loc);
798 //extend right
799 while (ri<rlen-2) {
800 ri++;
801 if (seq[ri]==polyChar) {
802 loc.add(ri,poly_m_score);
803 }
804 else if (seq[ri]=='N') {
805 loc.add(ri,0);
806 }
807 else { //mismatch
808 loc.add(ri,poly_mis_score);
809 if (maxloc.score-loc.score>poly_dropoff_score) break;
810 }
811 if (maxloc.score<=loc.score) {
812 maxloc=loc;
813 }
814 }
815 ri=maxloc.pos;
816 if (ri<rlen-6) return false; //no trimming wanted, too far from 3' end
817 //ri = right boundary for the poly match
818 //extend left
819 loc.set(li, maxloc.score);
820 maxloc.pos=li;
821 while (li>0) {
822 li--;
823 if (seq[li]==polyChar) {
824 loc.add(li,poly_m_score);
825 }
826 else if (seq[li]=='N') {
827 loc.add(li,0);
828 }
829 else { //mismatch
830 loc.add(li,poly_mis_score);
831 if (maxloc.score-loc.score>poly_dropoff_score) break;
832 }
833 if (maxloc.score<=loc.score) {
834 maxloc=loc;
835 }
836 }
837 li=maxloc.pos;
838 if ((maxloc.score>poly_minScore && ri>=rlen-3) ||
839 (maxloc.score==poly_minScore && ri==rlen-1) ||
840 (maxloc.score>(poly_minScore<<1) && ri>=rlen-6)) {
841 //trimming this li-ri match at 3' end
842 l3=li-1;
843 if (l3<0) l3=0;
844 return true;
845 }
846 return false;
847 }
848
849 bool trim_poly5(GStr &seq, int &l5, int &l3, const char* poly_seed) {
850 if (!doPolyTrim) return false;
851 int rlen=seq.length();
852 l5=0;
853 l3=rlen-1;
854 int32 seedVal=*(int32*)poly_seed;
855 char polyChar=poly_seed[0];
856 //assumes N trimming was already done
857 //so a poly match should be very close to the end of the read
858 // -- find the initial match (seed)
859 int lmax=GMIN(8, rlen-4);//how far from 5' end to look for 4-mer seeds
860 int li;
861 for (li=0;li<=lmax;li++) {
862 if (seedVal==*(int*)&(seq[li])) {
863 break;
864 }
865 }
866 if (li>lmax) return false;
867 //seed found, try to extend it both ways
868 //extend left
869 int ri=li+3; //save rightmost base of the seed
870 SLocScore loc(li, poly_m_score<<2);
871 SLocScore maxloc(loc);
872 while (li>0) {
873 li--;
874 if (seq[li]==polyChar) {
875 loc.add(li,poly_m_score);
876 }
877 else if (seq[li]=='N') {
878 loc.add(li,0);
879 }
880 else { //mismatch
881 loc.add(li,poly_mis_score);
882 if (maxloc.score-loc.score>poly_dropoff_score) break;
883 }
884 if (maxloc.score<=loc.score) {
885 maxloc=loc;
886 }
887 }
888 li=maxloc.pos;
889 if (li>5) return false; //no trimming wanted, too far from 5' end
890 //li = right boundary for the poly match
891
892 //extend right
893 loc.set(ri, maxloc.score);
894 maxloc.pos=ri;
895 while (ri<rlen-2) {
896 ri++;
897 if (seq[ri]==polyChar) {
898 loc.add(ri,poly_m_score);
899 }
900 else if (seq[ri]=='N') {
901 loc.add(ri,0);
902 }
903 else { //mismatch
904 loc.add(ri,poly_mis_score);
905 if (maxloc.score-loc.score>poly_dropoff_score) break;
906 }
907 if (maxloc.score<=loc.score) {
908 maxloc=loc;
909 }
910 }
911 ri=maxloc.pos;
912 if ((maxloc.score==poly_minScore && li==0) ||
913 (maxloc.score>poly_minScore && li<2)
914 || (maxloc.score>(poly_minScore<<1) && li<6)) {
915 //adjust l5 to reflect this trimming of 5' end
916 l5=ri+1;
917 if (l5>rlen-1) l5=rlen-1;
918 return true;
919 }
920 return false;
921 }
922
923 bool trim_adapter3(GStr& seq, int&l5, int &l3) {
924 if (adapters3.Count()==0) return false;
925 int rlen=seq.length();
926 l5=0;
927 l3=rlen-1;
928 bool trimmed=false;
929 GStr wseq(seq.chars());
930 int wlen=rlen;
931 for (int ai=0;ai<adapters3.Count();ai++) {
932 if (adapters3[ai].is_empty()) continue;
933 int alen=adapters3[ai].length();
934 GStr& aseq=adapters3[ai];
935 GXAlnInfo* r_bestaln=match_RightEnd(aseq.chars(), alen, wseq.chars(), wlen, gxmem_r, 74);
936 if (r_bestaln) {
937 trimmed=true;
938 //keep unmatched region on the left, if any
939 l3-=(wlen-r_bestaln->sl+1);
940 delete r_bestaln;
941 if (l3<0) l3=0;
942 if (l3-l5+1<min_read_len) return true;
943 wseq=seq.substr(l5,l3-l5+1);
944 wlen=wseq.length();
945 }
946 }//for each 5' adapter
947 return trimmed;
948 }
949
950 bool trim_adapter5(GStr& seq, int&l5, int &l3) {
951 if (adapters5.Count()==0) return false;
952 int rlen=seq.length();
953 l5=0;
954 l3=rlen-1;
955 bool trimmed=false;
956 GStr wseq(seq.chars());
957 int wlen=rlen;
958 for (int ai=0;ai<adapters5.Count();ai++) {
959 if (adapters5[ai].is_empty()) continue;
960 int alen=adapters5[ai].length();
961 GStr& aseq=adapters5[ai];
962 GXAlnInfo* l_bestaln=match_LeftEnd(aseq.chars(), alen, wseq.chars(), wlen, gxmem_l, 84);
963 if (l_bestaln) {
964 trimmed=true;
965 l5+=l_bestaln->sr;
966 delete l_bestaln;
967 if (l5>=rlen) l5=rlen-1;
968 if (l3-l5+1<min_read_len) return true;
969 wseq=seq.substr(l5,l3-l5+1);
970 wlen=wseq.length();
971 }
972 }//for each 5' adapter
973 return trimmed;
974 }
975
976 //convert qvs to/from phred64 from/to phread33
977 void convertPhred(GStr& q) {
978 for (int i=0;i<q.length();i++) q[i]+=qv_cvtadd;
979 }
980
981 void convertPhred(char* q, int len) {
982 for (int i=0;i<len;i++) q[i]+=qv_cvtadd;
983 }
984
985 bool getFastxRec(GLineReader& fq, GStr& rseq, GStr& rqv,
986 GStr& rname, GStr& rinfo, GStr& infname) {
987 rseq="";
988 rqv="";
989 rname="";
990 rinfo="";
991 if (fq.eof()) return false;
992 char* l=fq.getLine();
993 while (l!=NULL && (l[0]==0 || isspace(l[0]))) l=fq.getLine(); //ignore empty lines
994 if (l==NULL) return false;
995 /* if (rawFormat) {
996 //TODO: implement raw qseq parsing here
997 //if (raw type=N) then continue; //skip invalid/bad records
998 } //raw qseq format
999 else { // FASTQ or FASTA */
1000 isfasta=(l[0]=='>');
1001 if (!isfasta && l[0]!='@') GError("Error: fasta/fastq record marker not found(%s)\n%s\n",
1002 infname.chars(), l);
1003 GStr s(l);
1004 rname=&(l[1]);
1005 for (int i=0;i<rname.length();i++)
1006 if (rname[i]<=' ') {
1007 if (i<rname.length()-2) rinfo=rname.substr(i+1);
1008 rname.cut(i);
1009 break;
1010 }
1011 //now get the sequence
1012 if ((l=fq.getLine())==NULL)
1013 GError("Error: unexpected EOF after header for read %s (%s)\n",
1014 rname.chars(), infname.chars());
1015 rseq=l; //this must be the DNA line
1016 while ((l=fq.getLine())!=NULL) {
1017 //seq can span multiple lines
1018 if (l[0]=='>' || l[0]=='+') {
1019 fq.pushBack();
1020 break; //
1021 }
1022 rseq+=l;
1023 } //check for multi-line seq
1024 if (!isfasta) { //reading fastq quality values, which can also be multi-line
1025 if ((l=fq.getLine())==NULL)
1026 GError("Error: unexpected EOF after sequence for %s\n", rname.chars());
1027 if (l[0]!='+') GError("Error: fastq qv header marker not detected!\n");
1028 if ((l=fq.getLine())==NULL)
1029 GError("Error: unexpected EOF after qv header for %s\n", rname.chars());
1030 rqv=l;
1031 //if (rqv.length()!=rseq.length())
1032 // GError("Error: qv len != seq len for %s\n", rname.chars());
1033 while (rqv.length()<rseq.length() && ((l=fq.getLine())!=NULL)) {
1034 rqv+=l; //append to qv string
1035 }
1036 }// fastq
1037 // } //<-- FASTA or FASTQ
1038 rseq.upper();
1039 return true;
1040 }
1041
1042 #ifdef GDEBUG
1043 void showTrim(GStr& s, int l5, int l3) {
1044 if (l5>0) {
1045 color_bg(c_red);
1046 }
1047 for (int i=0;i<s.length()-1;i++) {
1048 if (i && i==l5) color_resetbg();
1049 fprintf(stderr, "%c", s[i]);
1050 if (i==l3) color_bg(c_red);
1051 }
1052 fprintf(stderr, "%c", s[s.length()-1]);
1053 color_reset();
1054 fprintf(stderr, "\n");
1055 }
1056 #endif
1057
1058 char process_read(GStr& rname, GStr& rseq, GStr& rqv, int &l5, int &l3) {
1059 //returns 0 if the read was untouched, 1 if it was just trimmed
1060 // and a trash code if it was trashed
1061 l5=0;
1062 l3=rseq.length()-1;
1063 #ifdef GDEBUG
1064 GMessage(">%s\n", rname.chars());
1065 GMessage("%s\n",rseq.chars());
1066 #endif
1067 if (l3-l5+1<min_read_len) {
1068 return 's';
1069 }
1070 GStr wseq(rseq.chars());
1071 GStr wqv(rqv.chars());
1072 int w5=l5;
1073 int w3=l3;
1074 //first do the q-based trimming
1075 if (qvtrim_qmin!=0 && !wqv.is_empty() && qtrim(wqv, w5, w3)) { // qv-threshold trimming
1076 if (w3-w5+1<min_read_len) {
1077 return 'Q'; //invalid read
1078 }
1079 //-- keep only the w5..w3 range
1080 l5=w5;
1081 l3=w3;
1082 wseq=wseq.substr(w5, w3-w5+1);
1083 if (!wqv.is_empty())
1084 wqv=wqv.substr(w5, w3-w5+1);
1085 } //qv trimming
1086 // N-trimming on the remaining read seq
1087 if (ntrim(wseq, w5, w3)) {
1088 //GMessage("before: %s\n",wseq.chars());
1089 //GMessage("after : %s (%d)\n",wseq.substr(w5,w3-w5+1).chars(),w3-w5+1);
1090 l5+=w5;
1091 l3-=(wseq.length()-1-w3);
1092 if (w3-w5+1<min_read_len) {
1093 return 'N'; //to be trashed
1094 }
1095 //-- keep only the w5..w3 range
1096 wseq=wseq.substr(w5, w3-w5+1);
1097 if (!wqv.is_empty())
1098 wqv=wqv.substr(w5, w3-w5+1);
1099 w5=0;
1100 w3=wseq.length()-1;
1101 }
1102 char trim_code;
1103 do {
1104 trim_code=0;
1105 if (trim_poly5(wseq, w5, w3, polyA_seed)) {
1106 trim_code='A';
1107 }
1108 else if (trim_poly5(wseq, w5, w3, polyT_seed)) {
1109 trim_code='T';
1110 }
1111 else if (trim_adapter5(wseq, w5, w3)) {
1112 trim_code='5';
1113 }
1114 if (trim_code) {
1115 #ifdef GDEBUG
1116 GMessage("#### TRIM by '%c' code ( w5-w3 = %d-%d ):\n",trim_code, w5,w3);
1117 showTrim(wseq, w5, w3);
1118 #endif
1119 int trimlen=wseq.length()-(w3-w5+1);
1120 num_trimmed5++;
1121 if (trimlen<min_trimmed5)
1122 min_trimmed5=trimlen;
1123 l5+=w5;
1124 l3-=(wseq.length()-1-w3);
1125 if (w3-w5+1<min_read_len) {
1126 return trim_code;
1127 }
1128 //-- keep only the w5..w3 range
1129 wseq=wseq.substr(w5, w3-w5+1);
1130 if (!wqv.is_empty())
1131 wqv=wqv.substr(w5, w3-w5+1);
1132 }// trimmed at 5' end
1133 } while (trim_code);
1134
1135 do {
1136 trim_code=0;
1137 if (trim_poly3(wseq, w5, w3, polyA_seed)) {
1138 trim_code='A';
1139 }
1140 else if (trim_poly3(wseq, w5, w3, polyT_seed)) {
1141 trim_code='T';
1142 }
1143 else if (trim_adapter3(wseq, w5, w3)) {
1144 trim_code='3';
1145 }
1146 if (trim_code) {
1147 #ifdef GDEBUG
1148 GMessage("#### TRIM by '%c' code ( w5-w3 = %d-%d ):\n",trim_code, w5,w3);
1149 showTrim(wseq, w5, w3);
1150 #endif
1151 int trimlen=wseq.length()-(w3-w5+1);
1152 num_trimmed3++;
1153 if (trimlen<min_trimmed3)
1154 min_trimmed3=trimlen;
1155 l5+=w5;
1156 l3-=(wseq.length()-1-w3);
1157 if (w3-w5+1<min_read_len) {
1158 return trim_code;
1159 }
1160 //-- keep only the w5..w3 range
1161 wseq=wseq.substr(w5, w3-w5+1);
1162 if (!wqv.is_empty())
1163 wqv=wqv.substr(w5, w3-w5+1);
1164 }//trimming at 3' end
1165 } while (trim_code);
1166
1167
1168 if (doCollapse) {
1169 //keep read for later
1170 FqDupRec* dr=dhash.Find(wseq.chars());
1171 if (dr==NULL) { //new entry
1172 //if (prefix.is_empty())
1173 dhash.Add(wseq.chars(),
1174 new FqDupRec(&wqv, rname.chars()));
1175 //else dhash.Add(wseq.chars(), new FqDupRec(wqv.chars(),wqv.length()));
1176 }
1177 else
1178 dr->add(wqv);
1179 } //collapsing duplicates
1180 else { //not collapsing duplicates
1181 //apply the dust filter now
1182 if (doDust) {
1183 int dustbases=dust(wseq);
1184 if (dustbases>(wseq.length()>>1)) {
1185 return 'D';
1186 }
1187 }
1188 } //not collapsing duplicates
1189 return (l5>0 || l3<rseq.length()-1) ? 1 : 0;
1190 }
1191
1192 void printHeader(FILE* f_out, char recmarker, GStr& rname, GStr& rinfo) {
1193 //GMessage("printing Header..%c%s\n",recmarker, rname.chars());
1194 if (rinfo.is_empty()) fprintf(f_out, "%c%s\n",recmarker,rname.chars());
1195 else fprintf(f_out, "%c%s %s\n",recmarker, rname.chars(), rinfo.chars());
1196 }
1197
1198 void writeRead(FILE* f_out, GStr& rname, GStr& rinfo, GStr& rseq, GStr& rqv, int& outcounter) {
1199 outcounter++;
1200 bool asFasta=(rqv.is_empty() || fastaOutput);
1201 if (asFasta) {
1202 if (prefix.is_empty()) {
1203 printHeader(f_out, '>',rname,rinfo);
1204 fprintf(f_out, "%s\n", rseq.chars()); //plain one-line fasta for now
1205 }
1206 else {
1207 fprintf(f_out, ">%s%08d\n%s\n", prefix.chars(), outcounter,
1208 rseq.chars());
1209 }
1210 }
1211 else { //fastq
1212 if (convert_phred) convertPhred(rqv);
1213 if (prefix.is_empty()) {
1214 printHeader(f_out, '@', rname, rinfo);
1215 fprintf(f_out, "%s\n+\n%s\n", rseq.chars(), rqv.chars());
1216 }
1217 else
1218 fprintf(f_out, "@%s_%08d\n%s\n+\n%s\n", prefix.chars(), outcounter,
1219 rseq.chars(),rqv.chars() );
1220 }
1221 }
1222
1223 void trash_report(char trashcode, GStr& rname, FILE* freport) {
1224 if (freport==NULL || trashcode<=' ') return;
1225 if (trashcode=='3' || trashcode=='5') {
1226 fprintf(freport, "%s\ta%c\n",rname.chars(),trashcode);
1227 }
1228 else {
1229 fprintf(freport, "%s\t%c\n",rname.chars(),trashcode);
1230 }
1231 //tcounter++;
1232 }
1233
1234 GStr getFext(GStr& s, int* xpos=NULL) {
1235 //s must be a filename without a path
1236 GStr r("");
1237 if (xpos!=NULL) *xpos=0;
1238 if (s.is_empty() || s=="-") return r;
1239 int p=s.rindex('.');
1240 int d=s.rindex('/');
1241 if (p<=0 || p>s.length()-2 || p<s.length()-7 || p<d) return r;
1242 r=s.substr(p+1);
1243 if (xpos!=NULL) *xpos=p+1;
1244 r.lower();
1245 return r;
1246 }
1247
1248 void baseFileName(GStr& fname) {
1249 //remove all known extensions, like .txt,fq,fastq,fasta,fa)(.gz .gzip .bz2 .bzip2) .
1250 int xpos=0;
1251 GStr fext=getFext(fname, &xpos);
1252 if (xpos<=1) return;
1253 bool extdel=false;
1254 GStr f2;
1255 if (fext=="z" || fext=="zip") {
1256 extdel=true;
1257 }
1258 else if (fext.length()>=2) {
1259 f2=fext.substr(0,2);
1260 extdel=(f2=="gz" || f2=="bz");
1261 }
1262 if (extdel) {
1263 fname.cut(xpos-1);
1264 fext=getFext(fname, &xpos);
1265 if (xpos<=1) return;
1266 }
1267 extdel=false;
1268 if (fext=="f" || fext=="fq" || fext=="txt" || fext=="seq" || fext=="sequence") {
1269 extdel=true;
1270 }
1271 else if (fext.length()>=2) {
1272 extdel=(fext.substr(0,2)=="fa");
1273 }
1274 if (extdel) fname.cut(xpos-1);
1275 GStr fncp(fname);
1276 fncp.lower();
1277 fncp.chomp("seq");
1278 fncp.chomp("sequence");
1279 fncp.trimR("_.");
1280 if (fncp.length()<fname.length()) fname.cut(fncp.length());
1281 }
1282
1283 FILE* prepOutFile(GStr& infname, GStr& pocmd) {
1284 FILE* f_out=NULL;
1285 GStr fname(getFileName(infname.chars()));
1286 //eliminate known extensions
1287 baseFileName(fname);
1288 if (outsuffix.is_empty() || outsuffix=="-") { return stdout; }
1289 else if (pocmd.is_empty()) {
1290 GStr oname(fname);
1291 oname.append('.');
1292 oname.append(outsuffix);
1293 f_out=fopen(oname.chars(),"w");
1294 if (f_out==NULL) GError("Error: cannot create '%s'\n",oname.chars());
1295 }
1296 else {
1297 GStr oname(">");
1298 oname.append(fname);
1299 oname.append('.');
1300 oname.append(outsuffix);
1301 pocmd.append(oname);
1302 f_out=popen(pocmd.chars(), "w");
1303 if (f_out==NULL) GError("Error: cannot popen '%s'\n",pocmd.chars());
1304 }
1305 return f_out;
1306 }
1307
1308 void guess_unzip(GStr& fname, GStr& picmd) {
1309 GStr fext=getFext(fname);
1310 if (fext=="gz" || fext=="gzip" || fext=="z") {
1311 picmd="gzip -cd ";
1312 }
1313 else if (fext=="bz2" || fext=="bzip2" || fext=="bz" || fext=="bzip") {
1314 picmd="bzip2 -cd ";
1315 }
1316 }
1317
1318
1319 int loadAdapters(const char* fname) {
1320 GLineReader lr(fname);
1321 char* l;
1322 while ((l=lr.nextLine())!=NULL) {
1323 if (lr.length()<=3 || l[0]=='#') continue;
1324 if ( l[0]==' ' || l[0]=='\t' || l[0]==',' ||
1325 l[0]==';'|| l[0]==':' ) {
1326 int i=1;
1327 while (l[i]!=0 && isspace(l[i])) {
1328 i++;
1329 }
1330 if (l[i]!=0) {
1331 GStr s(&(l[i]));
1332 adapters3.Add(s);
1333 continue;
1334 }
1335 }
1336 else {
1337 GStr s(l);
1338 s.startTokenize("\t ;,:");
1339 GStr a5,a3;
1340 if (s.nextToken(a5))
1341 s.nextToken(a3);
1342 a5.upper();
1343 a3.upper();
1344 adapters5.Add(a5);
1345 adapters3.Add(a3);
1346 }
1347 }
1348 return adapters5.Count()+adapters3.Count();
1349 }
1350
1351 void setupFiles(FILE*& f_in, FILE*& f_in2, FILE*& f_out, FILE*& f_out2,
1352 GStr& s, GStr& infname, GStr& infname2) {
1353 // uses outsuffix to generate output file names and open file handles as needed
1354 infname="";
1355 infname2="";
1356 f_in=NULL;
1357 f_in2=NULL;
1358 f_out=NULL;
1359 f_out2=NULL;
1360 //analyze outsuffix intent
1361 GStr pocmd;
1362 if (outsuffix=="-") {
1363 f_out=stdout;
1364 }
1365 else {
1366 GStr ox=getFext(outsuffix);
1367 if (ox.length()>2) ox=ox.substr(0,2);
1368 if (ox=="gz") pocmd="gzip -9 -c ";
1369 else if (ox=="bz") pocmd="bzip2 -9 -c ";
1370 }
1371 if (s=="-") {
1372 f_in=stdin;
1373 infname="stdin";
1374 f_out=prepOutFile(infname, pocmd);
1375 return;
1376 } // streaming from stdin
1377 s.startTokenize(",:");
1378 s.nextToken(infname);
1379 bool paired=s.nextToken(infname2);
1380 if (fileExists(infname.chars())==0)
1381 GError("Error: cannot find file %s!\n",infname.chars());
1382 GStr fname(getFileName(infname.chars()));
1383 GStr picmd;
1384 guess_unzip(fname, picmd);
1385 if (picmd.is_empty()) {
1386 f_in=fopen(infname.chars(), "r");
1387 if (f_in==NULL) GError("Error opening file '%s'!\n",infname.chars());
1388 }
1389 else {
1390 picmd.append(infname);
1391 f_in=popen(picmd.chars(), "r");
1392 if (f_in==NULL) GError("Error at popen %s!\n", picmd.chars());
1393 }
1394 if (f_out==stdout) {
1395 if (paired) GError("Error: output suffix required for paired reads\n");
1396 return;
1397 }
1398 f_out=prepOutFile(infname, pocmd);
1399 if (!paired) return;
1400 if (doCollapse) GError("Error: sorry, -C option cannot be used with paired reads!\n");
1401 // ---- paired reads:-------------
1402 if (fileExists(infname2.chars())==0)
1403 GError("Error: cannot find file %s!\n",infname2.chars());
1404 picmd="";
1405 GStr fname2(getFileName(infname2.chars()));
1406 guess_unzip(fname2, picmd);
1407 if (picmd.is_empty()) {
1408 f_in2=fopen(infname2.chars(), "r");
1409 if (f_in2==NULL) GError("Error opening file '%s'!\n",infname2.chars());
1410 }
1411 else {
1412 picmd.append(infname2);
1413 f_in2=popen(picmd.chars(), "r");
1414 if (f_in2==NULL) GError("Error at popen %s!\n", picmd.chars());
1415 }
1416 f_out2=prepOutFile(infname2, pocmd);
1417 }