ViewVC Help
View File | Revision Log | Show Annotations | View Changeset | Root Listing
root/gclib/fqtrim/fqtrim.cpp
Revision: 108
Committed: Tue Oct 11 19:49:29 2011 UTC (7 years, 10 months ago) by gpertea
File size: 41757 byte(s)
Log Message:
made more stringent on the 5' end, other tests and fixes

Line File contents
1 #include "GArgs.h"
2 #include "GStr.h"
3 #include "GHash.hh"
4 #include "GList.hh"
5 #include <ctype.h>
6 #include "GAlnExtend.h"
7
8 #define USAGE "Usage:\n\
9 fqtrim [{-5 <5adapter> -3 <3adapter>|-f <adapters_file>}] [-a <min_matchlen>]\\\n\
10 [-q <minq> [-t <trim_max_len>]] [-p {64|33}] [-o <outsuffix>]\\\n\
11 [-l <minlen>] [-C] [-D] [-Q] [-n <rename_prefix>] [-r <discarded.lst>]\\\n\
12 <input.fq>[,<input_mates.fq>\n\
13 \n\
14 Trim low quality bases at the 3' end and can trim adapter sequence(s), filter\n\
15 for low complexity and collapse duplicate reads.\n\
16 If read pairs should be trimmed and kept together (i.e. without discarding\n\
17 one read in a pair), the two file names should be given delimited by a comma\n\
18 or a colon character\n\
19 \n\
20 Options:\n\
21 -n rename all the reads using the <prefix> followed by a read counter;\n\
22 if -C option was given, the suffix \"_x<N>\" is appended, with <N> being\n\
23 the read duplication count\n\
24 -o unless this parameter is '-', write the trimmed/filtered reads to \n\
25 file(s) named <input>.<outsuffix> which will be created in the \n\
26 current (working) directory; (writes to stdout if -o- is given);\n\
27 a suffix ending with .gz, .gzip or .bz2 will enforce compression\n\
28 -f file with adapter sequences to trim, each line having this format:\n\
29 <5'-adapter-sequence> <3'-adapter-sequence>\n\
30 -5 trim the given adapter or primer sequence at the 5' end of each read\n\
31 (e.g. -5 CGACAGGTTCAGAGTTCTACAGTCCGACGATC)\n\
32 -3 trim the given adapter sequence at the 3' end of each read\n\
33 (e.g. -3 TCGTATGCCGTCTTCTGCTTG)\n\
34 -A disable polyA/T trimming (enabled by default)\n\
35 -y minimum length of exact match to adaptor sequence at the proper end (6)\n\
36 -q trim bases with quality value lower than <minq> (starting at the 3' end)\n\
37 -t for -q option, maximum trimming at the 3' end is limited to <trim_max_len>\n\
38 -m maximum percentage of Ns allowed in a read after trimming (default 7)\n\
39 -l minimum \"clean\" length after trimming that a read must have\n\
40 in order to pass the filter (default: 16)\n\
41 -r write a simple \"trash report\" file listing the discarded reads with a\n\
42 one letter code for the reason why they were trashed\n\
43 -D apply a low-complexity (dust) filter and discard any read that has over\n\
44 50% of its length detected as low complexity\n\
45 -C collapse duplicate reads and add a prefix with their count to the read\n\
46 name (e.g. for microRNAs)\n\
47 -p input is phred64/phred33 (use -p64 or -p33)\n\
48 -Q convert quality values to the other Phred qv type\n\
49 -V verbose processing\n\
50 "
51
52 //-z for -o option, the output stream(s) will be first piped into the given\n
53 // <zcmd> command, which must output to stdout (e.g. -z 'bzip2 -9 -c')\n
54
55
56 // example 3' adapter for miRNAs: TCGTATGCCGTCTTCTGCTTG
57
58 //For paired reads sequencing:
59 //3' : ACACTCTTTCCCTACACGACGCTCTTCCGATCT
60 //5' : GATCGGAAGAGCGGTTCAGCAGGAATGCCGAG
61 //FILE* f_out=NULL; //stdout if not provided
62 //FILE* f_out2=NULL; //for paired reads
63 //FILE* f_in=NULL; //input fastq (stdin if not provided)
64 //FILE* f_in2=NULL; //for paired reads
65 FILE* freport=NULL;
66
67 bool debug=false;
68 bool verbose=false;
69 bool doCollapse=false;
70 bool doDust=false;
71 bool doPolyTrim=true;
72 bool fastaOutput=false;
73 bool trashReport=false;
74 //bool rawFormat=false;
75 int min_read_len=16;
76 double max_perc_N=7.0;
77 int dust_cutoff=16;
78 bool isfasta=false;
79 bool convert_phred=false;
80 GStr outsuffix; // -o
81 GStr prefix;
82 GStr zcmd;
83 int num_trimmed5=0;
84 int num_trimmed3=0;
85 int num_trimmedN=0;
86 int num_trimmedQ=0;
87 int min_trimmed5=INT_MAX;
88 int min_trimmed3=INT_MAX;
89
90 int qvtrim_qmin=0;
91 int qvtrim_max=0; //for -q, do not trim at 3'-end more than this number of bases
92 int qv_phredtype=0; // could be 64 or 33 (0 means undetermined yet)
93 int qv_cvtadd=0; //could be -31 or +31
94
95 // adaptor matching metrics -- for X-drop ungapped extension
96 const int match_reward=2;
97 const int mismatch_penalty=3;
98 const int Xdrop=8;
99
100 const int poly_m_score=2; //match score
101 const int poly_mis_score=-3; //mismatch
102 const int poly_dropoff_score=7;
103 int poly_minScore=12; //i.e. an exact match of 6 bases at the proper ends WILL be trimmed
104
105 const char *polyA_seed="AAAA";
106 const char *polyT_seed="TTTT";
107 GVec<GStr> adapters5;
108 GVec<GStr> adapters3;
109
110 CGreedyAlignData* gxmem_l=NULL;
111 CGreedyAlignData* gxmem_r=NULL;
112
113 // element in dhash:
114 class FqDupRec {
115 public:
116 int count; //how many of these reads are the same
117 int len; //length of qv
118 char* firstname; //optional, only if we want to keep the original read names
119 char* qv;
120 FqDupRec(GStr* qstr=NULL, const char* rname=NULL) {
121 len=0;
122 qv=NULL;
123 firstname=NULL;
124 count=0;
125 if (qstr!=NULL) {
126 qv=Gstrdup(qstr->chars());
127 len=qstr->length();
128 count++;
129 }
130 if (rname!=NULL) firstname=Gstrdup(rname);
131 }
132 ~FqDupRec() {
133 GFREE(qv);
134 GFREE(firstname);
135 }
136 void add(GStr& d) { //collapse another record into this one
137 if (d.length()!=len)
138 GError("Error at FqDupRec::add(): cannot collapse reads with different length!\n");
139 count++;
140 for (int i=0;i<len;i++)
141 qv[i]=(qv[i]+d[i])/2;
142 }
143 };
144
145 void openfw(FILE* &f, GArgs& args, char opt) {
146 GStr s=args.getOpt(opt);
147 if (!s.is_empty()) {
148 if (s=='-') f=stdout;
149 else {
150 f=fopen(s.chars(),"w");
151 if (f==NULL) GError("Error creating file: %s\n", s.chars());
152 }
153 }
154 }
155
156 #define FWCLOSE(fh) if (fh!=NULL && fh!=stdout) fclose(fh)
157 #define FRCLOSE(fh) if (fh!=NULL && fh!=stdin) fclose(fh)
158
159 GHash<FqDupRec> dhash; //hash to keep track of duplicates
160
161 int loadAdapters(const char* fname);
162
163 void setupFiles(FILE*& f_in, FILE*& f_in2, FILE*& f_out, FILE*& f_out2,
164 GStr& s, GStr& infname, GStr& infname2);
165 // uses outsuffix to generate output file names and open file handles as needed
166
167 void writeRead(FILE* f_out, GStr& rname, GStr& rinfo, GStr& rseq, GStr& rqv, int& outcounter);
168 void trash_report(char trashcode, GStr& rname, FILE* freport);
169
170 bool getFastxRec(GLineReader& fq, GStr& rseq, GStr& rqv,
171 GStr& rname, GStr& rinfo, GStr& infname);
172
173 char process_read(GStr& rname, GStr& rseq, GStr& rqv, int &l5, int &l3);
174 //returns 0 if the read was untouched, 1 if it was trimmed and a trash code if it was trashed
175
176 bool ntrim(GStr& rseq, int &l5, int &l3); //returns true if any trimming occured
177 bool qtrim(GStr& qvs, int &l5, int &l3); //return true if any trimming occured
178 int dust(GStr& seq);
179 bool trim_poly5(GStr &seq, int &l5, int &l3, const char* poly_seed); //returns true if any trimming occured
180 bool trim_poly3(GStr &seq, int &l5, int &l3, const char* poly_seed);
181 bool trim_adapter5(GStr& seq, int &l5, int &l3); //returns true if any trimming occured
182 bool trim_adapter3(GStr& seq, int &l5, int &l3);
183
184 void convertPhred(char* q, int len);
185 void convertPhred(GStr& q);
186
187 int main(int argc, char * const argv[]) {
188 GArgs args(argc, argv, "YQDCVAl:d:3:5:m:n:r:p:q:f:t:o:z:a:");
189 int e;
190 if ((e=args.isError())>0) {
191 GMessage("%s\nInvalid argument: %s\n", USAGE, argv[e]);
192 exit(224);
193 }
194 debug=(args.getOpt('Y')!=NULL);
195 verbose=(args.getOpt('V')!=NULL);
196 convert_phred=(args.getOpt('Q')!=NULL);
197 doCollapse=(args.getOpt('C')!=NULL);
198 doDust=(args.getOpt('D')!=NULL);
199 if (args.getOpt('A')) doPolyTrim=false;
200 /*
201 rawFormat=(args.getOpt('R')!=NULL);
202 if (rawFormat) {
203 GError("Sorry, raw qseq format parsing is not implemented yet!\n");
204 }
205 */
206 prefix=args.getOpt('n');
207 GStr s=args.getOpt('l');
208 if (!s.is_empty())
209 min_read_len=s.asInt();
210 s=args.getOpt('m');
211 if (!s.is_empty())
212 max_perc_N=s.asDouble();
213 s=args.getOpt('d');
214 if (!s.is_empty()) {
215 dust_cutoff=s.asInt();
216 doDust=true;
217 }
218 s=args.getOpt('q');
219 if (!s.is_empty()) {
220 qvtrim_qmin=s.asInt();
221 }
222 s=args.getOpt('t');
223 if (!s.is_empty()) {
224 qvtrim_max=s.asInt();
225 }
226 s=args.getOpt('p');
227 if (!s.is_empty()) {
228 int v=s.asInt();
229 if (v==33) {
230 qv_phredtype=33;
231 qv_cvtadd=31;
232 }
233 else if (v==64) {
234 qv_phredtype=64;
235 qv_cvtadd=-31;
236 }
237 else
238 GMessage("%s\nInvalid value for -p option (can only be 64 or 33)!\n",USAGE);
239 }
240 s=args.getOpt('f');
241 if (!s.is_empty()) {
242 loadAdapters(s.chars());
243 }
244 bool fileAdapters=adapters5.Count()+adapters3.Count();
245 s=args.getOpt('5');
246 if (!s.is_empty()) {
247 if (fileAdapters)
248 GError("Error: options -5 and -f cannot be used together!\n");
249 s.upper();
250 adapters5.Add(s);
251 }
252 s=args.getOpt('3');
253 if (!s.is_empty()) {
254 if (fileAdapters)
255 GError("Error: options -3 and -f cannot be used together!\n");
256 s.upper();
257 adapters3.Add(s);
258 }
259 s=args.getOpt('y');
260 if (!s.is_empty()) {
261 int minmatch=s.asInt();
262 poly_minScore=minmatch*poly_m_score;
263 }
264
265 if (args.getOpt('o')!=NULL) outsuffix=args.getOpt('o');
266 else outsuffix="-";
267 trashReport= (args.getOpt('r')!=NULL);
268 int fcount=args.startNonOpt();
269 if (fcount==0) {
270 GMessage(USAGE);
271 exit(224);
272 }
273 if (fcount>1 && doCollapse) {
274 GError("%s Sorry, the -C option only works with a single input.\n", USAGE);
275 }
276 //openfw(f_out, args, 'o');
277 //if (f_out==NULL) f_out=stdout;
278 if (trashReport)
279 openfw(freport, args, 'r');
280 char* infile=NULL;
281
282 if (adapters5.Count()>0)
283 gxmem_l=new CGreedyAlignData(match_reward, mismatch_penalty, Xdrop-2);
284 if (adapters3.Count()>0)
285 gxmem_r=new CGreedyAlignData(match_reward, mismatch_penalty, Xdrop);
286
287 while ((infile=args.nextNonOpt())!=NULL) {
288 //for each input file
289 int incounter=0; //counter for input reads
290 int outcounter=0; //counter for output reads
291 int trash_s=0; //too short from the get go
292 int trash_Q=0;
293 int trash_N=0;
294 int trash_D=0;
295 int trash_poly=0;
296 int trash_A3=0;
297 int trash_A5=0;
298 s=infile;
299 GStr infname;
300 GStr infname2;
301 FILE* f_in=NULL;
302 FILE* f_in2=NULL;
303 FILE* f_out=NULL;
304 FILE* f_out2=NULL;
305 bool paired_reads=false;
306 setupFiles(f_in, f_in2, f_out, f_out2, s, infname, infname2);
307 GLineReader fq(f_in);
308 GLineReader* fq2=NULL;
309 if (f_in2!=NULL) {
310 fq2=new GLineReader(f_in2);
311 paired_reads=true;
312 }
313 GStr rseq, rqv, seqid, seqinfo;
314 GStr rseq2, rqv2, seqid2, seqinfo2;
315 while (getFastxRec(fq, rseq, rqv, seqid, seqinfo, infname)) {
316 incounter++;
317 int a5=0, a3=0, b5=0, b3=0;
318 char tcode=0, tcode2=0;
319 tcode=process_read(seqid, rseq, rqv, a5, a3);
320 if (fq2!=NULL) {
321 getFastxRec(*fq2, rseq2, rqv2, seqid2, seqinfo2, infname2);
322 if (seqid.substr(0,seqid.length()-1)!=seqid2.substr(0,seqid2.length()-1)) {
323 GError("Error: no paired match for read %s vs %s (%s,%s)\n",
324 seqid.chars(), seqid2.chars(), infname.chars(), infname2.chars());
325 }
326 tcode2=process_read(seqid2, rseq2, rqv2, b5, b3);
327 //decide what to do with this pair and print rseq2 if the pair makes it
328 if (tcode>1 && tcode2<=1) {
329 //"untrash" rseq
330 if (a3-a5+1<min_read_len) {
331 a5=1;
332 if (a3<min_read_len) { a3= GMIN(rseq.length()-1, min_read_len+1); }
333 }
334 tcode=1;
335 }
336 else if (tcode<=1 && tcode2>1) {
337 //"untrash" rseq2
338 if (b3-b5+1<min_read_len) {
339 b5=1;
340 if (b3<min_read_len) { b3= GMIN((rseq2.length()-1),(min_read_len+1)); }
341 }
342 tcode2=1;
343 }
344 if (tcode<=1) { //trimmed or left intact -- write it!
345 if (tcode>0) {
346 rseq2=rseq2.substr(b5,b3-b5+1);
347 if (!rqv2.is_empty()) rqv2=rqv2.substr(b5,b3-b5+1);
348 }
349 int nocounter=0;
350 writeRead(f_out2, seqid2, seqinfo2, rseq2, rqv2, nocounter);
351 }
352 } //pair read
353 if (tcode>1) { //trashed
354 #ifdef GDEBUG
355 GMessage(" !!!!TRASH => 'N'\n");
356 #endif
357 if (tcode=='s') trash_s++;
358 else if (tcode=='A' || tcode=='T') trash_poly++;
359 else if (tcode=='Q') trash_Q++;
360 else if (tcode=='N') trash_N++;
361 else if (tcode=='D') trash_D++;
362 else if (tcode=='3') trash_A3++;
363 else if (tcode=='5') trash_A5++;
364 if (trashReport) trash_report(tcode, seqid, freport);
365 }
366 else if (!doCollapse) { //write it
367 if (tcode>0) {
368 rseq=rseq.substr(a5,a3-a5+1);
369 if (!rqv.is_empty()) rqv=rqv.substr(a5,a3-a5+1);
370 }
371 #ifdef GDEBUG
372 GMessage(" After trimming:\n");
373 GMessage("%s\n",rseq.chars());
374 #endif
375 writeRead(f_out, seqid, seqinfo, rseq, rqv, outcounter);
376 }
377 } //for each fastq record
378 delete fq2;
379 FRCLOSE(f_in);
380 FRCLOSE(f_in2);
381 if (doCollapse) {
382 outcounter=0;
383 int maxdup_count=1;
384 char* maxdup_seq=NULL;
385 dhash.startIterate();
386 FqDupRec* qd=NULL;
387 char* seq=NULL;
388 while ((qd=dhash.NextData(seq))!=NULL) {
389 GStr rseq(seq);
390 //do the dusting here
391 if (doDust) {
392 int dustbases=dust(rseq);
393 if (dustbases>(rseq.length()>>1)) {
394 if (trashReport && qd->firstname!=NULL) {
395 fprintf(freport, "%s_x%d\tD\n",qd->firstname, qd->count);
396 }
397 trash_D+=qd->count;
398 continue;
399 }
400 }
401 outcounter++;
402 if (qd->count>maxdup_count) {
403 maxdup_count=qd->count;
404 maxdup_seq=seq;
405 }
406 if (isfasta) {
407 if (prefix.is_empty()) {
408 fprintf(f_out, ">%s_x%d\n%s\n", qd->firstname, qd->count,
409 rseq.chars());
410 }
411 else { //use custom read name
412 fprintf(f_out, ">%s%08d_x%d\n%s\n", prefix.chars(), outcounter,
413 qd->count, rseq.chars());
414 }
415 }
416 else { //fastq format
417 if (convert_phred) convertPhred(qd->qv, qd->len);
418 if (prefix.is_empty()) {
419 fprintf(f_out, "@%s_x%d\n%s\n+\n%s\n", qd->firstname, qd->count,
420 rseq.chars(), qd->qv);
421 }
422 else { //use custom read name
423 fprintf(f_out, "@%s%08d_x%d\n%s\n+\n%s\n", prefix.chars(), outcounter,
424 qd->count, rseq.chars(), qd->qv);
425 }
426 }
427 }//for each element of dhash
428 if (maxdup_count>1) {
429 GMessage("Maximum read multiplicity: x %d (read: %s)\n",maxdup_count, maxdup_seq);
430 }
431 } //collapse entries
432 if (verbose) {
433 if (paired_reads) {
434 GMessage(">Input files : %s , %s\n", infname.chars(), infname2.chars());
435 GMessage("Number of input pairs :%9d\n", incounter);
436 GMessage(" Output pairs :%9d\n", outcounter);
437 }
438 else {
439 GMessage(">Input file : %s\n", infname.chars());
440 GMessage("Number of input reads :%9d\n", incounter);
441 GMessage(" Output reads :%9d\n", outcounter);
442 }
443 GMessage("------------------------------------\n");
444 if (num_trimmed5)
445 GMessage(" 5' trimmed :%9d (min. trim: %d)\n", num_trimmed5, min_trimmed5);
446 if (num_trimmed3)
447 GMessage(" 3' trimmed :%9d (min. trim: %d)\n", num_trimmed3, min_trimmed3);
448 GMessage("------------------------------------\n");
449 if (trash_s>0)
450 GMessage(" Trashed by length:%9d\n", trash_s);
451 if (trash_N>0)
452 GMessage(" Trashed by N%%:%9d\n", trash_N);
453 if (trash_Q>0)
454 GMessage("Trashed by low quality:%9d\n", trash_Q);
455 if (trash_poly>0)
456 GMessage(" Trashed by poly-A/T:%9d\n", trash_poly);
457 if (trash_A5>0)
458 GMessage(" Trashed by 5' adapter:%9d\n", trash_A5);
459 if (trash_A3>0)
460 GMessage(" Trashed by 3' adapter:%9d\n", trash_A3);
461 }
462 if (trashReport) {
463 FWCLOSE(freport);
464 }
465 FWCLOSE(f_out);
466 FWCLOSE(f_out2);
467 } //while each input file
468 delete gxmem_l;
469 delete gxmem_r;
470 //getc(stdin);
471 }
472
473 class NData {
474 public:
475 int NPos[1024]; //there should be no reads longer than 1K ?
476 int NCount;
477 int end5;
478 int end3;
479 int seqlen;
480 double perc_N; //percentage of Ns in end5..end3 range only!
481 const char* seq;
482 bool valid;
483 NData() {
484 NCount=0;
485 end5=0;
486 end3=0;
487 seq=NULL;
488 perc_N=0;
489 valid=true;
490 }
491 void init(GStr& rseq) {
492 NCount=0;
493 perc_N=0;
494 seqlen=rseq.length();
495 seq=rseq.chars();
496 end5=1;
497 end3=seqlen;
498 for (int i=0;i<seqlen;i++)
499 if (seq[i]=='N') {// if (!ichrInStr(rseq[i], "ACGT")
500 NPos[NCount]=i;
501 NCount++;
502 }
503 perc_N=(NCount*100.0)/seqlen;
504 valid=true;
505 }
506 void N_calc() { //only in the region after trimming
507 int n=0;
508 for (int i=end3-1;i<end5;i++) {
509 if (seq[i]=='N') n++;
510 }
511 perc_N=(n*100.0)/(end5-end3+1);
512 }
513 };
514
515 static NData feat;
516 int perc_lenN=12; // incremental distance from ends, in percentage of
517 // sequence length, where N-trimming is done (default:12 %) (autolimited to 20)
518
519 void N_analyze(int l5, int l3, int p5, int p3) {
520 /* assumes feat was filled properly */
521 int old_dif, t5,t3,v;
522 if (l3<l5+2 || p5>p3 ) {
523 feat.end5=l5+1;
524 feat.end3=l3+1;
525 return;
526 }
527
528 t5=feat.NPos[p5]-l5;
529 t3=l3-feat.NPos[p3];
530 old_dif=p3-p5;
531 v=(int)((((double)(l3-l5))*perc_lenN)/100);
532 if (v>20) v=20; /* enforce N-search limit for very long reads */
533 if (t5 < v ) {
534 l5=feat.NPos[p5]+1;
535 p5++;
536 }
537 if (t3 < v) {
538 l3=feat.NPos[p3]-1;
539 p3--;
540 }
541 /* restNs=p3-p5; number of Ns in the new CLR */
542 if (p3-p5==old_dif) { /* no change, return */
543 /* don't trim if this may shorten the read too much */
544 //if (l5-l3<min_read_len) return;
545 feat.end5=l5+1;
546 feat.end3=l3+1;
547 return;
548 }
549 else
550 N_analyze(l5,l3, p5,p3);
551 }
552
553
554 bool qtrim(GStr& qvs, int &l5, int &l3) {
555 if (qvtrim_qmin==0 || qvs.is_empty()) return false;
556 if (qv_phredtype==0) {
557 //try to guess the Phred type
558 int vmin=256, vmax=0;
559 for (int i=0;i<qvs.length();i++) {
560 if (vmin>qvs[i]) vmin=qvs[i];
561 if (vmax<qvs[i]) vmax=qvs[i];
562 }
563 if (vmin<64) { qv_phredtype=33; qv_cvtadd=31; }
564 if (vmax>95) { qv_phredtype=64; qv_cvtadd=-31; }
565 if (qv_phredtype==0) {
566 GError("Error: couldn't determine Phred type, please use the -p33 or -p64 !\n");
567 }
568 if (verbose)
569 GMessage("Input reads have Phred-%d quality values.\n", (qv_phredtype==33 ? 33 : 64));
570 } //guessing Phred type
571 for (l3=qvs.length()-1;l3>2;l3--) {
572 if (qvs[l3]-qv_phredtype>=qvtrim_qmin && qvs[l3-1]-qv_phredtype>=qvtrim_qmin) break;
573 }
574 //just in case, check also the 5' the end (?)
575 for (l5=0;l5<qvs.length()-3;l5++) {
576 if (qvs[l5]-qv_phredtype>=qvtrim_qmin && qvs[l5+1]-qv_phredtype>=qvtrim_qmin) break;
577 }
578 if (qvtrim_max>0) {
579 if (qvs.length()-1-l3>qvtrim_max) l3=qvs.length()-1-qvtrim_max;
580 if (l5>qvtrim_max) l5=qvtrim_max;
581 }
582 return (l5>0 || l3<qvs.length()-1);
583 }
584
585 bool ntrim(GStr& rseq, int &l5, int &l3) {
586 //count Ns in the sequence, trim N-rich ends
587 feat.init(rseq);
588 l5=feat.end5-1;
589 l3=feat.end3-1;
590 N_analyze(feat.end5-1, feat.end3-1, 0, feat.NCount-1);
591 if (l5==feat.end5-1 && l3==feat.end3-1) {
592 if (feat.perc_N>max_perc_N) {
593 feat.valid=false;
594 l3=l5+1;
595 return true;
596 }
597 else {
598 return false; //nothing changed
599 }
600 }
601 l5=feat.end5-1;
602 l3=feat.end3-1;
603 if (l3-l5+1<min_read_len) {
604 feat.valid=false;
605 return true;
606 }
607 feat.N_calc();
608
609 if (feat.perc_N>max_perc_N) {
610 feat.valid=false;
611 l3=l5+1;
612 return true;
613 }
614 return true;
615 }
616
617 //--------------- dust functions ----------------
618 class DNADuster {
619 public:
620 int dustword;
621 int dustwindow;
622 int dustwindow2;
623 int dustcutoff;
624 int mv, iv, jv;
625 int counts[32*32*32];
626 int iis[32*32*32];
627 DNADuster(int cutoff=16, int winsize=32, int wordsize=3) {
628 dustword=wordsize;
629 dustwindow=winsize;
630 dustwindow2 = (winsize>>1);
631 dustcutoff=cutoff;
632 mv=0;
633 iv=0;
634 jv=0;
635 }
636 void setWindowSize(int value) {
637 dustwindow = value;
638 dustwindow2 = (dustwindow >> 1);
639 }
640 void setWordSize(int value) {
641 dustword=value;
642 }
643 void wo1(int len, const char* s, int ivv) {
644 int i, ii, j, v, t, n, n1, sum;
645 int js, nis;
646 n = 32 * 32 * 32;
647 n1 = n - 1;
648 nis = 0;
649 i = 0;
650 ii = 0;
651 sum = 0;
652 v = 0;
653 for (j=0; j < len; j++, s++) {
654 ii <<= 5;
655 if (*s<=32) {
656 i=0;
657 continue;
658 }
659 ii |= *s - 'A'; //assume uppercase!
660 ii &= n1;
661 i++;
662 if (i >= dustword) {
663 for (js=0; js < nis && iis[js] != ii; js++) ;
664 if (js == nis) {
665 iis[nis] = ii;
666 counts[ii] = 0;
667 nis++;
668 }
669 if ((t = counts[ii]) > 0) {
670 sum += t;
671 v = 10 * sum / j;
672 if (mv < v) {
673 mv = v;
674 iv = ivv;
675 jv = j;
676 }
677 }
678 counts[ii]++;
679 }
680 }
681 }
682
683 int wo(int len, const char* s, int* beg, int* end) {
684 int i, l1;
685 l1 = len - dustword + 1;
686 if (l1 < 0) {
687 *beg = 0;
688 *end = len - 1;
689 return 0;
690 }
691 mv = 0;
692 iv = 0;
693 jv = 0;
694 for (i=0; i < l1; i++) {
695 wo1(len-i, s+i, i);
696 }
697 *beg = iv;
698 *end = iv + jv;
699 return mv;
700 }
701
702 void dust(const char* seq, char* seqmsk, int seqlen, int cutoff=0) { //, maskFunc maskfn) {
703 int i, j, l, a, b, v;
704 if (cutoff==0) cutoff=dustcutoff;
705 a=0;b=0;
706 //GMessage("Dust cutoff=%d\n", cutoff);
707 for (i=0; i < seqlen; i += dustwindow2) {
708 l = (seqlen > i+dustwindow) ? dustwindow : seqlen-i;
709 v = wo(l, seq+i, &a, &b);
710 if (v > cutoff) {
711 //for (j = a; j <= b && j < dustwindow2; j++) {
712 for (j = a; j <= b; j++) {
713 seqmsk[i+j]='N';//could be made lowercase instead
714 }
715 }
716 }
717 //return first;
718 }
719 };
720
721 static DNADuster duster;
722
723 int dust(GStr& rseq) {
724 char* seq=Gstrdup(rseq.chars());
725 duster.dust(rseq.chars(), seq, rseq.length(), dust_cutoff);
726 //check the number of Ns:
727 int ncount=0;
728 for (int i=0;i<rseq.length();i++) {
729 if (seq[i]=='N') ncount++;
730 }
731 GFREE(seq);
732 return ncount;
733 }
734
735 int get3mer_value(const char* s) {
736 return (s[0]<<16)+(s[1]<<8)+s[2];
737 }
738
739 int w3_match(int qv, const char* str, int slen, int start_index=0) {
740 if (start_index>=slen || start_index<0) return -1;
741 for (int i=start_index;i<slen-3;i++) {
742 int rv=get3mer_value(str+i);
743 if (rv==qv) return i;
744 }
745 return -1;
746 }
747
748 int w3_rmatch(int qv, const char* str, int slen, int end_index=-1) {
749 if (end_index>=slen) return -1;
750 if (end_index<0) end_index=slen-1;
751 for (int i=end_index-2;i>=0;i--) {
752 int rv=get3mer_value(str+i);
753 if (rv==qv) return i;
754 }
755 return -1;
756 }
757
758 struct SLocScore {
759 int pos;
760 int score;
761 SLocScore(int p=0,int s=0) {
762 pos=p;
763 score=s;
764 }
765 void set(int p, int s) {
766 pos=p;
767 score=s;
768 }
769 void add(int p, int add) {
770 pos=p;
771 score+=add;
772 }
773 };
774
775 bool trim_poly3(GStr &seq, int &l5, int &l3, const char* poly_seed) {
776 if (!doPolyTrim) return false;
777 int rlen=seq.length();
778 l5=0;
779 l3=rlen-1;
780 int32 seedVal=*(int32*)poly_seed;
781 char polyChar=poly_seed[0];
782 //assumes N trimming was already done
783 //so a poly match should be very close to the end of the read
784 // -- find the initial match (seed)
785 int lmin=GMAX((rlen-16), 0);
786 int li;
787 for (li=rlen-4;li>lmin;li--) {
788 if (seedVal==*(int*)&(seq[li])) {
789 break;
790 }
791 }
792 if (li<=lmin) return false;
793 //seed found, try to extend it both ways
794 //extend right
795 int ri=li+3;
796 SLocScore loc(ri, poly_m_score<<2);
797 SLocScore maxloc(loc);
798 //extend right
799 while (ri<rlen-2) {
800 ri++;
801 if (seq[ri]==polyChar) {
802 loc.add(ri,poly_m_score);
803 }
804 else if (seq[ri]=='N') {
805 loc.add(ri,0);
806 }
807 else { //mismatch
808 loc.add(ri,poly_mis_score);
809 if (maxloc.score-loc.score>poly_dropoff_score) break;
810 }
811 if (maxloc.score<=loc.score) {
812 maxloc=loc;
813 }
814 }
815 ri=maxloc.pos;
816 if (ri<rlen-6) return false; //no trimming wanted, too far from 3' end
817 //ri = right boundary for the poly match
818 //extend left
819 loc.set(li, maxloc.score);
820 maxloc.pos=li;
821 while (li>0) {
822 li--;
823 if (seq[li]==polyChar) {
824 loc.add(li,poly_m_score);
825 }
826 else if (seq[li]=='N') {
827 loc.add(li,0);
828 }
829 else { //mismatch
830 loc.add(li,poly_mis_score);
831 if (maxloc.score-loc.score>poly_dropoff_score) break;
832 }
833 if (maxloc.score<=loc.score) {
834 maxloc=loc;
835 }
836 }
837 li=maxloc.pos;
838 if ((maxloc.score==poly_minScore && ri==rlen-1) ||
839 (maxloc.score>poly_minScore && ri>=rlen-3) ||
840 (maxloc.score>(poly_minScore*3) && ri>=rlen-8)) {
841 //trimming this li-ri match at 3' end
842 l3=li-1;
843 if (l3<0) l3=0;
844 return true;
845 }
846 return false;
847 }
848
849 bool trim_poly5(GStr &seq, int &l5, int &l3, const char* poly_seed) {
850 if (!doPolyTrim) return false;
851 int rlen=seq.length();
852 l5=0;
853 l3=rlen-1;
854 int32 seedVal=*(int32*)poly_seed;
855 char polyChar=poly_seed[0];
856 //assumes N trimming was already done
857 //so a poly match should be very close to the end of the read
858 // -- find the initial match (seed)
859 int lmax=GMIN(12, rlen-4);//how far from 5' end to look for 4-mer seeds
860 int li;
861 for (li=0;li<=lmax;li++) {
862 if (seedVal==*(int*)&(seq[li])) {
863 break;
864 }
865 }
866 if (li>lmax) return false;
867 //seed found, try to extend it both ways
868 //extend left
869 int ri=li+3; //save rightmost base of the seed
870 SLocScore loc(li, poly_m_score<<2);
871 SLocScore maxloc(loc);
872 while (li>0) {
873 li--;
874 if (seq[li]==polyChar) {
875 loc.add(li,poly_m_score);
876 }
877 else if (seq[li]=='N') {
878 loc.add(li,0);
879 }
880 else { //mismatch
881 loc.add(li,poly_mis_score);
882 if (maxloc.score-loc.score>poly_dropoff_score) break;
883 }
884 if (maxloc.score<=loc.score) {
885 maxloc=loc;
886 }
887 }
888 li=maxloc.pos;
889 if (li>5) return false; //no trimming wanted, too far from 5' end
890 //li = right boundary for the poly match
891
892 //extend right
893 loc.set(ri, maxloc.score);
894 maxloc.pos=ri;
895 while (ri<rlen-2) {
896 ri++;
897 if (seq[ri]==polyChar) {
898 loc.add(ri,poly_m_score);
899 }
900 else if (seq[ri]=='N') {
901 loc.add(ri,0);
902 }
903 else { //mismatch
904 loc.add(ri,poly_mis_score);
905 if (maxloc.score-loc.score>poly_dropoff_score) break;
906 }
907 if (maxloc.score<=loc.score) {
908 maxloc=loc;
909 }
910 }
911 ri=maxloc.pos;
912 if ((maxloc.score==poly_minScore && li==0) ||
913 (maxloc.score>poly_minScore && li<2)
914 || (maxloc.score>(poly_minScore*3) && li<8)) {
915 //adjust l5 to reflect this trimming of 5' end
916 l5=ri+1;
917 if (l5>rlen-1) l5=rlen-1;
918 return true;
919 }
920 return false;
921 }
922
923 bool trim_adapter3(GStr& seq, int&l5, int &l3) {
924 if (adapters3.Count()==0) return false;
925 int rlen=seq.length();
926 l5=0;
927 l3=rlen-1;
928 bool trimmed=false;
929 GStr wseq(seq.chars());
930 int wlen=rlen;
931 for (int ai=0;ai<adapters3.Count();ai++) {
932 if (adapters3[ai].is_empty()) continue;
933 int alen=adapters3[ai].length();
934 GStr& aseq=adapters3[ai];
935 GXAlnInfo* r_bestaln=match_RightEnd(aseq.chars(), alen, wseq.chars(), wlen, gxmem_r, 74);
936 if (r_bestaln) {
937 trimmed=true;
938 //keep unmatched region on the left, if any
939 l3-=(wlen-r_bestaln->sl+1);
940 delete r_bestaln;
941 if (l3<0) l3=0;
942 if (l3-l5+1<min_read_len) return true;
943 wseq=seq.substr(l5,l3-l5+1);
944 wlen=wseq.length();
945 }
946 }//for each 5' adapter
947 return trimmed;
948 }
949
950 bool trim_adapter5(GStr& seq, int&l5, int &l3) {
951 if (adapters5.Count()==0) return false;
952 int rlen=seq.length();
953 l5=0;
954 l3=rlen-1;
955 bool trimmed=false;
956 GStr wseq(seq.chars());
957 int wlen=rlen;
958 for (int ai=0;ai<adapters5.Count();ai++) {
959 if (adapters5[ai].is_empty()) continue;
960 int alen=adapters5[ai].length();
961 GStr& aseq=adapters5[ai];
962 GXAlnInfo* l_bestaln=match_LeftEnd(aseq.chars(), alen, wseq.chars(), wlen, gxmem_l, 84);
963 if (l_bestaln) {
964 trimmed=true;
965 l5+=l_bestaln->sr;
966 delete l_bestaln;
967 if (l5>=rlen) l5=rlen-1;
968 if (l3-l5+1<min_read_len) return true;
969 wseq=seq.substr(l5,l3-l5+1);
970 wlen=wseq.length();
971 }
972 }//for each 5' adapter
973 return trimmed;
974 }
975
976 //convert qvs to/from phred64 from/to phread33
977 void convertPhred(GStr& q) {
978 for (int i=0;i<q.length();i++) q[i]+=qv_cvtadd;
979 }
980
981 void convertPhred(char* q, int len) {
982 for (int i=0;i<len;i++) q[i]+=qv_cvtadd;
983 }
984
985 bool getFastxRec(GLineReader& fq, GStr& rseq, GStr& rqv,
986 GStr& rname, GStr& rinfo, GStr& infname) {
987 rseq="";
988 rqv="";
989 rname="";
990 rinfo="";
991 if (fq.eof()) return false;
992 char* l=fq.getLine();
993 while (l!=NULL && (l[0]==0 || isspace(l[0]))) l=fq.getLine(); //ignore empty lines
994 if (l==NULL) return false;
995 /* if (rawFormat) {
996 //TODO: implement raw qseq parsing here
997 //if (raw type=N) then continue; //skip invalid/bad records
998 } //raw qseq format
999 else { // FASTQ or FASTA */
1000 isfasta=(l[0]=='>');
1001 if (!isfasta && l[0]!='@') GError("Error: fasta/fastq record marker not found(%s)\n%s\n",
1002 infname.chars(), l);
1003 GStr s(l);
1004 rname=&(l[1]);
1005 for (int i=0;i<rname.length();i++)
1006 if (rname[i]<=' ') {
1007 if (i<rname.length()-2) rinfo=rname.substr(i+1);
1008 rname.cut(i);
1009 break;
1010 }
1011 //now get the sequence
1012 if ((l=fq.getLine())==NULL)
1013 GError("Error: unexpected EOF after header for read %s (%s)\n",
1014 rname.chars(), infname.chars());
1015 rseq=l; //this must be the DNA line
1016 while ((l=fq.getLine())!=NULL) {
1017 //seq can span multiple lines
1018 if (l[0]=='>' || l[0]=='+') {
1019 fq.pushBack();
1020 break; //
1021 }
1022 rseq+=l;
1023 } //check for multi-line seq
1024 if (!isfasta) { //reading fastq quality values, which can also be multi-line
1025 if ((l=fq.getLine())==NULL)
1026 GError("Error: unexpected EOF after sequence for %s\n", rname.chars());
1027 if (l[0]!='+') GError("Error: fastq qv header marker not detected!\n");
1028 if ((l=fq.getLine())==NULL)
1029 GError("Error: unexpected EOF after qv header for %s\n", rname.chars());
1030 rqv=l;
1031 //if (rqv.length()!=rseq.length())
1032 // GError("Error: qv len != seq len for %s\n", rname.chars());
1033 while (rqv.length()<rseq.length() && ((l=fq.getLine())!=NULL)) {
1034 rqv+=l; //append to qv string
1035 }
1036 }// fastq
1037 // } //<-- FASTA or FASTQ
1038 rseq.upper();
1039 return true;
1040 }
1041
1042 #ifdef GDEBUG
1043 void showTrim(GStr& s, int l5, int l3) {
1044 if (l5>0) {
1045 color_bg(c_red);
1046 }
1047 for (int i=0;i<s.length()-1;i++) {
1048 if (i && i==l5) color_resetbg();
1049 fprintf(stderr, "%c", s[i]);
1050 if (i==l3) color_bg(c_red);
1051 }
1052 fprintf(stderr, "%c", s[s.length()-1]);
1053 color_reset();
1054 fprintf(stderr, "\n");
1055 }
1056 #endif
1057
1058 char process_read(GStr& rname, GStr& rseq, GStr& rqv, int &l5, int &l3) {
1059 //returns 0 if the read was untouched, 1 if it was just trimmed
1060 // and a trash code if it was trashed
1061 l5=0;
1062 l3=rseq.length()-1;
1063 #ifdef GDEBUG
1064 //rseq.reverse();
1065 GMessage(">%s\n", rname.chars());
1066 GMessage("%s\n",rseq.chars());
1067 #endif
1068 if (l3-l5+1<min_read_len) {
1069 return 's';
1070 }
1071 GStr wseq(rseq.chars());
1072 GStr wqv(rqv.chars());
1073 int w5=l5;
1074 int w3=l3;
1075 //first do the q-based trimming
1076 if (qvtrim_qmin!=0 && !wqv.is_empty() && qtrim(wqv, w5, w3)) { // qv-threshold trimming
1077 if (w3-w5+1<min_read_len) {
1078 return 'Q'; //invalid read
1079 }
1080 //-- keep only the w5..w3 range
1081 l5=w5;
1082 l3=w3;
1083 wseq=wseq.substr(w5, w3-w5+1);
1084 if (!wqv.is_empty())
1085 wqv=wqv.substr(w5, w3-w5+1);
1086 } //qv trimming
1087 // N-trimming on the remaining read seq
1088 if (ntrim(wseq, w5, w3)) {
1089 //GMessage("before: %s\n",wseq.chars());
1090 //GMessage("after : %s (%d)\n",wseq.substr(w5,w3-w5+1).chars(),w3-w5+1);
1091 l5+=w5;
1092 l3-=(wseq.length()-1-w3);
1093 if (w3-w5+1<min_read_len) {
1094 return 'N'; //to be trashed
1095 }
1096 //-- keep only the w5..w3 range
1097 wseq=wseq.substr(w5, w3-w5+1);
1098 if (!wqv.is_empty())
1099 wqv=wqv.substr(w5, w3-w5+1);
1100 w5=0;
1101 w3=wseq.length()-1;
1102 }
1103 char trim_code;
1104 do {
1105 trim_code=0;
1106 if (trim_poly5(wseq, w5, w3, polyA_seed)) {
1107 trim_code='A';
1108 }
1109 else if (trim_poly5(wseq, w5, w3, polyT_seed)) {
1110 trim_code='T';
1111 }
1112 else if (trim_adapter5(wseq, w5, w3)) {
1113 trim_code='5';
1114 }
1115 if (trim_code) {
1116 #ifdef GDEBUG
1117 GMessage("#### TRIM by '%c' code ( w5-w3 = %d-%d ):\n",trim_code, w5,w3);
1118 showTrim(wseq, w5, w3);
1119 #endif
1120 int trimlen=wseq.length()-(w3-w5+1);
1121 num_trimmed5++;
1122 if (trimlen<min_trimmed5)
1123 min_trimmed5=trimlen;
1124 l5+=w5;
1125 l3-=(wseq.length()-1-w3);
1126 if (w3-w5+1<min_read_len) {
1127 return trim_code;
1128 }
1129 //-- keep only the w5..w3 range
1130 wseq=wseq.substr(w5, w3-w5+1);
1131 if (!wqv.is_empty())
1132 wqv=wqv.substr(w5, w3-w5+1);
1133 }// trimmed at 5' end
1134 } while (trim_code);
1135
1136 do {
1137 trim_code=0;
1138 if (trim_poly3(wseq, w5, w3, polyA_seed)) {
1139 trim_code='A';
1140 }
1141 else if (trim_poly3(wseq, w5, w3, polyT_seed)) {
1142 trim_code='T';
1143 }
1144 else if (trim_adapter3(wseq, w5, w3)) {
1145 trim_code='3';
1146 }
1147 if (trim_code) {
1148 #ifdef GDEBUG
1149 GMessage("#### TRIM by '%c' code ( w5-w3 = %d-%d ):\n",trim_code, w5,w3);
1150 showTrim(wseq, w5, w3);
1151 #endif
1152 int trimlen=wseq.length()-(w3-w5+1);
1153 num_trimmed3++;
1154 if (trimlen<min_trimmed3)
1155 min_trimmed3=trimlen;
1156 l5+=w5;
1157 l3-=(wseq.length()-1-w3);
1158 if (w3-w5+1<min_read_len) {
1159 return trim_code;
1160 }
1161 //-- keep only the w5..w3 range
1162 wseq=wseq.substr(w5, w3-w5+1);
1163 if (!wqv.is_empty())
1164 wqv=wqv.substr(w5, w3-w5+1);
1165 }//trimming at 3' end
1166 } while (trim_code);
1167
1168
1169 if (doCollapse) {
1170 //keep read for later
1171 FqDupRec* dr=dhash.Find(wseq.chars());
1172 if (dr==NULL) { //new entry
1173 //if (prefix.is_empty())
1174 dhash.Add(wseq.chars(),
1175 new FqDupRec(&wqv, rname.chars()));
1176 //else dhash.Add(wseq.chars(), new FqDupRec(wqv.chars(),wqv.length()));
1177 }
1178 else
1179 dr->add(wqv);
1180 } //collapsing duplicates
1181 else { //not collapsing duplicates
1182 //apply the dust filter now
1183 if (doDust) {
1184 int dustbases=dust(wseq);
1185 if (dustbases>(wseq.length()>>1)) {
1186 return 'D';
1187 }
1188 }
1189 } //not collapsing duplicates
1190 return (l5>0 || l3<rseq.length()-1) ? 1 : 0;
1191 }
1192
1193 void printHeader(FILE* f_out, char recmarker, GStr& rname, GStr& rinfo) {
1194 //GMessage("printing Header..%c%s\n",recmarker, rname.chars());
1195 if (rinfo.is_empty()) fprintf(f_out, "%c%s\n",recmarker,rname.chars());
1196 else fprintf(f_out, "%c%s %s\n",recmarker, rname.chars(), rinfo.chars());
1197 }
1198
1199 void writeRead(FILE* f_out, GStr& rname, GStr& rinfo, GStr& rseq, GStr& rqv, int& outcounter) {
1200 outcounter++;
1201 bool asFasta=(rqv.is_empty() || fastaOutput);
1202 if (asFasta) {
1203 if (prefix.is_empty()) {
1204 printHeader(f_out, '>',rname,rinfo);
1205 fprintf(f_out, "%s\n", rseq.chars()); //plain one-line fasta for now
1206 }
1207 else {
1208 fprintf(f_out, ">%s%08d\n%s\n", prefix.chars(), outcounter,
1209 rseq.chars());
1210 }
1211 }
1212 else { //fastq
1213 if (convert_phred) convertPhred(rqv);
1214 if (prefix.is_empty()) {
1215 printHeader(f_out, '@', rname, rinfo);
1216 fprintf(f_out, "%s\n+\n%s\n", rseq.chars(), rqv.chars());
1217 }
1218 else
1219 fprintf(f_out, "@%s_%08d\n%s\n+\n%s\n", prefix.chars(), outcounter,
1220 rseq.chars(),rqv.chars() );
1221 }
1222 }
1223
1224 void trash_report(char trashcode, GStr& rname, FILE* freport) {
1225 if (freport==NULL || trashcode<=' ') return;
1226 if (trashcode=='3' || trashcode=='5') {
1227 fprintf(freport, "%s\ta%c\n",rname.chars(),trashcode);
1228 }
1229 else {
1230 fprintf(freport, "%s\t%c\n",rname.chars(),trashcode);
1231 }
1232 //tcounter++;
1233 }
1234
1235 GStr getFext(GStr& s, int* xpos=NULL) {
1236 //s must be a filename without a path
1237 GStr r("");
1238 if (xpos!=NULL) *xpos=0;
1239 if (s.is_empty() || s=="-") return r;
1240 int p=s.rindex('.');
1241 int d=s.rindex('/');
1242 if (p<=0 || p>s.length()-2 || p<s.length()-7 || p<d) return r;
1243 r=s.substr(p+1);
1244 if (xpos!=NULL) *xpos=p+1;
1245 r.lower();
1246 return r;
1247 }
1248
1249 void baseFileName(GStr& fname) {
1250 //remove all known extensions, like .txt,fq,fastq,fasta,fa)(.gz .gzip .bz2 .bzip2) .
1251 int xpos=0;
1252 GStr fext=getFext(fname, &xpos);
1253 if (xpos<=1) return;
1254 bool extdel=false;
1255 GStr f2;
1256 if (fext=="z" || fext=="zip") {
1257 extdel=true;
1258 }
1259 else if (fext.length()>=2) {
1260 f2=fext.substr(0,2);
1261 extdel=(f2=="gz" || f2=="bz");
1262 }
1263 if (extdel) {
1264 fname.cut(xpos-1);
1265 fext=getFext(fname, &xpos);
1266 if (xpos<=1) return;
1267 }
1268 extdel=false;
1269 if (fext=="f" || fext=="fq" || fext=="txt" || fext=="seq" || fext=="sequence") {
1270 extdel=true;
1271 }
1272 else if (fext.length()>=2) {
1273 extdel=(fext.substr(0,2)=="fa");
1274 }
1275 if (extdel) fname.cut(xpos-1);
1276 GStr fncp(fname);
1277 fncp.lower();
1278 fncp.chomp("seq");
1279 fncp.chomp("sequence");
1280 fncp.trimR("_.");
1281 if (fncp.length()<fname.length()) fname.cut(fncp.length());
1282 }
1283
1284 FILE* prepOutFile(GStr& infname, GStr& pocmd) {
1285 FILE* f_out=NULL;
1286 GStr fname(getFileName(infname.chars()));
1287 //eliminate known extensions
1288 baseFileName(fname);
1289 if (outsuffix.is_empty() || outsuffix=="-") { return stdout; }
1290 else if (pocmd.is_empty()) {
1291 GStr oname(fname);
1292 oname.append('.');
1293 oname.append(outsuffix);
1294 f_out=fopen(oname.chars(),"w");
1295 if (f_out==NULL) GError("Error: cannot create '%s'\n",oname.chars());
1296 }
1297 else {
1298 GStr oname(">");
1299 oname.append(fname);
1300 oname.append('.');
1301 oname.append(outsuffix);
1302 pocmd.append(oname);
1303 f_out=popen(pocmd.chars(), "w");
1304 if (f_out==NULL) GError("Error: cannot popen '%s'\n",pocmd.chars());
1305 }
1306 return f_out;
1307 }
1308
1309 void guess_unzip(GStr& fname, GStr& picmd) {
1310 GStr fext=getFext(fname);
1311 if (fext=="gz" || fext=="gzip" || fext=="z") {
1312 picmd="gzip -cd ";
1313 }
1314 else if (fext=="bz2" || fext=="bzip2" || fext=="bz" || fext=="bzip") {
1315 picmd="bzip2 -cd ";
1316 }
1317 }
1318
1319
1320 int loadAdapters(const char* fname) {
1321 GLineReader lr(fname);
1322 char* l;
1323 while ((l=lr.nextLine())!=NULL) {
1324 if (lr.length()<=3 || l[0]=='#') continue;
1325 if ( l[0]==' ' || l[0]=='\t' || l[0]==',' ||
1326 l[0]==';'|| l[0]==':' ) {
1327 int i=1;
1328 while (l[i]!=0 && isspace(l[i])) {
1329 i++;
1330 }
1331 if (l[i]!=0) {
1332 GStr s(&(l[i]));
1333 #ifdef GDEBUG
1334 //s.reverse();
1335 #endif
1336 adapters3.Add(s);
1337 continue;
1338 }
1339 }
1340 else {
1341 GStr s(l);
1342 s.startTokenize("\t ;,:");
1343 GStr a5,a3;
1344 if (s.nextToken(a5))
1345 s.nextToken(a3);
1346 a5.upper();
1347 a3.upper();
1348 #ifdef GDEBUG
1349 // a5.reverse();
1350 // a3.reverse();
1351 #endif
1352 adapters5.Add(a5);
1353 adapters3.Add(a3);
1354 }
1355 }
1356 return adapters5.Count()+adapters3.Count();
1357 }
1358
1359 void setupFiles(FILE*& f_in, FILE*& f_in2, FILE*& f_out, FILE*& f_out2,
1360 GStr& s, GStr& infname, GStr& infname2) {
1361 // uses outsuffix to generate output file names and open file handles as needed
1362 infname="";
1363 infname2="";
1364 f_in=NULL;
1365 f_in2=NULL;
1366 f_out=NULL;
1367 f_out2=NULL;
1368 //analyze outsuffix intent
1369 GStr pocmd;
1370 if (outsuffix=="-") {
1371 f_out=stdout;
1372 }
1373 else {
1374 GStr ox=getFext(outsuffix);
1375 if (ox.length()>2) ox=ox.substr(0,2);
1376 if (ox=="gz") pocmd="gzip -9 -c ";
1377 else if (ox=="bz") pocmd="bzip2 -9 -c ";
1378 }
1379 if (s=="-") {
1380 f_in=stdin;
1381 infname="stdin";
1382 f_out=prepOutFile(infname, pocmd);
1383 return;
1384 } // streaming from stdin
1385 s.startTokenize(",:");
1386 s.nextToken(infname);
1387 bool paired=s.nextToken(infname2);
1388 if (fileExists(infname.chars())==0)
1389 GError("Error: cannot find file %s!\n",infname.chars());
1390 GStr fname(getFileName(infname.chars()));
1391 GStr picmd;
1392 guess_unzip(fname, picmd);
1393 if (picmd.is_empty()) {
1394 f_in=fopen(infname.chars(), "r");
1395 if (f_in==NULL) GError("Error opening file '%s'!\n",infname.chars());
1396 }
1397 else {
1398 picmd.append(infname);
1399 f_in=popen(picmd.chars(), "r");
1400 if (f_in==NULL) GError("Error at popen %s!\n", picmd.chars());
1401 }
1402 if (f_out==stdout) {
1403 if (paired) GError("Error: output suffix required for paired reads\n");
1404 return;
1405 }
1406 f_out=prepOutFile(infname, pocmd);
1407 if (!paired) return;
1408 if (doCollapse) GError("Error: sorry, -C option cannot be used with paired reads!\n");
1409 // ---- paired reads:-------------
1410 if (fileExists(infname2.chars())==0)
1411 GError("Error: cannot find file %s!\n",infname2.chars());
1412 picmd="";
1413 GStr fname2(getFileName(infname2.chars()));
1414 guess_unzip(fname2, picmd);
1415 if (picmd.is_empty()) {
1416 f_in2=fopen(infname2.chars(), "r");
1417 if (f_in2==NULL) GError("Error opening file '%s'!\n",infname2.chars());
1418 }
1419 else {
1420 picmd.append(infname2);
1421 f_in2=popen(picmd.chars(), "r");
1422 if (f_in2==NULL) GError("Error at popen %s!\n", picmd.chars());
1423 }
1424 f_out2=prepOutFile(infname2, pocmd);
1425 }