ViewVC Help
View File | Revision Log | Show Annotations | View Changeset | Root Listing
root/gclib/fqtrim/fqtrim.cpp
Revision: 172
Committed: Wed Feb 15 03:33:40 2012 UTC (7 years, 6 months ago) by gpertea
File size: 42690 byte(s)
Log Message:
wip fqtrim

Line File contents
1 #include "GArgs.h"
2 #include "GStr.h"
3 #include "GHash.hh"
4 #include "GList.hh"
5 #include <ctype.h>
6 #include "GAlnExtend.h"
7
8 #define USAGE "Usage:\n\
9 fqtrim [{-5 <5adapter> -3 <3adapter>|-f <adapters_file>}] [-a <min_matchlen>]\\\n\
10 [-R] [-q <minq> [-t <trim_max_len>]] [-p {64|33}] [-o <outsuffix>]\\\n\
11 [-l <minlen>] [-C] [-D] [-Q] [-n <rename_prefix>] [-r <discarded.lst>]\\\n\
12 <input.fq>[,<input_mates.fq>\n\
13 \n\
14 Trim low quality bases at the 3' end and can trim adapter sequence(s), filter\n\
15 for low complexity and collapse duplicate reads.\n\
16 If read pairs should be trimmed and kept together (i.e. without discarding\n\
17 one read in a pair), the two file names should be given delimited by a comma\n\
18 or a colon character\n\
19 \n\
20 Options:\n\
21 -n rename all the reads using the <prefix> followed by a read counter;\n\
22 if -C option was given, the suffix \"_x<N>\" is appended, with <N> being\n\
23 the read duplication count\n\
24 -o unless this parameter is '-', write the trimmed/filtered reads to \n\
25 file(s) named <input>.<outsuffix> which will be created in the \n\
26 current (working) directory; (writes to stdout if -o- is given);\n\
27 a suffix ending with .gz, .gzip or .bz2 will enforce compression\n\
28 -f file with adapter sequences to trim, each line having this format:\n\
29 <5'-adapter-sequence> <3'-adapter-sequence>\n\
30 -5 trim the given adapter or primer sequence at the 5' end of each read\n\
31 (e.g. -5 CGACAGGTTCAGAGTTCTACAGTCCGACGATC)\n\
32 -3 trim the given adapter sequence at the 3' end of each read\n\
33 (e.g. -3 TCGTATGCCGTCTTCTGCTTG)\n\
34 -A disable polyA/T trimming (enabled by default)\n\
35 -y minimum length of exact match to adaptor sequence at the proper end (6)\n\
36 -q trim bases with quality value lower than <minq> (starting at the 3' end)\n\
37 -t for -q option, maximum trimming at the 3' end is limited to <trim_max_len>\n\
38 -m maximum percentage of Ns allowed in a read after trimming (default 7)\n\
39 -l minimum \"clean\" length after trimming that a read must have\n\
40 in order to pass the filter (default: 16)\n\
41 -r write a simple \"trash report\" file listing the discarded reads with a\n\
42 one letter code for the reason why they were trashed\n\
43 -D apply a low-complexity (dust) filter and discard any read that has over\n\
44 50% of its length detected as low complexity\n\
45 -C collapse duplicate reads and add a prefix with their count to the read\n\
46 name (e.g. for microRNAs)\n\
47 -p input is phred64/phred33 (use -p64 or -p33)\n\
48 -Q convert quality values to the other Phred qv type\n\
49 -V verbose processing\n\
50 "
51
52 //-z for -o option, the output stream(s) will be first piped into the given\n
53 // <zcmd> command, which must output to stdout (e.g. -z 'bzip2 -9 -c')\n
54
55
56 // example 3' adapter for miRNAs: TCGTATGCCGTCTTCTGCTTG
57
58 //For paired reads sequencing:
59 //3' : ACACTCTTTCCCTACACGACGCTCTTCCGATCT
60 //5' : GATCGGAAGAGCGGTTCAGCAGGAATGCCGAG
61 //FILE* f_out=NULL; //stdout if not provided
62 //FILE* f_out2=NULL; //for paired reads
63 //FILE* f_in=NULL; //input fastq (stdin if not provided)
64 //FILE* f_in2=NULL; //for paired reads
65
66 FILE* freport=NULL;
67
68 bool debug=false;
69 bool verbose=false;
70 bool doCollapse=false;
71 bool doDust=false;
72 bool doPolyTrim=true;
73 bool fastaOutput=false;
74 bool trashReport=false;
75 bool revCompl=false; //also reverse complement adaptor sequences
76 int min_read_len=16;
77 double max_perc_N=7.0;
78 int dust_cutoff=16;
79 bool isfasta=false;
80 bool convert_phred=false;
81 GStr outsuffix; // -o
82 GStr prefix;
83 GStr zcmd;
84 int num_trimmed5=0;
85 int num_trimmed3=0;
86 int num_trimmedN=0;
87 int num_trimmedQ=0;
88 int min_trimmed5=INT_MAX;
89 int min_trimmed3=INT_MAX;
90
91 int qvtrim_qmin=0;
92 int qvtrim_max=0; //for -q, do not trim at 3'-end more than this number of bases
93 int qv_phredtype=0; // could be 64 or 33 (0 means undetermined yet)
94 int qv_cvtadd=0; //could be -31 or +31
95
96 // adaptor matching metrics -- for X-drop ungapped extension
97 const int match_reward=2;
98 const int mismatch_penalty=3;
99 const int Xdrop=8;
100
101 const int poly_m_score=2; //match score
102 const int poly_mis_score=-3; //mismatch
103 const int poly_dropoff_score=7;
104 int poly_minScore=12; //i.e. an exact match of 6 bases at the proper ends WILL be trimmed
105
106 const char *polyA_seed="AAAA";
107 const char *polyT_seed="TTTT";
108
109 struct CASeqData {
110 //positional data for every possible hexamer in an adapter
111 GVec<uint16>* pz[4096]; //0-based coordinates of all possible hexamers in the adapter sequence
112 GVec<uint16>* pzr[4096]; //0-based coordinates of all possible hexamers for the reverse complement of the adapter sequence
113 GStr seq; //actual adapter sequence data
114 GStr seqr; //reverse complement sequence
115 bool use_reverse;
116 CASeqData(bool rev=false):seq(),seqr() {
117 use_reverse=rev;
118 for (int i=0;i<4096;i++) {
119 pz[i]=NULL;
120 pzr[i]=NULL;
121 }
122 }
123
124 void update(const char* s) {
125 seq=s;
126 table6mers(seq.chars(), seq.length(), pz);
127 if (!use_reverse) return;
128 //reverse complement
129 seqr=s;
130 int slen=seq.length();
131 for (int i=0;i<slen;i++)
132 seqr[i]=ntComplement(seq[slen-i-1]);
133 table6mers(seqr.chars(), seqr.length(), pzr);
134 }
135
136 ~CASeqData() {
137 for (int i=0;i<4096;i++) {
138 delete pz[i];
139 delete pzr[i];
140 }
141 }
142 };
143
144 GVec<CASeqData> adapters5;
145 GVec<CASeqData> adapters3;
146
147 CGreedyAlignData* gxmem_l=NULL;
148 CGreedyAlignData* gxmem_r=NULL;
149
150 // element in dhash:
151 class FqDupRec {
152 public:
153 int count; //how many of these reads are the same
154 int len; //length of qv
155 char* firstname; //optional, only if we want to keep the original read names
156 char* qv;
157 FqDupRec(GStr* qstr=NULL, const char* rname=NULL) {
158 len=0;
159 qv=NULL;
160 firstname=NULL;
161 count=0;
162 if (qstr!=NULL) {
163 qv=Gstrdup(qstr->chars());
164 len=qstr->length();
165 count++;
166 }
167 if (rname!=NULL) firstname=Gstrdup(rname);
168 }
169 ~FqDupRec() {
170 GFREE(qv);
171 GFREE(firstname);
172 }
173 void add(GStr& d) { //collapse another record into this one
174 if (d.length()!=len)
175 GError("Error at FqDupRec::add(): cannot collapse reads with different length!\n");
176 count++;
177 for (int i=0;i<len;i++)
178 qv[i]=(qv[i]+d[i])/2;
179 }
180 };
181
182 void openfw(FILE* &f, GArgs& args, char opt) {
183 GStr s=args.getOpt(opt);
184 if (!s.is_empty()) {
185 if (s=='-') f=stdout;
186 else {
187 f=fopen(s.chars(),"w");
188 if (f==NULL) GError("Error creating file: %s\n", s.chars());
189 }
190 }
191 }
192
193 #define FWCLOSE(fh) if (fh!=NULL && fh!=stdout) fclose(fh)
194 #define FRCLOSE(fh) if (fh!=NULL && fh!=stdin) fclose(fh)
195
196 GHash<FqDupRec> dhash; //hash to keep track of duplicates
197
198 void addAdapter(GVec<CASeqData>& adapters, GStr& seq);
199 int loadAdapters(const char* fname);
200
201 void setupFiles(FILE*& f_in, FILE*& f_in2, FILE*& f_out, FILE*& f_out2,
202 GStr& s, GStr& infname, GStr& infname2);
203 // uses outsuffix to generate output file names and open file handles as needed
204
205 void writeRead(FILE* f_out, GStr& rname, GStr& rinfo, GStr& rseq, GStr& rqv, int& outcounter);
206 void trash_report(char trashcode, GStr& rname, FILE* freport);
207
208 bool getFastxRec(GLineReader& fq, GStr& rseq, GStr& rqv,
209 GStr& rname, GStr& rinfo, GStr& infname);
210
211 char process_read(GStr& rname, GStr& rseq, GStr& rqv, int &l5, int &l3);
212 //returns 0 if the read was untouched, 1 if it was trimmed and a trash code if it was trashed
213
214 bool ntrim(GStr& rseq, int &l5, int &l3); //returns true if any trimming occured
215 bool qtrim(GStr& qvs, int &l5, int &l3); //return true if any trimming occured
216 int dust(GStr& seq);
217 bool trim_poly5(GStr &seq, int &l5, int &l3, const char* poly_seed); //returns true if any trimming occured
218 bool trim_poly3(GStr &seq, int &l5, int &l3, const char* poly_seed);
219 bool trim_adapter5(GStr& seq, int &l5, int &l3); //returns true if any trimming occured
220 bool trim_adapter3(GStr& seq, int &l5, int &l3);
221
222 void convertPhred(char* q, int len);
223 void convertPhred(GStr& q);
224
225 int main(int argc, char * const argv[]) {
226 GArgs args(argc, argv, "YQDCRVAl:d:3:5:m:n:r:p:q:f:t:o:z:a:");
227 int e;
228 if ((e=args.isError())>0) {
229 GMessage("%s\nInvalid argument: %s\n", USAGE, argv[e]);
230 exit(224);
231 }
232 debug=(args.getOpt('Y')!=NULL);
233 verbose=(args.getOpt('V')!=NULL);
234 convert_phred=(args.getOpt('Q')!=NULL);
235 doCollapse=(args.getOpt('C')!=NULL);
236 doDust=(args.getOpt('D')!=NULL);
237 revCompl=(args.getOpt('R')!=NULL);
238 if (args.getOpt('A')) doPolyTrim=false;
239 /*
240 rawFormat=(args.getOpt('R')!=NULL);
241 if (rawFormat) {
242 GError("Sorry, raw qseq format parsing is not implemented yet!\n");
243 }
244 */
245 prefix=args.getOpt('n');
246 GStr s=args.getOpt('l');
247 if (!s.is_empty())
248 min_read_len=s.asInt();
249 s=args.getOpt('m');
250 if (!s.is_empty())
251 max_perc_N=s.asDouble();
252 s=args.getOpt('d');
253 if (!s.is_empty()) {
254 dust_cutoff=s.asInt();
255 doDust=true;
256 }
257 s=args.getOpt('q');
258 if (!s.is_empty()) {
259 qvtrim_qmin=s.asInt();
260 }
261 s=args.getOpt('t');
262 if (!s.is_empty()) {
263 qvtrim_max=s.asInt();
264 }
265 s=args.getOpt('p');
266 if (!s.is_empty()) {
267 int v=s.asInt();
268 if (v==33) {
269 qv_phredtype=33;
270 qv_cvtadd=31;
271 }
272 else if (v==64) {
273 qv_phredtype=64;
274 qv_cvtadd=-31;
275 }
276 else
277 GMessage("%s\nInvalid value for -p option (can only be 64 or 33)!\n",USAGE);
278 }
279 s=args.getOpt('f');
280 if (!s.is_empty()) {
281 loadAdapters(s.chars());
282 }
283 bool fileAdapters=adapters5.Count()+adapters3.Count();
284 s=args.getOpt('5');
285 if (!s.is_empty()) {
286 if (fileAdapters)
287 GError("Error: options -5 and -f cannot be used together!\n");
288 s.upper();
289 addAdapter(adapters5, s);
290 }
291 s=args.getOpt('3');
292 if (!s.is_empty()) {
293 if (fileAdapters)
294 GError("Error: options -3 and -f cannot be used together!\n");
295 s.upper();
296 addAdapter(adapters3, s);
297 }
298 s=args.getOpt('y');
299 if (!s.is_empty()) {
300 int minmatch=s.asInt();
301 poly_minScore=minmatch*poly_m_score;
302 }
303
304 if (args.getOpt('o')!=NULL) outsuffix=args.getOpt('o');
305 else outsuffix="-";
306 trashReport= (args.getOpt('r')!=NULL);
307 int fcount=args.startNonOpt();
308 if (fcount==0) {
309 GMessage(USAGE);
310 exit(224);
311 }
312 if (fcount>1 && doCollapse) {
313 GError("%s Sorry, the -C option only works with a single input.\n", USAGE);
314 }
315 //openfw(f_out, args, 'o');
316 //if (f_out==NULL) f_out=stdout;
317 if (trashReport)
318 openfw(freport, args, 'r');
319 char* infile=NULL;
320
321 if (adapters5.Count()>0)
322 gxmem_l=new CGreedyAlignData(match_reward, mismatch_penalty, Xdrop-2);
323 if (adapters3.Count()>0)
324 gxmem_r=new CGreedyAlignData(match_reward, mismatch_penalty, Xdrop);
325
326 while ((infile=args.nextNonOpt())!=NULL) {
327 //for each input file
328 int incounter=0; //counter for input reads
329 int outcounter=0; //counter for output reads
330 int trash_s=0; //too short from the get go
331 int trash_Q=0;
332 int trash_N=0;
333 int trash_D=0;
334 int trash_poly=0;
335 int trash_A3=0;
336 int trash_A5=0;
337 s=infile;
338 GStr infname;
339 GStr infname2;
340 FILE* f_in=NULL;
341 FILE* f_in2=NULL;
342 FILE* f_out=NULL;
343 FILE* f_out2=NULL;
344 bool paired_reads=false;
345 setupFiles(f_in, f_in2, f_out, f_out2, s, infname, infname2);
346 GLineReader fq(f_in);
347 GLineReader* fq2=NULL;
348 if (f_in2!=NULL) {
349 fq2=new GLineReader(f_in2);
350 paired_reads=true;
351 }
352 GStr rseq, rqv, seqid, seqinfo;
353 GStr rseq2, rqv2, seqid2, seqinfo2;
354 while (getFastxRec(fq, rseq, rqv, seqid, seqinfo, infname)) {
355 incounter++;
356 int a5=0, a3=0, b5=0, b3=0;
357 char tcode=0, tcode2=0;
358 tcode=process_read(seqid, rseq, rqv, a5, a3);
359 if (fq2!=NULL) {
360 getFastxRec(*fq2, rseq2, rqv2, seqid2, seqinfo2, infname2);
361 if (seqid.substr(0,seqid.length()-1)!=seqid2.substr(0,seqid2.length()-1)) {
362 GError("Error: no paired match for read %s vs %s (%s,%s)\n",
363 seqid.chars(), seqid2.chars(), infname.chars(), infname2.chars());
364 }
365 tcode2=process_read(seqid2, rseq2, rqv2, b5, b3);
366 //decide what to do with this pair and print rseq2 if the pair makes it
367 if (tcode>1 && tcode2<=1) {
368 //"untrash" rseq
369 if (a3-a5+1<min_read_len) {
370 a5=1;
371 if (a3<min_read_len) { a3= GMIN(rseq.length()-1, min_read_len+1); }
372 }
373 tcode=1;
374 }
375 else if (tcode<=1 && tcode2>1) {
376 //"untrash" rseq2
377 if (b3-b5+1<min_read_len) {
378 b5=1;
379 if (b3<min_read_len) { b3= GMIN((rseq2.length()-1),(min_read_len+1)); }
380 }
381 tcode2=1;
382 }
383 if (tcode<=1) { //trimmed or left intact -- write it!
384 if (tcode>0) {
385 rseq2=rseq2.substr(b5,b3-b5+1);
386 if (!rqv2.is_empty()) rqv2=rqv2.substr(b5,b3-b5+1);
387 }
388 int nocounter=0;
389 writeRead(f_out2, seqid2, seqinfo2, rseq2, rqv2, nocounter);
390 }
391 } //pair read
392 if (tcode>1) { //trashed
393 #ifdef GDEBUG
394 GMessage(" !!!!TRASH => 'N'\n");
395 #endif
396 if (tcode=='s') trash_s++;
397 else if (tcode=='A' || tcode=='T') trash_poly++;
398 else if (tcode=='Q') trash_Q++;
399 else if (tcode=='N') trash_N++;
400 else if (tcode=='D') trash_D++;
401 else if (tcode=='3') trash_A3++;
402 else if (tcode=='5') trash_A5++;
403 if (trashReport) trash_report(tcode, seqid, freport);
404 }
405 else if (!doCollapse) { //write it
406 if (tcode>0) {
407 rseq=rseq.substr(a5,a3-a5+1);
408 if (!rqv.is_empty()) rqv=rqv.substr(a5,a3-a5+1);
409 }
410 #ifdef GDEBUG
411 GMessage(" After trimming:\n");
412 GMessage("%s\n",rseq.chars());
413 #endif
414 writeRead(f_out, seqid, seqinfo, rseq, rqv, outcounter);
415 }
416 } //for each fastq record
417 delete fq2;
418 FRCLOSE(f_in);
419 FRCLOSE(f_in2);
420 if (doCollapse) {
421 outcounter=0;
422 int maxdup_count=1;
423 char* maxdup_seq=NULL;
424 dhash.startIterate();
425 FqDupRec* qd=NULL;
426 char* seq=NULL;
427 while ((qd=dhash.NextData(seq))!=NULL) {
428 GStr rseq(seq);
429 //do the dusting here
430 if (doDust) {
431 int dustbases=dust(rseq);
432 if (dustbases>(rseq.length()>>1)) {
433 if (trashReport && qd->firstname!=NULL) {
434 fprintf(freport, "%s_x%d\tD\n",qd->firstname, qd->count);
435 }
436 trash_D+=qd->count;
437 continue;
438 }
439 }
440 outcounter++;
441 if (qd->count>maxdup_count) {
442 maxdup_count=qd->count;
443 maxdup_seq=seq;
444 }
445 if (isfasta) {
446 if (prefix.is_empty()) {
447 fprintf(f_out, ">%s_x%d\n%s\n", qd->firstname, qd->count,
448 rseq.chars());
449 }
450 else { //use custom read name
451 fprintf(f_out, ">%s%08d_x%d\n%s\n", prefix.chars(), outcounter,
452 qd->count, rseq.chars());
453 }
454 }
455 else { //fastq format
456 if (convert_phred) convertPhred(qd->qv, qd->len);
457 if (prefix.is_empty()) {
458 fprintf(f_out, "@%s_x%d\n%s\n+\n%s\n", qd->firstname, qd->count,
459 rseq.chars(), qd->qv);
460 }
461 else { //use custom read name
462 fprintf(f_out, "@%s%08d_x%d\n%s\n+\n%s\n", prefix.chars(), outcounter,
463 qd->count, rseq.chars(), qd->qv);
464 }
465 }
466 }//for each element of dhash
467 if (maxdup_count>1) {
468 GMessage("Maximum read multiplicity: x %d (read: %s)\n",maxdup_count, maxdup_seq);
469 }
470 } //collapse entries
471 if (verbose) {
472 if (paired_reads) {
473 GMessage(">Input files : %s , %s\n", infname.chars(), infname2.chars());
474 GMessage("Number of input pairs :%9d\n", incounter);
475 GMessage(" Output pairs :%9d\n", outcounter);
476 }
477 else {
478 GMessage(">Input file : %s\n", infname.chars());
479 GMessage("Number of input reads :%9d\n", incounter);
480 GMessage(" Output reads :%9d\n", outcounter);
481 }
482 GMessage("------------------------------------\n");
483 if (num_trimmed5)
484 GMessage(" 5' trimmed :%9d (min. trim: %d)\n", num_trimmed5, min_trimmed5);
485 if (num_trimmed3)
486 GMessage(" 3' trimmed :%9d (min. trim: %d)\n", num_trimmed3, min_trimmed3);
487 GMessage("------------------------------------\n");
488 if (trash_s>0)
489 GMessage(" Trashed by length:%9d\n", trash_s);
490 if (trash_N>0)
491 GMessage(" Trashed by N%%:%9d\n", trash_N);
492 if (trash_Q>0)
493 GMessage("Trashed by low quality:%9d\n", trash_Q);
494 if (trash_poly>0)
495 GMessage(" Trashed by poly-A/T:%9d\n", trash_poly);
496 if (trash_A5>0)
497 GMessage(" Trashed by 5' adapter:%9d\n", trash_A5);
498 if (trash_A3>0)
499 GMessage(" Trashed by 3' adapter:%9d\n", trash_A3);
500 }
501 if (trashReport) {
502 FWCLOSE(freport);
503 }
504 FWCLOSE(f_out);
505 FWCLOSE(f_out2);
506 } //while each input file
507 delete gxmem_l;
508 delete gxmem_r;
509 //getc(stdin);
510 }
511
512 class NData {
513 public:
514 int NPos[1024]; //there should be no reads longer than 1K ?
515 int NCount;
516 int end5;
517 int end3;
518 int seqlen;
519 double perc_N; //percentage of Ns in end5..end3 range only!
520 const char* seq;
521 bool valid;
522 NData() {
523 seqlen=0;
524 NCount=0;
525 end5=0;
526 end3=0;
527 seq=NULL;
528 perc_N=0;
529 valid=true;
530 }
531 void init(GStr& rseq) {
532 NCount=0;
533 perc_N=0;
534 seqlen=rseq.length();
535 seq=rseq.chars();
536 end5=1;
537 end3=seqlen;
538 for (int i=0;i<seqlen;i++)
539 if (seq[i]=='N') {// if (!ichrInStr(rseq[i], "ACGT")
540 NPos[NCount]=i;
541 NCount++;
542 }
543 perc_N=(NCount*100.0)/seqlen;
544 valid=true;
545 }
546 void N_calc() { //only in the region after trimming
547 int n=0;
548 for (int i=end3-1;i<end5;i++) {
549 if (seq[i]=='N') n++;
550 }
551 perc_N=(n*100.0)/(end5-end3+1);
552 }
553 };
554
555 static NData feat;
556 int perc_lenN=12; // incremental distance from ends, in percentage of
557 // sequence length, where N-trimming is done (default:12 %) (autolimited to 20)
558
559 void N_analyze(int l5, int l3, int p5, int p3) {
560 /* assumes feat was filled properly */
561 int old_dif, t5,t3,v;
562 if (l3<l5+2 || p5>p3 ) {
563 feat.end5=l5+1;
564 feat.end3=l3+1;
565 return;
566 }
567
568 t5=feat.NPos[p5]-l5;
569 t3=l3-feat.NPos[p3];
570 old_dif=p3-p5;
571 v=(int)((((double)(l3-l5))*perc_lenN)/100);
572 if (v>20) v=20; /* enforce N-search limit for very long reads */
573 if (t5 < v ) {
574 l5=feat.NPos[p5]+1;
575 p5++;
576 }
577 if (t3 < v) {
578 l3=feat.NPos[p3]-1;
579 p3--;
580 }
581 /* restNs=p3-p5; number of Ns in the new CLR */
582 if (p3-p5==old_dif) { /* no change, return */
583 /* don't trim if this may shorten the read too much */
584 //if (l5-l3<min_read_len) return;
585 feat.end5=l5+1;
586 feat.end3=l3+1;
587 return;
588 }
589 else
590 N_analyze(l5,l3, p5,p3);
591 }
592
593
594 bool qtrim(GStr& qvs, int &l5, int &l3) {
595 if (qvtrim_qmin==0 || qvs.is_empty()) return false;
596 if (qv_phredtype==0) {
597 //try to guess the Phred type
598 int vmin=256, vmax=0;
599 for (int i=0;i<qvs.length();i++) {
600 if (vmin>qvs[i]) vmin=qvs[i];
601 if (vmax<qvs[i]) vmax=qvs[i];
602 }
603 if (vmin<64) { qv_phredtype=33; qv_cvtadd=31; }
604 if (vmax>95) { qv_phredtype=64; qv_cvtadd=-31; }
605 if (qv_phredtype==0) {
606 GError("Error: couldn't determine Phred type, please use the -p33 or -p64 !\n");
607 }
608 if (verbose)
609 GMessage("Input reads have Phred-%d quality values.\n", (qv_phredtype==33 ? 33 : 64));
610 } //guessing Phred type
611 for (l3=qvs.length()-1;l3>2;l3--) {
612 if (qvs[l3]-qv_phredtype>=qvtrim_qmin && qvs[l3-1]-qv_phredtype>=qvtrim_qmin) break;
613 }
614 //just in case, check also the 5' the end (?)
615 for (l5=0;l5<qvs.length()-3;l5++) {
616 if (qvs[l5]-qv_phredtype>=qvtrim_qmin && qvs[l5+1]-qv_phredtype>=qvtrim_qmin) break;
617 }
618 if (qvtrim_max>0) {
619 if (qvs.length()-1-l3>qvtrim_max) l3=qvs.length()-1-qvtrim_max;
620 if (l5>qvtrim_max) l5=qvtrim_max;
621 }
622 return (l5>0 || l3<qvs.length()-1);
623 }
624
625 bool ntrim(GStr& rseq, int &l5, int &l3) {
626 //count Ns in the sequence, trim N-rich ends
627 feat.init(rseq);
628 l5=feat.end5-1;
629 l3=feat.end3-1;
630 N_analyze(feat.end5-1, feat.end3-1, 0, feat.NCount-1);
631 if (l5==feat.end5-1 && l3==feat.end3-1) {
632 if (feat.perc_N>max_perc_N) {
633 feat.valid=false;
634 l3=l5+1;
635 return true;
636 }
637 else {
638 return false; //nothing changed
639 }
640 }
641 l5=feat.end5-1;
642 l3=feat.end3-1;
643 if (l3-l5+1<min_read_len) {
644 feat.valid=false;
645 return true;
646 }
647 feat.N_calc();
648
649 if (feat.perc_N>max_perc_N) {
650 feat.valid=false;
651 l3=l5+1;
652 return true;
653 }
654 return true;
655 }
656
657 //--------------- dust functions ----------------
658 class DNADuster {
659 public:
660 int dustword;
661 int dustwindow;
662 int dustwindow2;
663 int dustcutoff;
664 int mv, iv, jv;
665 int counts[32*32*32];
666 int iis[32*32*32];
667 DNADuster(int cutoff=16, int winsize=32, int wordsize=3) {
668 dustword=wordsize;
669 dustwindow=winsize;
670 dustwindow2 = (winsize>>1);
671 dustcutoff=cutoff;
672 mv=0;
673 iv=0;
674 jv=0;
675 }
676 void setWindowSize(int value) {
677 dustwindow = value;
678 dustwindow2 = (dustwindow >> 1);
679 }
680 void setWordSize(int value) {
681 dustword=value;
682 }
683 void wo1(int len, const char* s, int ivv) {
684 int i, ii, j, v, t, n, n1, sum;
685 int js, nis;
686 n = 32 * 32 * 32;
687 n1 = n - 1;
688 nis = 0;
689 i = 0;
690 ii = 0;
691 sum = 0;
692 v = 0;
693 for (j=0; j < len; j++, s++) {
694 ii <<= 5;
695 if (*s<=32) {
696 i=0;
697 continue;
698 }
699 ii |= *s - 'A'; //assume uppercase!
700 ii &= n1;
701 i++;
702 if (i >= dustword) {
703 for (js=0; js < nis && iis[js] != ii; js++) ;
704 if (js == nis) {
705 iis[nis] = ii;
706 counts[ii] = 0;
707 nis++;
708 }
709 if ((t = counts[ii]) > 0) {
710 sum += t;
711 v = 10 * sum / j;
712 if (mv < v) {
713 mv = v;
714 iv = ivv;
715 jv = j;
716 }
717 }
718 counts[ii]++;
719 }
720 }
721 }
722
723 int wo(int len, const char* s, int* beg, int* end) {
724 int i, l1;
725 l1 = len - dustword + 1;
726 if (l1 < 0) {
727 *beg = 0;
728 *end = len - 1;
729 return 0;
730 }
731 mv = 0;
732 iv = 0;
733 jv = 0;
734 for (i=0; i < l1; i++) {
735 wo1(len-i, s+i, i);
736 }
737 *beg = iv;
738 *end = iv + jv;
739 return mv;
740 }
741
742 void dust(const char* seq, char* seqmsk, int seqlen, int cutoff=0) { //, maskFunc maskfn) {
743 int i, j, l, a, b, v;
744 if (cutoff==0) cutoff=dustcutoff;
745 a=0;b=0;
746 //GMessage("Dust cutoff=%d\n", cutoff);
747 for (i=0; i < seqlen; i += dustwindow2) {
748 l = (seqlen > i+dustwindow) ? dustwindow : seqlen-i;
749 v = wo(l, seq+i, &a, &b);
750 if (v > cutoff) {
751 //for (j = a; j <= b && j < dustwindow2; j++) {
752 for (j = a; j <= b; j++) {
753 seqmsk[i+j]='N';//could be made lowercase instead
754 }
755 }
756 }
757 //return first;
758 }
759 };
760
761 static DNADuster duster;
762
763 int dust(GStr& rseq) {
764 char* seq=Gstrdup(rseq.chars());
765 duster.dust(rseq.chars(), seq, rseq.length(), dust_cutoff);
766 //check the number of Ns:
767 int ncount=0;
768 for (int i=0;i<rseq.length();i++) {
769 if (seq[i]=='N') ncount++;
770 }
771 GFREE(seq);
772 return ncount;
773 }
774
775 struct SLocScore {
776 int pos;
777 int score;
778 SLocScore(int p=0,int s=0) {
779 pos=p;
780 score=s;
781 }
782 void set(int p, int s) {
783 pos=p;
784 score=s;
785 }
786 void add(int p, int add) {
787 pos=p;
788 score+=add;
789 }
790 };
791
792 bool trim_poly3(GStr &seq, int &l5, int &l3, const char* poly_seed) {
793 if (!doPolyTrim) return false;
794 int rlen=seq.length();
795 l5=0;
796 l3=rlen-1;
797 int32 seedVal=*(int32*)poly_seed;
798 char polyChar=poly_seed[0];
799 //assumes N trimming was already done
800 //so a poly match should be very close to the end of the read
801 // -- find the initial match (seed)
802 int lmin=GMAX((rlen-16), 0);
803 int li;
804 for (li=rlen-4;li>lmin;li--) {
805 if (seedVal==*(int*)&(seq[li])) {
806 break;
807 }
808 }
809 if (li<=lmin) return false;
810 //seed found, try to extend it both ways
811 //extend right
812 int ri=li+3;
813 SLocScore loc(ri, poly_m_score<<2);
814 SLocScore maxloc(loc);
815 //extend right
816 while (ri<rlen-1) {
817 ri++;
818 if (seq[ri]==polyChar) {
819 loc.add(ri,poly_m_score);
820 }
821 else if (seq[ri]=='N') {
822 loc.add(ri,0);
823 }
824 else { //mismatch
825 loc.add(ri,poly_mis_score);
826 if (maxloc.score-loc.score>poly_dropoff_score) break;
827 }
828 if (maxloc.score<=loc.score) {
829 maxloc=loc;
830 }
831 }
832 ri=maxloc.pos;
833 if (ri<rlen-6) return false; //no trimming wanted, too far from 3' end
834 //ri = right boundary for the poly match
835 //extend left
836 loc.set(li, maxloc.score);
837 maxloc.pos=li;
838 while (li>0) {
839 li--;
840 if (seq[li]==polyChar) {
841 loc.add(li,poly_m_score);
842 }
843 else if (seq[li]=='N') {
844 loc.add(li,0);
845 }
846 else { //mismatch
847 loc.add(li,poly_mis_score);
848 if (maxloc.score-loc.score>poly_dropoff_score) break;
849 }
850 if (maxloc.score<=loc.score) {
851 maxloc=loc;
852 }
853 }
854 li=maxloc.pos;
855 if ((maxloc.score==poly_minScore && ri==rlen-1) ||
856 (maxloc.score>poly_minScore && ri>=rlen-3) ||
857 (maxloc.score>(poly_minScore*3) && ri>=rlen-8)) {
858 //trimming this li-ri match at 3' end
859 l3=li-1;
860 if (l3<0) l3=0;
861 return true;
862 }
863 return false;
864 }
865
866 bool trim_poly5(GStr &seq, int &l5, int &l3, const char* poly_seed) {
867 if (!doPolyTrim) return false;
868 int rlen=seq.length();
869 l5=0;
870 l3=rlen-1;
871 int32 seedVal=*(int32*)poly_seed;
872 char polyChar=poly_seed[0];
873 //assumes N trimming was already done
874 //so a poly match should be very close to the end of the read
875 // -- find the initial match (seed)
876 int lmax=GMIN(12, rlen-4);//how far from 5' end to look for 4-mer seeds
877 int li;
878 for (li=0;li<=lmax;li++) {
879 if (seedVal==*(int*)&(seq[li])) {
880 break;
881 }
882 }
883 if (li>lmax) return false;
884 //seed found, try to extend it both ways
885 //extend left
886 int ri=li+3; //save rightmost base of the seed
887 SLocScore loc(li, poly_m_score<<2);
888 SLocScore maxloc(loc);
889 while (li>0) {
890 li--;
891 if (seq[li]==polyChar) {
892 loc.add(li,poly_m_score);
893 }
894 else if (seq[li]=='N') {
895 loc.add(li,0);
896 }
897 else { //mismatch
898 loc.add(li,poly_mis_score);
899 if (maxloc.score-loc.score>poly_dropoff_score) break;
900 }
901 if (maxloc.score<=loc.score) {
902 maxloc=loc;
903 }
904 }
905 li=maxloc.pos;
906 if (li>5) return false; //no trimming wanted, too far from 5' end
907 //li = right boundary for the poly match
908
909 //extend right
910 loc.set(ri, maxloc.score);
911 maxloc.pos=ri;
912 while (ri<rlen-1) {
913 ri++;
914 if (seq[ri]==polyChar) {
915 loc.add(ri,poly_m_score);
916 }
917 else if (seq[ri]=='N') {
918 loc.add(ri,0);
919 }
920 else { //mismatch
921 loc.add(ri,poly_mis_score);
922 if (maxloc.score-loc.score>poly_dropoff_score) break;
923 }
924 if (maxloc.score<=loc.score) {
925 maxloc=loc;
926 }
927 }
928 ri=maxloc.pos;
929 if ((maxloc.score==poly_minScore && li==0) ||
930 (maxloc.score>poly_minScore && li<2)
931 || (maxloc.score>(poly_minScore*3) && li<8)) {
932 //adjust l5 to reflect this trimming of 5' end
933 l5=ri+1;
934 if (l5>rlen-1) l5=rlen-1;
935 return true;
936 }
937 return false;
938 }
939
940 bool trim_adapter3(GStr& seq, int&l5, int &l3) {
941 if (adapters3.Count()==0) return false;
942 int rlen=seq.length();
943 l5=0;
944 l3=rlen-1;
945 bool trimmed=false;
946 GStr wseq(seq.chars());
947 int wlen=rlen;
948 for (int ai=0;ai<adapters3.Count();ai++) {
949 //if (adapters3[ai].is_empty()) continue;
950 int alen=adapters3[ai].seq.length();
951 GStr& aseq=adapters3[ai].seq;
952 GXAlnInfo* r_bestaln=match_RightEnd(aseq.chars(), alen, adapters3[ai].pz,
953 wseq.chars(), wlen, gxmem_r, 74);
954 if (r_bestaln) {
955 trimmed=true;
956 //keep unmatched region on the left, if any
957 l3-=(wlen-r_bestaln->sl+1);
958 delete r_bestaln;
959 if (l3<0) l3=0;
960 if (l3-l5+1<min_read_len) return true;
961 wseq=seq.substr(l5,l3-l5+1);
962 wlen=wseq.length();
963 }
964 }//for each 5' adapter
965 return trimmed;
966 }
967
968 bool trim_adapter5(GStr& seq, int&l5, int &l3) {
969 if (adapters5.Count()==0) return false;
970 int rlen=seq.length();
971 l5=0;
972 l3=rlen-1;
973 bool trimmed=false;
974 GStr wseq(seq.chars());
975 int wlen=rlen;
976 for (int ai=0;ai<adapters5.Count();ai++) {
977 //if (adapters5[ai].is_empty()) continue;
978 int alen=adapters5[ai].seq.length();
979 GStr& aseq=adapters5[ai].seq;
980 GXAlnInfo* l_bestaln=match_LeftEnd(aseq.chars(), alen, adapters5[ai].pz,
981 wseq.chars(), wlen, gxmem_l, 84);
982 if (l_bestaln) {
983 trimmed=true;
984 l5+=l_bestaln->sr;
985 delete l_bestaln;
986 if (l5>=rlen) l5=rlen-1;
987 if (l3-l5+1<min_read_len) return true;
988 wseq=seq.substr(l5,l3-l5+1);
989 wlen=wseq.length();
990 }
991 }//for each 5' adapter
992 return trimmed;
993 }
994
995 //convert qvs to/from phred64 from/to phread33
996 void convertPhred(GStr& q) {
997 for (int i=0;i<q.length();i++) q[i]+=qv_cvtadd;
998 }
999
1000 void convertPhred(char* q, int len) {
1001 for (int i=0;i<len;i++) q[i]+=qv_cvtadd;
1002 }
1003
1004 bool getFastxRec(GLineReader& fq, GStr& rseq, GStr& rqv,
1005 GStr& rname, GStr& rinfo, GStr& infname) {
1006 rseq="";
1007 rqv="";
1008 rname="";
1009 rinfo="";
1010 if (fq.eof()) return false;
1011 char* l=fq.getLine();
1012 while (l!=NULL && (l[0]==0 || isspace(l[0]))) l=fq.getLine(); //ignore empty lines
1013 if (l==NULL) return false;
1014 /* if (rawFormat) {
1015 //TODO: implement raw qseq parsing here
1016 //if (raw type=N) then continue; //skip invalid/bad records
1017 } //raw qseq format
1018 else { // FASTQ or FASTA */
1019 isfasta=(l[0]=='>');
1020 if (!isfasta && l[0]!='@') GError("Error: fasta/fastq record marker not found(%s)\n%s\n",
1021 infname.chars(), l);
1022 GStr s(l);
1023 rname=&(l[1]);
1024 for (int i=0;i<rname.length();i++)
1025 if (rname[i]<=' ') {
1026 if (i<rname.length()-2) rinfo=rname.substr(i+1);
1027 rname.cut(i);
1028 break;
1029 }
1030 //now get the sequence
1031 if ((l=fq.getLine())==NULL)
1032 GError("Error: unexpected EOF after header for read %s (%s)\n",
1033 rname.chars(), infname.chars());
1034 rseq=l; //this must be the DNA line
1035 while ((l=fq.getLine())!=NULL) {
1036 //seq can span multiple lines
1037 if (l[0]=='>' || l[0]=='+') {
1038 fq.pushBack();
1039 break; //
1040 }
1041 rseq+=l;
1042 } //check for multi-line seq
1043 if (!isfasta) { //reading fastq quality values, which can also be multi-line
1044 if ((l=fq.getLine())==NULL)
1045 GError("Error: unexpected EOF after sequence for %s\n", rname.chars());
1046 if (l[0]!='+') GError("Error: fastq qv header marker not detected!\n");
1047 if ((l=fq.getLine())==NULL)
1048 GError("Error: unexpected EOF after qv header for %s\n", rname.chars());
1049 rqv=l;
1050 //if (rqv.length()!=rseq.length())
1051 // GError("Error: qv len != seq len for %s\n", rname.chars());
1052 while (rqv.length()<rseq.length() && ((l=fq.getLine())!=NULL)) {
1053 rqv+=l; //append to qv string
1054 }
1055 }// fastq
1056 // } //<-- FASTA or FASTQ
1057 rseq.upper();
1058 return true;
1059 }
1060
1061 #ifdef GDEBUG
1062 void showTrim(GStr& s, int l5, int l3) {
1063 if (l5>0) {
1064 color_bg(c_red);
1065 }
1066 for (int i=0;i<s.length()-1;i++) {
1067 if (i && i==l5) color_resetbg();
1068 fprintf(stderr, "%c", s[i]);
1069 if (i==l3) color_bg(c_red);
1070 }
1071 fprintf(stderr, "%c", s[s.length()-1]);
1072 color_reset();
1073 fprintf(stderr, "\n");
1074 }
1075 #endif
1076
1077 char process_read(GStr& rname, GStr& rseq, GStr& rqv, int &l5, int &l3) {
1078 //returns 0 if the read was untouched, 1 if it was just trimmed
1079 // and a trash code if it was trashed
1080 l5=0;
1081 l3=rseq.length()-1;
1082 #ifdef GDEBUG
1083 //rseq.reverse();
1084 GMessage(">%s\n", rname.chars());
1085 GMessage("%s\n",rseq.chars());
1086 #endif
1087 if (l3-l5+1<min_read_len) {
1088 return 's';
1089 }
1090 GStr wseq(rseq.chars());
1091 GStr wqv(rqv.chars());
1092 int w5=l5;
1093 int w3=l3;
1094 //first do the q-based trimming
1095 if (qvtrim_qmin!=0 && !wqv.is_empty() && qtrim(wqv, w5, w3)) { // qv-threshold trimming
1096 if (w3-w5+1<min_read_len) {
1097 return 'Q'; //invalid read
1098 }
1099 //-- keep only the w5..w3 range
1100 l5=w5;
1101 l3=w3;
1102 wseq=wseq.substr(w5, w3-w5+1);
1103 if (!wqv.is_empty())
1104 wqv=wqv.substr(w5, w3-w5+1);
1105 } //qv trimming
1106 // N-trimming on the remaining read seq
1107 if (ntrim(wseq, w5, w3)) {
1108 //GMessage("before: %s\n",wseq.chars());
1109 //GMessage("after : %s (%d)\n",wseq.substr(w5,w3-w5+1).chars(),w3-w5+1);
1110 l5+=w5;
1111 l3-=(wseq.length()-1-w3);
1112 if (w3-w5+1<min_read_len) {
1113 return 'N'; //to be trashed
1114 }
1115 //-- keep only the w5..w3 range
1116 wseq=wseq.substr(w5, w3-w5+1);
1117 if (!wqv.is_empty())
1118 wqv=wqv.substr(w5, w3-w5+1);
1119 w5=0;
1120 w3=wseq.length()-1;
1121 }
1122 char trim_code;
1123 do {
1124 trim_code=0;
1125 if (trim_poly5(wseq, w5, w3, polyA_seed)) {
1126 trim_code='A';
1127 }
1128 else if (trim_poly5(wseq, w5, w3, polyT_seed)) {
1129 trim_code='T';
1130 }
1131 else if (trim_adapter5(wseq, w5, w3)) {
1132 trim_code='5';
1133 }
1134 if (trim_code) {
1135 #ifdef GDEBUG
1136 GMessage("#### TRIM by '%c' code ( w5-w3 = %d-%d ):\n",trim_code, w5,w3);
1137 showTrim(wseq, w5, w3);
1138 #endif
1139 int trimlen=wseq.length()-(w3-w5+1);
1140 num_trimmed5++;
1141 if (trimlen<min_trimmed5)
1142 min_trimmed5=trimlen;
1143 l5+=w5;
1144 l3-=(wseq.length()-1-w3);
1145 if (w3-w5+1<min_read_len) {
1146 return trim_code;
1147 }
1148 //-- keep only the w5..w3 range
1149 wseq=wseq.substr(w5, w3-w5+1);
1150 if (!wqv.is_empty())
1151 wqv=wqv.substr(w5, w3-w5+1);
1152 }// trimmed at 5' end
1153 } while (trim_code);
1154
1155 do {
1156 trim_code=0;
1157 if (trim_poly3(wseq, w5, w3, polyA_seed)) {
1158 trim_code='A';
1159 }
1160 else if (trim_poly3(wseq, w5, w3, polyT_seed)) {
1161 trim_code='T';
1162 }
1163 else if (trim_adapter3(wseq, w5, w3)) {
1164 trim_code='3';
1165 }
1166 if (trim_code) {
1167 #ifdef GDEBUG
1168 GMessage("#### TRIM by '%c' code ( w5-w3 = %d-%d ):\n",trim_code, w5,w3);
1169 showTrim(wseq, w5, w3);
1170 #endif
1171 int trimlen=wseq.length()-(w3-w5+1);
1172 num_trimmed3++;
1173 if (trimlen<min_trimmed3)
1174 min_trimmed3=trimlen;
1175 l5+=w5;
1176 l3-=(wseq.length()-1-w3);
1177 if (w3-w5+1<min_read_len) {
1178 return trim_code;
1179 }
1180 //-- keep only the w5..w3 range
1181 wseq=wseq.substr(w5, w3-w5+1);
1182 if (!wqv.is_empty())
1183 wqv=wqv.substr(w5, w3-w5+1);
1184 }//trimming at 3' end
1185 } while (trim_code);
1186
1187
1188 if (doCollapse) {
1189 //keep read for later
1190 FqDupRec* dr=dhash.Find(wseq.chars());
1191 if (dr==NULL) { //new entry
1192 //if (prefix.is_empty())
1193 dhash.Add(wseq.chars(),
1194 new FqDupRec(&wqv, rname.chars()));
1195 //else dhash.Add(wseq.chars(), new FqDupRec(wqv.chars(),wqv.length()));
1196 }
1197 else
1198 dr->add(wqv);
1199 } //collapsing duplicates
1200 else { //not collapsing duplicates
1201 //apply the dust filter now
1202 if (doDust) {
1203 int dustbases=dust(wseq);
1204 if (dustbases>(wseq.length()>>1)) {
1205 return 'D';
1206 }
1207 }
1208 } //not collapsing duplicates
1209 return (l5>0 || l3<rseq.length()-1) ? 1 : 0;
1210 }
1211
1212 void printHeader(FILE* f_out, char recmarker, GStr& rname, GStr& rinfo) {
1213 //GMessage("printing Header..%c%s\n",recmarker, rname.chars());
1214 if (rinfo.is_empty()) fprintf(f_out, "%c%s\n",recmarker,rname.chars());
1215 else fprintf(f_out, "%c%s %s\n",recmarker, rname.chars(), rinfo.chars());
1216 }
1217
1218 void writeRead(FILE* f_out, GStr& rname, GStr& rinfo, GStr& rseq, GStr& rqv, int& outcounter) {
1219 outcounter++;
1220 bool asFasta=(rqv.is_empty() || fastaOutput);
1221 if (asFasta) {
1222 if (prefix.is_empty()) {
1223 printHeader(f_out, '>',rname,rinfo);
1224 fprintf(f_out, "%s\n", rseq.chars()); //plain one-line fasta for now
1225 }
1226 else {
1227 fprintf(f_out, ">%s%08d\n%s\n", prefix.chars(), outcounter,
1228 rseq.chars());
1229 }
1230 }
1231 else { //fastq
1232 if (convert_phred) convertPhred(rqv);
1233 if (prefix.is_empty()) {
1234 printHeader(f_out, '@', rname, rinfo);
1235 fprintf(f_out, "%s\n+\n%s\n", rseq.chars(), rqv.chars());
1236 }
1237 else
1238 fprintf(f_out, "@%s_%08d\n%s\n+\n%s\n", prefix.chars(), outcounter,
1239 rseq.chars(),rqv.chars() );
1240 }
1241 }
1242
1243 void trash_report(char trashcode, GStr& rname, FILE* freport) {
1244 if (freport==NULL || trashcode<=' ') return;
1245 if (trashcode=='3' || trashcode=='5') {
1246 fprintf(freport, "%s\ta%c\n",rname.chars(),trashcode);
1247 }
1248 else {
1249 fprintf(freport, "%s\t%c\n",rname.chars(),trashcode);
1250 }
1251 //tcounter++;
1252 }
1253
1254 GStr getFext(GStr& s, int* xpos=NULL) {
1255 //s must be a filename without a path
1256 GStr r("");
1257 if (xpos!=NULL) *xpos=0;
1258 if (s.is_empty() || s=="-") return r;
1259 int p=s.rindex('.');
1260 int d=s.rindex('/');
1261 if (p<=0 || p>s.length()-2 || p<s.length()-7 || p<d) return r;
1262 r=s.substr(p+1);
1263 if (xpos!=NULL) *xpos=p+1;
1264 r.lower();
1265 return r;
1266 }
1267
1268 void baseFileName(GStr& fname) {
1269 //remove all known extensions, like .txt,fq,fastq,fasta,fa)(.gz .gzip .bz2 .bzip2) .
1270 int xpos=0;
1271 GStr fext=getFext(fname, &xpos);
1272 if (xpos<=1) return;
1273 bool extdel=false;
1274 GStr f2;
1275 if (fext=="z" || fext=="zip") {
1276 extdel=true;
1277 }
1278 else if (fext.length()>=2) {
1279 f2=fext.substr(0,2);
1280 extdel=(f2=="gz" || f2=="bz");
1281 }
1282 if (extdel) {
1283 fname.cut(xpos-1);
1284 fext=getFext(fname, &xpos);
1285 if (xpos<=1) return;
1286 }
1287 extdel=false;
1288 if (fext=="f" || fext=="fq" || fext=="txt" || fext=="seq" || fext=="sequence") {
1289 extdel=true;
1290 }
1291 else if (fext.length()>=2) {
1292 extdel=(fext.substr(0,2)=="fa");
1293 }
1294 if (extdel) fname.cut(xpos-1);
1295 GStr fncp(fname);
1296 fncp.lower();
1297 fncp.chomp("seq");
1298 fncp.chomp("sequence");
1299 fncp.trimR("_.");
1300 if (fncp.length()<fname.length()) fname.cut(fncp.length());
1301 }
1302
1303 FILE* prepOutFile(GStr& infname, GStr& pocmd) {
1304 FILE* f_out=NULL;
1305 GStr fname(getFileName(infname.chars()));
1306 //eliminate known extensions
1307 baseFileName(fname);
1308 if (outsuffix.is_empty() || outsuffix=="-") { return stdout; }
1309 else if (pocmd.is_empty()) {
1310 GStr oname(fname);
1311 oname.append('.');
1312 oname.append(outsuffix);
1313 f_out=fopen(oname.chars(),"w");
1314 if (f_out==NULL) GError("Error: cannot create '%s'\n",oname.chars());
1315 }
1316 else {
1317 GStr oname(">");
1318 oname.append(fname);
1319 oname.append('.');
1320 oname.append(outsuffix);
1321 pocmd.append(oname);
1322 f_out=popen(pocmd.chars(), "w");
1323 if (f_out==NULL) GError("Error: cannot popen '%s'\n",pocmd.chars());
1324 }
1325 return f_out;
1326 }
1327
1328 void guess_unzip(GStr& fname, GStr& picmd) {
1329 GStr fext=getFext(fname);
1330 if (fext=="gz" || fext=="gzip" || fext=="z") {
1331 picmd="gzip -cd ";
1332 }
1333 else if (fext=="bz2" || fext=="bzip2" || fext=="bz" || fext=="bzip") {
1334 picmd="bzip2 -cd ";
1335 }
1336 }
1337
1338 void addAdapter(GVec<CASeqData>& adapters, GStr& seq) {
1339 //TODO: prepare CASeqData here, and collect hexamers as well
1340 CASeqData adata(revCompl);
1341 int idx=adapters.Add(adata);
1342 if (idx<0) GError("Error: failed to add adaptor!\n");
1343 adapters[idx].update(seq.chars());
1344 }
1345
1346
1347 int loadAdapters(const char* fname) {
1348 GLineReader lr(fname);
1349 char* l;
1350 while ((l=lr.nextLine())!=NULL) {
1351 if (lr.length()<=3 || l[0]=='#') continue;
1352 if ( l[0]==' ' || l[0]=='\t' || l[0]==',' ||
1353 l[0]==';'|| l[0]==':' ) {
1354 int i=1;
1355 while (l[i]!=0 && isspace(l[i])) {
1356 i++;
1357 }
1358 if (l[i]!=0) {
1359 GStr s(&(l[i]));
1360 #ifdef GDEBUG
1361 //s.reverse();
1362 #endif
1363 addAdapter(adapters3, s);
1364 continue;
1365 }
1366 }
1367 else {
1368 GStr s(l);
1369 s.startTokenize("\t ;,:");
1370 GStr a5,a3;
1371 if (s.nextToken(a5))
1372 s.nextToken(a3);
1373 a5.upper();
1374 a3.upper();
1375 #ifdef GDEBUG
1376 // a5.reverse();
1377 // a3.reverse();
1378 #endif
1379 addAdapter(adapters5, a5);
1380 addAdapter(adapters3, a3);
1381 }
1382 }
1383 return adapters5.Count()+adapters3.Count();
1384 }
1385
1386 void setupFiles(FILE*& f_in, FILE*& f_in2, FILE*& f_out, FILE*& f_out2,
1387 GStr& s, GStr& infname, GStr& infname2) {
1388 // uses outsuffix to generate output file names and open file handles as needed
1389 infname="";
1390 infname2="";
1391 f_in=NULL;
1392 f_in2=NULL;
1393 f_out=NULL;
1394 f_out2=NULL;
1395 //analyze outsuffix intent
1396 GStr pocmd;
1397 if (outsuffix=="-") {
1398 f_out=stdout;
1399 }
1400 else {
1401 GStr ox=getFext(outsuffix);
1402 if (ox.length()>2) ox=ox.substr(0,2);
1403 if (ox=="gz") pocmd="gzip -9 -c ";
1404 else if (ox=="bz") pocmd="bzip2 -9 -c ";
1405 }
1406 if (s=="-") {
1407 f_in=stdin;
1408 infname="stdin";
1409 f_out=prepOutFile(infname, pocmd);
1410 return;
1411 } // streaming from stdin
1412 s.startTokenize(",:");
1413 s.nextToken(infname);
1414 bool paired=s.nextToken(infname2);
1415 if (fileExists(infname.chars())==0)
1416 GError("Error: cannot find file %s!\n",infname.chars());
1417 GStr fname(getFileName(infname.chars()));
1418 GStr picmd;
1419 guess_unzip(fname, picmd);
1420 if (picmd.is_empty()) {
1421 f_in=fopen(infname.chars(), "r");
1422 if (f_in==NULL) GError("Error opening file '%s'!\n",infname.chars());
1423 }
1424 else {
1425 picmd.append(infname);
1426 f_in=popen(picmd.chars(), "r");
1427 if (f_in==NULL) GError("Error at popen %s!\n", picmd.chars());
1428 }
1429 if (f_out==stdout) {
1430 if (paired) GError("Error: output suffix required for paired reads\n");
1431 return;
1432 }
1433 f_out=prepOutFile(infname, pocmd);
1434 if (!paired) return;
1435 if (doCollapse) GError("Error: sorry, -C option cannot be used with paired reads!\n");
1436 // ---- paired reads:-------------
1437 if (fileExists(infname2.chars())==0)
1438 GError("Error: cannot find file %s!\n",infname2.chars());
1439 picmd="";
1440 GStr fname2(getFileName(infname2.chars()));
1441 guess_unzip(fname2, picmd);
1442 if (picmd.is_empty()) {
1443 f_in2=fopen(infname2.chars(), "r");
1444 if (f_in2==NULL) GError("Error opening file '%s'!\n",infname2.chars());
1445 }
1446 else {
1447 picmd.append(infname2);
1448 f_in2=popen(picmd.chars(), "r");
1449 if (f_in2==NULL) GError("Error at popen %s!\n", picmd.chars());
1450 }
1451 f_out2=prepOutFile(infname2, pocmd);
1452 }