ViewVC Help
View File | Revision Log | Show Annotations | View Changeset | Root Listing
root/gclib/fqtrim/fqtrim.cpp
Revision: 177
Committed: Thu Feb 16 07:10:32 2012 UTC (7 years, 8 months ago) by gpertea
File size: 43339 byte(s)
Log Message:
collectSeeds unified

Line File contents
1 #include "GArgs.h"
2 #include "GStr.h"
3 #include "GHash.hh"
4 #include "GList.hh"
5 #include <ctype.h>
6 #include "GAlnExtend.h"
7
8 #define USAGE "Usage:\n\
9 fqtrim [{-5 <5adapter> -3 <3adapter>|-f <adapters_file>}] [-a <min_matchlen>]\\\n\
10 [-R] [-q <minq> [-t <trim_max_len>]] [-p {64|33}] [-o <outsuffix>]\\\n\
11 [-l <minlen>] [-C] [-D] [-Q] [-n <rename_prefix>] [-r <discarded.lst>]\\\n\
12 <input.fq>[,<input_mates.fq>\n\
13 \n\
14 Trim low quality bases at the 3' end and can trim adapter sequence(s), filter\n\
15 for low complexity and collapse duplicate reads.\n\
16 If read pairs should be trimmed and kept together (i.e. without discarding\n\
17 one read in a pair), the two file names should be given delimited by a comma\n\
18 or a colon character\n\
19 \n\
20 Options:\n\
21 -n rename all the reads using the <prefix> followed by a read counter;\n\
22 if -C option was given, the suffix \"_x<N>\" is appended, with <N> being\n\
23 the read duplication count\n\
24 -o unless this parameter is '-', write the trimmed/filtered reads to \n\
25 file(s) named <input>.<outsuffix> which will be created in the \n\
26 current (working) directory; (writes to stdout if -o- is given);\n\
27 a suffix ending with .gz, .gzip or .bz2 will enforce compression\n\
28 -f file with adapter sequences to trim, each line having this format:\n\
29 <5'-adapter-sequence> <3'-adapter-sequence>\n\
30 -5 trim the given adapter or primer sequence at the 5' end of each read\n\
31 (e.g. -5 CGACAGGTTCAGAGTTCTACAGTCCGACGATC)\n\
32 -3 trim the given adapter sequence at the 3' end of each read\n\
33 (e.g. -3 TCGTATGCCGTCTTCTGCTTG)\n\
34 -A disable polyA/T trimming (enabled by default)\n\
35 -y minimum length of exact match to adaptor sequence at the proper end (6)\n\
36 -q trim bases with quality value lower than <minq> (starting at the 3' end)\n\
37 -t for -q option, maximum trimming at the 3' end is limited to <trim_max_len>\n\
38 -m maximum percentage of Ns allowed in a read after trimming (default 7)\n\
39 -l minimum \"clean\" length after trimming that a read must have\n\
40 in order to pass the filter (default: 16)\n\
41 -r write a simple \"trash report\" file listing the discarded reads with a\n\
42 one letter code for the reason why they were trashed\n\
43 -D apply a low-complexity (dust) filter and discard any read that has over\n\
44 50% of its length detected as low complexity\n\
45 -C collapse duplicate reads and add a prefix with their count to the read\n\
46 name (e.g. for microRNAs)\n\
47 -p input is phred64/phred33 (use -p64 or -p33)\n\
48 -Q convert quality values to the other Phred qv type\n\
49 -V verbose processing\n\
50 "
51
52 //-z for -o option, the output stream(s) will be first piped into the given\n
53 // <zcmd> command, which must output to stdout (e.g. -z 'bzip2 -9 -c')\n
54
55
56 // example 3' adapter for miRNAs: TCGTATGCCGTCTTCTGCTTG
57
58 //For paired reads sequencing:
59 //3' : ACACTCTTTCCCTACACGACGCTCTTCCGATCT
60 //5' : GATCGGAAGAGCGGTTCAGCAGGAATGCCGAG
61 //FILE* f_out=NULL; //stdout if not provided
62 //FILE* f_out2=NULL; //for paired reads
63 //FILE* f_in=NULL; //input fastq (stdin if not provided)
64 //FILE* f_in2=NULL; //for paired reads
65
66 FILE* freport=NULL;
67
68 bool debug=false;
69 bool verbose=false;
70 bool doCollapse=false;
71 bool doDust=false;
72 bool doPolyTrim=true;
73 bool fastaOutput=false;
74 bool trashReport=false;
75 bool revCompl=false; //also reverse complement adaptor sequences
76 int min_read_len=16;
77 double max_perc_N=7.0;
78 int dust_cutoff=16;
79 bool isfasta=false;
80 bool convert_phred=false;
81 GStr outsuffix; // -o
82 GStr prefix;
83 GStr zcmd;
84 int num_trimmed5=0;
85 int num_trimmed3=0;
86 int num_trimmedN=0;
87 int num_trimmedQ=0;
88 int min_trimmed5=INT_MAX;
89 int min_trimmed3=INT_MAX;
90
91 int qvtrim_qmin=0;
92 int qvtrim_max=0; //for -q, do not trim at 3'-end more than this number of bases
93 int qv_phredtype=0; // could be 64 or 33 (0 means undetermined yet)
94 int qv_cvtadd=0; //could be -31 or +31
95
96 // adaptor matching metrics -- for X-drop ungapped extension
97 //const int match_reward=2;
98 //const int mismatch_penalty=3;
99 const int match_reward=2;
100 const int mismatch_penalty=4;
101 const int Xdrop=10;
102
103 const int poly_m_score=2; //match score
104 const int poly_mis_score=-3; //mismatch
105 const int poly_dropoff_score=7;
106 int poly_minScore=12; //i.e. an exact match of 6 bases at the proper ends WILL be trimmed
107
108 const char *polyA_seed="AAAA";
109 const char *polyT_seed="TTTT";
110
111 struct CASeqData {
112 //positional data for every possible hexamer in an adapter
113 GVec<uint16>* pz[4096]; //0-based coordinates of all possible hexamers in the adapter sequence
114 GVec<uint16>* pzr[4096]; //0-based coordinates of all possible hexamers for the reverse complement of the adapter sequence
115 GStr seq; //actual adapter sequence data
116 GStr seqr; //reverse complement sequence
117 int amlen; //fraction of adapter length matching that's
118 //enough to consider the alignment
119 bool use_reverse;
120 CASeqData(bool rev=false):seq(),seqr(),
121 amlen(0), use_reverse(rev) {
122 for (int i=0;i<4096;i++) {
123 pz[i]=NULL;
124 pzr[i]=NULL;
125 }
126 }
127
128 void update(const char* s) {
129 seq=s;
130 table6mers(seq.chars(), seq.length(), pz);
131 amlen=iround(double(seq.length())*0.8);
132 if (amlen<12)
133 amlen=12;
134 if (!use_reverse) return;
135 //reverse complement
136 seqr=s;
137 int slen=seq.length();
138 for (int i=0;i<slen;i++)
139 seqr[i]=ntComplement(seq[slen-i-1]);
140 table6mers(seqr.chars(), seqr.length(), pzr);
141 }
142
143 ~CASeqData() {
144 for (int i=0;i<4096;i++) {
145 delete pz[i];
146 delete pzr[i];
147 }
148 }
149 };
150
151 GVec<CASeqData> adapters5;
152 GVec<CASeqData> adapters3;
153
154 CGreedyAlignData* gxmem_l=NULL;
155 CGreedyAlignData* gxmem_r=NULL;
156
157 // element in dhash:
158 class FqDupRec {
159 public:
160 int count; //how many of these reads are the same
161 int len; //length of qv
162 char* firstname; //optional, only if we want to keep the original read names
163 char* qv;
164 FqDupRec(GStr* qstr=NULL, const char* rname=NULL) {
165 len=0;
166 qv=NULL;
167 firstname=NULL;
168 count=0;
169 if (qstr!=NULL) {
170 qv=Gstrdup(qstr->chars());
171 len=qstr->length();
172 count++;
173 }
174 if (rname!=NULL) firstname=Gstrdup(rname);
175 }
176 ~FqDupRec() {
177 GFREE(qv);
178 GFREE(firstname);
179 }
180 void add(GStr& d) { //collapse another record into this one
181 if (d.length()!=len)
182 GError("Error at FqDupRec::add(): cannot collapse reads with different length!\n");
183 count++;
184 for (int i=0;i<len;i++)
185 qv[i]=(qv[i]+d[i])/2;
186 }
187 };
188
189 void openfw(FILE* &f, GArgs& args, char opt) {
190 GStr s=args.getOpt(opt);
191 if (!s.is_empty()) {
192 if (s=='-') f=stdout;
193 else {
194 f=fopen(s.chars(),"w");
195 if (f==NULL) GError("Error creating file: %s\n", s.chars());
196 }
197 }
198 }
199
200 #define FWCLOSE(fh) if (fh!=NULL && fh!=stdout) fclose(fh)
201 #define FRCLOSE(fh) if (fh!=NULL && fh!=stdin) fclose(fh)
202
203 GHash<FqDupRec> dhash; //hash to keep track of duplicates
204
205 void addAdapter(GVec<CASeqData>& adapters, GStr& seq);
206 int loadAdapters(const char* fname);
207
208 void setupFiles(FILE*& f_in, FILE*& f_in2, FILE*& f_out, FILE*& f_out2,
209 GStr& s, GStr& infname, GStr& infname2);
210 // uses outsuffix to generate output file names and open file handles as needed
211
212 void writeRead(FILE* f_out, GStr& rname, GStr& rinfo, GStr& rseq, GStr& rqv, int& outcounter);
213 void trash_report(char trashcode, GStr& rname, FILE* freport);
214
215 bool getFastxRec(GLineReader& fq, GStr& rseq, GStr& rqv,
216 GStr& rname, GStr& rinfo, GStr& infname);
217
218 char process_read(GStr& rname, GStr& rseq, GStr& rqv, int &l5, int &l3);
219 //returns 0 if the read was untouched, 1 if it was trimmed and a trash code if it was trashed
220
221 bool ntrim(GStr& rseq, int &l5, int &l3); //returns true if any trimming occured
222 bool qtrim(GStr& qvs, int &l5, int &l3); //return true if any trimming occured
223 int dust(GStr& seq);
224 bool trim_poly5(GStr &seq, int &l5, int &l3, const char* poly_seed); //returns true if any trimming occured
225 bool trim_poly3(GStr &seq, int &l5, int &l3, const char* poly_seed);
226 bool trim_adapter5(GStr& seq, int &l5, int &l3); //returns true if any trimming occured
227 bool trim_adapter3(GStr& seq, int &l5, int &l3);
228
229 void convertPhred(char* q, int len);
230 void convertPhred(GStr& q);
231
232 int main(int argc, char * const argv[]) {
233 GArgs args(argc, argv, "YQDCRVAl:d:3:5:m:n:r:p:q:f:t:o:z:a:");
234 int e;
235 if ((e=args.isError())>0) {
236 GMessage("%s\nInvalid argument: %s\n", USAGE, argv[e]);
237 exit(224);
238 }
239 debug=(args.getOpt('Y')!=NULL);
240 verbose=(args.getOpt('V')!=NULL);
241 convert_phred=(args.getOpt('Q')!=NULL);
242 doCollapse=(args.getOpt('C')!=NULL);
243 doDust=(args.getOpt('D')!=NULL);
244 revCompl=(args.getOpt('R')!=NULL);
245 if (args.getOpt('A')) doPolyTrim=false;
246 /*
247 rawFormat=(args.getOpt('R')!=NULL);
248 if (rawFormat) {
249 GError("Sorry, raw qseq format parsing is not implemented yet!\n");
250 }
251 */
252 prefix=args.getOpt('n');
253 GStr s=args.getOpt('l');
254 if (!s.is_empty())
255 min_read_len=s.asInt();
256 s=args.getOpt('m');
257 if (!s.is_empty())
258 max_perc_N=s.asDouble();
259 s=args.getOpt('d');
260 if (!s.is_empty()) {
261 dust_cutoff=s.asInt();
262 doDust=true;
263 }
264 s=args.getOpt('q');
265 if (!s.is_empty()) {
266 qvtrim_qmin=s.asInt();
267 }
268 s=args.getOpt('t');
269 if (!s.is_empty()) {
270 qvtrim_max=s.asInt();
271 }
272 s=args.getOpt('p');
273 if (!s.is_empty()) {
274 int v=s.asInt();
275 if (v==33) {
276 qv_phredtype=33;
277 qv_cvtadd=31;
278 }
279 else if (v==64) {
280 qv_phredtype=64;
281 qv_cvtadd=-31;
282 }
283 else
284 GMessage("%s\nInvalid value for -p option (can only be 64 or 33)!\n",USAGE);
285 }
286 s=args.getOpt('f');
287 if (!s.is_empty()) {
288 loadAdapters(s.chars());
289 }
290 bool fileAdapters=adapters5.Count()+adapters3.Count();
291 s=args.getOpt('5');
292 if (!s.is_empty()) {
293 if (fileAdapters)
294 GError("Error: options -5 and -f cannot be used together!\n");
295 s.upper();
296 addAdapter(adapters5, s);
297 }
298 s=args.getOpt('3');
299 if (!s.is_empty()) {
300 if (fileAdapters)
301 GError("Error: options -3 and -f cannot be used together!\n");
302 s.upper();
303 addAdapter(adapters3, s);
304 }
305 s=args.getOpt('y');
306 if (!s.is_empty()) {
307 int minmatch=s.asInt();
308 poly_minScore=minmatch*poly_m_score;
309 }
310
311 if (args.getOpt('o')!=NULL) outsuffix=args.getOpt('o');
312 else outsuffix="-";
313 trashReport= (args.getOpt('r')!=NULL);
314 int fcount=args.startNonOpt();
315 if (fcount==0) {
316 GMessage(USAGE);
317 exit(224);
318 }
319 if (fcount>1 && doCollapse) {
320 GError("%s Sorry, the -C option only works with a single input.\n", USAGE);
321 }
322 //openfw(f_out, args, 'o');
323 //if (f_out==NULL) f_out=stdout;
324 if (trashReport)
325 openfw(freport, args, 'r');
326 char* infile=NULL;
327
328 if (adapters5.Count()>0)
329 //gxmem_l=new CGreedyAlignData(match_reward, mismatch_penalty, Xdrop-2);
330 gxmem_l=new CGreedyAlignData(match_reward, mismatch_penalty, Xdrop);
331 if (adapters3.Count()>0)
332 gxmem_r=new CGreedyAlignData(match_reward, mismatch_penalty, Xdrop);
333
334 while ((infile=args.nextNonOpt())!=NULL) {
335 //for each input file
336 int incounter=0; //counter for input reads
337 int outcounter=0; //counter for output reads
338 int trash_s=0; //too short from the get go
339 int trash_Q=0;
340 int trash_N=0;
341 int trash_D=0;
342 int trash_poly=0;
343 int trash_A3=0;
344 int trash_A5=0;
345 s=infile;
346 GStr infname;
347 GStr infname2;
348 FILE* f_in=NULL;
349 FILE* f_in2=NULL;
350 FILE* f_out=NULL;
351 FILE* f_out2=NULL;
352 bool paired_reads=false;
353 setupFiles(f_in, f_in2, f_out, f_out2, s, infname, infname2);
354 GLineReader fq(f_in);
355 GLineReader* fq2=NULL;
356 if (f_in2!=NULL) {
357 fq2=new GLineReader(f_in2);
358 paired_reads=true;
359 }
360 GStr rseq, rqv, seqid, seqinfo;
361 GStr rseq2, rqv2, seqid2, seqinfo2;
362 while (getFastxRec(fq, rseq, rqv, seqid, seqinfo, infname)) {
363 incounter++;
364 int a5=0, a3=0, b5=0, b3=0;
365 char tcode=0, tcode2=0;
366 tcode=process_read(seqid, rseq, rqv, a5, a3);
367 if (fq2!=NULL) {
368 getFastxRec(*fq2, rseq2, rqv2, seqid2, seqinfo2, infname2);
369 if (seqid.substr(0,seqid.length()-1)!=seqid2.substr(0,seqid2.length()-1)) {
370 GError("Error: no paired match for read %s vs %s (%s,%s)\n",
371 seqid.chars(), seqid2.chars(), infname.chars(), infname2.chars());
372 }
373 tcode2=process_read(seqid2, rseq2, rqv2, b5, b3);
374 //decide what to do with this pair and print rseq2 if the pair makes it
375 if (tcode>1 && tcode2<=1) {
376 //"untrash" rseq
377 if (a3-a5+1<min_read_len) {
378 a5=1;
379 if (a3<min_read_len) { a3= GMIN(rseq.length()-1, min_read_len+1); }
380 }
381 tcode=1;
382 }
383 else if (tcode<=1 && tcode2>1) {
384 //"untrash" rseq2
385 if (b3-b5+1<min_read_len) {
386 b5=1;
387 if (b3<min_read_len) { b3= GMIN((rseq2.length()-1),(min_read_len+1)); }
388 }
389 tcode2=1;
390 }
391 if (tcode<=1) { //trimmed or left intact -- write it!
392 if (tcode>0) {
393 rseq2=rseq2.substr(b5,b3-b5+1);
394 if (!rqv2.is_empty()) rqv2=rqv2.substr(b5,b3-b5+1);
395 }
396 int nocounter=0;
397 writeRead(f_out2, seqid2, seqinfo2, rseq2, rqv2, nocounter);
398 }
399 } //pair read
400 if (tcode>1) { //trashed
401 #ifdef GDEBUG
402 GMessage(" !!!!TRASH code = %c\n",tcode);
403 #endif
404 if (tcode=='s') trash_s++;
405 else if (tcode=='A' || tcode=='T') trash_poly++;
406 else if (tcode=='Q') trash_Q++;
407 else if (tcode=='N') trash_N++;
408 else if (tcode=='D') trash_D++;
409 else if (tcode=='3') trash_A3++;
410 else if (tcode=='5') trash_A5++;
411 if (trashReport) trash_report(tcode, seqid, freport);
412 }
413 else if (!doCollapse) { //write it
414 if (tcode>0) {
415 rseq=rseq.substr(a5,a3-a5+1);
416 if (!rqv.is_empty()) rqv=rqv.substr(a5,a3-a5+1);
417 }
418 #ifdef GDEBUG
419 GMessage(" After trimming:\n");
420 GMessage("%s\n",rseq.chars());
421 #endif
422 writeRead(f_out, seqid, seqinfo, rseq, rqv, outcounter);
423 }
424 } //for each fastq record
425 delete fq2;
426 FRCLOSE(f_in);
427 FRCLOSE(f_in2);
428 if (doCollapse) {
429 outcounter=0;
430 int maxdup_count=1;
431 char* maxdup_seq=NULL;
432 dhash.startIterate();
433 FqDupRec* qd=NULL;
434 char* seq=NULL;
435 while ((qd=dhash.NextData(seq))!=NULL) {
436 GStr rseq(seq);
437 //do the dusting here
438 if (doDust) {
439 int dustbases=dust(rseq);
440 if (dustbases>(rseq.length()>>1)) {
441 if (trashReport && qd->firstname!=NULL) {
442 fprintf(freport, "%s_x%d\tD\n",qd->firstname, qd->count);
443 }
444 trash_D+=qd->count;
445 continue;
446 }
447 }
448 outcounter++;
449 if (qd->count>maxdup_count) {
450 maxdup_count=qd->count;
451 maxdup_seq=seq;
452 }
453 if (isfasta) {
454 if (prefix.is_empty()) {
455 fprintf(f_out, ">%s_x%d\n%s\n", qd->firstname, qd->count,
456 rseq.chars());
457 }
458 else { //use custom read name
459 fprintf(f_out, ">%s%08d_x%d\n%s\n", prefix.chars(), outcounter,
460 qd->count, rseq.chars());
461 }
462 }
463 else { //fastq format
464 if (convert_phred) convertPhred(qd->qv, qd->len);
465 if (prefix.is_empty()) {
466 fprintf(f_out, "@%s_x%d\n%s\n+\n%s\n", qd->firstname, qd->count,
467 rseq.chars(), qd->qv);
468 }
469 else { //use custom read name
470 fprintf(f_out, "@%s%08d_x%d\n%s\n+\n%s\n", prefix.chars(), outcounter,
471 qd->count, rseq.chars(), qd->qv);
472 }
473 }
474 }//for each element of dhash
475 if (maxdup_count>1) {
476 GMessage("Maximum read multiplicity: x %d (read: %s)\n",maxdup_count, maxdup_seq);
477 }
478 } //collapse entries
479 if (verbose) {
480 if (paired_reads) {
481 GMessage(">Input files : %s , %s\n", infname.chars(), infname2.chars());
482 GMessage("Number of input pairs :%9d\n", incounter);
483 GMessage(" Output pairs :%9d\n", outcounter);
484 }
485 else {
486 GMessage(">Input file : %s\n", infname.chars());
487 GMessage("Number of input reads :%9d\n", incounter);
488 GMessage(" Output reads :%9d\n", outcounter);
489 }
490 GMessage("------------------------------------\n");
491 if (num_trimmed5)
492 GMessage(" 5' trimmed :%9d (min. trim: %d)\n", num_trimmed5, min_trimmed5);
493 if (num_trimmed3)
494 GMessage(" 3' trimmed :%9d (min. trim: %d)\n", num_trimmed3, min_trimmed3);
495 GMessage("------------------------------------\n");
496 if (trash_s>0)
497 GMessage(" Trashed by length:%9d\n", trash_s);
498 if (trash_N>0)
499 GMessage(" Trashed by N%%:%9d\n", trash_N);
500 if (trash_Q>0)
501 GMessage("Trashed by low quality:%9d\n", trash_Q);
502 if (trash_poly>0)
503 GMessage(" Trashed by poly-A/T:%9d\n", trash_poly);
504 if (trash_A5>0)
505 GMessage(" Trashed by 5' adapter:%9d\n", trash_A5);
506 if (trash_A3>0)
507 GMessage(" Trashed by 3' adapter:%9d\n", trash_A3);
508 }
509 if (trashReport) {
510 FWCLOSE(freport);
511 }
512 FWCLOSE(f_out);
513 FWCLOSE(f_out2);
514 } //while each input file
515 delete gxmem_l;
516 delete gxmem_r;
517 //getc(stdin);
518 }
519
520 class NData {
521 public:
522 int NPos[1024]; //there should be no reads longer than 1K ?
523 int NCount;
524 int end5;
525 int end3;
526 int seqlen;
527 double perc_N; //percentage of Ns in end5..end3 range only!
528 const char* seq;
529 bool valid;
530 NData() {
531 seqlen=0;
532 NCount=0;
533 end5=0;
534 end3=0;
535 seq=NULL;
536 perc_N=0;
537 valid=true;
538 }
539 void init(GStr& rseq) {
540 NCount=0;
541 perc_N=0;
542 seqlen=rseq.length();
543 seq=rseq.chars();
544 end5=1;
545 end3=seqlen;
546 for (int i=0;i<seqlen;i++)
547 if (seq[i]=='N') {// if (!ichrInStr(rseq[i], "ACGT")
548 NPos[NCount]=i;
549 NCount++;
550 }
551 perc_N=(NCount*100.0)/seqlen;
552 valid=true;
553 }
554 void N_calc() { //only in the region after trimming
555 int n=0;
556 for (int i=end3-1;i<end5;i++) {
557 if (seq[i]=='N') n++;
558 }
559 perc_N=(n*100.0)/(end5-end3+1);
560 }
561 };
562
563 static NData feat;
564 int perc_lenN=12; // incremental distance from ends, in percentage of
565 // sequence length, where N-trimming is done (default:12 %) (autolimited to 20)
566
567 void N_analyze(int l5, int l3, int p5, int p3) {
568 /* assumes feat was filled properly */
569 int old_dif, t5,t3,v;
570 if (l3<l5+2 || p5>p3 ) {
571 feat.end5=l5+1;
572 feat.end3=l3+1;
573 return;
574 }
575
576 t5=feat.NPos[p5]-l5;
577 t3=l3-feat.NPos[p3];
578 old_dif=p3-p5;
579 v=(int)((((double)(l3-l5))*perc_lenN)/100);
580 if (v>20) v=20; /* enforce N-search limit for very long reads */
581 if (t5 < v ) {
582 l5=feat.NPos[p5]+1;
583 p5++;
584 }
585 if (t3 < v) {
586 l3=feat.NPos[p3]-1;
587 p3--;
588 }
589 /* restNs=p3-p5; number of Ns in the new CLR */
590 if (p3-p5==old_dif) { /* no change, return */
591 /* don't trim if this may shorten the read too much */
592 //if (l5-l3<min_read_len) return;
593 feat.end5=l5+1;
594 feat.end3=l3+1;
595 return;
596 }
597 else
598 N_analyze(l5,l3, p5,p3);
599 }
600
601
602 bool qtrim(GStr& qvs, int &l5, int &l3) {
603 if (qvtrim_qmin==0 || qvs.is_empty()) return false;
604 if (qv_phredtype==0) {
605 //try to guess the Phred type
606 int vmin=256, vmax=0;
607 for (int i=0;i<qvs.length();i++) {
608 if (vmin>qvs[i]) vmin=qvs[i];
609 if (vmax<qvs[i]) vmax=qvs[i];
610 }
611 if (vmin<64) { qv_phredtype=33; qv_cvtadd=31; }
612 if (vmax>95) { qv_phredtype=64; qv_cvtadd=-31; }
613 if (qv_phredtype==0) {
614 GError("Error: couldn't determine Phred type, please use the -p33 or -p64 !\n");
615 }
616 if (verbose)
617 GMessage("Input reads have Phred-%d quality values.\n", (qv_phredtype==33 ? 33 : 64));
618 } //guessing Phred type
619 for (l3=qvs.length()-1;l3>2;l3--) {
620 if (qvs[l3]-qv_phredtype>=qvtrim_qmin && qvs[l3-1]-qv_phredtype>=qvtrim_qmin) break;
621 }
622 //just in case, check also the 5' the end (?)
623 for (l5=0;l5<qvs.length()-3;l5++) {
624 if (qvs[l5]-qv_phredtype>=qvtrim_qmin && qvs[l5+1]-qv_phredtype>=qvtrim_qmin) break;
625 }
626 if (qvtrim_max>0) {
627 if (qvs.length()-1-l3>qvtrim_max) l3=qvs.length()-1-qvtrim_max;
628 if (l5>qvtrim_max) l5=qvtrim_max;
629 }
630 return (l5>0 || l3<qvs.length()-1);
631 }
632
633 bool ntrim(GStr& rseq, int &l5, int &l3) {
634 //count Ns in the sequence, trim N-rich ends
635 feat.init(rseq);
636 l5=feat.end5-1;
637 l3=feat.end3-1;
638 N_analyze(feat.end5-1, feat.end3-1, 0, feat.NCount-1);
639 if (l5==feat.end5-1 && l3==feat.end3-1) {
640 if (feat.perc_N>max_perc_N) {
641 feat.valid=false;
642 l3=l5+1;
643 return true;
644 }
645 else {
646 return false; //nothing changed
647 }
648 }
649 l5=feat.end5-1;
650 l3=feat.end3-1;
651 if (l3-l5+1<min_read_len) {
652 feat.valid=false;
653 return true;
654 }
655 feat.N_calc();
656
657 if (feat.perc_N>max_perc_N) {
658 feat.valid=false;
659 l3=l5+1;
660 return true;
661 }
662 return true;
663 }
664
665 //--------------- dust functions ----------------
666 class DNADuster {
667 public:
668 int dustword;
669 int dustwindow;
670 int dustwindow2;
671 int dustcutoff;
672 int mv, iv, jv;
673 int counts[32*32*32];
674 int iis[32*32*32];
675 DNADuster(int cutoff=16, int winsize=32, int wordsize=3) {
676 dustword=wordsize;
677 dustwindow=winsize;
678 dustwindow2 = (winsize>>1);
679 dustcutoff=cutoff;
680 mv=0;
681 iv=0;
682 jv=0;
683 }
684 void setWindowSize(int value) {
685 dustwindow = value;
686 dustwindow2 = (dustwindow >> 1);
687 }
688 void setWordSize(int value) {
689 dustword=value;
690 }
691 void wo1(int len, const char* s, int ivv) {
692 int i, ii, j, v, t, n, n1, sum;
693 int js, nis;
694 n = 32 * 32 * 32;
695 n1 = n - 1;
696 nis = 0;
697 i = 0;
698 ii = 0;
699 sum = 0;
700 v = 0;
701 for (j=0; j < len; j++, s++) {
702 ii <<= 5;
703 if (*s<=32) {
704 i=0;
705 continue;
706 }
707 ii |= *s - 'A'; //assume uppercase!
708 ii &= n1;
709 i++;
710 if (i >= dustword) {
711 for (js=0; js < nis && iis[js] != ii; js++) ;
712 if (js == nis) {
713 iis[nis] = ii;
714 counts[ii] = 0;
715 nis++;
716 }
717 if ((t = counts[ii]) > 0) {
718 sum += t;
719 v = 10 * sum / j;
720 if (mv < v) {
721 mv = v;
722 iv = ivv;
723 jv = j;
724 }
725 }
726 counts[ii]++;
727 }
728 }
729 }
730
731 int wo(int len, const char* s, int* beg, int* end) {
732 int i, l1;
733 l1 = len - dustword + 1;
734 if (l1 < 0) {
735 *beg = 0;
736 *end = len - 1;
737 return 0;
738 }
739 mv = 0;
740 iv = 0;
741 jv = 0;
742 for (i=0; i < l1; i++) {
743 wo1(len-i, s+i, i);
744 }
745 *beg = iv;
746 *end = iv + jv;
747 return mv;
748 }
749
750 void dust(const char* seq, char* seqmsk, int seqlen, int cutoff=0) { //, maskFunc maskfn) {
751 int i, j, l, a, b, v;
752 if (cutoff==0) cutoff=dustcutoff;
753 a=0;b=0;
754 //GMessage("Dust cutoff=%d\n", cutoff);
755 for (i=0; i < seqlen; i += dustwindow2) {
756 l = (seqlen > i+dustwindow) ? dustwindow : seqlen-i;
757 v = wo(l, seq+i, &a, &b);
758 if (v > cutoff) {
759 //for (j = a; j <= b && j < dustwindow2; j++) {
760 for (j = a; j <= b; j++) {
761 seqmsk[i+j]='N';//could be made lowercase instead
762 }
763 }
764 }
765 //return first;
766 }
767 };
768
769 static DNADuster duster;
770
771 int dust(GStr& rseq) {
772 char* seq=Gstrdup(rseq.chars());
773 duster.dust(rseq.chars(), seq, rseq.length(), dust_cutoff);
774 //check the number of Ns:
775 int ncount=0;
776 for (int i=0;i<rseq.length();i++) {
777 if (seq[i]=='N') ncount++;
778 }
779 GFREE(seq);
780 return ncount;
781 }
782
783 struct SLocScore {
784 int pos;
785 int score;
786 SLocScore(int p=0,int s=0) {
787 pos=p;
788 score=s;
789 }
790 void set(int p, int s) {
791 pos=p;
792 score=s;
793 }
794 void add(int p, int add) {
795 pos=p;
796 score+=add;
797 }
798 };
799
800 bool trim_poly3(GStr &seq, int &l5, int &l3, const char* poly_seed) {
801 if (!doPolyTrim) return false;
802 int rlen=seq.length();
803 l5=0;
804 l3=rlen-1;
805 int32 seedVal=*(int32*)poly_seed;
806 char polyChar=poly_seed[0];
807 //assumes N trimming was already done
808 //so a poly match should be very close to the end of the read
809 // -- find the initial match (seed)
810 int lmin=GMAX((rlen-16), 0);
811 int li;
812 for (li=rlen-4;li>lmin;li--) {
813 if (seedVal==*(int*)&(seq[li])) {
814 break;
815 }
816 }
817 if (li<=lmin) return false;
818 //seed found, try to extend it both ways
819 //extend right
820 int ri=li+3;
821 SLocScore loc(ri, poly_m_score<<2);
822 SLocScore maxloc(loc);
823 //extend right
824 while (ri<rlen-1) {
825 ri++;
826 if (seq[ri]==polyChar) {
827 loc.add(ri,poly_m_score);
828 }
829 else if (seq[ri]=='N') {
830 loc.add(ri,0);
831 }
832 else { //mismatch
833 loc.add(ri,poly_mis_score);
834 if (maxloc.score-loc.score>poly_dropoff_score) break;
835 }
836 if (maxloc.score<=loc.score) {
837 maxloc=loc;
838 }
839 }
840 ri=maxloc.pos;
841 if (ri<rlen-6) return false; //no trimming wanted, too far from 3' end
842 //ri = right boundary for the poly match
843 //extend left
844 loc.set(li, maxloc.score);
845 maxloc.pos=li;
846 while (li>0) {
847 li--;
848 if (seq[li]==polyChar) {
849 loc.add(li,poly_m_score);
850 }
851 else if (seq[li]=='N') {
852 loc.add(li,0);
853 }
854 else { //mismatch
855 loc.add(li,poly_mis_score);
856 if (maxloc.score-loc.score>poly_dropoff_score) break;
857 }
858 if (maxloc.score<=loc.score) {
859 maxloc=loc;
860 }
861 }
862 li=maxloc.pos;
863 if ((maxloc.score==poly_minScore && ri==rlen-1) ||
864 (maxloc.score>poly_minScore && ri>=rlen-3) ||
865 (maxloc.score>(poly_minScore*3) && ri>=rlen-8)) {
866 //trimming this li-ri match at 3' end
867 l3=li-1;
868 if (l3<0) l3=0;
869 return true;
870 }
871 return false;
872 }
873
874 bool trim_poly5(GStr &seq, int &l5, int &l3, const char* poly_seed) {
875 if (!doPolyTrim) return false;
876 int rlen=seq.length();
877 l5=0;
878 l3=rlen-1;
879 int32 seedVal=*(int32*)poly_seed;
880 char polyChar=poly_seed[0];
881 //assumes N trimming was already done
882 //so a poly match should be very close to the end of the read
883 // -- find the initial match (seed)
884 int lmax=GMIN(12, rlen-4);//how far from 5' end to look for 4-mer seeds
885 int li;
886 for (li=0;li<=lmax;li++) {
887 if (seedVal==*(int*)&(seq[li])) {
888 break;
889 }
890 }
891 if (li>lmax) return false;
892 //seed found, try to extend it both ways
893 //extend left
894 int ri=li+3; //save rightmost base of the seed
895 SLocScore loc(li, poly_m_score<<2);
896 SLocScore maxloc(loc);
897 while (li>0) {
898 li--;
899 if (seq[li]==polyChar) {
900 loc.add(li,poly_m_score);
901 }
902 else if (seq[li]=='N') {
903 loc.add(li,0);
904 }
905 else { //mismatch
906 loc.add(li,poly_mis_score);
907 if (maxloc.score-loc.score>poly_dropoff_score) break;
908 }
909 if (maxloc.score<=loc.score) {
910 maxloc=loc;
911 }
912 }
913 li=maxloc.pos;
914 if (li>5) return false; //no trimming wanted, too far from 5' end
915 //li = right boundary for the poly match
916
917 //extend right
918 loc.set(ri, maxloc.score);
919 maxloc.pos=ri;
920 while (ri<rlen-1) {
921 ri++;
922 if (seq[ri]==polyChar) {
923 loc.add(ri,poly_m_score);
924 }
925 else if (seq[ri]=='N') {
926 loc.add(ri,0);
927 }
928 else { //mismatch
929 loc.add(ri,poly_mis_score);
930 if (maxloc.score-loc.score>poly_dropoff_score) break;
931 }
932 if (maxloc.score<=loc.score) {
933 maxloc=loc;
934 }
935 }
936 ri=maxloc.pos;
937 if ((maxloc.score==poly_minScore && li==0) ||
938 (maxloc.score>poly_minScore && li<2)
939 || (maxloc.score>(poly_minScore*3) && li<8)) {
940 //adjust l5 to reflect this trimming of 5' end
941 l5=ri+1;
942 if (l5>rlen-1) l5=rlen-1;
943 return true;
944 }
945 return false;
946 }
947
948 bool trim_adapter3(GStr& seq, int&l5, int &l3) {
949 if (adapters3.Count()==0) return false;
950 int rlen=seq.length();
951 l5=0;
952 l3=rlen-1;
953 bool trimmed=false;
954 GStr wseq(seq);
955 int wlen=rlen;
956 GXSeqData seqdata;
957 for (int ai=0;ai<adapters3.Count();ai++) {
958 seqdata.update(adapters3[ai].seq.chars(), adapters3[ai].seq.length(),
959 adapters3[ai].pz, wseq.chars(), wlen, adapters3[ai].amlen);
960 GXAlnInfo* bestaln=match_RightEnd(seqdata, gxmem_r, 86);
961 if (bestaln) {
962 trimmed=true;
963 //keep unmatched region on the left OR right (the longer one)
964 if (bestaln->sl > wlen-bestaln->sr) {
965 //keep left side
966 l3-=(wlen-bestaln->sl+1);
967 if (l3<0) l3=0;
968 }
969 else { //keep right side
970 l5+=bestaln->sr;
971 if (l5>=rlen) l5=rlen-1;
972 }
973 delete bestaln;
974 if (l3-l5+1<min_read_len) return true;
975 wseq=seq.substr(l5,l3-l5+1);
976 wlen=wseq.length();
977 }
978 }//for each 5' adapter
979 return trimmed;
980 }
981
982 bool trim_adapter5(GStr& seq, int&l5, int &l3) {
983 if (adapters5.Count()==0) return false;
984 int rlen=seq.length();
985 l5=0;
986 l3=rlen-1;
987 bool trimmed=false;
988 GStr wseq(seq);
989 int wlen=rlen;
990 GXSeqData seqdata;
991 for (int ai=0;ai<adapters5.Count();ai++) {
992 seqdata.update(adapters5[ai].seq.chars(), adapters5[ai].seq.length(),
993 adapters5[ai].pz, wseq.chars(), wlen, adapters5[ai].amlen);
994 GXAlnInfo* bestaln=match_LeftEnd(seqdata, gxmem_l, 90);
995 if (bestaln) {
996 trimmed=true;
997 if (bestaln->sl-1 > wlen-bestaln->sr) {
998 //keep left side
999 l3-=(wlen-bestaln->sl+1);
1000 if (l3<0) l3=0;
1001 }
1002 else { //keep right side
1003 l5+=bestaln->sr;
1004 if (l5>=rlen) l5=rlen-1;
1005 }
1006 delete bestaln;
1007 if (l3-l5+1<min_read_len) return true;
1008 wseq=seq.substr(l5,l3-l5+1);
1009 wlen=wseq.length();
1010 }
1011 }//for each 5' adapter
1012 return trimmed;
1013 }
1014
1015 //convert qvs to/from phred64 from/to phread33
1016 void convertPhred(GStr& q) {
1017 for (int i=0;i<q.length();i++) q[i]+=qv_cvtadd;
1018 }
1019
1020 void convertPhred(char* q, int len) {
1021 for (int i=0;i<len;i++) q[i]+=qv_cvtadd;
1022 }
1023
1024 bool getFastxRec(GLineReader& fq, GStr& rseq, GStr& rqv,
1025 GStr& rname, GStr& rinfo, GStr& infname) {
1026 rseq="";
1027 rqv="";
1028 rname="";
1029 rinfo="";
1030 if (fq.eof()) return false;
1031 char* l=fq.getLine();
1032 while (l!=NULL && (l[0]==0 || isspace(l[0]))) l=fq.getLine(); //ignore empty lines
1033 if (l==NULL) return false;
1034 /* if (rawFormat) {
1035 //TODO: implement raw qseq parsing here
1036 //if (raw type=N) then continue; //skip invalid/bad records
1037 } //raw qseq format
1038 else { // FASTQ or FASTA */
1039 isfasta=(l[0]=='>');
1040 if (!isfasta && l[0]!='@') GError("Error: fasta/fastq record marker not found(%s)\n%s\n",
1041 infname.chars(), l);
1042 GStr s(l);
1043 rname=&(l[1]);
1044 for (int i=0;i<rname.length();i++)
1045 if (rname[i]<=' ') {
1046 if (i<rname.length()-2) rinfo=rname.substr(i+1);
1047 rname.cut(i);
1048 break;
1049 }
1050 //now get the sequence
1051 if ((l=fq.getLine())==NULL)
1052 GError("Error: unexpected EOF after header for read %s (%s)\n",
1053 rname.chars(), infname.chars());
1054 rseq=l; //this must be the DNA line
1055 while ((l=fq.getLine())!=NULL) {
1056 //seq can span multiple lines
1057 if (l[0]=='>' || l[0]=='+') {
1058 fq.pushBack();
1059 break; //
1060 }
1061 rseq+=l;
1062 } //check for multi-line seq
1063 if (!isfasta) { //reading fastq quality values, which can also be multi-line
1064 if ((l=fq.getLine())==NULL)
1065 GError("Error: unexpected EOF after sequence for %s\n", rname.chars());
1066 if (l[0]!='+') GError("Error: fastq qv header marker not detected!\n");
1067 if ((l=fq.getLine())==NULL)
1068 GError("Error: unexpected EOF after qv header for %s\n", rname.chars());
1069 rqv=l;
1070 //if (rqv.length()!=rseq.length())
1071 // GError("Error: qv len != seq len for %s\n", rname.chars());
1072 while (rqv.length()<rseq.length() && ((l=fq.getLine())!=NULL)) {
1073 rqv+=l; //append to qv string
1074 }
1075 }// fastq
1076 // } //<-- FASTA or FASTQ
1077 rseq.upper();
1078 return true;
1079 }
1080
1081 #ifdef GDEBUG
1082 void showTrim(GStr& s, int l5, int l3) {
1083 if (l5>0 || l3==0) {
1084 color_bg(c_red);
1085 }
1086 for (int i=0;i<s.length()-1;i++) {
1087 if (i && i==l5) color_resetbg();
1088 fprintf(stderr, "%c", s[i]);
1089 if (i && i==l3) color_bg(c_red);
1090 }
1091 fprintf(stderr, "%c", s[s.length()-1]);
1092 color_reset();
1093 fprintf(stderr, "\n");
1094 }
1095 #endif
1096
1097 char process_read(GStr& rname, GStr& rseq, GStr& rqv, int &l5, int &l3) {
1098 //returns 0 if the read was untouched, 1 if it was just trimmed
1099 // and a trash code if it was trashed
1100 l5=0;
1101 l3=rseq.length()-1;
1102 #ifdef GDEBUG
1103 //rseq.reverse();
1104 GMessage(">%s\n", rname.chars());
1105 GMessage("%s\n",rseq.chars());
1106 #endif
1107 if (l3-l5+1<min_read_len) {
1108 return 's';
1109 }
1110 GStr wseq(rseq.chars());
1111 GStr wqv(rqv.chars());
1112 int w5=l5;
1113 int w3=l3;
1114 //first do the q-based trimming
1115 if (qvtrim_qmin!=0 && !wqv.is_empty() && qtrim(wqv, w5, w3)) { // qv-threshold trimming
1116 if (w3-w5+1<min_read_len) {
1117 return 'Q'; //invalid read
1118 }
1119 //-- keep only the w5..w3 range
1120 l5=w5;
1121 l3=w3;
1122 wseq=wseq.substr(w5, w3-w5+1);
1123 if (!wqv.is_empty())
1124 wqv=wqv.substr(w5, w3-w5+1);
1125 } //qv trimming
1126 // N-trimming on the remaining read seq
1127 if (ntrim(wseq, w5, w3)) {
1128 //GMessage("before: %s\n",wseq.chars());
1129 //GMessage("after : %s (%d)\n",wseq.substr(w5,w3-w5+1).chars(),w3-w5+1);
1130 l5+=w5;
1131 l3-=(wseq.length()-1-w3);
1132 if (w3-w5+1<min_read_len) {
1133 return 'N'; //to be trashed
1134 }
1135 //-- keep only the w5..w3 range
1136 wseq=wseq.substr(w5, w3-w5+1);
1137 if (!wqv.is_empty())
1138 wqv=wqv.substr(w5, w3-w5+1);
1139 w5=0;
1140 w3=wseq.length()-1;
1141 }
1142 char trim_code;
1143 do {
1144 trim_code=0;
1145 if (trim_poly5(wseq, w5, w3, polyA_seed)) {
1146 trim_code='A';
1147 }
1148 else if (trim_poly5(wseq, w5, w3, polyT_seed)) {
1149 trim_code='T';
1150 }
1151 else if (trim_adapter5(wseq, w5, w3)) {
1152 trim_code='5';
1153 }
1154 if (trim_code) {
1155 #ifdef GDEBUG
1156 GMessage("#### TRIM by '%c' code ( w5-w3 = %d-%d ):\n",trim_code, w5,w3);
1157 showTrim(wseq, w5, w3);
1158 #endif
1159 int trimlen=wseq.length()-(w3-w5+1);
1160 num_trimmed5++;
1161 if (trimlen<min_trimmed5)
1162 min_trimmed5=trimlen;
1163 l5+=w5;
1164 l3-=(wseq.length()-1-w3);
1165 if (w3-w5+1<min_read_len) {
1166 return trim_code;
1167 }
1168 //-- keep only the w5..w3 range
1169 wseq=wseq.substr(w5, w3-w5+1);
1170 if (!wqv.is_empty())
1171 wqv=wqv.substr(w5, w3-w5+1);
1172 }// trimmed at 5' end
1173 } while (trim_code);
1174
1175 do {
1176 trim_code=0;
1177 if (trim_poly3(wseq, w5, w3, polyA_seed)) {
1178 trim_code='A';
1179 }
1180 else if (trim_poly3(wseq, w5, w3, polyT_seed)) {
1181 trim_code='T';
1182 }
1183 else if (trim_adapter3(wseq, w5, w3)) {
1184 trim_code='3';
1185 }
1186 if (trim_code) {
1187 #ifdef GDEBUG
1188 GMessage("#### TRIM by '%c' code ( w5-w3 = %d-%d ):\n",trim_code, w5,w3);
1189 showTrim(wseq, w5, w3);
1190 #endif
1191 int trimlen=wseq.length()-(w3-w5+1);
1192 num_trimmed3++;
1193 if (trimlen<min_trimmed3)
1194 min_trimmed3=trimlen;
1195 l5+=w5;
1196 l3-=(wseq.length()-1-w3);
1197 if (w3-w5+1<min_read_len) {
1198 return trim_code;
1199 }
1200 //-- keep only the w5..w3 range
1201 wseq=wseq.substr(w5, w3-w5+1);
1202 if (!wqv.is_empty())
1203 wqv=wqv.substr(w5, w3-w5+1);
1204 }//trimming at 3' end
1205 } while (trim_code);
1206
1207
1208 if (doCollapse) {
1209 //keep read for later
1210 FqDupRec* dr=dhash.Find(wseq.chars());
1211 if (dr==NULL) { //new entry
1212 //if (prefix.is_empty())
1213 dhash.Add(wseq.chars(),
1214 new FqDupRec(&wqv, rname.chars()));
1215 //else dhash.Add(wseq.chars(), new FqDupRec(wqv.chars(),wqv.length()));
1216 }
1217 else
1218 dr->add(wqv);
1219 } //collapsing duplicates
1220 else { //not collapsing duplicates
1221 //apply the dust filter now
1222 if (doDust) {
1223 int dustbases=dust(wseq);
1224 if (dustbases>(wseq.length()>>1)) {
1225 return 'D';
1226 }
1227 }
1228 } //not collapsing duplicates
1229 return (l5>0 || l3<rseq.length()-1) ? 1 : 0;
1230 }
1231
1232 void printHeader(FILE* f_out, char recmarker, GStr& rname, GStr& rinfo) {
1233 //GMessage("printing Header..%c%s\n",recmarker, rname.chars());
1234 if (rinfo.is_empty()) fprintf(f_out, "%c%s\n",recmarker,rname.chars());
1235 else fprintf(f_out, "%c%s %s\n",recmarker, rname.chars(), rinfo.chars());
1236 }
1237
1238 void writeRead(FILE* f_out, GStr& rname, GStr& rinfo, GStr& rseq, GStr& rqv, int& outcounter) {
1239 outcounter++;
1240 bool asFasta=(rqv.is_empty() || fastaOutput);
1241 if (asFasta) {
1242 if (prefix.is_empty()) {
1243 printHeader(f_out, '>',rname,rinfo);
1244 fprintf(f_out, "%s\n", rseq.chars()); //plain one-line fasta for now
1245 }
1246 else {
1247 fprintf(f_out, ">%s%08d\n%s\n", prefix.chars(), outcounter,
1248 rseq.chars());
1249 }
1250 }
1251 else { //fastq
1252 if (convert_phred) convertPhred(rqv);
1253 if (prefix.is_empty()) {
1254 printHeader(f_out, '@', rname, rinfo);
1255 fprintf(f_out, "%s\n+\n%s\n", rseq.chars(), rqv.chars());
1256 }
1257 else
1258 fprintf(f_out, "@%s_%08d\n%s\n+\n%s\n", prefix.chars(), outcounter,
1259 rseq.chars(),rqv.chars() );
1260 }
1261 }
1262
1263 void trash_report(char trashcode, GStr& rname, FILE* freport) {
1264 if (freport==NULL || trashcode<=' ') return;
1265 if (trashcode=='3' || trashcode=='5') {
1266 fprintf(freport, "%s\ta%c\n",rname.chars(),trashcode);
1267 }
1268 else {
1269 fprintf(freport, "%s\t%c\n",rname.chars(),trashcode);
1270 }
1271 //tcounter++;
1272 }
1273
1274 GStr getFext(GStr& s, int* xpos=NULL) {
1275 //s must be a filename without a path
1276 GStr r("");
1277 if (xpos!=NULL) *xpos=0;
1278 if (s.is_empty() || s=="-") return r;
1279 int p=s.rindex('.');
1280 int d=s.rindex('/');
1281 if (p<=0 || p>s.length()-2 || p<s.length()-7 || p<d) return r;
1282 r=s.substr(p+1);
1283 if (xpos!=NULL) *xpos=p+1;
1284 r.lower();
1285 return r;
1286 }
1287
1288 void baseFileName(GStr& fname) {
1289 //remove all known extensions, like .txt,fq,fastq,fasta,fa)(.gz .gzip .bz2 .bzip2) .
1290 int xpos=0;
1291 GStr fext=getFext(fname, &xpos);
1292 if (xpos<=1) return;
1293 bool extdel=false;
1294 GStr f2;
1295 if (fext=="z" || fext=="zip") {
1296 extdel=true;
1297 }
1298 else if (fext.length()>=2) {
1299 f2=fext.substr(0,2);
1300 extdel=(f2=="gz" || f2=="bz");
1301 }
1302 if (extdel) {
1303 fname.cut(xpos-1);
1304 fext=getFext(fname, &xpos);
1305 if (xpos<=1) return;
1306 }
1307 extdel=false;
1308 if (fext=="f" || fext=="fq" || fext=="txt" || fext=="seq" || fext=="sequence") {
1309 extdel=true;
1310 }
1311 else if (fext.length()>=2) {
1312 extdel=(fext.substr(0,2)=="fa");
1313 }
1314 if (extdel) fname.cut(xpos-1);
1315 GStr fncp(fname);
1316 fncp.lower();
1317 fncp.chomp("seq");
1318 fncp.chomp("sequence");
1319 fncp.trimR("_.");
1320 if (fncp.length()<fname.length()) fname.cut(fncp.length());
1321 }
1322
1323 FILE* prepOutFile(GStr& infname, GStr& pocmd) {
1324 FILE* f_out=NULL;
1325 GStr fname(getFileName(infname.chars()));
1326 //eliminate known extensions
1327 baseFileName(fname);
1328 if (outsuffix.is_empty() || outsuffix=="-") { return stdout; }
1329 else if (pocmd.is_empty()) {
1330 GStr oname(fname);
1331 oname.append('.');
1332 oname.append(outsuffix);
1333 f_out=fopen(oname.chars(),"w");
1334 if (f_out==NULL) GError("Error: cannot create '%s'\n",oname.chars());
1335 }
1336 else {
1337 GStr oname(">");
1338 oname.append(fname);
1339 oname.append('.');
1340 oname.append(outsuffix);
1341 pocmd.append(oname);
1342 f_out=popen(pocmd.chars(), "w");
1343 if (f_out==NULL) GError("Error: cannot popen '%s'\n",pocmd.chars());
1344 }
1345 return f_out;
1346 }
1347
1348 void guess_unzip(GStr& fname, GStr& picmd) {
1349 GStr fext=getFext(fname);
1350 if (fext=="gz" || fext=="gzip" || fext=="z") {
1351 picmd="gzip -cd ";
1352 }
1353 else if (fext=="bz2" || fext=="bzip2" || fext=="bz" || fext=="bzip") {
1354 picmd="bzip2 -cd ";
1355 }
1356 }
1357
1358 void addAdapter(GVec<CASeqData>& adapters, GStr& seq) {
1359 //TODO: prepare CASeqData here, and collect hexamers as well
1360 CASeqData adata(revCompl);
1361 int idx=adapters.Add(adata);
1362 if (idx<0) GError("Error: failed to add adaptor!\n");
1363 adapters[idx].update(seq.chars());
1364 }
1365
1366
1367 int loadAdapters(const char* fname) {
1368 GLineReader lr(fname);
1369 char* l;
1370 while ((l=lr.nextLine())!=NULL) {
1371 if (lr.length()<=3 || l[0]=='#') continue;
1372 if ( l[0]==' ' || l[0]=='\t' || l[0]==',' ||
1373 l[0]==';'|| l[0]==':' ) {
1374 int i=1;
1375 while (l[i]!=0 && isspace(l[i])) {
1376 i++;
1377 }
1378 if (l[i]!=0) {
1379 GStr s(&(l[i]));
1380 #ifdef GDEBUG
1381 //s.reverse();
1382 #endif
1383 addAdapter(adapters3, s);
1384 continue;
1385 }
1386 }
1387 else {
1388 GStr s(l);
1389 s.startTokenize("\t ;,:");
1390 GStr a5,a3;
1391 if (s.nextToken(a5))
1392 s.nextToken(a3);
1393 a5.upper();
1394 a3.upper();
1395 #ifdef GDEBUG
1396 // a5.reverse();
1397 // a3.reverse();
1398 #endif
1399 addAdapter(adapters5, a5);
1400 addAdapter(adapters3, a3);
1401 }
1402 }
1403 return adapters5.Count()+adapters3.Count();
1404 }
1405
1406 void setupFiles(FILE*& f_in, FILE*& f_in2, FILE*& f_out, FILE*& f_out2,
1407 GStr& s, GStr& infname, GStr& infname2) {
1408 // uses outsuffix to generate output file names and open file handles as needed
1409 infname="";
1410 infname2="";
1411 f_in=NULL;
1412 f_in2=NULL;
1413 f_out=NULL;
1414 f_out2=NULL;
1415 //analyze outsuffix intent
1416 GStr pocmd;
1417 if (outsuffix=="-") {
1418 f_out=stdout;
1419 }
1420 else {
1421 GStr ox=getFext(outsuffix);
1422 if (ox.length()>2) ox=ox.substr(0,2);
1423 if (ox=="gz") pocmd="gzip -9 -c ";
1424 else if (ox=="bz") pocmd="bzip2 -9 -c ";
1425 }
1426 if (s=="-") {
1427 f_in=stdin;
1428 infname="stdin";
1429 f_out=prepOutFile(infname, pocmd);
1430 return;
1431 } // streaming from stdin
1432 s.startTokenize(",:");
1433 s.nextToken(infname);
1434 bool paired=s.nextToken(infname2);
1435 if (fileExists(infname.chars())==0)
1436 GError("Error: cannot find file %s!\n",infname.chars());
1437 GStr fname(getFileName(infname.chars()));
1438 GStr picmd;
1439 guess_unzip(fname, picmd);
1440 if (picmd.is_empty()) {
1441 f_in=fopen(infname.chars(), "r");
1442 if (f_in==NULL) GError("Error opening file '%s'!\n",infname.chars());
1443 }
1444 else {
1445 picmd.append(infname);
1446 f_in=popen(picmd.chars(), "r");
1447 if (f_in==NULL) GError("Error at popen %s!\n", picmd.chars());
1448 }
1449 if (f_out==stdout) {
1450 if (paired) GError("Error: output suffix required for paired reads\n");
1451 return;
1452 }
1453 f_out=prepOutFile(infname, pocmd);
1454 if (!paired) return;
1455 if (doCollapse) GError("Error: sorry, -C option cannot be used with paired reads!\n");
1456 // ---- paired reads:-------------
1457 if (fileExists(infname2.chars())==0)
1458 GError("Error: cannot find file %s!\n",infname2.chars());
1459 picmd="";
1460 GStr fname2(getFileName(infname2.chars()));
1461 guess_unzip(fname2, picmd);
1462 if (picmd.is_empty()) {
1463 f_in2=fopen(infname2.chars(), "r");
1464 if (f_in2==NULL) GError("Error opening file '%s'!\n",infname2.chars());
1465 }
1466 else {
1467 picmd.append(infname2);
1468 f_in2=popen(picmd.chars(), "r");
1469 if (f_in2==NULL) GError("Error at popen %s!\n", picmd.chars());
1470 }
1471 f_out2=prepOutFile(infname2, pocmd);
1472 }