ViewVC Help
View File | Revision Log | Show Annotations | View Changeset | Root Listing
root/gclib/fqtrim/fqtrim.cpp
Revision: 175
Committed: Thu Feb 16 03:46:59 2012 UTC (7 years, 7 months ago) by gpertea
File size: 43190 byte(s)
Log Message:
Line File contents
1 #include "GArgs.h"
2 #include "GStr.h"
3 #include "GHash.hh"
4 #include "GList.hh"
5 #include <ctype.h>
6 #include "GAlnExtend.h"
7
8 #define USAGE "Usage:\n\
9 fqtrim [{-5 <5adapter> -3 <3adapter>|-f <adapters_file>}] [-a <min_matchlen>]\\\n\
10 [-R] [-q <minq> [-t <trim_max_len>]] [-p {64|33}] [-o <outsuffix>]\\\n\
11 [-l <minlen>] [-C] [-D] [-Q] [-n <rename_prefix>] [-r <discarded.lst>]\\\n\
12 <input.fq>[,<input_mates.fq>\n\
13 \n\
14 Trim low quality bases at the 3' end and can trim adapter sequence(s), filter\n\
15 for low complexity and collapse duplicate reads.\n\
16 If read pairs should be trimmed and kept together (i.e. without discarding\n\
17 one read in a pair), the two file names should be given delimited by a comma\n\
18 or a colon character\n\
19 \n\
20 Options:\n\
21 -n rename all the reads using the <prefix> followed by a read counter;\n\
22 if -C option was given, the suffix \"_x<N>\" is appended, with <N> being\n\
23 the read duplication count\n\
24 -o unless this parameter is '-', write the trimmed/filtered reads to \n\
25 file(s) named <input>.<outsuffix> which will be created in the \n\
26 current (working) directory; (writes to stdout if -o- is given);\n\
27 a suffix ending with .gz, .gzip or .bz2 will enforce compression\n\
28 -f file with adapter sequences to trim, each line having this format:\n\
29 <5'-adapter-sequence> <3'-adapter-sequence>\n\
30 -5 trim the given adapter or primer sequence at the 5' end of each read\n\
31 (e.g. -5 CGACAGGTTCAGAGTTCTACAGTCCGACGATC)\n\
32 -3 trim the given adapter sequence at the 3' end of each read\n\
33 (e.g. -3 TCGTATGCCGTCTTCTGCTTG)\n\
34 -A disable polyA/T trimming (enabled by default)\n\
35 -y minimum length of exact match to adaptor sequence at the proper end (6)\n\
36 -q trim bases with quality value lower than <minq> (starting at the 3' end)\n\
37 -t for -q option, maximum trimming at the 3' end is limited to <trim_max_len>\n\
38 -m maximum percentage of Ns allowed in a read after trimming (default 7)\n\
39 -l minimum \"clean\" length after trimming that a read must have\n\
40 in order to pass the filter (default: 16)\n\
41 -r write a simple \"trash report\" file listing the discarded reads with a\n\
42 one letter code for the reason why they were trashed\n\
43 -D apply a low-complexity (dust) filter and discard any read that has over\n\
44 50% of its length detected as low complexity\n\
45 -C collapse duplicate reads and add a prefix with their count to the read\n\
46 name (e.g. for microRNAs)\n\
47 -p input is phred64/phred33 (use -p64 or -p33)\n\
48 -Q convert quality values to the other Phred qv type\n\
49 -V verbose processing\n\
50 "
51
52 //-z for -o option, the output stream(s) will be first piped into the given\n
53 // <zcmd> command, which must output to stdout (e.g. -z 'bzip2 -9 -c')\n
54
55
56 // example 3' adapter for miRNAs: TCGTATGCCGTCTTCTGCTTG
57
58 //For paired reads sequencing:
59 //3' : ACACTCTTTCCCTACACGACGCTCTTCCGATCT
60 //5' : GATCGGAAGAGCGGTTCAGCAGGAATGCCGAG
61 //FILE* f_out=NULL; //stdout if not provided
62 //FILE* f_out2=NULL; //for paired reads
63 //FILE* f_in=NULL; //input fastq (stdin if not provided)
64 //FILE* f_in2=NULL; //for paired reads
65
66 FILE* freport=NULL;
67
68 bool debug=false;
69 bool verbose=false;
70 bool doCollapse=false;
71 bool doDust=false;
72 bool doPolyTrim=true;
73 bool fastaOutput=false;
74 bool trashReport=false;
75 bool revCompl=false; //also reverse complement adaptor sequences
76 int min_read_len=16;
77 double max_perc_N=7.0;
78 int dust_cutoff=16;
79 bool isfasta=false;
80 bool convert_phred=false;
81 GStr outsuffix; // -o
82 GStr prefix;
83 GStr zcmd;
84 int num_trimmed5=0;
85 int num_trimmed3=0;
86 int num_trimmedN=0;
87 int num_trimmedQ=0;
88 int min_trimmed5=INT_MAX;
89 int min_trimmed3=INT_MAX;
90
91 int qvtrim_qmin=0;
92 int qvtrim_max=0; //for -q, do not trim at 3'-end more than this number of bases
93 int qv_phredtype=0; // could be 64 or 33 (0 means undetermined yet)
94 int qv_cvtadd=0; //could be -31 or +31
95
96 // adaptor matching metrics -- for X-drop ungapped extension
97 const int match_reward=2;
98 const int mismatch_penalty=3;
99 const int Xdrop=8;
100
101 const int poly_m_score=2; //match score
102 const int poly_mis_score=-3; //mismatch
103 const int poly_dropoff_score=7;
104 int poly_minScore=12; //i.e. an exact match of 6 bases at the proper ends WILL be trimmed
105
106 const char *polyA_seed="AAAA";
107 const char *polyT_seed="TTTT";
108
109 struct CASeqData {
110 //positional data for every possible hexamer in an adapter
111 GVec<uint16>* pz[4096]; //0-based coordinates of all possible hexamers in the adapter sequence
112 GVec<uint16>* pzr[4096]; //0-based coordinates of all possible hexamers for the reverse complement of the adapter sequence
113 GStr seq; //actual adapter sequence data
114 GStr seqr; //reverse complement sequence
115 int amlen; //fraction of adapter length matching that's
116 //enough to consider the alignment
117 bool use_reverse;
118 CASeqData(bool rev=false):seq(),seqr(),
119 amlen(0), use_reverse(rev) {
120 for (int i=0;i<4096;i++) {
121 pz[i]=NULL;
122 pzr[i]=NULL;
123 }
124 }
125
126 void update(const char* s) {
127 seq=s;
128 table6mers(seq.chars(), seq.length(), pz);
129 amlen=iround(double(seq.length())*0.8);
130 if (amlen<12)
131 amlen=12;
132 if (!use_reverse) return;
133 //reverse complement
134 seqr=s;
135 int slen=seq.length();
136 for (int i=0;i<slen;i++)
137 seqr[i]=ntComplement(seq[slen-i-1]);
138 table6mers(seqr.chars(), seqr.length(), pzr);
139 }
140
141 ~CASeqData() {
142 for (int i=0;i<4096;i++) {
143 delete pz[i];
144 delete pzr[i];
145 }
146 }
147 };
148
149 GVec<CASeqData> adapters5;
150 GVec<CASeqData> adapters3;
151
152 CGreedyAlignData* gxmem_l=NULL;
153 CGreedyAlignData* gxmem_r=NULL;
154
155 // element in dhash:
156 class FqDupRec {
157 public:
158 int count; //how many of these reads are the same
159 int len; //length of qv
160 char* firstname; //optional, only if we want to keep the original read names
161 char* qv;
162 FqDupRec(GStr* qstr=NULL, const char* rname=NULL) {
163 len=0;
164 qv=NULL;
165 firstname=NULL;
166 count=0;
167 if (qstr!=NULL) {
168 qv=Gstrdup(qstr->chars());
169 len=qstr->length();
170 count++;
171 }
172 if (rname!=NULL) firstname=Gstrdup(rname);
173 }
174 ~FqDupRec() {
175 GFREE(qv);
176 GFREE(firstname);
177 }
178 void add(GStr& d) { //collapse another record into this one
179 if (d.length()!=len)
180 GError("Error at FqDupRec::add(): cannot collapse reads with different length!\n");
181 count++;
182 for (int i=0;i<len;i++)
183 qv[i]=(qv[i]+d[i])/2;
184 }
185 };
186
187 void openfw(FILE* &f, GArgs& args, char opt) {
188 GStr s=args.getOpt(opt);
189 if (!s.is_empty()) {
190 if (s=='-') f=stdout;
191 else {
192 f=fopen(s.chars(),"w");
193 if (f==NULL) GError("Error creating file: %s\n", s.chars());
194 }
195 }
196 }
197
198 #define FWCLOSE(fh) if (fh!=NULL && fh!=stdout) fclose(fh)
199 #define FRCLOSE(fh) if (fh!=NULL && fh!=stdin) fclose(fh)
200
201 GHash<FqDupRec> dhash; //hash to keep track of duplicates
202
203 void addAdapter(GVec<CASeqData>& adapters, GStr& seq);
204 int loadAdapters(const char* fname);
205
206 void setupFiles(FILE*& f_in, FILE*& f_in2, FILE*& f_out, FILE*& f_out2,
207 GStr& s, GStr& infname, GStr& infname2);
208 // uses outsuffix to generate output file names and open file handles as needed
209
210 void writeRead(FILE* f_out, GStr& rname, GStr& rinfo, GStr& rseq, GStr& rqv, int& outcounter);
211 void trash_report(char trashcode, GStr& rname, FILE* freport);
212
213 bool getFastxRec(GLineReader& fq, GStr& rseq, GStr& rqv,
214 GStr& rname, GStr& rinfo, GStr& infname);
215
216 char process_read(GStr& rname, GStr& rseq, GStr& rqv, int &l5, int &l3);
217 //returns 0 if the read was untouched, 1 if it was trimmed and a trash code if it was trashed
218
219 bool ntrim(GStr& rseq, int &l5, int &l3); //returns true if any trimming occured
220 bool qtrim(GStr& qvs, int &l5, int &l3); //return true if any trimming occured
221 int dust(GStr& seq);
222 bool trim_poly5(GStr &seq, int &l5, int &l3, const char* poly_seed); //returns true if any trimming occured
223 bool trim_poly3(GStr &seq, int &l5, int &l3, const char* poly_seed);
224 bool trim_adapter5(GStr& seq, int &l5, int &l3); //returns true if any trimming occured
225 bool trim_adapter3(GStr& seq, int &l5, int &l3);
226
227 void convertPhred(char* q, int len);
228 void convertPhred(GStr& q);
229
230 int main(int argc, char * const argv[]) {
231 GArgs args(argc, argv, "YQDCRVAl:d:3:5:m:n:r:p:q:f:t:o:z:a:");
232 int e;
233 if ((e=args.isError())>0) {
234 GMessage("%s\nInvalid argument: %s\n", USAGE, argv[e]);
235 exit(224);
236 }
237 debug=(args.getOpt('Y')!=NULL);
238 verbose=(args.getOpt('V')!=NULL);
239 convert_phred=(args.getOpt('Q')!=NULL);
240 doCollapse=(args.getOpt('C')!=NULL);
241 doDust=(args.getOpt('D')!=NULL);
242 revCompl=(args.getOpt('R')!=NULL);
243 if (args.getOpt('A')) doPolyTrim=false;
244 /*
245 rawFormat=(args.getOpt('R')!=NULL);
246 if (rawFormat) {
247 GError("Sorry, raw qseq format parsing is not implemented yet!\n");
248 }
249 */
250 prefix=args.getOpt('n');
251 GStr s=args.getOpt('l');
252 if (!s.is_empty())
253 min_read_len=s.asInt();
254 s=args.getOpt('m');
255 if (!s.is_empty())
256 max_perc_N=s.asDouble();
257 s=args.getOpt('d');
258 if (!s.is_empty()) {
259 dust_cutoff=s.asInt();
260 doDust=true;
261 }
262 s=args.getOpt('q');
263 if (!s.is_empty()) {
264 qvtrim_qmin=s.asInt();
265 }
266 s=args.getOpt('t');
267 if (!s.is_empty()) {
268 qvtrim_max=s.asInt();
269 }
270 s=args.getOpt('p');
271 if (!s.is_empty()) {
272 int v=s.asInt();
273 if (v==33) {
274 qv_phredtype=33;
275 qv_cvtadd=31;
276 }
277 else if (v==64) {
278 qv_phredtype=64;
279 qv_cvtadd=-31;
280 }
281 else
282 GMessage("%s\nInvalid value for -p option (can only be 64 or 33)!\n",USAGE);
283 }
284 s=args.getOpt('f');
285 if (!s.is_empty()) {
286 loadAdapters(s.chars());
287 }
288 bool fileAdapters=adapters5.Count()+adapters3.Count();
289 s=args.getOpt('5');
290 if (!s.is_empty()) {
291 if (fileAdapters)
292 GError("Error: options -5 and -f cannot be used together!\n");
293 s.upper();
294 addAdapter(adapters5, s);
295 }
296 s=args.getOpt('3');
297 if (!s.is_empty()) {
298 if (fileAdapters)
299 GError("Error: options -3 and -f cannot be used together!\n");
300 s.upper();
301 addAdapter(adapters3, s);
302 }
303 s=args.getOpt('y');
304 if (!s.is_empty()) {
305 int minmatch=s.asInt();
306 poly_minScore=minmatch*poly_m_score;
307 }
308
309 if (args.getOpt('o')!=NULL) outsuffix=args.getOpt('o');
310 else outsuffix="-";
311 trashReport= (args.getOpt('r')!=NULL);
312 int fcount=args.startNonOpt();
313 if (fcount==0) {
314 GMessage(USAGE);
315 exit(224);
316 }
317 if (fcount>1 && doCollapse) {
318 GError("%s Sorry, the -C option only works with a single input.\n", USAGE);
319 }
320 //openfw(f_out, args, 'o');
321 //if (f_out==NULL) f_out=stdout;
322 if (trashReport)
323 openfw(freport, args, 'r');
324 char* infile=NULL;
325
326 if (adapters5.Count()>0)
327 gxmem_l=new CGreedyAlignData(match_reward, mismatch_penalty, Xdrop-2);
328 if (adapters3.Count()>0)
329 gxmem_r=new CGreedyAlignData(match_reward, mismatch_penalty, Xdrop);
330
331 while ((infile=args.nextNonOpt())!=NULL) {
332 //for each input file
333 int incounter=0; //counter for input reads
334 int outcounter=0; //counter for output reads
335 int trash_s=0; //too short from the get go
336 int trash_Q=0;
337 int trash_N=0;
338 int trash_D=0;
339 int trash_poly=0;
340 int trash_A3=0;
341 int trash_A5=0;
342 s=infile;
343 GStr infname;
344 GStr infname2;
345 FILE* f_in=NULL;
346 FILE* f_in2=NULL;
347 FILE* f_out=NULL;
348 FILE* f_out2=NULL;
349 bool paired_reads=false;
350 setupFiles(f_in, f_in2, f_out, f_out2, s, infname, infname2);
351 GLineReader fq(f_in);
352 GLineReader* fq2=NULL;
353 if (f_in2!=NULL) {
354 fq2=new GLineReader(f_in2);
355 paired_reads=true;
356 }
357 GStr rseq, rqv, seqid, seqinfo;
358 GStr rseq2, rqv2, seqid2, seqinfo2;
359 while (getFastxRec(fq, rseq, rqv, seqid, seqinfo, infname)) {
360 incounter++;
361 int a5=0, a3=0, b5=0, b3=0;
362 char tcode=0, tcode2=0;
363 tcode=process_read(seqid, rseq, rqv, a5, a3);
364 if (fq2!=NULL) {
365 getFastxRec(*fq2, rseq2, rqv2, seqid2, seqinfo2, infname2);
366 if (seqid.substr(0,seqid.length()-1)!=seqid2.substr(0,seqid2.length()-1)) {
367 GError("Error: no paired match for read %s vs %s (%s,%s)\n",
368 seqid.chars(), seqid2.chars(), infname.chars(), infname2.chars());
369 }
370 tcode2=process_read(seqid2, rseq2, rqv2, b5, b3);
371 //decide what to do with this pair and print rseq2 if the pair makes it
372 if (tcode>1 && tcode2<=1) {
373 //"untrash" rseq
374 if (a3-a5+1<min_read_len) {
375 a5=1;
376 if (a3<min_read_len) { a3= GMIN(rseq.length()-1, min_read_len+1); }
377 }
378 tcode=1;
379 }
380 else if (tcode<=1 && tcode2>1) {
381 //"untrash" rseq2
382 if (b3-b5+1<min_read_len) {
383 b5=1;
384 if (b3<min_read_len) { b3= GMIN((rseq2.length()-1),(min_read_len+1)); }
385 }
386 tcode2=1;
387 }
388 if (tcode<=1) { //trimmed or left intact -- write it!
389 if (tcode>0) {
390 rseq2=rseq2.substr(b5,b3-b5+1);
391 if (!rqv2.is_empty()) rqv2=rqv2.substr(b5,b3-b5+1);
392 }
393 int nocounter=0;
394 writeRead(f_out2, seqid2, seqinfo2, rseq2, rqv2, nocounter);
395 }
396 } //pair read
397 if (tcode>1) { //trashed
398 #ifdef GDEBUG
399 GMessage(" !!!!TRASH code = %c\n",tcode);
400 #endif
401 if (tcode=='s') trash_s++;
402 else if (tcode=='A' || tcode=='T') trash_poly++;
403 else if (tcode=='Q') trash_Q++;
404 else if (tcode=='N') trash_N++;
405 else if (tcode=='D') trash_D++;
406 else if (tcode=='3') trash_A3++;
407 else if (tcode=='5') trash_A5++;
408 if (trashReport) trash_report(tcode, seqid, freport);
409 }
410 else if (!doCollapse) { //write it
411 if (tcode>0) {
412 rseq=rseq.substr(a5,a3-a5+1);
413 if (!rqv.is_empty()) rqv=rqv.substr(a5,a3-a5+1);
414 }
415 #ifdef GDEBUG
416 GMessage(" After trimming:\n");
417 GMessage("%s\n",rseq.chars());
418 #endif
419 writeRead(f_out, seqid, seqinfo, rseq, rqv, outcounter);
420 }
421 } //for each fastq record
422 delete fq2;
423 FRCLOSE(f_in);
424 FRCLOSE(f_in2);
425 if (doCollapse) {
426 outcounter=0;
427 int maxdup_count=1;
428 char* maxdup_seq=NULL;
429 dhash.startIterate();
430 FqDupRec* qd=NULL;
431 char* seq=NULL;
432 while ((qd=dhash.NextData(seq))!=NULL) {
433 GStr rseq(seq);
434 //do the dusting here
435 if (doDust) {
436 int dustbases=dust(rseq);
437 if (dustbases>(rseq.length()>>1)) {
438 if (trashReport && qd->firstname!=NULL) {
439 fprintf(freport, "%s_x%d\tD\n",qd->firstname, qd->count);
440 }
441 trash_D+=qd->count;
442 continue;
443 }
444 }
445 outcounter++;
446 if (qd->count>maxdup_count) {
447 maxdup_count=qd->count;
448 maxdup_seq=seq;
449 }
450 if (isfasta) {
451 if (prefix.is_empty()) {
452 fprintf(f_out, ">%s_x%d\n%s\n", qd->firstname, qd->count,
453 rseq.chars());
454 }
455 else { //use custom read name
456 fprintf(f_out, ">%s%08d_x%d\n%s\n", prefix.chars(), outcounter,
457 qd->count, rseq.chars());
458 }
459 }
460 else { //fastq format
461 if (convert_phred) convertPhred(qd->qv, qd->len);
462 if (prefix.is_empty()) {
463 fprintf(f_out, "@%s_x%d\n%s\n+\n%s\n", qd->firstname, qd->count,
464 rseq.chars(), qd->qv);
465 }
466 else { //use custom read name
467 fprintf(f_out, "@%s%08d_x%d\n%s\n+\n%s\n", prefix.chars(), outcounter,
468 qd->count, rseq.chars(), qd->qv);
469 }
470 }
471 }//for each element of dhash
472 if (maxdup_count>1) {
473 GMessage("Maximum read multiplicity: x %d (read: %s)\n",maxdup_count, maxdup_seq);
474 }
475 } //collapse entries
476 if (verbose) {
477 if (paired_reads) {
478 GMessage(">Input files : %s , %s\n", infname.chars(), infname2.chars());
479 GMessage("Number of input pairs :%9d\n", incounter);
480 GMessage(" Output pairs :%9d\n", outcounter);
481 }
482 else {
483 GMessage(">Input file : %s\n", infname.chars());
484 GMessage("Number of input reads :%9d\n", incounter);
485 GMessage(" Output reads :%9d\n", outcounter);
486 }
487 GMessage("------------------------------------\n");
488 if (num_trimmed5)
489 GMessage(" 5' trimmed :%9d (min. trim: %d)\n", num_trimmed5, min_trimmed5);
490 if (num_trimmed3)
491 GMessage(" 3' trimmed :%9d (min. trim: %d)\n", num_trimmed3, min_trimmed3);
492 GMessage("------------------------------------\n");
493 if (trash_s>0)
494 GMessage(" Trashed by length:%9d\n", trash_s);
495 if (trash_N>0)
496 GMessage(" Trashed by N%%:%9d\n", trash_N);
497 if (trash_Q>0)
498 GMessage("Trashed by low quality:%9d\n", trash_Q);
499 if (trash_poly>0)
500 GMessage(" Trashed by poly-A/T:%9d\n", trash_poly);
501 if (trash_A5>0)
502 GMessage(" Trashed by 5' adapter:%9d\n", trash_A5);
503 if (trash_A3>0)
504 GMessage(" Trashed by 3' adapter:%9d\n", trash_A3);
505 }
506 if (trashReport) {
507 FWCLOSE(freport);
508 }
509 FWCLOSE(f_out);
510 FWCLOSE(f_out2);
511 } //while each input file
512 delete gxmem_l;
513 delete gxmem_r;
514 //getc(stdin);
515 }
516
517 class NData {
518 public:
519 int NPos[1024]; //there should be no reads longer than 1K ?
520 int NCount;
521 int end5;
522 int end3;
523 int seqlen;
524 double perc_N; //percentage of Ns in end5..end3 range only!
525 const char* seq;
526 bool valid;
527 NData() {
528 seqlen=0;
529 NCount=0;
530 end5=0;
531 end3=0;
532 seq=NULL;
533 perc_N=0;
534 valid=true;
535 }
536 void init(GStr& rseq) {
537 NCount=0;
538 perc_N=0;
539 seqlen=rseq.length();
540 seq=rseq.chars();
541 end5=1;
542 end3=seqlen;
543 for (int i=0;i<seqlen;i++)
544 if (seq[i]=='N') {// if (!ichrInStr(rseq[i], "ACGT")
545 NPos[NCount]=i;
546 NCount++;
547 }
548 perc_N=(NCount*100.0)/seqlen;
549 valid=true;
550 }
551 void N_calc() { //only in the region after trimming
552 int n=0;
553 for (int i=end3-1;i<end5;i++) {
554 if (seq[i]=='N') n++;
555 }
556 perc_N=(n*100.0)/(end5-end3+1);
557 }
558 };
559
560 static NData feat;
561 int perc_lenN=12; // incremental distance from ends, in percentage of
562 // sequence length, where N-trimming is done (default:12 %) (autolimited to 20)
563
564 void N_analyze(int l5, int l3, int p5, int p3) {
565 /* assumes feat was filled properly */
566 int old_dif, t5,t3,v;
567 if (l3<l5+2 || p5>p3 ) {
568 feat.end5=l5+1;
569 feat.end3=l3+1;
570 return;
571 }
572
573 t5=feat.NPos[p5]-l5;
574 t3=l3-feat.NPos[p3];
575 old_dif=p3-p5;
576 v=(int)((((double)(l3-l5))*perc_lenN)/100);
577 if (v>20) v=20; /* enforce N-search limit for very long reads */
578 if (t5 < v ) {
579 l5=feat.NPos[p5]+1;
580 p5++;
581 }
582 if (t3 < v) {
583 l3=feat.NPos[p3]-1;
584 p3--;
585 }
586 /* restNs=p3-p5; number of Ns in the new CLR */
587 if (p3-p5==old_dif) { /* no change, return */
588 /* don't trim if this may shorten the read too much */
589 //if (l5-l3<min_read_len) return;
590 feat.end5=l5+1;
591 feat.end3=l3+1;
592 return;
593 }
594 else
595 N_analyze(l5,l3, p5,p3);
596 }
597
598
599 bool qtrim(GStr& qvs, int &l5, int &l3) {
600 if (qvtrim_qmin==0 || qvs.is_empty()) return false;
601 if (qv_phredtype==0) {
602 //try to guess the Phred type
603 int vmin=256, vmax=0;
604 for (int i=0;i<qvs.length();i++) {
605 if (vmin>qvs[i]) vmin=qvs[i];
606 if (vmax<qvs[i]) vmax=qvs[i];
607 }
608 if (vmin<64) { qv_phredtype=33; qv_cvtadd=31; }
609 if (vmax>95) { qv_phredtype=64; qv_cvtadd=-31; }
610 if (qv_phredtype==0) {
611 GError("Error: couldn't determine Phred type, please use the -p33 or -p64 !\n");
612 }
613 if (verbose)
614 GMessage("Input reads have Phred-%d quality values.\n", (qv_phredtype==33 ? 33 : 64));
615 } //guessing Phred type
616 for (l3=qvs.length()-1;l3>2;l3--) {
617 if (qvs[l3]-qv_phredtype>=qvtrim_qmin && qvs[l3-1]-qv_phredtype>=qvtrim_qmin) break;
618 }
619 //just in case, check also the 5' the end (?)
620 for (l5=0;l5<qvs.length()-3;l5++) {
621 if (qvs[l5]-qv_phredtype>=qvtrim_qmin && qvs[l5+1]-qv_phredtype>=qvtrim_qmin) break;
622 }
623 if (qvtrim_max>0) {
624 if (qvs.length()-1-l3>qvtrim_max) l3=qvs.length()-1-qvtrim_max;
625 if (l5>qvtrim_max) l5=qvtrim_max;
626 }
627 return (l5>0 || l3<qvs.length()-1);
628 }
629
630 bool ntrim(GStr& rseq, int &l5, int &l3) {
631 //count Ns in the sequence, trim N-rich ends
632 feat.init(rseq);
633 l5=feat.end5-1;
634 l3=feat.end3-1;
635 N_analyze(feat.end5-1, feat.end3-1, 0, feat.NCount-1);
636 if (l5==feat.end5-1 && l3==feat.end3-1) {
637 if (feat.perc_N>max_perc_N) {
638 feat.valid=false;
639 l3=l5+1;
640 return true;
641 }
642 else {
643 return false; //nothing changed
644 }
645 }
646 l5=feat.end5-1;
647 l3=feat.end3-1;
648 if (l3-l5+1<min_read_len) {
649 feat.valid=false;
650 return true;
651 }
652 feat.N_calc();
653
654 if (feat.perc_N>max_perc_N) {
655 feat.valid=false;
656 l3=l5+1;
657 return true;
658 }
659 return true;
660 }
661
662 //--------------- dust functions ----------------
663 class DNADuster {
664 public:
665 int dustword;
666 int dustwindow;
667 int dustwindow2;
668 int dustcutoff;
669 int mv, iv, jv;
670 int counts[32*32*32];
671 int iis[32*32*32];
672 DNADuster(int cutoff=16, int winsize=32, int wordsize=3) {
673 dustword=wordsize;
674 dustwindow=winsize;
675 dustwindow2 = (winsize>>1);
676 dustcutoff=cutoff;
677 mv=0;
678 iv=0;
679 jv=0;
680 }
681 void setWindowSize(int value) {
682 dustwindow = value;
683 dustwindow2 = (dustwindow >> 1);
684 }
685 void setWordSize(int value) {
686 dustword=value;
687 }
688 void wo1(int len, const char* s, int ivv) {
689 int i, ii, j, v, t, n, n1, sum;
690 int js, nis;
691 n = 32 * 32 * 32;
692 n1 = n - 1;
693 nis = 0;
694 i = 0;
695 ii = 0;
696 sum = 0;
697 v = 0;
698 for (j=0; j < len; j++, s++) {
699 ii <<= 5;
700 if (*s<=32) {
701 i=0;
702 continue;
703 }
704 ii |= *s - 'A'; //assume uppercase!
705 ii &= n1;
706 i++;
707 if (i >= dustword) {
708 for (js=0; js < nis && iis[js] != ii; js++) ;
709 if (js == nis) {
710 iis[nis] = ii;
711 counts[ii] = 0;
712 nis++;
713 }
714 if ((t = counts[ii]) > 0) {
715 sum += t;
716 v = 10 * sum / j;
717 if (mv < v) {
718 mv = v;
719 iv = ivv;
720 jv = j;
721 }
722 }
723 counts[ii]++;
724 }
725 }
726 }
727
728 int wo(int len, const char* s, int* beg, int* end) {
729 int i, l1;
730 l1 = len - dustword + 1;
731 if (l1 < 0) {
732 *beg = 0;
733 *end = len - 1;
734 return 0;
735 }
736 mv = 0;
737 iv = 0;
738 jv = 0;
739 for (i=0; i < l1; i++) {
740 wo1(len-i, s+i, i);
741 }
742 *beg = iv;
743 *end = iv + jv;
744 return mv;
745 }
746
747 void dust(const char* seq, char* seqmsk, int seqlen, int cutoff=0) { //, maskFunc maskfn) {
748 int i, j, l, a, b, v;
749 if (cutoff==0) cutoff=dustcutoff;
750 a=0;b=0;
751 //GMessage("Dust cutoff=%d\n", cutoff);
752 for (i=0; i < seqlen; i += dustwindow2) {
753 l = (seqlen > i+dustwindow) ? dustwindow : seqlen-i;
754 v = wo(l, seq+i, &a, &b);
755 if (v > cutoff) {
756 //for (j = a; j <= b && j < dustwindow2; j++) {
757 for (j = a; j <= b; j++) {
758 seqmsk[i+j]='N';//could be made lowercase instead
759 }
760 }
761 }
762 //return first;
763 }
764 };
765
766 static DNADuster duster;
767
768 int dust(GStr& rseq) {
769 char* seq=Gstrdup(rseq.chars());
770 duster.dust(rseq.chars(), seq, rseq.length(), dust_cutoff);
771 //check the number of Ns:
772 int ncount=0;
773 for (int i=0;i<rseq.length();i++) {
774 if (seq[i]=='N') ncount++;
775 }
776 GFREE(seq);
777 return ncount;
778 }
779
780 struct SLocScore {
781 int pos;
782 int score;
783 SLocScore(int p=0,int s=0) {
784 pos=p;
785 score=s;
786 }
787 void set(int p, int s) {
788 pos=p;
789 score=s;
790 }
791 void add(int p, int add) {
792 pos=p;
793 score+=add;
794 }
795 };
796
797 bool trim_poly3(GStr &seq, int &l5, int &l3, const char* poly_seed) {
798 if (!doPolyTrim) return false;
799 int rlen=seq.length();
800 l5=0;
801 l3=rlen-1;
802 int32 seedVal=*(int32*)poly_seed;
803 char polyChar=poly_seed[0];
804 //assumes N trimming was already done
805 //so a poly match should be very close to the end of the read
806 // -- find the initial match (seed)
807 int lmin=GMAX((rlen-16), 0);
808 int li;
809 for (li=rlen-4;li>lmin;li--) {
810 if (seedVal==*(int*)&(seq[li])) {
811 break;
812 }
813 }
814 if (li<=lmin) return false;
815 //seed found, try to extend it both ways
816 //extend right
817 int ri=li+3;
818 SLocScore loc(ri, poly_m_score<<2);
819 SLocScore maxloc(loc);
820 //extend right
821 while (ri<rlen-1) {
822 ri++;
823 if (seq[ri]==polyChar) {
824 loc.add(ri,poly_m_score);
825 }
826 else if (seq[ri]=='N') {
827 loc.add(ri,0);
828 }
829 else { //mismatch
830 loc.add(ri,poly_mis_score);
831 if (maxloc.score-loc.score>poly_dropoff_score) break;
832 }
833 if (maxloc.score<=loc.score) {
834 maxloc=loc;
835 }
836 }
837 ri=maxloc.pos;
838 if (ri<rlen-6) return false; //no trimming wanted, too far from 3' end
839 //ri = right boundary for the poly match
840 //extend left
841 loc.set(li, maxloc.score);
842 maxloc.pos=li;
843 while (li>0) {
844 li--;
845 if (seq[li]==polyChar) {
846 loc.add(li,poly_m_score);
847 }
848 else if (seq[li]=='N') {
849 loc.add(li,0);
850 }
851 else { //mismatch
852 loc.add(li,poly_mis_score);
853 if (maxloc.score-loc.score>poly_dropoff_score) break;
854 }
855 if (maxloc.score<=loc.score) {
856 maxloc=loc;
857 }
858 }
859 li=maxloc.pos;
860 if ((maxloc.score==poly_minScore && ri==rlen-1) ||
861 (maxloc.score>poly_minScore && ri>=rlen-3) ||
862 (maxloc.score>(poly_minScore*3) && ri>=rlen-8)) {
863 //trimming this li-ri match at 3' end
864 l3=li-1;
865 if (l3<0) l3=0;
866 return true;
867 }
868 return false;
869 }
870
871 bool trim_poly5(GStr &seq, int &l5, int &l3, const char* poly_seed) {
872 if (!doPolyTrim) return false;
873 int rlen=seq.length();
874 l5=0;
875 l3=rlen-1;
876 int32 seedVal=*(int32*)poly_seed;
877 char polyChar=poly_seed[0];
878 //assumes N trimming was already done
879 //so a poly match should be very close to the end of the read
880 // -- find the initial match (seed)
881 int lmax=GMIN(12, rlen-4);//how far from 5' end to look for 4-mer seeds
882 int li;
883 for (li=0;li<=lmax;li++) {
884 if (seedVal==*(int*)&(seq[li])) {
885 break;
886 }
887 }
888 if (li>lmax) return false;
889 //seed found, try to extend it both ways
890 //extend left
891 int ri=li+3; //save rightmost base of the seed
892 SLocScore loc(li, poly_m_score<<2);
893 SLocScore maxloc(loc);
894 while (li>0) {
895 li--;
896 if (seq[li]==polyChar) {
897 loc.add(li,poly_m_score);
898 }
899 else if (seq[li]=='N') {
900 loc.add(li,0);
901 }
902 else { //mismatch
903 loc.add(li,poly_mis_score);
904 if (maxloc.score-loc.score>poly_dropoff_score) break;
905 }
906 if (maxloc.score<=loc.score) {
907 maxloc=loc;
908 }
909 }
910 li=maxloc.pos;
911 if (li>5) return false; //no trimming wanted, too far from 5' end
912 //li = right boundary for the poly match
913
914 //extend right
915 loc.set(ri, maxloc.score);
916 maxloc.pos=ri;
917 while (ri<rlen-1) {
918 ri++;
919 if (seq[ri]==polyChar) {
920 loc.add(ri,poly_m_score);
921 }
922 else if (seq[ri]=='N') {
923 loc.add(ri,0);
924 }
925 else { //mismatch
926 loc.add(ri,poly_mis_score);
927 if (maxloc.score-loc.score>poly_dropoff_score) break;
928 }
929 if (maxloc.score<=loc.score) {
930 maxloc=loc;
931 }
932 }
933 ri=maxloc.pos;
934 if ((maxloc.score==poly_minScore && li==0) ||
935 (maxloc.score>poly_minScore && li<2)
936 || (maxloc.score>(poly_minScore*3) && li<8)) {
937 //adjust l5 to reflect this trimming of 5' end
938 l5=ri+1;
939 if (l5>rlen-1) l5=rlen-1;
940 return true;
941 }
942 return false;
943 }
944
945 bool trim_adapter3(GStr& seq, int&l5, int &l3) {
946 if (adapters3.Count()==0) return false;
947 int rlen=seq.length();
948 l5=0;
949 l3=rlen-1;
950 bool trimmed=false;
951 GStr wseq(seq);
952 int wlen=rlen;
953 GXSeqData seqdata;
954 for (int ai=0;ai<adapters3.Count();ai++) {
955 seqdata.update(adapters3[ai].seq.chars(), adapters3[ai].seq.length(),
956 adapters3[ai].pz, wseq.chars(), wlen, adapters3[ai].amlen);
957 GXAlnInfo* bestaln=match_RightEnd(seqdata, gxmem_r, 86);
958 if (bestaln) {
959 trimmed=true;
960 //keep unmatched region on the left OR right (the longer one)
961 if (bestaln->sl > wlen-bestaln->sr) {
962 //keep left side
963 l3-=(wlen-bestaln->sl+1);
964 if (l3<0) l3=0;
965 }
966 else { //keep right side
967 l5+=bestaln->sr;
968 if (l5>=rlen) l5=rlen-1;
969 }
970 delete bestaln;
971 if (l3-l5+1<min_read_len) return true;
972 wseq=seq.substr(l5,l3-l5+1);
973 wlen=wseq.length();
974 }
975 }//for each 5' adapter
976 return trimmed;
977 }
978
979 bool trim_adapter5(GStr& seq, int&l5, int &l3) {
980 if (adapters5.Count()==0) return false;
981 int rlen=seq.length();
982 l5=0;
983 l3=rlen-1;
984 bool trimmed=false;
985 GStr wseq(seq);
986 int wlen=rlen;
987 GXSeqData seqdata;
988 for (int ai=0;ai<adapters5.Count();ai++) {
989 seqdata.update(adapters5[ai].seq.chars(), adapters5[ai].seq.length(),
990 adapters5[ai].pz, wseq.chars(), wlen, adapters5[ai].amlen);
991 GXAlnInfo* bestaln=match_LeftEnd(seqdata, gxmem_l, 90);
992 if (bestaln) {
993 trimmed=true;
994 if (bestaln->sl > wlen-bestaln->sr) {
995 //keep left side
996 l3-=(wlen-bestaln->sl+1);
997 if (l3<0) l3=0;
998 }
999 else { //keep right side
1000 l5+=bestaln->sr;
1001 if (l5>=rlen) l5=rlen-1;
1002 }
1003 delete bestaln;
1004 if (l3-l5+1<min_read_len) return true;
1005 wseq=seq.substr(l5,l3-l5+1);
1006 wlen=wseq.length();
1007 }
1008 }//for each 5' adapter
1009 return trimmed;
1010 }
1011
1012 //convert qvs to/from phred64 from/to phread33
1013 void convertPhred(GStr& q) {
1014 for (int i=0;i<q.length();i++) q[i]+=qv_cvtadd;
1015 }
1016
1017 void convertPhred(char* q, int len) {
1018 for (int i=0;i<len;i++) q[i]+=qv_cvtadd;
1019 }
1020
1021 bool getFastxRec(GLineReader& fq, GStr& rseq, GStr& rqv,
1022 GStr& rname, GStr& rinfo, GStr& infname) {
1023 rseq="";
1024 rqv="";
1025 rname="";
1026 rinfo="";
1027 if (fq.eof()) return false;
1028 char* l=fq.getLine();
1029 while (l!=NULL && (l[0]==0 || isspace(l[0]))) l=fq.getLine(); //ignore empty lines
1030 if (l==NULL) return false;
1031 /* if (rawFormat) {
1032 //TODO: implement raw qseq parsing here
1033 //if (raw type=N) then continue; //skip invalid/bad records
1034 } //raw qseq format
1035 else { // FASTQ or FASTA */
1036 isfasta=(l[0]=='>');
1037 if (!isfasta && l[0]!='@') GError("Error: fasta/fastq record marker not found(%s)\n%s\n",
1038 infname.chars(), l);
1039 GStr s(l);
1040 rname=&(l[1]);
1041 for (int i=0;i<rname.length();i++)
1042 if (rname[i]<=' ') {
1043 if (i<rname.length()-2) rinfo=rname.substr(i+1);
1044 rname.cut(i);
1045 break;
1046 }
1047 //now get the sequence
1048 if ((l=fq.getLine())==NULL)
1049 GError("Error: unexpected EOF after header for read %s (%s)\n",
1050 rname.chars(), infname.chars());
1051 rseq=l; //this must be the DNA line
1052 while ((l=fq.getLine())!=NULL) {
1053 //seq can span multiple lines
1054 if (l[0]=='>' || l[0]=='+') {
1055 fq.pushBack();
1056 break; //
1057 }
1058 rseq+=l;
1059 } //check for multi-line seq
1060 if (!isfasta) { //reading fastq quality values, which can also be multi-line
1061 if ((l=fq.getLine())==NULL)
1062 GError("Error: unexpected EOF after sequence for %s\n", rname.chars());
1063 if (l[0]!='+') GError("Error: fastq qv header marker not detected!\n");
1064 if ((l=fq.getLine())==NULL)
1065 GError("Error: unexpected EOF after qv header for %s\n", rname.chars());
1066 rqv=l;
1067 //if (rqv.length()!=rseq.length())
1068 // GError("Error: qv len != seq len for %s\n", rname.chars());
1069 while (rqv.length()<rseq.length() && ((l=fq.getLine())!=NULL)) {
1070 rqv+=l; //append to qv string
1071 }
1072 }// fastq
1073 // } //<-- FASTA or FASTQ
1074 rseq.upper();
1075 return true;
1076 }
1077
1078 #ifdef GDEBUG
1079 void showTrim(GStr& s, int l5, int l3) {
1080 if (l5>0) {
1081 color_bg(c_red);
1082 }
1083 for (int i=0;i<s.length()-1;i++) {
1084 if (i && i==l5) color_resetbg();
1085 fprintf(stderr, "%c", s[i]);
1086 if (i==l3) color_bg(c_red);
1087 }
1088 fprintf(stderr, "%c", s[s.length()-1]);
1089 color_reset();
1090 fprintf(stderr, "\n");
1091 }
1092 #endif
1093
1094 char process_read(GStr& rname, GStr& rseq, GStr& rqv, int &l5, int &l3) {
1095 //returns 0 if the read was untouched, 1 if it was just trimmed
1096 // and a trash code if it was trashed
1097 l5=0;
1098 l3=rseq.length()-1;
1099 #ifdef GDEBUG
1100 //rseq.reverse();
1101 GMessage(">%s\n", rname.chars());
1102 GMessage("%s\n",rseq.chars());
1103 #endif
1104 if (l3-l5+1<min_read_len) {
1105 return 's';
1106 }
1107 GStr wseq(rseq.chars());
1108 GStr wqv(rqv.chars());
1109 int w5=l5;
1110 int w3=l3;
1111 //first do the q-based trimming
1112 if (qvtrim_qmin!=0 && !wqv.is_empty() && qtrim(wqv, w5, w3)) { // qv-threshold trimming
1113 if (w3-w5+1<min_read_len) {
1114 return 'Q'; //invalid read
1115 }
1116 //-- keep only the w5..w3 range
1117 l5=w5;
1118 l3=w3;
1119 wseq=wseq.substr(w5, w3-w5+1);
1120 if (!wqv.is_empty())
1121 wqv=wqv.substr(w5, w3-w5+1);
1122 } //qv trimming
1123 // N-trimming on the remaining read seq
1124 if (ntrim(wseq, w5, w3)) {
1125 //GMessage("before: %s\n",wseq.chars());
1126 //GMessage("after : %s (%d)\n",wseq.substr(w5,w3-w5+1).chars(),w3-w5+1);
1127 l5+=w5;
1128 l3-=(wseq.length()-1-w3);
1129 if (w3-w5+1<min_read_len) {
1130 return 'N'; //to be trashed
1131 }
1132 //-- keep only the w5..w3 range
1133 wseq=wseq.substr(w5, w3-w5+1);
1134 if (!wqv.is_empty())
1135 wqv=wqv.substr(w5, w3-w5+1);
1136 w5=0;
1137 w3=wseq.length()-1;
1138 }
1139 char trim_code;
1140 do {
1141 trim_code=0;
1142 if (trim_poly5(wseq, w5, w3, polyA_seed)) {
1143 trim_code='A';
1144 }
1145 else if (trim_poly5(wseq, w5, w3, polyT_seed)) {
1146 trim_code='T';
1147 }
1148 else if (trim_adapter5(wseq, w5, w3)) {
1149 trim_code='5';
1150 }
1151 if (trim_code) {
1152 #ifdef GDEBUG
1153 GMessage("#### TRIM by '%c' code ( w5-w3 = %d-%d ):\n",trim_code, w5,w3);
1154 showTrim(wseq, w5, w3);
1155 #endif
1156 int trimlen=wseq.length()-(w3-w5+1);
1157 num_trimmed5++;
1158 if (trimlen<min_trimmed5)
1159 min_trimmed5=trimlen;
1160 l5+=w5;
1161 l3-=(wseq.length()-1-w3);
1162 if (w3-w5+1<min_read_len) {
1163 return trim_code;
1164 }
1165 //-- keep only the w5..w3 range
1166 wseq=wseq.substr(w5, w3-w5+1);
1167 if (!wqv.is_empty())
1168 wqv=wqv.substr(w5, w3-w5+1);
1169 }// trimmed at 5' end
1170 } while (trim_code);
1171
1172 do {
1173 trim_code=0;
1174 if (trim_poly3(wseq, w5, w3, polyA_seed)) {
1175 trim_code='A';
1176 }
1177 else if (trim_poly3(wseq, w5, w3, polyT_seed)) {
1178 trim_code='T';
1179 }
1180 else if (trim_adapter3(wseq, w5, w3)) {
1181 trim_code='3';
1182 }
1183 if (trim_code) {
1184 #ifdef GDEBUG
1185 GMessage("#### TRIM by '%c' code ( w5-w3 = %d-%d ):\n",trim_code, w5,w3);
1186 showTrim(wseq, w5, w3);
1187 #endif
1188 int trimlen=wseq.length()-(w3-w5+1);
1189 num_trimmed3++;
1190 if (trimlen<min_trimmed3)
1191 min_trimmed3=trimlen;
1192 l5+=w5;
1193 l3-=(wseq.length()-1-w3);
1194 if (w3-w5+1<min_read_len) {
1195 return trim_code;
1196 }
1197 //-- keep only the w5..w3 range
1198 wseq=wseq.substr(w5, w3-w5+1);
1199 if (!wqv.is_empty())
1200 wqv=wqv.substr(w5, w3-w5+1);
1201 }//trimming at 3' end
1202 } while (trim_code);
1203
1204
1205 if (doCollapse) {
1206 //keep read for later
1207 FqDupRec* dr=dhash.Find(wseq.chars());
1208 if (dr==NULL) { //new entry
1209 //if (prefix.is_empty())
1210 dhash.Add(wseq.chars(),
1211 new FqDupRec(&wqv, rname.chars()));
1212 //else dhash.Add(wseq.chars(), new FqDupRec(wqv.chars(),wqv.length()));
1213 }
1214 else
1215 dr->add(wqv);
1216 } //collapsing duplicates
1217 else { //not collapsing duplicates
1218 //apply the dust filter now
1219 if (doDust) {
1220 int dustbases=dust(wseq);
1221 if (dustbases>(wseq.length()>>1)) {
1222 return 'D';
1223 }
1224 }
1225 } //not collapsing duplicates
1226 return (l5>0 || l3<rseq.length()-1) ? 1 : 0;
1227 }
1228
1229 void printHeader(FILE* f_out, char recmarker, GStr& rname, GStr& rinfo) {
1230 //GMessage("printing Header..%c%s\n",recmarker, rname.chars());
1231 if (rinfo.is_empty()) fprintf(f_out, "%c%s\n",recmarker,rname.chars());
1232 else fprintf(f_out, "%c%s %s\n",recmarker, rname.chars(), rinfo.chars());
1233 }
1234
1235 void writeRead(FILE* f_out, GStr& rname, GStr& rinfo, GStr& rseq, GStr& rqv, int& outcounter) {
1236 outcounter++;
1237 bool asFasta=(rqv.is_empty() || fastaOutput);
1238 if (asFasta) {
1239 if (prefix.is_empty()) {
1240 printHeader(f_out, '>',rname,rinfo);
1241 fprintf(f_out, "%s\n", rseq.chars()); //plain one-line fasta for now
1242 }
1243 else {
1244 fprintf(f_out, ">%s%08d\n%s\n", prefix.chars(), outcounter,
1245 rseq.chars());
1246 }
1247 }
1248 else { //fastq
1249 if (convert_phred) convertPhred(rqv);
1250 if (prefix.is_empty()) {
1251 printHeader(f_out, '@', rname, rinfo);
1252 fprintf(f_out, "%s\n+\n%s\n", rseq.chars(), rqv.chars());
1253 }
1254 else
1255 fprintf(f_out, "@%s_%08d\n%s\n+\n%s\n", prefix.chars(), outcounter,
1256 rseq.chars(),rqv.chars() );
1257 }
1258 }
1259
1260 void trash_report(char trashcode, GStr& rname, FILE* freport) {
1261 if (freport==NULL || trashcode<=' ') return;
1262 if (trashcode=='3' || trashcode=='5') {
1263 fprintf(freport, "%s\ta%c\n",rname.chars(),trashcode);
1264 }
1265 else {
1266 fprintf(freport, "%s\t%c\n",rname.chars(),trashcode);
1267 }
1268 //tcounter++;
1269 }
1270
1271 GStr getFext(GStr& s, int* xpos=NULL) {
1272 //s must be a filename without a path
1273 GStr r("");
1274 if (xpos!=NULL) *xpos=0;
1275 if (s.is_empty() || s=="-") return r;
1276 int p=s.rindex('.');
1277 int d=s.rindex('/');
1278 if (p<=0 || p>s.length()-2 || p<s.length()-7 || p<d) return r;
1279 r=s.substr(p+1);
1280 if (xpos!=NULL) *xpos=p+1;
1281 r.lower();
1282 return r;
1283 }
1284
1285 void baseFileName(GStr& fname) {
1286 //remove all known extensions, like .txt,fq,fastq,fasta,fa)(.gz .gzip .bz2 .bzip2) .
1287 int xpos=0;
1288 GStr fext=getFext(fname, &xpos);
1289 if (xpos<=1) return;
1290 bool extdel=false;
1291 GStr f2;
1292 if (fext=="z" || fext=="zip") {
1293 extdel=true;
1294 }
1295 else if (fext.length()>=2) {
1296 f2=fext.substr(0,2);
1297 extdel=(f2=="gz" || f2=="bz");
1298 }
1299 if (extdel) {
1300 fname.cut(xpos-1);
1301 fext=getFext(fname, &xpos);
1302 if (xpos<=1) return;
1303 }
1304 extdel=false;
1305 if (fext=="f" || fext=="fq" || fext=="txt" || fext=="seq" || fext=="sequence") {
1306 extdel=true;
1307 }
1308 else if (fext.length()>=2) {
1309 extdel=(fext.substr(0,2)=="fa");
1310 }
1311 if (extdel) fname.cut(xpos-1);
1312 GStr fncp(fname);
1313 fncp.lower();
1314 fncp.chomp("seq");
1315 fncp.chomp("sequence");
1316 fncp.trimR("_.");
1317 if (fncp.length()<fname.length()) fname.cut(fncp.length());
1318 }
1319
1320 FILE* prepOutFile(GStr& infname, GStr& pocmd) {
1321 FILE* f_out=NULL;
1322 GStr fname(getFileName(infname.chars()));
1323 //eliminate known extensions
1324 baseFileName(fname);
1325 if (outsuffix.is_empty() || outsuffix=="-") { return stdout; }
1326 else if (pocmd.is_empty()) {
1327 GStr oname(fname);
1328 oname.append('.');
1329 oname.append(outsuffix);
1330 f_out=fopen(oname.chars(),"w");
1331 if (f_out==NULL) GError("Error: cannot create '%s'\n",oname.chars());
1332 }
1333 else {
1334 GStr oname(">");
1335 oname.append(fname);
1336 oname.append('.');
1337 oname.append(outsuffix);
1338 pocmd.append(oname);
1339 f_out=popen(pocmd.chars(), "w");
1340 if (f_out==NULL) GError("Error: cannot popen '%s'\n",pocmd.chars());
1341 }
1342 return f_out;
1343 }
1344
1345 void guess_unzip(GStr& fname, GStr& picmd) {
1346 GStr fext=getFext(fname);
1347 if (fext=="gz" || fext=="gzip" || fext=="z") {
1348 picmd="gzip -cd ";
1349 }
1350 else if (fext=="bz2" || fext=="bzip2" || fext=="bz" || fext=="bzip") {
1351 picmd="bzip2 -cd ";
1352 }
1353 }
1354
1355 void addAdapter(GVec<CASeqData>& adapters, GStr& seq) {
1356 //TODO: prepare CASeqData here, and collect hexamers as well
1357 CASeqData adata(revCompl);
1358 int idx=adapters.Add(adata);
1359 if (idx<0) GError("Error: failed to add adaptor!\n");
1360 adapters[idx].update(seq.chars());
1361 }
1362
1363
1364 int loadAdapters(const char* fname) {
1365 GLineReader lr(fname);
1366 char* l;
1367 while ((l=lr.nextLine())!=NULL) {
1368 if (lr.length()<=3 || l[0]=='#') continue;
1369 if ( l[0]==' ' || l[0]=='\t' || l[0]==',' ||
1370 l[0]==';'|| l[0]==':' ) {
1371 int i=1;
1372 while (l[i]!=0 && isspace(l[i])) {
1373 i++;
1374 }
1375 if (l[i]!=0) {
1376 GStr s(&(l[i]));
1377 #ifdef GDEBUG
1378 //s.reverse();
1379 #endif
1380 addAdapter(adapters3, s);
1381 continue;
1382 }
1383 }
1384 else {
1385 GStr s(l);
1386 s.startTokenize("\t ;,:");
1387 GStr a5,a3;
1388 if (s.nextToken(a5))
1389 s.nextToken(a3);
1390 a5.upper();
1391 a3.upper();
1392 #ifdef GDEBUG
1393 // a5.reverse();
1394 // a3.reverse();
1395 #endif
1396 addAdapter(adapters5, a5);
1397 addAdapter(adapters3, a3);
1398 }
1399 }
1400 return adapters5.Count()+adapters3.Count();
1401 }
1402
1403 void setupFiles(FILE*& f_in, FILE*& f_in2, FILE*& f_out, FILE*& f_out2,
1404 GStr& s, GStr& infname, GStr& infname2) {
1405 // uses outsuffix to generate output file names and open file handles as needed
1406 infname="";
1407 infname2="";
1408 f_in=NULL;
1409 f_in2=NULL;
1410 f_out=NULL;
1411 f_out2=NULL;
1412 //analyze outsuffix intent
1413 GStr pocmd;
1414 if (outsuffix=="-") {
1415 f_out=stdout;
1416 }
1417 else {
1418 GStr ox=getFext(outsuffix);
1419 if (ox.length()>2) ox=ox.substr(0,2);
1420 if (ox=="gz") pocmd="gzip -9 -c ";
1421 else if (ox=="bz") pocmd="bzip2 -9 -c ";
1422 }
1423 if (s=="-") {
1424 f_in=stdin;
1425 infname="stdin";
1426 f_out=prepOutFile(infname, pocmd);
1427 return;
1428 } // streaming from stdin
1429 s.startTokenize(",:");
1430 s.nextToken(infname);
1431 bool paired=s.nextToken(infname2);
1432 if (fileExists(infname.chars())==0)
1433 GError("Error: cannot find file %s!\n",infname.chars());
1434 GStr fname(getFileName(infname.chars()));
1435 GStr picmd;
1436 guess_unzip(fname, picmd);
1437 if (picmd.is_empty()) {
1438 f_in=fopen(infname.chars(), "r");
1439 if (f_in==NULL) GError("Error opening file '%s'!\n",infname.chars());
1440 }
1441 else {
1442 picmd.append(infname);
1443 f_in=popen(picmd.chars(), "r");
1444 if (f_in==NULL) GError("Error at popen %s!\n", picmd.chars());
1445 }
1446 if (f_out==stdout) {
1447 if (paired) GError("Error: output suffix required for paired reads\n");
1448 return;
1449 }
1450 f_out=prepOutFile(infname, pocmd);
1451 if (!paired) return;
1452 if (doCollapse) GError("Error: sorry, -C option cannot be used with paired reads!\n");
1453 // ---- paired reads:-------------
1454 if (fileExists(infname2.chars())==0)
1455 GError("Error: cannot find file %s!\n",infname2.chars());
1456 picmd="";
1457 GStr fname2(getFileName(infname2.chars()));
1458 guess_unzip(fname2, picmd);
1459 if (picmd.is_empty()) {
1460 f_in2=fopen(infname2.chars(), "r");
1461 if (f_in2==NULL) GError("Error opening file '%s'!\n",infname2.chars());
1462 }
1463 else {
1464 picmd.append(infname2);
1465 f_in2=popen(picmd.chars(), "r");
1466 if (f_in2==NULL) GError("Error at popen %s!\n", picmd.chars());
1467 }
1468 f_out2=prepOutFile(infname2, pocmd);
1469 }