ViewVC Help
View File | Revision Log | Show Annotations | View Changeset | Root Listing
root/gclib/fqtrim/fqtrim.cpp
Revision: 182
Committed: Thu Feb 16 21:33:27 2012 UTC (7 years, 9 months ago) by gpertea
File size: 44004 byte(s)
Log Message:
Line File contents
1 #include "GArgs.h"
2 #include "GStr.h"
3 #include "GHash.hh"
4 #include "GList.hh"
5 #include <ctype.h>
6 #include "GAlnExtend.h"
7
8 #define USAGE "Usage:\n\
9 fqtrim [{-5 <5adaptor> -3 <3adaptor>|-f <adaptors_file>}] [-a <min_matchlen>]\\\n\
10 [-R] [-q <minq> [-t <trim_max_len>]] [-p {64|33}] [-o <outsuffix>]\\\n\
11 [-l <minlen>] [-C] [-D] [-Q] [-n <rename_prefix>] [-r <discarded.lst>]\\\n\
12 <input.fq>[,<input_mates.fq>\n\
13 \n\
14 Trim low quality bases at the 3' end and can trim adaptor sequence(s), filter\n\
15 for low complexity and collapse duplicate reads.\n\
16 If read pairs should be trimmed and kept together (i.e. without discarding\n\
17 one read in a pair), the two file names should be given delimited by a comma\n\
18 or a colon character\n\
19 \n\
20 Options:\n\
21 -n rename all the reads using the <prefix> followed by a read counter;\n\
22 if -C option was given, the suffix \"_x<N>\" is appended, with <N> being\n\
23 the read duplication count\n\
24 -o unless this parameter is '-', write the trimmed/filtered reads to \n\
25 file(s) named <input>.<outsuffix> which will be created in the \n\
26 current (working) directory; (writes to stdout if -o- is given);\n\
27 a suffix ending with .gz, .gzip or .bz2 will enforce compression\n\
28 -f file with adaptor sequences to trim, each line having this format:\n\
29 <5'-adaptor-sequence> <3'-adaptor-sequence>\n\
30 -5 trim the given adaptor or primer sequence at the 5' end of each read\n\
31 (e.g. -5 CGACAGGTTCAGAGTTCTACAGTCCGACGATC)\n\
32 -3 trim the given adaptor sequence at the 3' end of each read\n\
33 (e.g. -3 TCGTATGCCGTCTTCTGCTTG)\n\
34 -A disable polyA/T trimming (enabled by default)\n\
35 -y minimum length of exact match to adaptor sequence at the proper end (6)\n\
36 -q trim bases with quality value lower than <minq> (starting at the 3' end)\n\
37 -t for -q option, maximum trimming at the 3' end is limited to <trim_max_len>\n\
38 -m maximum percentage of Ns allowed in a read after trimming (default 7)\n\
39 -l minimum \"clean\" length after trimming that a read must have\n\
40 in order to pass the filter (default: 16)\n\
41 -r write a simple \"trash report\" file listing the discarded reads with a\n\
42 one letter code for the reason why they were trashed\n\
43 -D apply a low-complexity (dust) filter and discard any read that has over\n\
44 50% of its length detected as low complexity\n\
45 -C collapse duplicate reads and add a prefix with their count to the read\n\
46 name (e.g. for microRNAs)\n\
47 -p input is phred64/phred33 (use -p64 or -p33)\n\
48 -Q convert quality values to the other Phred qv type\n\
49 -V verbose processing\n\
50 "
51
52 //-z for -o option, the output stream(s) will be first piped into the given\n
53 // <zcmd> command, which must output to stdout (e.g. -z 'bzip2 -9 -c')\n
54
55
56 // example 3' adaptor for miRNAs: TCGTATGCCGTCTTCTGCTTG
57
58 //For paired reads sequencing:
59 //3' : ACACTCTTTCCCTACACGACGCTCTTCCGATCT
60 //5' : GATCGGAAGAGCGGTTCAGCAGGAATGCCGAG
61 //FILE* f_out=NULL; //stdout if not provided
62 //FILE* f_out2=NULL; //for paired reads
63 //FILE* f_in=NULL; //input fastq (stdin if not provided)
64 //FILE* f_in2=NULL; //for paired reads
65
66 FILE* freport=NULL;
67
68 bool debug=false;
69 bool verbose=false;
70 bool doCollapse=false;
71 bool doDust=false;
72 bool doPolyTrim=true;
73 bool fastaOutput=false;
74 bool trashReport=false;
75 bool revCompl=false; //also reverse complement adaptor sequences
76 int min_read_len=16;
77 double max_perc_N=7.0;
78 int dust_cutoff=16;
79 bool isfasta=false;
80 bool convert_phred=false;
81 GStr outsuffix; // -o
82 GStr prefix;
83 GStr zcmd;
84 int num_trimmed5=0;
85 int num_trimmed3=0;
86 int num_trimmedN=0;
87 int num_trimmedQ=0;
88 int min_trimmed5=INT_MAX;
89 int min_trimmed3=INT_MAX;
90
91 int qvtrim_qmin=0;
92 int qvtrim_max=0; //for -q, do not trim at 3'-end more than this number of bases
93 int qv_phredtype=0; // could be 64 or 33 (0 means undetermined yet)
94 int qv_cvtadd=0; //could be -31 or +31
95
96 // adaptor matching metrics -- for X-drop ungapped extension
97 //const int match_reward=2;
98 //const int mismatch_penalty=3;
99 const int match_reward=2;
100 const int mismatch_penalty=4;
101 const int Xdrop=10;
102
103 const int poly_m_score=2; //match score
104 const int poly_mis_score=-3; //mismatch
105 const int poly_dropoff_score=7;
106 int poly_minScore=12; //i.e. an exact match of 6 bases at the proper ends WILL be trimmed
107
108 const char *polyA_seed="AAAA";
109 const char *polyT_seed="TTTT";
110
111 struct CASeqData {
112 //positional data for every possible hexamer in an adaptor
113 GVec<uint16>* pz[4096]; //0-based coordinates of all possible hexamers in the adaptor sequence
114 GVec<uint16>* pzr[4096]; //0-based coordinates of all possible hexamers for the reverse complement of the adaptor sequence
115 GStr seq; //actual adaptor sequence data
116 GStr seqr; //reverse complement sequence
117 int amlen; //fraction of adaptor length matching that's
118 //enough to consider the alignment
119 GAlnTrimType trim_type;
120 bool use_reverse;
121 CASeqData(bool rev=false):seq(),seqr(),
122 amlen(0), use_reverse(rev) {
123 trim_type=galn_None; //should be updated later!
124 for (int i=0;i<4096;i++) {
125 pz[i]=NULL;
126 pzr[i]=NULL;
127 }
128 }
129
130 void update(const char* s) {
131 seq=s;
132 table6mers(seq.chars(), seq.length(), pz);
133 amlen=iround(double(seq.length())*0.8);
134 if (amlen<12)
135 amlen=12;
136 if (!use_reverse) return;
137 //reverse complement
138 seqr=s;
139 int slen=seq.length();
140 for (int i=0;i<slen;i++)
141 seqr[i]=ntComplement(seq[slen-i-1]);
142 table6mers(seqr.chars(), seqr.length(), pzr);
143 }
144
145 ~CASeqData() {
146 for (int i=0;i<4096;i++) {
147 delete pz[i];
148 delete pzr[i];
149 }
150 }
151 };
152
153 GVec<CASeqData> adaptors5;
154 GVec<CASeqData> adaptors3;
155
156 CGreedyAlignData* gxmem_l=NULL;
157 CGreedyAlignData* gxmem_r=NULL;
158
159 // element in dhash:
160 class FqDupRec {
161 public:
162 int count; //how many of these reads are the same
163 int len; //length of qv
164 char* firstname; //optional, only if we want to keep the original read names
165 char* qv;
166 FqDupRec(GStr* qstr=NULL, const char* rname=NULL) {
167 len=0;
168 qv=NULL;
169 firstname=NULL;
170 count=0;
171 if (qstr!=NULL) {
172 qv=Gstrdup(qstr->chars());
173 len=qstr->length();
174 count++;
175 }
176 if (rname!=NULL) firstname=Gstrdup(rname);
177 }
178 ~FqDupRec() {
179 GFREE(qv);
180 GFREE(firstname);
181 }
182 void add(GStr& d) { //collapse another record into this one
183 if (d.length()!=len)
184 GError("Error at FqDupRec::add(): cannot collapse reads with different length!\n");
185 count++;
186 for (int i=0;i<len;i++)
187 qv[i]=(qv[i]+d[i])/2;
188 }
189 };
190
191 void openfw(FILE* &f, GArgs& args, char opt) {
192 GStr s=args.getOpt(opt);
193 if (!s.is_empty()) {
194 if (s=='-') f=stdout;
195 else {
196 f=fopen(s.chars(),"w");
197 if (f==NULL) GError("Error creating file: %s\n", s.chars());
198 }
199 }
200 }
201
202 #define FWCLOSE(fh) if (fh!=NULL && fh!=stdout) fclose(fh)
203 #define FRCLOSE(fh) if (fh!=NULL && fh!=stdin) fclose(fh)
204
205 GHash<FqDupRec> dhash; //hash to keep track of duplicates
206
207 void addAdaptor(GVec<CASeqData>& adaptors, GStr& seq, GAlnTrimType trim_type);
208 int loadAdaptors(const char* fname);
209
210 void setupFiles(FILE*& f_in, FILE*& f_in2, FILE*& f_out, FILE*& f_out2,
211 GStr& s, GStr& infname, GStr& infname2);
212 // uses outsuffix to generate output file names and open file handles as needed
213
214 void writeRead(FILE* f_out, GStr& rname, GStr& rinfo, GStr& rseq, GStr& rqv, int& outcounter);
215 void trash_report(char trashcode, GStr& rname, FILE* freport);
216
217 bool getFastxRec(GLineReader& fq, GStr& rseq, GStr& rqv,
218 GStr& rname, GStr& rinfo, GStr& infname);
219
220 char process_read(GStr& rname, GStr& rseq, GStr& rqv, int &l5, int &l3);
221 //returns 0 if the read was untouched, 1 if it was trimmed and a trash code if it was trashed
222
223 bool ntrim(GStr& rseq, int &l5, int &l3); //returns true if any trimming occured
224 bool qtrim(GStr& qvs, int &l5, int &l3); //return true if any trimming occured
225 int dust(GStr& seq);
226 bool trim_poly5(GStr &seq, int &l5, int &l3, const char* poly_seed); //returns true if any trimming occured
227 bool trim_poly3(GStr &seq, int &l5, int &l3, const char* poly_seed);
228 bool trim_adaptor5(GStr& seq, int &l5, int &l3); //returns true if any trimming occured
229 bool trim_adaptor3(GStr& seq, int &l5, int &l3);
230
231 void convertPhred(char* q, int len);
232 void convertPhred(GStr& q);
233
234 int main(int argc, char * const argv[]) {
235 GArgs args(argc, argv, "YQDCRVAl:d:3:5:m:n:r:p:q:f:t:o:z:a:");
236 int e;
237 if ((e=args.isError())>0) {
238 GMessage("%s\nInvalid argument: %s\n", USAGE, argv[e]);
239 exit(224);
240 }
241 debug=(args.getOpt('Y')!=NULL);
242 verbose=(args.getOpt('V')!=NULL);
243 convert_phred=(args.getOpt('Q')!=NULL);
244 doCollapse=(args.getOpt('C')!=NULL);
245 doDust=(args.getOpt('D')!=NULL);
246 revCompl=(args.getOpt('R')!=NULL);
247 if (args.getOpt('A')) doPolyTrim=false;
248 /*
249 rawFormat=(args.getOpt('R')!=NULL);
250 if (rawFormat) {
251 GError("Sorry, raw qseq format parsing is not implemented yet!\n");
252 }
253 */
254 prefix=args.getOpt('n');
255 GStr s=args.getOpt('l');
256 if (!s.is_empty())
257 min_read_len=s.asInt();
258 s=args.getOpt('m');
259 if (!s.is_empty())
260 max_perc_N=s.asDouble();
261 s=args.getOpt('d');
262 if (!s.is_empty()) {
263 dust_cutoff=s.asInt();
264 doDust=true;
265 }
266 s=args.getOpt('q');
267 if (!s.is_empty()) {
268 qvtrim_qmin=s.asInt();
269 }
270 s=args.getOpt('t');
271 if (!s.is_empty()) {
272 qvtrim_max=s.asInt();
273 }
274 s=args.getOpt('p');
275 if (!s.is_empty()) {
276 int v=s.asInt();
277 if (v==33) {
278 qv_phredtype=33;
279 qv_cvtadd=31;
280 }
281 else if (v==64) {
282 qv_phredtype=64;
283 qv_cvtadd=-31;
284 }
285 else
286 GMessage("%s\nInvalid value for -p option (can only be 64 or 33)!\n",USAGE);
287 }
288 s=args.getOpt('f');
289 if (!s.is_empty()) {
290 loadAdaptors(s.chars());
291 }
292 bool fileAdaptors=adaptors5.Count()+adaptors3.Count();
293 s=args.getOpt('5');
294 if (!s.is_empty()) {
295 if (fileAdaptors)
296 GError("Error: options -5 and -f cannot be used together!\n");
297 s.upper();
298 addAdaptor(adaptors5, s, galn_TrimLeft);
299 }
300 s=args.getOpt('3');
301 if (!s.is_empty()) {
302 if (fileAdaptors)
303 GError("Error: options -3 and -f cannot be used together!\n");
304 s.upper();
305 addAdaptor(adaptors3, s, galn_TrimRight);
306 }
307 s=args.getOpt('y');
308 if (!s.is_empty()) {
309 int minmatch=s.asInt();
310 poly_minScore=minmatch*poly_m_score;
311 }
312
313 if (args.getOpt('o')!=NULL) outsuffix=args.getOpt('o');
314 else outsuffix="-";
315 trashReport= (args.getOpt('r')!=NULL);
316 int fcount=args.startNonOpt();
317 if (fcount==0) {
318 GMessage(USAGE);
319 exit(224);
320 }
321 if (fcount>1 && doCollapse) {
322 GError("%s Sorry, the -C option only works with a single input.\n", USAGE);
323 }
324 //openfw(f_out, args, 'o');
325 //if (f_out==NULL) f_out=stdout;
326 if (trashReport)
327 openfw(freport, args, 'r');
328 char* infile=NULL;
329
330 if (adaptors5.Count()>0)
331 //gxmem_l=new CGreedyAlignData(match_reward, mismatch_penalty, Xdrop-2);
332 gxmem_l=new CGreedyAlignData(match_reward, mismatch_penalty, Xdrop);
333 if (adaptors3.Count()>0)
334 gxmem_r=new CGreedyAlignData(match_reward, mismatch_penalty, Xdrop);
335
336 while ((infile=args.nextNonOpt())!=NULL) {
337 //for each input file
338 int incounter=0; //counter for input reads
339 int outcounter=0; //counter for output reads
340 int trash_s=0; //too short from the get go
341 int trash_Q=0;
342 int trash_N=0;
343 int trash_D=0;
344 int trash_poly=0;
345 int trash_A3=0;
346 int trash_A5=0;
347 s=infile;
348 GStr infname;
349 GStr infname2;
350 FILE* f_in=NULL;
351 FILE* f_in2=NULL;
352 FILE* f_out=NULL;
353 FILE* f_out2=NULL;
354 bool paired_reads=false;
355 setupFiles(f_in, f_in2, f_out, f_out2, s, infname, infname2);
356 GLineReader fq(f_in);
357 GLineReader* fq2=NULL;
358 if (f_in2!=NULL) {
359 fq2=new GLineReader(f_in2);
360 paired_reads=true;
361 }
362 GStr rseq, rqv, seqid, seqinfo;
363 GStr rseq2, rqv2, seqid2, seqinfo2;
364 while (getFastxRec(fq, rseq, rqv, seqid, seqinfo, infname)) {
365 incounter++;
366 int a5=0, a3=0, b5=0, b3=0;
367 char tcode=0, tcode2=0;
368 tcode=process_read(seqid, rseq, rqv, a5, a3);
369 if (fq2!=NULL) {
370 getFastxRec(*fq2, rseq2, rqv2, seqid2, seqinfo2, infname2);
371 if (seqid.substr(0,seqid.length()-1)!=seqid2.substr(0,seqid2.length()-1)) {
372 GError("Error: no paired match for read %s vs %s (%s,%s)\n",
373 seqid.chars(), seqid2.chars(), infname.chars(), infname2.chars());
374 }
375 tcode2=process_read(seqid2, rseq2, rqv2, b5, b3);
376 //decide what to do with this pair and print rseq2 if the pair makes it
377 if (tcode>1 && tcode2<=1) {
378 //"untrash" rseq
379 if (a3-a5+1<min_read_len) {
380 a5=1;
381 if (a3<min_read_len) { a3= GMIN(rseq.length()-1, min_read_len+1); }
382 }
383 tcode=1;
384 }
385 else if (tcode<=1 && tcode2>1) {
386 //"untrash" rseq2
387 if (b3-b5+1<min_read_len) {
388 b5=1;
389 if (b3<min_read_len) { b3= GMIN((rseq2.length()-1),(min_read_len+1)); }
390 }
391 tcode2=1;
392 }
393 if (tcode<=1) { //trimmed or left intact -- write it!
394 if (tcode>0) {
395 rseq2=rseq2.substr(b5,b3-b5+1);
396 if (!rqv2.is_empty()) rqv2=rqv2.substr(b5,b3-b5+1);
397 }
398 int nocounter=0;
399 writeRead(f_out2, seqid2, seqinfo2, rseq2, rqv2, nocounter);
400 }
401 } //pair read
402 if (tcode>1) { //trashed
403 #ifdef GDEBUG
404 GMessage(" !!!!TRASH code = %c\n",tcode);
405 #endif
406 if (tcode=='s') trash_s++;
407 else if (tcode=='A' || tcode=='T') trash_poly++;
408 else if (tcode=='Q') trash_Q++;
409 else if (tcode=='N') trash_N++;
410 else if (tcode=='D') trash_D++;
411 else if (tcode=='3') trash_A3++;
412 else if (tcode=='5') trash_A5++;
413 if (trashReport) trash_report(tcode, seqid, freport);
414 }
415 else if (!doCollapse) { //write it
416 if (tcode>0) {
417 rseq=rseq.substr(a5,a3-a5+1);
418 if (!rqv.is_empty()) rqv=rqv.substr(a5,a3-a5+1);
419 }
420 #ifdef GDEBUG
421 GMessage(" After trimming:\n");
422 GMessage("%s\n",rseq.chars());
423 #endif
424 writeRead(f_out, seqid, seqinfo, rseq, rqv, outcounter);
425 }
426 } //for each fastq record
427 delete fq2;
428 FRCLOSE(f_in);
429 FRCLOSE(f_in2);
430 if (doCollapse) {
431 outcounter=0;
432 int maxdup_count=1;
433 char* maxdup_seq=NULL;
434 dhash.startIterate();
435 FqDupRec* qd=NULL;
436 char* seq=NULL;
437 while ((qd=dhash.NextData(seq))!=NULL) {
438 GStr rseq(seq);
439 //do the dusting here
440 if (doDust) {
441 int dustbases=dust(rseq);
442 if (dustbases>(rseq.length()>>1)) {
443 if (trashReport && qd->firstname!=NULL) {
444 fprintf(freport, "%s_x%d\tD\n",qd->firstname, qd->count);
445 }
446 trash_D+=qd->count;
447 continue;
448 }
449 }
450 outcounter++;
451 if (qd->count>maxdup_count) {
452 maxdup_count=qd->count;
453 maxdup_seq=seq;
454 }
455 if (isfasta) {
456 if (prefix.is_empty()) {
457 fprintf(f_out, ">%s_x%d\n%s\n", qd->firstname, qd->count,
458 rseq.chars());
459 }
460 else { //use custom read name
461 fprintf(f_out, ">%s%08d_x%d\n%s\n", prefix.chars(), outcounter,
462 qd->count, rseq.chars());
463 }
464 }
465 else { //fastq format
466 if (convert_phred) convertPhred(qd->qv, qd->len);
467 if (prefix.is_empty()) {
468 fprintf(f_out, "@%s_x%d\n%s\n+\n%s\n", qd->firstname, qd->count,
469 rseq.chars(), qd->qv);
470 }
471 else { //use custom read name
472 fprintf(f_out, "@%s%08d_x%d\n%s\n+\n%s\n", prefix.chars(), outcounter,
473 qd->count, rseq.chars(), qd->qv);
474 }
475 }
476 }//for each element of dhash
477 if (maxdup_count>1) {
478 GMessage("Maximum read multiplicity: x %d (read: %s)\n",maxdup_count, maxdup_seq);
479 }
480 } //collapse entries
481 if (verbose) {
482 if (paired_reads) {
483 GMessage(">Input files : %s , %s\n", infname.chars(), infname2.chars());
484 GMessage("Number of input pairs :%9d\n", incounter);
485 GMessage(" Output pairs :%9d\n", outcounter);
486 }
487 else {
488 GMessage(">Input file : %s\n", infname.chars());
489 GMessage("Number of input reads :%9d\n", incounter);
490 GMessage(" Output reads :%9d\n", outcounter);
491 }
492 GMessage("------------------------------------\n");
493 if (num_trimmed5)
494 GMessage(" 5' trimmed :%9d (min. trim: %d)\n", num_trimmed5, min_trimmed5);
495 if (num_trimmed3)
496 GMessage(" 3' trimmed :%9d (min. trim: %d)\n", num_trimmed3, min_trimmed3);
497 GMessage("------------------------------------\n");
498 if (trash_s>0)
499 GMessage(" Trashed by length:%9d\n", trash_s);
500 if (trash_N>0)
501 GMessage(" Trashed by N%%:%9d\n", trash_N);
502 if (trash_Q>0)
503 GMessage("Trashed by low quality:%9d\n", trash_Q);
504 if (trash_poly>0)
505 GMessage(" Trashed by poly-A/T:%9d\n", trash_poly);
506 if (trash_A5>0)
507 GMessage(" Trashed by 5' adaptor:%9d\n", trash_A5);
508 if (trash_A3>0)
509 GMessage(" Trashed by 3' adaptor:%9d\n", trash_A3);
510 }
511 if (trashReport) {
512 FWCLOSE(freport);
513 }
514 FWCLOSE(f_out);
515 FWCLOSE(f_out2);
516 } //while each input file
517 delete gxmem_l;
518 delete gxmem_r;
519 //getc(stdin);
520 }
521
522 class NData {
523 public:
524 int NPos[1024]; //there should be no reads longer than 1K ?
525 int NCount;
526 int end5;
527 int end3;
528 int seqlen;
529 double perc_N; //percentage of Ns in end5..end3 range only!
530 const char* seq;
531 bool valid;
532 NData() {
533 seqlen=0;
534 NCount=0;
535 end5=0;
536 end3=0;
537 seq=NULL;
538 perc_N=0;
539 valid=true;
540 }
541 void init(GStr& rseq) {
542 NCount=0;
543 perc_N=0;
544 seqlen=rseq.length();
545 seq=rseq.chars();
546 end5=1;
547 end3=seqlen;
548 for (int i=0;i<seqlen;i++)
549 if (seq[i]=='N') {// if (!ichrInStr(rseq[i], "ACGT")
550 NPos[NCount]=i;
551 NCount++;
552 }
553 perc_N=(NCount*100.0)/seqlen;
554 valid=true;
555 }
556 void N_calc() { //only in the region after trimming
557 int n=0;
558 for (int i=end3-1;i<end5;i++) {
559 if (seq[i]=='N') n++;
560 }
561 perc_N=(n*100.0)/(end5-end3+1);
562 }
563 };
564
565 static NData feat;
566 int perc_lenN=12; // incremental distance from ends, in percentage of
567 // sequence length, where N-trimming is done (default:12 %) (autolimited to 20)
568
569 void N_analyze(int l5, int l3, int p5, int p3) {
570 /* assumes feat was filled properly */
571 int old_dif, t5,t3,v;
572 if (l3<l5+2 || p5>p3 ) {
573 feat.end5=l5+1;
574 feat.end3=l3+1;
575 return;
576 }
577
578 t5=feat.NPos[p5]-l5;
579 t3=l3-feat.NPos[p3];
580 old_dif=p3-p5;
581 v=(int)((((double)(l3-l5))*perc_lenN)/100);
582 if (v>20) v=20; /* enforce N-search limit for very long reads */
583 if (t5 < v ) {
584 l5=feat.NPos[p5]+1;
585 p5++;
586 }
587 if (t3 < v) {
588 l3=feat.NPos[p3]-1;
589 p3--;
590 }
591 /* restNs=p3-p5; number of Ns in the new CLR */
592 if (p3-p5==old_dif) { /* no change, return */
593 /* don't trim if this may shorten the read too much */
594 //if (l5-l3<min_read_len) return;
595 feat.end5=l5+1;
596 feat.end3=l3+1;
597 return;
598 }
599 else
600 N_analyze(l5,l3, p5,p3);
601 }
602
603
604 bool qtrim(GStr& qvs, int &l5, int &l3) {
605 if (qvtrim_qmin==0 || qvs.is_empty()) return false;
606 if (qv_phredtype==0) {
607 //try to guess the Phred type
608 int vmin=256, vmax=0;
609 for (int i=0;i<qvs.length();i++) {
610 if (vmin>qvs[i]) vmin=qvs[i];
611 if (vmax<qvs[i]) vmax=qvs[i];
612 }
613 if (vmin<64) { qv_phredtype=33; qv_cvtadd=31; }
614 if (vmax>95) { qv_phredtype=64; qv_cvtadd=-31; }
615 if (qv_phredtype==0) {
616 GError("Error: couldn't determine Phred type, please use the -p33 or -p64 !\n");
617 }
618 if (verbose)
619 GMessage("Input reads have Phred-%d quality values.\n", (qv_phredtype==33 ? 33 : 64));
620 } //guessing Phred type
621 for (l3=qvs.length()-1;l3>2;l3--) {
622 if (qvs[l3]-qv_phredtype>=qvtrim_qmin && qvs[l3-1]-qv_phredtype>=qvtrim_qmin) break;
623 }
624 //just in case, check also the 5' the end (?)
625 for (l5=0;l5<qvs.length()-3;l5++) {
626 if (qvs[l5]-qv_phredtype>=qvtrim_qmin && qvs[l5+1]-qv_phredtype>=qvtrim_qmin) break;
627 }
628 if (qvtrim_max>0) {
629 if (qvs.length()-1-l3>qvtrim_max) l3=qvs.length()-1-qvtrim_max;
630 if (l5>qvtrim_max) l5=qvtrim_max;
631 }
632 return (l5>0 || l3<qvs.length()-1);
633 }
634
635 bool ntrim(GStr& rseq, int &l5, int &l3) {
636 //count Ns in the sequence, trim N-rich ends
637 feat.init(rseq);
638 l5=feat.end5-1;
639 l3=feat.end3-1;
640 N_analyze(feat.end5-1, feat.end3-1, 0, feat.NCount-1);
641 if (l5==feat.end5-1 && l3==feat.end3-1) {
642 if (feat.perc_N>max_perc_N) {
643 feat.valid=false;
644 l3=l5+1;
645 return true;
646 }
647 else {
648 return false; //nothing changed
649 }
650 }
651 l5=feat.end5-1;
652 l3=feat.end3-1;
653 if (l3-l5+1<min_read_len) {
654 feat.valid=false;
655 return true;
656 }
657 feat.N_calc();
658
659 if (feat.perc_N>max_perc_N) {
660 feat.valid=false;
661 l3=l5+1;
662 return true;
663 }
664 return true;
665 }
666
667 //--------------- dust functions ----------------
668 class DNADuster {
669 public:
670 int dustword;
671 int dustwindow;
672 int dustwindow2;
673 int dustcutoff;
674 int mv, iv, jv;
675 int counts[32*32*32];
676 int iis[32*32*32];
677 DNADuster(int cutoff=16, int winsize=32, int wordsize=3) {
678 dustword=wordsize;
679 dustwindow=winsize;
680 dustwindow2 = (winsize>>1);
681 dustcutoff=cutoff;
682 mv=0;
683 iv=0;
684 jv=0;
685 }
686 void setWindowSize(int value) {
687 dustwindow = value;
688 dustwindow2 = (dustwindow >> 1);
689 }
690 void setWordSize(int value) {
691 dustword=value;
692 }
693 void wo1(int len, const char* s, int ivv) {
694 int i, ii, j, v, t, n, n1, sum;
695 int js, nis;
696 n = 32 * 32 * 32;
697 n1 = n - 1;
698 nis = 0;
699 i = 0;
700 ii = 0;
701 sum = 0;
702 v = 0;
703 for (j=0; j < len; j++, s++) {
704 ii <<= 5;
705 if (*s<=32) {
706 i=0;
707 continue;
708 }
709 ii |= *s - 'A'; //assume uppercase!
710 ii &= n1;
711 i++;
712 if (i >= dustword) {
713 for (js=0; js < nis && iis[js] != ii; js++) ;
714 if (js == nis) {
715 iis[nis] = ii;
716 counts[ii] = 0;
717 nis++;
718 }
719 if ((t = counts[ii]) > 0) {
720 sum += t;
721 v = 10 * sum / j;
722 if (mv < v) {
723 mv = v;
724 iv = ivv;
725 jv = j;
726 }
727 }
728 counts[ii]++;
729 }
730 }
731 }
732
733 int wo(int len, const char* s, int* beg, int* end) {
734 int i, l1;
735 l1 = len - dustword + 1;
736 if (l1 < 0) {
737 *beg = 0;
738 *end = len - 1;
739 return 0;
740 }
741 mv = 0;
742 iv = 0;
743 jv = 0;
744 for (i=0; i < l1; i++) {
745 wo1(len-i, s+i, i);
746 }
747 *beg = iv;
748 *end = iv + jv;
749 return mv;
750 }
751
752 void dust(const char* seq, char* seqmsk, int seqlen, int cutoff=0) { //, maskFunc maskfn) {
753 int i, j, l, a, b, v;
754 if (cutoff==0) cutoff=dustcutoff;
755 a=0;b=0;
756 //GMessage("Dust cutoff=%d\n", cutoff);
757 for (i=0; i < seqlen; i += dustwindow2) {
758 l = (seqlen > i+dustwindow) ? dustwindow : seqlen-i;
759 v = wo(l, seq+i, &a, &b);
760 if (v > cutoff) {
761 //for (j = a; j <= b && j < dustwindow2; j++) {
762 for (j = a; j <= b; j++) {
763 seqmsk[i+j]='N';//could be made lowercase instead
764 }
765 }
766 }
767 //return first;
768 }
769 };
770
771 static DNADuster duster;
772
773 int dust(GStr& rseq) {
774 char* seq=Gstrdup(rseq.chars());
775 duster.dust(rseq.chars(), seq, rseq.length(), dust_cutoff);
776 //check the number of Ns:
777 int ncount=0;
778 for (int i=0;i<rseq.length();i++) {
779 if (seq[i]=='N') ncount++;
780 }
781 GFREE(seq);
782 return ncount;
783 }
784
785 struct SLocScore {
786 int pos;
787 int score;
788 SLocScore(int p=0,int s=0) {
789 pos=p;
790 score=s;
791 }
792 void set(int p, int s) {
793 pos=p;
794 score=s;
795 }
796 void add(int p, int add) {
797 pos=p;
798 score+=add;
799 }
800 };
801
802 bool trim_poly3(GStr &seq, int &l5, int &l3, const char* poly_seed) {
803 if (!doPolyTrim) return false;
804 int rlen=seq.length();
805 l5=0;
806 l3=rlen-1;
807 int32 seedVal=*(int32*)poly_seed;
808 char polyChar=poly_seed[0];
809 //assumes N trimming was already done
810 //so a poly match should be very close to the end of the read
811 // -- find the initial match (seed)
812 int lmin=GMAX((rlen-16), 0);
813 int li;
814 for (li=rlen-4;li>lmin;li--) {
815 if (seedVal==*(int*)&(seq[li])) {
816 break;
817 }
818 }
819 if (li<=lmin) return false;
820 //seed found, try to extend it both ways
821 //extend right
822 int ri=li+3;
823 SLocScore loc(ri, poly_m_score<<2);
824 SLocScore maxloc(loc);
825 //extend right
826 while (ri<rlen-1) {
827 ri++;
828 if (seq[ri]==polyChar) {
829 loc.add(ri,poly_m_score);
830 }
831 else if (seq[ri]=='N') {
832 loc.add(ri,0);
833 }
834 else { //mismatch
835 loc.add(ri,poly_mis_score);
836 if (maxloc.score-loc.score>poly_dropoff_score) break;
837 }
838 if (maxloc.score<=loc.score) {
839 maxloc=loc;
840 }
841 }
842 ri=maxloc.pos;
843 if (ri<rlen-6) return false; //no trimming wanted, too far from 3' end
844 //ri = right boundary for the poly match
845 //extend left
846 loc.set(li, maxloc.score);
847 maxloc.pos=li;
848 while (li>0) {
849 li--;
850 if (seq[li]==polyChar) {
851 loc.add(li,poly_m_score);
852 }
853 else if (seq[li]=='N') {
854 loc.add(li,0);
855 }
856 else { //mismatch
857 loc.add(li,poly_mis_score);
858 if (maxloc.score-loc.score>poly_dropoff_score) break;
859 }
860 if (maxloc.score<=loc.score) {
861 maxloc=loc;
862 }
863 }
864 li=maxloc.pos;
865 if ((maxloc.score==poly_minScore && ri==rlen-1) ||
866 (maxloc.score>poly_minScore && ri>=rlen-3) ||
867 (maxloc.score>(poly_minScore*3) && ri>=rlen-8)) {
868 //trimming this li-ri match at 3' end
869 l3=li-1;
870 if (l3<0) l3=0;
871 return true;
872 }
873 return false;
874 }
875
876 bool trim_poly5(GStr &seq, int &l5, int &l3, const char* poly_seed) {
877 if (!doPolyTrim) return false;
878 int rlen=seq.length();
879 l5=0;
880 l3=rlen-1;
881 int32 seedVal=*(int32*)poly_seed;
882 char polyChar=poly_seed[0];
883 //assumes N trimming was already done
884 //so a poly match should be very close to the end of the read
885 // -- find the initial match (seed)
886 int lmax=GMIN(12, rlen-4);//how far from 5' end to look for 4-mer seeds
887 int li;
888 for (li=0;li<=lmax;li++) {
889 if (seedVal==*(int*)&(seq[li])) {
890 break;
891 }
892 }
893 if (li>lmax) return false;
894 //seed found, try to extend it both ways
895 //extend left
896 int ri=li+3; //save rightmost base of the seed
897 SLocScore loc(li, poly_m_score<<2);
898 SLocScore maxloc(loc);
899 while (li>0) {
900 li--;
901 if (seq[li]==polyChar) {
902 loc.add(li,poly_m_score);
903 }
904 else if (seq[li]=='N') {
905 loc.add(li,0);
906 }
907 else { //mismatch
908 loc.add(li,poly_mis_score);
909 if (maxloc.score-loc.score>poly_dropoff_score) break;
910 }
911 if (maxloc.score<=loc.score) {
912 maxloc=loc;
913 }
914 }
915 li=maxloc.pos;
916 if (li>5) return false; //no trimming wanted, too far from 5' end
917 //li = right boundary for the poly match
918
919 //extend right
920 loc.set(ri, maxloc.score);
921 maxloc.pos=ri;
922 while (ri<rlen-1) {
923 ri++;
924 if (seq[ri]==polyChar) {
925 loc.add(ri,poly_m_score);
926 }
927 else if (seq[ri]=='N') {
928 loc.add(ri,0);
929 }
930 else { //mismatch
931 loc.add(ri,poly_mis_score);
932 if (maxloc.score-loc.score>poly_dropoff_score) break;
933 }
934 if (maxloc.score<=loc.score) {
935 maxloc=loc;
936 }
937 }
938 ri=maxloc.pos;
939 if ((maxloc.score==poly_minScore && li==0) ||
940 (maxloc.score>poly_minScore && li<2)
941 || (maxloc.score>(poly_minScore*3) && li<8)) {
942 //adjust l5 to reflect this trimming of 5' end
943 l5=ri+1;
944 if (l5>rlen-1) l5=rlen-1;
945 return true;
946 }
947 return false;
948 }
949
950 bool trim_adaptor3(GStr& seq, int&l5, int &l3) {
951 if (adaptors3.Count()==0) return false;
952 int rlen=seq.length();
953 l5=0;
954 l3=rlen-1;
955 bool trimmed=false;
956 GStr wseq(seq);
957 int wlen=rlen;
958 GXSeqData seqdata;
959 int numruns=revCompl ? 2 : 1;
960
961 for (int ai=0;ai<adaptors3.Count();ai++) {
962 for (int r=0;r<numruns;r++) {
963 if (r) {
964 seqdata.update(adaptors3[ai].seqr.chars(), adaptors3[ai].seqr.length(),
965 adaptors3[ai].pzr, wseq.chars(), wlen, adaptors3[ai].amlen);
966 }
967 else {
968 seqdata.update(adaptors3[ai].seq.chars(), adaptors3[ai].seq.length(),
969 adaptors3[ai].pz, wseq.chars(), wlen, adaptors3[ai].amlen);
970 }
971
972 GXAlnInfo* aln=match_adaptor(seqdata, adaptors3[ai].trim_type, gxmem_r, 86);
973 if (aln) {
974 trimmed=true;
975 //keep unmatched region on the left OR right (the longer one)
976 if (aln->sl > wlen-aln->sr) {
977 //keep left side
978 l3-=(wlen-aln->sl+1);
979 if (l3<0) l3=0;
980 }
981 else { //keep right side
982 l5+=aln->sr;
983 if (l5>=rlen) l5=rlen-1;
984 }
985 delete aln;
986 if (l3-l5+1<min_read_len) return true;
987 wseq=seq.substr(l5,l3-l5+1);
988 wlen=wseq.length();
989 }
990 }//forward and reverse adaptors
991 }//for each 3' adaptor
992 return trimmed;
993 }
994
995 bool trim_adaptor5(GStr& seq, int&l5, int &l3) {
996 if (adaptors5.Count()==0) return false;
997 int rlen=seq.length();
998 l5=0;
999 l3=rlen-1;
1000 bool trimmed=false;
1001 GStr wseq(seq);
1002 int wlen=rlen;
1003 GXSeqData seqdata;
1004 int numruns=revCompl ? 2 : 1;
1005 for (int ai=0;ai<adaptors5.Count();ai++) {
1006 for (int r=0;r<numruns;r++) {
1007 if (r) {
1008 seqdata.update(adaptors5[ai].seqr.chars(), adaptors5[ai].seqr.length(),
1009 adaptors5[ai].pzr, wseq.chars(), wlen, adaptors5[ai].amlen);
1010 }
1011 else {
1012 seqdata.update(adaptors5[ai].seq.chars(), adaptors5[ai].seq.length(),
1013 adaptors5[ai].pz, wseq.chars(), wlen, adaptors5[ai].amlen);
1014 }
1015 GXAlnInfo* aln=match_adaptor(seqdata, adaptors5[ai].trim_type, gxmem_l, 90);
1016 if (aln) {
1017 trimmed=true;
1018 if (aln->sl-1 > wlen-aln->sr) {
1019 //keep left side
1020 l3-=(wlen-aln->sl+1);
1021 if (l3<0) l3=0;
1022 }
1023 else { //keep right side
1024 l5+=aln->sr;
1025 if (l5>=rlen) l5=rlen-1;
1026 }
1027 delete aln;
1028 if (l3-l5+1<min_read_len) return true;
1029 wseq=seq.substr(l5,l3-l5+1);
1030 wlen=wseq.length();
1031 }
1032 } //forward and reverse?
1033 }//for each 5' adaptor
1034 return trimmed;
1035 }
1036
1037 //convert qvs to/from phred64 from/to phread33
1038 void convertPhred(GStr& q) {
1039 for (int i=0;i<q.length();i++) q[i]+=qv_cvtadd;
1040 }
1041
1042 void convertPhred(char* q, int len) {
1043 for (int i=0;i<len;i++) q[i]+=qv_cvtadd;
1044 }
1045
1046 bool getFastxRec(GLineReader& fq, GStr& rseq, GStr& rqv,
1047 GStr& rname, GStr& rinfo, GStr& infname) {
1048 rseq="";
1049 rqv="";
1050 rname="";
1051 rinfo="";
1052 if (fq.eof()) return false;
1053 char* l=fq.getLine();
1054 while (l!=NULL && (l[0]==0 || isspace(l[0]))) l=fq.getLine(); //ignore empty lines
1055 if (l==NULL) return false;
1056 /* if (rawFormat) {
1057 //TODO: implement raw qseq parsing here
1058 //if (raw type=N) then continue; //skip invalid/bad records
1059 } //raw qseq format
1060 else { // FASTQ or FASTA */
1061 isfasta=(l[0]=='>');
1062 if (!isfasta && l[0]!='@') GError("Error: fasta/fastq record marker not found(%s)\n%s\n",
1063 infname.chars(), l);
1064 GStr s(l);
1065 rname=&(l[1]);
1066 for (int i=0;i<rname.length();i++)
1067 if (rname[i]<=' ') {
1068 if (i<rname.length()-2) rinfo=rname.substr(i+1);
1069 rname.cut(i);
1070 break;
1071 }
1072 //now get the sequence
1073 if ((l=fq.getLine())==NULL)
1074 GError("Error: unexpected EOF after header for read %s (%s)\n",
1075 rname.chars(), infname.chars());
1076 rseq=l; //this must be the DNA line
1077 while ((l=fq.getLine())!=NULL) {
1078 //seq can span multiple lines
1079 if (l[0]=='>' || l[0]=='+') {
1080 fq.pushBack();
1081 break; //
1082 }
1083 rseq+=l;
1084 } //check for multi-line seq
1085 if (!isfasta) { //reading fastq quality values, which can also be multi-line
1086 if ((l=fq.getLine())==NULL)
1087 GError("Error: unexpected EOF after sequence for %s\n", rname.chars());
1088 if (l[0]!='+') GError("Error: fastq qv header marker not detected!\n");
1089 if ((l=fq.getLine())==NULL)
1090 GError("Error: unexpected EOF after qv header for %s\n", rname.chars());
1091 rqv=l;
1092 //if (rqv.length()!=rseq.length())
1093 // GError("Error: qv len != seq len for %s\n", rname.chars());
1094 while (rqv.length()<rseq.length() && ((l=fq.getLine())!=NULL)) {
1095 rqv+=l; //append to qv string
1096 }
1097 }// fastq
1098 // } //<-- FASTA or FASTQ
1099 rseq.upper();
1100 return true;
1101 }
1102
1103 #ifdef GDEBUG
1104 void showTrim(GStr& s, int l5, int l3) {
1105 if (l5>0 || l3==0) {
1106 color_bg(c_red);
1107 }
1108 for (int i=0;i<s.length()-1;i++) {
1109 if (i && i==l5) color_resetbg();
1110 fprintf(stderr, "%c", s[i]);
1111 if (i && i==l3) color_bg(c_red);
1112 }
1113 fprintf(stderr, "%c", s[s.length()-1]);
1114 color_reset();
1115 fprintf(stderr, "\n");
1116 }
1117 #endif
1118
1119 char process_read(GStr& rname, GStr& rseq, GStr& rqv, int &l5, int &l3) {
1120 //returns 0 if the read was untouched, 1 if it was just trimmed
1121 // and a trash code if it was trashed
1122 l5=0;
1123 l3=rseq.length()-1;
1124 #ifdef GDEBUG
1125 //rseq.reverse();
1126 GMessage(">%s\n", rname.chars());
1127 GMessage("%s\n",rseq.chars());
1128 #endif
1129 if (l3-l5+1<min_read_len) {
1130 return 's';
1131 }
1132 GStr wseq(rseq.chars());
1133 GStr wqv(rqv.chars());
1134 int w5=l5;
1135 int w3=l3;
1136 //first do the q-based trimming
1137 if (qvtrim_qmin!=0 && !wqv.is_empty() && qtrim(wqv, w5, w3)) { // qv-threshold trimming
1138 if (w3-w5+1<min_read_len) {
1139 return 'Q'; //invalid read
1140 }
1141 //-- keep only the w5..w3 range
1142 l5=w5;
1143 l3=w3;
1144 wseq=wseq.substr(w5, w3-w5+1);
1145 if (!wqv.is_empty())
1146 wqv=wqv.substr(w5, w3-w5+1);
1147 } //qv trimming
1148 // N-trimming on the remaining read seq
1149 if (ntrim(wseq, w5, w3)) {
1150 //GMessage("before: %s\n",wseq.chars());
1151 //GMessage("after : %s (%d)\n",wseq.substr(w5,w3-w5+1).chars(),w3-w5+1);
1152 l5+=w5;
1153 l3-=(wseq.length()-1-w3);
1154 if (w3-w5+1<min_read_len) {
1155 return 'N'; //to be trashed
1156 }
1157 //-- keep only the w5..w3 range
1158 wseq=wseq.substr(w5, w3-w5+1);
1159 if (!wqv.is_empty())
1160 wqv=wqv.substr(w5, w3-w5+1);
1161 w5=0;
1162 w3=wseq.length()-1;
1163 }
1164 char trim_code;
1165 //clean the more dirty end first - 3'
1166 int prev_w5=0;
1167 int prev_w3=0;
1168 bool w3upd=true;
1169 bool w5upd=true;
1170 do {
1171 trim_code=0;
1172 if (w3upd && trim_poly3(wseq, w5, w3, polyA_seed)) {
1173 trim_code='A';
1174 }
1175 else if (w3upd && trim_poly3(wseq, w5, w3, polyT_seed)) {
1176 trim_code='T';
1177 }
1178 else if (w5upd && trim_poly5(wseq, w5, w3, polyA_seed)) {
1179 trim_code='A';
1180 }
1181 else if (w5upd && trim_poly5(wseq, w5, w3, polyT_seed)) {
1182 trim_code='T';
1183 }
1184 else if (trim_adaptor5(wseq, w5, w3)) {
1185 trim_code='5';
1186 }
1187 else if (trim_adaptor3(wseq, w5, w3)) {
1188 trim_code='3';
1189 }
1190 if (trim_code) {
1191 w3upd=(w3!=prev_w3);
1192 w5upd=(w5!=prev_w5);
1193 if (w3upd) prev_w3=w3;
1194 if (w5upd) prev_w5=w5;
1195 #ifdef GDEBUG
1196 GMessage("#### TRIM by '%c' code ( w5-w3 = %d-%d ):\n",trim_code, w5,w3);
1197 showTrim(wseq, w5, w3);
1198 #endif
1199 int trimlen=wseq.length()-(w3-w5+1);
1200 num_trimmed3++;
1201 if (trimlen<min_trimmed3)
1202 min_trimmed3=trimlen;
1203 l5+=w5;
1204 l3-=(wseq.length()-1-w3);
1205 if (w3-w5+1<min_read_len) {
1206 return trim_code;
1207 }
1208 //-- keep only the w5..w3 range
1209 wseq=wseq.substr(w5, w3-w5+1);
1210 if (!wqv.is_empty())
1211 wqv=wqv.substr(w5, w3-w5+1);
1212 }//trimming at 3' end
1213 } while (trim_code);
1214
1215 if (doCollapse) {
1216 //keep read for later
1217 FqDupRec* dr=dhash.Find(wseq.chars());
1218 if (dr==NULL) { //new entry
1219 //if (prefix.is_empty())
1220 dhash.Add(wseq.chars(),
1221 new FqDupRec(&wqv, rname.chars()));
1222 //else dhash.Add(wseq.chars(), new FqDupRec(wqv.chars(),wqv.length()));
1223 }
1224 else
1225 dr->add(wqv);
1226 } //collapsing duplicates
1227 else { //not collapsing duplicates
1228 //apply the dust filter now
1229 if (doDust) {
1230 int dustbases=dust(wseq);
1231 if (dustbases>(wseq.length()>>1)) {
1232 return 'D';
1233 }
1234 }
1235 } //not collapsing duplicates
1236 return (l5>0 || l3<rseq.length()-1) ? 1 : 0;
1237 }
1238
1239 void printHeader(FILE* f_out, char recmarker, GStr& rname, GStr& rinfo) {
1240 //GMessage("printing Header..%c%s\n",recmarker, rname.chars());
1241 if (rinfo.is_empty()) fprintf(f_out, "%c%s\n",recmarker,rname.chars());
1242 else fprintf(f_out, "%c%s %s\n",recmarker, rname.chars(), rinfo.chars());
1243 }
1244
1245 void writeRead(FILE* f_out, GStr& rname, GStr& rinfo, GStr& rseq, GStr& rqv, int& outcounter) {
1246 outcounter++;
1247 bool asFasta=(rqv.is_empty() || fastaOutput);
1248 if (asFasta) {
1249 if (prefix.is_empty()) {
1250 printHeader(f_out, '>',rname,rinfo);
1251 fprintf(f_out, "%s\n", rseq.chars()); //plain one-line fasta for now
1252 }
1253 else {
1254 fprintf(f_out, ">%s%08d\n%s\n", prefix.chars(), outcounter,
1255 rseq.chars());
1256 }
1257 }
1258 else { //fastq
1259 if (convert_phred) convertPhred(rqv);
1260 if (prefix.is_empty()) {
1261 printHeader(f_out, '@', rname, rinfo);
1262 fprintf(f_out, "%s\n+\n%s\n", rseq.chars(), rqv.chars());
1263 }
1264 else
1265 fprintf(f_out, "@%s_%08d\n%s\n+\n%s\n", prefix.chars(), outcounter,
1266 rseq.chars(),rqv.chars() );
1267 }
1268 }
1269
1270 void trash_report(char trashcode, GStr& rname, FILE* freport) {
1271 if (freport==NULL || trashcode<=' ') return;
1272 if (trashcode=='3' || trashcode=='5') {
1273 fprintf(freport, "%s\ta%c\n",rname.chars(),trashcode);
1274 }
1275 else {
1276 fprintf(freport, "%s\t%c\n",rname.chars(),trashcode);
1277 }
1278 //tcounter++;
1279 }
1280
1281 GStr getFext(GStr& s, int* xpos=NULL) {
1282 //s must be a filename without a path
1283 GStr r("");
1284 if (xpos!=NULL) *xpos=0;
1285 if (s.is_empty() || s=="-") return r;
1286 int p=s.rindex('.');
1287 int d=s.rindex('/');
1288 if (p<=0 || p>s.length()-2 || p<s.length()-7 || p<d) return r;
1289 r=s.substr(p+1);
1290 if (xpos!=NULL) *xpos=p+1;
1291 r.lower();
1292 return r;
1293 }
1294
1295 void baseFileName(GStr& fname) {
1296 //remove all known extensions, like .txt,fq,fastq,fasta,fa)(.gz .gzip .bz2 .bzip2) .
1297 int xpos=0;
1298 GStr fext=getFext(fname, &xpos);
1299 if (xpos<=1) return;
1300 bool extdel=false;
1301 GStr f2;
1302 if (fext=="z" || fext=="zip") {
1303 extdel=true;
1304 }
1305 else if (fext.length()>=2) {
1306 f2=fext.substr(0,2);
1307 extdel=(f2=="gz" || f2=="bz");
1308 }
1309 if (extdel) {
1310 fname.cut(xpos-1);
1311 fext=getFext(fname, &xpos);
1312 if (xpos<=1) return;
1313 }
1314 extdel=false;
1315 if (fext=="f" || fext=="fq" || fext=="txt" || fext=="seq" || fext=="sequence") {
1316 extdel=true;
1317 }
1318 else if (fext.length()>=2) {
1319 extdel=(fext.substr(0,2)=="fa");
1320 }
1321 if (extdel) fname.cut(xpos-1);
1322 GStr fncp(fname);
1323 fncp.lower();
1324 fncp.chomp("seq");
1325 fncp.chomp("sequence");
1326 fncp.trimR("_.");
1327 if (fncp.length()<fname.length()) fname.cut(fncp.length());
1328 }
1329
1330 FILE* prepOutFile(GStr& infname, GStr& pocmd) {
1331 FILE* f_out=NULL;
1332 GStr fname(getFileName(infname.chars()));
1333 //eliminate known extensions
1334 baseFileName(fname);
1335 if (outsuffix.is_empty() || outsuffix=="-") { return stdout; }
1336 else if (pocmd.is_empty()) {
1337 GStr oname(fname);
1338 oname.append('.');
1339 oname.append(outsuffix);
1340 f_out=fopen(oname.chars(),"w");
1341 if (f_out==NULL) GError("Error: cannot create '%s'\n",oname.chars());
1342 }
1343 else {
1344 GStr oname(">");
1345 oname.append(fname);
1346 oname.append('.');
1347 oname.append(outsuffix);
1348 pocmd.append(oname);
1349 f_out=popen(pocmd.chars(), "w");
1350 if (f_out==NULL) GError("Error: cannot popen '%s'\n",pocmd.chars());
1351 }
1352 return f_out;
1353 }
1354
1355 void guess_unzip(GStr& fname, GStr& picmd) {
1356 GStr fext=getFext(fname);
1357 if (fext=="gz" || fext=="gzip" || fext=="z") {
1358 picmd="gzip -cd ";
1359 }
1360 else if (fext=="bz2" || fext=="bzip2" || fext=="bz" || fext=="bzip") {
1361 picmd="bzip2 -cd ";
1362 }
1363 }
1364
1365 void addAdaptor(GVec<CASeqData>& adaptors, GStr& seq, GAlnTrimType trim_type) {
1366 //TODO: prepare CASeqData here, and collect hexamers as well
1367 if (seq.is_empty() || seq=="-" ||
1368 seq=="N/A" || seq==".") return;
1369
1370 CASeqData adata(revCompl);
1371 int idx=adaptors.Add(adata);
1372 if (idx<0) GError("Error: failed to add adaptor!\n");
1373 adaptors[idx].trim_type=trim_type;
1374 adaptors[idx].update(seq.chars());
1375 }
1376
1377
1378 int loadAdaptors(const char* fname) {
1379 GLineReader lr(fname);
1380 char* l;
1381 while ((l=lr.nextLine())!=NULL) {
1382 if (lr.length()<=3 || l[0]=='#') continue;
1383 if ( l[0]==' ' || l[0]=='\t' || l[0]==',' ||
1384 l[0]==';'|| l[0]==':' ) {
1385 int i=1;
1386 while (l[i]!=0 && isspace(l[i])) {
1387 i++;
1388 }
1389 if (l[i]!=0) {
1390 GStr s(&(l[i]));
1391 #ifdef GDEBUG
1392 //s.reverse();
1393 #endif
1394 addAdaptor(adaptors3, s, galn_TrimRight);
1395 continue;
1396 }
1397 }
1398 else {
1399 GStr s(l);
1400 s.startTokenize("\t ;,:");
1401 GStr a5,a3;
1402 if (s.nextToken(a5))
1403 s.nextToken(a3);
1404 else continue; //no tokens on this line
1405 GAlnTrimType ttype5=galn_TrimLeft;
1406 a5.upper();
1407 a3.upper();
1408 if (a3.is_empty() || a3==a5 || a3=="=") {
1409 a3.clear();
1410 ttype5=galn_TrimEither;
1411 }
1412 #ifdef GDEBUG
1413 // a5.reverse();
1414 // a3.reverse();
1415 #endif
1416 addAdaptor(adaptors5, a5, ttype5);
1417 addAdaptor(adaptors3, a3, galn_TrimRight);
1418 }
1419 }
1420 return adaptors5.Count()+adaptors3.Count();
1421 }
1422
1423 void setupFiles(FILE*& f_in, FILE*& f_in2, FILE*& f_out, FILE*& f_out2,
1424 GStr& s, GStr& infname, GStr& infname2) {
1425 // uses outsuffix to generate output file names and open file handles as needed
1426 infname="";
1427 infname2="";
1428 f_in=NULL;
1429 f_in2=NULL;
1430 f_out=NULL;
1431 f_out2=NULL;
1432 //analyze outsuffix intent
1433 GStr pocmd;
1434 if (outsuffix=="-") {
1435 f_out=stdout;
1436 }
1437 else {
1438 GStr ox=getFext(outsuffix);
1439 if (ox.length()>2) ox=ox.substr(0,2);
1440 if (ox=="gz") pocmd="gzip -9 -c ";
1441 else if (ox=="bz") pocmd="bzip2 -9 -c ";
1442 }
1443 if (s=="-") {
1444 f_in=stdin;
1445 infname="stdin";
1446 f_out=prepOutFile(infname, pocmd);
1447 return;
1448 } // streaming from stdin
1449 s.startTokenize(",:");
1450 s.nextToken(infname);
1451 bool paired=s.nextToken(infname2);
1452 if (fileExists(infname.chars())==0)
1453 GError("Error: cannot find file %s!\n",infname.chars());
1454 GStr fname(getFileName(infname.chars()));
1455 GStr picmd;
1456 guess_unzip(fname, picmd);
1457 if (picmd.is_empty()) {
1458 f_in=fopen(infname.chars(), "r");
1459 if (f_in==NULL) GError("Error opening file '%s'!\n",infname.chars());
1460 }
1461 else {
1462 picmd.append(infname);
1463 f_in=popen(picmd.chars(), "r");
1464 if (f_in==NULL) GError("Error at popen %s!\n", picmd.chars());
1465 }
1466 if (f_out==stdout) {
1467 if (paired) GError("Error: output suffix required for paired reads\n");
1468 return;
1469 }
1470 f_out=prepOutFile(infname, pocmd);
1471 if (!paired) return;
1472 if (doCollapse) GError("Error: sorry, -C option cannot be used with paired reads!\n");
1473 // ---- paired reads:-------------
1474 if (fileExists(infname2.chars())==0)
1475 GError("Error: cannot find file %s!\n",infname2.chars());
1476 picmd="";
1477 GStr fname2(getFileName(infname2.chars()));
1478 guess_unzip(fname2, picmd);
1479 if (picmd.is_empty()) {
1480 f_in2=fopen(infname2.chars(), "r");
1481 if (f_in2==NULL) GError("Error opening file '%s'!\n",infname2.chars());
1482 }
1483 else {
1484 picmd.append(infname2);
1485 f_in2=popen(picmd.chars(), "r");
1486 if (f_in2==NULL) GError("Error at popen %s!\n", picmd.chars());
1487 }
1488 f_out2=prepOutFile(infname2, pocmd);
1489 }