ViewVC Help
View File | Revision Log | Show Annotations | View Changeset | Root Listing
root/gclib/fqtrim/fqtrim.cpp
Revision: 180
Committed: Thu Feb 16 21:24:04 2012 UTC (7 years, 8 months ago) by gpertea
File size: 44059 byte(s)
Log Message:
works probably

Line File contents
1 #include "GArgs.h"
2 #include "GStr.h"
3 #include "GHash.hh"
4 #include "GList.hh"
5 #include <ctype.h>
6 #include "GAlnExtend.h"
7
8 #define USAGE "Usage:\n\
9 fqtrim [{-5 <5adaptor> -3 <3adaptor>|-f <adaptors_file>}] [-a <min_matchlen>]\\\n\
10 [-R] [-q <minq> [-t <trim_max_len>]] [-p {64|33}] [-o <outsuffix>]\\\n\
11 [-l <minlen>] [-C] [-D] [-Q] [-n <rename_prefix>] [-r <discarded.lst>]\\\n\
12 <input.fq>[,<input_mates.fq>\n\
13 \n\
14 Trim low quality bases at the 3' end and can trim adaptor sequence(s), filter\n\
15 for low complexity and collapse duplicate reads.\n\
16 If read pairs should be trimmed and kept together (i.e. without discarding\n\
17 one read in a pair), the two file names should be given delimited by a comma\n\
18 or a colon character\n\
19 \n\
20 Options:\n\
21 -n rename all the reads using the <prefix> followed by a read counter;\n\
22 if -C option was given, the suffix \"_x<N>\" is appended, with <N> being\n\
23 the read duplication count\n\
24 -o unless this parameter is '-', write the trimmed/filtered reads to \n\
25 file(s) named <input>.<outsuffix> which will be created in the \n\
26 current (working) directory; (writes to stdout if -o- is given);\n\
27 a suffix ending with .gz, .gzip or .bz2 will enforce compression\n\
28 -f file with adaptor sequences to trim, each line having this format:\n\
29 <5'-adaptor-sequence> <3'-adaptor-sequence>\n\
30 -5 trim the given adaptor or primer sequence at the 5' end of each read\n\
31 (e.g. -5 CGACAGGTTCAGAGTTCTACAGTCCGACGATC)\n\
32 -3 trim the given adaptor sequence at the 3' end of each read\n\
33 (e.g. -3 TCGTATGCCGTCTTCTGCTTG)\n\
34 -A disable polyA/T trimming (enabled by default)\n\
35 -y minimum length of exact match to adaptor sequence at the proper end (6)\n\
36 -q trim bases with quality value lower than <minq> (starting at the 3' end)\n\
37 -t for -q option, maximum trimming at the 3' end is limited to <trim_max_len>\n\
38 -m maximum percentage of Ns allowed in a read after trimming (default 7)\n\
39 -l minimum \"clean\" length after trimming that a read must have\n\
40 in order to pass the filter (default: 16)\n\
41 -r write a simple \"trash report\" file listing the discarded reads with a\n\
42 one letter code for the reason why they were trashed\n\
43 -D apply a low-complexity (dust) filter and discard any read that has over\n\
44 50% of its length detected as low complexity\n\
45 -C collapse duplicate reads and add a prefix with their count to the read\n\
46 name (e.g. for microRNAs)\n\
47 -p input is phred64/phred33 (use -p64 or -p33)\n\
48 -Q convert quality values to the other Phred qv type\n\
49 -V verbose processing\n\
50 "
51
52 //-z for -o option, the output stream(s) will be first piped into the given\n
53 // <zcmd> command, which must output to stdout (e.g. -z 'bzip2 -9 -c')\n
54
55
56 // example 3' adaptor for miRNAs: TCGTATGCCGTCTTCTGCTTG
57
58 //For paired reads sequencing:
59 //3' : ACACTCTTTCCCTACACGACGCTCTTCCGATCT
60 //5' : GATCGGAAGAGCGGTTCAGCAGGAATGCCGAG
61 //FILE* f_out=NULL; //stdout if not provided
62 //FILE* f_out2=NULL; //for paired reads
63 //FILE* f_in=NULL; //input fastq (stdin if not provided)
64 //FILE* f_in2=NULL; //for paired reads
65
66 FILE* freport=NULL;
67
68 bool debug=false;
69 bool verbose=false;
70 bool doCollapse=false;
71 bool doDust=false;
72 bool doPolyTrim=true;
73 bool fastaOutput=false;
74 bool trashReport=false;
75 bool revCompl=false; //also reverse complement adaptor sequences
76 int min_read_len=16;
77 double max_perc_N=7.0;
78 int dust_cutoff=16;
79 bool isfasta=false;
80 bool convert_phred=false;
81 GStr outsuffix; // -o
82 GStr prefix;
83 GStr zcmd;
84 int num_trimmed5=0;
85 int num_trimmed3=0;
86 int num_trimmedN=0;
87 int num_trimmedQ=0;
88 int min_trimmed5=INT_MAX;
89 int min_trimmed3=INT_MAX;
90
91 int qvtrim_qmin=0;
92 int qvtrim_max=0; //for -q, do not trim at 3'-end more than this number of bases
93 int qv_phredtype=0; // could be 64 or 33 (0 means undetermined yet)
94 int qv_cvtadd=0; //could be -31 or +31
95
96 // adaptor matching metrics -- for X-drop ungapped extension
97 //const int match_reward=2;
98 //const int mismatch_penalty=3;
99 const int match_reward=2;
100 const int mismatch_penalty=4;
101 const int Xdrop=10;
102
103 const int poly_m_score=2; //match score
104 const int poly_mis_score=-3; //mismatch
105 const int poly_dropoff_score=7;
106 int poly_minScore=12; //i.e. an exact match of 6 bases at the proper ends WILL be trimmed
107
108 const char *polyA_seed="AAAA";
109 const char *polyT_seed="TTTT";
110
111 struct CASeqData {
112 //positional data for every possible hexamer in an adaptor
113 GVec<uint16>* pz[4096]; //0-based coordinates of all possible hexamers in the adaptor sequence
114 GVec<uint16>* pzr[4096]; //0-based coordinates of all possible hexamers for the reverse complement of the adaptor sequence
115 GStr seq; //actual adaptor sequence data
116 GStr seqr; //reverse complement sequence
117 int amlen; //fraction of adaptor length matching that's
118 //enough to consider the alignment
119 GAlnTrimType trim_type;
120 bool use_reverse;
121 CASeqData(bool rev=false):seq(),seqr(),
122 amlen(0), use_reverse(rev) {
123 trim_type=galn_None; //should be updated later!
124 for (int i=0;i<4096;i++) {
125 pz[i]=NULL;
126 pzr[i]=NULL;
127 }
128 }
129
130 void update(const char* s) {
131 seq=s;
132 table6mers(seq.chars(), seq.length(), pz);
133 amlen=iround(double(seq.length())*0.8);
134 if (amlen<12)
135 amlen=12;
136 if (!use_reverse) return;
137 //reverse complement
138 seqr=s;
139 int slen=seq.length();
140 for (int i=0;i<slen;i++)
141 seqr[i]=ntComplement(seq[slen-i-1]);
142 table6mers(seqr.chars(), seqr.length(), pzr);
143 }
144
145 ~CASeqData() {
146 for (int i=0;i<4096;i++) {
147 delete pz[i];
148 delete pzr[i];
149 }
150 }
151 };
152
153 GVec<CASeqData> adaptors5;
154 GVec<CASeqData> adaptors3;
155
156 CGreedyAlignData* gxmem_l=NULL;
157 CGreedyAlignData* gxmem_r=NULL;
158
159 // element in dhash:
160 class FqDupRec {
161 public:
162 int count; //how many of these reads are the same
163 int len; //length of qv
164 char* firstname; //optional, only if we want to keep the original read names
165 char* qv;
166 FqDupRec(GStr* qstr=NULL, const char* rname=NULL) {
167 len=0;
168 qv=NULL;
169 firstname=NULL;
170 count=0;
171 if (qstr!=NULL) {
172 qv=Gstrdup(qstr->chars());
173 len=qstr->length();
174 count++;
175 }
176 if (rname!=NULL) firstname=Gstrdup(rname);
177 }
178 ~FqDupRec() {
179 GFREE(qv);
180 GFREE(firstname);
181 }
182 void add(GStr& d) { //collapse another record into this one
183 if (d.length()!=len)
184 GError("Error at FqDupRec::add(): cannot collapse reads with different length!\n");
185 count++;
186 for (int i=0;i<len;i++)
187 qv[i]=(qv[i]+d[i])/2;
188 }
189 };
190
191 void openfw(FILE* &f, GArgs& args, char opt) {
192 GStr s=args.getOpt(opt);
193 if (!s.is_empty()) {
194 if (s=='-') f=stdout;
195 else {
196 f=fopen(s.chars(),"w");
197 if (f==NULL) GError("Error creating file: %s\n", s.chars());
198 }
199 }
200 }
201
202 #define FWCLOSE(fh) if (fh!=NULL && fh!=stdout) fclose(fh)
203 #define FRCLOSE(fh) if (fh!=NULL && fh!=stdin) fclose(fh)
204
205 GHash<FqDupRec> dhash; //hash to keep track of duplicates
206
207 void addAdaptor(GVec<CASeqData>& adaptors, GStr& seq, GAlnTrimType trim_type);
208 int loadAdaptors(const char* fname);
209
210 void setupFiles(FILE*& f_in, FILE*& f_in2, FILE*& f_out, FILE*& f_out2,
211 GStr& s, GStr& infname, GStr& infname2);
212 // uses outsuffix to generate output file names and open file handles as needed
213
214 void writeRead(FILE* f_out, GStr& rname, GStr& rinfo, GStr& rseq, GStr& rqv, int& outcounter);
215 void trash_report(char trashcode, GStr& rname, FILE* freport);
216
217 bool getFastxRec(GLineReader& fq, GStr& rseq, GStr& rqv,
218 GStr& rname, GStr& rinfo, GStr& infname);
219
220 char process_read(GStr& rname, GStr& rseq, GStr& rqv, int &l5, int &l3);
221 //returns 0 if the read was untouched, 1 if it was trimmed and a trash code if it was trashed
222
223 bool ntrim(GStr& rseq, int &l5, int &l3); //returns true if any trimming occured
224 bool qtrim(GStr& qvs, int &l5, int &l3); //return true if any trimming occured
225 int dust(GStr& seq);
226 bool trim_poly5(GStr &seq, int &l5, int &l3, const char* poly_seed); //returns true if any trimming occured
227 bool trim_poly3(GStr &seq, int &l5, int &l3, const char* poly_seed);
228 bool trim_adaptor5(GStr& seq, int &l5, int &l3); //returns true if any trimming occured
229 bool trim_adaptor3(GStr& seq, int &l5, int &l3);
230
231 void convertPhred(char* q, int len);
232 void convertPhred(GStr& q);
233
234 int main(int argc, char * const argv[]) {
235 GArgs args(argc, argv, "YQDCRVAl:d:3:5:m:n:r:p:q:f:t:o:z:a:");
236 int e;
237 if ((e=args.isError())>0) {
238 GMessage("%s\nInvalid argument: %s\n", USAGE, argv[e]);
239 exit(224);
240 }
241 debug=(args.getOpt('Y')!=NULL);
242 verbose=(args.getOpt('V')!=NULL);
243 convert_phred=(args.getOpt('Q')!=NULL);
244 doCollapse=(args.getOpt('C')!=NULL);
245 doDust=(args.getOpt('D')!=NULL);
246 revCompl=(args.getOpt('R')!=NULL);
247 if (args.getOpt('A')) doPolyTrim=false;
248 /*
249 rawFormat=(args.getOpt('R')!=NULL);
250 if (rawFormat) {
251 GError("Sorry, raw qseq format parsing is not implemented yet!\n");
252 }
253 */
254 prefix=args.getOpt('n');
255 GStr s=args.getOpt('l');
256 if (!s.is_empty())
257 min_read_len=s.asInt();
258 s=args.getOpt('m');
259 if (!s.is_empty())
260 max_perc_N=s.asDouble();
261 s=args.getOpt('d');
262 if (!s.is_empty()) {
263 dust_cutoff=s.asInt();
264 doDust=true;
265 }
266 s=args.getOpt('q');
267 if (!s.is_empty()) {
268 qvtrim_qmin=s.asInt();
269 }
270 s=args.getOpt('t');
271 if (!s.is_empty()) {
272 qvtrim_max=s.asInt();
273 }
274 s=args.getOpt('p');
275 if (!s.is_empty()) {
276 int v=s.asInt();
277 if (v==33) {
278 qv_phredtype=33;
279 qv_cvtadd=31;
280 }
281 else if (v==64) {
282 qv_phredtype=64;
283 qv_cvtadd=-31;
284 }
285 else
286 GMessage("%s\nInvalid value for -p option (can only be 64 or 33)!\n",USAGE);
287 }
288 s=args.getOpt('f');
289 if (!s.is_empty()) {
290 loadAdaptors(s.chars());
291 }
292 bool fileAdaptors=adaptors5.Count()+adaptors3.Count();
293 s=args.getOpt('5');
294 if (!s.is_empty()) {
295 if (fileAdaptors)
296 GError("Error: options -5 and -f cannot be used together!\n");
297 s.upper();
298 addAdaptor(adaptors5, s, galn_TrimLeft);
299 }
300 s=args.getOpt('3');
301 if (!s.is_empty()) {
302 if (fileAdaptors)
303 GError("Error: options -3 and -f cannot be used together!\n");
304 s.upper();
305 addAdaptor(adaptors3, s, galn_TrimRight);
306 }
307 s=args.getOpt('y');
308 if (!s.is_empty()) {
309 int minmatch=s.asInt();
310 poly_minScore=minmatch*poly_m_score;
311 }
312
313 if (args.getOpt('o')!=NULL) outsuffix=args.getOpt('o');
314 else outsuffix="-";
315 trashReport= (args.getOpt('r')!=NULL);
316 int fcount=args.startNonOpt();
317 if (fcount==0) {
318 GMessage(USAGE);
319 exit(224);
320 }
321 if (fcount>1 && doCollapse) {
322 GError("%s Sorry, the -C option only works with a single input.\n", USAGE);
323 }
324 //openfw(f_out, args, 'o');
325 //if (f_out==NULL) f_out=stdout;
326 if (trashReport)
327 openfw(freport, args, 'r');
328 char* infile=NULL;
329
330 if (adaptors5.Count()>0)
331 //gxmem_l=new CGreedyAlignData(match_reward, mismatch_penalty, Xdrop-2);
332 gxmem_l=new CGreedyAlignData(match_reward, mismatch_penalty, Xdrop);
333 if (adaptors3.Count()>0)
334 gxmem_r=new CGreedyAlignData(match_reward, mismatch_penalty, Xdrop);
335
336 while ((infile=args.nextNonOpt())!=NULL) {
337 //for each input file
338 int incounter=0; //counter for input reads
339 int outcounter=0; //counter for output reads
340 int trash_s=0; //too short from the get go
341 int trash_Q=0;
342 int trash_N=0;
343 int trash_D=0;
344 int trash_poly=0;
345 int trash_A3=0;
346 int trash_A5=0;
347 s=infile;
348 GStr infname;
349 GStr infname2;
350 FILE* f_in=NULL;
351 FILE* f_in2=NULL;
352 FILE* f_out=NULL;
353 FILE* f_out2=NULL;
354 bool paired_reads=false;
355 setupFiles(f_in, f_in2, f_out, f_out2, s, infname, infname2);
356 GLineReader fq(f_in);
357 GLineReader* fq2=NULL;
358 if (f_in2!=NULL) {
359 fq2=new GLineReader(f_in2);
360 paired_reads=true;
361 }
362 GStr rseq, rqv, seqid, seqinfo;
363 GStr rseq2, rqv2, seqid2, seqinfo2;
364 while (getFastxRec(fq, rseq, rqv, seqid, seqinfo, infname)) {
365 incounter++;
366 int a5=0, a3=0, b5=0, b3=0;
367 char tcode=0, tcode2=0;
368 tcode=process_read(seqid, rseq, rqv, a5, a3);
369 if (fq2!=NULL) {
370 getFastxRec(*fq2, rseq2, rqv2, seqid2, seqinfo2, infname2);
371 if (seqid.substr(0,seqid.length()-1)!=seqid2.substr(0,seqid2.length()-1)) {
372 GError("Error: no paired match for read %s vs %s (%s,%s)\n",
373 seqid.chars(), seqid2.chars(), infname.chars(), infname2.chars());
374 }
375 tcode2=process_read(seqid2, rseq2, rqv2, b5, b3);
376 //decide what to do with this pair and print rseq2 if the pair makes it
377 if (tcode>1 && tcode2<=1) {
378 //"untrash" rseq
379 if (a3-a5+1<min_read_len) {
380 a5=1;
381 if (a3<min_read_len) { a3= GMIN(rseq.length()-1, min_read_len+1); }
382 }
383 tcode=1;
384 }
385 else if (tcode<=1 && tcode2>1) {
386 //"untrash" rseq2
387 if (b3-b5+1<min_read_len) {
388 b5=1;
389 if (b3<min_read_len) { b3= GMIN((rseq2.length()-1),(min_read_len+1)); }
390 }
391 tcode2=1;
392 }
393 if (tcode<=1) { //trimmed or left intact -- write it!
394 if (tcode>0) {
395 rseq2=rseq2.substr(b5,b3-b5+1);
396 if (!rqv2.is_empty()) rqv2=rqv2.substr(b5,b3-b5+1);
397 }
398 int nocounter=0;
399 writeRead(f_out2, seqid2, seqinfo2, rseq2, rqv2, nocounter);
400 }
401 } //pair read
402 if (tcode>1) { //trashed
403 #ifdef GDEBUG
404 GMessage(" !!!!TRASH code = %c\n",tcode);
405 #endif
406 if (tcode=='s') trash_s++;
407 else if (tcode=='A' || tcode=='T') trash_poly++;
408 else if (tcode=='Q') trash_Q++;
409 else if (tcode=='N') trash_N++;
410 else if (tcode=='D') trash_D++;
411 else if (tcode=='3') trash_A3++;
412 else if (tcode=='5') trash_A5++;
413 if (trashReport) trash_report(tcode, seqid, freport);
414 }
415 else if (!doCollapse) { //write it
416 if (tcode>0) {
417 rseq=rseq.substr(a5,a3-a5+1);
418 if (!rqv.is_empty()) rqv=rqv.substr(a5,a3-a5+1);
419 }
420 #ifdef GDEBUG
421 GMessage(" After trimming:\n");
422 GMessage("%s\n",rseq.chars());
423 #endif
424 writeRead(f_out, seqid, seqinfo, rseq, rqv, outcounter);
425 }
426 } //for each fastq record
427 delete fq2;
428 FRCLOSE(f_in);
429 FRCLOSE(f_in2);
430 if (doCollapse) {
431 outcounter=0;
432 int maxdup_count=1;
433 char* maxdup_seq=NULL;
434 dhash.startIterate();
435 FqDupRec* qd=NULL;
436 char* seq=NULL;
437 while ((qd=dhash.NextData(seq))!=NULL) {
438 GStr rseq(seq);
439 //do the dusting here
440 if (doDust) {
441 int dustbases=dust(rseq);
442 if (dustbases>(rseq.length()>>1)) {
443 if (trashReport && qd->firstname!=NULL) {
444 fprintf(freport, "%s_x%d\tD\n",qd->firstname, qd->count);
445 }
446 trash_D+=qd->count;
447 continue;
448 }
449 }
450 outcounter++;
451 if (qd->count>maxdup_count) {
452 maxdup_count=qd->count;
453 maxdup_seq=seq;
454 }
455 if (isfasta) {
456 if (prefix.is_empty()) {
457 fprintf(f_out, ">%s_x%d\n%s\n", qd->firstname, qd->count,
458 rseq.chars());
459 }
460 else { //use custom read name
461 fprintf(f_out, ">%s%08d_x%d\n%s\n", prefix.chars(), outcounter,
462 qd->count, rseq.chars());
463 }
464 }
465 else { //fastq format
466 if (convert_phred) convertPhred(qd->qv, qd->len);
467 if (prefix.is_empty()) {
468 fprintf(f_out, "@%s_x%d\n%s\n+\n%s\n", qd->firstname, qd->count,
469 rseq.chars(), qd->qv);
470 }
471 else { //use custom read name
472 fprintf(f_out, "@%s%08d_x%d\n%s\n+\n%s\n", prefix.chars(), outcounter,
473 qd->count, rseq.chars(), qd->qv);
474 }
475 }
476 }//for each element of dhash
477 if (maxdup_count>1) {
478 GMessage("Maximum read multiplicity: x %d (read: %s)\n",maxdup_count, maxdup_seq);
479 }
480 } //collapse entries
481 if (verbose) {
482 if (paired_reads) {
483 GMessage(">Input files : %s , %s\n", infname.chars(), infname2.chars());
484 GMessage("Number of input pairs :%9d\n", incounter);
485 GMessage(" Output pairs :%9d\n", outcounter);
486 }
487 else {
488 GMessage(">Input file : %s\n", infname.chars());
489 GMessage("Number of input reads :%9d\n", incounter);
490 GMessage(" Output reads :%9d\n", outcounter);
491 }
492 GMessage("------------------------------------\n");
493 if (num_trimmed5)
494 GMessage(" 5' trimmed :%9d (min. trim: %d)\n", num_trimmed5, min_trimmed5);
495 if (num_trimmed3)
496 GMessage(" 3' trimmed :%9d (min. trim: %d)\n", num_trimmed3, min_trimmed3);
497 GMessage("------------------------------------\n");
498 if (trash_s>0)
499 GMessage(" Trashed by length:%9d\n", trash_s);
500 if (trash_N>0)
501 GMessage(" Trashed by N%%:%9d\n", trash_N);
502 if (trash_Q>0)
503 GMessage("Trashed by low quality:%9d\n", trash_Q);
504 if (trash_poly>0)
505 GMessage(" Trashed by poly-A/T:%9d\n", trash_poly);
506 if (trash_A5>0)
507 GMessage(" Trashed by 5' adaptor:%9d\n", trash_A5);
508 if (trash_A3>0)
509 GMessage(" Trashed by 3' adaptor:%9d\n", trash_A3);
510 }
511 if (trashReport) {
512 FWCLOSE(freport);
513 }
514 FWCLOSE(f_out);
515 FWCLOSE(f_out2);
516 } //while each input file
517 delete gxmem_l;
518 delete gxmem_r;
519 //getc(stdin);
520 }
521
522 class NData {
523 public:
524 int NPos[1024]; //there should be no reads longer than 1K ?
525 int NCount;
526 int end5;
527 int end3;
528 int seqlen;
529 double perc_N; //percentage of Ns in end5..end3 range only!
530 const char* seq;
531 bool valid;
532 NData() {
533 seqlen=0;
534 NCount=0;
535 end5=0;
536 end3=0;
537 seq=NULL;
538 perc_N=0;
539 valid=true;
540 }
541 void init(GStr& rseq) {
542 NCount=0;
543 perc_N=0;
544 seqlen=rseq.length();
545 seq=rseq.chars();
546 end5=1;
547 end3=seqlen;
548 for (int i=0;i<seqlen;i++)
549 if (seq[i]=='N') {// if (!ichrInStr(rseq[i], "ACGT")
550 NPos[NCount]=i;
551 NCount++;
552 }
553 perc_N=(NCount*100.0)/seqlen;
554 valid=true;
555 }
556 void N_calc() { //only in the region after trimming
557 int n=0;
558 for (int i=end3-1;i<end5;i++) {
559 if (seq[i]=='N') n++;
560 }
561 perc_N=(n*100.0)/(end5-end3+1);
562 }
563 };
564
565 static NData feat;
566 int perc_lenN=12; // incremental distance from ends, in percentage of
567 // sequence length, where N-trimming is done (default:12 %) (autolimited to 20)
568
569 void N_analyze(int l5, int l3, int p5, int p3) {
570 /* assumes feat was filled properly */
571 int old_dif, t5,t3,v;
572 if (l3<l5+2 || p5>p3 ) {
573 feat.end5=l5+1;
574 feat.end3=l3+1;
575 return;
576 }
577
578 t5=feat.NPos[p5]-l5;
579 t3=l3-feat.NPos[p3];
580 old_dif=p3-p5;
581 v=(int)((((double)(l3-l5))*perc_lenN)/100);
582 if (v>20) v=20; /* enforce N-search limit for very long reads */
583 if (t5 < v ) {
584 l5=feat.NPos[p5]+1;
585 p5++;
586 }
587 if (t3 < v) {
588 l3=feat.NPos[p3]-1;
589 p3--;
590 }
591 /* restNs=p3-p5; number of Ns in the new CLR */
592 if (p3-p5==old_dif) { /* no change, return */
593 /* don't trim if this may shorten the read too much */
594 //if (l5-l3<min_read_len) return;
595 feat.end5=l5+1;
596 feat.end3=l3+1;
597 return;
598 }
599 else
600 N_analyze(l5,l3, p5,p3);
601 }
602
603
604 bool qtrim(GStr& qvs, int &l5, int &l3) {
605 if (qvtrim_qmin==0 || qvs.is_empty()) return false;
606 if (qv_phredtype==0) {
607 //try to guess the Phred type
608 int vmin=256, vmax=0;
609 for (int i=0;i<qvs.length();i++) {
610 if (vmin>qvs[i]) vmin=qvs[i];
611 if (vmax<qvs[i]) vmax=qvs[i];
612 }
613 if (vmin<64) { qv_phredtype=33; qv_cvtadd=31; }
614 if (vmax>95) { qv_phredtype=64; qv_cvtadd=-31; }
615 if (qv_phredtype==0) {
616 GError("Error: couldn't determine Phred type, please use the -p33 or -p64 !\n");
617 }
618 if (verbose)
619 GMessage("Input reads have Phred-%d quality values.\n", (qv_phredtype==33 ? 33 : 64));
620 } //guessing Phred type
621 for (l3=qvs.length()-1;l3>2;l3--) {
622 if (qvs[l3]-qv_phredtype>=qvtrim_qmin && qvs[l3-1]-qv_phredtype>=qvtrim_qmin) break;
623 }
624 //just in case, check also the 5' the end (?)
625 for (l5=0;l5<qvs.length()-3;l5++) {
626 if (qvs[l5]-qv_phredtype>=qvtrim_qmin && qvs[l5+1]-qv_phredtype>=qvtrim_qmin) break;
627 }
628 if (qvtrim_max>0) {
629 if (qvs.length()-1-l3>qvtrim_max) l3=qvs.length()-1-qvtrim_max;
630 if (l5>qvtrim_max) l5=qvtrim_max;
631 }
632 return (l5>0 || l3<qvs.length()-1);
633 }
634
635 bool ntrim(GStr& rseq, int &l5, int &l3) {
636 //count Ns in the sequence, trim N-rich ends
637 feat.init(rseq);
638 l5=feat.end5-1;
639 l3=feat.end3-1;
640 N_analyze(feat.end5-1, feat.end3-1, 0, feat.NCount-1);
641 if (l5==feat.end5-1 && l3==feat.end3-1) {
642 if (feat.perc_N>max_perc_N) {
643 feat.valid=false;
644 l3=l5+1;
645 return true;
646 }
647 else {
648 return false; //nothing changed
649 }
650 }
651 l5=feat.end5-1;
652 l3=feat.end3-1;
653 if (l3-l5+1<min_read_len) {
654 feat.valid=false;
655 return true;
656 }
657 feat.N_calc();
658
659 if (feat.perc_N>max_perc_N) {
660 feat.valid=false;
661 l3=l5+1;
662 return true;
663 }
664 return true;
665 }
666
667 //--------------- dust functions ----------------
668 class DNADuster {
669 public:
670 int dustword;
671 int dustwindow;
672 int dustwindow2;
673 int dustcutoff;
674 int mv, iv, jv;
675 int counts[32*32*32];
676 int iis[32*32*32];
677 DNADuster(int cutoff=16, int winsize=32, int wordsize=3) {
678 dustword=wordsize;
679 dustwindow=winsize;
680 dustwindow2 = (winsize>>1);
681 dustcutoff=cutoff;
682 mv=0;
683 iv=0;
684 jv=0;
685 }
686 void setWindowSize(int value) {
687 dustwindow = value;
688 dustwindow2 = (dustwindow >> 1);
689 }
690 void setWordSize(int value) {
691 dustword=value;
692 }
693 void wo1(int len, const char* s, int ivv) {
694 int i, ii, j, v, t, n, n1, sum;
695 int js, nis;
696 n = 32 * 32 * 32;
697 n1 = n - 1;
698 nis = 0;
699 i = 0;
700 ii = 0;
701 sum = 0;
702 v = 0;
703 for (j=0; j < len; j++, s++) {
704 ii <<= 5;
705 if (*s<=32) {
706 i=0;
707 continue;
708 }
709 ii |= *s - 'A'; //assume uppercase!
710 ii &= n1;
711 i++;
712 if (i >= dustword) {
713 for (js=0; js < nis && iis[js] != ii; js++) ;
714 if (js == nis) {
715 iis[nis] = ii;
716 counts[ii] = 0;
717 nis++;
718 }
719 if ((t = counts[ii]) > 0) {
720 sum += t;
721 v = 10 * sum / j;
722 if (mv < v) {
723 mv = v;
724 iv = ivv;
725 jv = j;
726 }
727 }
728 counts[ii]++;
729 }
730 }
731 }
732
733 int wo(int len, const char* s, int* beg, int* end) {
734 int i, l1;
735 l1 = len - dustword + 1;
736 if (l1 < 0) {
737 *beg = 0;
738 *end = len - 1;
739 return 0;
740 }
741 mv = 0;
742 iv = 0;
743 jv = 0;
744 for (i=0; i < l1; i++) {
745 wo1(len-i, s+i, i);
746 }
747 *beg = iv;
748 *end = iv + jv;
749 return mv;
750 }
751
752 void dust(const char* seq, char* seqmsk, int seqlen, int cutoff=0) { //, maskFunc maskfn) {
753 int i, j, l, a, b, v;
754 if (cutoff==0) cutoff=dustcutoff;
755 a=0;b=0;
756 //GMessage("Dust cutoff=%d\n", cutoff);
757 for (i=0; i < seqlen; i += dustwindow2) {
758 l = (seqlen > i+dustwindow) ? dustwindow : seqlen-i;
759 v = wo(l, seq+i, &a, &b);
760 if (v > cutoff) {
761 //for (j = a; j <= b && j < dustwindow2; j++) {
762 for (j = a; j <= b; j++) {
763 seqmsk[i+j]='N';//could be made lowercase instead
764 }
765 }
766 }
767 //return first;
768 }
769 };
770
771 static DNADuster duster;
772
773 int dust(GStr& rseq) {
774 char* seq=Gstrdup(rseq.chars());
775 duster.dust(rseq.chars(), seq, rseq.length(), dust_cutoff);
776 //check the number of Ns:
777 int ncount=0;
778 for (int i=0;i<rseq.length();i++) {
779 if (seq[i]=='N') ncount++;
780 }
781 GFREE(seq);
782 return ncount;
783 }
784
785 struct SLocScore {
786 int pos;
787 int score;
788 SLocScore(int p=0,int s=0) {
789 pos=p;
790 score=s;
791 }
792 void set(int p, int s) {
793 pos=p;
794 score=s;
795 }
796 void add(int p, int add) {
797 pos=p;
798 score+=add;
799 }
800 };
801
802 bool trim_poly3(GStr &seq, int &l5, int &l3, const char* poly_seed) {
803 if (!doPolyTrim) return false;
804 int rlen=seq.length();
805 l5=0;
806 l3=rlen-1;
807 int32 seedVal=*(int32*)poly_seed;
808 char polyChar=poly_seed[0];
809 //assumes N trimming was already done
810 //so a poly match should be very close to the end of the read
811 // -- find the initial match (seed)
812 int lmin=GMAX((rlen-16), 0);
813 int li;
814 for (li=rlen-4;li>lmin;li--) {
815 if (seedVal==*(int*)&(seq[li])) {
816 break;
817 }
818 }
819 if (li<=lmin) return false;
820 //seed found, try to extend it both ways
821 //extend right
822 int ri=li+3;
823 SLocScore loc(ri, poly_m_score<<2);
824 SLocScore maxloc(loc);
825 //extend right
826 while (ri<rlen-1) {
827 ri++;
828 if (seq[ri]==polyChar) {
829 loc.add(ri,poly_m_score);
830 }
831 else if (seq[ri]=='N') {
832 loc.add(ri,0);
833 }
834 else { //mismatch
835 loc.add(ri,poly_mis_score);
836 if (maxloc.score-loc.score>poly_dropoff_score) break;
837 }
838 if (maxloc.score<=loc.score) {
839 maxloc=loc;
840 }
841 }
842 ri=maxloc.pos;
843 if (ri<rlen-6) return false; //no trimming wanted, too far from 3' end
844 //ri = right boundary for the poly match
845 //extend left
846 loc.set(li, maxloc.score);
847 maxloc.pos=li;
848 while (li>0) {
849 li--;
850 if (seq[li]==polyChar) {
851 loc.add(li,poly_m_score);
852 }
853 else if (seq[li]=='N') {
854 loc.add(li,0);
855 }
856 else { //mismatch
857 loc.add(li,poly_mis_score);
858 if (maxloc.score-loc.score>poly_dropoff_score) break;
859 }
860 if (maxloc.score<=loc.score) {
861 maxloc=loc;
862 }
863 }
864 li=maxloc.pos;
865 if ((maxloc.score==poly_minScore && ri==rlen-1) ||
866 (maxloc.score>poly_minScore && ri>=rlen-3) ||
867 (maxloc.score>(poly_minScore*3) && ri>=rlen-8)) {
868 //trimming this li-ri match at 3' end
869 l3=li-1;
870 if (l3<0) l3=0;
871 return true;
872 }
873 return false;
874 }
875
876 bool trim_poly5(GStr &seq, int &l5, int &l3, const char* poly_seed) {
877 if (!doPolyTrim) return false;
878 int rlen=seq.length();
879 l5=0;
880 l3=rlen-1;
881 int32 seedVal=*(int32*)poly_seed;
882 char polyChar=poly_seed[0];
883 //assumes N trimming was already done
884 //so a poly match should be very close to the end of the read
885 // -- find the initial match (seed)
886 int lmax=GMIN(12, rlen-4);//how far from 5' end to look for 4-mer seeds
887 int li;
888 for (li=0;li<=lmax;li++) {
889 if (seedVal==*(int*)&(seq[li])) {
890 break;
891 }
892 }
893 if (li>lmax) return false;
894 //seed found, try to extend it both ways
895 //extend left
896 int ri=li+3; //save rightmost base of the seed
897 SLocScore loc(li, poly_m_score<<2);
898 SLocScore maxloc(loc);
899 while (li>0) {
900 li--;
901 if (seq[li]==polyChar) {
902 loc.add(li,poly_m_score);
903 }
904 else if (seq[li]=='N') {
905 loc.add(li,0);
906 }
907 else { //mismatch
908 loc.add(li,poly_mis_score);
909 if (maxloc.score-loc.score>poly_dropoff_score) break;
910 }
911 if (maxloc.score<=loc.score) {
912 maxloc=loc;
913 }
914 }
915 li=maxloc.pos;
916 if (li>5) return false; //no trimming wanted, too far from 5' end
917 //li = right boundary for the poly match
918
919 //extend right
920 loc.set(ri, maxloc.score);
921 maxloc.pos=ri;
922 while (ri<rlen-1) {
923 ri++;
924 if (seq[ri]==polyChar) {
925 loc.add(ri,poly_m_score);
926 }
927 else if (seq[ri]=='N') {
928 loc.add(ri,0);
929 }
930 else { //mismatch
931 loc.add(ri,poly_mis_score);
932 if (maxloc.score-loc.score>poly_dropoff_score) break;
933 }
934 if (maxloc.score<=loc.score) {
935 maxloc=loc;
936 }
937 }
938 ri=maxloc.pos;
939 if ((maxloc.score==poly_minScore && li==0) ||
940 (maxloc.score>poly_minScore && li<2)
941 || (maxloc.score>(poly_minScore*3) && li<8)) {
942 //adjust l5 to reflect this trimming of 5' end
943 l5=ri+1;
944 if (l5>rlen-1) l5=rlen-1;
945 return true;
946 }
947 return false;
948 }
949
950 bool trim_adaptor3(GStr& seq, int&l5, int &l3) {
951 if (adaptors3.Count()==0) return false;
952 int rlen=seq.length();
953 l5=0;
954 l3=rlen-1;
955 bool trimmed=false;
956 GStr wseq(seq);
957 int wlen=rlen;
958 GXSeqData seqdata;
959 int numruns=revCompl ? 2 : 1;
960 for (int ai=0;ai<adaptors3.Count();ai++) {
961 for (int r=0;r<numruns;r++) {
962 if (r) {
963 seqdata.update(adaptors3[ai].seqr.chars(), adaptors3[ai].seqr.length(),
964 adaptors3[ai].pzr, wseq.chars(), wlen, adaptors3[ai].amlen);
965 }
966 else {
967 seqdata.update(adaptors3[ai].seq.chars(), adaptors3[ai].seq.length(),
968 adaptors3[ai].pz, wseq.chars(), wlen, adaptors3[ai].amlen);
969 }
970
971 GXAlnInfo* bestaln=match_adaptor(seqdata, adaptors3[ai].trim_type, gxmem_r, 86);
972 if (bestaln) {
973 trimmed=true;
974 //keep unmatched region on the left OR right (the longer one)
975 if (bestaln->sl > wlen-bestaln->sr) {
976 //keep left side
977 l3-=(wlen-bestaln->sl+1);
978 if (l3<0) l3=0;
979 }
980 else { //keep right side
981 l5+=bestaln->sr;
982 if (l5>=rlen) l5=rlen-1;
983 }
984 delete bestaln;
985 if (l3-l5+1<min_read_len) return true;
986 wseq=seq.substr(l5,l3-l5+1);
987 wlen=wseq.length();
988 }
989 }//forward and reverse adaptors
990 }//for each 3' adaptor
991 return trimmed;
992 }
993
994 bool trim_adaptor5(GStr& seq, int&l5, int &l3) {
995 if (adaptors5.Count()==0) return false;
996 int rlen=seq.length();
997 l5=0;
998 l3=rlen-1;
999 bool trimmed=false;
1000 GStr wseq(seq);
1001 int wlen=rlen;
1002 GXSeqData seqdata;
1003 int numruns=revCompl ? 2 : 1;
1004 for (int ai=0;ai<adaptors5.Count();ai++) {
1005 for (int r=0;r<numruns;r++) {
1006 if (r) {
1007 seqdata.update(adaptors5[ai].seqr.chars(), adaptors5[ai].seqr.length(),
1008 adaptors5[ai].pzr, wseq.chars(), wlen, adaptors5[ai].amlen);
1009 }
1010 else {
1011 seqdata.update(adaptors5[ai].seq.chars(), adaptors5[ai].seq.length(),
1012 adaptors5[ai].pz, wseq.chars(), wlen, adaptors5[ai].amlen);
1013 }
1014 GXAlnInfo* bestaln=match_adaptor(seqdata, adaptors5[ai].trim_type, gxmem_l, 90);
1015 if (bestaln) {
1016 trimmed=true;
1017 if (bestaln->sl-1 > wlen-bestaln->sr) {
1018 //keep left side
1019 l3-=(wlen-bestaln->sl+1);
1020 if (l3<0) l3=0;
1021 }
1022 else { //keep right side
1023 l5+=bestaln->sr;
1024 if (l5>=rlen) l5=rlen-1;
1025 }
1026 delete bestaln;
1027 if (l3-l5+1<min_read_len) return true;
1028 wseq=seq.substr(l5,l3-l5+1);
1029 wlen=wseq.length();
1030 }
1031 } //forward and reverse?
1032 }//for each 5' adaptor
1033 return trimmed;
1034 }
1035
1036 //convert qvs to/from phred64 from/to phread33
1037 void convertPhred(GStr& q) {
1038 for (int i=0;i<q.length();i++) q[i]+=qv_cvtadd;
1039 }
1040
1041 void convertPhred(char* q, int len) {
1042 for (int i=0;i<len;i++) q[i]+=qv_cvtadd;
1043 }
1044
1045 bool getFastxRec(GLineReader& fq, GStr& rseq, GStr& rqv,
1046 GStr& rname, GStr& rinfo, GStr& infname) {
1047 rseq="";
1048 rqv="";
1049 rname="";
1050 rinfo="";
1051 if (fq.eof()) return false;
1052 char* l=fq.getLine();
1053 while (l!=NULL && (l[0]==0 || isspace(l[0]))) l=fq.getLine(); //ignore empty lines
1054 if (l==NULL) return false;
1055 /* if (rawFormat) {
1056 //TODO: implement raw qseq parsing here
1057 //if (raw type=N) then continue; //skip invalid/bad records
1058 } //raw qseq format
1059 else { // FASTQ or FASTA */
1060 isfasta=(l[0]=='>');
1061 if (!isfasta && l[0]!='@') GError("Error: fasta/fastq record marker not found(%s)\n%s\n",
1062 infname.chars(), l);
1063 GStr s(l);
1064 rname=&(l[1]);
1065 for (int i=0;i<rname.length();i++)
1066 if (rname[i]<=' ') {
1067 if (i<rname.length()-2) rinfo=rname.substr(i+1);
1068 rname.cut(i);
1069 break;
1070 }
1071 //now get the sequence
1072 if ((l=fq.getLine())==NULL)
1073 GError("Error: unexpected EOF after header for read %s (%s)\n",
1074 rname.chars(), infname.chars());
1075 rseq=l; //this must be the DNA line
1076 while ((l=fq.getLine())!=NULL) {
1077 //seq can span multiple lines
1078 if (l[0]=='>' || l[0]=='+') {
1079 fq.pushBack();
1080 break; //
1081 }
1082 rseq+=l;
1083 } //check for multi-line seq
1084 if (!isfasta) { //reading fastq quality values, which can also be multi-line
1085 if ((l=fq.getLine())==NULL)
1086 GError("Error: unexpected EOF after sequence for %s\n", rname.chars());
1087 if (l[0]!='+') GError("Error: fastq qv header marker not detected!\n");
1088 if ((l=fq.getLine())==NULL)
1089 GError("Error: unexpected EOF after qv header for %s\n", rname.chars());
1090 rqv=l;
1091 //if (rqv.length()!=rseq.length())
1092 // GError("Error: qv len != seq len for %s\n", rname.chars());
1093 while (rqv.length()<rseq.length() && ((l=fq.getLine())!=NULL)) {
1094 rqv+=l; //append to qv string
1095 }
1096 }// fastq
1097 // } //<-- FASTA or FASTQ
1098 rseq.upper();
1099 return true;
1100 }
1101
1102 #ifdef GDEBUG
1103 void showTrim(GStr& s, int l5, int l3) {
1104 if (l5>0 || l3==0) {
1105 color_bg(c_red);
1106 }
1107 for (int i=0;i<s.length()-1;i++) {
1108 if (i && i==l5) color_resetbg();
1109 fprintf(stderr, "%c", s[i]);
1110 if (i && i==l3) color_bg(c_red);
1111 }
1112 fprintf(stderr, "%c", s[s.length()-1]);
1113 color_reset();
1114 fprintf(stderr, "\n");
1115 }
1116 #endif
1117
1118 char process_read(GStr& rname, GStr& rseq, GStr& rqv, int &l5, int &l3) {
1119 //returns 0 if the read was untouched, 1 if it was just trimmed
1120 // and a trash code if it was trashed
1121 l5=0;
1122 l3=rseq.length()-1;
1123 #ifdef GDEBUG
1124 //rseq.reverse();
1125 GMessage(">%s\n", rname.chars());
1126 GMessage("%s\n",rseq.chars());
1127 #endif
1128 if (l3-l5+1<min_read_len) {
1129 return 's';
1130 }
1131 GStr wseq(rseq.chars());
1132 GStr wqv(rqv.chars());
1133 int w5=l5;
1134 int w3=l3;
1135 //first do the q-based trimming
1136 if (qvtrim_qmin!=0 && !wqv.is_empty() && qtrim(wqv, w5, w3)) { // qv-threshold trimming
1137 if (w3-w5+1<min_read_len) {
1138 return 'Q'; //invalid read
1139 }
1140 //-- keep only the w5..w3 range
1141 l5=w5;
1142 l3=w3;
1143 wseq=wseq.substr(w5, w3-w5+1);
1144 if (!wqv.is_empty())
1145 wqv=wqv.substr(w5, w3-w5+1);
1146 } //qv trimming
1147 // N-trimming on the remaining read seq
1148 if (ntrim(wseq, w5, w3)) {
1149 //GMessage("before: %s\n",wseq.chars());
1150 //GMessage("after : %s (%d)\n",wseq.substr(w5,w3-w5+1).chars(),w3-w5+1);
1151 l5+=w5;
1152 l3-=(wseq.length()-1-w3);
1153 if (w3-w5+1<min_read_len) {
1154 return 'N'; //to be trashed
1155 }
1156 //-- keep only the w5..w3 range
1157 wseq=wseq.substr(w5, w3-w5+1);
1158 if (!wqv.is_empty())
1159 wqv=wqv.substr(w5, w3-w5+1);
1160 w5=0;
1161 w3=wseq.length()-1;
1162 }
1163 char trim_code;
1164 //clean the more dirty end first - 3'
1165 int prev_w5=0;
1166 int prev_w3=0;
1167 bool w3upd=true;
1168 bool w5upd=true;
1169 do {
1170 trim_code=0;
1171 if (w3upd && trim_poly3(wseq, w5, w3, polyA_seed)) {
1172 trim_code='A';
1173 }
1174 else if (w3upd && trim_poly3(wseq, w5, w3, polyT_seed)) {
1175 trim_code='T';
1176 }
1177 else if (w5upd && trim_poly5(wseq, w5, w3, polyA_seed)) {
1178 trim_code='A';
1179 }
1180 else if (w5upd && trim_poly5(wseq, w5, w3, polyT_seed)) {
1181 trim_code='T';
1182 }
1183 else if (trim_adaptor5(wseq, w5, w3)) {
1184 trim_code='5';
1185 }
1186 else if (trim_adaptor3(wseq, w5, w3)) {
1187 trim_code='3';
1188 }
1189 if (trim_code) {
1190 w3upd=(w3!=prev_w3);
1191 w5upd=(w5!=prev_w5);
1192 if (w3upd) prev_w3=w3;
1193 if (w5upd) prev_w5=w5;
1194 #ifdef GDEBUG
1195 GMessage("#### TRIM by '%c' code ( w5-w3 = %d-%d ):\n",trim_code, w5,w3);
1196 showTrim(wseq, w5, w3);
1197 #endif
1198 int trimlen=wseq.length()-(w3-w5+1);
1199 num_trimmed3++;
1200 if (trimlen<min_trimmed3)
1201 min_trimmed3=trimlen;
1202 l5+=w5;
1203 l3-=(wseq.length()-1-w3);
1204 if (w3-w5+1<min_read_len) {
1205 return trim_code;
1206 }
1207 //-- keep only the w5..w3 range
1208 wseq=wseq.substr(w5, w3-w5+1);
1209 if (!wqv.is_empty())
1210 wqv=wqv.substr(w5, w3-w5+1);
1211 }//trimming at 3' end
1212 } while (trim_code);
1213
1214 if (doCollapse) {
1215 //keep read for later
1216 FqDupRec* dr=dhash.Find(wseq.chars());
1217 if (dr==NULL) { //new entry
1218 //if (prefix.is_empty())
1219 dhash.Add(wseq.chars(),
1220 new FqDupRec(&wqv, rname.chars()));
1221 //else dhash.Add(wseq.chars(), new FqDupRec(wqv.chars(),wqv.length()));
1222 }
1223 else
1224 dr->add(wqv);
1225 } //collapsing duplicates
1226 else { //not collapsing duplicates
1227 //apply the dust filter now
1228 if (doDust) {
1229 int dustbases=dust(wseq);
1230 if (dustbases>(wseq.length()>>1)) {
1231 return 'D';
1232 }
1233 }
1234 } //not collapsing duplicates
1235 return (l5>0 || l3<rseq.length()-1) ? 1 : 0;
1236 }
1237
1238 void printHeader(FILE* f_out, char recmarker, GStr& rname, GStr& rinfo) {
1239 //GMessage("printing Header..%c%s\n",recmarker, rname.chars());
1240 if (rinfo.is_empty()) fprintf(f_out, "%c%s\n",recmarker,rname.chars());
1241 else fprintf(f_out, "%c%s %s\n",recmarker, rname.chars(), rinfo.chars());
1242 }
1243
1244 void writeRead(FILE* f_out, GStr& rname, GStr& rinfo, GStr& rseq, GStr& rqv, int& outcounter) {
1245 outcounter++;
1246 bool asFasta=(rqv.is_empty() || fastaOutput);
1247 if (asFasta) {
1248 if (prefix.is_empty()) {
1249 printHeader(f_out, '>',rname,rinfo);
1250 fprintf(f_out, "%s\n", rseq.chars()); //plain one-line fasta for now
1251 }
1252 else {
1253 fprintf(f_out, ">%s%08d\n%s\n", prefix.chars(), outcounter,
1254 rseq.chars());
1255 }
1256 }
1257 else { //fastq
1258 if (convert_phred) convertPhred(rqv);
1259 if (prefix.is_empty()) {
1260 printHeader(f_out, '@', rname, rinfo);
1261 fprintf(f_out, "%s\n+\n%s\n", rseq.chars(), rqv.chars());
1262 }
1263 else
1264 fprintf(f_out, "@%s_%08d\n%s\n+\n%s\n", prefix.chars(), outcounter,
1265 rseq.chars(),rqv.chars() );
1266 }
1267 }
1268
1269 void trash_report(char trashcode, GStr& rname, FILE* freport) {
1270 if (freport==NULL || trashcode<=' ') return;
1271 if (trashcode=='3' || trashcode=='5') {
1272 fprintf(freport, "%s\ta%c\n",rname.chars(),trashcode);
1273 }
1274 else {
1275 fprintf(freport, "%s\t%c\n",rname.chars(),trashcode);
1276 }
1277 //tcounter++;
1278 }
1279
1280 GStr getFext(GStr& s, int* xpos=NULL) {
1281 //s must be a filename without a path
1282 GStr r("");
1283 if (xpos!=NULL) *xpos=0;
1284 if (s.is_empty() || s=="-") return r;
1285 int p=s.rindex('.');
1286 int d=s.rindex('/');
1287 if (p<=0 || p>s.length()-2 || p<s.length()-7 || p<d) return r;
1288 r=s.substr(p+1);
1289 if (xpos!=NULL) *xpos=p+1;
1290 r.lower();
1291 return r;
1292 }
1293
1294 void baseFileName(GStr& fname) {
1295 //remove all known extensions, like .txt,fq,fastq,fasta,fa)(.gz .gzip .bz2 .bzip2) .
1296 int xpos=0;
1297 GStr fext=getFext(fname, &xpos);
1298 if (xpos<=1) return;
1299 bool extdel=false;
1300 GStr f2;
1301 if (fext=="z" || fext=="zip") {
1302 extdel=true;
1303 }
1304 else if (fext.length()>=2) {
1305 f2=fext.substr(0,2);
1306 extdel=(f2=="gz" || f2=="bz");
1307 }
1308 if (extdel) {
1309 fname.cut(xpos-1);
1310 fext=getFext(fname, &xpos);
1311 if (xpos<=1) return;
1312 }
1313 extdel=false;
1314 if (fext=="f" || fext=="fq" || fext=="txt" || fext=="seq" || fext=="sequence") {
1315 extdel=true;
1316 }
1317 else if (fext.length()>=2) {
1318 extdel=(fext.substr(0,2)=="fa");
1319 }
1320 if (extdel) fname.cut(xpos-1);
1321 GStr fncp(fname);
1322 fncp.lower();
1323 fncp.chomp("seq");
1324 fncp.chomp("sequence");
1325 fncp.trimR("_.");
1326 if (fncp.length()<fname.length()) fname.cut(fncp.length());
1327 }
1328
1329 FILE* prepOutFile(GStr& infname, GStr& pocmd) {
1330 FILE* f_out=NULL;
1331 GStr fname(getFileName(infname.chars()));
1332 //eliminate known extensions
1333 baseFileName(fname);
1334 if (outsuffix.is_empty() || outsuffix=="-") { return stdout; }
1335 else if (pocmd.is_empty()) {
1336 GStr oname(fname);
1337 oname.append('.');
1338 oname.append(outsuffix);
1339 f_out=fopen(oname.chars(),"w");
1340 if (f_out==NULL) GError("Error: cannot create '%s'\n",oname.chars());
1341 }
1342 else {
1343 GStr oname(">");
1344 oname.append(fname);
1345 oname.append('.');
1346 oname.append(outsuffix);
1347 pocmd.append(oname);
1348 f_out=popen(pocmd.chars(), "w");
1349 if (f_out==NULL) GError("Error: cannot popen '%s'\n",pocmd.chars());
1350 }
1351 return f_out;
1352 }
1353
1354 void guess_unzip(GStr& fname, GStr& picmd) {
1355 GStr fext=getFext(fname);
1356 if (fext=="gz" || fext=="gzip" || fext=="z") {
1357 picmd="gzip -cd ";
1358 }
1359 else if (fext=="bz2" || fext=="bzip2" || fext=="bz" || fext=="bzip") {
1360 picmd="bzip2 -cd ";
1361 }
1362 }
1363
1364 void addAdaptor(GVec<CASeqData>& adaptors, GStr& seq, GAlnTrimType trim_type) {
1365 //TODO: prepare CASeqData here, and collect hexamers as well
1366 if (seq.is_empty() || seq=="-" ||
1367 seq=="N/A" || seq==".") return;
1368
1369 CASeqData adata(revCompl);
1370 int idx=adaptors.Add(adata);
1371 if (idx<0) GError("Error: failed to add adaptor!\n");
1372 adaptors[idx].trim_type=trim_type;
1373 adaptors[idx].update(seq.chars());
1374 }
1375
1376
1377 int loadAdaptors(const char* fname) {
1378 GLineReader lr(fname);
1379 char* l;
1380 while ((l=lr.nextLine())!=NULL) {
1381 if (lr.length()<=3 || l[0]=='#') continue;
1382 if ( l[0]==' ' || l[0]=='\t' || l[0]==',' ||
1383 l[0]==';'|| l[0]==':' ) {
1384 int i=1;
1385 while (l[i]!=0 && isspace(l[i])) {
1386 i++;
1387 }
1388 if (l[i]!=0) {
1389 GStr s(&(l[i]));
1390 #ifdef GDEBUG
1391 //s.reverse();
1392 #endif
1393 addAdaptor(adaptors3, s, galn_TrimRight);
1394 continue;
1395 }
1396 }
1397 else {
1398 GStr s(l);
1399 s.startTokenize("\t ;,:");
1400 GStr a5,a3;
1401 if (s.nextToken(a5))
1402 s.nextToken(a3);
1403 else continue; //no tokens on this line
1404 GAlnTrimType ttype5=galn_TrimLeft;
1405 a5.upper();
1406 a3.upper();
1407 if (a3.is_empty() || a3==a5 || a3=="=") {
1408 a3.clear();
1409 ttype5=galn_TrimEither;
1410 }
1411 #ifdef GDEBUG
1412 // a5.reverse();
1413 // a3.reverse();
1414 #endif
1415 addAdaptor(adaptors5, a5, ttype5);
1416 addAdaptor(adaptors3, a3, galn_TrimRight);
1417 }
1418 }
1419 return adaptors5.Count()+adaptors3.Count();
1420 }
1421
1422 void setupFiles(FILE*& f_in, FILE*& f_in2, FILE*& f_out, FILE*& f_out2,
1423 GStr& s, GStr& infname, GStr& infname2) {
1424 // uses outsuffix to generate output file names and open file handles as needed
1425 infname="";
1426 infname2="";
1427 f_in=NULL;
1428 f_in2=NULL;
1429 f_out=NULL;
1430 f_out2=NULL;
1431 //analyze outsuffix intent
1432 GStr pocmd;
1433 if (outsuffix=="-") {
1434 f_out=stdout;
1435 }
1436 else {
1437 GStr ox=getFext(outsuffix);
1438 if (ox.length()>2) ox=ox.substr(0,2);
1439 if (ox=="gz") pocmd="gzip -9 -c ";
1440 else if (ox=="bz") pocmd="bzip2 -9 -c ";
1441 }
1442 if (s=="-") {
1443 f_in=stdin;
1444 infname="stdin";
1445 f_out=prepOutFile(infname, pocmd);
1446 return;
1447 } // streaming from stdin
1448 s.startTokenize(",:");
1449 s.nextToken(infname);
1450 bool paired=s.nextToken(infname2);
1451 if (fileExists(infname.chars())==0)
1452 GError("Error: cannot find file %s!\n",infname.chars());
1453 GStr fname(getFileName(infname.chars()));
1454 GStr picmd;
1455 guess_unzip(fname, picmd);
1456 if (picmd.is_empty()) {
1457 f_in=fopen(infname.chars(), "r");
1458 if (f_in==NULL) GError("Error opening file '%s'!\n",infname.chars());
1459 }
1460 else {
1461 picmd.append(infname);
1462 f_in=popen(picmd.chars(), "r");
1463 if (f_in==NULL) GError("Error at popen %s!\n", picmd.chars());
1464 }
1465 if (f_out==stdout) {
1466 if (paired) GError("Error: output suffix required for paired reads\n");
1467 return;
1468 }
1469 f_out=prepOutFile(infname, pocmd);
1470 if (!paired) return;
1471 if (doCollapse) GError("Error: sorry, -C option cannot be used with paired reads!\n");
1472 // ---- paired reads:-------------
1473 if (fileExists(infname2.chars())==0)
1474 GError("Error: cannot find file %s!\n",infname2.chars());
1475 picmd="";
1476 GStr fname2(getFileName(infname2.chars()));
1477 guess_unzip(fname2, picmd);
1478 if (picmd.is_empty()) {
1479 f_in2=fopen(infname2.chars(), "r");
1480 if (f_in2==NULL) GError("Error opening file '%s'!\n",infname2.chars());
1481 }
1482 else {
1483 picmd.append(infname2);
1484 f_in2=popen(picmd.chars(), "r");
1485 if (f_in2==NULL) GError("Error at popen %s!\n", picmd.chars());
1486 }
1487 f_out2=prepOutFile(infname2, pocmd);
1488 }