ViewVC Help
View File | Revision Log | Show Annotations | View Changeset | Root Listing
root/gclib/fqtrim/fqtrim.cpp
Revision: 185
Committed: Thu Feb 16 22:05:42 2012 UTC (7 years, 9 months ago) by gpertea
File size: 44641 byte(s)
Log Message:
Line File contents
1 #include "GArgs.h"
2 #include "GStr.h"
3 #include "GHash.hh"
4 #include "GList.hh"
5 #include <ctype.h>
6 #include "GAlnExtend.h"
7
8 #define USAGE "Usage:\n\
9 fqtrim [{-5 <5adaptor> -3 <3adaptor>|-f <adaptors_file>}] [-a <min_matchlen>]\\\n\
10 [-R] [-q <minq> [-t <trim_max_len>]] [-p {64|33}] [-o <outsuffix>]\\\n\
11 [-l <minlen>] [-C] [-D] [-Q] [-n <rename_prefix>] [-r <discarded.lst>]\\\n\
12 <input.fq>[,<input_mates.fq>\n\
13 \n\
14 Trim low quality bases at the 3' end and can trim adaptor sequence(s), filter\n\
15 for low complexity and collapse duplicate reads.\n\
16 If read pairs should be trimmed and kept together (i.e. without discarding\n\
17 one read in a pair), the two file names should be given delimited by a comma\n\
18 or a colon character\n\
19 \n\
20 Options:\n\
21 -n rename all the reads using the <prefix> followed by a read counter;\n\
22 if -C option was given, the suffix \"_x<N>\" is appended, with <N> being\n\
23 the read duplication count\n\
24 -o unless this parameter is '-', write the trimmed/filtered reads to \n\
25 file(s) named <input>.<outsuffix> which will be created in the \n\
26 current (working) directory; (writes to stdout if -o- is given);\n\
27 a suffix ending with .gz, .gzip or .bz2 will enforce compression\n\
28 -f file with adaptor sequences to trim, each line having this format:\n\
29 <5'-adaptor-sequence> <3'-adaptor-sequence>\n\
30 -5 trim the given adaptor or primer sequence at the 5' end of each read\n\
31 (e.g. -5 CGACAGGTTCAGAGTTCTACAGTCCGACGATC)\n\
32 -3 trim the given adaptor sequence at the 3' end of each read\n\
33 (e.g. -3 TCGTATGCCGTCTTCTGCTTG)\n\
34 -A disable polyA/T trimming (enabled by default)\n\
35 -y minimum length of exact match to adaptor sequence at the proper end (6)\n\
36 -q trim bases with quality value lower than <minq> (starting at the 3' end)\n\
37 -t for -q option, maximum trimming at the 3' end is limited to <trim_max_len>\n\
38 -m maximum percentage of Ns allowed in a read after trimming (default 7)\n\
39 -l minimum \"clean\" length after trimming that a read must have\n\
40 in order to pass the filter (default: 16)\n\
41 -r write a simple \"trash report\" file listing the discarded reads with a\n\
42 one letter code for the reason why they were trashed\n\
43 -D apply a low-complexity (dust) filter and discard any read that has over\n\
44 50% of its length detected as low complexity\n\
45 -C collapse duplicate reads and add a prefix with their count to the read\n\
46 name (e.g. for microRNAs)\n\
47 -p input is phred64/phred33 (use -p64 or -p33)\n\
48 -Q convert quality values to the other Phred qv type\n\
49 -V verbose processing\n\
50 "
51
52 //-z for -o option, the output stream(s) will be first piped into the given\n
53 // <zcmd> command, which must output to stdout (e.g. -z 'bzip2 -9 -c')\n
54
55
56 // example 3' adaptor for miRNAs: TCGTATGCCGTCTTCTGCTTG
57
58 //For paired reads sequencing:
59 //3' : ACACTCTTTCCCTACACGACGCTCTTCCGATCT
60 //5' : GATCGGAAGAGCGGTTCAGCAGGAATGCCGAG
61 //FILE* f_out=NULL; //stdout if not provided
62 //FILE* f_out2=NULL; //for paired reads
63 //FILE* f_in=NULL; //input fastq (stdin if not provided)
64 //FILE* f_in2=NULL; //for paired reads
65
66 FILE* freport=NULL;
67
68 bool debug=false;
69 bool verbose=false;
70 bool doCollapse=false;
71 bool doDust=false;
72 bool doPolyTrim=true;
73 bool fastaOutput=false;
74 bool trashReport=false;
75 bool revCompl=false; //also reverse complement adaptor sequences
76 int min_read_len=16;
77 double max_perc_N=7.0;
78 int dust_cutoff=16;
79 bool isfasta=false;
80 bool convert_phred=false;
81 GStr outsuffix; // -o
82 GStr prefix;
83 GStr zcmd;
84 int num_trimmed5=0;
85 int num_trimmed3=0;
86 int num_trimmedN=0;
87 int num_trimmedQ=0;
88 int min_trimmed5=INT_MAX;
89 int min_trimmed3=INT_MAX;
90
91 int qvtrim_qmin=0;
92 int qvtrim_max=0; //for -q, do not trim at 3'-end more than this number of bases
93 int qv_phredtype=0; // could be 64 or 33 (0 means undetermined yet)
94 int qv_cvtadd=0; //could be -31 or +31
95
96 // adaptor matching metrics -- for X-drop ungapped extension
97 //const int match_reward=2;
98 //const int mismatch_penalty=3;
99 const int match_reward=2;
100 const int mismatch_penalty=4;
101 const int Xdrop=10;
102
103 const int poly_m_score=2; //match score
104 const int poly_mis_score=-3; //mismatch
105 const int poly_dropoff_score=7;
106 int poly_minScore=12; //i.e. an exact match of 6 bases at the proper ends WILL be trimmed
107
108 const char *polyA_seed="AAAA";
109 const char *polyT_seed="TTTT";
110
111 struct CASeqData {
112 //positional data for every possible hexamer in an adaptor
113 GVec<uint16>* pz[4096]; //0-based coordinates of all possible hexamers in the adaptor sequence
114 GVec<uint16>* pzr[4096]; //0-based coordinates of all possible hexamers for the reverse complement of the adaptor sequence
115 GStr seq; //actual adaptor sequence data
116 GStr seqr; //reverse complement sequence
117 int amlen; //fraction of adaptor length matching that's
118 //enough to consider the alignment
119 GAlnTrimType trim_type;
120 bool use_reverse;
121 CASeqData(bool rev=false):seq(),seqr(),
122 amlen(0), use_reverse(rev) {
123 trim_type=galn_None; //should be updated later!
124 for (int i=0;i<4096;i++) {
125 pz[i]=NULL;
126 pzr[i]=NULL;
127 }
128 }
129
130 void update(const char* s) {
131 seq=s;
132 table6mers(seq.chars(), seq.length(), pz);
133 amlen=iround(double(seq.length())*0.8);
134 if (amlen<12)
135 amlen=12;
136 if (!use_reverse) return;
137 //reverse complement
138 seqr=s;
139 int slen=seq.length();
140 for (int i=0;i<slen;i++)
141 seqr[i]=ntComplement(seq[slen-i-1]);
142 table6mers(seqr.chars(), seqr.length(), pzr);
143 }
144
145 ~CASeqData() {
146 for (int i=0;i<4096;i++) {
147 delete pz[i];
148 delete pzr[i];
149 }
150 }
151 };
152
153 GVec<CASeqData> adaptors5;
154 GVec<CASeqData> adaptors3;
155
156 CGreedyAlignData* gxmem_l=NULL;
157 CGreedyAlignData* gxmem_r=NULL;
158
159 // element in dhash:
160 class FqDupRec {
161 public:
162 int count; //how many of these reads are the same
163 int len; //length of qv
164 char* firstname; //optional, only if we want to keep the original read names
165 char* qv;
166 FqDupRec(GStr* qstr=NULL, const char* rname=NULL) {
167 len=0;
168 qv=NULL;
169 firstname=NULL;
170 count=0;
171 if (qstr!=NULL) {
172 qv=Gstrdup(qstr->chars());
173 len=qstr->length();
174 count++;
175 }
176 if (rname!=NULL) firstname=Gstrdup(rname);
177 }
178 ~FqDupRec() {
179 GFREE(qv);
180 GFREE(firstname);
181 }
182 void add(GStr& d) { //collapse another record into this one
183 if (d.length()!=len)
184 GError("Error at FqDupRec::add(): cannot collapse reads with different length!\n");
185 count++;
186 for (int i=0;i<len;i++)
187 qv[i]=(qv[i]+d[i])/2;
188 }
189 };
190
191 void openfw(FILE* &f, GArgs& args, char opt) {
192 GStr s=args.getOpt(opt);
193 if (!s.is_empty()) {
194 if (s=='-') f=stdout;
195 else {
196 f=fopen(s.chars(),"w");
197 if (f==NULL) GError("Error creating file: %s\n", s.chars());
198 }
199 }
200 }
201
202 #define FWCLOSE(fh) if (fh!=NULL && fh!=stdout) fclose(fh)
203 #define FRCLOSE(fh) if (fh!=NULL && fh!=stdin) fclose(fh)
204
205 GHash<FqDupRec> dhash; //hash to keep track of duplicates
206
207 void addAdaptor(GVec<CASeqData>& adaptors, GStr& seq, GAlnTrimType trim_type);
208 int loadAdaptors(const char* fname);
209
210 void setupFiles(FILE*& f_in, FILE*& f_in2, FILE*& f_out, FILE*& f_out2,
211 GStr& s, GStr& infname, GStr& infname2);
212 // uses outsuffix to generate output file names and open file handles as needed
213
214 void writeRead(FILE* f_out, GStr& rname, GStr& rinfo, GStr& rseq, GStr& rqv, int& outcounter);
215 void trash_report(char trashcode, GStr& rname, FILE* freport);
216
217 bool getFastxRec(GLineReader& fq, GStr& rseq, GStr& rqv,
218 GStr& rname, GStr& rinfo, GStr& infname);
219
220 char process_read(GStr& rname, GStr& rseq, GStr& rqv, int &l5, int &l3);
221 //returns 0 if the read was untouched, 1 if it was trimmed and a trash code if it was trashed
222
223 bool ntrim(GStr& rseq, int &l5, int &l3); //returns true if any trimming occured
224 bool qtrim(GStr& qvs, int &l5, int &l3); //return true if any trimming occured
225 int dust(GStr& seq);
226 bool trim_poly5(GStr &seq, int &l5, int &l3, const char* poly_seed); //returns true if any trimming occured
227 bool trim_poly3(GStr &seq, int &l5, int &l3, const char* poly_seed);
228 bool trim_adaptor5(GStr& seq, int &l5, int &l3); //returns true if any trimming occured
229 bool trim_adaptor3(GStr& seq, int &l5, int &l3);
230
231 void convertPhred(char* q, int len);
232 void convertPhred(GStr& q);
233
234 int main(int argc, char * const argv[]) {
235 GArgs args(argc, argv, "YQDCRVAl:d:3:5:m:n:r:p:q:f:t:o:z:a:");
236 int e;
237 if ((e=args.isError())>0) {
238 GMessage("%s\nInvalid argument: %s\n", USAGE, argv[e]);
239 exit(224);
240 }
241 debug=(args.getOpt('Y')!=NULL);
242 verbose=(args.getOpt('V')!=NULL);
243 convert_phred=(args.getOpt('Q')!=NULL);
244 doCollapse=(args.getOpt('C')!=NULL);
245 doDust=(args.getOpt('D')!=NULL);
246 revCompl=(args.getOpt('R')!=NULL);
247 if (args.getOpt('A')) doPolyTrim=false;
248 /*
249 rawFormat=(args.getOpt('R')!=NULL);
250 if (rawFormat) {
251 GError("Sorry, raw qseq format parsing is not implemented yet!\n");
252 }
253 */
254 prefix=args.getOpt('n');
255 GStr s=args.getOpt('l');
256 if (!s.is_empty())
257 min_read_len=s.asInt();
258 s=args.getOpt('m');
259 if (!s.is_empty())
260 max_perc_N=s.asDouble();
261 s=args.getOpt('d');
262 if (!s.is_empty()) {
263 dust_cutoff=s.asInt();
264 doDust=true;
265 }
266 s=args.getOpt('q');
267 if (!s.is_empty()) {
268 qvtrim_qmin=s.asInt();
269 }
270 s=args.getOpt('t');
271 if (!s.is_empty()) {
272 qvtrim_max=s.asInt();
273 }
274 s=args.getOpt('p');
275 if (!s.is_empty()) {
276 int v=s.asInt();
277 if (v==33) {
278 qv_phredtype=33;
279 qv_cvtadd=31;
280 }
281 else if (v==64) {
282 qv_phredtype=64;
283 qv_cvtadd=-31;
284 }
285 else
286 GMessage("%s\nInvalid value for -p option (can only be 64 or 33)!\n",USAGE);
287 }
288 s=args.getOpt('f');
289 if (!s.is_empty()) {
290 loadAdaptors(s.chars());
291 }
292 bool fileAdaptors=adaptors5.Count()+adaptors3.Count();
293 s=args.getOpt('5');
294 if (!s.is_empty()) {
295 if (fileAdaptors)
296 GError("Error: options -5 and -f cannot be used together!\n");
297 s.upper();
298 addAdaptor(adaptors5, s, galn_TrimLeft);
299 }
300 s=args.getOpt('3');
301 if (!s.is_empty()) {
302 if (fileAdaptors)
303 GError("Error: options -3 and -f cannot be used together!\n");
304 s.upper();
305 addAdaptor(adaptors3, s, galn_TrimRight);
306 }
307 s=args.getOpt('y');
308 if (!s.is_empty()) {
309 int minmatch=s.asInt();
310 poly_minScore=minmatch*poly_m_score;
311 }
312
313 if (args.getOpt('o')!=NULL) outsuffix=args.getOpt('o');
314 else outsuffix="-";
315 trashReport= (args.getOpt('r')!=NULL);
316 int fcount=args.startNonOpt();
317 if (fcount==0) {
318 GMessage(USAGE);
319 exit(224);
320 }
321 if (fcount>1 && doCollapse) {
322 GError("%s Sorry, the -C option only works with a single input.\n", USAGE);
323 }
324 //openfw(f_out, args, 'o');
325 //if (f_out==NULL) f_out=stdout;
326 if (trashReport)
327 openfw(freport, args, 'r');
328 char* infile=NULL;
329
330 if (adaptors5.Count()>0)
331 //gxmem_l=new CGreedyAlignData(match_reward, mismatch_penalty, Xdrop-2);
332 gxmem_l=new CGreedyAlignData(match_reward, mismatch_penalty, Xdrop);
333 if (adaptors3.Count()>0)
334 gxmem_r=new CGreedyAlignData(match_reward, mismatch_penalty, Xdrop);
335
336 while ((infile=args.nextNonOpt())!=NULL) {
337 //for each input file
338 int incounter=0; //counter for input reads
339 int outcounter=0; //counter for output reads
340 int trash_s=0; //too short from the get go
341 int trash_Q=0;
342 int trash_N=0;
343 int trash_D=0;
344 int trash_poly=0;
345 int trash_A3=0;
346 int trash_A5=0;
347 s=infile;
348 GStr infname;
349 GStr infname2;
350 FILE* f_in=NULL;
351 FILE* f_in2=NULL;
352 FILE* f_out=NULL;
353 FILE* f_out2=NULL;
354 bool paired_reads=false;
355 setupFiles(f_in, f_in2, f_out, f_out2, s, infname, infname2);
356 GLineReader fq(f_in);
357 GLineReader* fq2=NULL;
358 if (f_in2!=NULL) {
359 fq2=new GLineReader(f_in2);
360 paired_reads=true;
361 }
362 GStr rseq, rqv, seqid, seqinfo;
363 GStr rseq2, rqv2, seqid2, seqinfo2;
364 while (getFastxRec(fq, rseq, rqv, seqid, seqinfo, infname)) {
365 incounter++;
366 int a5=0, a3=0, b5=0, b3=0;
367 char tcode=0, tcode2=0;
368 tcode=process_read(seqid, rseq, rqv, a5, a3);
369 if (fq2!=NULL) {
370 getFastxRec(*fq2, rseq2, rqv2, seqid2, seqinfo2, infname2);
371 if (seqid.substr(0,seqid.length()-1)!=seqid2.substr(0,seqid2.length()-1)) {
372 GError("Error: no paired match for read %s vs %s (%s,%s)\n",
373 seqid.chars(), seqid2.chars(), infname.chars(), infname2.chars());
374 }
375 tcode2=process_read(seqid2, rseq2, rqv2, b5, b3);
376 //decide what to do with this pair and print rseq2 if the pair makes it
377 if (tcode>1 && tcode2<=1) {
378 //"untrash" rseq
379 if (a3-a5+1<min_read_len) {
380 a5=1;
381 if (a3<min_read_len) { a3= GMIN(rseq.length()-1, min_read_len+1); }
382 }
383 tcode=1;
384 }
385 else if (tcode<=1 && tcode2>1) {
386 //"untrash" rseq2
387 if (b3-b5+1<min_read_len) {
388 b5=1;
389 if (b3<min_read_len) { b3= GMIN((rseq2.length()-1),(min_read_len+1)); }
390 }
391 tcode2=1;
392 }
393 if (tcode<=1) { //trimmed or left intact -- write it!
394 if (tcode>0) {
395 rseq2=rseq2.substr(b5,b3-b5+1);
396 if (!rqv2.is_empty()) rqv2=rqv2.substr(b5,b3-b5+1);
397 }
398 int nocounter=0;
399 writeRead(f_out2, seqid2, seqinfo2, rseq2, rqv2, nocounter);
400 }
401 } //pair read
402 if (tcode>1) { //trashed
403 #ifdef GDEBUG
404 GMessage(" !!!!TRASH code = %c\n",tcode);
405 #endif
406 if (tcode=='s') trash_s++;
407 else if (tcode=='A' || tcode=='T') trash_poly++;
408 else if (tcode=='Q') trash_Q++;
409 else if (tcode=='N') trash_N++;
410 else if (tcode=='D') trash_D++;
411 else if (tcode=='3') trash_A3++;
412 else if (tcode=='5') trash_A5++;
413 if (trashReport) trash_report(tcode, seqid, freport);
414 }
415 else if (!doCollapse) { //write it
416 if (tcode>0) {
417 rseq=rseq.substr(a5,a3-a5+1);
418 if (!rqv.is_empty()) rqv=rqv.substr(a5,a3-a5+1);
419 }
420 #ifdef GDEBUG
421 GMessage(" After trimming:\n");
422 GMessage("%s\n",rseq.chars());
423 #endif
424 writeRead(f_out, seqid, seqinfo, rseq, rqv, outcounter);
425 }
426 } //for each fastq record
427 delete fq2;
428 FRCLOSE(f_in);
429 FRCLOSE(f_in2);
430 if (doCollapse) {
431 outcounter=0;
432 int maxdup_count=1;
433 char* maxdup_seq=NULL;
434 dhash.startIterate();
435 FqDupRec* qd=NULL;
436 char* seq=NULL;
437 while ((qd=dhash.NextData(seq))!=NULL) {
438 GStr rseq(seq);
439 //do the dusting here
440 if (doDust) {
441 int dustbases=dust(rseq);
442 if (dustbases>(rseq.length()>>1)) {
443 if (trashReport && qd->firstname!=NULL) {
444 fprintf(freport, "%s_x%d\tD\n",qd->firstname, qd->count);
445 }
446 trash_D+=qd->count;
447 continue;
448 }
449 }
450 outcounter++;
451 if (qd->count>maxdup_count) {
452 maxdup_count=qd->count;
453 maxdup_seq=seq;
454 }
455 if (isfasta) {
456 if (prefix.is_empty()) {
457 fprintf(f_out, ">%s_x%d\n%s\n", qd->firstname, qd->count,
458 rseq.chars());
459 }
460 else { //use custom read name
461 fprintf(f_out, ">%s%08d_x%d\n%s\n", prefix.chars(), outcounter,
462 qd->count, rseq.chars());
463 }
464 }
465 else { //fastq format
466 if (convert_phred) convertPhred(qd->qv, qd->len);
467 if (prefix.is_empty()) {
468 fprintf(f_out, "@%s_x%d\n%s\n+\n%s\n", qd->firstname, qd->count,
469 rseq.chars(), qd->qv);
470 }
471 else { //use custom read name
472 fprintf(f_out, "@%s%08d_x%d\n%s\n+\n%s\n", prefix.chars(), outcounter,
473 qd->count, rseq.chars(), qd->qv);
474 }
475 }
476 }//for each element of dhash
477 if (maxdup_count>1) {
478 GMessage("Maximum read multiplicity: x %d (read: %s)\n",maxdup_count, maxdup_seq);
479 }
480 } //collapse entries
481 if (verbose) {
482 if (paired_reads) {
483 GMessage(">Input files : %s , %s\n", infname.chars(), infname2.chars());
484 GMessage("Number of input pairs :%9d\n", incounter);
485 GMessage(" Output pairs :%9d\n", outcounter);
486 }
487 else {
488 GMessage(">Input file : %s\n", infname.chars());
489 GMessage("Number of input reads :%9d\n", incounter);
490 GMessage(" Output reads :%9d\n", outcounter);
491 }
492 GMessage("------------------------------------\n");
493 if (num_trimmed5)
494 GMessage(" 5' trimmed :%9d (min. trim: %d)\n", num_trimmed5, min_trimmed5);
495 if (num_trimmed3)
496 GMessage(" 3' trimmed :%9d (min. trim: %d)\n", num_trimmed3, min_trimmed3);
497 GMessage("------------------------------------\n");
498 if (trash_s>0)
499 GMessage(" Trashed by length:%9d\n", trash_s);
500 if (trash_N>0)
501 GMessage(" Trashed by N%%:%9d\n", trash_N);
502 if (trash_Q>0)
503 GMessage("Trashed by low quality:%9d\n", trash_Q);
504 if (trash_poly>0)
505 GMessage(" Trashed by poly-A/T:%9d\n", trash_poly);
506 if (trash_A5>0)
507 GMessage(" Trashed by 5' adaptor:%9d\n", trash_A5);
508 if (trash_A3>0)
509 GMessage(" Trashed by 3' adaptor:%9d\n", trash_A3);
510 }
511 if (trashReport) {
512 FWCLOSE(freport);
513 }
514 FWCLOSE(f_out);
515 FWCLOSE(f_out2);
516 } //while each input file
517 delete gxmem_l;
518 delete gxmem_r;
519 //getc(stdin);
520 }
521
522 class NData {
523 public:
524 int NPos[1024]; //there should be no reads longer than 1K ?
525 int NCount;
526 int end5;
527 int end3;
528 int seqlen;
529 double perc_N; //percentage of Ns in end5..end3 range only!
530 const char* seq;
531 bool valid;
532 NData() {
533 seqlen=0;
534 NCount=0;
535 end5=0;
536 end3=0;
537 seq=NULL;
538 perc_N=0;
539 valid=true;
540 }
541 void init(GStr& rseq) {
542 NCount=0;
543 perc_N=0;
544 seqlen=rseq.length();
545 seq=rseq.chars();
546 end5=1;
547 end3=seqlen;
548 for (int i=0;i<seqlen;i++)
549 if (seq[i]=='N') {// if (!ichrInStr(rseq[i], "ACGT")
550 NPos[NCount]=i;
551 NCount++;
552 }
553 perc_N=(NCount*100.0)/seqlen;
554 valid=true;
555 }
556 void N_calc() { //only in the region after trimming
557 int n=0;
558 for (int i=end3-1;i<end5;i++) {
559 if (seq[i]=='N') n++;
560 }
561 perc_N=(n*100.0)/(end5-end3+1);
562 }
563 };
564
565 static NData feat;
566 int perc_lenN=12; // incremental distance from ends, in percentage of
567 // sequence length, where N-trimming is done (default:12 %) (autolimited to 20)
568
569 void N_analyze(int l5, int l3, int p5, int p3) {
570 /* assumes feat was filled properly */
571 int old_dif, t5,t3,v;
572 if (l3<l5+2 || p5>p3 ) {
573 feat.end5=l5+1;
574 feat.end3=l3+1;
575 return;
576 }
577
578 t5=feat.NPos[p5]-l5;
579 t3=l3-feat.NPos[p3];
580 old_dif=p3-p5;
581 v=(int)((((double)(l3-l5))*perc_lenN)/100);
582 if (v>20) v=20; /* enforce N-search limit for very long reads */
583 if (t5 < v ) {
584 l5=feat.NPos[p5]+1;
585 p5++;
586 }
587 if (t3 < v) {
588 l3=feat.NPos[p3]-1;
589 p3--;
590 }
591 /* restNs=p3-p5; number of Ns in the new CLR */
592 if (p3-p5==old_dif) { /* no change, return */
593 /* don't trim if this may shorten the read too much */
594 //if (l5-l3<min_read_len) return;
595 feat.end5=l5+1;
596 feat.end3=l3+1;
597 return;
598 }
599 else
600 N_analyze(l5,l3, p5,p3);
601 }
602
603
604 bool qtrim(GStr& qvs, int &l5, int &l3) {
605 if (qvtrim_qmin==0 || qvs.is_empty()) return false;
606 if (qv_phredtype==0) {
607 //try to guess the Phred type
608 int vmin=256, vmax=0;
609 for (int i=0;i<qvs.length();i++) {
610 if (vmin>qvs[i]) vmin=qvs[i];
611 if (vmax<qvs[i]) vmax=qvs[i];
612 }
613 if (vmin<64) { qv_phredtype=33; qv_cvtadd=31; }
614 if (vmax>95) { qv_phredtype=64; qv_cvtadd=-31; }
615 if (qv_phredtype==0) {
616 GError("Error: couldn't determine Phred type, please use the -p33 or -p64 !\n");
617 }
618 if (verbose)
619 GMessage("Input reads have Phred-%d quality values.\n", (qv_phredtype==33 ? 33 : 64));
620 } //guessing Phred type
621 for (l3=qvs.length()-1;l3>2;l3--) {
622 if (qvs[l3]-qv_phredtype>=qvtrim_qmin && qvs[l3-1]-qv_phredtype>=qvtrim_qmin) break;
623 }
624 //just in case, check also the 5' the end (?)
625 for (l5=0;l5<qvs.length()-3;l5++) {
626 if (qvs[l5]-qv_phredtype>=qvtrim_qmin && qvs[l5+1]-qv_phredtype>=qvtrim_qmin) break;
627 }
628 if (qvtrim_max>0) {
629 if (qvs.length()-1-l3>qvtrim_max) l3=qvs.length()-1-qvtrim_max;
630 if (l5>qvtrim_max) l5=qvtrim_max;
631 }
632 return (l5>0 || l3<qvs.length()-1);
633 }
634
635 bool ntrim(GStr& rseq, int &l5, int &l3) {
636 //count Ns in the sequence, trim N-rich ends
637 feat.init(rseq);
638 l5=feat.end5-1;
639 l3=feat.end3-1;
640 N_analyze(feat.end5-1, feat.end3-1, 0, feat.NCount-1);
641 if (l5==feat.end5-1 && l3==feat.end3-1) {
642 if (feat.perc_N>max_perc_N) {
643 feat.valid=false;
644 l3=l5+1;
645 return true;
646 }
647 else {
648 return false; //nothing changed
649 }
650 }
651 l5=feat.end5-1;
652 l3=feat.end3-1;
653 if (l3-l5+1<min_read_len) {
654 feat.valid=false;
655 return true;
656 }
657 feat.N_calc();
658
659 if (feat.perc_N>max_perc_N) {
660 feat.valid=false;
661 l3=l5+1;
662 return true;
663 }
664 return true;
665 }
666
667 //--------------- dust functions ----------------
668 class DNADuster {
669 public:
670 int dustword;
671 int dustwindow;
672 int dustwindow2;
673 int dustcutoff;
674 int mv, iv, jv;
675 int counts[32*32*32];
676 int iis[32*32*32];
677 DNADuster(int cutoff=16, int winsize=32, int wordsize=3) {
678 dustword=wordsize;
679 dustwindow=winsize;
680 dustwindow2 = (winsize>>1);
681 dustcutoff=cutoff;
682 mv=0;
683 iv=0;
684 jv=0;
685 }
686 void setWindowSize(int value) {
687 dustwindow = value;
688 dustwindow2 = (dustwindow >> 1);
689 }
690 void setWordSize(int value) {
691 dustword=value;
692 }
693 void wo1(int len, const char* s, int ivv) {
694 int i, ii, j, v, t, n, n1, sum;
695 int js, nis;
696 n = 32 * 32 * 32;
697 n1 = n - 1;
698 nis = 0;
699 i = 0;
700 ii = 0;
701 sum = 0;
702 v = 0;
703 for (j=0; j < len; j++, s++) {
704 ii <<= 5;
705 if (*s<=32) {
706 i=0;
707 continue;
708 }
709 ii |= *s - 'A'; //assume uppercase!
710 ii &= n1;
711 i++;
712 if (i >= dustword) {
713 for (js=0; js < nis && iis[js] != ii; js++) ;
714 if (js == nis) {
715 iis[nis] = ii;
716 counts[ii] = 0;
717 nis++;
718 }
719 if ((t = counts[ii]) > 0) {
720 sum += t;
721 v = 10 * sum / j;
722 if (mv < v) {
723 mv = v;
724 iv = ivv;
725 jv = j;
726 }
727 }
728 counts[ii]++;
729 }
730 }
731 }
732
733 int wo(int len, const char* s, int* beg, int* end) {
734 int i, l1;
735 l1 = len - dustword + 1;
736 if (l1 < 0) {
737 *beg = 0;
738 *end = len - 1;
739 return 0;
740 }
741 mv = 0;
742 iv = 0;
743 jv = 0;
744 for (i=0; i < l1; i++) {
745 wo1(len-i, s+i, i);
746 }
747 *beg = iv;
748 *end = iv + jv;
749 return mv;
750 }
751
752 void dust(const char* seq, char* seqmsk, int seqlen, int cutoff=0) { //, maskFunc maskfn) {
753 int i, j, l, a, b, v;
754 if (cutoff==0) cutoff=dustcutoff;
755 a=0;b=0;
756 //GMessage("Dust cutoff=%d\n", cutoff);
757 for (i=0; i < seqlen; i += dustwindow2) {
758 l = (seqlen > i+dustwindow) ? dustwindow : seqlen-i;
759 v = wo(l, seq+i, &a, &b);
760 if (v > cutoff) {
761 //for (j = a; j <= b && j < dustwindow2; j++) {
762 for (j = a; j <= b; j++) {
763 seqmsk[i+j]='N';//could be made lowercase instead
764 }
765 }
766 }
767 //return first;
768 }
769 };
770
771 static DNADuster duster;
772
773 int dust(GStr& rseq) {
774 char* seq=Gstrdup(rseq.chars());
775 duster.dust(rseq.chars(), seq, rseq.length(), dust_cutoff);
776 //check the number of Ns:
777 int ncount=0;
778 for (int i=0;i<rseq.length();i++) {
779 if (seq[i]=='N') ncount++;
780 }
781 GFREE(seq);
782 return ncount;
783 }
784
785 struct SLocScore {
786 int pos;
787 int score;
788 SLocScore(int p=0,int s=0) {
789 pos=p;
790 score=s;
791 }
792 void set(int p, int s) {
793 pos=p;
794 score=s;
795 }
796 void add(int p, int add) {
797 pos=p;
798 score+=add;
799 }
800 };
801
802 bool trim_poly3(GStr &seq, int &l5, int &l3, const char* poly_seed) {
803 if (!doPolyTrim) return false;
804 int rlen=seq.length();
805 l5=0;
806 l3=rlen-1;
807 int32 seedVal=*(int32*)poly_seed;
808 char polyChar=poly_seed[0];
809 //assumes N trimming was already done
810 //so a poly match should be very close to the end of the read
811 // -- find the initial match (seed)
812 int lmin=GMAX((rlen-16), 0);
813 int li;
814 for (li=rlen-4;li>lmin;li--) {
815 if (seedVal==*(int*)&(seq[li])) {
816 break;
817 }
818 }
819 if (li<=lmin) return false;
820 //seed found, try to extend it both ways
821 //extend right
822 int ri=li+3;
823 SLocScore loc(ri, poly_m_score<<2);
824 SLocScore maxloc(loc);
825 //extend right
826 while (ri<rlen-1) {
827 ri++;
828 if (seq[ri]==polyChar) {
829 loc.add(ri,poly_m_score);
830 }
831 else if (seq[ri]=='N') {
832 loc.add(ri,0);
833 }
834 else { //mismatch
835 loc.add(ri,poly_mis_score);
836 if (maxloc.score-loc.score>poly_dropoff_score) break;
837 }
838 if (maxloc.score<=loc.score) {
839 maxloc=loc;
840 }
841 }
842 ri=maxloc.pos;
843 if (ri<rlen-6) return false; //no trimming wanted, too far from 3' end
844 //ri = right boundary for the poly match
845 //extend left
846 loc.set(li, maxloc.score);
847 maxloc.pos=li;
848 while (li>0) {
849 li--;
850 if (seq[li]==polyChar) {
851 loc.add(li,poly_m_score);
852 }
853 else if (seq[li]=='N') {
854 loc.add(li,0);
855 }
856 else { //mismatch
857 loc.add(li,poly_mis_score);
858 if (maxloc.score-loc.score>poly_dropoff_score) break;
859 }
860 if (maxloc.score<=loc.score) {
861 maxloc=loc;
862 }
863 }
864 li=maxloc.pos;
865 if ((maxloc.score==poly_minScore && ri==rlen-1) ||
866 (maxloc.score>poly_minScore && ri>=rlen-3) ||
867 (maxloc.score>(poly_minScore*3) && ri>=rlen-8)) {
868 //trimming this li-ri match at 3' end
869 l3=li-1;
870 if (l3<0) l3=0;
871 return true;
872 }
873 return false;
874 }
875
876 bool trim_poly5(GStr &seq, int &l5, int &l3, const char* poly_seed) {
877 if (!doPolyTrim) return false;
878 int rlen=seq.length();
879 l5=0;
880 l3=rlen-1;
881 int32 seedVal=*(int32*)poly_seed;
882 char polyChar=poly_seed[0];
883 //assumes N trimming was already done
884 //so a poly match should be very close to the end of the read
885 // -- find the initial match (seed)
886 int lmax=GMIN(12, rlen-4);//how far from 5' end to look for 4-mer seeds
887 int li;
888 for (li=0;li<=lmax;li++) {
889 if (seedVal==*(int*)&(seq[li])) {
890 break;
891 }
892 }
893 if (li>lmax) return false;
894 //seed found, try to extend it both ways
895 //extend left
896 int ri=li+3; //save rightmost base of the seed
897 SLocScore loc(li, poly_m_score<<2);
898 SLocScore maxloc(loc);
899 while (li>0) {
900 li--;
901 if (seq[li]==polyChar) {
902 loc.add(li,poly_m_score);
903 }
904 else if (seq[li]=='N') {
905 loc.add(li,0);
906 }
907 else { //mismatch
908 loc.add(li,poly_mis_score);
909 if (maxloc.score-loc.score>poly_dropoff_score) break;
910 }
911 if (maxloc.score<=loc.score) {
912 maxloc=loc;
913 }
914 }
915 li=maxloc.pos;
916 if (li>5) return false; //no trimming wanted, too far from 5' end
917 //li = right boundary for the poly match
918
919 //extend right
920 loc.set(ri, maxloc.score);
921 maxloc.pos=ri;
922 while (ri<rlen-1) {
923 ri++;
924 if (seq[ri]==polyChar) {
925 loc.add(ri,poly_m_score);
926 }
927 else if (seq[ri]=='N') {
928 loc.add(ri,0);
929 }
930 else { //mismatch
931 loc.add(ri,poly_mis_score);
932 if (maxloc.score-loc.score>poly_dropoff_score) break;
933 }
934 if (maxloc.score<=loc.score) {
935 maxloc=loc;
936 }
937 }
938 ri=maxloc.pos;
939 if ((maxloc.score==poly_minScore && li==0) ||
940 (maxloc.score>poly_minScore && li<2)
941 || (maxloc.score>(poly_minScore*3) && li<8)) {
942 //adjust l5 to reflect this trimming of 5' end
943 l5=ri+1;
944 if (l5>rlen-1) l5=rlen-1;
945 return true;
946 }
947 return false;
948 }
949
950 bool trim_adaptor3(GStr& seq, int&l5, int &l3) {
951 if (adaptors3.Count()==0) return false;
952 int rlen=seq.length();
953 l5=0;
954 l3=rlen-1;
955 bool trimmed=false;
956 GStr wseq(seq);
957 int wlen=rlen;
958 GXSeqData seqdata;
959 int numruns=revCompl ? 2 : 1;
960 GList<GXAlnInfo> bestalns(true, true, false);
961 for (int ai=0;ai<adaptors3.Count();ai++) {
962 for (int r=0;r<numruns;r++) {
963 if (r) {
964 seqdata.update(adaptors3[ai].seqr.chars(), adaptors3[ai].seqr.length(),
965 adaptors3[ai].pzr, wseq.chars(), wlen, adaptors3[ai].amlen);
966 }
967 else {
968 seqdata.update(adaptors3[ai].seq.chars(), adaptors3[ai].seq.length(),
969 adaptors3[ai].pz, wseq.chars(), wlen, adaptors3[ai].amlen);
970 }
971 GXAlnInfo* aln=match_adaptor(seqdata, adaptors3[ai].trim_type, gxmem_r, 86);
972 if (aln) {
973 if (aln->strong) {
974 trimmed=true;
975 bestalns.Add(aln);
976 break; //will check the rest next time
977 }
978 else bestalns.Add(aln);
979 }
980 }//forward and reverse adaptors
981 if (trimmed) break; //will check the rest in the next cycle
982 }//for each 3' adaptor
983 if (bestalns.Count()>0) {
984 GXAlnInfo* aln=bestalns[0];
985 if (aln->sl-1 > wlen-aln->sr) {
986 //keep left side
987 l3-=(wlen-aln->sl+1);
988 if (l3<0) l3=0;
989 }
990 else { //keep right side
991 l5+=aln->sr;
992 if (l5>=rlen) l5=rlen-1;
993 }
994 //delete aln;
995 //if (l3-l5+1<min_read_len) return true;
996 wseq=seq.substr(l5,l3-l5+1);
997 wlen=wseq.length();
998 return true; //break the loops here to report a good find
999 }
1000 return false;
1001 }
1002
1003 bool trim_adaptor5(GStr& seq, int&l5, int &l3) {
1004 if (adaptors5.Count()==0) return false;
1005 int rlen=seq.length();
1006 l5=0;
1007 l3=rlen-1;
1008 bool trimmed=false;
1009 GStr wseq(seq);
1010 int wlen=rlen;
1011 GXSeqData seqdata;
1012 int numruns=revCompl ? 2 : 1;
1013 GList<GXAlnInfo> bestalns(true, true, false);
1014 for (int ai=0;ai<adaptors5.Count();ai++) {
1015 for (int r=0;r<numruns;r++) {
1016 if (r) {
1017 seqdata.update(adaptors5[ai].seqr.chars(), adaptors5[ai].seqr.length(),
1018 adaptors5[ai].pzr, wseq.chars(), wlen, adaptors5[ai].amlen);
1019 }
1020 else {
1021 seqdata.update(adaptors5[ai].seq.chars(), adaptors5[ai].seq.length(),
1022 adaptors5[ai].pz, wseq.chars(), wlen, adaptors5[ai].amlen);
1023 }
1024 GXAlnInfo* aln=match_adaptor(seqdata, adaptors5[ai].trim_type, gxmem_l, 90);
1025 if (aln) {
1026 if (aln->strong) {
1027 trimmed=true;
1028 bestalns.Add(aln);
1029 break; //will check the rest next time
1030 }
1031 else bestalns.Add(aln);
1032 }
1033 } //forward and reverse?
1034 if (trimmed) break; //will check the rest in the next cycle
1035 }//for each 5' adaptor
1036 if (bestalns.Count()>0) {
1037 GXAlnInfo* aln=bestalns[0];
1038 if (aln->sl-1 > wlen-aln->sr) {
1039 //keep left side
1040 l3-=(wlen-aln->sl+1);
1041 if (l3<0) l3=0;
1042 }
1043 else { //keep right side
1044 l5+=aln->sr;
1045 if (l5>=rlen) l5=rlen-1;
1046 }
1047 //delete aln;
1048 //if (l3-l5+1<min_read_len) return true;
1049 wseq=seq.substr(l5,l3-l5+1);
1050 wlen=wseq.length();
1051 return true; //break the loops here to report a good find
1052 }
1053 return false;
1054 }
1055
1056 //convert qvs to/from phred64 from/to phread33
1057 void convertPhred(GStr& q) {
1058 for (int i=0;i<q.length();i++) q[i]+=qv_cvtadd;
1059 }
1060
1061 void convertPhred(char* q, int len) {
1062 for (int i=0;i<len;i++) q[i]+=qv_cvtadd;
1063 }
1064
1065 bool getFastxRec(GLineReader& fq, GStr& rseq, GStr& rqv,
1066 GStr& rname, GStr& rinfo, GStr& infname) {
1067 rseq="";
1068 rqv="";
1069 rname="";
1070 rinfo="";
1071 if (fq.eof()) return false;
1072 char* l=fq.getLine();
1073 while (l!=NULL && (l[0]==0 || isspace(l[0]))) l=fq.getLine(); //ignore empty lines
1074 if (l==NULL) return false;
1075 /* if (rawFormat) {
1076 //TODO: implement raw qseq parsing here
1077 //if (raw type=N) then continue; //skip invalid/bad records
1078 } //raw qseq format
1079 else { // FASTQ or FASTA */
1080 isfasta=(l[0]=='>');
1081 if (!isfasta && l[0]!='@') GError("Error: fasta/fastq record marker not found(%s)\n%s\n",
1082 infname.chars(), l);
1083 GStr s(l);
1084 rname=&(l[1]);
1085 for (int i=0;i<rname.length();i++)
1086 if (rname[i]<=' ') {
1087 if (i<rname.length()-2) rinfo=rname.substr(i+1);
1088 rname.cut(i);
1089 break;
1090 }
1091 //now get the sequence
1092 if ((l=fq.getLine())==NULL)
1093 GError("Error: unexpected EOF after header for read %s (%s)\n",
1094 rname.chars(), infname.chars());
1095 rseq=l; //this must be the DNA line
1096 while ((l=fq.getLine())!=NULL) {
1097 //seq can span multiple lines
1098 if (l[0]=='>' || l[0]=='+') {
1099 fq.pushBack();
1100 break; //
1101 }
1102 rseq+=l;
1103 } //check for multi-line seq
1104 if (!isfasta) { //reading fastq quality values, which can also be multi-line
1105 if ((l=fq.getLine())==NULL)
1106 GError("Error: unexpected EOF after sequence for %s\n", rname.chars());
1107 if (l[0]!='+') GError("Error: fastq qv header marker not detected!\n");
1108 if ((l=fq.getLine())==NULL)
1109 GError("Error: unexpected EOF after qv header for %s\n", rname.chars());
1110 rqv=l;
1111 //if (rqv.length()!=rseq.length())
1112 // GError("Error: qv len != seq len for %s\n", rname.chars());
1113 while (rqv.length()<rseq.length() && ((l=fq.getLine())!=NULL)) {
1114 rqv+=l; //append to qv string
1115 }
1116 }// fastq
1117 // } //<-- FASTA or FASTQ
1118 rseq.upper();
1119 return true;
1120 }
1121
1122 #ifdef GDEBUG
1123 void showTrim(GStr& s, int l5, int l3) {
1124 if (l5>0 || l3==0) {
1125 color_bg(c_red);
1126 }
1127 for (int i=0;i<s.length()-1;i++) {
1128 if (i && i==l5) color_resetbg();
1129 fprintf(stderr, "%c", s[i]);
1130 if (i && i==l3) color_bg(c_red);
1131 }
1132 fprintf(stderr, "%c", s[s.length()-1]);
1133 color_reset();
1134 fprintf(stderr, "\n");
1135 }
1136 #endif
1137
1138 char process_read(GStr& rname, GStr& rseq, GStr& rqv, int &l5, int &l3) {
1139 //returns 0 if the read was untouched, 1 if it was just trimmed
1140 // and a trash code if it was trashed
1141 l5=0;
1142 l3=rseq.length()-1;
1143 #ifdef GDEBUG
1144 //rseq.reverse();
1145 GMessage(">%s\n", rname.chars());
1146 GMessage("%s\n",rseq.chars());
1147 #endif
1148 if (l3-l5+1<min_read_len) {
1149 return 's';
1150 }
1151 GStr wseq(rseq.chars());
1152 GStr wqv(rqv.chars());
1153 int w5=l5;
1154 int w3=l3;
1155 //first do the q-based trimming
1156 if (qvtrim_qmin!=0 && !wqv.is_empty() && qtrim(wqv, w5, w3)) { // qv-threshold trimming
1157 if (w3-w5+1<min_read_len) {
1158 return 'Q'; //invalid read
1159 }
1160 //-- keep only the w5..w3 range
1161 l5=w5;
1162 l3=w3;
1163 wseq=wseq.substr(w5, w3-w5+1);
1164 if (!wqv.is_empty())
1165 wqv=wqv.substr(w5, w3-w5+1);
1166 } //qv trimming
1167 // N-trimming on the remaining read seq
1168 if (ntrim(wseq, w5, w3)) {
1169 //GMessage("before: %s\n",wseq.chars());
1170 //GMessage("after : %s (%d)\n",wseq.substr(w5,w3-w5+1).chars(),w3-w5+1);
1171 l5+=w5;
1172 l3-=(wseq.length()-1-w3);
1173 if (w3-w5+1<min_read_len) {
1174 return 'N'; //to be trashed
1175 }
1176 //-- keep only the w5..w3 range
1177 wseq=wseq.substr(w5, w3-w5+1);
1178 if (!wqv.is_empty())
1179 wqv=wqv.substr(w5, w3-w5+1);
1180 w5=0;
1181 w3=wseq.length()-1;
1182 }
1183 char trim_code;
1184 //clean the more dirty end first - 3'
1185 int prev_w5=0;
1186 int prev_w3=0;
1187 bool w3upd=true;
1188 bool w5upd=true;
1189 do {
1190 trim_code=0;
1191 if (w3upd && trim_poly3(wseq, w5, w3, polyA_seed)) {
1192 trim_code='A';
1193 }
1194 else if (w3upd && trim_poly3(wseq, w5, w3, polyT_seed)) {
1195 trim_code='T';
1196 }
1197 else if (w5upd && trim_poly5(wseq, w5, w3, polyA_seed)) {
1198 trim_code='A';
1199 }
1200 else if (w5upd && trim_poly5(wseq, w5, w3, polyT_seed)) {
1201 trim_code='T';
1202 }
1203 else if (trim_adaptor5(wseq, w5, w3)) {
1204 trim_code='5';
1205 }
1206 else if (trim_adaptor3(wseq, w5, w3)) {
1207 trim_code='3';
1208 }
1209 if (trim_code) {
1210 w3upd=(w3!=prev_w3);
1211 w5upd=(w5!=prev_w5);
1212 if (w3upd) prev_w3=w3;
1213 if (w5upd) prev_w5=w5;
1214 #ifdef GDEBUG
1215 GMessage("#### TRIM by '%c' code ( w5-w3 = %d-%d ):\n",trim_code, w5,w3);
1216 showTrim(wseq, w5, w3);
1217 #endif
1218 int trimlen=wseq.length()-(w3-w5+1);
1219 num_trimmed3++;
1220 if (trimlen<min_trimmed3)
1221 min_trimmed3=trimlen;
1222 l5+=w5;
1223 l3-=(wseq.length()-1-w3);
1224 if (w3-w5+1<min_read_len) {
1225 return trim_code;
1226 }
1227 //-- keep only the w5..w3 range
1228 wseq=wseq.substr(w5, w3-w5+1);
1229 if (!wqv.is_empty())
1230 wqv=wqv.substr(w5, w3-w5+1);
1231 }//trimming at 3' end
1232 } while (trim_code);
1233
1234 if (doCollapse) {
1235 //keep read for later
1236 FqDupRec* dr=dhash.Find(wseq.chars());
1237 if (dr==NULL) { //new entry
1238 //if (prefix.is_empty())
1239 dhash.Add(wseq.chars(),
1240 new FqDupRec(&wqv, rname.chars()));
1241 //else dhash.Add(wseq.chars(), new FqDupRec(wqv.chars(),wqv.length()));
1242 }
1243 else
1244 dr->add(wqv);
1245 } //collapsing duplicates
1246 else { //not collapsing duplicates
1247 //apply the dust filter now
1248 if (doDust) {
1249 int dustbases=dust(wseq);
1250 if (dustbases>(wseq.length()>>1)) {
1251 return 'D';
1252 }
1253 }
1254 } //not collapsing duplicates
1255 return (l5>0 || l3<rseq.length()-1) ? 1 : 0;
1256 }
1257
1258 void printHeader(FILE* f_out, char recmarker, GStr& rname, GStr& rinfo) {
1259 //GMessage("printing Header..%c%s\n",recmarker, rname.chars());
1260 if (rinfo.is_empty()) fprintf(f_out, "%c%s\n",recmarker,rname.chars());
1261 else fprintf(f_out, "%c%s %s\n",recmarker, rname.chars(), rinfo.chars());
1262 }
1263
1264 void writeRead(FILE* f_out, GStr& rname, GStr& rinfo, GStr& rseq, GStr& rqv, int& outcounter) {
1265 outcounter++;
1266 bool asFasta=(rqv.is_empty() || fastaOutput);
1267 if (asFasta) {
1268 if (prefix.is_empty()) {
1269 printHeader(f_out, '>',rname,rinfo);
1270 fprintf(f_out, "%s\n", rseq.chars()); //plain one-line fasta for now
1271 }
1272 else {
1273 fprintf(f_out, ">%s%08d\n%s\n", prefix.chars(), outcounter,
1274 rseq.chars());
1275 }
1276 }
1277 else { //fastq
1278 if (convert_phred) convertPhred(rqv);
1279 if (prefix.is_empty()) {
1280 printHeader(f_out, '@', rname, rinfo);
1281 fprintf(f_out, "%s\n+\n%s\n", rseq.chars(), rqv.chars());
1282 }
1283 else
1284 fprintf(f_out, "@%s_%08d\n%s\n+\n%s\n", prefix.chars(), outcounter,
1285 rseq.chars(),rqv.chars() );
1286 }
1287 }
1288
1289 void trash_report(char trashcode, GStr& rname, FILE* freport) {
1290 if (freport==NULL || trashcode<=' ') return;
1291 if (trashcode=='3' || trashcode=='5') {
1292 fprintf(freport, "%s\ta%c\n",rname.chars(),trashcode);
1293 }
1294 else {
1295 fprintf(freport, "%s\t%c\n",rname.chars(),trashcode);
1296 }
1297 //tcounter++;
1298 }
1299
1300 GStr getFext(GStr& s, int* xpos=NULL) {
1301 //s must be a filename without a path
1302 GStr r("");
1303 if (xpos!=NULL) *xpos=0;
1304 if (s.is_empty() || s=="-") return r;
1305 int p=s.rindex('.');
1306 int d=s.rindex('/');
1307 if (p<=0 || p>s.length()-2 || p<s.length()-7 || p<d) return r;
1308 r=s.substr(p+1);
1309 if (xpos!=NULL) *xpos=p+1;
1310 r.lower();
1311 return r;
1312 }
1313
1314 void baseFileName(GStr& fname) {
1315 //remove all known extensions, like .txt,fq,fastq,fasta,fa)(.gz .gzip .bz2 .bzip2) .
1316 int xpos=0;
1317 GStr fext=getFext(fname, &xpos);
1318 if (xpos<=1) return;
1319 bool extdel=false;
1320 GStr f2;
1321 if (fext=="z" || fext=="zip") {
1322 extdel=true;
1323 }
1324 else if (fext.length()>=2) {
1325 f2=fext.substr(0,2);
1326 extdel=(f2=="gz" || f2=="bz");
1327 }
1328 if (extdel) {
1329 fname.cut(xpos-1);
1330 fext=getFext(fname, &xpos);
1331 if (xpos<=1) return;
1332 }
1333 extdel=false;
1334 if (fext=="f" || fext=="fq" || fext=="txt" || fext=="seq" || fext=="sequence") {
1335 extdel=true;
1336 }
1337 else if (fext.length()>=2) {
1338 extdel=(fext.substr(0,2)=="fa");
1339 }
1340 if (extdel) fname.cut(xpos-1);
1341 GStr fncp(fname);
1342 fncp.lower();
1343 fncp.chomp("seq");
1344 fncp.chomp("sequence");
1345 fncp.trimR("_.");
1346 if (fncp.length()<fname.length()) fname.cut(fncp.length());
1347 }
1348
1349 FILE* prepOutFile(GStr& infname, GStr& pocmd) {
1350 FILE* f_out=NULL;
1351 GStr fname(getFileName(infname.chars()));
1352 //eliminate known extensions
1353 baseFileName(fname);
1354 if (outsuffix.is_empty() || outsuffix=="-") { return stdout; }
1355 else if (pocmd.is_empty()) {
1356 GStr oname(fname);
1357 oname.append('.');
1358 oname.append(outsuffix);
1359 f_out=fopen(oname.chars(),"w");
1360 if (f_out==NULL) GError("Error: cannot create '%s'\n",oname.chars());
1361 }
1362 else {
1363 GStr oname(">");
1364 oname.append(fname);
1365 oname.append('.');
1366 oname.append(outsuffix);
1367 pocmd.append(oname);
1368 f_out=popen(pocmd.chars(), "w");
1369 if (f_out==NULL) GError("Error: cannot popen '%s'\n",pocmd.chars());
1370 }
1371 return f_out;
1372 }
1373
1374 void guess_unzip(GStr& fname, GStr& picmd) {
1375 GStr fext=getFext(fname);
1376 if (fext=="gz" || fext=="gzip" || fext=="z") {
1377 picmd="gzip -cd ";
1378 }
1379 else if (fext=="bz2" || fext=="bzip2" || fext=="bz" || fext=="bzip") {
1380 picmd="bzip2 -cd ";
1381 }
1382 }
1383
1384 void addAdaptor(GVec<CASeqData>& adaptors, GStr& seq, GAlnTrimType trim_type) {
1385 //TODO: prepare CASeqData here, and collect hexamers as well
1386 if (seq.is_empty() || seq=="-" ||
1387 seq=="N/A" || seq==".") return;
1388
1389 CASeqData adata(revCompl);
1390 int idx=adaptors.Add(adata);
1391 if (idx<0) GError("Error: failed to add adaptor!\n");
1392 adaptors[idx].trim_type=trim_type;
1393 adaptors[idx].update(seq.chars());
1394 }
1395
1396
1397 int loadAdaptors(const char* fname) {
1398 GLineReader lr(fname);
1399 char* l;
1400 while ((l=lr.nextLine())!=NULL) {
1401 if (lr.length()<=3 || l[0]=='#') continue;
1402 if ( l[0]==' ' || l[0]=='\t' || l[0]==',' ||
1403 l[0]==';'|| l[0]==':' ) {
1404 int i=1;
1405 while (l[i]!=0 && isspace(l[i])) {
1406 i++;
1407 }
1408 if (l[i]!=0) {
1409 GStr s(&(l[i]));
1410 #ifdef GDEBUG
1411 //s.reverse();
1412 #endif
1413 addAdaptor(adaptors3, s, galn_TrimRight);
1414 continue;
1415 }
1416 }
1417 else {
1418 GStr s(l);
1419 s.startTokenize("\t ;,:");
1420 GStr a5,a3;
1421 if (s.nextToken(a5))
1422 s.nextToken(a3);
1423 else continue; //no tokens on this line
1424 GAlnTrimType ttype5=galn_TrimLeft;
1425 a5.upper();
1426 a3.upper();
1427 if (a3.is_empty() || a3==a5 || a3=="=") {
1428 a3.clear();
1429 ttype5=galn_TrimEither;
1430 }
1431 #ifdef GDEBUG
1432 // a5.reverse();
1433 // a3.reverse();
1434 #endif
1435 addAdaptor(adaptors5, a5, ttype5);
1436 addAdaptor(adaptors3, a3, galn_TrimRight);
1437 }
1438 }
1439 return adaptors5.Count()+adaptors3.Count();
1440 }
1441
1442 void setupFiles(FILE*& f_in, FILE*& f_in2, FILE*& f_out, FILE*& f_out2,
1443 GStr& s, GStr& infname, GStr& infname2) {
1444 // uses outsuffix to generate output file names and open file handles as needed
1445 infname="";
1446 infname2="";
1447 f_in=NULL;
1448 f_in2=NULL;
1449 f_out=NULL;
1450 f_out2=NULL;
1451 //analyze outsuffix intent
1452 GStr pocmd;
1453 if (outsuffix=="-") {
1454 f_out=stdout;
1455 }
1456 else {
1457 GStr ox=getFext(outsuffix);
1458 if (ox.length()>2) ox=ox.substr(0,2);
1459 if (ox=="gz") pocmd="gzip -9 -c ";
1460 else if (ox=="bz") pocmd="bzip2 -9 -c ";
1461 }
1462 if (s=="-") {
1463 f_in=stdin;
1464 infname="stdin";
1465 f_out=prepOutFile(infname, pocmd);
1466 return;
1467 } // streaming from stdin
1468 s.startTokenize(",:");
1469 s.nextToken(infname);
1470 bool paired=s.nextToken(infname2);
1471 if (fileExists(infname.chars())==0)
1472 GError("Error: cannot find file %s!\n",infname.chars());
1473 GStr fname(getFileName(infname.chars()));
1474 GStr picmd;
1475 guess_unzip(fname, picmd);
1476 if (picmd.is_empty()) {
1477 f_in=fopen(infname.chars(), "r");
1478 if (f_in==NULL) GError("Error opening file '%s'!\n",infname.chars());
1479 }
1480 else {
1481 picmd.append(infname);
1482 f_in=popen(picmd.chars(), "r");
1483 if (f_in==NULL) GError("Error at popen %s!\n", picmd.chars());
1484 }
1485 if (f_out==stdout) {
1486 if (paired) GError("Error: output suffix required for paired reads\n");
1487 return;
1488 }
1489 f_out=prepOutFile(infname, pocmd);
1490 if (!paired) return;
1491 if (doCollapse) GError("Error: sorry, -C option cannot be used with paired reads!\n");
1492 // ---- paired reads:-------------
1493 if (fileExists(infname2.chars())==0)
1494 GError("Error: cannot find file %s!\n",infname2.chars());
1495 picmd="";
1496 GStr fname2(getFileName(infname2.chars()));
1497 guess_unzip(fname2, picmd);
1498 if (picmd.is_empty()) {
1499 f_in2=fopen(infname2.chars(), "r");
1500 if (f_in2==NULL) GError("Error opening file '%s'!\n",infname2.chars());
1501 }
1502 else {
1503 picmd.append(infname2);
1504 f_in2=popen(picmd.chars(), "r");
1505 if (f_in2==NULL) GError("Error at popen %s!\n", picmd.chars());
1506 }
1507 f_out2=prepOutFile(infname2, pocmd);
1508 }