ViewVC Help
View File | Revision Log | Show Annotations | View Changeset | Root Listing
root/gclib/fqtrim/fqtrim.cpp
Revision: 183
Committed: Thu Feb 16 21:59:33 2012 UTC (7 years, 6 months ago) by gpertea
File size: 44373 byte(s)
Log Message:
works

Line File contents
1 #include "GArgs.h"
2 #include "GStr.h"
3 #include "GHash.hh"
4 #include "GList.hh"
5 #include <ctype.h>
6 #include "GAlnExtend.h"
7
8 #define USAGE "Usage:\n\
9 fqtrim [{-5 <5adaptor> -3 <3adaptor>|-f <adaptors_file>}] [-a <min_matchlen>]\\\n\
10 [-R] [-q <minq> [-t <trim_max_len>]] [-p {64|33}] [-o <outsuffix>]\\\n\
11 [-l <minlen>] [-C] [-D] [-Q] [-n <rename_prefix>] [-r <discarded.lst>]\\\n\
12 <input.fq>[,<input_mates.fq>\n\
13 \n\
14 Trim low quality bases at the 3' end and can trim adaptor sequence(s), filter\n\
15 for low complexity and collapse duplicate reads.\n\
16 If read pairs should be trimmed and kept together (i.e. without discarding\n\
17 one read in a pair), the two file names should be given delimited by a comma\n\
18 or a colon character\n\
19 \n\
20 Options:\n\
21 -n rename all the reads using the <prefix> followed by a read counter;\n\
22 if -C option was given, the suffix \"_x<N>\" is appended, with <N> being\n\
23 the read duplication count\n\
24 -o unless this parameter is '-', write the trimmed/filtered reads to \n\
25 file(s) named <input>.<outsuffix> which will be created in the \n\
26 current (working) directory; (writes to stdout if -o- is given);\n\
27 a suffix ending with .gz, .gzip or .bz2 will enforce compression\n\
28 -f file with adaptor sequences to trim, each line having this format:\n\
29 <5'-adaptor-sequence> <3'-adaptor-sequence>\n\
30 -5 trim the given adaptor or primer sequence at the 5' end of each read\n\
31 (e.g. -5 CGACAGGTTCAGAGTTCTACAGTCCGACGATC)\n\
32 -3 trim the given adaptor sequence at the 3' end of each read\n\
33 (e.g. -3 TCGTATGCCGTCTTCTGCTTG)\n\
34 -A disable polyA/T trimming (enabled by default)\n\
35 -y minimum length of exact match to adaptor sequence at the proper end (6)\n\
36 -q trim bases with quality value lower than <minq> (starting at the 3' end)\n\
37 -t for -q option, maximum trimming at the 3' end is limited to <trim_max_len>\n\
38 -m maximum percentage of Ns allowed in a read after trimming (default 7)\n\
39 -l minimum \"clean\" length after trimming that a read must have\n\
40 in order to pass the filter (default: 16)\n\
41 -r write a simple \"trash report\" file listing the discarded reads with a\n\
42 one letter code for the reason why they were trashed\n\
43 -D apply a low-complexity (dust) filter and discard any read that has over\n\
44 50% of its length detected as low complexity\n\
45 -C collapse duplicate reads and add a prefix with their count to the read\n\
46 name (e.g. for microRNAs)\n\
47 -p input is phred64/phred33 (use -p64 or -p33)\n\
48 -Q convert quality values to the other Phred qv type\n\
49 -V verbose processing\n\
50 "
51
52 //-z for -o option, the output stream(s) will be first piped into the given\n
53 // <zcmd> command, which must output to stdout (e.g. -z 'bzip2 -9 -c')\n
54
55
56 // example 3' adaptor for miRNAs: TCGTATGCCGTCTTCTGCTTG
57
58 //For paired reads sequencing:
59 //3' : ACACTCTTTCCCTACACGACGCTCTTCCGATCT
60 //5' : GATCGGAAGAGCGGTTCAGCAGGAATGCCGAG
61 //FILE* f_out=NULL; //stdout if not provided
62 //FILE* f_out2=NULL; //for paired reads
63 //FILE* f_in=NULL; //input fastq (stdin if not provided)
64 //FILE* f_in2=NULL; //for paired reads
65
66 FILE* freport=NULL;
67
68 bool debug=false;
69 bool verbose=false;
70 bool doCollapse=false;
71 bool doDust=false;
72 bool doPolyTrim=true;
73 bool fastaOutput=false;
74 bool trashReport=false;
75 bool revCompl=false; //also reverse complement adaptor sequences
76 int min_read_len=16;
77 double max_perc_N=7.0;
78 int dust_cutoff=16;
79 bool isfasta=false;
80 bool convert_phred=false;
81 GStr outsuffix; // -o
82 GStr prefix;
83 GStr zcmd;
84 int num_trimmed5=0;
85 int num_trimmed3=0;
86 int num_trimmedN=0;
87 int num_trimmedQ=0;
88 int min_trimmed5=INT_MAX;
89 int min_trimmed3=INT_MAX;
90
91 int qvtrim_qmin=0;
92 int qvtrim_max=0; //for -q, do not trim at 3'-end more than this number of bases
93 int qv_phredtype=0; // could be 64 or 33 (0 means undetermined yet)
94 int qv_cvtadd=0; //could be -31 or +31
95
96 // adaptor matching metrics -- for X-drop ungapped extension
97 //const int match_reward=2;
98 //const int mismatch_penalty=3;
99 const int match_reward=2;
100 const int mismatch_penalty=4;
101 const int Xdrop=10;
102
103 const int poly_m_score=2; //match score
104 const int poly_mis_score=-3; //mismatch
105 const int poly_dropoff_score=7;
106 int poly_minScore=12; //i.e. an exact match of 6 bases at the proper ends WILL be trimmed
107
108 const char *polyA_seed="AAAA";
109 const char *polyT_seed="TTTT";
110
111 struct CASeqData {
112 //positional data for every possible hexamer in an adaptor
113 GVec<uint16>* pz[4096]; //0-based coordinates of all possible hexamers in the adaptor sequence
114 GVec<uint16>* pzr[4096]; //0-based coordinates of all possible hexamers for the reverse complement of the adaptor sequence
115 GStr seq; //actual adaptor sequence data
116 GStr seqr; //reverse complement sequence
117 int amlen; //fraction of adaptor length matching that's
118 //enough to consider the alignment
119 GAlnTrimType trim_type;
120 bool use_reverse;
121 CASeqData(bool rev=false):seq(),seqr(),
122 amlen(0), use_reverse(rev) {
123 trim_type=galn_None; //should be updated later!
124 for (int i=0;i<4096;i++) {
125 pz[i]=NULL;
126 pzr[i]=NULL;
127 }
128 }
129
130 void update(const char* s) {
131 seq=s;
132 table6mers(seq.chars(), seq.length(), pz);
133 amlen=iround(double(seq.length())*0.8);
134 if (amlen<12)
135 amlen=12;
136 if (!use_reverse) return;
137 //reverse complement
138 seqr=s;
139 int slen=seq.length();
140 for (int i=0;i<slen;i++)
141 seqr[i]=ntComplement(seq[slen-i-1]);
142 table6mers(seqr.chars(), seqr.length(), pzr);
143 }
144
145 ~CASeqData() {
146 for (int i=0;i<4096;i++) {
147 delete pz[i];
148 delete pzr[i];
149 }
150 }
151 };
152
153 GVec<CASeqData> adaptors5;
154 GVec<CASeqData> adaptors3;
155
156 CGreedyAlignData* gxmem_l=NULL;
157 CGreedyAlignData* gxmem_r=NULL;
158
159 // element in dhash:
160 class FqDupRec {
161 public:
162 int count; //how many of these reads are the same
163 int len; //length of qv
164 char* firstname; //optional, only if we want to keep the original read names
165 char* qv;
166 FqDupRec(GStr* qstr=NULL, const char* rname=NULL) {
167 len=0;
168 qv=NULL;
169 firstname=NULL;
170 count=0;
171 if (qstr!=NULL) {
172 qv=Gstrdup(qstr->chars());
173 len=qstr->length();
174 count++;
175 }
176 if (rname!=NULL) firstname=Gstrdup(rname);
177 }
178 ~FqDupRec() {
179 GFREE(qv);
180 GFREE(firstname);
181 }
182 void add(GStr& d) { //collapse another record into this one
183 if (d.length()!=len)
184 GError("Error at FqDupRec::add(): cannot collapse reads with different length!\n");
185 count++;
186 for (int i=0;i<len;i++)
187 qv[i]=(qv[i]+d[i])/2;
188 }
189 };
190
191 void openfw(FILE* &f, GArgs& args, char opt) {
192 GStr s=args.getOpt(opt);
193 if (!s.is_empty()) {
194 if (s=='-') f=stdout;
195 else {
196 f=fopen(s.chars(),"w");
197 if (f==NULL) GError("Error creating file: %s\n", s.chars());
198 }
199 }
200 }
201
202 #define FWCLOSE(fh) if (fh!=NULL && fh!=stdout) fclose(fh)
203 #define FRCLOSE(fh) if (fh!=NULL && fh!=stdin) fclose(fh)
204
205 GHash<FqDupRec> dhash; //hash to keep track of duplicates
206
207 void addAdaptor(GVec<CASeqData>& adaptors, GStr& seq, GAlnTrimType trim_type);
208 int loadAdaptors(const char* fname);
209
210 void setupFiles(FILE*& f_in, FILE*& f_in2, FILE*& f_out, FILE*& f_out2,
211 GStr& s, GStr& infname, GStr& infname2);
212 // uses outsuffix to generate output file names and open file handles as needed
213
214 void writeRead(FILE* f_out, GStr& rname, GStr& rinfo, GStr& rseq, GStr& rqv, int& outcounter);
215 void trash_report(char trashcode, GStr& rname, FILE* freport);
216
217 bool getFastxRec(GLineReader& fq, GStr& rseq, GStr& rqv,
218 GStr& rname, GStr& rinfo, GStr& infname);
219
220 char process_read(GStr& rname, GStr& rseq, GStr& rqv, int &l5, int &l3);
221 //returns 0 if the read was untouched, 1 if it was trimmed and a trash code if it was trashed
222
223 bool ntrim(GStr& rseq, int &l5, int &l3); //returns true if any trimming occured
224 bool qtrim(GStr& qvs, int &l5, int &l3); //return true if any trimming occured
225 int dust(GStr& seq);
226 bool trim_poly5(GStr &seq, int &l5, int &l3, const char* poly_seed); //returns true if any trimming occured
227 bool trim_poly3(GStr &seq, int &l5, int &l3, const char* poly_seed);
228 bool trim_adaptor5(GStr& seq, int &l5, int &l3); //returns true if any trimming occured
229 bool trim_adaptor3(GStr& seq, int &l5, int &l3);
230
231 void convertPhred(char* q, int len);
232 void convertPhred(GStr& q);
233
234 int main(int argc, char * const argv[]) {
235 GArgs args(argc, argv, "YQDCRVAl:d:3:5:m:n:r:p:q:f:t:o:z:a:");
236 int e;
237 if ((e=args.isError())>0) {
238 GMessage("%s\nInvalid argument: %s\n", USAGE, argv[e]);
239 exit(224);
240 }
241 debug=(args.getOpt('Y')!=NULL);
242 verbose=(args.getOpt('V')!=NULL);
243 convert_phred=(args.getOpt('Q')!=NULL);
244 doCollapse=(args.getOpt('C')!=NULL);
245 doDust=(args.getOpt('D')!=NULL);
246 revCompl=(args.getOpt('R')!=NULL);
247 if (args.getOpt('A')) doPolyTrim=false;
248 /*
249 rawFormat=(args.getOpt('R')!=NULL);
250 if (rawFormat) {
251 GError("Sorry, raw qseq format parsing is not implemented yet!\n");
252 }
253 */
254 prefix=args.getOpt('n');
255 GStr s=args.getOpt('l');
256 if (!s.is_empty())
257 min_read_len=s.asInt();
258 s=args.getOpt('m');
259 if (!s.is_empty())
260 max_perc_N=s.asDouble();
261 s=args.getOpt('d');
262 if (!s.is_empty()) {
263 dust_cutoff=s.asInt();
264 doDust=true;
265 }
266 s=args.getOpt('q');
267 if (!s.is_empty()) {
268 qvtrim_qmin=s.asInt();
269 }
270 s=args.getOpt('t');
271 if (!s.is_empty()) {
272 qvtrim_max=s.asInt();
273 }
274 s=args.getOpt('p');
275 if (!s.is_empty()) {
276 int v=s.asInt();
277 if (v==33) {
278 qv_phredtype=33;
279 qv_cvtadd=31;
280 }
281 else if (v==64) {
282 qv_phredtype=64;
283 qv_cvtadd=-31;
284 }
285 else
286 GMessage("%s\nInvalid value for -p option (can only be 64 or 33)!\n",USAGE);
287 }
288 s=args.getOpt('f');
289 if (!s.is_empty()) {
290 loadAdaptors(s.chars());
291 }
292 bool fileAdaptors=adaptors5.Count()+adaptors3.Count();
293 s=args.getOpt('5');
294 if (!s.is_empty()) {
295 if (fileAdaptors)
296 GError("Error: options -5 and -f cannot be used together!\n");
297 s.upper();
298 addAdaptor(adaptors5, s, galn_TrimLeft);
299 }
300 s=args.getOpt('3');
301 if (!s.is_empty()) {
302 if (fileAdaptors)
303 GError("Error: options -3 and -f cannot be used together!\n");
304 s.upper();
305 addAdaptor(adaptors3, s, galn_TrimRight);
306 }
307 s=args.getOpt('y');
308 if (!s.is_empty()) {
309 int minmatch=s.asInt();
310 poly_minScore=minmatch*poly_m_score;
311 }
312
313 if (args.getOpt('o')!=NULL) outsuffix=args.getOpt('o');
314 else outsuffix="-";
315 trashReport= (args.getOpt('r')!=NULL);
316 int fcount=args.startNonOpt();
317 if (fcount==0) {
318 GMessage(USAGE);
319 exit(224);
320 }
321 if (fcount>1 && doCollapse) {
322 GError("%s Sorry, the -C option only works with a single input.\n", USAGE);
323 }
324 //openfw(f_out, args, 'o');
325 //if (f_out==NULL) f_out=stdout;
326 if (trashReport)
327 openfw(freport, args, 'r');
328 char* infile=NULL;
329
330 if (adaptors5.Count()>0)
331 //gxmem_l=new CGreedyAlignData(match_reward, mismatch_penalty, Xdrop-2);
332 gxmem_l=new CGreedyAlignData(match_reward, mismatch_penalty, Xdrop);
333 if (adaptors3.Count()>0)
334 gxmem_r=new CGreedyAlignData(match_reward, mismatch_penalty, Xdrop);
335
336 while ((infile=args.nextNonOpt())!=NULL) {
337 //for each input file
338 int incounter=0; //counter for input reads
339 int outcounter=0; //counter for output reads
340 int trash_s=0; //too short from the get go
341 int trash_Q=0;
342 int trash_N=0;
343 int trash_D=0;
344 int trash_poly=0;
345 int trash_A3=0;
346 int trash_A5=0;
347 s=infile;
348 GStr infname;
349 GStr infname2;
350 FILE* f_in=NULL;
351 FILE* f_in2=NULL;
352 FILE* f_out=NULL;
353 FILE* f_out2=NULL;
354 bool paired_reads=false;
355 setupFiles(f_in, f_in2, f_out, f_out2, s, infname, infname2);
356 GLineReader fq(f_in);
357 GLineReader* fq2=NULL;
358 if (f_in2!=NULL) {
359 fq2=new GLineReader(f_in2);
360 paired_reads=true;
361 }
362 GStr rseq, rqv, seqid, seqinfo;
363 GStr rseq2, rqv2, seqid2, seqinfo2;
364 while (getFastxRec(fq, rseq, rqv, seqid, seqinfo, infname)) {
365 incounter++;
366 int a5=0, a3=0, b5=0, b3=0;
367 char tcode=0, tcode2=0;
368 tcode=process_read(seqid, rseq, rqv, a5, a3);
369 if (fq2!=NULL) {
370 getFastxRec(*fq2, rseq2, rqv2, seqid2, seqinfo2, infname2);
371 if (seqid.substr(0,seqid.length()-1)!=seqid2.substr(0,seqid2.length()-1)) {
372 GError("Error: no paired match for read %s vs %s (%s,%s)\n",
373 seqid.chars(), seqid2.chars(), infname.chars(), infname2.chars());
374 }
375 tcode2=process_read(seqid2, rseq2, rqv2, b5, b3);
376 //decide what to do with this pair and print rseq2 if the pair makes it
377 if (tcode>1 && tcode2<=1) {
378 //"untrash" rseq
379 if (a3-a5+1<min_read_len) {
380 a5=1;
381 if (a3<min_read_len) { a3= GMIN(rseq.length()-1, min_read_len+1); }
382 }
383 tcode=1;
384 }
385 else if (tcode<=1 && tcode2>1) {
386 //"untrash" rseq2
387 if (b3-b5+1<min_read_len) {
388 b5=1;
389 if (b3<min_read_len) { b3= GMIN((rseq2.length()-1),(min_read_len+1)); }
390 }
391 tcode2=1;
392 }
393 if (tcode<=1) { //trimmed or left intact -- write it!
394 if (tcode>0) {
395 rseq2=rseq2.substr(b5,b3-b5+1);
396 if (!rqv2.is_empty()) rqv2=rqv2.substr(b5,b3-b5+1);
397 }
398 int nocounter=0;
399 writeRead(f_out2, seqid2, seqinfo2, rseq2, rqv2, nocounter);
400 }
401 } //pair read
402 if (tcode>1) { //trashed
403 #ifdef GDEBUG
404 GMessage(" !!!!TRASH code = %c\n",tcode);
405 #endif
406 if (tcode=='s') trash_s++;
407 else if (tcode=='A' || tcode=='T') trash_poly++;
408 else if (tcode=='Q') trash_Q++;
409 else if (tcode=='N') trash_N++;
410 else if (tcode=='D') trash_D++;
411 else if (tcode=='3') trash_A3++;
412 else if (tcode=='5') trash_A5++;
413 if (trashReport) trash_report(tcode, seqid, freport);
414 }
415 else if (!doCollapse) { //write it
416 if (tcode>0) {
417 rseq=rseq.substr(a5,a3-a5+1);
418 if (!rqv.is_empty()) rqv=rqv.substr(a5,a3-a5+1);
419 }
420 #ifdef GDEBUG
421 GMessage(" After trimming:\n");
422 GMessage("%s\n",rseq.chars());
423 #endif
424 writeRead(f_out, seqid, seqinfo, rseq, rqv, outcounter);
425 }
426 } //for each fastq record
427 delete fq2;
428 FRCLOSE(f_in);
429 FRCLOSE(f_in2);
430 if (doCollapse) {
431 outcounter=0;
432 int maxdup_count=1;
433 char* maxdup_seq=NULL;
434 dhash.startIterate();
435 FqDupRec* qd=NULL;
436 char* seq=NULL;
437 while ((qd=dhash.NextData(seq))!=NULL) {
438 GStr rseq(seq);
439 //do the dusting here
440 if (doDust) {
441 int dustbases=dust(rseq);
442 if (dustbases>(rseq.length()>>1)) {
443 if (trashReport && qd->firstname!=NULL) {
444 fprintf(freport, "%s_x%d\tD\n",qd->firstname, qd->count);
445 }
446 trash_D+=qd->count;
447 continue;
448 }
449 }
450 outcounter++;
451 if (qd->count>maxdup_count) {
452 maxdup_count=qd->count;
453 maxdup_seq=seq;
454 }
455 if (isfasta) {
456 if (prefix.is_empty()) {
457 fprintf(f_out, ">%s_x%d\n%s\n", qd->firstname, qd->count,
458 rseq.chars());
459 }
460 else { //use custom read name
461 fprintf(f_out, ">%s%08d_x%d\n%s\n", prefix.chars(), outcounter,
462 qd->count, rseq.chars());
463 }
464 }
465 else { //fastq format
466 if (convert_phred) convertPhred(qd->qv, qd->len);
467 if (prefix.is_empty()) {
468 fprintf(f_out, "@%s_x%d\n%s\n+\n%s\n", qd->firstname, qd->count,
469 rseq.chars(), qd->qv);
470 }
471 else { //use custom read name
472 fprintf(f_out, "@%s%08d_x%d\n%s\n+\n%s\n", prefix.chars(), outcounter,
473 qd->count, rseq.chars(), qd->qv);
474 }
475 }
476 }//for each element of dhash
477 if (maxdup_count>1) {
478 GMessage("Maximum read multiplicity: x %d (read: %s)\n",maxdup_count, maxdup_seq);
479 }
480 } //collapse entries
481 if (verbose) {
482 if (paired_reads) {
483 GMessage(">Input files : %s , %s\n", infname.chars(), infname2.chars());
484 GMessage("Number of input pairs :%9d\n", incounter);
485 GMessage(" Output pairs :%9d\n", outcounter);
486 }
487 else {
488 GMessage(">Input file : %s\n", infname.chars());
489 GMessage("Number of input reads :%9d\n", incounter);
490 GMessage(" Output reads :%9d\n", outcounter);
491 }
492 GMessage("------------------------------------\n");
493 if (num_trimmed5)
494 GMessage(" 5' trimmed :%9d (min. trim: %d)\n", num_trimmed5, min_trimmed5);
495 if (num_trimmed3)
496 GMessage(" 3' trimmed :%9d (min. trim: %d)\n", num_trimmed3, min_trimmed3);
497 GMessage("------------------------------------\n");
498 if (trash_s>0)
499 GMessage(" Trashed by length:%9d\n", trash_s);
500 if (trash_N>0)
501 GMessage(" Trashed by N%%:%9d\n", trash_N);
502 if (trash_Q>0)
503 GMessage("Trashed by low quality:%9d\n", trash_Q);
504 if (trash_poly>0)
505 GMessage(" Trashed by poly-A/T:%9d\n", trash_poly);
506 if (trash_A5>0)
507 GMessage(" Trashed by 5' adaptor:%9d\n", trash_A5);
508 if (trash_A3>0)
509 GMessage(" Trashed by 3' adaptor:%9d\n", trash_A3);
510 }
511 if (trashReport) {
512 FWCLOSE(freport);
513 }
514 FWCLOSE(f_out);
515 FWCLOSE(f_out2);
516 } //while each input file
517 delete gxmem_l;
518 delete gxmem_r;
519 //getc(stdin);
520 }
521
522 class NData {
523 public:
524 int NPos[1024]; //there should be no reads longer than 1K ?
525 int NCount;
526 int end5;
527 int end3;
528 int seqlen;
529 double perc_N; //percentage of Ns in end5..end3 range only!
530 const char* seq;
531 bool valid;
532 NData() {
533 seqlen=0;
534 NCount=0;
535 end5=0;
536 end3=0;
537 seq=NULL;
538 perc_N=0;
539 valid=true;
540 }
541 void init(GStr& rseq) {
542 NCount=0;
543 perc_N=0;
544 seqlen=rseq.length();
545 seq=rseq.chars();
546 end5=1;
547 end3=seqlen;
548 for (int i=0;i<seqlen;i++)
549 if (seq[i]=='N') {// if (!ichrInStr(rseq[i], "ACGT")
550 NPos[NCount]=i;
551 NCount++;
552 }
553 perc_N=(NCount*100.0)/seqlen;
554 valid=true;
555 }
556 void N_calc() { //only in the region after trimming
557 int n=0;
558 for (int i=end3-1;i<end5;i++) {
559 if (seq[i]=='N') n++;
560 }
561 perc_N=(n*100.0)/(end5-end3+1);
562 }
563 };
564
565 static NData feat;
566 int perc_lenN=12; // incremental distance from ends, in percentage of
567 // sequence length, where N-trimming is done (default:12 %) (autolimited to 20)
568
569 void N_analyze(int l5, int l3, int p5, int p3) {
570 /* assumes feat was filled properly */
571 int old_dif, t5,t3,v;
572 if (l3<l5+2 || p5>p3 ) {
573 feat.end5=l5+1;
574 feat.end3=l3+1;
575 return;
576 }
577
578 t5=feat.NPos[p5]-l5;
579 t3=l3-feat.NPos[p3];
580 old_dif=p3-p5;
581 v=(int)((((double)(l3-l5))*perc_lenN)/100);
582 if (v>20) v=20; /* enforce N-search limit for very long reads */
583 if (t5 < v ) {
584 l5=feat.NPos[p5]+1;
585 p5++;
586 }
587 if (t3 < v) {
588 l3=feat.NPos[p3]-1;
589 p3--;
590 }
591 /* restNs=p3-p5; number of Ns in the new CLR */
592 if (p3-p5==old_dif) { /* no change, return */
593 /* don't trim if this may shorten the read too much */
594 //if (l5-l3<min_read_len) return;
595 feat.end5=l5+1;
596 feat.end3=l3+1;
597 return;
598 }
599 else
600 N_analyze(l5,l3, p5,p3);
601 }
602
603
604 bool qtrim(GStr& qvs, int &l5, int &l3) {
605 if (qvtrim_qmin==0 || qvs.is_empty()) return false;
606 if (qv_phredtype==0) {
607 //try to guess the Phred type
608 int vmin=256, vmax=0;
609 for (int i=0;i<qvs.length();i++) {
610 if (vmin>qvs[i]) vmin=qvs[i];
611 if (vmax<qvs[i]) vmax=qvs[i];
612 }
613 if (vmin<64) { qv_phredtype=33; qv_cvtadd=31; }
614 if (vmax>95) { qv_phredtype=64; qv_cvtadd=-31; }
615 if (qv_phredtype==0) {
616 GError("Error: couldn't determine Phred type, please use the -p33 or -p64 !\n");
617 }
618 if (verbose)
619 GMessage("Input reads have Phred-%d quality values.\n", (qv_phredtype==33 ? 33 : 64));
620 } //guessing Phred type
621 for (l3=qvs.length()-1;l3>2;l3--) {
622 if (qvs[l3]-qv_phredtype>=qvtrim_qmin && qvs[l3-1]-qv_phredtype>=qvtrim_qmin) break;
623 }
624 //just in case, check also the 5' the end (?)
625 for (l5=0;l5<qvs.length()-3;l5++) {
626 if (qvs[l5]-qv_phredtype>=qvtrim_qmin && qvs[l5+1]-qv_phredtype>=qvtrim_qmin) break;
627 }
628 if (qvtrim_max>0) {
629 if (qvs.length()-1-l3>qvtrim_max) l3=qvs.length()-1-qvtrim_max;
630 if (l5>qvtrim_max) l5=qvtrim_max;
631 }
632 return (l5>0 || l3<qvs.length()-1);
633 }
634
635 bool ntrim(GStr& rseq, int &l5, int &l3) {
636 //count Ns in the sequence, trim N-rich ends
637 feat.init(rseq);
638 l5=feat.end5-1;
639 l3=feat.end3-1;
640 N_analyze(feat.end5-1, feat.end3-1, 0, feat.NCount-1);
641 if (l5==feat.end5-1 && l3==feat.end3-1) {
642 if (feat.perc_N>max_perc_N) {
643 feat.valid=false;
644 l3=l5+1;
645 return true;
646 }
647 else {
648 return false; //nothing changed
649 }
650 }
651 l5=feat.end5-1;
652 l3=feat.end3-1;
653 if (l3-l5+1<min_read_len) {
654 feat.valid=false;
655 return true;
656 }
657 feat.N_calc();
658
659 if (feat.perc_N>max_perc_N) {
660 feat.valid=false;
661 l3=l5+1;
662 return true;
663 }
664 return true;
665 }
666
667 //--------------- dust functions ----------------
668 class DNADuster {
669 public:
670 int dustword;
671 int dustwindow;
672 int dustwindow2;
673 int dustcutoff;
674 int mv, iv, jv;
675 int counts[32*32*32];
676 int iis[32*32*32];
677 DNADuster(int cutoff=16, int winsize=32, int wordsize=3) {
678 dustword=wordsize;
679 dustwindow=winsize;
680 dustwindow2 = (winsize>>1);
681 dustcutoff=cutoff;
682 mv=0;
683 iv=0;
684 jv=0;
685 }
686 void setWindowSize(int value) {
687 dustwindow = value;
688 dustwindow2 = (dustwindow >> 1);
689 }
690 void setWordSize(int value) {
691 dustword=value;
692 }
693 void wo1(int len, const char* s, int ivv) {
694 int i, ii, j, v, t, n, n1, sum;
695 int js, nis;
696 n = 32 * 32 * 32;
697 n1 = n - 1;
698 nis = 0;
699 i = 0;
700 ii = 0;
701 sum = 0;
702 v = 0;
703 for (j=0; j < len; j++, s++) {
704 ii <<= 5;
705 if (*s<=32) {
706 i=0;
707 continue;
708 }
709 ii |= *s - 'A'; //assume uppercase!
710 ii &= n1;
711 i++;
712 if (i >= dustword) {
713 for (js=0; js < nis && iis[js] != ii; js++) ;
714 if (js == nis) {
715 iis[nis] = ii;
716 counts[ii] = 0;
717 nis++;
718 }
719 if ((t = counts[ii]) > 0) {
720 sum += t;
721 v = 10 * sum / j;
722 if (mv < v) {
723 mv = v;
724 iv = ivv;
725 jv = j;
726 }
727 }
728 counts[ii]++;
729 }
730 }
731 }
732
733 int wo(int len, const char* s, int* beg, int* end) {
734 int i, l1;
735 l1 = len - dustword + 1;
736 if (l1 < 0) {
737 *beg = 0;
738 *end = len - 1;
739 return 0;
740 }
741 mv = 0;
742 iv = 0;
743 jv = 0;
744 for (i=0; i < l1; i++) {
745 wo1(len-i, s+i, i);
746 }
747 *beg = iv;
748 *end = iv + jv;
749 return mv;
750 }
751
752 void dust(const char* seq, char* seqmsk, int seqlen, int cutoff=0) { //, maskFunc maskfn) {
753 int i, j, l, a, b, v;
754 if (cutoff==0) cutoff=dustcutoff;
755 a=0;b=0;
756 //GMessage("Dust cutoff=%d\n", cutoff);
757 for (i=0; i < seqlen; i += dustwindow2) {
758 l = (seqlen > i+dustwindow) ? dustwindow : seqlen-i;
759 v = wo(l, seq+i, &a, &b);
760 if (v > cutoff) {
761 //for (j = a; j <= b && j < dustwindow2; j++) {
762 for (j = a; j <= b; j++) {
763 seqmsk[i+j]='N';//could be made lowercase instead
764 }
765 }
766 }
767 //return first;
768 }
769 };
770
771 static DNADuster duster;
772
773 int dust(GStr& rseq) {
774 char* seq=Gstrdup(rseq.chars());
775 duster.dust(rseq.chars(), seq, rseq.length(), dust_cutoff);
776 //check the number of Ns:
777 int ncount=0;
778 for (int i=0;i<rseq.length();i++) {
779 if (seq[i]=='N') ncount++;
780 }
781 GFREE(seq);
782 return ncount;
783 }
784
785 struct SLocScore {
786 int pos;
787 int score;
788 SLocScore(int p=0,int s=0) {
789 pos=p;
790 score=s;
791 }
792 void set(int p, int s) {
793 pos=p;
794 score=s;
795 }
796 void add(int p, int add) {
797 pos=p;
798 score+=add;
799 }
800 };
801
802 bool trim_poly3(GStr &seq, int &l5, int &l3, const char* poly_seed) {
803 if (!doPolyTrim) return false;
804 int rlen=seq.length();
805 l5=0;
806 l3=rlen-1;
807 int32 seedVal=*(int32*)poly_seed;
808 char polyChar=poly_seed[0];
809 //assumes N trimming was already done
810 //so a poly match should be very close to the end of the read
811 // -- find the initial match (seed)
812 int lmin=GMAX((rlen-16), 0);
813 int li;
814 for (li=rlen-4;li>lmin;li--) {
815 if (seedVal==*(int*)&(seq[li])) {
816 break;
817 }
818 }
819 if (li<=lmin) return false;
820 //seed found, try to extend it both ways
821 //extend right
822 int ri=li+3;
823 SLocScore loc(ri, poly_m_score<<2);
824 SLocScore maxloc(loc);
825 //extend right
826 while (ri<rlen-1) {
827 ri++;
828 if (seq[ri]==polyChar) {
829 loc.add(ri,poly_m_score);
830 }
831 else if (seq[ri]=='N') {
832 loc.add(ri,0);
833 }
834 else { //mismatch
835 loc.add(ri,poly_mis_score);
836 if (maxloc.score-loc.score>poly_dropoff_score) break;
837 }
838 if (maxloc.score<=loc.score) {
839 maxloc=loc;
840 }
841 }
842 ri=maxloc.pos;
843 if (ri<rlen-6) return false; //no trimming wanted, too far from 3' end
844 //ri = right boundary for the poly match
845 //extend left
846 loc.set(li, maxloc.score);
847 maxloc.pos=li;
848 while (li>0) {
849 li--;
850 if (seq[li]==polyChar) {
851 loc.add(li,poly_m_score);
852 }
853 else if (seq[li]=='N') {
854 loc.add(li,0);
855 }
856 else { //mismatch
857 loc.add(li,poly_mis_score);
858 if (maxloc.score-loc.score>poly_dropoff_score) break;
859 }
860 if (maxloc.score<=loc.score) {
861 maxloc=loc;
862 }
863 }
864 li=maxloc.pos;
865 if ((maxloc.score==poly_minScore && ri==rlen-1) ||
866 (maxloc.score>poly_minScore && ri>=rlen-3) ||
867 (maxloc.score>(poly_minScore*3) && ri>=rlen-8)) {
868 //trimming this li-ri match at 3' end
869 l3=li-1;
870 if (l3<0) l3=0;
871 return true;
872 }
873 return false;
874 }
875
876 bool trim_poly5(GStr &seq, int &l5, int &l3, const char* poly_seed) {
877 if (!doPolyTrim) return false;
878 int rlen=seq.length();
879 l5=0;
880 l3=rlen-1;
881 int32 seedVal=*(int32*)poly_seed;
882 char polyChar=poly_seed[0];
883 //assumes N trimming was already done
884 //so a poly match should be very close to the end of the read
885 // -- find the initial match (seed)
886 int lmax=GMIN(12, rlen-4);//how far from 5' end to look for 4-mer seeds
887 int li;
888 for (li=0;li<=lmax;li++) {
889 if (seedVal==*(int*)&(seq[li])) {
890 break;
891 }
892 }
893 if (li>lmax) return false;
894 //seed found, try to extend it both ways
895 //extend left
896 int ri=li+3; //save rightmost base of the seed
897 SLocScore loc(li, poly_m_score<<2);
898 SLocScore maxloc(loc);
899 while (li>0) {
900 li--;
901 if (seq[li]==polyChar) {
902 loc.add(li,poly_m_score);
903 }
904 else if (seq[li]=='N') {
905 loc.add(li,0);
906 }
907 else { //mismatch
908 loc.add(li,poly_mis_score);
909 if (maxloc.score-loc.score>poly_dropoff_score) break;
910 }
911 if (maxloc.score<=loc.score) {
912 maxloc=loc;
913 }
914 }
915 li=maxloc.pos;
916 if (li>5) return false; //no trimming wanted, too far from 5' end
917 //li = right boundary for the poly match
918
919 //extend right
920 loc.set(ri, maxloc.score);
921 maxloc.pos=ri;
922 while (ri<rlen-1) {
923 ri++;
924 if (seq[ri]==polyChar) {
925 loc.add(ri,poly_m_score);
926 }
927 else if (seq[ri]=='N') {
928 loc.add(ri,0);
929 }
930 else { //mismatch
931 loc.add(ri,poly_mis_score);
932 if (maxloc.score-loc.score>poly_dropoff_score) break;
933 }
934 if (maxloc.score<=loc.score) {
935 maxloc=loc;
936 }
937 }
938 ri=maxloc.pos;
939 if ((maxloc.score==poly_minScore && li==0) ||
940 (maxloc.score>poly_minScore && li<2)
941 || (maxloc.score>(poly_minScore*3) && li<8)) {
942 //adjust l5 to reflect this trimming of 5' end
943 l5=ri+1;
944 if (l5>rlen-1) l5=rlen-1;
945 return true;
946 }
947 return false;
948 }
949
950 bool trim_adaptor3(GStr& seq, int&l5, int &l3) {
951 if (adaptors3.Count()==0) return false;
952 int rlen=seq.length();
953 l5=0;
954 l3=rlen-1;
955 bool trimmed=false;
956 GStr wseq(seq);
957 int wlen=rlen;
958 GXSeqData seqdata;
959 int numruns=revCompl ? 2 : 1;
960
961 for (int ai=0;ai<adaptors3.Count();ai++) {
962 for (int r=0;r<numruns;r++) {
963 if (r) {
964 seqdata.update(adaptors3[ai].seqr.chars(), adaptors3[ai].seqr.length(),
965 adaptors3[ai].pzr, wseq.chars(), wlen, adaptors3[ai].amlen);
966 }
967 else {
968 seqdata.update(adaptors3[ai].seq.chars(), adaptors3[ai].seq.length(),
969 adaptors3[ai].pz, wseq.chars(), wlen, adaptors3[ai].amlen);
970 }
971
972 GXAlnInfo* aln=match_adaptor(seqdata, adaptors3[ai].trim_type, gxmem_r, 86);
973 if (aln) {
974 trimmed=true;
975 //keep unmatched region on the left OR right (the longer one)
976 if (aln->sl > wlen-aln->sr) {
977 //keep left side
978 l3-=(wlen-aln->sl+1);
979 if (l3<0) l3=0;
980 }
981 else { //keep right side
982 l5+=aln->sr;
983 if (l5>=rlen) l5=rlen-1;
984 }
985 delete aln;
986 if (l3-l5+1<min_read_len) return true;
987 wseq=seq.substr(l5,l3-l5+1);
988 wlen=wseq.length();
989 }
990 }//forward and reverse adaptors
991 }//for each 3' adaptor
992 return trimmed;
993 }
994
995 bool trim_adaptor5(GStr& seq, int&l5, int &l3) {
996 if (adaptors5.Count()==0) return false;
997 int rlen=seq.length();
998 l5=0;
999 l3=rlen-1;
1000 bool trimmed=false;
1001 GStr wseq(seq);
1002 int wlen=rlen;
1003 GXSeqData seqdata;
1004 int numruns=revCompl ? 2 : 1;
1005 GList<GXAlnInfo> bestalns(true, true, false);
1006 for (int ai=0;ai<adaptors5.Count();ai++) {
1007 for (int r=0;r<numruns;r++) {
1008 if (r) {
1009 seqdata.update(adaptors5[ai].seqr.chars(), adaptors5[ai].seqr.length(),
1010 adaptors5[ai].pzr, wseq.chars(), wlen, adaptors5[ai].amlen);
1011 }
1012 else {
1013 seqdata.update(adaptors5[ai].seq.chars(), adaptors5[ai].seq.length(),
1014 adaptors5[ai].pz, wseq.chars(), wlen, adaptors5[ai].amlen);
1015 }
1016 GXAlnInfo* aln=match_adaptor(seqdata, adaptors5[ai].trim_type, gxmem_l, 90);
1017 if (aln) {
1018 if (aln->strong) {
1019 trimmed=true;
1020 bestalns.Add(aln);
1021 break; //will check the rest next time
1022 }
1023 else bestalns.Add(aln);
1024 }
1025 } //forward and reverse?
1026 if (trimmed) break; //will check the rest in the next cycle
1027 }//for each 5' adaptor
1028 if (bestalns.Count()>0) {
1029 GXAlnInfo* aln=bestalns[0];
1030 if (aln->sl-1 > wlen-aln->sr) {
1031 //keep left side
1032 l3-=(wlen-aln->sl+1);
1033 if (l3<0) l3=0;
1034 }
1035 else { //keep right side
1036 l5+=aln->sr;
1037 if (l5>=rlen) l5=rlen-1;
1038 }
1039 //delete aln;
1040 //if (l3-l5+1<min_read_len) return true;
1041 wseq=seq.substr(l5,l3-l5+1);
1042 wlen=wseq.length();
1043 return true; //break the loops here to report a good find
1044 }
1045 return false;
1046 }
1047
1048 //convert qvs to/from phred64 from/to phread33
1049 void convertPhred(GStr& q) {
1050 for (int i=0;i<q.length();i++) q[i]+=qv_cvtadd;
1051 }
1052
1053 void convertPhred(char* q, int len) {
1054 for (int i=0;i<len;i++) q[i]+=qv_cvtadd;
1055 }
1056
1057 bool getFastxRec(GLineReader& fq, GStr& rseq, GStr& rqv,
1058 GStr& rname, GStr& rinfo, GStr& infname) {
1059 rseq="";
1060 rqv="";
1061 rname="";
1062 rinfo="";
1063 if (fq.eof()) return false;
1064 char* l=fq.getLine();
1065 while (l!=NULL && (l[0]==0 || isspace(l[0]))) l=fq.getLine(); //ignore empty lines
1066 if (l==NULL) return false;
1067 /* if (rawFormat) {
1068 //TODO: implement raw qseq parsing here
1069 //if (raw type=N) then continue; //skip invalid/bad records
1070 } //raw qseq format
1071 else { // FASTQ or FASTA */
1072 isfasta=(l[0]=='>');
1073 if (!isfasta && l[0]!='@') GError("Error: fasta/fastq record marker not found(%s)\n%s\n",
1074 infname.chars(), l);
1075 GStr s(l);
1076 rname=&(l[1]);
1077 for (int i=0;i<rname.length();i++)
1078 if (rname[i]<=' ') {
1079 if (i<rname.length()-2) rinfo=rname.substr(i+1);
1080 rname.cut(i);
1081 break;
1082 }
1083 //now get the sequence
1084 if ((l=fq.getLine())==NULL)
1085 GError("Error: unexpected EOF after header for read %s (%s)\n",
1086 rname.chars(), infname.chars());
1087 rseq=l; //this must be the DNA line
1088 while ((l=fq.getLine())!=NULL) {
1089 //seq can span multiple lines
1090 if (l[0]=='>' || l[0]=='+') {
1091 fq.pushBack();
1092 break; //
1093 }
1094 rseq+=l;
1095 } //check for multi-line seq
1096 if (!isfasta) { //reading fastq quality values, which can also be multi-line
1097 if ((l=fq.getLine())==NULL)
1098 GError("Error: unexpected EOF after sequence for %s\n", rname.chars());
1099 if (l[0]!='+') GError("Error: fastq qv header marker not detected!\n");
1100 if ((l=fq.getLine())==NULL)
1101 GError("Error: unexpected EOF after qv header for %s\n", rname.chars());
1102 rqv=l;
1103 //if (rqv.length()!=rseq.length())
1104 // GError("Error: qv len != seq len for %s\n", rname.chars());
1105 while (rqv.length()<rseq.length() && ((l=fq.getLine())!=NULL)) {
1106 rqv+=l; //append to qv string
1107 }
1108 }// fastq
1109 // } //<-- FASTA or FASTQ
1110 rseq.upper();
1111 return true;
1112 }
1113
1114 #ifdef GDEBUG
1115 void showTrim(GStr& s, int l5, int l3) {
1116 if (l5>0 || l3==0) {
1117 color_bg(c_red);
1118 }
1119 for (int i=0;i<s.length()-1;i++) {
1120 if (i && i==l5) color_resetbg();
1121 fprintf(stderr, "%c", s[i]);
1122 if (i && i==l3) color_bg(c_red);
1123 }
1124 fprintf(stderr, "%c", s[s.length()-1]);
1125 color_reset();
1126 fprintf(stderr, "\n");
1127 }
1128 #endif
1129
1130 char process_read(GStr& rname, GStr& rseq, GStr& rqv, int &l5, int &l3) {
1131 //returns 0 if the read was untouched, 1 if it was just trimmed
1132 // and a trash code if it was trashed
1133 l5=0;
1134 l3=rseq.length()-1;
1135 #ifdef GDEBUG
1136 //rseq.reverse();
1137 GMessage(">%s\n", rname.chars());
1138 GMessage("%s\n",rseq.chars());
1139 #endif
1140 if (l3-l5+1<min_read_len) {
1141 return 's';
1142 }
1143 GStr wseq(rseq.chars());
1144 GStr wqv(rqv.chars());
1145 int w5=l5;
1146 int w3=l3;
1147 //first do the q-based trimming
1148 if (qvtrim_qmin!=0 && !wqv.is_empty() && qtrim(wqv, w5, w3)) { // qv-threshold trimming
1149 if (w3-w5+1<min_read_len) {
1150 return 'Q'; //invalid read
1151 }
1152 //-- keep only the w5..w3 range
1153 l5=w5;
1154 l3=w3;
1155 wseq=wseq.substr(w5, w3-w5+1);
1156 if (!wqv.is_empty())
1157 wqv=wqv.substr(w5, w3-w5+1);
1158 } //qv trimming
1159 // N-trimming on the remaining read seq
1160 if (ntrim(wseq, w5, w3)) {
1161 //GMessage("before: %s\n",wseq.chars());
1162 //GMessage("after : %s (%d)\n",wseq.substr(w5,w3-w5+1).chars(),w3-w5+1);
1163 l5+=w5;
1164 l3-=(wseq.length()-1-w3);
1165 if (w3-w5+1<min_read_len) {
1166 return 'N'; //to be trashed
1167 }
1168 //-- keep only the w5..w3 range
1169 wseq=wseq.substr(w5, w3-w5+1);
1170 if (!wqv.is_empty())
1171 wqv=wqv.substr(w5, w3-w5+1);
1172 w5=0;
1173 w3=wseq.length()-1;
1174 }
1175 char trim_code;
1176 //clean the more dirty end first - 3'
1177 int prev_w5=0;
1178 int prev_w3=0;
1179 bool w3upd=true;
1180 bool w5upd=true;
1181 do {
1182 trim_code=0;
1183 if (w3upd && trim_poly3(wseq, w5, w3, polyA_seed)) {
1184 trim_code='A';
1185 }
1186 else if (w3upd && trim_poly3(wseq, w5, w3, polyT_seed)) {
1187 trim_code='T';
1188 }
1189 else if (w5upd && trim_poly5(wseq, w5, w3, polyA_seed)) {
1190 trim_code='A';
1191 }
1192 else if (w5upd && trim_poly5(wseq, w5, w3, polyT_seed)) {
1193 trim_code='T';
1194 }
1195 else if (trim_adaptor5(wseq, w5, w3)) {
1196 trim_code='5';
1197 }
1198 else if (trim_adaptor3(wseq, w5, w3)) {
1199 trim_code='3';
1200 }
1201 if (trim_code) {
1202 w3upd=(w3!=prev_w3);
1203 w5upd=(w5!=prev_w5);
1204 if (w3upd) prev_w3=w3;
1205 if (w5upd) prev_w5=w5;
1206 #ifdef GDEBUG
1207 GMessage("#### TRIM by '%c' code ( w5-w3 = %d-%d ):\n",trim_code, w5,w3);
1208 showTrim(wseq, w5, w3);
1209 #endif
1210 int trimlen=wseq.length()-(w3-w5+1);
1211 num_trimmed3++;
1212 if (trimlen<min_trimmed3)
1213 min_trimmed3=trimlen;
1214 l5+=w5;
1215 l3-=(wseq.length()-1-w3);
1216 if (w3-w5+1<min_read_len) {
1217 return trim_code;
1218 }
1219 //-- keep only the w5..w3 range
1220 wseq=wseq.substr(w5, w3-w5+1);
1221 if (!wqv.is_empty())
1222 wqv=wqv.substr(w5, w3-w5+1);
1223 }//trimming at 3' end
1224 } while (trim_code);
1225
1226 if (doCollapse) {
1227 //keep read for later
1228 FqDupRec* dr=dhash.Find(wseq.chars());
1229 if (dr==NULL) { //new entry
1230 //if (prefix.is_empty())
1231 dhash.Add(wseq.chars(),
1232 new FqDupRec(&wqv, rname.chars()));
1233 //else dhash.Add(wseq.chars(), new FqDupRec(wqv.chars(),wqv.length()));
1234 }
1235 else
1236 dr->add(wqv);
1237 } //collapsing duplicates
1238 else { //not collapsing duplicates
1239 //apply the dust filter now
1240 if (doDust) {
1241 int dustbases=dust(wseq);
1242 if (dustbases>(wseq.length()>>1)) {
1243 return 'D';
1244 }
1245 }
1246 } //not collapsing duplicates
1247 return (l5>0 || l3<rseq.length()-1) ? 1 : 0;
1248 }
1249
1250 void printHeader(FILE* f_out, char recmarker, GStr& rname, GStr& rinfo) {
1251 //GMessage("printing Header..%c%s\n",recmarker, rname.chars());
1252 if (rinfo.is_empty()) fprintf(f_out, "%c%s\n",recmarker,rname.chars());
1253 else fprintf(f_out, "%c%s %s\n",recmarker, rname.chars(), rinfo.chars());
1254 }
1255
1256 void writeRead(FILE* f_out, GStr& rname, GStr& rinfo, GStr& rseq, GStr& rqv, int& outcounter) {
1257 outcounter++;
1258 bool asFasta=(rqv.is_empty() || fastaOutput);
1259 if (asFasta) {
1260 if (prefix.is_empty()) {
1261 printHeader(f_out, '>',rname,rinfo);
1262 fprintf(f_out, "%s\n", rseq.chars()); //plain one-line fasta for now
1263 }
1264 else {
1265 fprintf(f_out, ">%s%08d\n%s\n", prefix.chars(), outcounter,
1266 rseq.chars());
1267 }
1268 }
1269 else { //fastq
1270 if (convert_phred) convertPhred(rqv);
1271 if (prefix.is_empty()) {
1272 printHeader(f_out, '@', rname, rinfo);
1273 fprintf(f_out, "%s\n+\n%s\n", rseq.chars(), rqv.chars());
1274 }
1275 else
1276 fprintf(f_out, "@%s_%08d\n%s\n+\n%s\n", prefix.chars(), outcounter,
1277 rseq.chars(),rqv.chars() );
1278 }
1279 }
1280
1281 void trash_report(char trashcode, GStr& rname, FILE* freport) {
1282 if (freport==NULL || trashcode<=' ') return;
1283 if (trashcode=='3' || trashcode=='5') {
1284 fprintf(freport, "%s\ta%c\n",rname.chars(),trashcode);
1285 }
1286 else {
1287 fprintf(freport, "%s\t%c\n",rname.chars(),trashcode);
1288 }
1289 //tcounter++;
1290 }
1291
1292 GStr getFext(GStr& s, int* xpos=NULL) {
1293 //s must be a filename without a path
1294 GStr r("");
1295 if (xpos!=NULL) *xpos=0;
1296 if (s.is_empty() || s=="-") return r;
1297 int p=s.rindex('.');
1298 int d=s.rindex('/');
1299 if (p<=0 || p>s.length()-2 || p<s.length()-7 || p<d) return r;
1300 r=s.substr(p+1);
1301 if (xpos!=NULL) *xpos=p+1;
1302 r.lower();
1303 return r;
1304 }
1305
1306 void baseFileName(GStr& fname) {
1307 //remove all known extensions, like .txt,fq,fastq,fasta,fa)(.gz .gzip .bz2 .bzip2) .
1308 int xpos=0;
1309 GStr fext=getFext(fname, &xpos);
1310 if (xpos<=1) return;
1311 bool extdel=false;
1312 GStr f2;
1313 if (fext=="z" || fext=="zip") {
1314 extdel=true;
1315 }
1316 else if (fext.length()>=2) {
1317 f2=fext.substr(0,2);
1318 extdel=(f2=="gz" || f2=="bz");
1319 }
1320 if (extdel) {
1321 fname.cut(xpos-1);
1322 fext=getFext(fname, &xpos);
1323 if (xpos<=1) return;
1324 }
1325 extdel=false;
1326 if (fext=="f" || fext=="fq" || fext=="txt" || fext=="seq" || fext=="sequence") {
1327 extdel=true;
1328 }
1329 else if (fext.length()>=2) {
1330 extdel=(fext.substr(0,2)=="fa");
1331 }
1332 if (extdel) fname.cut(xpos-1);
1333 GStr fncp(fname);
1334 fncp.lower();
1335 fncp.chomp("seq");
1336 fncp.chomp("sequence");
1337 fncp.trimR("_.");
1338 if (fncp.length()<fname.length()) fname.cut(fncp.length());
1339 }
1340
1341 FILE* prepOutFile(GStr& infname, GStr& pocmd) {
1342 FILE* f_out=NULL;
1343 GStr fname(getFileName(infname.chars()));
1344 //eliminate known extensions
1345 baseFileName(fname);
1346 if (outsuffix.is_empty() || outsuffix=="-") { return stdout; }
1347 else if (pocmd.is_empty()) {
1348 GStr oname(fname);
1349 oname.append('.');
1350 oname.append(outsuffix);
1351 f_out=fopen(oname.chars(),"w");
1352 if (f_out==NULL) GError("Error: cannot create '%s'\n",oname.chars());
1353 }
1354 else {
1355 GStr oname(">");
1356 oname.append(fname);
1357 oname.append('.');
1358 oname.append(outsuffix);
1359 pocmd.append(oname);
1360 f_out=popen(pocmd.chars(), "w");
1361 if (f_out==NULL) GError("Error: cannot popen '%s'\n",pocmd.chars());
1362 }
1363 return f_out;
1364 }
1365
1366 void guess_unzip(GStr& fname, GStr& picmd) {
1367 GStr fext=getFext(fname);
1368 if (fext=="gz" || fext=="gzip" || fext=="z") {
1369 picmd="gzip -cd ";
1370 }
1371 else if (fext=="bz2" || fext=="bzip2" || fext=="bz" || fext=="bzip") {
1372 picmd="bzip2 -cd ";
1373 }
1374 }
1375
1376 void addAdaptor(GVec<CASeqData>& adaptors, GStr& seq, GAlnTrimType trim_type) {
1377 //TODO: prepare CASeqData here, and collect hexamers as well
1378 if (seq.is_empty() || seq=="-" ||
1379 seq=="N/A" || seq==".") return;
1380
1381 CASeqData adata(revCompl);
1382 int idx=adaptors.Add(adata);
1383 if (idx<0) GError("Error: failed to add adaptor!\n");
1384 adaptors[idx].trim_type=trim_type;
1385 adaptors[idx].update(seq.chars());
1386 }
1387
1388
1389 int loadAdaptors(const char* fname) {
1390 GLineReader lr(fname);
1391 char* l;
1392 while ((l=lr.nextLine())!=NULL) {
1393 if (lr.length()<=3 || l[0]=='#') continue;
1394 if ( l[0]==' ' || l[0]=='\t' || l[0]==',' ||
1395 l[0]==';'|| l[0]==':' ) {
1396 int i=1;
1397 while (l[i]!=0 && isspace(l[i])) {
1398 i++;
1399 }
1400 if (l[i]!=0) {
1401 GStr s(&(l[i]));
1402 #ifdef GDEBUG
1403 //s.reverse();
1404 #endif
1405 addAdaptor(adaptors3, s, galn_TrimRight);
1406 continue;
1407 }
1408 }
1409 else {
1410 GStr s(l);
1411 s.startTokenize("\t ;,:");
1412 GStr a5,a3;
1413 if (s.nextToken(a5))
1414 s.nextToken(a3);
1415 else continue; //no tokens on this line
1416 GAlnTrimType ttype5=galn_TrimLeft;
1417 a5.upper();
1418 a3.upper();
1419 if (a3.is_empty() || a3==a5 || a3=="=") {
1420 a3.clear();
1421 ttype5=galn_TrimEither;
1422 }
1423 #ifdef GDEBUG
1424 // a5.reverse();
1425 // a3.reverse();
1426 #endif
1427 addAdaptor(adaptors5, a5, ttype5);
1428 addAdaptor(adaptors3, a3, galn_TrimRight);
1429 }
1430 }
1431 return adaptors5.Count()+adaptors3.Count();
1432 }
1433
1434 void setupFiles(FILE*& f_in, FILE*& f_in2, FILE*& f_out, FILE*& f_out2,
1435 GStr& s, GStr& infname, GStr& infname2) {
1436 // uses outsuffix to generate output file names and open file handles as needed
1437 infname="";
1438 infname2="";
1439 f_in=NULL;
1440 f_in2=NULL;
1441 f_out=NULL;
1442 f_out2=NULL;
1443 //analyze outsuffix intent
1444 GStr pocmd;
1445 if (outsuffix=="-") {
1446 f_out=stdout;
1447 }
1448 else {
1449 GStr ox=getFext(outsuffix);
1450 if (ox.length()>2) ox=ox.substr(0,2);
1451 if (ox=="gz") pocmd="gzip -9 -c ";
1452 else if (ox=="bz") pocmd="bzip2 -9 -c ";
1453 }
1454 if (s=="-") {
1455 f_in=stdin;
1456 infname="stdin";
1457 f_out=prepOutFile(infname, pocmd);
1458 return;
1459 } // streaming from stdin
1460 s.startTokenize(",:");
1461 s.nextToken(infname);
1462 bool paired=s.nextToken(infname2);
1463 if (fileExists(infname.chars())==0)
1464 GError("Error: cannot find file %s!\n",infname.chars());
1465 GStr fname(getFileName(infname.chars()));
1466 GStr picmd;
1467 guess_unzip(fname, picmd);
1468 if (picmd.is_empty()) {
1469 f_in=fopen(infname.chars(), "r");
1470 if (f_in==NULL) GError("Error opening file '%s'!\n",infname.chars());
1471 }
1472 else {
1473 picmd.append(infname);
1474 f_in=popen(picmd.chars(), "r");
1475 if (f_in==NULL) GError("Error at popen %s!\n", picmd.chars());
1476 }
1477 if (f_out==stdout) {
1478 if (paired) GError("Error: output suffix required for paired reads\n");
1479 return;
1480 }
1481 f_out=prepOutFile(infname, pocmd);
1482 if (!paired) return;
1483 if (doCollapse) GError("Error: sorry, -C option cannot be used with paired reads!\n");
1484 // ---- paired reads:-------------
1485 if (fileExists(infname2.chars())==0)
1486 GError("Error: cannot find file %s!\n",infname2.chars());
1487 picmd="";
1488 GStr fname2(getFileName(infname2.chars()));
1489 guess_unzip(fname2, picmd);
1490 if (picmd.is_empty()) {
1491 f_in2=fopen(infname2.chars(), "r");
1492 if (f_in2==NULL) GError("Error opening file '%s'!\n",infname2.chars());
1493 }
1494 else {
1495 picmd.append(infname2);
1496 f_in2=popen(picmd.chars(), "r");
1497 if (f_in2==NULL) GError("Error at popen %s!\n", picmd.chars());
1498 }
1499 f_out2=prepOutFile(infname2, pocmd);
1500 }