ViewVC Help
View File | Revision Log | Show Annotations | View Changeset | Root Listing
root/gclib/fqtrim/fqtrim.cpp
Revision: 94
Committed: Tue Oct 4 21:27:35 2011 UTC (7 years, 11 months ago) by gpertea
File size: 48809 byte(s)
Log Message:
wip

Line File contents
1 #include "GArgs.h"
2 #include "GStr.h"
3 #include "GHash.hh"
4 #include "GList.hh"
5 #include <ctype.h>
6 #include "GAlnExtend.h"
7
8 #define USAGE "Usage:\n\
9 fqtrim [{-5 <5adapter> -3 <3adapter>|-f <adapters_file>}] [-a <min_matchlen>]\\\n\
10 [-q <minq> [-t <trim_max_len>]] [-p {64|33}] [-o <outsuffix>]\\\n\
11 [-l <minlen>] [-C] [-D] [-Q] [-n <rename_prefix>] [-r <discarded.lst>]\\\n\
12 <input.fq>[,<input_mates.fq>\n\
13 \n\
14 Trim low quality bases at the 3' end and can trim adapter sequence(s), filter\n\
15 for low complexity and collapse duplicate reads.\n\
16 If read pairs should be trimmed and kept together (i.e. without discarding\n\
17 one read in a pair), the two file names should be given delimited by a comma\n\
18 or a colon character\n\
19 \n\
20 Options:\n\
21 -n rename all the reads using the <prefix> followed by a read counter;\n\
22 if -C option was given, the suffix \"_x<N>\" is appended, with <N> being\n\
23 the read duplication count\n\
24 -o unless this parameter is '-', write the trimmed/filtered reads to \n\
25 file(s) named <input>.<outsuffix> which will be created in the \n\
26 current (working) directory; (writes to stdout if -o- is given);\n\
27 a suffix ending with .gz, .gzip or .bz2 will enforce compression\n\
28 -f file with adapter sequences to trim, each line having this format:\n\
29 <5'-adapter-sequence> <3'-adapter-sequence>\n\
30 -5 trim the given adapter or primer sequence at the 5' end of each read\n\
31 (e.g. -5 CGACAGGTTCAGAGTTCTACAGTCCGACGATC)\n\
32 -3 trim the given adapter sequence at the 3' end of each read\n\
33 (e.g. -3 TCGTATGCCGTCTTCTGCTTG)\n\
34 -A disable polyA trimming (enabled by default)\n\
35 -T enable polyT trimming (disabled by default)\n\
36 -y minimum length of exact match to adaptor sequence at the proper end (6)\n\
37 -q trim bases with quality value lower than <minq> (starting at the 3' end)\n\
38 -t for -q option, maximum trimming at the 3' end is limited to <trim_max_len>\n\
39 -m maximum percentage of Ns allowed in a read after trimming (default 7)\n\
40 -l minimum \"clean\" length after trimming that a read must have\n\
41 in order to pass the filter (default: 16)\n\
42 -r write a simple \"trash report\" file listing the discarded reads with a\n\
43 one letter code for the reason why they were trashed\n\
44 -D apply a low-complexity (dust) filter and discard any read that has over\n\
45 50% of its length detected as low complexity\n\
46 -C collapse duplicate reads and add a prefix with their count to the read\n\
47 name (e.g. for microRNAs)\n\
48 -p input is phred64/phred33 (use -p64 or -p33)\n\
49 -Q convert quality values to the other Phred qv type\n\
50 -V verbose processing\n\
51 "
52
53 //-z for -o option, the output stream(s) will be first piped into the given\n
54 // <zcmd> command, which must output to stdout (e.g. -z 'bzip2 -9 -c')\n
55
56
57 // example 3' adapter for miRNAs: TCGTATGCCGTCTTCTGCTTG
58
59 //For paired reads sequencing:
60 //3' : ACACTCTTTCCCTACACGACGCTCTTCCGATCT
61 //5' : GATCGGAAGAGCGGTTCAGCAGGAATGCCGAG
62 //FILE* f_out=NULL; //stdout if not provided
63 //FILE* f_out2=NULL; //for paired reads
64 //FILE* f_in=NULL; //input fastq (stdin if not provided)
65 //FILE* f_in2=NULL; //for paired reads
66 FILE* freport=NULL;
67
68 bool debug=false;
69 bool verbose=false;
70 bool doCollapse=false;
71 bool doDust=false;
72 bool fastaOutput=false;
73 bool trashReport=false;
74 //bool rawFormat=false;
75 int min_read_len=16;
76 double max_perc_N=7.0;
77 int dust_cutoff=16;
78 bool isfasta=false;
79 bool convert_phred=false;
80 GStr outsuffix; // -o
81 GStr prefix;
82 GStr zcmd;
83 int num_trimmed5=0;
84 int num_trimmed3=0;
85 int num_trimmedN=0;
86 int num_trimmedQ=0;
87 int min_trimmed5=INT_MAX;
88 int min_trimmed3=INT_MAX;
89
90 int qvtrim_qmin=0;
91 int qvtrim_max=0; //for -q, do not trim at 3'-end more than this number of bases
92 int qv_phredtype=0; // could be 64 or 33 (0 means undetermined yet)
93 int qv_cvtadd=0; //could be -31 or +31
94
95 // adaptor matching metrics -- for X-drop ungapped extension
96 const int poly_m_score=2; //match score
97 const int poly_mis_score=-3; //mismatch
98 const int poly_dropoff_score=7;
99 int poly_minScore=12; //i.e. an exact match of 6 bases at the proper ends WILL be trimmed
100
101 const char *polyA_seed="AAAA";
102 const char *polyT_seed="TTTT";
103
104 struct CAdapters {
105 GStr a5;
106 GStr a3;
107 CAdapters(const char* s5=NULL, const char* s3=NULL):a5(s5),a3(s3) {
108 }
109 };
110
111 GPVec<CAdapters> adapters;
112
113 // element in dhash:
114 class FqDupRec {
115 public:
116 int count; //how many of these reads are the same
117 int len; //length of qv
118 char* firstname; //optional, only if we want to keep the original read names
119 char* qv;
120 FqDupRec(GStr* qstr=NULL, const char* rname=NULL) {
121 len=0;
122 qv=NULL;
123 firstname=NULL;
124 count=0;
125 if (qstr!=NULL) {
126 qv=Gstrdup(qstr->chars());
127 len=qstr->length();
128 count++;
129 }
130 if (rname!=NULL) firstname=Gstrdup(rname);
131 }
132 ~FqDupRec() {
133 GFREE(qv);
134 GFREE(firstname);
135 }
136 void add(GStr& d) { //collapse another record into this one
137 if (d.length()!=len)
138 GError("Error at FqDupRec::add(): cannot collapse reads with different length!\n");
139 count++;
140 for (int i=0;i<len;i++)
141 qv[i]=(qv[i]+d[i])/2;
142 }
143 };
144
145 void openfw(FILE* &f, GArgs& args, char opt) {
146 GStr s=args.getOpt(opt);
147 if (!s.is_empty()) {
148 if (s=='-') f=stdout;
149 else {
150 f=fopen(s.chars(),"w");
151 if (f==NULL) GError("Error creating file: %s\n", s.chars());
152 }
153 }
154 }
155
156 #define FWCLOSE(fh) if (fh!=NULL && fh!=stdout) fclose(fh)
157 #define FRCLOSE(fh) if (fh!=NULL && fh!=stdin) fclose(fh)
158
159 GHash<FqDupRec> dhash; //hash to keep track of duplicates
160
161 int loadAdapters(const char* fname);
162
163 void setupFiles(FILE*& f_in, FILE*& f_in2, FILE*& f_out, FILE*& f_out2,
164 GStr& s, GStr& infname, GStr& infname2);
165 // uses outsuffix to generate output file names and open file handles as needed
166
167 void writeRead(FILE* f_out, GStr& rname, GStr& rinfo, GStr& rseq, GStr& rqv, int& outcounter);
168 void trash_report(char trashcode, GStr& rname, FILE* freport);
169
170 bool getFastxRec(GLineReader& fq, GStr& rseq, GStr& rqv,
171 GStr& rname, GStr& rinfo, GStr& infname);
172
173 char process_read(GStr& rname, GStr& rseq, GStr& rqv, int &l5, int &l3);
174 //returns 0 if the read was untouched, 1 if it was trimmed and a trash code if it was trashed
175
176 bool ntrim(GStr& rseq, int &l5, int &l3); //returns true if any trimming occured
177 bool qtrim(GStr& qvs, int &l5, int &l3); //return true if any trimming occured
178 int dust(GStr& seq);
179 bool trim_poly5(GStr &seq, int &l5, int &l3, const char* poly_seed); //returns true if any trimming occured
180 bool trim_poly3(GStr &seq, int &l5, int &l3, const char* poly_seed);
181 bool trim_adapter5(GStr& seq, int &l5, int &l3); //returns true if any trimming occured
182 bool trim_adapter3(GStr& seq, int &l5, int &l3);
183
184 void convertPhred(char* q, int len);
185 void convertPhred(GStr& q);
186
187 int main(int argc, char * const argv[]) {
188 GArgs args(argc, argv, "YQDCVl:d:3:5:m:n:r:p:q:f:t:o:z:a:");
189 int e;
190 if ((e=args.isError())>0) {
191 GMessage("%s\nInvalid argument: %s\n", USAGE, argv[e]);
192 exit(224);
193 }
194 debug=(args.getOpt('Y')!=NULL);
195 verbose=(args.getOpt('V')!=NULL);
196 convert_phred=(args.getOpt('Q')!=NULL);
197 doCollapse=(args.getOpt('C')!=NULL);
198 doDust=(args.getOpt('D')!=NULL);
199 /*
200 rawFormat=(args.getOpt('R')!=NULL);
201 if (rawFormat) {
202 GError("Sorry, raw qseq format parsing is not implemented yet!\n");
203 }
204 */
205 prefix=args.getOpt('n');
206 GStr s=args.getOpt('l');
207 if (!s.is_empty())
208 min_read_len=s.asInt();
209 s=args.getOpt('m');
210 if (!s.is_empty())
211 max_perc_N=s.asDouble();
212 s=args.getOpt('d');
213 if (!s.is_empty()) {
214 dust_cutoff=s.asInt();
215 doDust=true;
216 }
217 s=args.getOpt('q');
218 if (!s.is_empty()) {
219 qvtrim_qmin=s.asInt();
220 }
221 s=args.getOpt('t');
222 if (!s.is_empty()) {
223 qvtrim_max=s.asInt();
224 }
225 s=args.getOpt('p');
226 if (!s.is_empty()) {
227 int v=s.asInt();
228 if (v==33) {
229 qv_phredtype=33;
230 qv_cvtadd=31;
231 }
232 else if (v==64) {
233 qv_phredtype=64;
234 qv_cvtadd=-31;
235 }
236 else
237 GMessage("%s\nInvalid value for -p option (can only be 64 or 33)!\n",USAGE);
238 }
239 s=args.getOpt('f');
240 if (!s.is_empty()) {
241 loadAdapters(s.chars());
242 }
243 bool fileAdapters=adapters.Count();
244 s=args.getOpt('5');
245 if (!s.is_empty()) {
246 if (fileAdapters)
247 GError("Error: options -5 and -f cannot be used together!\n");
248 s.upper();
249 adapters.Add(new CAdapters(s.chars()));
250 }
251 s=args.getOpt('3');
252 if (!s.is_empty()) {
253 if (fileAdapters)
254 GError("Error: options -3 and -f cannot be used together!\n");
255 s.upper();
256 if (adapters.Count()>0)
257 adapters[0]->a3=s.chars();
258 else adapters.Add(NULL, new CAdapters(s.chars()));
259 }
260 s=args.getOpt('y');
261 if (!s.is_empty()) {
262 int minmatch=s.asInt();
263 poly_minScore=minmatch*poly_m_score;
264 }
265
266 if (args.getOpt('o')!=NULL) outsuffix=args.getOpt('o');
267 else outsuffix="-";
268 trashReport= (args.getOpt('r')!=NULL);
269 int fcount=args.startNonOpt();
270 if (fcount==0) {
271 GMessage(USAGE);
272 exit(224);
273 }
274 if (fcount>1 && doCollapse) {
275 GError("%s Sorry, the -C option only works with a single input.\n", USAGE);
276 }
277 //openfw(f_out, args, 'o');
278 //if (f_out==NULL) f_out=stdout;
279 if (trashReport)
280 openfw(freport, args, 'r');
281 char* infile=NULL;
282 while ((infile=args.nextNonOpt())!=NULL) {
283 int incounter=0; //counter for input reads
284 int outcounter=0; //counter for output reads
285 int trash_s=0; //too short from the get go
286 int trash_Q=0;
287 int trash_N=0;
288 int trash_D=0;
289 int trash_A3=0;
290 int trash_A5=0;
291 s=infile;
292 GStr infname;
293 GStr infname2;
294 FILE* f_in=NULL;
295 FILE* f_in2=NULL;
296 FILE* f_out=NULL;
297 FILE* f_out2=NULL;
298 bool paired_reads=false;
299 setupFiles(f_in, f_in2, f_out, f_out2, s, infname, infname2);
300 GLineReader fq(f_in);
301 GLineReader* fq2=NULL;
302 if (f_in2!=NULL) {
303 fq2=new GLineReader(f_in2);
304 paired_reads=true;
305 }
306 GStr rseq, rqv, seqid, seqinfo;
307 GStr rseq2, rqv2, seqid2, seqinfo2;
308 while (getFastxRec(fq, rseq, rqv, seqid, seqinfo, infname)) {
309 incounter++;
310 int a5=0, a3=0, b5=0, b3=0;
311 char tcode=0, tcode2=0;
312 tcode=process_read(seqid, rseq, rqv, a5, a3);
313 //if (!doCollapse) {
314 if (fq2!=NULL) {
315 getFastxRec(*fq2, rseq2, rqv2, seqid2, seqinfo2, infname2);
316 if (seqid.substr(0,seqid.length()-1)!=seqid2.substr(0,seqid2.length()-1)) {
317 GError("Error: no paired match for read %s vs %s (%s,%s)\n",
318 seqid.chars(), seqid2.chars(), infname.chars(), infname2.chars());
319 }
320 tcode2=process_read(seqid2, rseq2, rqv2, b5, b3);
321 //decide what to do with this pair and print rseq2 if the pair makes it
322 if (tcode>1 && tcode2<=1) {
323 //"untrash" rseq
324 if (a3-a5+1<min_read_len) {
325 a5=1;
326 if (a3<min_read_len) { a3= GMIN(rseq.length()-1, min_read_len+1); }
327 }
328 tcode=1;
329 }
330 else if (tcode<=1 && tcode2>1) {
331 //"untrash" rseq2
332 if (b3-b5+1<min_read_len) {
333 b5=1;
334 if (b3<min_read_len) { b3= GMIN((rseq2.length()-1),(min_read_len+1)); }
335 }
336 tcode2=1;
337 }
338 if (tcode<=1) { //trimmed or left intact -- write it!
339 if (tcode>0) {
340 rseq2=rseq2.substr(b5,b3-b5+1);
341 if (!rqv2.is_empty()) rqv2=rqv2.substr(b5,b3-b5+1);
342 }
343 int nocounter=0;
344 writeRead(f_out2, seqid2, seqinfo2, rseq2, rqv2, nocounter);
345 }
346 } //paired read
347 // }
348 if (tcode>1) { //trashed
349 if (tcode=='s') trash_s++;
350 else if (tcode=='Q') trash_Q++;
351 else if (tcode=='N') trash_N++;
352 else if (tcode=='D') trash_D++;
353 else if (tcode=='3') trash_A3++;
354 else if (tcode=='5') trash_A5++;
355 if (trashReport) trash_report(tcode, seqid, freport);
356 }
357 else if (!doCollapse) { //write it
358 if (tcode>0) {
359 rseq=rseq.substr(a5,a3-a5+1);
360 if (!rqv.is_empty()) rqv=rqv.substr(a5,a3-a5+1);
361 }
362 writeRead(f_out, seqid, seqinfo, rseq, rqv, outcounter);
363 }
364 } //for each fastq record
365 delete fq2;
366 FRCLOSE(f_in);
367 FRCLOSE(f_in2);
368 if (doCollapse) {
369 outcounter=0;
370 int maxdup_count=1;
371 char* maxdup_seq=NULL;
372 dhash.startIterate();
373 FqDupRec* qd=NULL;
374 char* seq=NULL;
375 while ((qd=dhash.NextData(seq))!=NULL) {
376 GStr rseq(seq);
377 //do the dusting here
378 if (doDust) {
379 int dustbases=dust(rseq);
380 if (dustbases>(rseq.length()>>1)) {
381 if (trashReport && qd->firstname!=NULL) {
382 fprintf(freport, "%s_x%d\tD\n",qd->firstname, qd->count);
383 }
384 trash_D+=qd->count;
385 continue;
386 }
387 }
388 outcounter++;
389 if (qd->count>maxdup_count) {
390 maxdup_count=qd->count;
391 maxdup_seq=seq;
392 }
393 if (isfasta) {
394 if (prefix.is_empty()) {
395 fprintf(f_out, ">%s_x%d\n%s\n", qd->firstname, qd->count,
396 rseq.chars());
397 }
398 else { //use custom read name
399 fprintf(f_out, ">%s%08d_x%d\n%s\n", prefix.chars(), outcounter,
400 qd->count, rseq.chars());
401 }
402 }
403 else { //fastq format
404 if (convert_phred) convertPhred(qd->qv, qd->len);
405 if (prefix.is_empty()) {
406 fprintf(f_out, "@%s_x%d\n%s\n+\n%s\n", qd->firstname, qd->count,
407 rseq.chars(), qd->qv);
408 }
409 else { //use custom read name
410 fprintf(f_out, "@%s%08d_x%d\n%s\n+\n%s\n", prefix.chars(), outcounter,
411 qd->count, rseq.chars(), qd->qv);
412 }
413 }
414 }//for each element of dhash
415 if (maxdup_count>1) {
416 GMessage("Maximum read multiplicity: x %d (read: %s)\n",maxdup_count, maxdup_seq);
417 }
418 } //collapse entries
419 if (verbose) {
420 if (paired_reads) {
421 GMessage(">Input files : %s , %s\n", infname.chars(), infname2.chars());
422 GMessage("Number of input pairs :%9d\n", incounter);
423 GMessage(" Output pairs :%9d\n", outcounter);
424 }
425 else {
426 GMessage(">Input file : %s\n", infname.chars());
427 GMessage("Number of input reads :%9d\n", incounter);
428 GMessage(" Output reads :%9d\n", outcounter);
429 }
430 GMessage("------------------------------------\n");
431 if (num_trimmed5)
432 GMessage(" 5' trimmed :%9d (min. trim: %d)\n", num_trimmed5, min_trimmed5);
433 if (num_trimmed3)
434 GMessage(" 3' trimmed :%9d (min. trim: %d)\n", num_trimmed3, min_trimmed3);
435 GMessage("------------------------------------\n");
436 if (trash_s>0)
437 GMessage(" Trashed by length:%9d\n", trash_s);
438 if (trash_N>0)
439 GMessage(" Trashed by N%%:%9d\n", trash_N);
440 if (trash_Q>0)
441 GMessage("Trashed by low quality:%9d\n", trash_Q);
442 if (trash_A5>0)
443 GMessage(" Trashed by 5' adapter:%9d\n", trash_A5);
444 if (trash_A3>0)
445 GMessage(" Trashed by 3' adapter:%9d\n", trash_A3);
446 }
447 if (trashReport) {
448 FWCLOSE(freport);
449 }
450 FWCLOSE(f_out);
451 FWCLOSE(f_out2);
452 } //while each input file
453
454 //getc(stdin);
455 }
456
457 class NData {
458 public:
459 int NPos[1024]; //there should be no reads longer than 1K ?
460 int NCount;
461 int end5;
462 int end3;
463 int seqlen;
464 double perc_N; //percentage of Ns in end5..end3 range only!
465 const char* seq;
466 bool valid;
467 NData() {
468 NCount=0;
469 end5=0;
470 end3=0;
471 seq=NULL;
472 perc_N=0;
473 valid=true;
474 }
475 void init(GStr& rseq) {
476 NCount=0;
477 perc_N=0;
478 seqlen=rseq.length();
479 seq=rseq.chars();
480 end5=1;
481 end3=seqlen;
482 for (int i=0;i<seqlen;i++)
483 if (seq[i]=='N') {// if (!ichrInStr(rseq[i], "ACGT")
484 NPos[NCount]=i;
485 NCount++;
486 }
487 perc_N=(NCount*100.0)/seqlen;
488 valid=true;
489 }
490 void N_calc() { //only in the region after trimming
491 int n=0;
492 for (int i=end3-1;i<end5;i++) {
493 if (seq[i]=='N') n++;
494 }
495 perc_N=(n*100.0)/(end5-end3+1);
496 }
497 };
498
499 static NData feat;
500 int perc_lenN=12; // incremental distance from ends, in percentage of
501 // sequence length, where N-trimming is done (default:12 %) (autolimited to 20)
502
503 void N_analyze(int l5, int l3, int p5, int p3) {
504 /* assumes feat was filled properly */
505 int old_dif, t5,t3,v;
506 if (l3<l5+2 || p5>p3 ) {
507 feat.end5=l5+1;
508 feat.end3=l3+1;
509 return;
510 }
511
512 t5=feat.NPos[p5]-l5;
513 t3=l3-feat.NPos[p3];
514 old_dif=p3-p5;
515 v=(int)((((double)(l3-l5))*perc_lenN)/100);
516 if (v>20) v=20; /* enforce N-search limit for very long reads */
517 if (t5 < v ) {
518 l5=feat.NPos[p5]+1;
519 p5++;
520 }
521 if (t3 < v) {
522 l3=feat.NPos[p3]-1;
523 p3--;
524 }
525 /* restNs=p3-p5; number of Ns in the new CLR */
526 if (p3-p5==old_dif) { /* no change, return */
527 /* don't trim if this may shorten the read too much */
528 //if (l5-l3<min_read_len) return;
529 feat.end5=l5+1;
530 feat.end3=l3+1;
531 return;
532 }
533 else
534 N_analyze(l5,l3, p5,p3);
535 }
536
537
538 bool qtrim(GStr& qvs, int &l5, int &l3) {
539 if (qvtrim_qmin==0 || qvs.is_empty()) return false;
540 if (qv_phredtype==0) {
541 //try to guess the Phred type
542 int vmin=256, vmax=0;
543 for (int i=0;i<qvs.length();i++) {
544 if (vmin>qvs[i]) vmin=qvs[i];
545 if (vmax<qvs[i]) vmax=qvs[i];
546 }
547 if (vmin<64) { qv_phredtype=33; qv_cvtadd=31; }
548 if (vmax>95) { qv_phredtype=64; qv_cvtadd=-31; }
549 if (qv_phredtype==0) {
550 GError("Error: couldn't determine Phred type, please use the -p33 or -p64 !\n");
551 }
552 if (verbose)
553 GMessage("Input reads have Phred-%d quality values.\n", (qv_phredtype==33 ? 33 : 64));
554 } //guessing Phred type
555 for (l3=qvs.length()-1;l3>2;l3--) {
556 if (qvs[l3]-qv_phredtype>=qvtrim_qmin && qvs[l3-1]-qv_phredtype>=qvtrim_qmin) break;
557 }
558 //just in case, check also the 5' the end (?)
559 for (l5=0;l5<qvs.length()-3;l5++) {
560 if (qvs[l5]-qv_phredtype>=qvtrim_qmin && qvs[l5+1]-qv_phredtype>=qvtrim_qmin) break;
561 }
562 if (qvtrim_max>0) {
563 if (qvs.length()-1-l3>qvtrim_max) l3=qvs.length()-1-qvtrim_max;
564 if (l5>qvtrim_max) l5=qvtrim_max;
565 }
566 return (l5>0 || l3<qvs.length()-1);
567 }
568
569 bool ntrim(GStr& rseq, int &l5, int &l3) {
570 //count Ns in the sequence, trim N-rich ends
571 feat.init(rseq);
572 l5=feat.end5-1;
573 l3=feat.end3-1;
574 N_analyze(feat.end5-1, feat.end3-1, 0, feat.NCount-1);
575 if (l5==feat.end5-1 && l3==feat.end3-1) {
576 if (feat.perc_N>max_perc_N) {
577 feat.valid=false;
578 l3=l5+1;
579 return true;
580 }
581 else {
582 return false; //nothing changed
583 }
584 }
585 l5=feat.end5-1;
586 l3=feat.end3-1;
587 if (l3-l5+1<min_read_len) {
588 feat.valid=false;
589 return true;
590 }
591 feat.N_calc();
592
593 if (feat.perc_N>max_perc_N) {
594 feat.valid=false;
595 l3=l5+1;
596 return true;
597 }
598 return true;
599 }
600
601 //--------------- dust functions ----------------
602 class DNADuster {
603 public:
604 int dustword;
605 int dustwindow;
606 int dustwindow2;
607 int dustcutoff;
608 int mv, iv, jv;
609 int counts[32*32*32];
610 int iis[32*32*32];
611 DNADuster(int cutoff=16, int winsize=32, int wordsize=3) {
612 dustword=wordsize;
613 dustwindow=winsize;
614 dustwindow2 = (winsize>>1);
615 dustcutoff=cutoff;
616 mv=0;
617 iv=0;
618 jv=0;
619 }
620 void setWindowSize(int value) {
621 dustwindow = value;
622 dustwindow2 = (dustwindow >> 1);
623 }
624 void setWordSize(int value) {
625 dustword=value;
626 }
627 void wo1(int len, const char* s, int ivv) {
628 int i, ii, j, v, t, n, n1, sum;
629 int js, nis;
630 n = 32 * 32 * 32;
631 n1 = n - 1;
632 nis = 0;
633 i = 0;
634 ii = 0;
635 sum = 0;
636 v = 0;
637 for (j=0; j < len; j++, s++) {
638 ii <<= 5;
639 if (*s<=32) {
640 i=0;
641 continue;
642 }
643 ii |= *s - 'A'; //assume uppercase!
644 ii &= n1;
645 i++;
646 if (i >= dustword) {
647 for (js=0; js < nis && iis[js] != ii; js++) ;
648 if (js == nis) {
649 iis[nis] = ii;
650 counts[ii] = 0;
651 nis++;
652 }
653 if ((t = counts[ii]) > 0) {
654 sum += t;
655 v = 10 * sum / j;
656 if (mv < v) {
657 mv = v;
658 iv = ivv;
659 jv = j;
660 }
661 }
662 counts[ii]++;
663 }
664 }
665 }
666
667 int wo(int len, const char* s, int* beg, int* end) {
668 int i, l1;
669 l1 = len - dustword + 1;
670 if (l1 < 0) {
671 *beg = 0;
672 *end = len - 1;
673 return 0;
674 }
675 mv = 0;
676 iv = 0;
677 jv = 0;
678 for (i=0; i < l1; i++) {
679 wo1(len-i, s+i, i);
680 }
681 *beg = iv;
682 *end = iv + jv;
683 return mv;
684 }
685
686 void dust(const char* seq, char* seqmsk, int seqlen, int cutoff=0) { //, maskFunc maskfn) {
687 int i, j, l, a, b, v;
688 if (cutoff==0) cutoff=dustcutoff;
689 a=0;b=0;
690 //GMessage("Dust cutoff=%d\n", cutoff);
691 for (i=0; i < seqlen; i += dustwindow2) {
692 l = (seqlen > i+dustwindow) ? dustwindow : seqlen-i;
693 v = wo(l, seq+i, &a, &b);
694 if (v > cutoff) {
695 //for (j = a; j <= b && j < dustwindow2; j++) {
696 for (j = a; j <= b; j++) {
697 seqmsk[i+j]='N';//could be made lowercase instead
698 }
699 }
700 }
701 //return first;
702 }
703 };
704
705 static DNADuster duster;
706
707 int dust(GStr& rseq) {
708 char* seq=Gstrdup(rseq.chars());
709 duster.dust(rseq.chars(), seq, rseq.length(), dust_cutoff);
710 //check the number of Ns:
711 int ncount=0;
712 for (int i=0;i<rseq.length();i++) {
713 if (seq[i]=='N') ncount++;
714 }
715 GFREE(seq);
716 return ncount;
717 }
718
719 int get3mer_value(const char* s) {
720 return (s[0]<<16)+(s[1]<<8)+s[2];
721 }
722
723 int w3_match(int qv, const char* str, int slen, int start_index=0) {
724 if (start_index>=slen || start_index<0) return -1;
725 for (int i=start_index;i<slen-3;i++) {
726 int rv=get3mer_value(str+i);
727 if (rv==qv) return i;
728 }
729 return -1;
730 }
731
732 int w3_rmatch(int qv, const char* str, int slen, int end_index=-1) {
733 if (end_index>=slen) return -1;
734 if (end_index<0) end_index=slen-1;
735 for (int i=end_index-2;i>=0;i--) {
736 int rv=get3mer_value(str+i);
737 if (rv==qv) return i;
738 }
739 return -1;
740 }
741
742 struct SLocScore {
743 int pos;
744 int score;
745 SLocScore(int p=0,int s=0) {
746 pos=p;
747 score=s;
748 }
749 void set(int p, int s) {
750 pos=p;
751 score=s;
752 }
753 void add(int p, int add) {
754 pos=p;
755 score+=add;
756 }
757 };
758
759 bool trim_poly3(GStr &seq, int &l5, int &l3, const char* poly_seed) {
760 int rlen=seq.length();
761 l5=0;
762 l3=rlen-1;
763 int32 seedVal=*(int32*)poly_seed;
764 char polyChar=poly_seed[0];
765 //assumes N trimming was already done
766 //so a poly match should be very close to the end of the read
767 // -- find the initial match (seed)
768 int lmin=GMAX((rlen-12), 0);
769 int li;
770 for (li=rlen-4;li>lmin;li--) {
771 if (seedVal==*(int*)&(seq[li])) {
772 break;
773 }
774 }
775 if (li<=lmin) return false;
776 //seed found, try to extend it both ways
777 //extend right
778 int ri=li+3;
779 SLocScore loc(ri, poly_m_score<<2);
780 SLocScore maxloc(loc);
781 //extend right
782 while (ri<rlen-2) {
783 ri++;
784 if (seq[ri]==polyChar) {
785 loc.add(ri,poly_m_score);
786 }
787 else if (seq[ri]=='N') {
788 loc.add(ri,0);
789 }
790 else { //mismatch
791 loc.add(ri,poly_mis_score);
792 if (maxloc.score-loc.score>poly_dropoff_score) break;
793 }
794 if (maxloc.score<=loc.score) {
795 maxloc=loc;
796 }
797 }
798 ri=maxloc.pos;
799 if (ri<rlen-3) return false; //no trimming wanted, too far from 3' end
800 //ri = right boundary for the poly match
801 //extend left
802 loc.set(li, maxloc.score);
803 maxloc.pos=li;
804 while (li>0) {
805 li--;
806 if (seq[li]==polyChar) {
807 loc.add(li,poly_m_score);
808 }
809 else if (seq[li]=='N') {
810 loc.add(li,0);
811 }
812 else { //mismatch
813 loc.add(li,poly_mis_score);
814 if (maxloc.score-loc.score>poly_dropoff_score) break;
815 }
816 if (maxloc.score<=loc.score) {
817 maxloc=loc;
818 }
819 }
820 if (maxloc.score>poly_minScore && ri>=rlen-3) {
821 l5=li;
822 l3=ri;
823 return true;
824 }
825 return false;
826 }
827
828
829 bool trim_poly5(GStr &seq, int &l5, int &l3, const char* poly_seed) {
830 int rlen=seq.length();
831 l5=0;
832 l3=rlen-1;
833 int32 seedVal=*(int32*)poly_seed;
834 char polyChar=poly_seed[0];
835 //assumes N trimming was already done
836 //so a poly match should be very close to the end of the read
837 // -- find the initial match (seed)
838 int lmax=GMIN(8, rlen-4);//how far from 5' end to look for 4-mer seeds
839 int li;
840 for (li=0;li<=lmax;li++) {
841 if (seedVal==*(int*)&(seq[li])) {
842 break;
843 }
844 }
845 if (li>lmax) return false;
846 //seed found, try to extend it both ways
847 //extend left
848 int ri=li+3; //save rightmost base of the seed
849 SLocScore loc(li, poly_m_score<<2);
850 SLocScore maxloc(loc);
851 while (li>0) {
852 li--;
853 if (seq[li]==polyChar) {
854 loc.add(li,poly_m_score);
855 }
856 else if (seq[li]=='N') {
857 loc.add(li,0);
858 }
859 else { //mismatch
860 loc.add(li,poly_mis_score);
861 if (maxloc.score-loc.score>poly_dropoff_score) break;
862 }
863 if (maxloc.score<=loc.score) {
864 maxloc=loc;
865 }
866 }
867 li=maxloc.pos;
868 if (li>3) return false; //no trimming wanted, too far from 5' end
869 //li = right boundary for the poly match
870
871 //extend right
872 loc.set(ri, maxloc.score);
873 maxloc.pos=ri;
874 while (ri<rlen-2) {
875 ri++;
876 if (seq[ri]==polyChar) {
877 loc.add(ri,poly_m_score);
878 }
879 else if (seq[ri]=='N') {
880 loc.add(ri,0);
881 }
882 else { //mismatch
883 loc.add(ri,poly_mis_score);
884 if (maxloc.score-loc.score>poly_dropoff_score) break;
885 }
886 if (maxloc.score<=loc.score) {
887 maxloc=loc;
888 }
889 }
890
891 if (maxloc.score>poly_minScore && li<=3) {
892 l5=li;
893 l3=ri;
894 return true;
895 }
896 return false;
897 }
898
899 bool trim_adapter3(GStr& seq, int&l5, int &l3) {
900 int rlen=seq.length();
901 l5=0;
902 l3=rlen-1;
903 //first try a full match, we might get lucky
904 int fi=-1;
905 if ((fi=seq.index(adapter3))>=0) {
906 if (fi<rlen-fi-a3len) {//match is closer to the right end
907 l5=fi+a3len;
908 l3=rlen-1;
909 }
910 else {
911 l5=0;
912 l3=fi-1;
913 }
914 return true;
915 }
916 #ifdef DEBUG
917 if (debug) GMessage(">TRIM3 >> Read: %s\n",seq.chars());
918 #endif
919
920 //also, for fast detection of other adapter-only reads that start past
921 // the beginning of the adapter sequence, try to see if the first a3len-4
922 // bases of the read are a substring of the adapter
923 if (rlen>a3len-3) {
924 GStr rstart=seq.substr(1,a3len-4);
925 if ((fi=adapter3.index(rstart))>=0) {
926 l3=rlen-1;
927 l5=a3len-4;
928 while (fi+l5<a3len && l5<l3 && adapter3[fi+l5]==seq[l5]) l5++;
929 return true;
930 }
931 }
932 CSegChain a3segs; //no chains here, just an ordered collection of segment pairs
933 //check the easy cases - 11 bases exact match at the end
934 int fdlen=11;
935 if (a3len<16) {
936 fdlen=a3len>>1;
937 }
938 if (fdlen>4) {
939 //check if we're lucky enough to have the last 11 bases of the read a part of the adapter
940 GStr rstart=seq.substr(-fdlen-3,fdlen);
941 if ((fi=adapter3.index(rstart))>=0) {
942 #ifdef DEBUG
943 if (debug) GMessage(" W11match found: %*s\n", rlen-3, (adapter3.substr(fi,fdlen)).chars());
944 #endif
945 if (extendMatch(seq.chars(), rlen, rlen-fdlen-3,
946 adapter3.chars(), a3len, fi, fdlen, l5,l3, a3segs))
947 return true;
948 }
949 //another easy case: first 11 characters of the adaptor found as a substring of the read
950 GStr bstr=adapter3.substr(0, fdlen);
951 if ((fi=seq.rindex(bstr))>=0) {
952 #ifdef DEBUG
953 if (debug) GMessage(" A11match found: %*s\n", fi+fdlen, bstr.chars());
954 #endif
955 if (extendMatch(seq.chars(), rlen, fi,
956 adapter3.chars(), a3len, 0, fdlen, l5,l3, a3segs))
957 return true;
958 }
959 } //tried to match 11 bases first
960
961 //no easy cases, so let's do the wmer hashing for the first 12 bases of the adaptor
962 //-- only extend if the match is in the 3' (ending) region of the read
963 int wordSize=3;
964 int hlen=12;
965 if (hlen>a3len-wordSize) hlen=a3len-wordSize;
966 int imin=rlen>>1; //last half of the read, left boundary for the wmer match
967 if (imin<a3len) { imin=GMIN(a3len, rlen-wordSize); }
968 imin=rlen-imin;
969 for (int iw=0;iw<hlen;iw++) {
970 //int32* qv=(int32*)(adapter3.chars()+iw);
971 int qv=get3mer_value(adapter3.chars()+iw);
972 fi=-1;
973 //while ((fi=fast4rmatch(*qv, seq.chars(), rlen, fi))>=0 && fi>=imin) {
974 while ((fi=w3_rmatch(qv, seq.chars(), rlen, fi))>=0 && fi>=imin) {
975 //GMessage(" ... fi=%d after w3_rmatch() (imin=%d)\n", fi, imin);
976
977 #ifdef DEBUG
978 if (debug) GMessage(" Wmatch found: %*s\n", fi+wordSize, (adapter3.substr(iw,wordSize)).chars());
979 #endif
980 if (extendMatch(seq.chars(), rlen, fi, adapter3.chars(),
981 a3len, iw, wordSize, l5,l3, a3segs)) return true;
982 fi--;
983 if (fi<imin) break;
984 }
985 } //for each wmer in the first hlen bases of the adaptor
986 /*
987 //couldn't find a good trimming extension, hash 12 more bases of the adapter to collect more segment pairs there
988 //but only do this if we already have segment pairs collected in the last 12 bases of the adapter
989 if (a3segs.bstart>3 || a3segs.bend<(uint)(hlen-wordSize)) return false;
990 int hlen2=a3len-wordSize;
991 //if (hlen2>a3len-4) hlen2=a3len-4;
992 if (hlen2>hlen) {
993 #ifdef DEBUG
994 if (debug && a3segs.Count()>0) {
995 GMessage(" >>>>>2nd. hash: %s\n",seq.chars());
996 }
997 #endif
998 for (int iw=hlen;iw<hlen2;iw++) {
999 //int* qv=(int32 *)(adapter3.chars()+iw);
1000 int qv=get3mer_value(adapter3.chars()+iw);
1001 fi=-1;
1002 //while ((fi=fast4rmatch(*qv, seq.chars(), rlen, fi))>=0 && fi>=imin) {
1003 while ((fi=w3_rmatch(qv, seq.chars(), rlen, fi))>=0 && fi>=imin) {
1004 extendMatch(seq.chars(), rlen, fi, adapter3.chars(),
1005 a3len, iw, wordSize, l5,l3, a3segs);
1006 fi--;
1007 if (fi<imin) break;
1008 }
1009 } //for each wmer between hlen2 and hlen bases of the adaptor
1010 }
1011 //lastly, analyze collected a3segs for a possible gapped alignment:
1012 GList<CSegChain> segchains(false,true,false);
1013 #ifdef DEBUG
1014 if (debug && a3segs.Count()>0) {
1015 GMessage(">>>>>>>>> Read: %s\n",seq.chars());
1016 }
1017 #endif
1018 for (int i=0;i<a3segs.Count();i++) {
1019 if (a3segs[i]->chain==NULL) {
1020 if (a3segs[i]->b.start>3) continue; //don't start a hopeless chain
1021 CSegChain* newchain=new CSegChain();
1022 newchain->setFreeItem(false);
1023 newchain->addSegPair(a3segs[i]);
1024 a3segs[i]->chain=newchain;
1025 segchains.Add(newchain); //just to free them when done
1026 }
1027 for (int j=i+1;j<a3segs.Count();j++) {
1028 CSegChain* chain=a3segs[i]->chain;
1029 if (chain->extendChain(a3segs[j])) {
1030 a3segs[j]->chain=chain;
1031 #ifdef DEBUG
1032 if (debug) dbgPrintChain(*chain, adapter3.chars());
1033 #endif
1034 //save time by checking here if the extended chain is already acceptable for trimming
1035 if (chain->aend>(uint)(rlen-4) && chain->bstart<4 && chain->score>a_min_chain_score) {
1036 l5=0;
1037 l3=chain->astart-2;
1038 #ifdef DEBUG
1039 if (debug && a3segs.Count()>0) {
1040 GMessage(">>> >> trimmed-3: %*s\n",l3-l5+1,seq.substr(l5,l3-l5+1).chars());
1041 }
1042 #endif
1043 return true;
1044 }
1045 } //chain can be extended
1046 }
1047 } //collect segment alignments into chains
1048 */
1049 return false; //no adapter parts found
1050 }
1051
1052 bool trim_adapter5(GStr& seq, int&l5, int &l3) {
1053 //if (debug) GMessage("trim_adapter5 on: %s\n", seq.chars());
1054 int rlen=seq.length();
1055 l5=0;
1056 l3=rlen-1;
1057 //try to see if adapter is fully included in the read
1058 int fi=-1;
1059 if ((fi=seq.index(adapter5))>=0) {
1060 if (fi<rlen-fi-a5len) {//match is closer to the right end
1061 l5=fi+a5len;
1062 l3=rlen-1;
1063 }
1064 else {
1065 l5=0;
1066 l3=fi-1;
1067 }
1068 return true;
1069 }
1070 #ifdef DEBUG
1071 if (debug) GMessage(">TRIM5 >> Read: %s\n",seq.chars());
1072 #endif
1073
1074 CSegChain a5segs(true); //list of segment pairs to analyze later if no extendMatch succeeded
1075
1076 //try the easy way out first - look for an exact match of 11 bases
1077 int fdlen=11;
1078 if (a5len<16) {
1079 fdlen=a5len>>1;
1080 }
1081 if (fdlen>4) {
1082 GStr rstart=seq.substr(1,fdlen); //skip the first base as it's sometimes bogus
1083 if ((fi=adapter5.index(rstart))>=0) {
1084 #ifdef DEBUG
1085 if (debug) GMessage(" W11match found: %*s\n", 1+fdlen, (adapter3.substr(fi,fdlen)).chars());
1086 #endif
1087 if (extendMatch(seq.chars(), rlen, 1,
1088 adapter5.chars(), a5len, fi, fdlen, l5,l3, a5segs, true))
1089 return true;
1090 }
1091 //another easy case: last 11 characters of the adaptor found as a substring of the read
1092 GStr bstr=adapter5.substr(-fdlen);
1093 if ((fi=seq.index(bstr))>=0) {
1094 #ifdef DEBUG
1095 if (debug) GMessage(" A11match found: %*s\n", fi+fdlen, bstr.chars());
1096 #endif
1097 if (extendMatch(seq.chars(), rlen, fi,
1098 adapter5.chars(), a5len, a5len-fdlen, fdlen, l5,l3,a5segs,true))
1099 return true;
1100 }
1101 } //tried to matching at most 11 bases first
1102
1103 //-- no easy cases, do the wmer hashing for the last 12 bases of the adaptor
1104 //-- only extend a wmer if it matches in the 5' (beginning) region of the read
1105 int wordSize=3;
1106 int hlen=12;
1107 if (hlen>a5len-wordSize) hlen=a5len-wordSize;
1108 int imax=rlen>>1; //first half of the read, right boundary for the wmer match
1109 if (imax<a5len) { imax=GMIN(a5len, rlen-wordSize); }
1110 for (int iw=0;iw<=hlen;iw++) {
1111 int apstart=a5len-iw-wordSize;
1112 fi=0;
1113 //int* qv=(int32 *)(adapter5.chars()+apstart);
1114 int qv=get3mer_value(adapter5.chars()+apstart);
1115 //while ((fi=fast4match(*qv, seq.chars(), rlen, fi))>=0 && fi<=imax) {
1116 while ((fi=w3_match(qv, seq.chars(), rlen, fi))>=0 && fi<=imax) {
1117 #ifdef DEBUG
1118 if (debug) GMessage(" Wmatch found: %*s\n", fi+wordSize, (adapter5.substr(apstart,wordSize)).chars());
1119 #endif
1120 if (extendMatch(seq.chars(), rlen, fi, adapter5.chars(),
1121 a5len, apstart, wordSize, l5,l3, a5segs, true)) return true;
1122 fi++;
1123 if (fi>imax) break;
1124 }
1125 } //for each wmer in the last hlen bases of the adaptor
1126 /*
1127
1128 //couldn't find a good trimming extension, hash 12 more bases of the adapter to collect more segment pairs there
1129 //but only do this if we already have segment pairs collected in the last 12 bases of the adapter
1130 if (a5segs.bend<(uint)(a5len-3) || a5segs.bstart>(uint)(a5len-hlen+4)) return false;
1131 int hlen2=a5len-wordSize;
1132 //if (hlen2>a5len-wordSize) hlen2=a5len-wordSize;
1133 #ifdef DEBUG
1134 if (debug && a5segs.Count()>0) {
1135 GMessage(" >>>>>2nd. hash: %s\n",seq.chars());
1136 }
1137 #endif
1138 if (hlen2>hlen) {
1139 for (int iw=hlen+1;iw<=hlen2;iw++) {
1140 int apstart=a5len-iw-wordSize;
1141 fi=0;
1142 //int* qv=(int32 *)(adapter5.chars()+apstart);
1143 int qv=get3mer_value(adapter5.chars()+apstart);
1144 //while ((fi=fast4match(*qv, seq.chars(), rlen, fi))>=0 && fi<=imax) {
1145 while ((fi=w3_match(qv, seq.chars(), rlen, fi))>=0 && fi<=imax) {
1146 extendMatch(seq.chars(), rlen, fi, adapter5.chars(),
1147 a5len, apstart, wordSize, l5,l3, a5segs, true);
1148 fi++;
1149 if (fi>imax) break;
1150 }
1151 } //for each wmer between hlen2 and hlen bases of the adaptor
1152 }
1153 if (a5segs.bend<(uint)(a5len-3) || a5segs.bstart>(uint)(a5len-hlen+4)) return false;
1154 // lastly, analyze collected a5segs for a possible gapped alignment:
1155 GList<CSegChain> segchains(false,true,false);
1156 #ifdef DEBUG
1157 if (debug && a5segs.Count()>0) {
1158 GMessage(">>>>>>>>> Read: %s\n",seq.chars());
1159 }
1160 #endif
1161 for (int i=0;i<a5segs.Count();i++) {
1162 if (a5segs[i]->chain==NULL) {
1163 if (a5segs[i]->b.end<(int)(a5len-4)) continue; //don't start a hopeless chain
1164 CSegChain* newchain=new CSegChain(true);
1165 newchain->setFreeItem(false);
1166 newchain->addSegPair(a5segs[i]);
1167 a5segs[i]->chain=newchain;
1168 segchains.Add(newchain); //just to free them when done
1169 }
1170 for (int j=i+1;j<a5segs.Count();j++) {
1171 CSegChain* chain=a5segs[i]->chain;
1172 if (chain->extendChain(a5segs[j])) {
1173 a5segs[j]->chain=chain;
1174 #ifdef DEBUG
1175 if (debug) dbgPrintChain(*chain, adapter5.chars());
1176 #endif
1177 //save time by checking here if the extended chain is already acceptable for trimming
1178 if (chain->bend>(uint)(a5len-3) && chain->astart<4 && chain->score>a_min_chain_score) {
1179 l5=chain->aend;
1180 l3=rlen-1;
1181 return true;
1182 }
1183 } //chain can be extended
1184 }
1185 } //collect segment alignments into chains
1186 */
1187 return false; //no adapter parts found
1188 }
1189
1190 //convert qvs to/from phred64 from/to phread33
1191 void convertPhred(GStr& q) {
1192 for (int i=0;i<q.length();i++) q[i]+=qv_cvtadd;
1193 }
1194
1195 void convertPhred(char* q, int len) {
1196 for (int i=0;i<len;i++) q[i]+=qv_cvtadd;
1197 }
1198
1199 bool getFastxRec(GLineReader& fq, GStr& rseq, GStr& rqv,
1200 GStr& rname, GStr& rinfo, GStr& infname) {
1201 rseq="";
1202 rqv="";
1203 rname="";
1204 rinfo="";
1205 if (fq.eof()) return false;
1206 char* l=fq.getLine();
1207 while (l!=NULL && (l[0]==0 || isspace(l[0]))) l=fq.getLine(); //ignore empty lines
1208 if (l==NULL) return false;
1209 /* if (rawFormat) {
1210 //TODO: implement raw qseq parsing here
1211 //if (raw type=N) then continue; //skip invalid/bad records
1212 } //raw qseq format
1213 else { // FASTQ or FASTA */
1214 isfasta=(l[0]=='>');
1215 if (!isfasta && l[0]!='@') GError("Error: fasta/fastq record marker not found(%s)\n%s\n",
1216 infname.chars(), l);
1217 GStr s(l);
1218 rname=&(l[1]);
1219 for (int i=0;i<rname.length();i++)
1220 if (rname[i]<=' ') {
1221 if (i<rname.length()-2) rinfo=rname.substr(i+1);
1222 rname.cut(i);
1223 break;
1224 }
1225 //now get the sequence
1226 if ((l=fq.getLine())==NULL)
1227 GError("Error: unexpected EOF after header for read %s (%s)\n",
1228 rname.chars(), infname.chars());
1229 rseq=l; //this must be the DNA line
1230 while ((l=fq.getLine())!=NULL) {
1231 //seq can span multiple lines
1232 if (l[0]=='>' || l[0]=='+') {
1233 fq.pushBack();
1234 break; //
1235 }
1236 rseq+=l;
1237 } //check for multi-line seq
1238 if (!isfasta) { //reading fastq quality values, which can also be multi-line
1239 if ((l=fq.getLine())==NULL)
1240 GError("Error: unexpected EOF after sequence for %s\n", rname.chars());
1241 if (l[0]!='+') GError("Error: fastq qv header marker not detected!\n");
1242 if ((l=fq.getLine())==NULL)
1243 GError("Error: unexpected EOF after qv header for %s\n", rname.chars());
1244 rqv=l;
1245 //if (rqv.length()!=rseq.length())
1246 // GError("Error: qv len != seq len for %s\n", rname.chars());
1247 while (rqv.length()<rseq.length() && ((l=fq.getLine())!=NULL)) {
1248 rqv+=l; //append to qv string
1249 }
1250 }// fastq
1251 // } //<-- FASTA or FASTQ
1252 rseq.upper();
1253 return true;
1254 }
1255
1256 char process_read(GStr& rname, GStr& rseq, GStr& rqv, int &l5, int &l3) {
1257 //returns 0 if the read was untouched, 1 if it was just trimmed
1258 // and a trash code if it was trashed
1259 l5=0;
1260 l3=rseq.length()-1;
1261 if (l3-l5+1<min_read_len) {
1262 return 's';
1263 }
1264 GStr wseq(rseq.chars());
1265 GStr wqv(rqv.chars());
1266 int w5=l5;
1267 int w3=l3;
1268 //first do the q-based trimming
1269 if (qvtrim_qmin!=0 && !wqv.is_empty() && qtrim(wqv, w5, w3)) { // qv-threshold trimming
1270 if (w3-w5+1<min_read_len) {
1271 return 'Q'; //invalid read
1272 }
1273 //-- keep only the w5..w3 range
1274 l5=w5;
1275 l3=w3;
1276 wseq=wseq.substr(w5, w3-w5+1);
1277 if (!wqv.is_empty())
1278 wqv=wqv.substr(w5, w3-w5+1);
1279 } //qv trimming
1280 // N-trimming on the remaining read seq
1281 if (ntrim(wseq, w5, w3)) {
1282 //GMessage("before: %s\n",wseq.chars());
1283 //GMessage("after : %s (%d)\n",wseq.substr(w5,w3-w5+1).chars(),w3-w5+1);
1284 l5+=w5;
1285 l3-=(wseq.length()-1-w3);
1286 if (w3-w5+1<min_read_len) {
1287 return 'N'; //to be trashed
1288 }
1289 //-- keep only the w5..w3 range
1290 wseq=wseq.substr(w5, w3-w5+1);
1291 if (!wqv.is_empty())
1292 wqv=wqv.substr(w5, w3-w5+1);
1293 w5=0;
1294 w3=wseq.length()-1;
1295 }
1296 if (a3len>0) {
1297 if (trim_adapter3(wseq, w5, w3)) {
1298 int trimlen=wseq.length()-(w3-w5+1);
1299 num_trimmed3++;
1300 if (trimlen<min_trimmed3)
1301 min_trimmed3=trimlen;
1302 l5+=w5;
1303 l3-=(wseq.length()-1-w3);
1304 if (w3-w5+1<min_read_len) {
1305 return '3';
1306 }
1307 //-- keep only the w5..w3 range
1308 wseq=wseq.substr(w5, w3-w5+1);
1309 if (!wqv.is_empty())
1310 wqv=wqv.substr(w5, w3-w5+1);
1311 }//some adapter was trimmed
1312 } //adapter trimming
1313 if (a5len>0) {
1314 if (trim_adapter5(wseq, w5, w3)) {
1315 int trimlen=wseq.length()-(w3-w5+1);
1316 num_trimmed5++;
1317 if (trimlen<min_trimmed5)
1318 min_trimmed5=trimlen;
1319 l5+=w5;
1320 l3-=(wseq.length()-1-w3);
1321 if (w3-w5+1<min_read_len) {
1322 return '5';
1323 }
1324 //-- keep only the w5..w3 range
1325 wseq=wseq.substr(w5, w3-w5+1);
1326 if (!wqv.is_empty())
1327 wqv=wqv.substr(w5, w3-w5+1);
1328 }//some adapter was trimmed
1329 } //adapter trimming
1330 if (doCollapse) {
1331 //keep read for later
1332 FqDupRec* dr=dhash.Find(wseq.chars());
1333 if (dr==NULL) { //new entry
1334 //if (prefix.is_empty())
1335 dhash.Add(wseq.chars(),
1336 new FqDupRec(&wqv, rname.chars()));
1337 //else dhash.Add(wseq.chars(), new FqDupRec(wqv.chars(),wqv.length()));
1338 }
1339 else
1340 dr->add(wqv);
1341 } //collapsing duplicates
1342 else { //not collapsing duplicates
1343 //apply the dust filter now
1344 if (doDust) {
1345 int dustbases=dust(wseq);
1346 if (dustbases>(wseq.length()>>1)) {
1347 return 'D';
1348 }
1349 }
1350 } //not collapsing duplicates
1351 return (l5>0 || l3<rseq.length()-1) ? 1 : 0;
1352 }
1353
1354 void printHeader(FILE* f_out, char recmarker, GStr& rname, GStr& rinfo) {
1355 //GMessage("printing Header..%c%s\n",recmarker, rname.chars());
1356 if (rinfo.is_empty()) fprintf(f_out, "%c%s\n",recmarker,rname.chars());
1357 else fprintf(f_out, "%c%s %s\n",recmarker, rname.chars(), rinfo.chars());
1358 }
1359
1360 void writeRead(FILE* f_out, GStr& rname, GStr& rinfo, GStr& rseq, GStr& rqv, int& outcounter) {
1361 outcounter++;
1362 bool asFasta=(rqv.is_empty() || fastaOutput);
1363 if (asFasta) {
1364 if (prefix.is_empty()) {
1365 printHeader(f_out, '>',rname,rinfo);
1366 fprintf(f_out, "%s\n", rseq.chars()); //plain one-line fasta for now
1367 }
1368 else {
1369 fprintf(f_out, ">%s%08d\n%s\n", prefix.chars(), outcounter,
1370 rseq.chars());
1371 }
1372 }
1373 else { //fastq
1374 if (convert_phred) convertPhred(rqv);
1375 if (prefix.is_empty()) {
1376 printHeader(f_out, '@', rname, rinfo);
1377 fprintf(f_out, "%s\n+\n%s\n", rseq.chars(), rqv.chars());
1378 }
1379 else
1380 fprintf(f_out, "@%s_%08d\n%s\n+\n%s\n", prefix.chars(), outcounter,
1381 rseq.chars(),rqv.chars() );
1382 }
1383 }
1384
1385 void trash_report(char trashcode, GStr& rname, FILE* freport) {
1386 if (freport==NULL || trashcode<=' ') return;
1387 if (trashcode=='3' || trashcode=='5') {
1388 fprintf(freport, "%s\tA%c\n",rname.chars(),trashcode);
1389 }
1390 else {
1391 fprintf(freport, "%s\t%c\n",rname.chars(),trashcode);
1392 }
1393 //tcounter++;
1394 }
1395
1396 GStr getFext(GStr& s, int* xpos=NULL) {
1397 //s must be a filename without a path
1398 GStr r("");
1399 if (xpos!=NULL) *xpos=0;
1400 if (s.is_empty() || s=="-") return r;
1401 int p=s.rindex('.');
1402 int d=s.rindex('/');
1403 if (p<=0 || p>s.length()-2 || p<s.length()-7 || p<d) return r;
1404 r=s.substr(p+1);
1405 if (xpos!=NULL) *xpos=p+1;
1406 r.lower();
1407 return r;
1408 }
1409
1410 void baseFileName(GStr& fname) {
1411 //remove all known extensions, like .txt,fq,fastq,fasta,fa)(.gz .gzip .bz2 .bzip2) .
1412 int xpos=0;
1413 GStr fext=getFext(fname, &xpos);
1414 if (xpos<=1) return;
1415 bool extdel=false;
1416 GStr f2;
1417 if (fext=="z" || fext=="zip") {
1418 extdel=true;
1419 }
1420 else if (fext.length()>=2) {
1421 f2=fext.substr(0,2);
1422 extdel=(f2=="gz" || f2=="bz");
1423 }
1424 if (extdel) {
1425 fname.cut(xpos-1);
1426 fext=getFext(fname, &xpos);
1427 if (xpos<=1) return;
1428 }
1429 extdel=false;
1430 if (fext=="f" || fext=="fq" || fext=="txt" || fext=="seq" || fext=="sequence") {
1431 extdel=true;
1432 }
1433 else if (fext.length()>=2) {
1434 extdel=(fext.substr(0,2)=="fa");
1435 }
1436 if (extdel) fname.cut(xpos-1);
1437 GStr fncp(fname);
1438 fncp.lower();
1439 fncp.chomp("seq");
1440 fncp.chomp("sequence");
1441 fncp.trimR("_.");
1442 if (fncp.length()<fname.length()) fname.cut(fncp.length());
1443 }
1444
1445 FILE* prepOutFile(GStr& infname, GStr& pocmd) {
1446 FILE* f_out=NULL;
1447 GStr fname(getFileName(infname.chars()));
1448 //eliminate known extensions
1449 baseFileName(fname);
1450 if (outsuffix.is_empty() || outsuffix=="-") { return stdout; }
1451 else if (pocmd.is_empty()) {
1452 GStr oname(fname);
1453 oname.append('.');
1454 oname.append(outsuffix);
1455 f_out=fopen(oname.chars(),"w");
1456 if (f_out==NULL) GError("Error: cannot create '%s'\n",oname.chars());
1457 }
1458 else {
1459 GStr oname(">");
1460 oname.append(fname);
1461 oname.append('.');
1462 oname.append(outsuffix);
1463 pocmd.append(oname);
1464 f_out=popen(pocmd.chars(), "w");
1465 if (f_out==NULL) GError("Error: cannot popen '%s'\n",pocmd.chars());
1466 }
1467 return f_out;
1468 }
1469
1470 void guess_unzip(GStr& fname, GStr& picmd) {
1471 GStr fext=getFext(fname);
1472 if (fext=="gz" || fext=="gzip" || fext=="z") {
1473 picmd="gzip -cd ";
1474 }
1475 else if (fext=="bz2" || fext=="bzip2" || fext=="bz" || fext=="bzip") {
1476 picmd="bzip2 -cd ";
1477 }
1478 }
1479
1480
1481 int loadAdapters(const char* fname) {
1482 GLineReader lr(fname);
1483 char* l;
1484 while ((l=lr.nextLine())!=NULL) {
1485 if (lr.length()<=3 || l[0]=='#') continue;
1486 char* s5;
1487 char* s3;
1488 if (isspace(l[0]) || l[0]==',' || l[0]==';'|| l[0]==':') {
1489 int i=1;
1490 while (l[i]!=0 && isspace(l[i])) {
1491 i++;
1492 }
1493 if (l[i]!=0) {
1494 adapters.Add(new CAdapters(NULL, &(l[i])));
1495 continue;
1496 }
1497 }
1498 else {
1499 p=l;
1500 while (*delpos!='\0' && strchr(fTokenDelimiter, *delpos)!=NULL)
1501 delpos++;
1502 s5.startTokenize("\t ;,:",);
1503 s5.nextToken(s3);
1504 }
1505 }
1506
1507 }
1508
1509 void setupFiles(FILE*& f_in, FILE*& f_in2, FILE*& f_out, FILE*& f_out2,
1510 GStr& s, GStr& infname, GStr& infname2) {
1511 // uses outsuffix to generate output file names and open file handles as needed
1512 infname="";
1513 infname2="";
1514 f_in=NULL;
1515 f_in2=NULL;
1516 f_out=NULL;
1517 f_out2=NULL;
1518 //analyze outsuffix intent
1519 GStr pocmd;
1520 if (outsuffix=="-") {
1521 f_out=stdout;
1522 }
1523 else {
1524 GStr ox=getFext(outsuffix);
1525 if (ox.length()>2) ox=ox.substr(0,2);
1526 if (ox=="gz") pocmd="gzip -9 -c ";
1527 else if (ox=="bz") pocmd="bzip2 -9 -c ";
1528 }
1529 if (s=="-") {
1530 f_in=stdin;
1531 infname="stdin";
1532 f_out=prepOutFile(infname, pocmd);
1533 return;
1534 } // streaming from stdin
1535 s.startTokenize(",:");
1536 s.nextToken(infname);
1537 bool paired=s.nextToken(infname2);
1538 if (fileExists(infname.chars())==0)
1539 GError("Error: cannot find file %s!\n",infname.chars());
1540 GStr fname(getFileName(infname.chars()));
1541 GStr picmd;
1542 guess_unzip(fname, picmd);
1543 if (picmd.is_empty()) {
1544 f_in=fopen(infname.chars(), "r");
1545 if (f_in==NULL) GError("Error opening file '%s'!\n",infname.chars());
1546 }
1547 else {
1548 picmd.append(infname);
1549 f_in=popen(picmd.chars(), "r");
1550 if (f_in==NULL) GError("Error at popen %s!\n", picmd.chars());
1551 }
1552 if (f_out==stdout) {
1553 if (paired) GError("Error: output suffix required for paired reads\n");
1554 return;
1555 }
1556 f_out=prepOutFile(infname, pocmd);
1557 if (!paired) return;
1558 if (doCollapse) GError("Error: sorry, -C option cannot be used with paired reads!\n");
1559 // ---- paired reads:-------------
1560 if (fileExists(infname2.chars())==0)
1561 GError("Error: cannot find file %s!\n",infname2.chars());
1562 picmd="";
1563 GStr fname2(getFileName(infname2.chars()));
1564 guess_unzip(fname2, picmd);
1565 if (picmd.is_empty()) {
1566 f_in2=fopen(infname2.chars(), "r");
1567 if (f_in2==NULL) GError("Error opening file '%s'!\n",infname2.chars());
1568 }
1569 else {
1570 picmd.append(infname2);
1571 f_in2=popen(picmd.chars(), "r");
1572 if (f_in2==NULL) GError("Error at popen %s!\n", picmd.chars());
1573 }
1574 f_out2=prepOutFile(infname2, pocmd);
1575 }