1 |
#include "GArgs.h" |
2 |
#include "GStr.h" |
3 |
#include "GHash.hh" |
4 |
#include "GList.hh" |
5 |
|
6 |
#define USAGE "Usage:\n\ |
7 |
fqtrim [-5 <5'adapter>] [-3 <3'adapter>] [-l <minlen>] [-C] [-D] [-Q] \\\n\ |
8 |
[-n <rename_prefix>] [-o <trimmed.fq>] [-r <discarded.lst>] <input.fq>\n\ |
9 |
\n\ |
10 |
Trim low quality bases at 3' end, optional adapter, filter for low complexity and\n\ |
11 |
optionally collapse duplicates in the input fastq data.\n\ |
12 |
\n\ |
13 |
Options:\n\ |
14 |
-n rename all the reads using the <prefix> followed by a read counter;\n\ |
15 |
if -C option was given, the suffix \"_x<N>\" is appended, with <N> being\n\ |
16 |
the read duplication count\n\ |
17 |
-o write the trimmed/filtered fastq into <trimmed.fq>(instead of stdout)\n\ |
18 |
-5 trim the given adapter or primer sequence at the 5' end of each read\n\ |
19 |
(e.g. -5 CGACAGGTTCAGAGTTCTACAGTCCGACGATC)\n\ |
20 |
-3 trim the given adapter sequence at the 3' end of each read\n\ |
21 |
(e.g. -3 TCGTATGCCGTCTTCTGCTTG)\n\ |
22 |
-l minimum \"clean\" length at the high quality 5'end that a read must\n\ |
23 |
have in order to pass the filter (default: 16)\n\ |
24 |
-r write a simple \"trash report\" file listing the discarded reads with a\n\ |
25 |
one letter code for the reason why they were trashed\n\ |
26 |
-C collapse duplicate reads and add a prefix with their count to the read\n\ |
27 |
name\n\ |
28 |
-D apply a low-complexity (dust) filter and discard any read that has at\n\ |
29 |
least half of it masked by this\n\ |
30 |
-Q quality values in the input data are interpreted as phred64, convert\n\ |
31 |
them to phred33\n\ |
32 |
" |
33 |
// example 3' adapter for miRNAs: TCGTATGCCGTCTTCTGCTTG |
34 |
FILE* f_out=NULL; //stdout if not provided |
35 |
FILE* f_in=NULL; //input fastq (stdin if not provided) |
36 |
FILE* freport=NULL; |
37 |
bool debug=false; |
38 |
bool doCollapse=false; |
39 |
bool doDust=false; |
40 |
bool trashReport=false; |
41 |
bool rawFormat=false; |
42 |
int min_read_len=16; |
43 |
int dust_cutoff=16; |
44 |
bool isfasta=false; |
45 |
bool phred64=false; |
46 |
GStr prefix; |
47 |
GStr adapter3; |
48 |
GStr adapter5; |
49 |
int a3len=0; |
50 |
int a5len=0; |
51 |
// adaptor matching metrics -- for extendMatch() function |
52 |
static const int a_m_score=2; //match score |
53 |
static const int a_mis_score=-3; //mismatch |
54 |
static const int a_dropoff_score=7; |
55 |
static const int a_min_score=7; |
56 |
|
57 |
// element in dhash: |
58 |
class FqDupRec { |
59 |
public: |
60 |
int count; //how many of these reads are the same |
61 |
int len; //length of qv |
62 |
char* firstname; //optional, only if we want to keep the original read names |
63 |
char* qv; |
64 |
FqDupRec(GStr* qstr=NULL, const char* rname=NULL) { |
65 |
len=0; |
66 |
qv=NULL; |
67 |
firstname=NULL; |
68 |
count=0; |
69 |
if (qstr!=NULL) { |
70 |
qv=Gstrdup(qstr->chars()); |
71 |
len=qstr->length(); |
72 |
count++; |
73 |
} |
74 |
if (rname!=NULL) firstname=Gstrdup(rname); |
75 |
} |
76 |
~FqDupRec() { |
77 |
GFREE(qv); |
78 |
GFREE(firstname); |
79 |
} |
80 |
void add(GStr& d) { //collapse another record into this one |
81 |
if (d.length()!=len) |
82 |
GError("Error at FqDupRec::add(): cannot collapse reads with different length!\n"); |
83 |
count++; |
84 |
for (int i=0;i<len;i++) |
85 |
qv[i]=(qv[i]+d[i])/2; |
86 |
} |
87 |
}; |
88 |
|
89 |
void openfw(FILE* &f, GArgs& args, char opt) { |
90 |
GStr s=args.getOpt(opt); |
91 |
if (!s.is_empty()) { |
92 |
if (s=='-') f=stdout; |
93 |
else { |
94 |
f=fopen(s,"w"); |
95 |
if (f==NULL) GError("Error creating file: %s\n", s.chars()); |
96 |
} |
97 |
} |
98 |
} |
99 |
|
100 |
#define FWCLOSE(fh) if (fh!=NULL && fh!=stdout) fclose(fh) |
101 |
#define FRCLOSE(fh) if (fh!=NULL && fh!=stdin) fclose(fh) |
102 |
|
103 |
GHash<FqDupRec> dhash; //hash to keep track of duplicates |
104 |
|
105 |
bool ntrim(GStr& rseq, GStr &rqv, int &l5, int &l3); //returns true if any trimming occured |
106 |
|
107 |
int dust(GStr& seq); |
108 |
bool trim_adapter3(GStr& seq, int &l5, int &l3); //returns true if any trimming occured |
109 |
bool trim_adapter5(GStr& seq, int &l5, int &l3); //returns true if any trimming occured |
110 |
|
111 |
void convertQ64(char* q, int len); |
112 |
void convertQ64(GStr& q); |
113 |
|
114 |
int main(int argc, char * const argv[]) { |
115 |
GArgs args(argc, argv, "YQRDCl:d:3:5:n:r:o:"); |
116 |
int e; |
117 |
int icounter=0; //counter for input reads |
118 |
int outcounter=0; //counter for output reads |
119 |
if ((e=args.isError())>0) { |
120 |
GMessage("%s\nInvalid argument: %s\n", USAGE, argv[e]); |
121 |
exit(224); |
122 |
} |
123 |
debug=(args.getOpt('Y')!=NULL); |
124 |
phred64=(args.getOpt('Q')!=NULL); |
125 |
doCollapse=(args.getOpt('C')!=NULL); |
126 |
doDust=(args.getOpt('D')!=NULL); |
127 |
rawFormat=(args.getOpt('R')!=NULL); |
128 |
if (rawFormat) { |
129 |
GError("Sorry, raw qseq format parsing is not implemented yet!\n"); |
130 |
} |
131 |
prefix=args.getOpt('n'); |
132 |
GStr s=args.getOpt('l'); |
133 |
if (!s.is_empty()) |
134 |
min_read_len=s.asInt(); |
135 |
s=args.getOpt('d'); |
136 |
if (!s.is_empty()) { |
137 |
dust_cutoff=s.asInt(); |
138 |
doDust=true; |
139 |
} |
140 |
|
141 |
if (args.getOpt('3')!=NULL) { |
142 |
adapter3=args.getOpt('3'); |
143 |
adapter3.upper(); |
144 |
a3len=adapter3.length(); |
145 |
} |
146 |
if (args.getOpt('5')!=NULL) { |
147 |
adapter5=args.getOpt('5'); |
148 |
adapter5.upper(); |
149 |
a5len=adapter5.length(); |
150 |
} |
151 |
trashReport= (args.getOpt('r')!=NULL); |
152 |
if (args.startNonOpt()==0) { |
153 |
GMessage(USAGE); |
154 |
exit(224); |
155 |
} |
156 |
|
157 |
openfw(f_out, args, 'o'); |
158 |
if (f_out==NULL) f_out=stdout; |
159 |
if (trashReport) |
160 |
openfw(freport, args, 'r'); |
161 |
char* infile=NULL; |
162 |
while ((infile=args.nextNonOpt())!=NULL) { |
163 |
GStr infname(infile); |
164 |
if (strcmp(infile,"-")==0) { |
165 |
f_in=stdin; infname="stdin"; } |
166 |
else { |
167 |
f_in=fopen(infile,"r"); |
168 |
if (f_in==NULL) |
169 |
GError("Cannot open input file %s!\n",infile); |
170 |
} |
171 |
GLineReader fq(f_in); |
172 |
char* l=NULL; |
173 |
while ((l=fq.getLine())!=NULL) { |
174 |
GStr rname; //current read name |
175 |
GStr rseq; //current read sequence |
176 |
GStr rqv; //current read quality values |
177 |
GStr s; |
178 |
if (rawFormat) { |
179 |
//TODO: implement qseq parsing here |
180 |
//if (raw type=N) then continue; //skip invalid/bad records |
181 |
|
182 |
} //raw qseq format |
183 |
else { // FASTQ or FASTA |
184 |
isfasta=(l[0]=='>'); |
185 |
if (!isfasta && l[0]!='@') GError("Error: fastq record marker not detected!\n"); |
186 |
s=l; |
187 |
rname=&(l[1]); |
188 |
icounter++; |
189 |
//GMessage("readname=%s\n",rname.chars()); |
190 |
for (int i=0;i<rname.length();i++) |
191 |
if (rname[i]<=' ') { rname.cut(i); break; } |
192 |
//now get the sequence |
193 |
if ((l=fq.getLine())==NULL) |
194 |
GError("Error: unexpected EOF after header for %s\n",rname.chars()); |
195 |
rseq=l; //this must be the DNA line |
196 |
if (isfasta) { |
197 |
while ((l=fq.getLine())!=NULL) { |
198 |
//fasta can have multiple sequence lines |
199 |
if (l[0]=='>') { |
200 |
fq.pushBack(); |
201 |
break; // |
202 |
} |
203 |
rseq+=l; |
204 |
} |
205 |
} //multi-line fasta file |
206 |
if (!isfasta) { |
207 |
if ((l=fq.getLine())==NULL) |
208 |
GError("Error: unexpected EOF after sequence for %s\n", rname.chars()); |
209 |
if (l[0]!='+') GError("Error: fastq qv header marker not detected!\n"); |
210 |
if ((l=fq.getLine())==NULL) |
211 |
GError("Error: unexpected EOF after qv header for %s\n", rname.chars()); |
212 |
rqv=l; |
213 |
if (rqv.length()!=rseq.length()) |
214 |
GError("Error: qv len != seq len for %s\n", rname.chars()); |
215 |
} |
216 |
} //<-- FASTA or FASTQ |
217 |
rseq.upper(); |
218 |
int l5=0; |
219 |
int l3=rseq.length()-1; |
220 |
if (l3-l5+1<min_read_len) { |
221 |
if (trashReport) { |
222 |
fprintf(freport, "%s\ts\t%s\n",rname.chars(), rseq.chars()); |
223 |
} |
224 |
continue; |
225 |
} |
226 |
if (ntrim(rseq, rqv, l5, l3)) { |
227 |
if (l3-l5+1<min_read_len) { |
228 |
if (trashReport) { |
229 |
fprintf(freport, "%s\tN\t%s\n", rname.chars(), rseq.chars()); |
230 |
} |
231 |
continue; //invalid read |
232 |
} |
233 |
//-- keep only the l5..l3 range |
234 |
rseq=rseq.substr(l5, l3-l5+1); |
235 |
if (!rqv.is_empty()) |
236 |
rqv=rqv.substr(l5, l3-l5+1); |
237 |
} |
238 |
|
239 |
if (a3len>0) { |
240 |
if (trim_adapter3(rseq, l5, l3)) { |
241 |
if (l3-l5+1<min_read_len) { |
242 |
if (trashReport) { |
243 |
fprintf(freport, "%s\tA3\t%s\n",rname.chars(), rseq.chars()); |
244 |
} |
245 |
continue; |
246 |
} |
247 |
//-- keep only the l5..l3 range |
248 |
rseq=rseq.substr(l5, l3-l5+1); |
249 |
if (!rqv.is_empty()) |
250 |
rqv=rqv.substr(l5, l3-l5+1); |
251 |
}//some adapter was trimmed |
252 |
} //adapter trimming |
253 |
if (a5len>0) { |
254 |
if (trim_adapter5(rseq, l5, l3)) { |
255 |
if (l3-l5+1<min_read_len) { |
256 |
if (trashReport) { |
257 |
fprintf(freport, "%s\tA5\t%s\n",rname.chars(), rseq.chars()); |
258 |
} |
259 |
continue; |
260 |
} |
261 |
//-- keep only the l5..l3 range |
262 |
rseq=rseq.substr(l5, l3-l5+1); |
263 |
if (!rqv.is_empty()) |
264 |
rqv=rqv.substr(l5, l3-l5+1); |
265 |
}//some adapter was trimmed |
266 |
} //adapter trimming |
267 |
if (doCollapse) { |
268 |
//keep read for later |
269 |
FqDupRec* dr=dhash.Find(rseq.chars()); |
270 |
if (dr==NULL) { //new entry |
271 |
//if (prefix.is_empty()) |
272 |
dhash.Add(rseq.chars(), |
273 |
new FqDupRec(&rqv, rname.chars())); |
274 |
//else dhash.Add(rseq.chars(), new FqDupRec(rqv.chars(),rqv.length())); |
275 |
} |
276 |
else |
277 |
dr->add(rqv); |
278 |
} //collapsing duplicates |
279 |
else { //not collapsing duplicates |
280 |
//do the dust filter now |
281 |
if (doDust) { |
282 |
int dustbases=dust(rseq); |
283 |
if (dustbases>(rseq.length()>>1)) { |
284 |
if (trashReport) { |
285 |
fprintf(freport, "%s\tD\t%s\n",rname.chars(),rseq.chars()); |
286 |
} |
287 |
continue; |
288 |
} |
289 |
} |
290 |
//print this record here |
291 |
outcounter++; |
292 |
if (isfasta) { |
293 |
if (prefix.is_empty()) |
294 |
fprintf(f_out, ">%s\n%s\n", rname.chars(), rseq.chars()); |
295 |
else |
296 |
fprintf(f_out, ">%s%08d\n%s\n", prefix.chars(), outcounter, |
297 |
rseq.chars()); |
298 |
} |
299 |
else { //fastq |
300 |
if (phred64) convertQ64(rqv); |
301 |
if (prefix.is_empty()) |
302 |
fprintf(f_out, "@%s\n%s\n+\n%s\n", rname.chars(), rseq.chars(),rqv.chars()); |
303 |
else |
304 |
fprintf(f_out, "@%s_%08d\n%s\n+\n%s\n", prefix.chars(), outcounter, |
305 |
rseq.chars(),rqv.chars() ); |
306 |
} |
307 |
} //not collapsing duplicates |
308 |
} //for each fastq record |
309 |
} //while each input file |
310 |
FRCLOSE(f_in); |
311 |
if (doCollapse) { |
312 |
outcounter=0; |
313 |
int maxdup_count=1; |
314 |
char* maxdup_seq=NULL; |
315 |
dhash.startIterate(); |
316 |
FqDupRec* qd=NULL; |
317 |
char* seq=NULL; |
318 |
while ((qd=dhash.NextData(seq))!=NULL) { |
319 |
GStr rseq(seq); |
320 |
//do the dusting here |
321 |
if (doDust) { |
322 |
int dustbases=dust(rseq); |
323 |
if (dustbases>(rseq.length()>>1)) { |
324 |
if (trashReport && qd->firstname!=NULL) { |
325 |
fprintf(freport, "%s_x%d\tD\t%s\n",qd->firstname, qd->count,seq); |
326 |
} |
327 |
continue; |
328 |
} |
329 |
} |
330 |
outcounter++; |
331 |
if (qd->count>maxdup_count) { |
332 |
maxdup_count=qd->count; |
333 |
maxdup_seq=seq; |
334 |
} |
335 |
if (isfasta) { |
336 |
if (prefix.is_empty()) { |
337 |
fprintf(f_out, ">%s_x%d\n%s\n", qd->firstname, qd->count, |
338 |
rseq.chars()); |
339 |
} |
340 |
else { //use custom read name |
341 |
fprintf(f_out, ">%s%08d_x%d\n%s\n", prefix.chars(), outcounter, |
342 |
qd->count, rseq.chars()); |
343 |
} |
344 |
} |
345 |
else { //fastq format |
346 |
if (phred64) convertQ64(qd->qv, qd->len); |
347 |
if (prefix.is_empty()) { |
348 |
fprintf(f_out, "@%s_x%d\n%s\n+\n%s\n", qd->firstname, qd->count, |
349 |
rseq.chars(), qd->qv); |
350 |
} |
351 |
else { //use custom read name |
352 |
fprintf(f_out, "@%s%08d_x%d\n%s\n+\n%s\n", prefix.chars(), outcounter, |
353 |
qd->count, rseq.chars(), qd->qv); |
354 |
} |
355 |
} |
356 |
}//for each element of dhash |
357 |
if (maxdup_count>1) { |
358 |
GMessage("Maximum read multiplicity: x %d (read: %s)\n",maxdup_count, maxdup_seq); |
359 |
} |
360 |
} //report collapsed dhash entries |
361 |
GMessage("Number of input reads: %9d\n", icounter); |
362 |
GMessage(" Output records: %9d\n", outcounter); |
363 |
if (trashReport) { |
364 |
FWCLOSE(freport); |
365 |
} |
366 |
|
367 |
FWCLOSE(f_out); |
368 |
} |
369 |
|
370 |
class NData { |
371 |
public: |
372 |
int NPos[1024]; //there should be no reads longer than 1K ? |
373 |
int NCount; |
374 |
int end5; |
375 |
int end3; |
376 |
int seqlen; |
377 |
double perc_N; //percentage of Ns in end5..end3 range only! |
378 |
const char* seq; |
379 |
bool valid; |
380 |
NData() { |
381 |
NCount=0; |
382 |
end5=0; |
383 |
end3=0; |
384 |
seq=NULL; |
385 |
perc_N=0; |
386 |
valid=true; |
387 |
} |
388 |
void init(GStr& rseq) { |
389 |
NCount=0; |
390 |
perc_N=0; |
391 |
seqlen=rseq.length(); |
392 |
seq=rseq.chars(); |
393 |
end5=1; |
394 |
end3=seqlen; |
395 |
for (int i=0;i<seqlen;i++) |
396 |
if (rseq[i]=='N') {// if (!ichrInStr(rseq[i], "ACGT") |
397 |
NPos[NCount]=i; |
398 |
NCount++; |
399 |
} |
400 |
perc_N=(NCount*100.0)/seqlen; |
401 |
valid=true; |
402 |
} |
403 |
void N_calc() { //only in the region after trimming |
404 |
int n=0; |
405 |
for (int i=end3-1;i<end5;i++) { |
406 |
if (seq[i]=='N') n++; |
407 |
} |
408 |
perc_N=(n*100.0)/(end5-end3+1); |
409 |
} |
410 |
}; |
411 |
|
412 |
static NData feat; |
413 |
int perc_lenN=12; // incremental distance from ends, in percentage of |
414 |
// sequence length, where N-trimming is done (default:12 %) (autolimited to 20) |
415 |
|
416 |
void N_analyze(int l5, int l3, int p5, int p3) { |
417 |
/* assumes feat was filled properly */ |
418 |
int old_dif, t5,t3,v; |
419 |
if (l3-l5<min_read_len || p5>p3 ) { |
420 |
feat.end5=l5+1; |
421 |
feat.end3=l3+1; |
422 |
return; |
423 |
} |
424 |
|
425 |
t5=feat.NPos[p5]-l5; |
426 |
t3=l3-feat.NPos[p3]; |
427 |
old_dif=p3-p5; |
428 |
v=(int)((((double)(l3-l5))*perc_lenN)/100); |
429 |
if (v>20) v=20; /* enforce N-search limit for very long reads */ |
430 |
if (t5 < v ) { |
431 |
l5=feat.NPos[p5]+1; |
432 |
p5++; |
433 |
} |
434 |
if (t3 < v) { |
435 |
l3=feat.NPos[p3]-1; |
436 |
p3--; |
437 |
} |
438 |
/* restNs=p3-p5; number of Ns in the new CLR */ |
439 |
if (p3-p5==old_dif) { /* no change, return */ |
440 |
/* don't trim if this may shorten the read too much */ |
441 |
//if (l5-l3<min_read_len) return; |
442 |
feat.end5=l5+1; |
443 |
feat.end3=l3+1; |
444 |
return; |
445 |
} |
446 |
else |
447 |
N_analyze(l5,l3, p5,p3); |
448 |
} |
449 |
|
450 |
|
451 |
bool ntrim(GStr& rseq, GStr &rqv, int &l5, int &l3) { |
452 |
//count Ns in the sequence |
453 |
feat.init(rseq); |
454 |
l5=feat.end5-1; |
455 |
l3=feat.end3-1; |
456 |
N_analyze(feat.end5-1, feat.end3-1, 0, feat.NCount-1); |
457 |
if (l5==feat.end5-1 && l3==feat.end3-1) |
458 |
return false; //nothing changed |
459 |
l5=feat.end5-1; |
460 |
l3=feat.end3-1; |
461 |
if (l3-l5+1<min_read_len) { |
462 |
feat.valid=false; |
463 |
return true; |
464 |
} |
465 |
feat.N_calc(); |
466 |
if (feat.perc_N>6.2) { //not valid if more than 1 N per 16 bases |
467 |
feat.valid=false; |
468 |
l3=l5+1; |
469 |
return true; |
470 |
} |
471 |
return true; |
472 |
} |
473 |
|
474 |
//--------------- dust functions ---------------- |
475 |
class DNADuster { |
476 |
public: |
477 |
int dustword; |
478 |
int dustwindow; |
479 |
int dustwindow2; |
480 |
int dustcutoff; |
481 |
int mv, iv, jv; |
482 |
int counts[32*32*32]; |
483 |
int iis[32*32*32]; |
484 |
DNADuster(int cutoff=16, int winsize=32, int wordsize=3) { |
485 |
dustword=wordsize; |
486 |
dustwindow=winsize; |
487 |
dustwindow2 = (winsize>>1); |
488 |
dustcutoff=cutoff; |
489 |
mv=0; |
490 |
iv=0; |
491 |
jv=0; |
492 |
} |
493 |
void setWindowSize(int value) { |
494 |
dustwindow = value; |
495 |
dustwindow2 = (dustwindow >> 1); |
496 |
} |
497 |
void setWordSize(int value) { |
498 |
dustword=value; |
499 |
} |
500 |
void wo1(int len, const char* s, int ivv) { |
501 |
int i, ii, j, v, t, n, n1, sum; |
502 |
int js, nis; |
503 |
n = 32 * 32 * 32; |
504 |
n1 = n - 1; |
505 |
nis = 0; |
506 |
i = 0; |
507 |
ii = 0; |
508 |
sum = 0; |
509 |
v = 0; |
510 |
for (j=0; j < len; j++, s++) { |
511 |
ii <<= 5; |
512 |
if (*s<=32) { |
513 |
i=0; |
514 |
continue; |
515 |
} |
516 |
ii |= *s - 'A'; //assume uppercase! |
517 |
ii &= n1; |
518 |
i++; |
519 |
if (i >= dustword) { |
520 |
for (js=0; js < nis && iis[js] != ii; js++) ; |
521 |
if (js == nis) { |
522 |
iis[nis] = ii; |
523 |
counts[ii] = 0; |
524 |
nis++; |
525 |
} |
526 |
if ((t = counts[ii]) > 0) { |
527 |
sum += t; |
528 |
v = 10 * sum / j; |
529 |
if (mv < v) { |
530 |
mv = v; |
531 |
iv = ivv; |
532 |
jv = j; |
533 |
} |
534 |
} |
535 |
counts[ii]++; |
536 |
} |
537 |
} |
538 |
} |
539 |
|
540 |
int wo(int len, const char* s, int* beg, int* end) { |
541 |
int i, l1; |
542 |
l1 = len - dustword + 1; |
543 |
if (l1 < 0) { |
544 |
*beg = 0; |
545 |
*end = len - 1; |
546 |
return 0; |
547 |
} |
548 |
mv = 0; |
549 |
iv = 0; |
550 |
jv = 0; |
551 |
for (i=0; i < l1; i++) { |
552 |
wo1(len-i, s+i, i); |
553 |
} |
554 |
*beg = iv; |
555 |
*end = iv + jv; |
556 |
return mv; |
557 |
} |
558 |
|
559 |
void dust(const char* seq, char* seqmsk, int seqlen, int cutoff=0) { //, maskFunc maskfn) { |
560 |
int i, j, l, a, b, v; |
561 |
if (cutoff==0) cutoff=dustcutoff; |
562 |
a=0;b=0; |
563 |
//GMessage("Dust cutoff=%d\n", cutoff); |
564 |
for (i=0; i < seqlen; i += dustwindow2) { |
565 |
l = (seqlen > i+dustwindow) ? dustwindow : seqlen-i; |
566 |
v = wo(l, seq+i, &a, &b); |
567 |
if (v > cutoff) { |
568 |
//for (j = a; j <= b && j < dustwindow2; j++) { |
569 |
for (j = a; j <= b; j++) { |
570 |
seqmsk[i+j]='N';//could be made lowercase instead |
571 |
} |
572 |
} |
573 |
} |
574 |
//return first; |
575 |
} |
576 |
}; |
577 |
|
578 |
static DNADuster duster; |
579 |
|
580 |
int dust(GStr& rseq) { |
581 |
char* seq=Gstrdup(rseq.chars()); |
582 |
duster.dust(rseq.chars(), seq, rseq.length(), dust_cutoff); |
583 |
//check the number of Ns: |
584 |
int ncount=0; |
585 |
for (int i=0;i<rseq.length();i++) { |
586 |
if (seq[i]=='N') ncount++; |
587 |
} |
588 |
GFREE(seq); |
589 |
return ncount; |
590 |
} |
591 |
|
592 |
// ------------------ adapter matching - triplet matching |
593 |
//look for matching triplets amongst the first 9 nucleotides of the 3' adaptor |
594 |
// or the last 9 nucleotides for the 5' adapter |
595 |
//when a triplet match is found, simply try to extend the alignment using a drop-off scheme |
596 |
// check minimum score and |
597 |
// for 3' adapter trimming: |
598 |
// require that the right end of the alignment for either the adaptor OR the read must be |
599 |
// < 3 distance from its right end |
600 |
// for 5' adapter trimming: |
601 |
// require that the left end of the alignment for either the adaptor OR the read must |
602 |
// be at coordinate 0 |
603 |
|
604 |
bool extendMatch(const char* a, int alen, int ai, |
605 |
const char* b, int blen, int bi, int mlen, int& l5, int& l3, bool end5=false) { |
606 |
//so the alignment starts at ai in a, bi in b, with a perfect match of length mlen |
607 |
//if (debug) |
608 |
// GStr dbg(b); |
609 |
//GMessage(">> in %s\n\textending hit: %s at position %d\n", a, (dbg.substr(ai, mlen)).chars(), ai); |
610 |
int a_l=ai; //alignment coordinates on a |
611 |
int a_r=ai+mlen-1; |
612 |
int b_l=bi; //alignment coordinates on b |
613 |
int b_r=bi+mlen-1; |
614 |
int ai_maxscore=ai; |
615 |
int bi_maxscore=bi; |
616 |
int score=mlen*a_m_score; |
617 |
int maxscore=score; |
618 |
int mism5score=a_mis_score; |
619 |
if (end5 && ai<(alen>>1)) mism5score-=2; // increase penalty for mismatches at 5' end |
620 |
//try to extend to the left first, if possible |
621 |
while (ai>0 && bi>0) { |
622 |
ai--; |
623 |
bi--; |
624 |
score+= (a[ai]==b[bi])? a_m_score : mism5score; |
625 |
if (score>maxscore) { |
626 |
ai_maxscore=ai; |
627 |
bi_maxscore=bi; |
628 |
maxscore=score; |
629 |
} |
630 |
else if (maxscore-score>a_dropoff_score) break; |
631 |
} |
632 |
a_l=ai_maxscore; |
633 |
b_l=bi_maxscore; |
634 |
//if (debug) GMessage(" after l-extend: %*s%s\t\t(score=%d)\n",a_l," ",dbg.substr(b_l,b_r-b_l+1).chars(),maxscore); |
635 |
//now extend to the right |
636 |
ai_maxscore=a_r; |
637 |
bi_maxscore=b_r; |
638 |
ai=a_r; |
639 |
bi=b_r; |
640 |
score=maxscore; |
641 |
//sometimes there are extra AAAAs at the end of the read, ignore those |
642 |
if (strcmp(&a[alen-4],"AAAA")==0) { |
643 |
alen-=3; |
644 |
while (a[alen-1]=='A' && alen>ai) alen--; |
645 |
} |
646 |
while (ai<alen-1 && bi<blen-1) { |
647 |
ai++; |
648 |
bi++; |
649 |
//score+= (a[ai]==b[bi])? a_m_score : a_mis_score; |
650 |
if (a[ai]==b[bi]) { //match |
651 |
score+=a_m_score; |
652 |
if (ai>=alen-2) { |
653 |
score+=a_m_score-(alen-ai-1); |
654 |
} |
655 |
} |
656 |
else { //mismatch |
657 |
score+=a_mis_score; |
658 |
} |
659 |
if (score>maxscore) { |
660 |
ai_maxscore=ai; |
661 |
bi_maxscore=bi; |
662 |
maxscore=score; |
663 |
} |
664 |
else if (maxscore-score>a_dropoff_score) break; |
665 |
} |
666 |
a_r=ai_maxscore; |
667 |
b_r=bi_maxscore; |
668 |
int a_ovh3=alen-a_r-1; |
669 |
int b_ovh3=blen-b_r-1; |
670 |
int mmovh3=(a_ovh3<b_ovh3)? a_ovh3 : b_ovh3; |
671 |
int mmovh5=(a_l<b_l)? a_l : b_l; |
672 |
//if (debug) GMessage(" after r-extend: %*s%s\t\t(score=%d)\n",a_l," ",dbg.substr(b_l,b_r-b_l+1).chars(),maxscore); |
673 |
if (maxscore>=a_min_score && mmovh3<2 && mmovh5<2) { |
674 |
if (a_l<a_ovh3) { |
675 |
//adapter closer to the left end (typical for 5' adapter) |
676 |
l5=a_r+1; |
677 |
l3=alen-1; |
678 |
} |
679 |
else { |
680 |
//adapter matching at the right end (typical for 3' adapter) |
681 |
l5=0; |
682 |
l3=a_l-1; |
683 |
} |
684 |
return true; |
685 |
} |
686 |
//do not trim: |
687 |
l5=0; |
688 |
l3=alen-1; |
689 |
return false; |
690 |
} |
691 |
|
692 |
bool trim_adapter3(GStr& seq, int&l5, int &l3) { |
693 |
int rlen=seq.length(); |
694 |
l5=0; |
695 |
l3=rlen-1; |
696 |
//first try a full match, we might get lucky |
697 |
int fi=-1; |
698 |
if ((fi=seq.index(adapter3))>=0) { |
699 |
if (fi<rlen-fi-a3len) {//match is closer to the right end |
700 |
l5=fi+a3len; |
701 |
l3=rlen-1; |
702 |
} |
703 |
else { |
704 |
l5=0; |
705 |
l3=fi-1; |
706 |
} |
707 |
return true; |
708 |
} |
709 |
//also, for fast detection of other adapter-only reads that start past |
710 |
// the beginning of the adapter sequence, try to see if the first a3len-4 |
711 |
// characters of the read are a substring of the adapter |
712 |
GStr rstart=seq.substr(0,a3len-4); |
713 |
if ((fi=adapter3.index(rstart))>=0) { |
714 |
l3=rlen-1; |
715 |
l5=a3len-4; |
716 |
while (fi+l5<a3len && l5<l3 && adapter3[fi+l5]==seq[l5]) l5++; |
717 |
return true; |
718 |
} |
719 |
|
720 |
//another easy case: first half of the adaptor matches |
721 |
int a3hlen=a3len>>1; |
722 |
GStr ahalf=adapter3.substr(0, a3hlen); |
723 |
if ((fi=seq.index(ahalf))>=0) { |
724 |
extendMatch(seq.chars(), rlen, fi, |
725 |
adapter3.chars(), a3len, 0, a3hlen, l5,l3); |
726 |
return true; |
727 |
} |
728 |
//no easy cases, so let's do the word hashing |
729 |
for (int iw=0;iw<6;iw++) { |
730 |
GStr aword=adapter3.substr(iw,3); |
731 |
if ((fi=seq.index(aword))>=0 && rlen-fi>3) { |
732 |
if (extendMatch(seq.chars(), rlen, fi, adapter3.chars(), |
733 |
a3len, iw, 3, l5,l3)) return true; |
734 |
} |
735 |
} |
736 |
return false; //no adapter parts found |
737 |
} |
738 |
|
739 |
bool trim_adapter5(GStr& seq, int&l5, int &l3) { |
740 |
//if (debug) GMessage("trim_adapter5 on: %s\n", seq.chars()); |
741 |
int rlen=seq.length(); |
742 |
l5=0; |
743 |
l3=rlen-1; |
744 |
//try to see if adapter is fully included in the read |
745 |
int fi=-1; |
746 |
if ((fi=seq.index(adapter5))>=0) { |
747 |
if (fi<rlen-fi-a5len) {//match is closer to the right end |
748 |
l5=fi+a5len; |
749 |
l3=rlen-1; |
750 |
} |
751 |
else { |
752 |
l5=0; |
753 |
l3=fi-1; |
754 |
} |
755 |
return true; |
756 |
} |
757 |
//for fast detection of adapter-rich reads, check if the first 12 |
758 |
//characters of the read are a substring of the adapter |
759 |
GStr rstart=seq.substr(1,12); |
760 |
if ((fi=adapter5.index(rstart))>=0) { |
761 |
//l3=rlen-1; |
762 |
//l5=a5len-4; |
763 |
//while (fi+l5<a5len && l5<l3 && adapter5[fi+l5]==seq[l5]) l5++; |
764 |
//return true; |
765 |
//if (debug) GMessage(" first 12char of the read match adaptor!\n"); |
766 |
extendMatch(seq.chars(), rlen, 1, |
767 |
adapter5.chars(), a5len, fi, 12, l5,l3, true); |
768 |
return true; |
769 |
} |
770 |
|
771 |
//another easy case: last 12 characters of the adaptor found as a substring of the read |
772 |
int aplen=12; |
773 |
int apstart=a5len-aplen-2; |
774 |
if (apstart<0) { apstart=0; aplen=a5len-1; } |
775 |
GStr bstr=adapter5.substr(apstart, aplen); |
776 |
if ((fi=seq.index(bstr))>=0) { |
777 |
//if (debug) GMessage(" last 12char of adaptor match the read!\n"); |
778 |
extendMatch(seq.chars(), rlen, fi, |
779 |
adapter5.chars(), a5len, apstart, aplen, l5,l3,true); |
780 |
return true; |
781 |
} |
782 |
//no easy cases, find a triplet match as a seed for alignment extension |
783 |
//find triplets at the right end of the adapter |
784 |
for (int iw=0;iw<6;iw++) { |
785 |
apstart=a5len-iw-4; |
786 |
GStr aword=adapter5.substr(apstart,3); |
787 |
if ((fi=seq.index(aword))>=0) { |
788 |
//if (debug) GMessage("extending wmatch :%*s\n", fi+3, aword.chars()); |
789 |
if (extendMatch(seq.chars(), rlen, fi, adapter5.chars(), |
790 |
a5len, apstart, 3, l5,l3,true)) { |
791 |
return true; |
792 |
} |
793 |
} |
794 |
} |
795 |
return false; //no adapter parts found |
796 |
} |
797 |
|
798 |
//conversion of phred64 to phread33 |
799 |
void convertQ64(GStr& q) { |
800 |
for (int i=0;i<q.length();i++) q[i]-=31; |
801 |
} |
802 |
|
803 |
void convertQ64(char* q, int len) { |
804 |
for (int i=0;i<len;i++) q[i]-=31; |
805 |
} |