1 |
gpertea |
4 |
#include "GArgs.h" |
2 |
|
|
#include "GStr.h" |
3 |
|
|
#include "GHash.hh" |
4 |
|
|
#include "GList.hh" |
5 |
|
|
|
6 |
|
|
#define USAGE "Usage:\n\ |
7 |
|
|
fqtrim [-5 <5'adapter>] [-3 <3'adapter>] [-l <minlen>] [-C] [-D] [-Q] \\\n\ |
8 |
|
|
[-n <rename_prefix>] [-o <trimmed.fq>] [-r <discarded.lst>] <input.fq>\n\ |
9 |
|
|
\n\ |
10 |
|
|
Trim low quality bases at 3' end, optional adapter, filter for low complexity and\n\ |
11 |
|
|
optionally collapse duplicates in the input fastq data.\n\ |
12 |
|
|
\n\ |
13 |
|
|
Options:\n\ |
14 |
|
|
-n rename all the reads using the <prefix> followed by a read counter;\n\ |
15 |
|
|
if -C option was given, the suffix \"_x<N>\" is appended, with <N> being\n\ |
16 |
|
|
the read duplication count\n\ |
17 |
|
|
-o write the trimmed/filtered fastq into <trimmed.fq>(instead of stdout)\n\ |
18 |
|
|
-5 trim the given adapter or primer sequence at the 5' end of each read\n\ |
19 |
|
|
(e.g. -5 CGACAGGTTCAGAGTTCTACAGTCCGACGATC)\n\ |
20 |
|
|
-3 trim the given adapter sequence at the 3' end of each read\n\ |
21 |
|
|
(e.g. -3 TCGTATGCCGTCTTCTGCTTG)\n\ |
22 |
|
|
-l minimum \"clean\" length at the high quality 5'end that a read must\n\ |
23 |
|
|
have in order to pass the filter (default: 16)\n\ |
24 |
|
|
-r write a simple \"trash report\" file listing the discarded reads with a\n\ |
25 |
|
|
one letter code for the reason why they were trashed\n\ |
26 |
|
|
-C collapse duplicate reads and add a prefix with their count to the read\n\ |
27 |
|
|
name\n\ |
28 |
|
|
-D apply a low-complexity (dust) filter and discard any read that has at\n\ |
29 |
|
|
least half of it masked by this\n\ |
30 |
|
|
-Q quality values in the input data are interpreted as phred64, convert\n\ |
31 |
|
|
them to phred33\n\ |
32 |
|
|
" |
33 |
|
|
// example 3' adapter for miRNAs: TCGTATGCCGTCTTCTGCTTG |
34 |
|
|
FILE* f_out=NULL; //stdout if not provided |
35 |
|
|
FILE* f_in=NULL; //input fastq (stdin if not provided) |
36 |
|
|
FILE* freport=NULL; |
37 |
|
|
bool debug=false; |
38 |
|
|
bool doCollapse=false; |
39 |
|
|
bool doDust=false; |
40 |
|
|
bool trashReport=false; |
41 |
|
|
bool rawFormat=false; |
42 |
|
|
int min_read_len=16; |
43 |
|
|
int dust_cutoff=16; |
44 |
|
|
bool isfasta=false; |
45 |
|
|
bool phred64=false; |
46 |
|
|
GStr prefix; |
47 |
|
|
GStr adapter3; |
48 |
|
|
GStr adapter5; |
49 |
|
|
int a3len=0; |
50 |
|
|
int a5len=0; |
51 |
|
|
// adaptor matching metrics -- for extendMatch() function |
52 |
|
|
static const int a_m_score=2; //match score |
53 |
|
|
static const int a_mis_score=-3; //mismatch |
54 |
|
|
static const int a_dropoff_score=7; |
55 |
|
|
static const int a_min_score=7; |
56 |
|
|
|
57 |
|
|
// element in dhash: |
58 |
|
|
class FqDupRec { |
59 |
|
|
public: |
60 |
|
|
int count; //how many of these reads are the same |
61 |
|
|
int len; //length of qv |
62 |
|
|
char* firstname; //optional, only if we want to keep the original read names |
63 |
|
|
char* qv; |
64 |
|
|
FqDupRec(GStr* qstr=NULL, const char* rname=NULL) { |
65 |
|
|
len=0; |
66 |
|
|
qv=NULL; |
67 |
|
|
firstname=NULL; |
68 |
|
|
count=0; |
69 |
|
|
if (qstr!=NULL) { |
70 |
|
|
qv=Gstrdup(qstr->chars()); |
71 |
|
|
len=qstr->length(); |
72 |
|
|
count++; |
73 |
|
|
} |
74 |
|
|
if (rname!=NULL) firstname=Gstrdup(rname); |
75 |
|
|
} |
76 |
|
|
~FqDupRec() { |
77 |
|
|
GFREE(qv); |
78 |
|
|
GFREE(firstname); |
79 |
|
|
} |
80 |
|
|
void add(GStr& d) { //collapse another record into this one |
81 |
|
|
if (d.length()!=len) |
82 |
|
|
GError("Error at FqDupRec::add(): cannot collapse reads with different length!\n"); |
83 |
|
|
count++; |
84 |
|
|
for (int i=0;i<len;i++) |
85 |
|
|
qv[i]=(qv[i]+d[i])/2; |
86 |
|
|
} |
87 |
|
|
}; |
88 |
|
|
|
89 |
|
|
void openfw(FILE* &f, GArgs& args, char opt) { |
90 |
|
|
GStr s=args.getOpt(opt); |
91 |
|
|
if (!s.is_empty()) { |
92 |
|
|
if (s=='-') f=stdout; |
93 |
|
|
else { |
94 |
|
|
f=fopen(s,"w"); |
95 |
|
|
if (f==NULL) GError("Error creating file: %s\n", s.chars()); |
96 |
|
|
} |
97 |
|
|
} |
98 |
|
|
} |
99 |
|
|
|
100 |
|
|
#define FWCLOSE(fh) if (fh!=NULL && fh!=stdout) fclose(fh) |
101 |
|
|
#define FRCLOSE(fh) if (fh!=NULL && fh!=stdin) fclose(fh) |
102 |
|
|
|
103 |
|
|
GHash<FqDupRec> dhash; //hash to keep track of duplicates |
104 |
|
|
|
105 |
|
|
bool ntrim(GStr& rseq, GStr &rqv, int &l5, int &l3); //returns true if any trimming occured |
106 |
|
|
|
107 |
|
|
int dust(GStr& seq); |
108 |
|
|
bool trim_adapter3(GStr& seq, int &l5, int &l3); //returns true if any trimming occured |
109 |
|
|
bool trim_adapter5(GStr& seq, int &l5, int &l3); //returns true if any trimming occured |
110 |
|
|
|
111 |
|
|
void convertQ64(char* q, int len); |
112 |
|
|
void convertQ64(GStr& q); |
113 |
|
|
|
114 |
|
|
int main(int argc, char * const argv[]) { |
115 |
gpertea |
9 |
GArgs args(argc, argv, "YQRDCl:d:3:5:n:r:o:"); |
116 |
gpertea |
4 |
int e; |
117 |
|
|
int icounter=0; //counter for input reads |
118 |
|
|
int outcounter=0; //counter for output reads |
119 |
|
|
if ((e=args.isError())>0) { |
120 |
|
|
GMessage("%s\nInvalid argument: %s\n", USAGE, argv[e]); |
121 |
|
|
exit(224); |
122 |
|
|
} |
123 |
gpertea |
9 |
debug=(args.getOpt('Y')!=NULL); |
124 |
gpertea |
4 |
phred64=(args.getOpt('Q')!=NULL); |
125 |
|
|
doCollapse=(args.getOpt('C')!=NULL); |
126 |
|
|
doDust=(args.getOpt('D')!=NULL); |
127 |
|
|
rawFormat=(args.getOpt('R')!=NULL); |
128 |
|
|
if (rawFormat) { |
129 |
|
|
GError("Sorry, raw qseq format parsing is not implemented yet!\n"); |
130 |
|
|
} |
131 |
|
|
prefix=args.getOpt('n'); |
132 |
|
|
GStr s=args.getOpt('l'); |
133 |
|
|
if (!s.is_empty()) |
134 |
|
|
min_read_len=s.asInt(); |
135 |
|
|
s=args.getOpt('d'); |
136 |
|
|
if (!s.is_empty()) { |
137 |
|
|
dust_cutoff=s.asInt(); |
138 |
|
|
doDust=true; |
139 |
|
|
} |
140 |
|
|
|
141 |
|
|
if (args.getOpt('3')!=NULL) { |
142 |
|
|
adapter3=args.getOpt('3'); |
143 |
|
|
adapter3.upper(); |
144 |
|
|
a3len=adapter3.length(); |
145 |
|
|
} |
146 |
|
|
if (args.getOpt('5')!=NULL) { |
147 |
|
|
adapter5=args.getOpt('5'); |
148 |
|
|
adapter5.upper(); |
149 |
|
|
a5len=adapter5.length(); |
150 |
|
|
} |
151 |
|
|
trashReport= (args.getOpt('r')!=NULL); |
152 |
|
|
if (args.startNonOpt()==0) { |
153 |
|
|
GMessage(USAGE); |
154 |
|
|
exit(224); |
155 |
|
|
} |
156 |
|
|
|
157 |
|
|
openfw(f_out, args, 'o'); |
158 |
|
|
if (trashReport) |
159 |
|
|
openfw(freport, args, 'r'); |
160 |
|
|
char* infile=NULL; |
161 |
|
|
while ((infile=args.nextNonOpt())!=NULL) { |
162 |
|
|
GStr infname(infile); |
163 |
|
|
if (strcmp(infile,"-")==0) { |
164 |
|
|
f_in=stdin; infname="stdin"; } |
165 |
|
|
else { |
166 |
|
|
f_in=fopen(infile,"r"); |
167 |
|
|
if (f_in==NULL) |
168 |
|
|
GError("Cannot open input file %s!\n",infile); |
169 |
|
|
} |
170 |
|
|
GLineReader fq(f_in); |
171 |
|
|
char* l=NULL; |
172 |
|
|
while ((l=fq.getLine())!=NULL) { |
173 |
|
|
GStr rname; //current read name |
174 |
|
|
GStr rseq; //current read sequence |
175 |
|
|
GStr rqv; //current read quality values |
176 |
|
|
GStr s; |
177 |
|
|
if (rawFormat) { |
178 |
|
|
//TODO: implement qseq parsing here |
179 |
|
|
//if (raw type=N) then continue; //skip invalid/bad records |
180 |
|
|
|
181 |
|
|
} //raw qseq format |
182 |
|
|
else { // FASTQ or FASTA |
183 |
|
|
isfasta=(l[0]=='>'); |
184 |
|
|
if (!isfasta && l[0]!='@') GError("Error: fastq record marker not detected!\n"); |
185 |
|
|
s=l; |
186 |
|
|
rname=&(l[1]); |
187 |
|
|
icounter++; |
188 |
|
|
//GMessage("readname=%s\n",rname.chars()); |
189 |
|
|
for (int i=0;i<rname.length();i++) |
190 |
|
|
if (rname[i]<=' ') { rname.cut(i); break; } |
191 |
|
|
//now get the sequence |
192 |
|
|
if ((l=fq.getLine())==NULL) |
193 |
|
|
GError("Error: unexpected EOF after header for %s\n",rname.chars()); |
194 |
|
|
rseq=l; //this must be the DNA line |
195 |
|
|
if (isfasta) { |
196 |
|
|
while ((l=fq.getLine())!=NULL) { |
197 |
|
|
//fasta can have multiple sequence lines |
198 |
|
|
if (l[0]=='>') { |
199 |
|
|
fq.pushBack(); |
200 |
|
|
break; // |
201 |
|
|
} |
202 |
|
|
rseq+=l; |
203 |
|
|
} |
204 |
|
|
} //multi-line fasta file |
205 |
|
|
if (!isfasta) { |
206 |
|
|
if ((l=fq.getLine())==NULL) |
207 |
|
|
GError("Error: unexpected EOF after sequence for %s\n", rname.chars()); |
208 |
|
|
if (l[0]!='+') GError("Error: fastq qv header marker not detected!\n"); |
209 |
|
|
if ((l=fq.getLine())==NULL) |
210 |
|
|
GError("Error: unexpected EOF after qv header for %s\n", rname.chars()); |
211 |
|
|
rqv=l; |
212 |
|
|
if (rqv.length()!=rseq.length()) |
213 |
|
|
GError("Error: qv len != seq len for %s\n", rname.chars()); |
214 |
|
|
} |
215 |
|
|
} //<-- FASTA or FASTQ |
216 |
|
|
rseq.upper(); |
217 |
|
|
int l5=0; |
218 |
|
|
int l3=rseq.length()-1; |
219 |
|
|
if (l3-l5+1<min_read_len) { |
220 |
|
|
if (trashReport) { |
221 |
|
|
fprintf(freport, "%s\ts\t%s\n",rname.chars(), rseq.chars()); |
222 |
|
|
} |
223 |
|
|
continue; |
224 |
|
|
} |
225 |
|
|
if (ntrim(rseq, rqv, l5, l3)) { |
226 |
|
|
if (l3-l5+1<min_read_len) { |
227 |
|
|
if (trashReport) { |
228 |
|
|
fprintf(freport, "%s\tN\t%s\n", rname.chars(), rseq.chars()); |
229 |
|
|
} |
230 |
|
|
continue; //invalid read |
231 |
|
|
} |
232 |
|
|
//-- keep only the l5..l3 range |
233 |
|
|
rseq=rseq.substr(l5, l3-l5+1); |
234 |
|
|
if (!rqv.is_empty()) |
235 |
|
|
rqv=rqv.substr(l5, l3-l5+1); |
236 |
|
|
} |
237 |
|
|
|
238 |
|
|
if (a3len>0) { |
239 |
|
|
if (trim_adapter3(rseq, l5, l3)) { |
240 |
|
|
if (l3-l5+1<min_read_len) { |
241 |
|
|
if (trashReport) { |
242 |
|
|
fprintf(freport, "%s\tA3\t%s\n",rname.chars(), rseq.chars()); |
243 |
|
|
} |
244 |
|
|
continue; |
245 |
|
|
} |
246 |
|
|
//-- keep only the l5..l3 range |
247 |
|
|
rseq=rseq.substr(l5, l3-l5+1); |
248 |
|
|
if (!rqv.is_empty()) |
249 |
|
|
rqv=rqv.substr(l5, l3-l5+1); |
250 |
|
|
}//some adapter was trimmed |
251 |
|
|
} //adapter trimming |
252 |
|
|
if (a5len>0) { |
253 |
|
|
if (trim_adapter5(rseq, l5, l3)) { |
254 |
|
|
if (l3-l5+1<min_read_len) { |
255 |
|
|
if (trashReport) { |
256 |
|
|
fprintf(freport, "%s\tA5\t%s\n",rname.chars(), rseq.chars()); |
257 |
|
|
} |
258 |
|
|
continue; |
259 |
|
|
} |
260 |
|
|
//-- keep only the l5..l3 range |
261 |
|
|
rseq=rseq.substr(l5, l3-l5+1); |
262 |
|
|
if (!rqv.is_empty()) |
263 |
|
|
rqv=rqv.substr(l5, l3-l5+1); |
264 |
|
|
}//some adapter was trimmed |
265 |
|
|
} //adapter trimming |
266 |
|
|
if (doCollapse) { |
267 |
|
|
//keep read for later |
268 |
|
|
FqDupRec* dr=dhash.Find(rseq.chars()); |
269 |
|
|
if (dr==NULL) { //new entry |
270 |
|
|
//if (prefix.is_empty()) |
271 |
|
|
dhash.Add(rseq.chars(), |
272 |
|
|
new FqDupRec(&rqv, rname.chars())); |
273 |
|
|
//else dhash.Add(rseq.chars(), new FqDupRec(rqv.chars(),rqv.length())); |
274 |
|
|
} |
275 |
|
|
else |
276 |
|
|
dr->add(rqv); |
277 |
|
|
} //collapsing duplicates |
278 |
|
|
else { //not collapsing duplicates |
279 |
|
|
//do the dust filter now |
280 |
|
|
if (doDust) { |
281 |
|
|
int dustbases=dust(rseq); |
282 |
|
|
if (dustbases>(rseq.length()>>1)) { |
283 |
|
|
if (trashReport) { |
284 |
|
|
fprintf(freport, "%s\tD\t%s\n",rname.chars(),rseq.chars()); |
285 |
|
|
} |
286 |
|
|
continue; |
287 |
|
|
} |
288 |
|
|
} |
289 |
|
|
//print this record here |
290 |
|
|
outcounter++; |
291 |
|
|
if (isfasta) { |
292 |
|
|
if (prefix.is_empty()) |
293 |
|
|
fprintf(f_out, ">%s\n%s\n", rname.chars(), rseq.chars()); |
294 |
|
|
else |
295 |
|
|
fprintf(f_out, ">%s%08d\n%s\n", prefix.chars(), outcounter, |
296 |
|
|
rseq.chars()); |
297 |
|
|
} |
298 |
|
|
else { //fastq |
299 |
|
|
if (phred64) convertQ64(rqv); |
300 |
|
|
if (prefix.is_empty()) |
301 |
|
|
fprintf(f_out, "@%s\n%s\n+\n%s\n", rname.chars(), rseq.chars(),rqv.chars()); |
302 |
|
|
else |
303 |
|
|
fprintf(f_out, "@%s_%08d\n%s\n+\n%s\n", prefix.chars(), outcounter, |
304 |
|
|
rseq.chars(),rqv.chars() ); |
305 |
|
|
} |
306 |
|
|
} //not collapsing duplicates |
307 |
|
|
} //for each fastq record |
308 |
|
|
} //while each input file |
309 |
|
|
FRCLOSE(f_in); |
310 |
|
|
if (doCollapse) { |
311 |
|
|
outcounter=0; |
312 |
|
|
int maxdup_count=1; |
313 |
|
|
char* maxdup_seq=NULL; |
314 |
|
|
dhash.startIterate(); |
315 |
|
|
FqDupRec* qd=NULL; |
316 |
|
|
char* seq=NULL; |
317 |
|
|
while ((qd=dhash.NextData(seq))!=NULL) { |
318 |
|
|
GStr rseq(seq); |
319 |
|
|
//do the dusting here |
320 |
|
|
if (doDust) { |
321 |
|
|
int dustbases=dust(rseq); |
322 |
|
|
if (dustbases>(rseq.length()>>1)) { |
323 |
|
|
if (trashReport && qd->firstname!=NULL) { |
324 |
|
|
fprintf(freport, "%s_x%d\tD\t%s\n",qd->firstname, qd->count,seq); |
325 |
|
|
} |
326 |
|
|
continue; |
327 |
|
|
} |
328 |
|
|
} |
329 |
|
|
outcounter++; |
330 |
|
|
if (qd->count>maxdup_count) { |
331 |
|
|
maxdup_count=qd->count; |
332 |
|
|
maxdup_seq=seq; |
333 |
|
|
} |
334 |
|
|
if (isfasta) { |
335 |
|
|
if (prefix.is_empty()) { |
336 |
|
|
fprintf(f_out, "@%s_x%d\n%s\n", qd->firstname, qd->count, |
337 |
|
|
rseq.chars()); |
338 |
|
|
} |
339 |
|
|
else { //use custom read name |
340 |
|
|
fprintf(f_out, "@%s%08d_x%d\n%s\n", prefix.chars(), outcounter, |
341 |
|
|
qd->count, rseq.chars()); |
342 |
|
|
} |
343 |
|
|
} |
344 |
|
|
else { //fastq format |
345 |
|
|
if (phred64) convertQ64(qd->qv, qd->len); |
346 |
|
|
if (prefix.is_empty()) { |
347 |
|
|
fprintf(f_out, "@%s_x%d\n%s\n+\n%s\n", qd->firstname, qd->count, |
348 |
|
|
rseq.chars(), qd->qv); |
349 |
|
|
} |
350 |
|
|
else { //use custom read name |
351 |
|
|
fprintf(f_out, "@%s%08d_x%d\n%s\n+\n%s\n", prefix.chars(), outcounter, |
352 |
|
|
qd->count, rseq.chars(), qd->qv); |
353 |
|
|
} |
354 |
|
|
} |
355 |
|
|
}//for each element of dhash |
356 |
|
|
if (maxdup_count>1) { |
357 |
|
|
GMessage("Maximum read multiplicity: x %d (read: %s)\n",maxdup_count, maxdup_seq); |
358 |
|
|
} |
359 |
|
|
} //report collapsed dhash entries |
360 |
|
|
GMessage("Number of input reads: %9d\n", icounter); |
361 |
|
|
GMessage(" Output records: %9d\n", outcounter); |
362 |
|
|
if (trashReport) { |
363 |
|
|
FWCLOSE(freport); |
364 |
|
|
} |
365 |
|
|
|
366 |
|
|
FWCLOSE(f_out); |
367 |
|
|
} |
368 |
|
|
|
369 |
|
|
class NData { |
370 |
|
|
public: |
371 |
|
|
int NPos[1024]; //there should be no reads longer than 1K ? |
372 |
|
|
int NCount; |
373 |
|
|
int end5; |
374 |
|
|
int end3; |
375 |
|
|
int seqlen; |
376 |
|
|
double perc_N; //percentage of Ns in end5..end3 range only! |
377 |
|
|
const char* seq; |
378 |
|
|
bool valid; |
379 |
|
|
NData() { |
380 |
|
|
NCount=0; |
381 |
|
|
end5=0; |
382 |
|
|
end3=0; |
383 |
|
|
seq=NULL; |
384 |
|
|
perc_N=0; |
385 |
|
|
valid=true; |
386 |
|
|
} |
387 |
|
|
void init(GStr& rseq) { |
388 |
|
|
NCount=0; |
389 |
|
|
perc_N=0; |
390 |
|
|
seqlen=rseq.length(); |
391 |
|
|
seq=rseq.chars(); |
392 |
|
|
end5=1; |
393 |
|
|
end3=seqlen; |
394 |
|
|
for (int i=0;i<seqlen;i++) |
395 |
|
|
if (rseq[i]=='N') {// if (!ichrInStr(rseq[i], "ACGT") |
396 |
|
|
NPos[NCount]=i; |
397 |
|
|
NCount++; |
398 |
|
|
} |
399 |
|
|
perc_N=(NCount*100.0)/seqlen; |
400 |
|
|
valid=true; |
401 |
|
|
} |
402 |
|
|
void N_calc() { //only in the region after trimming |
403 |
|
|
int n=0; |
404 |
|
|
for (int i=end3-1;i<end5;i++) { |
405 |
|
|
if (seq[i]=='N') n++; |
406 |
|
|
} |
407 |
|
|
perc_N=(n*100.0)/(end5-end3+1); |
408 |
|
|
} |
409 |
|
|
}; |
410 |
|
|
|
411 |
|
|
static NData feat; |
412 |
|
|
int perc_lenN=12; // incremental distance from ends, in percentage of |
413 |
|
|
// sequence length, where N-trimming is done (default:12 %) (autolimited to 20) |
414 |
|
|
|
415 |
|
|
void N_analyze(int l5, int l3, int p5, int p3) { |
416 |
|
|
/* assumes feat was filled properly */ |
417 |
|
|
int old_dif, t5,t3,v; |
418 |
|
|
if (l3-l5<min_read_len || p5>p3 ) { |
419 |
|
|
feat.end5=l5+1; |
420 |
|
|
feat.end3=l3+1; |
421 |
|
|
return; |
422 |
|
|
} |
423 |
|
|
|
424 |
|
|
t5=feat.NPos[p5]-l5; |
425 |
|
|
t3=l3-feat.NPos[p3]; |
426 |
|
|
old_dif=p3-p5; |
427 |
|
|
v=(int)((((double)(l3-l5))*perc_lenN)/100); |
428 |
|
|
if (v>20) v=20; /* enforce N-search limit for very long reads */ |
429 |
|
|
if (t5 < v ) { |
430 |
|
|
l5=feat.NPos[p5]+1; |
431 |
|
|
p5++; |
432 |
|
|
} |
433 |
|
|
if (t3 < v) { |
434 |
|
|
l3=feat.NPos[p3]-1; |
435 |
|
|
p3--; |
436 |
|
|
} |
437 |
|
|
/* restNs=p3-p5; number of Ns in the new CLR */ |
438 |
|
|
if (p3-p5==old_dif) { /* no change, return */ |
439 |
|
|
/* don't trim if this may shorten the read too much */ |
440 |
|
|
//if (l5-l3<min_read_len) return; |
441 |
|
|
feat.end5=l5+1; |
442 |
|
|
feat.end3=l3+1; |
443 |
|
|
return; |
444 |
|
|
} |
445 |
|
|
else |
446 |
|
|
N_analyze(l5,l3, p5,p3); |
447 |
|
|
} |
448 |
|
|
|
449 |
|
|
|
450 |
|
|
bool ntrim(GStr& rseq, GStr &rqv, int &l5, int &l3) { |
451 |
|
|
//count Ns in the sequence |
452 |
|
|
feat.init(rseq); |
453 |
|
|
l5=feat.end5-1; |
454 |
|
|
l3=feat.end3-1; |
455 |
|
|
N_analyze(feat.end5-1, feat.end3-1, 0, feat.NCount-1); |
456 |
|
|
if (l5==feat.end5-1 && l3==feat.end3-1) |
457 |
|
|
return false; //nothing changed |
458 |
|
|
l5=feat.end5-1; |
459 |
|
|
l3=feat.end3-1; |
460 |
|
|
if (l3-l5+1<min_read_len) { |
461 |
|
|
feat.valid=false; |
462 |
|
|
return true; |
463 |
|
|
} |
464 |
|
|
feat.N_calc(); |
465 |
|
|
if (feat.perc_N>6.2) { //not valid if more than 1 N per 16 bases |
466 |
|
|
feat.valid=false; |
467 |
|
|
l3=l5+1; |
468 |
|
|
return true; |
469 |
|
|
} |
470 |
|
|
return true; |
471 |
|
|
} |
472 |
|
|
|
473 |
|
|
//--------------- dust functions ---------------- |
474 |
|
|
class DNADuster { |
475 |
|
|
public: |
476 |
|
|
int dustword; |
477 |
|
|
int dustwindow; |
478 |
|
|
int dustwindow2; |
479 |
|
|
int dustcutoff; |
480 |
|
|
int mv, iv, jv; |
481 |
|
|
int counts[32*32*32]; |
482 |
|
|
int iis[32*32*32]; |
483 |
|
|
DNADuster(int cutoff=16, int winsize=32, int wordsize=3) { |
484 |
|
|
dustword=wordsize; |
485 |
|
|
dustwindow=winsize; |
486 |
|
|
dustwindow2 = (winsize>>1); |
487 |
|
|
dustcutoff=cutoff; |
488 |
|
|
mv=0; |
489 |
|
|
iv=0; |
490 |
|
|
jv=0; |
491 |
|
|
} |
492 |
|
|
void setWindowSize(int value) { |
493 |
|
|
dustwindow = value; |
494 |
|
|
dustwindow2 = (dustwindow >> 1); |
495 |
|
|
} |
496 |
|
|
void setWordSize(int value) { |
497 |
|
|
dustword=value; |
498 |
|
|
} |
499 |
|
|
void wo1(int len, const char* s, int ivv) { |
500 |
|
|
int i, ii, j, v, t, n, n1, sum; |
501 |
|
|
int js, nis; |
502 |
|
|
n = 32 * 32 * 32; |
503 |
|
|
n1 = n - 1; |
504 |
|
|
nis = 0; |
505 |
|
|
i = 0; |
506 |
|
|
ii = 0; |
507 |
|
|
sum = 0; |
508 |
|
|
v = 0; |
509 |
|
|
for (j=0; j < len; j++, s++) { |
510 |
|
|
ii <<= 5; |
511 |
|
|
if (*s<=32) { |
512 |
|
|
i=0; |
513 |
|
|
continue; |
514 |
|
|
} |
515 |
|
|
ii |= *s - 'A'; //assume uppercase! |
516 |
|
|
ii &= n1; |
517 |
|
|
i++; |
518 |
|
|
if (i >= dustword) { |
519 |
|
|
for (js=0; js < nis && iis[js] != ii; js++) ; |
520 |
|
|
if (js == nis) { |
521 |
|
|
iis[nis] = ii; |
522 |
|
|
counts[ii] = 0; |
523 |
|
|
nis++; |
524 |
|
|
} |
525 |
|
|
if ((t = counts[ii]) > 0) { |
526 |
|
|
sum += t; |
527 |
|
|
v = 10 * sum / j; |
528 |
|
|
if (mv < v) { |
529 |
|
|
mv = v; |
530 |
|
|
iv = ivv; |
531 |
|
|
jv = j; |
532 |
|
|
} |
533 |
|
|
} |
534 |
|
|
counts[ii]++; |
535 |
|
|
} |
536 |
|
|
} |
537 |
|
|
} |
538 |
|
|
|
539 |
|
|
int wo(int len, const char* s, int* beg, int* end) { |
540 |
|
|
int i, l1; |
541 |
|
|
l1 = len - dustword + 1; |
542 |
|
|
if (l1 < 0) { |
543 |
|
|
*beg = 0; |
544 |
|
|
*end = len - 1; |
545 |
|
|
return 0; |
546 |
|
|
} |
547 |
|
|
mv = 0; |
548 |
|
|
iv = 0; |
549 |
|
|
jv = 0; |
550 |
|
|
for (i=0; i < l1; i++) { |
551 |
|
|
wo1(len-i, s+i, i); |
552 |
|
|
} |
553 |
|
|
*beg = iv; |
554 |
|
|
*end = iv + jv; |
555 |
|
|
return mv; |
556 |
|
|
} |
557 |
|
|
|
558 |
|
|
void dust(const char* seq, char* seqmsk, int seqlen, int cutoff=0) { //, maskFunc maskfn) { |
559 |
|
|
int i, j, l, a, b, v; |
560 |
|
|
if (cutoff==0) cutoff=dustcutoff; |
561 |
|
|
a=0;b=0; |
562 |
|
|
//GMessage("Dust cutoff=%d\n", cutoff); |
563 |
|
|
for (i=0; i < seqlen; i += dustwindow2) { |
564 |
|
|
l = (seqlen > i+dustwindow) ? dustwindow : seqlen-i; |
565 |
|
|
v = wo(l, seq+i, &a, &b); |
566 |
|
|
if (v > cutoff) { |
567 |
|
|
//for (j = a; j <= b && j < dustwindow2; j++) { |
568 |
|
|
for (j = a; j <= b; j++) { |
569 |
|
|
seqmsk[i+j]='N';//could be made lowercase instead |
570 |
|
|
} |
571 |
|
|
} |
572 |
|
|
} |
573 |
|
|
//return first; |
574 |
|
|
} |
575 |
|
|
}; |
576 |
|
|
|
577 |
|
|
static DNADuster duster; |
578 |
|
|
|
579 |
|
|
int dust(GStr& rseq) { |
580 |
|
|
char* seq=Gstrdup(rseq.chars()); |
581 |
|
|
duster.dust(rseq.chars(), seq, rseq.length(), dust_cutoff); |
582 |
|
|
//check the number of Ns: |
583 |
|
|
int ncount=0; |
584 |
|
|
for (int i=0;i<rseq.length();i++) { |
585 |
|
|
if (seq[i]=='N') ncount++; |
586 |
|
|
} |
587 |
|
|
GFREE(seq); |
588 |
|
|
return ncount; |
589 |
|
|
} |
590 |
|
|
|
591 |
|
|
// ------------------ adapter matching - triplet matching |
592 |
|
|
//look for matching triplets amongst the first 9 nucleotides of the 3' adaptor |
593 |
|
|
// or the last 9 nucleotides for the 5' adapter |
594 |
|
|
//when a triplet match is found, simply try to extend the alignment using a drop-off scheme |
595 |
|
|
// check minimum score and |
596 |
|
|
// for 3' adapter trimming: |
597 |
|
|
// require that the right end of the alignment for either the adaptor OR the read must be |
598 |
|
|
// < 3 distance from its right end |
599 |
|
|
// for 5' adapter trimming: |
600 |
|
|
// require that the left end of the alignment for either the adaptor OR the read must |
601 |
|
|
// be at coordinate 0 |
602 |
|
|
|
603 |
|
|
bool extendMatch(const char* a, int alen, int ai, |
604 |
gpertea |
9 |
const char* b, int blen, int bi, int mlen, int& l5, int& l3, bool end5=false) { |
605 |
gpertea |
4 |
//so the alignment starts at ai in a, bi in b, with a perfect match of length mlen |
606 |
gpertea |
9 |
//if (debug) |
607 |
|
|
// GStr dbg(b); |
608 |
gpertea |
4 |
//GMessage(">> in %s\n\textending hit: %s at position %d\n", a, (dbg.substr(ai, mlen)).chars(), ai); |
609 |
|
|
int a_l=ai; //alignment coordinates on a |
610 |
|
|
int a_r=ai+mlen-1; |
611 |
|
|
int b_l=bi; //alignment coordinates on b |
612 |
|
|
int b_r=bi+mlen-1; |
613 |
|
|
int ai_maxscore=ai; |
614 |
|
|
int bi_maxscore=bi; |
615 |
|
|
int score=mlen*a_m_score; |
616 |
|
|
int maxscore=score; |
617 |
gpertea |
9 |
int mism5score=a_mis_score; |
618 |
|
|
if (end5 && ai<(alen>>1)) mism5score-=2; // increase penalty for mismatches at 5' end |
619 |
gpertea |
4 |
//try to extend to the left first, if possible |
620 |
|
|
while (ai>0 && bi>0) { |
621 |
|
|
ai--; |
622 |
|
|
bi--; |
623 |
|
|
score+= (a[ai]==b[bi])? a_m_score : mism5score; |
624 |
|
|
if (score>maxscore) { |
625 |
|
|
ai_maxscore=ai; |
626 |
|
|
bi_maxscore=bi; |
627 |
|
|
maxscore=score; |
628 |
|
|
} |
629 |
|
|
else if (maxscore-score>a_dropoff_score) break; |
630 |
|
|
} |
631 |
|
|
a_l=ai_maxscore; |
632 |
|
|
b_l=bi_maxscore; |
633 |
gpertea |
9 |
//if (debug) GMessage(" after l-extend: %*s%s\t\t(score=%d)\n",a_l," ",dbg.substr(b_l,b_r-b_l+1).chars(),maxscore); |
634 |
gpertea |
4 |
//now extend to the right |
635 |
|
|
ai_maxscore=a_r; |
636 |
|
|
bi_maxscore=b_r; |
637 |
|
|
ai=a_r; |
638 |
|
|
bi=b_r; |
639 |
|
|
score=maxscore; |
640 |
|
|
//sometimes there are extra AAAAs at the end of the read, ignore those |
641 |
|
|
if (strcmp(&a[alen-4],"AAAA")==0) { |
642 |
|
|
alen-=3; |
643 |
|
|
while (a[alen-1]=='A' && alen>ai) alen--; |
644 |
|
|
} |
645 |
|
|
while (ai<alen-1 && bi<blen-1) { |
646 |
|
|
ai++; |
647 |
|
|
bi++; |
648 |
|
|
//score+= (a[ai]==b[bi])? a_m_score : a_mis_score; |
649 |
|
|
if (a[ai]==b[bi]) { //match |
650 |
|
|
score+=a_m_score; |
651 |
|
|
if (ai>=alen-2) { |
652 |
|
|
score+=a_m_score-(alen-ai-1); |
653 |
|
|
} |
654 |
|
|
} |
655 |
|
|
else { //mismatch |
656 |
|
|
score+=a_mis_score; |
657 |
|
|
} |
658 |
|
|
if (score>maxscore) { |
659 |
|
|
ai_maxscore=ai; |
660 |
|
|
bi_maxscore=bi; |
661 |
|
|
maxscore=score; |
662 |
|
|
} |
663 |
|
|
else if (maxscore-score>a_dropoff_score) break; |
664 |
|
|
} |
665 |
|
|
a_r=ai_maxscore; |
666 |
|
|
b_r=bi_maxscore; |
667 |
gpertea |
9 |
int a_ovh3=alen-a_r-1; |
668 |
|
|
int b_ovh3=blen-b_r-1; |
669 |
|
|
int mmovh3=(a_ovh3<b_ovh3)? a_ovh3 : b_ovh3; |
670 |
|
|
int mmovh5=(a_l<b_l)? a_l : b_l; |
671 |
|
|
//if (debug) GMessage(" after r-extend: %*s%s\t\t(score=%d)\n",a_l," ",dbg.substr(b_l,b_r-b_l+1).chars(),maxscore); |
672 |
|
|
if (maxscore>=a_min_score && mmovh3<2 && mmovh5<2) { |
673 |
|
|
if (a_l<a_ovh3) { |
674 |
gpertea |
4 |
//adapter closer to the left end (typical for 5' adapter) |
675 |
|
|
l5=a_r+1; |
676 |
|
|
l3=alen-1; |
677 |
|
|
} |
678 |
|
|
else { |
679 |
|
|
//adapter matching at the right end (typical for 3' adapter) |
680 |
|
|
l5=0; |
681 |
|
|
l3=a_l-1; |
682 |
|
|
} |
683 |
|
|
return true; |
684 |
|
|
} |
685 |
|
|
return false; |
686 |
|
|
} |
687 |
|
|
|
688 |
|
|
bool trim_adapter3(GStr& seq, int&l5, int &l3) { |
689 |
|
|
int rlen=seq.length(); |
690 |
|
|
l5=0; |
691 |
|
|
l3=rlen-1; |
692 |
|
|
//first try a full match, we might get lucky |
693 |
|
|
int fi=-1; |
694 |
|
|
if ((fi=seq.index(adapter3))>=0) { |
695 |
|
|
if (fi<rlen-fi-a3len) {//match is closer to the right end |
696 |
|
|
l5=fi+a3len; |
697 |
|
|
l3=rlen-1; |
698 |
|
|
} |
699 |
|
|
else { |
700 |
|
|
l5=0; |
701 |
|
|
l3=fi-1; |
702 |
|
|
} |
703 |
|
|
return true; |
704 |
|
|
} |
705 |
|
|
//also, for fast detection of other adapter-only reads that start past |
706 |
|
|
// the beginning of the adapter sequence, try to see if the first a3len-4 |
707 |
|
|
// characters of the read are a substring of the adapter |
708 |
|
|
GStr rstart=seq.substr(0,a3len-4); |
709 |
|
|
if ((fi=adapter3.index(rstart))>=0) { |
710 |
|
|
l3=rlen-1; |
711 |
|
|
l5=a3len-4; |
712 |
|
|
while (fi+l5<a3len && l5<l3 && adapter3[fi+l5]==seq[l5]) l5++; |
713 |
|
|
return true; |
714 |
|
|
} |
715 |
|
|
|
716 |
|
|
//another easy case: first half of the adaptor matches |
717 |
|
|
int a3hlen=a3len>>1; |
718 |
|
|
GStr ahalf=adapter3.substr(0, a3hlen); |
719 |
|
|
if ((fi=seq.index(ahalf))>=0) { |
720 |
|
|
extendMatch(seq.chars(), rlen, fi, |
721 |
|
|
adapter3.chars(), a3len, 0, a3hlen, l5,l3); |
722 |
|
|
return true; |
723 |
|
|
} |
724 |
|
|
//no easy cases, so let's do the word hashing |
725 |
|
|
for (int iw=0;iw<6;iw++) { |
726 |
|
|
GStr aword=adapter3.substr(iw,3); |
727 |
|
|
if ((fi=seq.index(aword))>=0 && rlen-fi>3) { |
728 |
|
|
if (extendMatch(seq.chars(), rlen, fi, adapter3.chars(), |
729 |
|
|
a3len, iw, 3, l5,l3)) return true; |
730 |
|
|
} |
731 |
|
|
} |
732 |
|
|
return false; //no adapter parts found |
733 |
|
|
} |
734 |
|
|
|
735 |
|
|
bool trim_adapter5(GStr& seq, int&l5, int &l3) { |
736 |
gpertea |
9 |
//if (debug) GMessage("trim_adapter5 on: %s\n", seq.chars()); |
737 |
gpertea |
4 |
int rlen=seq.length(); |
738 |
|
|
l5=0; |
739 |
|
|
l3=rlen-1; |
740 |
|
|
//try to see if adapter is fully included in the read |
741 |
|
|
int fi=-1; |
742 |
|
|
if ((fi=seq.index(adapter5))>=0) { |
743 |
|
|
if (fi<rlen-fi-a5len) {//match is closer to the right end |
744 |
|
|
l5=fi+a5len; |
745 |
|
|
l3=rlen-1; |
746 |
|
|
} |
747 |
|
|
else { |
748 |
|
|
l5=0; |
749 |
|
|
l3=fi-1; |
750 |
|
|
} |
751 |
|
|
return true; |
752 |
|
|
} |
753 |
|
|
//for fast detection of adapter-rich reads, check if the first 12 |
754 |
|
|
//characters of the read are a substring of the adapter |
755 |
|
|
GStr rstart=seq.substr(1,12); |
756 |
|
|
if ((fi=adapter5.index(rstart))>=0) { |
757 |
|
|
//l3=rlen-1; |
758 |
|
|
//l5=a5len-4; |
759 |
|
|
//while (fi+l5<a5len && l5<l3 && adapter5[fi+l5]==seq[l5]) l5++; |
760 |
|
|
//return true; |
761 |
gpertea |
9 |
//if (debug) GMessage(" first 12char of the read match adaptor!\n"); |
762 |
gpertea |
4 |
extendMatch(seq.chars(), rlen, 1, |
763 |
gpertea |
9 |
adapter5.chars(), a5len, fi, 12, l5,l3, true); |
764 |
gpertea |
4 |
return true; |
765 |
|
|
} |
766 |
|
|
|
767 |
|
|
//another easy case: last 12 characters of the adaptor found as a substring of the read |
768 |
|
|
int aplen=12; |
769 |
|
|
int apstart=a5len-aplen-2; |
770 |
|
|
if (apstart<0) { apstart=0; aplen=a5len-1; } |
771 |
|
|
GStr bstr=adapter5.substr(apstart, aplen); |
772 |
|
|
if ((fi=seq.index(bstr))>=0) { |
773 |
gpertea |
9 |
//if (debug) GMessage(" last 12char of adaptor match the read!\n"); |
774 |
gpertea |
4 |
extendMatch(seq.chars(), rlen, fi, |
775 |
gpertea |
9 |
adapter5.chars(), a5len, apstart, aplen, l5,l3,true); |
776 |
gpertea |
4 |
return true; |
777 |
|
|
} |
778 |
|
|
//no easy cases, find a triplet match as a seed for alignment extension |
779 |
|
|
//find triplets at the right end of the adapter |
780 |
|
|
for (int iw=0;iw<6;iw++) { |
781 |
|
|
apstart=a5len-iw-4; |
782 |
|
|
GStr aword=adapter5.substr(apstart,3); |
783 |
|
|
if ((fi=seq.index(aword))>=0) { |
784 |
gpertea |
9 |
//if (debug) GMessage("extending wmatch :%*s\n", fi+3, aword.chars()); |
785 |
gpertea |
4 |
if (extendMatch(seq.chars(), rlen, fi, adapter5.chars(), |
786 |
gpertea |
9 |
a5len, apstart, 3, l5,l3,true)) { |
787 |
gpertea |
4 |
return true; |
788 |
|
|
} |
789 |
|
|
} |
790 |
|
|
} |
791 |
|
|
return false; //no adapter parts found |
792 |
|
|
} |
793 |
|
|
|
794 |
|
|
//conversion of phred64 to phread33 |
795 |
|
|
void convertQ64(GStr& q) { |
796 |
|
|
for (int i=0;i<q.length();i++) q[i]-=31; |
797 |
|
|
} |
798 |
|
|
|
799 |
|
|
void convertQ64(char* q, int len) { |
800 |
|
|
for (int i=0;i<len;i++) q[i]-=31; |
801 |
|
|
} |