4 |
|
#include "GList.hh" |
5 |
|
|
6 |
|
#define USAGE "Usage:\n\ |
7 |
< |
fqtrim [-5 <5'adapter>] [-3 <3'adapter>] [-l <minlen>] [-C] [-D] [-Q] \\\n\ |
8 |
< |
[-n <rename_prefix>] [-o <trimmed.fq>] [-r <discarded.lst>] <input.fq>\n\ |
7 |
> |
fqtrim [-5 <5'adapter>] [-3 <3'adapter>] [-l <minlen>] [-q <minqv>] [-C] [-D]\\\n\ |
8 |
> |
[-p {64|33}] [-n <rename_prefix>] [-o <trimmed.fq>] [-r <discarded.lst>]\\\n\ |
9 |
> |
[-Q] <input.fq>\n\ |
10 |
|
\n\ |
11 |
< |
Trim low quality bases at 3' end, optional adapter, filter for low complexity and\n\ |
12 |
< |
optionally collapse duplicates in the input fastq data.\n\ |
11 |
> |
Trim low quality bases at 3' end, optionally trim adapter sequence, filter\n\ |
12 |
> |
for low complexity and collapse duplicate reads\n\ |
13 |
|
\n\ |
14 |
|
Options:\n\ |
15 |
|
-n rename all the reads using the <prefix> followed by a read counter;\n\ |
20 |
|
(e.g. -5 CGACAGGTTCAGAGTTCTACAGTCCGACGATC)\n\ |
21 |
|
-3 trim the given adapter sequence at the 3' end of each read\n\ |
22 |
|
(e.g. -3 TCGTATGCCGTCTTCTGCTTG)\n\ |
23 |
< |
-l minimum \"clean\" length at the high quality 5'end that a read must\n\ |
24 |
< |
have in order to pass the filter (default: 16)\n\ |
23 |
> |
-q trim bases with quality value lower than <minq> (starting the 3' end)\n\ |
24 |
> |
-m maximum percentage of Ns allowed in a read after trimming (default 7)\n\ |
25 |
> |
-l minimum \"clean\" length after trimming that a read must have\n\ |
26 |
> |
in order to pass the filter (default: 16)\n\ |
27 |
|
-r write a simple \"trash report\" file listing the discarded reads with a\n\ |
28 |
|
one letter code for the reason why they were trashed\n\ |
29 |
+ |
-D apply a low-complexity (dust) filter and discard any read that has over\n\ |
30 |
+ |
50% of its length detected as low complexity\n\ |
31 |
|
-C collapse duplicate reads and add a prefix with their count to the read\n\ |
32 |
< |
name\n\ |
33 |
< |
-D apply a low-complexity (dust) filter and discard any read that has at\n\ |
34 |
< |
least half of it masked by this\n\ |
35 |
< |
-Q quality values in the input data are interpreted as phred64, convert\n\ |
31 |
< |
them to phred33\n\ |
32 |
> |
name (e.g. for microRNAs)\n\ |
33 |
> |
-p input is phred64/phred33 (use -p64 or -p33)\n\ |
34 |
> |
-Q convert quality values to the other Phred qv type\n\ |
35 |
> |
-V verbose processing\n\ |
36 |
|
" |
37 |
|
// example 3' adapter for miRNAs: TCGTATGCCGTCTTCTGCTTG |
38 |
+ |
|
39 |
+ |
//For pair ends sequencing: |
40 |
+ |
//3' : ACACTCTTTCCCTACACGACGCTCTTCCGATCT |
41 |
+ |
//5' : GATCGGAAGAGCGGTTCAGCAGGAATGCCGAG |
42 |
|
FILE* f_out=NULL; //stdout if not provided |
43 |
|
FILE* f_in=NULL; //input fastq (stdin if not provided) |
44 |
|
FILE* freport=NULL; |
45 |
|
bool debug=false; |
46 |
+ |
bool verbose=false; |
47 |
|
bool doCollapse=false; |
48 |
|
bool doDust=false; |
49 |
|
bool trashReport=false; |
50 |
< |
bool rawFormat=false; |
50 |
> |
//bool rawFormat=false; |
51 |
|
int min_read_len=16; |
52 |
+ |
double max_perc_N=7.0; |
53 |
|
int dust_cutoff=16; |
54 |
|
bool isfasta=false; |
55 |
< |
bool phred64=false; |
55 |
> |
bool convert_phred=false; |
56 |
|
GStr prefix; |
57 |
|
GStr adapter3; |
58 |
|
GStr adapter5; |
59 |
+ |
|
60 |
+ |
int qvtrim_min=0; |
61 |
+ |
|
62 |
+ |
int qv_phredtype=0; // could be 64 or 33 (0 means undetermined yet) |
63 |
+ |
int qv_cvtadd=0; //could be -31 or +31 |
64 |
+ |
|
65 |
|
int a3len=0; |
66 |
|
int a5len=0; |
67 |
|
// adaptor matching metrics -- for extendMatch() function |
68 |
< |
static const int a_m_score=2; //match score |
69 |
< |
static const int a_mis_score=-3; //mismatch |
70 |
< |
static const int a_dropoff_score=7; |
71 |
< |
static const int a_min_score=7; |
68 |
> |
const int a_m_score=2; //match score |
69 |
> |
const int a_mis_score=-3; //mismatch |
70 |
> |
const int a_dropoff_score=7; |
71 |
> |
const int a_min_score=8; //an exact match of 4 bases at the proper ends WILL be trimmed |
72 |
> |
const int a_min_chain_score=15; //for gapped alignments |
73 |
> |
|
74 |
> |
class CSegChain; |
75 |
> |
|
76 |
> |
class CSegPair { |
77 |
> |
public: |
78 |
> |
GSeg a; |
79 |
> |
GSeg b; //the adapter segment |
80 |
> |
int score; |
81 |
> |
int flags; |
82 |
> |
CSegChain* chain; |
83 |
> |
CSegPair(int astart=0, int aend=0, int bstart=0, int bend=0, int mscore=0):a(astart,aend),b(bstart, bend) { |
84 |
> |
score=mscore; |
85 |
> |
if (score==0) score=a.len()*a_m_score; |
86 |
> |
flags=0; |
87 |
> |
chain=NULL; |
88 |
> |
} |
89 |
> |
int len() { return a.len(); } |
90 |
> |
bool operator==(CSegPair& d){ |
91 |
> |
//return (a.start==d.a.start && a.end==d.a.end && b.start==d.b.start && b.end==d.b.end); |
92 |
> |
//make equal even segments that are included into one another: |
93 |
> |
return (d.a.start>=a.start && d.a.end<=a.end && d.b.start>=b.start && d.b.end<=b.end); |
94 |
> |
} |
95 |
> |
bool operator>(CSegPair& d){ //ordering based on b (adaptor) start coord and score |
96 |
> |
if (b.start==d.b.start) { |
97 |
> |
if (score==d.score) { |
98 |
> |
//just try to be consistent: |
99 |
> |
if (b.end==d.b.end) { |
100 |
> |
return (a.start==d.a.start)?(a.end<d.a.end):(a.start<d.a.start); |
101 |
> |
} |
102 |
> |
return (b.end>d.b.end); |
103 |
> |
} |
104 |
> |
else return (score<d.score); |
105 |
> |
} |
106 |
> |
return (b.start>d.b.start); |
107 |
> |
} |
108 |
> |
bool operator<(CSegPair& d){ //ordering based on b (adaptor) coord |
109 |
> |
/*if (b.start==d.b.start && b.end==d.b.end) { |
110 |
> |
return (a.start==d.a.start)?(a.end<d.a.end):(a.start<d.a.start); |
111 |
> |
} |
112 |
> |
return (b.start==d.b.start)?(b.end<d.b.end):(b.start<d.b.start);*/ |
113 |
> |
if (b.start==d.b.start) { |
114 |
> |
if (score==d.score) { |
115 |
> |
//just try to be consistent: |
116 |
> |
if (b.end==d.b.end) { |
117 |
> |
return (a.start==d.a.start)?(a.end>d.a.end):(a.start>d.a.start); |
118 |
> |
} |
119 |
> |
return (b.end<d.b.end); |
120 |
> |
} |
121 |
> |
else return (score>d.score); |
122 |
> |
} |
123 |
> |
return (b.start<d.b.start); |
124 |
> |
} |
125 |
> |
}; |
126 |
> |
|
127 |
> |
int cmpSegEnds(pointer sa, pointer sb) { //sort by adaptor seg ends AND score |
128 |
> |
CSegPair& x = *(CSegPair *)sa; |
129 |
> |
CSegPair& y = *(CSegPair *)sb; |
130 |
> |
/* |
131 |
> |
if (x.b.end==y.b.end) { |
132 |
> |
if (x.b.start==y.b.start) { |
133 |
> |
if (x.a.end==y.a.end) { |
134 |
> |
if (x.a.start==y.a.start) return 0; |
135 |
> |
return ((x.a.start>y.a.start) ? -1 : 1); |
136 |
> |
} |
137 |
> |
else { |
138 |
> |
return ((x.a.end>y.a.end) ? -1 : 1); |
139 |
> |
} |
140 |
> |
} |
141 |
> |
else { |
142 |
> |
return ((x.b.start>y.b.start) ? -1 : 1); |
143 |
> |
} |
144 |
> |
} |
145 |
> |
else { |
146 |
> |
return ((x.b.end>y.b.end) ? -1 : 1); |
147 |
> |
} |
148 |
> |
*/ |
149 |
> |
if (x.b.end==y.b.end) { |
150 |
> |
if (x.score==y.score) { |
151 |
> |
if (x.b.start==y.b.start) { |
152 |
> |
if (x.a.end==y.a.end) { |
153 |
> |
if (x.a.start==y.a.start) return 0; |
154 |
> |
return ((x.a.start<y.a.start) ? -1 : 1); |
155 |
> |
} |
156 |
> |
else { |
157 |
> |
return ((x.a.end<y.a.end) ? -1 : 1); |
158 |
> |
} |
159 |
> |
} |
160 |
> |
else { |
161 |
> |
return ((x.b.start<y.b.start) ? -1 : 1); |
162 |
> |
} |
163 |
> |
} else return ((x.score>y.score) ? -1 : 1); |
164 |
> |
} |
165 |
> |
else { |
166 |
> |
return ((x.b.end>y.b.end) ? -1 : 1); |
167 |
> |
} |
168 |
> |
|
169 |
> |
} |
170 |
> |
|
171 |
> |
class CSegChain:public GList<CSegPair> { |
172 |
> |
public: |
173 |
> |
uint astart; |
174 |
> |
uint aend; |
175 |
> |
uint bstart; |
176 |
> |
uint bend; |
177 |
> |
int score; |
178 |
> |
bool endSort; |
179 |
> |
CSegChain(bool aln5=false):GList<CSegPair>(true,true,true) {//sorted, free elements, unique |
180 |
> |
//as SegPairs are inserted, they will be sorted by a.start coordinate |
181 |
> |
score=0; |
182 |
> |
astart=MAX_UINT; |
183 |
> |
aend=0; |
184 |
> |
bstart=MAX_UINT; |
185 |
> |
bend=0; |
186 |
> |
endSort=aln5; |
187 |
> |
if (aln5) { setSorted(cmpSegEnds); } |
188 |
> |
} |
189 |
> |
bool operator==(CSegChain& d) { |
190 |
> |
//return (score==d.score); |
191 |
> |
return (astart==d.astart && aend==d.aend && bstart==d.bstart && bend==d.bend); |
192 |
> |
} |
193 |
> |
bool operator>(CSegChain& d) { // order based on b (adaptor) coordinate |
194 |
> |
//return (score<d.score); |
195 |
> |
if (bstart==d.bstart && bend==d.bend) { |
196 |
> |
return (astart==d.astart)?(aend>d.aend):(astart>d.astart); |
197 |
> |
} |
198 |
> |
return (bstart==d.bstart)?(bend>d.bend):(bstart>d.bstart); |
199 |
> |
} |
200 |
> |
bool operator<(CSegChain& d) { |
201 |
> |
//return (score>d.score); |
202 |
> |
if (bstart==d.bstart && bend==d.bend) { |
203 |
> |
return (astart==d.astart)?(aend<d.aend):(astart<d.astart); |
204 |
> |
} |
205 |
> |
return (bstart==d.bstart)?(bend<d.bend):(bstart<d.bstart); |
206 |
> |
} |
207 |
> |
void addSegPair(CSegPair* segp) { |
208 |
> |
if (AddIfNew(segp)!=segp) return; |
209 |
> |
score+=segp->score; |
210 |
> |
if (astart>segp->a.start) astart=segp->a.start; |
211 |
> |
if (aend<segp->a.end) aend=segp->a.end; |
212 |
> |
if (bstart>segp->b.start) bstart=segp->b.start; |
213 |
> |
if (bend<segp->b.end) bend=segp->b.end; |
214 |
> |
} |
215 |
> |
//for building actual chains: |
216 |
> |
bool extendChain(CSegPair* segp) { //segp expected to be "Greater Than" current chain |
217 |
> |
int bgap=0; |
218 |
> |
int agap=0; |
219 |
> |
//if (endSort) { |
220 |
> |
if (bstart>segp->b.start) { |
221 |
> |
bgap = (int)(bstart-segp->b.end); |
222 |
> |
if (abs(bgap)>2) return false; |
223 |
> |
agap = (int)(astart-segp->a.end); |
224 |
> |
if (abs(agap)>2) return false; |
225 |
> |
} |
226 |
> |
else { |
227 |
> |
bgap = (int) (segp->b.start-bend); |
228 |
> |
if (abs(bgap)>2) return false; |
229 |
> |
agap = (int)(segp->a.start-aend); |
230 |
> |
if (abs(agap)>2) return false; |
231 |
> |
} |
232 |
> |
if (agap*bgap<0) return false; |
233 |
> |
addSegPair(segp); |
234 |
> |
score-=abs(agap)+abs(bgap); |
235 |
> |
return true; |
236 |
> |
} |
237 |
> |
}; |
238 |
|
|
239 |
|
// element in dhash: |
240 |
|
class FqDupRec { |
284 |
|
|
285 |
|
GHash<FqDupRec> dhash; //hash to keep track of duplicates |
286 |
|
|
287 |
< |
bool ntrim(GStr& rseq, GStr &rqv, int &l5, int &l3); //returns true if any trimming occured |
288 |
< |
|
287 |
> |
bool ntrim(GStr& rseq, int &l5, int &l3); //returns true if any trimming occured |
288 |
> |
bool qtrim(GStr& qvs, int &l5, int &l3); //return true if any trimming occured |
289 |
|
int dust(GStr& seq); |
290 |
|
bool trim_adapter3(GStr& seq, int &l5, int &l3); //returns true if any trimming occured |
291 |
|
bool trim_adapter5(GStr& seq, int &l5, int &l3); //returns true if any trimming occured |
292 |
|
|
293 |
< |
void convertQ64(char* q, int len); |
294 |
< |
void convertQ64(GStr& q); |
293 |
> |
void convertPhred(char* q, int len); |
294 |
> |
void convertPhred(GStr& q); |
295 |
|
|
296 |
|
int main(int argc, char * const argv[]) { |
297 |
< |
GArgs args(argc, argv, "YQRDCl:d:3:5:n:r:o:"); |
297 |
> |
GArgs args(argc, argv, "YQDCVl:d:3:5:m:n:r:p:q:o:"); |
298 |
|
int e; |
299 |
|
int icounter=0; //counter for input reads |
300 |
|
int outcounter=0; //counter for output reads |
303 |
|
exit(224); |
304 |
|
} |
305 |
|
debug=(args.getOpt('Y')!=NULL); |
306 |
< |
phred64=(args.getOpt('Q')!=NULL); |
306 |
> |
debug=(args.getOpt('V')!=NULL); |
307 |
> |
convert_phred=(args.getOpt('Q')!=NULL); |
308 |
|
doCollapse=(args.getOpt('C')!=NULL); |
309 |
|
doDust=(args.getOpt('D')!=NULL); |
310 |
+ |
/* |
311 |
|
rawFormat=(args.getOpt('R')!=NULL); |
312 |
|
if (rawFormat) { |
313 |
|
GError("Sorry, raw qseq format parsing is not implemented yet!\n"); |
314 |
|
} |
315 |
+ |
*/ |
316 |
|
prefix=args.getOpt('n'); |
317 |
|
GStr s=args.getOpt('l'); |
318 |
|
if (!s.is_empty()) |
319 |
|
min_read_len=s.asInt(); |
320 |
+ |
s=args.getOpt('m'); |
321 |
+ |
if (!s.is_empty()) |
322 |
+ |
max_perc_N=s.asDouble(); |
323 |
|
s=args.getOpt('d'); |
324 |
|
if (!s.is_empty()) { |
325 |
|
dust_cutoff=s.asInt(); |
326 |
|
doDust=true; |
327 |
|
} |
328 |
< |
|
328 |
> |
s=args.getOpt('q'); |
329 |
> |
if (!s.is_empty()) { |
330 |
> |
qvtrim_min=s.asInt(); |
331 |
> |
} |
332 |
> |
s=args.getOpt('p'); |
333 |
> |
if (!s.is_empty()) { |
334 |
> |
int v=s.asInt(); |
335 |
> |
if (v==33) { |
336 |
> |
qv_phredtype=33; |
337 |
> |
qv_cvtadd=31; |
338 |
> |
} |
339 |
> |
else if (v==64) { |
340 |
> |
qv_phredtype=64; |
341 |
> |
qv_cvtadd=-31; |
342 |
> |
} |
343 |
> |
else |
344 |
> |
GMessage("%s\nInvalid value for -p option (can only be 64 or 33)!\n",USAGE); |
345 |
> |
} |
346 |
|
if (args.getOpt('3')!=NULL) { |
347 |
|
adapter3=args.getOpt('3'); |
348 |
|
adapter3.upper(); |
380 |
|
GStr rseq; //current read sequence |
381 |
|
GStr rqv; //current read quality values |
382 |
|
GStr s; |
383 |
< |
if (rawFormat) { |
383 |
> |
/* if (rawFormat) { |
384 |
|
//TODO: implement qseq parsing here |
385 |
|
//if (raw type=N) then continue; //skip invalid/bad records |
386 |
|
|
387 |
|
} //raw qseq format |
388 |
< |
else { // FASTQ or FASTA |
388 |
> |
else { // FASTQ or FASTA */ |
389 |
|
isfasta=(l[0]=='>'); |
390 |
|
if (!isfasta && l[0]!='@') GError("Error: fastq record marker not detected!\n"); |
391 |
|
s=l; |
392 |
|
rname=&(l[1]); |
393 |
|
icounter++; |
189 |
– |
//GMessage("readname=%s\n",rname.chars()); |
394 |
|
for (int i=0;i<rname.length();i++) |
395 |
|
if (rname[i]<=' ') { rname.cut(i); break; } |
396 |
|
//now get the sequence |
397 |
|
if ((l=fq.getLine())==NULL) |
398 |
|
GError("Error: unexpected EOF after header for %s\n",rname.chars()); |
399 |
|
rseq=l; //this must be the DNA line |
400 |
< |
if (isfasta) { |
401 |
< |
while ((l=fq.getLine())!=NULL) { |
402 |
< |
//fasta can have multiple sequence lines |
199 |
< |
if (l[0]=='>') { |
400 |
> |
while ((l=fq.getLine())!=NULL) { |
401 |
> |
//seq can span multiple lines |
402 |
> |
if (l[0]=='>' || l[0]=='+') { |
403 |
|
fq.pushBack(); |
404 |
|
break; // |
405 |
|
} |
406 |
|
rseq+=l; |
407 |
< |
} |
408 |
< |
} //multi-line fasta file |
409 |
< |
if (!isfasta) { |
207 |
< |
if ((l=fq.getLine())==NULL) |
407 |
> |
} //check for multi-line seq |
408 |
> |
if (!isfasta) { //reading fastq quality values, which can also be multi-line |
409 |
> |
if ((l=fq.getLine())==NULL) |
410 |
|
GError("Error: unexpected EOF after sequence for %s\n", rname.chars()); |
411 |
|
if (l[0]!='+') GError("Error: fastq qv header marker not detected!\n"); |
412 |
|
if ((l=fq.getLine())==NULL) |
413 |
|
GError("Error: unexpected EOF after qv header for %s\n", rname.chars()); |
414 |
|
rqv=l; |
415 |
< |
if (rqv.length()!=rseq.length()) |
416 |
< |
GError("Error: qv len != seq len for %s\n", rname.chars()); |
417 |
< |
} |
418 |
< |
} //<-- FASTA or FASTQ |
415 |
> |
//if (rqv.length()!=rseq.length()) |
416 |
> |
// GError("Error: qv len != seq len for %s\n", rname.chars()); |
417 |
> |
while (rqv.length()<rseq.length() && ((l=fq.getLine())!=NULL)) { |
418 |
> |
rqv+=l; //append to qv string |
419 |
> |
} |
420 |
> |
}// fastq |
421 |
> |
// } //<-- FASTA or FASTQ |
422 |
|
rseq.upper(); |
423 |
|
int l5=0; |
424 |
|
int l3=rseq.length()-1; |
428 |
|
} |
429 |
|
continue; |
430 |
|
} |
431 |
< |
if (ntrim(rseq, rqv, l5, l3)) { |
431 |
> |
if (ntrim(rseq, l5, l3)) { // N-trimming |
432 |
> |
//GMessage("before: %s\n",rseq.chars()); |
433 |
> |
//GMessage("after : %s (%d)\n",rseq.substr(l5,l3-l5+1).chars(),l3-l5+1); |
434 |
|
if (l3-l5+1<min_read_len) { |
435 |
|
if (trashReport) { |
436 |
|
fprintf(freport, "%s\tN\t%s\n", rname.chars(), rseq.chars()); |
441 |
|
rseq=rseq.substr(l5, l3-l5+1); |
442 |
|
if (!rqv.is_empty()) |
443 |
|
rqv=rqv.substr(l5, l3-l5+1); |
444 |
+ |
l5=0; |
445 |
+ |
l3=rseq.length()-1; |
446 |
|
} |
447 |
< |
|
447 |
> |
if (qvtrim_min!=0 && !rqv.is_empty() && qtrim(rqv, l5, l3)) { // qv-threshold trimming |
448 |
> |
if (l3-l5+1<min_read_len) { |
449 |
> |
if (trashReport) { |
450 |
> |
fprintf(freport, "%s\tQ\t%s\n", rname.chars(), rseq.chars()); |
451 |
> |
} |
452 |
> |
continue; //invalid read |
453 |
> |
} |
454 |
> |
//-- keep only the l5..l3 range |
455 |
> |
rseq=rseq.substr(l5, l3-l5+1); |
456 |
> |
if (!rqv.is_empty()) |
457 |
> |
rqv=rqv.substr(l5, l3-l5+1); |
458 |
> |
} //qv trimming |
459 |
|
if (a3len>0) { |
460 |
|
if (trim_adapter3(rseq, l5, l3)) { |
461 |
|
if (l3-l5+1<min_read_len) { |
517 |
|
rseq.chars()); |
518 |
|
} |
519 |
|
else { //fastq |
520 |
< |
if (phred64) convertQ64(rqv); |
520 |
> |
if (convert_phred) convertPhred(rqv); |
521 |
|
if (prefix.is_empty()) |
522 |
|
fprintf(f_out, "@%s\n%s\n+\n%s\n", rname.chars(), rseq.chars(),rqv.chars()); |
523 |
|
else |
563 |
|
} |
564 |
|
} |
565 |
|
else { //fastq format |
566 |
< |
if (phred64) convertQ64(qd->qv, qd->len); |
566 |
> |
if (convert_phred) convertPhred(qd->qv, qd->len); |
567 |
|
if (prefix.is_empty()) { |
568 |
|
fprintf(f_out, "@%s_x%d\n%s\n+\n%s\n", qd->firstname, qd->count, |
569 |
|
rseq.chars(), qd->qv); |
585 |
|
} |
586 |
|
|
587 |
|
FWCLOSE(f_out); |
588 |
+ |
//getc(stdin); |
589 |
|
} |
590 |
|
|
591 |
|
class NData { |
614 |
|
end5=1; |
615 |
|
end3=seqlen; |
616 |
|
for (int i=0;i<seqlen;i++) |
617 |
< |
if (rseq[i]=='N') {// if (!ichrInStr(rseq[i], "ACGT") |
617 |
> |
if (seq[i]=='N') {// if (!ichrInStr(rseq[i], "ACGT") |
618 |
|
NPos[NCount]=i; |
619 |
|
NCount++; |
620 |
|
} |
637 |
|
void N_analyze(int l5, int l3, int p5, int p3) { |
638 |
|
/* assumes feat was filled properly */ |
639 |
|
int old_dif, t5,t3,v; |
640 |
< |
if (l3-l5<min_read_len || p5>p3 ) { |
640 |
> |
if (l3<l5+2 || p5>p3 ) { |
641 |
|
feat.end5=l5+1; |
642 |
|
feat.end3=l3+1; |
643 |
|
return; |
669 |
|
} |
670 |
|
|
671 |
|
|
672 |
< |
bool ntrim(GStr& rseq, GStr &rqv, int &l5, int &l3) { |
673 |
< |
//count Ns in the sequence |
672 |
> |
bool qtrim(GStr& qvs, int &l5, int &l3) { |
673 |
> |
if (qvtrim_min==0 || qvs.is_empty()) return false; |
674 |
> |
if (qv_phredtype==0) { |
675 |
> |
//try to guess the Phred type |
676 |
> |
int vmin=256, vmax=0; |
677 |
> |
for (int i=0;i<qvs.length();i++) { |
678 |
> |
if (vmin>qvs[i]) vmin=qvs[i]; |
679 |
> |
if (vmax<qvs[i]) vmax=qvs[i]; |
680 |
> |
} |
681 |
> |
if (vmin<64) { qv_phredtype=33; qv_cvtadd=31; } |
682 |
> |
if (vmax>95) { qv_phredtype=64; qv_cvtadd=-31; } |
683 |
> |
if (qv_phredtype==0) { |
684 |
> |
GError("Error: couldn't determine Phred type, please use the -p33 or -p64 !\n"); |
685 |
> |
} |
686 |
> |
} //guessing the Phred type |
687 |
> |
for (l3=qvs.length()-1;l3>2;l3--) { |
688 |
> |
if (qvs[l3]-qv_phredtype>=qvtrim_min && qvs[l3-1]-qv_phredtype>=qvtrim_min) break; |
689 |
> |
} |
690 |
> |
//just in case, check also the 5' the end (?) |
691 |
> |
for (l5=0;l5<qvs.length()-3;l5++) { |
692 |
> |
if (qvs[l5]-qv_phredtype>=qvtrim_min && qvs[l5+1]-qv_phredtype>=qvtrim_min) break; |
693 |
> |
} |
694 |
> |
return (l5>0 || l3<qvs.length()-1); |
695 |
> |
} |
696 |
> |
|
697 |
> |
bool ntrim(GStr& rseq, int &l5, int &l3) { |
698 |
> |
//count Ns in the sequence, trim N-rich ends |
699 |
|
feat.init(rseq); |
700 |
|
l5=feat.end5-1; |
701 |
|
l3=feat.end3-1; |
702 |
|
N_analyze(feat.end5-1, feat.end3-1, 0, feat.NCount-1); |
703 |
< |
if (l5==feat.end5-1 && l3==feat.end3-1) |
704 |
< |
return false; //nothing changed |
703 |
> |
if (l5==feat.end5-1 && l3==feat.end3-1) { |
704 |
> |
if (feat.perc_N>max_perc_N) { |
705 |
> |
feat.valid=false; |
706 |
> |
l3=l5+1; |
707 |
> |
return true; |
708 |
> |
} |
709 |
> |
else { |
710 |
> |
return false; //nothing changed |
711 |
> |
} |
712 |
> |
} |
713 |
|
l5=feat.end5-1; |
714 |
|
l3=feat.end3-1; |
715 |
|
if (l3-l5+1<min_read_len) { |
717 |
|
return true; |
718 |
|
} |
719 |
|
feat.N_calc(); |
720 |
< |
if (feat.perc_N>6.2) { //not valid if more than 1 N per 16 bases |
720 |
> |
|
721 |
> |
if (feat.perc_N>max_perc_N) { |
722 |
|
feat.valid=false; |
723 |
|
l3=l5+1; |
724 |
|
return true; |
844 |
|
return ncount; |
845 |
|
} |
846 |
|
|
847 |
< |
// ------------------ adapter matching - triplet matching |
848 |
< |
//look for matching triplets amongst the first 9 nucleotides of the 3' adaptor |
849 |
< |
// or the last 9 nucleotides for the 5' adapter |
850 |
< |
//when a triplet match is found, simply try to extend the alignment using a drop-off scheme |
851 |
< |
// check minimum score and |
597 |
< |
// for 3' adapter trimming: |
847 |
> |
|
848 |
> |
// ------------------ adapter matching - simple k-mer seed & extend, no indels for now |
849 |
> |
//when a k-mer match is found, simply try to extend the alignment using a drop-off scheme |
850 |
> |
//check minimum score and |
851 |
> |
//for 3' adapter trimming: |
852 |
|
// require that the right end of the alignment for either the adaptor OR the read must be |
853 |
|
// < 3 distance from its right end |
854 |
|
// for 5' adapter trimming: |
855 |
|
// require that the left end of the alignment for either the adaptor OR the read must |
856 |
< |
// be at coordinate 0 |
856 |
> |
// be at coordinate < 3 from start |
857 |
|
|
858 |
|
bool extendMatch(const char* a, int alen, int ai, |
859 |
< |
const char* b, int blen, int bi, int mlen, int& l5, int& l3, bool end5=false) { |
859 |
> |
const char* b, int blen, int bi, int mlen, int& l5, int& l3, CSegChain& segs, bool end5=false) { |
860 |
|
//so the alignment starts at ai in a, bi in b, with a perfect match of length mlen |
861 |
< |
//if (debug) |
862 |
< |
// GStr dbg(b); |
863 |
< |
//GMessage(">> in %s\n\textending hit: %s at position %d\n", a, (dbg.substr(ai, mlen)).chars(), ai); |
861 |
> |
#ifdef DEBUG |
862 |
> |
GStr dbg(b); |
863 |
> |
#endif |
864 |
> |
//if (debug) { |
865 |
> |
// GMessage(">> in %s\n\textending hit: %s at position %d\n", a, (dbg.substr(bi, mlen)).chars(), ai); |
866 |
> |
// } |
867 |
|
int a_l=ai; //alignment coordinates on a |
868 |
|
int a_r=ai+mlen-1; |
869 |
|
int b_l=bi; //alignment coordinates on b |
927 |
|
int mmovh3=(a_ovh3<b_ovh3)? a_ovh3 : b_ovh3; |
928 |
|
int mmovh5=(a_l<b_l)? a_l : b_l; |
929 |
|
//if (debug) GMessage(" after r-extend: %*s%s\t\t(score=%d)\n",a_l," ",dbg.substr(b_l,b_r-b_l+1).chars(),maxscore); |
930 |
+ |
#ifdef DEBUG |
931 |
+ |
if (debug) GMessage(" extended to: %*s\n",a_r+1,dbg.substr(b_l,b_r-b_l+1).chars()); |
932 |
+ |
#endif |
933 |
|
if (maxscore>=a_min_score && mmovh3<2 && mmovh5<2) { |
934 |
|
if (a_l<a_ovh3) { |
935 |
|
//adapter closer to the left end (typical for 5' adapter) |
943 |
|
} |
944 |
|
return true; |
945 |
|
} |
946 |
+ |
else { //keep this segment pair for later (gapped alignment) |
947 |
+ |
segs.addSegPair(new CSegPair(a_l+1, a_r+1, b_l+1, b_r+1, maxscore)); |
948 |
+ |
//this will also update min & max coordinates in segs (segs.astart, .aend, .bstart, .bend) |
949 |
+ |
} |
950 |
|
//do not trim: |
951 |
|
l5=0; |
952 |
|
l3=alen-1; |
953 |
|
return false; |
954 |
|
} |
955 |
|
|
956 |
+ |
/* |
957 |
+ |
int getWordValue(const char* s, int wlen) { |
958 |
+ |
int r=0; |
959 |
+ |
while (wlen--) { r+=(((int)s[wlen])<<wlen) } |
960 |
+ |
return r; |
961 |
+ |
} |
962 |
+ |
*/ |
963 |
+ |
int get3mer_value(const char* s) { |
964 |
+ |
return (s[0]<<16)+(s[1]<<8)+s[2]; |
965 |
+ |
} |
966 |
+ |
|
967 |
+ |
int w3_match(int qv, const char* str, int slen, int start_index=0) { |
968 |
+ |
if (start_index>=slen || start_index<0) return -1; |
969 |
+ |
for (int i=start_index;i<slen-3;i++) { |
970 |
+ |
int rv=get3mer_value(str+i); |
971 |
+ |
if (rv==qv) return i; |
972 |
+ |
} |
973 |
+ |
return -1; |
974 |
+ |
} |
975 |
+ |
|
976 |
+ |
int w3_rmatch(int qv, const char* str, int slen, int end_index=-1) { |
977 |
+ |
if (end_index>=slen) return -1; |
978 |
+ |
if (end_index<0) end_index=slen-1; |
979 |
+ |
for (int i=end_index-2;i>=0;i--) { |
980 |
+ |
int rv=get3mer_value(str+i); |
981 |
+ |
if (rv==qv) return i; |
982 |
+ |
} |
983 |
+ |
return -1; |
984 |
+ |
} |
985 |
+ |
|
986 |
+ |
int fast4match(int32 qv, const char* str, int slen, int start_index=0) { |
987 |
+ |
if (start_index>=slen || start_index<0) return -1; |
988 |
+ |
for (int i=start_index;i<slen-4;i++) { |
989 |
+ |
int32* rv=(int32*)(str+i); |
990 |
+ |
if (*rv==qv) return i; |
991 |
+ |
} |
992 |
+ |
return -1; |
993 |
+ |
} |
994 |
+ |
|
995 |
+ |
int fast4rmatch(int32 qv, const char* str, int slen, int end_index=-1) { |
996 |
+ |
if (end_index>=slen) return -1; |
997 |
+ |
if (end_index<0) end_index=slen-1; |
998 |
+ |
for (int i=end_index-3;i>=0;i--) { |
999 |
+ |
int32* rv=(int32*)(str+i); |
1000 |
+ |
if (*rv==qv) return i; |
1001 |
+ |
} |
1002 |
+ |
return -1; |
1003 |
+ |
} |
1004 |
+ |
|
1005 |
+ |
#ifdef DEBUG |
1006 |
+ |
void dbgPrintChain(CSegChain& chain, const char* aseq) { |
1007 |
+ |
GStr s(aseq); |
1008 |
+ |
for (int i=0;i<chain.Count();i++) { |
1009 |
+ |
CSegPair& seg=*chain[i]; |
1010 |
+ |
GMessage(" dbg chain seg%d: %*s [%d-%d:%d-%d]\n",i,seg.a.start-1+seg.len(), |
1011 |
+ |
s.substr(seg.b.start-1, seg.len()).chars(), seg.b.start,seg.b.end,seg.a.start,seg.a.end); |
1012 |
+ |
} |
1013 |
+ |
} |
1014 |
+ |
#endif |
1015 |
+ |
|
1016 |
|
bool trim_adapter3(GStr& seq, int&l5, int &l3) { |
1017 |
|
int rlen=seq.length(); |
1018 |
|
l5=0; |
1026 |
|
} |
1027 |
|
else { |
1028 |
|
l5=0; |
1029 |
< |
l3=fi-1; |
1029 |
> |
l3=fi-1; |
1030 |
|
} |
1031 |
|
return true; |
1032 |
|
} |
1033 |
+ |
#ifdef DEBUG |
1034 |
+ |
if (debug) GMessage(">TRIM3 >> Read: %s\n",seq.chars()); |
1035 |
+ |
#endif |
1036 |
+ |
|
1037 |
|
//also, for fast detection of other adapter-only reads that start past |
1038 |
|
// the beginning of the adapter sequence, try to see if the first a3len-4 |
1039 |
< |
// characters of the read are a substring of the adapter |
1040 |
< |
GStr rstart=seq.substr(0,a3len-4); |
1041 |
< |
if ((fi=adapter3.index(rstart))>=0) { |
1042 |
< |
l3=rlen-1; |
1043 |
< |
l5=a3len-4; |
1044 |
< |
while (fi+l5<a3len && l5<l3 && adapter3[fi+l5]==seq[l5]) l5++; |
1045 |
< |
return true; |
1039 |
> |
// bases of the read are a substring of the adapter |
1040 |
> |
if (rlen>a3len-3) { |
1041 |
> |
GStr rstart=seq.substr(1,a3len-4); |
1042 |
> |
if ((fi=adapter3.index(rstart))>=0) { |
1043 |
> |
l3=rlen-1; |
1044 |
> |
l5=a3len-4; |
1045 |
> |
while (fi+l5<a3len && l5<l3 && adapter3[fi+l5]==seq[l5]) l5++; |
1046 |
> |
return true; |
1047 |
> |
} |
1048 |
> |
} |
1049 |
> |
CSegChain a3segs; //no chains here, just an ordered collection of segment pairs |
1050 |
> |
//check the easy cases - 11 bases exact match at the end |
1051 |
> |
int fdlen=11; |
1052 |
> |
if (a3len<16) { |
1053 |
> |
fdlen=a3len>>1; |
1054 |
|
} |
1055 |
< |
|
1056 |
< |
//another easy case: first half of the adaptor matches |
1057 |
< |
int a3hlen=a3len>>1; |
1058 |
< |
GStr ahalf=adapter3.substr(0, a3hlen); |
1059 |
< |
if ((fi=seq.index(ahalf))>=0) { |
1060 |
< |
extendMatch(seq.chars(), rlen, fi, |
1061 |
< |
adapter3.chars(), a3len, 0, a3hlen, l5,l3); |
1062 |
< |
return true; |
1055 |
> |
if (fdlen>4) { |
1056 |
> |
//check if we're lucky enough to have the last 11 bases of the read a part of the adapter |
1057 |
> |
GStr rstart=seq.substr(-fdlen-3,fdlen); |
1058 |
> |
if ((fi=adapter3.index(rstart))>=0) { |
1059 |
> |
#ifdef DEBUG |
1060 |
> |
if (debug) GMessage(" W11match found: %*s\n", rlen-3, (adapter3.substr(fi,fdlen)).chars()); |
1061 |
> |
#endif |
1062 |
> |
if (extendMatch(seq.chars(), rlen, rlen-fdlen-3, |
1063 |
> |
adapter3.chars(), a3len, fi, fdlen, l5,l3, a3segs)) |
1064 |
> |
return true; |
1065 |
> |
} |
1066 |
> |
//another easy case: first 11 characters of the adaptor found as a substring of the read |
1067 |
> |
GStr bstr=adapter3.substr(0, fdlen); |
1068 |
> |
if ((fi=seq.rindex(bstr))>=0) { |
1069 |
> |
#ifdef DEBUG |
1070 |
> |
if (debug) GMessage(" A11match found: %*s\n", fi+fdlen, bstr.chars()); |
1071 |
> |
#endif |
1072 |
> |
if (extendMatch(seq.chars(), rlen, fi, |
1073 |
> |
adapter3.chars(), a3len, 0, fdlen, l5,l3, a3segs)) |
1074 |
> |
return true; |
1075 |
> |
} |
1076 |
> |
} //tried to match 11 bases first |
1077 |
> |
|
1078 |
> |
//no easy cases, so let's do the wmer hashing for the first 12 bases of the adaptor |
1079 |
> |
//-- only extend if the match is in the 3' (ending) region of the read |
1080 |
> |
int wordSize=3; |
1081 |
> |
int hlen=12; |
1082 |
> |
if (hlen>a3len-wordSize) hlen=a3len-wordSize; |
1083 |
> |
int imin=rlen>>1; //last half of the read, left boundary for the wmer match |
1084 |
> |
if (imin<a3len) { imin=GMIN(a3len, rlen-wordSize); } |
1085 |
> |
imin=rlen-imin; |
1086 |
> |
for (int iw=0;iw<hlen;iw++) { |
1087 |
> |
//int32* qv=(int32*)(adapter3.chars()+iw); |
1088 |
> |
int qv=get3mer_value(adapter3.chars()+iw); |
1089 |
> |
fi=-1; |
1090 |
> |
//while ((fi=fast4rmatch(*qv, seq.chars(), rlen, fi))>=0 && fi>=imin) { |
1091 |
> |
while ((fi=w3_rmatch(qv, seq.chars(), rlen, fi))>=0 && fi>=imin) { |
1092 |
> |
//GMessage(" ... fi=%d after w3_rmatch() (imin=%d)\n", fi, imin); |
1093 |
> |
|
1094 |
> |
#ifdef DEBUG |
1095 |
> |
if (debug) GMessage(" Wmatch found: %*s\n", fi+wordSize, (adapter3.substr(iw,wordSize)).chars()); |
1096 |
> |
#endif |
1097 |
> |
if (extendMatch(seq.chars(), rlen, fi, adapter3.chars(), |
1098 |
> |
a3len, iw, wordSize, l5,l3, a3segs)) return true; |
1099 |
> |
fi--; |
1100 |
> |
if (fi<imin) break; |
1101 |
> |
} |
1102 |
> |
} //for each wmer in the first hlen bases of the adaptor |
1103 |
> |
/* |
1104 |
> |
//couldn't find a good trimming extension, hash 12 more bases of the adapter to collect more segment pairs there |
1105 |
> |
//but only do this if we already have segment pairs collected in the last 12 bases of the adapter |
1106 |
> |
if (a3segs.bstart>3 || a3segs.bend<(uint)(hlen-wordSize)) return false; |
1107 |
> |
int hlen2=a3len-wordSize; |
1108 |
> |
//if (hlen2>a3len-4) hlen2=a3len-4; |
1109 |
> |
if (hlen2>hlen) { |
1110 |
> |
#ifdef DEBUG |
1111 |
> |
if (debug && a3segs.Count()>0) { |
1112 |
> |
GMessage(" >>>>>2nd. hash: %s\n",seq.chars()); |
1113 |
> |
} |
1114 |
> |
#endif |
1115 |
> |
for (int iw=hlen;iw<hlen2;iw++) { |
1116 |
> |
//int* qv=(int32 *)(adapter3.chars()+iw); |
1117 |
> |
int qv=get3mer_value(adapter3.chars()+iw); |
1118 |
> |
fi=-1; |
1119 |
> |
//while ((fi=fast4rmatch(*qv, seq.chars(), rlen, fi))>=0 && fi>=imin) { |
1120 |
> |
while ((fi=w3_rmatch(qv, seq.chars(), rlen, fi))>=0 && fi>=imin) { |
1121 |
> |
extendMatch(seq.chars(), rlen, fi, adapter3.chars(), |
1122 |
> |
a3len, iw, wordSize, l5,l3, a3segs); |
1123 |
> |
fi--; |
1124 |
> |
if (fi<imin) break; |
1125 |
> |
} |
1126 |
> |
} //for each wmer between hlen2 and hlen bases of the adaptor |
1127 |
> |
} |
1128 |
> |
//lastly, analyze collected a3segs for a possible gapped alignment: |
1129 |
> |
GList<CSegChain> segchains(false,true,false); |
1130 |
> |
#ifdef DEBUG |
1131 |
> |
if (debug && a3segs.Count()>0) { |
1132 |
> |
GMessage(">>>>>>>>> Read: %s\n",seq.chars()); |
1133 |
|
} |
1134 |
< |
//no easy cases, so let's do the word hashing |
1135 |
< |
for (int iw=0;iw<6;iw++) { |
1136 |
< |
GStr aword=adapter3.substr(iw,3); |
1137 |
< |
if ((fi=seq.index(aword))>=0 && rlen-fi>3) { |
1138 |
< |
if (extendMatch(seq.chars(), rlen, fi, adapter3.chars(), |
1139 |
< |
a3len, iw, 3, l5,l3)) return true; |
1134 |
> |
#endif |
1135 |
> |
for (int i=0;i<a3segs.Count();i++) { |
1136 |
> |
if (a3segs[i]->chain==NULL) { |
1137 |
> |
if (a3segs[i]->b.start>3) continue; //don't start a hopeless chain |
1138 |
> |
CSegChain* newchain=new CSegChain(); |
1139 |
> |
newchain->setFreeItem(false); |
1140 |
> |
newchain->addSegPair(a3segs[i]); |
1141 |
> |
a3segs[i]->chain=newchain; |
1142 |
> |
segchains.Add(newchain); //just to free them when done |
1143 |
> |
} |
1144 |
> |
for (int j=i+1;j<a3segs.Count();j++) { |
1145 |
> |
CSegChain* chain=a3segs[i]->chain; |
1146 |
> |
if (chain->extendChain(a3segs[j])) { |
1147 |
> |
a3segs[j]->chain=chain; |
1148 |
> |
#ifdef DEBUG |
1149 |
> |
if (debug) dbgPrintChain(*chain, adapter3.chars()); |
1150 |
> |
#endif |
1151 |
> |
//save time by checking here if the extended chain is already acceptable for trimming |
1152 |
> |
if (chain->aend>(uint)(rlen-4) && chain->bstart<4 && chain->score>a_min_chain_score) { |
1153 |
> |
l5=0; |
1154 |
> |
l3=chain->astart-2; |
1155 |
> |
#ifdef DEBUG |
1156 |
> |
if (debug && a3segs.Count()>0) { |
1157 |
> |
GMessage(">>> >> trimmed-3: %*s\n",l3-l5+1,seq.substr(l5,l3-l5+1).chars()); |
1158 |
> |
} |
1159 |
> |
#endif |
1160 |
> |
return true; |
1161 |
> |
} |
1162 |
> |
} //chain can be extended |
1163 |
|
} |
1164 |
< |
} |
1164 |
> |
} //collect segment alignments into chains |
1165 |
> |
*/ |
1166 |
|
return false; //no adapter parts found |
1167 |
|
} |
1168 |
|
|
1184 |
|
} |
1185 |
|
return true; |
1186 |
|
} |
1187 |
< |
//for fast detection of adapter-rich reads, check if the first 12 |
1188 |
< |
//characters of the read are a substring of the adapter |
1189 |
< |
GStr rstart=seq.substr(1,12); |
1190 |
< |
if ((fi=adapter5.index(rstart))>=0) { |
1191 |
< |
//l3=rlen-1; |
1192 |
< |
//l5=a5len-4; |
1193 |
< |
//while (fi+l5<a5len && l5<l3 && adapter5[fi+l5]==seq[l5]) l5++; |
1194 |
< |
//return true; |
1195 |
< |
//if (debug) GMessage(" first 12char of the read match adaptor!\n"); |
1196 |
< |
extendMatch(seq.chars(), rlen, 1, |
767 |
< |
adapter5.chars(), a5len, fi, 12, l5,l3, true); |
768 |
< |
return true; |
1187 |
> |
#ifdef DEBUG |
1188 |
> |
if (debug) GMessage(">TRIM5 >> Read: %s\n",seq.chars()); |
1189 |
> |
#endif |
1190 |
> |
|
1191 |
> |
CSegChain a5segs(true); //list of segment pairs to analyze later if no extendMatch succeeded |
1192 |
> |
|
1193 |
> |
//try the easy way out first - look for an exact match of 11 bases |
1194 |
> |
int fdlen=11; |
1195 |
> |
if (a5len<16) { |
1196 |
> |
fdlen=a5len>>1; |
1197 |
|
} |
1198 |
< |
|
1199 |
< |
//another easy case: last 12 characters of the adaptor found as a substring of the read |
1200 |
< |
int aplen=12; |
1201 |
< |
int apstart=a5len-aplen-2; |
1202 |
< |
if (apstart<0) { apstart=0; aplen=a5len-1; } |
1203 |
< |
GStr bstr=adapter5.substr(apstart, aplen); |
1204 |
< |
if ((fi=seq.index(bstr))>=0) { |
1205 |
< |
//if (debug) GMessage(" last 12char of adaptor match the read!\n"); |
1206 |
< |
extendMatch(seq.chars(), rlen, fi, |
1207 |
< |
adapter5.chars(), a5len, apstart, aplen, l5,l3,true); |
1208 |
< |
return true; |
1198 |
> |
if (fdlen>4) { |
1199 |
> |
GStr rstart=seq.substr(1,fdlen); //skip the first base as it's sometimes bogus |
1200 |
> |
if ((fi=adapter5.index(rstart))>=0) { |
1201 |
> |
#ifdef DEBUG |
1202 |
> |
if (debug) GMessage(" W11match found: %*s\n", 1+fdlen, (adapter3.substr(fi,fdlen)).chars()); |
1203 |
> |
#endif |
1204 |
> |
if (extendMatch(seq.chars(), rlen, 1, |
1205 |
> |
adapter5.chars(), a5len, fi, fdlen, l5,l3, a5segs, true)) |
1206 |
> |
return true; |
1207 |
> |
} |
1208 |
> |
//another easy case: last 11 characters of the adaptor found as a substring of the read |
1209 |
> |
GStr bstr=adapter5.substr(-fdlen); |
1210 |
> |
if ((fi=seq.index(bstr))>=0) { |
1211 |
> |
#ifdef DEBUG |
1212 |
> |
if (debug) GMessage(" A11match found: %*s\n", fi+fdlen, bstr.chars()); |
1213 |
> |
#endif |
1214 |
> |
if (extendMatch(seq.chars(), rlen, fi, |
1215 |
> |
adapter5.chars(), a5len, a5len-fdlen, fdlen, l5,l3,a5segs,true)) |
1216 |
> |
return true; |
1217 |
> |
} |
1218 |
> |
} //tried to matching at most 11 bases first |
1219 |
> |
|
1220 |
> |
//-- no easy cases, do the wmer hashing for the last 12 bases of the adaptor |
1221 |
> |
//-- only extend a wmer if it matches in the 5' (beginning) region of the read |
1222 |
> |
int wordSize=3; |
1223 |
> |
int hlen=12; |
1224 |
> |
if (hlen>a5len-wordSize) hlen=a5len-wordSize; |
1225 |
> |
int imax=rlen>>1; //first half of the read, right boundary for the wmer match |
1226 |
> |
if (imax<a5len) { imax=GMIN(a5len, rlen-wordSize); } |
1227 |
> |
for (int iw=0;iw<=hlen;iw++) { |
1228 |
> |
int apstart=a5len-iw-wordSize; |
1229 |
> |
fi=0; |
1230 |
> |
//int* qv=(int32 *)(adapter5.chars()+apstart); |
1231 |
> |
int qv=get3mer_value(adapter5.chars()+apstart); |
1232 |
> |
//while ((fi=fast4match(*qv, seq.chars(), rlen, fi))>=0 && fi<=imax) { |
1233 |
> |
while ((fi=w3_match(qv, seq.chars(), rlen, fi))>=0 && fi<=imax) { |
1234 |
> |
#ifdef DEBUG |
1235 |
> |
if (debug) GMessage(" Wmatch found: %*s\n", fi+wordSize, (adapter5.substr(apstart,wordSize)).chars()); |
1236 |
> |
#endif |
1237 |
> |
if (extendMatch(seq.chars(), rlen, fi, adapter5.chars(), |
1238 |
> |
a5len, apstart, wordSize, l5,l3, a5segs, true)) return true; |
1239 |
> |
fi++; |
1240 |
> |
if (fi>imax) break; |
1241 |
> |
} |
1242 |
> |
} //for each wmer in the last hlen bases of the adaptor |
1243 |
> |
/* |
1244 |
> |
|
1245 |
> |
//couldn't find a good trimming extension, hash 12 more bases of the adapter to collect more segment pairs there |
1246 |
> |
//but only do this if we already have segment pairs collected in the last 12 bases of the adapter |
1247 |
> |
if (a5segs.bend<(uint)(a5len-3) || a5segs.bstart>(uint)(a5len-hlen+4)) return false; |
1248 |
> |
int hlen2=a5len-wordSize; |
1249 |
> |
//if (hlen2>a5len-wordSize) hlen2=a5len-wordSize; |
1250 |
> |
#ifdef DEBUG |
1251 |
> |
if (debug && a5segs.Count()>0) { |
1252 |
> |
GMessage(" >>>>>2nd. hash: %s\n",seq.chars()); |
1253 |
> |
} |
1254 |
> |
#endif |
1255 |
> |
if (hlen2>hlen) { |
1256 |
> |
for (int iw=hlen+1;iw<=hlen2;iw++) { |
1257 |
> |
int apstart=a5len-iw-wordSize; |
1258 |
> |
fi=0; |
1259 |
> |
//int* qv=(int32 *)(adapter5.chars()+apstart); |
1260 |
> |
int qv=get3mer_value(adapter5.chars()+apstart); |
1261 |
> |
//while ((fi=fast4match(*qv, seq.chars(), rlen, fi))>=0 && fi<=imax) { |
1262 |
> |
while ((fi=w3_match(qv, seq.chars(), rlen, fi))>=0 && fi<=imax) { |
1263 |
> |
extendMatch(seq.chars(), rlen, fi, adapter5.chars(), |
1264 |
> |
a5len, apstart, wordSize, l5,l3, a5segs, true); |
1265 |
> |
fi++; |
1266 |
> |
if (fi>imax) break; |
1267 |
> |
} |
1268 |
> |
} //for each wmer between hlen2 and hlen bases of the adaptor |
1269 |
> |
} |
1270 |
> |
if (a5segs.bend<(uint)(a5len-3) || a5segs.bstart>(uint)(a5len-hlen+4)) return false; |
1271 |
> |
// lastly, analyze collected a5segs for a possible gapped alignment: |
1272 |
> |
GList<CSegChain> segchains(false,true,false); |
1273 |
> |
#ifdef DEBUG |
1274 |
> |
if (debug && a5segs.Count()>0) { |
1275 |
> |
GMessage(">>>>>>>>> Read: %s\n",seq.chars()); |
1276 |
|
} |
1277 |
< |
//no easy cases, find a triplet match as a seed for alignment extension |
1278 |
< |
//find triplets at the right end of the adapter |
1279 |
< |
for (int iw=0;iw<6;iw++) { |
1280 |
< |
apstart=a5len-iw-4; |
1281 |
< |
GStr aword=adapter5.substr(apstart,3); |
1282 |
< |
if ((fi=seq.index(aword))>=0) { |
1283 |
< |
//if (debug) GMessage("extending wmatch :%*s\n", fi+3, aword.chars()); |
1284 |
< |
if (extendMatch(seq.chars(), rlen, fi, adapter5.chars(), |
1285 |
< |
a5len, apstart, 3, l5,l3,true)) { |
1286 |
< |
return true; |
1287 |
< |
} |
1277 |
> |
#endif |
1278 |
> |
for (int i=0;i<a5segs.Count();i++) { |
1279 |
> |
if (a5segs[i]->chain==NULL) { |
1280 |
> |
if (a5segs[i]->b.end<(int)(a5len-4)) continue; //don't start a hopeless chain |
1281 |
> |
CSegChain* newchain=new CSegChain(true); |
1282 |
> |
newchain->setFreeItem(false); |
1283 |
> |
newchain->addSegPair(a5segs[i]); |
1284 |
> |
a5segs[i]->chain=newchain; |
1285 |
> |
segchains.Add(newchain); //just to free them when done |
1286 |
> |
} |
1287 |
> |
for (int j=i+1;j<a5segs.Count();j++) { |
1288 |
> |
CSegChain* chain=a5segs[i]->chain; |
1289 |
> |
if (chain->extendChain(a5segs[j])) { |
1290 |
> |
a5segs[j]->chain=chain; |
1291 |
> |
#ifdef DEBUG |
1292 |
> |
if (debug) dbgPrintChain(*chain, adapter5.chars()); |
1293 |
> |
#endif |
1294 |
> |
//save time by checking here if the extended chain is already acceptable for trimming |
1295 |
> |
if (chain->bend>(uint)(a5len-3) && chain->astart<4 && chain->score>a_min_chain_score) { |
1296 |
> |
l5=chain->aend; |
1297 |
> |
l3=rlen-1; |
1298 |
> |
return true; |
1299 |
> |
} |
1300 |
> |
} //chain can be extended |
1301 |
|
} |
1302 |
< |
} |
1302 |
> |
} //collect segment alignments into chains |
1303 |
> |
*/ |
1304 |
|
return false; //no adapter parts found |
1305 |
|
} |
1306 |
|
|
1307 |
< |
//conversion of phred64 to phread33 |
1308 |
< |
void convertQ64(GStr& q) { |
1309 |
< |
for (int i=0;i<q.length();i++) q[i]-=31; |
1307 |
> |
//convert qvs to/from phred64 from/to phread33 |
1308 |
> |
void convertPhred(GStr& q) { |
1309 |
> |
for (int i=0;i<q.length();i++) q[i]+=qv_cvtadd; |
1310 |
|
} |
1311 |
|
|
1312 |
< |
void convertQ64(char* q, int len) { |
1313 |
< |
for (int i=0;i<len;i++) q[i]-=31; |
1312 |
> |
void convertPhred(char* q, int len) { |
1313 |
> |
for (int i=0;i<len;i++) q[i]+=qv_cvtadd; |
1314 |
|
} |
1315 |
|
|