1 |
gpertea |
29 |
/* |
2 |
|
|
* bwt_map.cpp |
3 |
|
|
* TopHat |
4 |
|
|
* |
5 |
|
|
* Created by Cole Trapnell on 11/17/08. |
6 |
|
|
* Copyright 2008 Cole Trapnell. All rights reserved. |
7 |
|
|
* |
8 |
|
|
*/ |
9 |
|
|
|
10 |
|
|
#ifdef HAVE_CONFIG_H |
11 |
|
|
#include <config.h> |
12 |
|
|
#endif |
13 |
|
|
|
14 |
|
|
#include <cassert> |
15 |
|
|
#include <cstdlib> |
16 |
|
|
#include <iostream> |
17 |
|
|
#include <set> |
18 |
|
|
#include <vector> |
19 |
|
|
#include <cmath> |
20 |
|
|
|
21 |
|
|
#include "common.h" |
22 |
|
|
#include "bwt_map.h" |
23 |
|
|
#include "tokenize.h" |
24 |
|
|
#include "reads.h" |
25 |
|
|
|
26 |
|
|
using namespace std; |
27 |
|
|
|
28 |
|
|
void HitTable::add_hit(const BowtieHit& bh, bool check_uniqueness) |
29 |
|
|
{ |
30 |
|
|
uint32_t reference_id = bh.ref_id(); |
31 |
|
|
|
32 |
|
|
pair<RefHits::iterator, bool> ret = |
33 |
|
|
_hits_for_ref.insert(make_pair(reference_id, HitList())); |
34 |
|
|
HitList& hl = ret.first->second; |
35 |
|
|
|
36 |
|
|
if (check_uniqueness) |
37 |
|
|
{ |
38 |
|
|
// Check uniqueness, in case we are adding spliced hits from |
39 |
|
|
// several spliced alignment sources (e.g. de novo hashing + Bowtie |
40 |
|
|
// against a user-supplied index). We don't want to count the same |
41 |
|
|
// alignment twice if it happened to be found by more than one method |
42 |
|
|
HitList::const_iterator lb = lower_bound(hl.begin(), hl.end(), bh, hit_insert_id_lt); |
43 |
|
|
HitList::const_iterator ub = upper_bound(hl.begin(), hl.end(), bh, hit_insert_id_lt); |
44 |
|
|
for (; lb != ub && lb != hl.end(); ++lb) |
45 |
|
|
{ |
46 |
|
|
if (*lb == bh) |
47 |
|
|
{ |
48 |
|
|
//fprintf(stderr, "Chucking duplicate read %d by identity\n", bh.insert_id()); |
49 |
|
|
return; |
50 |
|
|
} |
51 |
|
|
|
52 |
|
|
if (lb->insert_id() == bh.insert_id() && |
53 |
|
|
lb->ref_id() == bh.ref_id() && |
54 |
|
|
lb->antisense_align() == bh.antisense_align()) |
55 |
|
|
{ |
56 |
|
|
// If we get here, we may be looking at the same alignment |
57 |
|
|
// However, spanning_reads may report a shorter, trimmed alignment |
58 |
|
|
// so not all fields will be equal. If they just disagree on the |
59 |
|
|
// ends, and don't indicate a different junction coord, the |
60 |
|
|
// alignments are the same. |
61 |
|
|
|
62 |
|
|
if ((lb->left() <= bh.left() && lb->right() >= bh.right()) || |
63 |
|
|
(bh.left() <= lb->left() && bh.right() >= lb->right())) |
64 |
|
|
{ |
65 |
|
|
vector<pair<int,int> > lb_gaps, bh_gaps; |
66 |
|
|
lb->gaps(lb_gaps); |
67 |
|
|
bh.gaps(bh_gaps); |
68 |
|
|
if (lb_gaps == bh_gaps) |
69 |
|
|
{ |
70 |
|
|
// One alignment is contained in the other, they agree on |
71 |
|
|
// where the gaps, if any, are, and they share an id |
72 |
|
|
// => this is a redundant aligment, so toss it |
73 |
|
|
//fprintf(stderr, "Chucking duplicate read %d by gap agreement\n", bh.insert_id()); |
74 |
|
|
return; |
75 |
|
|
} |
76 |
|
|
} |
77 |
|
|
} |
78 |
|
|
} |
79 |
|
|
} |
80 |
|
|
_total_hits++; |
81 |
|
|
hl.push_back(bh); |
82 |
|
|
} |
83 |
|
|
|
84 |
|
|
bool hit_insert_id_lt(const BowtieHit& h1, const BowtieHit& h2) |
85 |
|
|
{ |
86 |
|
|
return h1.insert_id() < h2.insert_id(); |
87 |
|
|
} |
88 |
|
|
|
89 |
|
|
void LineHitFactory::openStream(HitStream& hs) |
90 |
|
|
{ |
91 |
|
|
if (hs._hit_file==NULL && !hs._hit_file_name.empty()) { |
92 |
|
|
//open the file for HitStream here |
93 |
|
|
hs._hit_file=fopen(hs._hit_file_name.c_str(),"r"); |
94 |
|
|
if (hs._hit_file==NULL) |
95 |
|
|
err_die("Error opening HitStream file %s\n",hs._hit_file_name.c_str()); |
96 |
|
|
return; |
97 |
|
|
} |
98 |
|
|
if (hs._fzpipe!=NULL) { |
99 |
|
|
hs._hit_file=hs._fzpipe->file; |
100 |
|
|
} |
101 |
|
|
} |
102 |
|
|
|
103 |
|
|
void LineHitFactory::rewind(HitStream& hs) |
104 |
|
|
{ |
105 |
|
|
if (hs._fzpipe!=NULL) { |
106 |
|
|
hs._fzpipe->rewind(); |
107 |
|
|
hs._hit_file=hs._fzpipe->file; |
108 |
|
|
} |
109 |
|
|
else if (hs._hit_file) ::rewind((FILE*)(hs._hit_file)); |
110 |
|
|
} |
111 |
|
|
|
112 |
|
|
bool LineHitFactory::next_record(HitStream& hs, const char*& buf, size_t& buf_size) { |
113 |
|
|
FILE* f=(FILE *)(hs._hit_file); |
114 |
|
|
bool new_rec = (fgets(_hit_buf, _hit_buf_max_sz - 1, f)!=NULL); |
115 |
|
|
if (!new_rec || feof(f)) { |
116 |
|
|
hs._eof=true; |
117 |
|
|
return false; |
118 |
|
|
} |
119 |
|
|
++_line_num; |
120 |
|
|
char* nl = strrchr(_hit_buf, '\n'); |
121 |
|
|
if (nl) *nl = 0; |
122 |
|
|
buf = _hit_buf; |
123 |
|
|
buf_size = _hit_buf_max_sz - 1; |
124 |
|
|
return true; |
125 |
|
|
} |
126 |
|
|
|
127 |
|
|
void LineHitFactory::closeStream(HitStream& hs) { |
128 |
|
|
if (hs._fzpipe!=NULL) { |
129 |
|
|
hs._fzpipe->close(); |
130 |
|
|
return; |
131 |
|
|
} |
132 |
|
|
if (hs._hit_file!=NULL) { |
133 |
|
|
fclose((FILE*)(hs._hit_file)); |
134 |
|
|
hs._hit_file=NULL; |
135 |
|
|
} |
136 |
|
|
} |
137 |
|
|
void BAMHitFactory::openStream(HitStream& hs) { |
138 |
|
|
if (hs._hit_file==NULL) { |
139 |
|
|
if (hs._hit_file_name.empty()) |
140 |
|
|
//err_die("Error: invalid HitStream set for BAMHitFactory(file name missing)\n"); |
141 |
|
|
return; //invalid stream, could be just a place holder |
142 |
|
|
//open the file here if not already open |
143 |
|
|
string fext=getFext(hs._hit_file_name); |
144 |
|
|
if (fext=="sam") |
145 |
|
|
hs._hit_file = samopen(hs._hit_file_name.c_str(), "r", 0); |
146 |
|
|
else |
147 |
|
|
hs._hit_file = samopen(hs._hit_file_name.c_str(), "rb", 0); |
148 |
|
|
|
149 |
|
|
samfile_t* sam_file=(samfile_t*)(hs._hit_file); |
150 |
|
|
|
151 |
|
|
if (sam_file == NULL) |
152 |
|
|
err_die("Error opening SAM file %s\n", hs._hit_file_name.c_str()); |
153 |
|
|
if (sam_file->header == NULL) |
154 |
|
|
err_die("Error: no SAM header found for file %s\n", hs._hit_file_name.c_str()); |
155 |
|
|
memset(&_next_hit, 0, sizeof(_next_hit)); |
156 |
|
|
//_beginning = bgzf_tell(sam_file->x.bam); |
157 |
|
|
_sam_header=sam_file->header; |
158 |
|
|
if (inspect_header(hs) == false) |
159 |
|
|
err_die("Error: invalid SAM header for file %s\n", |
160 |
|
|
hs._hit_file_name.c_str()); |
161 |
|
|
} |
162 |
|
|
} |
163 |
|
|
|
164 |
|
|
void BAMHitFactory::closeStream(HitStream& hs) { |
165 |
|
|
if (hs._hit_file) { |
166 |
|
|
samclose((samfile_t*)(hs._hit_file)); |
167 |
|
|
} |
168 |
|
|
hs._hit_file=NULL; |
169 |
|
|
_sam_header=NULL; |
170 |
|
|
} |
171 |
|
|
|
172 |
|
|
void BAMHitFactory::rewind(HitStream& hs) |
173 |
|
|
{ |
174 |
|
|
/* |
175 |
|
|
if (_hit_file && ((samfile_t*)_hit_file)->x.bam) |
176 |
|
|
{ |
177 |
|
|
bgzf_seek(((samfile_t*)_hit_file)->x.bam, _beginning, SEEK_SET); |
178 |
|
|
_eof = false; |
179 |
|
|
} |
180 |
|
|
*/ |
181 |
|
|
this->closeStream(hs); |
182 |
|
|
this->openStream(hs); |
183 |
|
|
} |
184 |
|
|
|
185 |
|
|
string BAMHitFactory::hitfile_rec(HitStream& hs, const char* hit_buf) { |
186 |
|
|
const bam1_t* bamrec=(const bam1_t*)hit_buf; |
187 |
|
|
char* tamline=bam_format1(((samfile_t*)(hs._hit_file))->header, bamrec); |
188 |
|
|
string sam_line(tamline); |
189 |
|
|
free(tamline); |
190 |
|
|
return sam_line; |
191 |
|
|
} |
192 |
|
|
|
193 |
|
|
bool BAMHitFactory::next_record(HitStream& hs, const char*& buf, size_t& buf_size) { |
194 |
|
|
if (_next_hit.data) { |
195 |
|
|
free(_next_hit.data); |
196 |
|
|
_next_hit.data = NULL; |
197 |
|
|
} |
198 |
|
|
_sam_header=((samfile_t*)(hs._hit_file))->header; //needed by get_hit_from_buf later on |
199 |
|
|
if (hs.eof() || !hs.ready()) return false; |
200 |
|
|
|
201 |
|
|
//mark_curr_pos(); |
202 |
|
|
|
203 |
|
|
memset(&_next_hit, 0, sizeof(_next_hit)); |
204 |
|
|
|
205 |
|
|
int bytes_read = samread((samfile_t*)(hs._hit_file), &_next_hit); |
206 |
|
|
if (bytes_read <= 0) { |
207 |
|
|
hs._eof = true; |
208 |
|
|
return false; |
209 |
|
|
} |
210 |
|
|
buf = (const char*)&_next_hit; |
211 |
|
|
buf_size = bytes_read; |
212 |
|
|
return true; |
213 |
|
|
} |
214 |
|
|
|
215 |
|
|
|
216 |
|
|
BowtieHit HitFactory::create_hit(const string& insert_name, |
217 |
|
|
const string& ref_name, |
218 |
|
|
int left, |
219 |
|
|
const vector<CigarOp>& cigar, |
220 |
|
|
bool antisense_aln, |
221 |
|
|
bool antisense_splice, |
222 |
|
|
unsigned char edit_dist, |
223 |
|
|
unsigned char splice_mms, |
224 |
|
|
bool end) |
225 |
|
|
{ |
226 |
|
|
uint64_t insert_id = _insert_table.get_id(insert_name); |
227 |
|
|
uint32_t reference_id = _ref_table.get_id(ref_name, NULL, 0); |
228 |
|
|
|
229 |
|
|
return BowtieHit(reference_id, |
230 |
|
|
insert_id, |
231 |
|
|
left, |
232 |
|
|
cigar, |
233 |
|
|
antisense_aln, |
234 |
|
|
antisense_splice, |
235 |
|
|
edit_dist, |
236 |
|
|
splice_mms, |
237 |
|
|
end); |
238 |
|
|
} |
239 |
|
|
|
240 |
|
|
BowtieHit HitFactory::create_hit(const string& insert_name, |
241 |
|
|
const string& ref_name, |
242 |
|
|
uint32_t left, |
243 |
|
|
uint32_t read_len, |
244 |
|
|
bool antisense_aln, |
245 |
|
|
unsigned char edit_dist, |
246 |
|
|
bool end) |
247 |
|
|
{ |
248 |
|
|
uint64_t insert_id = _insert_table.get_id(insert_name); |
249 |
|
|
uint32_t reference_id = _ref_table.get_id(ref_name, NULL, 0); |
250 |
|
|
|
251 |
|
|
return BowtieHit(reference_id, |
252 |
|
|
insert_id, |
253 |
|
|
left, |
254 |
|
|
read_len, |
255 |
|
|
antisense_aln, |
256 |
|
|
edit_dist, |
257 |
|
|
end); |
258 |
|
|
} |
259 |
|
|
|
260 |
|
|
bool BowtieHitFactory::get_hit_from_buf(const char* orig_bwt_buf, |
261 |
|
|
BowtieHit& bh, |
262 |
|
|
bool strip_slash, |
263 |
|
|
char* name_out, |
264 |
|
|
char* name_tags, |
265 |
|
|
char* seq, |
266 |
|
|
char* qual) |
267 |
|
|
{ |
268 |
|
|
if (!orig_bwt_buf || !*orig_bwt_buf) |
269 |
|
|
return false; |
270 |
|
|
|
271 |
|
|
static const int buf_size = 2048; |
272 |
|
|
|
273 |
|
|
char bwt_buf[buf_size]; |
274 |
|
|
strcpy(bwt_buf, orig_bwt_buf); |
275 |
|
|
//const char* bwt_fmt_str = "%s %c %s %d %s %s %d %s"; |
276 |
|
|
|
277 |
|
|
char orientation; |
278 |
|
|
|
279 |
|
|
//int bwtf_ret = 0; |
280 |
|
|
//uint32_t seqid = 0; |
281 |
|
|
|
282 |
|
|
char text_name[buf_size]; |
283 |
|
|
unsigned int text_offset; |
284 |
|
|
|
285 |
|
|
|
286 |
|
|
//unsigned int other_occs; |
287 |
|
|
char mismatches[buf_size]; |
288 |
|
|
//memset(mismatches, 0, sizeof(mismatches)); |
289 |
|
|
|
290 |
|
|
const char* buf = bwt_buf; |
291 |
|
|
char* name = get_token((char**)&buf,"\t"); |
292 |
|
|
char* orientation_str = get_token((char**)&buf,"\t"); |
293 |
|
|
char* text_name_str = get_token((char**)&buf,"\t"); |
294 |
|
|
|
295 |
|
|
strcpy(text_name, text_name_str); |
296 |
|
|
|
297 |
|
|
char* text_offset_str = get_token((char**)&buf,"\t"); |
298 |
|
|
char* seq_str = get_token((char**)&buf,"\t"); |
299 |
|
|
if (seq) |
300 |
|
|
strcpy(seq, seq_str); |
301 |
|
|
|
302 |
|
|
const char* qual_str = get_token((char**)&buf,"\t"); |
303 |
|
|
if (qual) |
304 |
|
|
strcpy(qual, qual_str); |
305 |
|
|
|
306 |
|
|
/*const char* other_occs_str =*/ get_token((char**)&buf, "\t"); |
307 |
|
|
mismatches[0] = 0; |
308 |
|
|
char* mismatches_str = get_token((char**)&buf, "\t"); |
309 |
|
|
if (mismatches_str) |
310 |
|
|
strcpy(mismatches, mismatches_str); |
311 |
|
|
|
312 |
|
|
orientation = orientation_str[0]; |
313 |
|
|
text_offset = atoi(text_offset_str); |
314 |
|
|
|
315 |
|
|
bool end = true; |
316 |
|
|
unsigned int seg_offset = 0; |
317 |
|
|
unsigned int seg_num = 0; |
318 |
|
|
unsigned int num_segs = 0; |
319 |
|
|
|
320 |
|
|
// Copy the tag out of the name field before we might wipe it out |
321 |
|
|
char* pipe = strrchr(name, '|'); |
322 |
|
|
if (pipe) |
323 |
|
|
{ |
324 |
|
|
if (name_tags) |
325 |
|
|
strcpy(name_tags, pipe); |
326 |
|
|
|
327 |
|
|
char* tag_buf = pipe + 1; |
328 |
|
|
if (strchr(tag_buf, ':')) |
329 |
|
|
{ |
330 |
|
|
sscanf(tag_buf, "%u:%u:%u", &seg_offset, &seg_num, &num_segs); |
331 |
|
|
if (seg_num + 1 == num_segs) |
332 |
|
|
end = true; |
333 |
|
|
else |
334 |
|
|
end = false; |
335 |
|
|
} |
336 |
|
|
|
337 |
|
|
*pipe = 0; |
338 |
|
|
} |
339 |
|
|
// Stripping the slash and number following it gives the insert name |
340 |
|
|
char* slash = strrchr(name, '/'); |
341 |
|
|
if (strip_slash && slash) |
342 |
|
|
*slash = 0; |
343 |
|
|
|
344 |
|
|
size_t read_len = strlen(seq_str); |
345 |
|
|
|
346 |
|
|
// Add this alignment to the table of hits for this half of the |
347 |
|
|
// Bowtie map |
348 |
|
|
|
349 |
|
|
//vector<string> mismatch_toks; |
350 |
|
|
char* pch = strtok (mismatches,","); |
351 |
|
|
unsigned char num_mismatches = 0; |
352 |
|
|
while (pch != NULL) |
353 |
|
|
{ |
354 |
|
|
char* colon = strchr(pch, ':'); |
355 |
|
|
if (colon) |
356 |
|
|
{ |
357 |
|
|
num_mismatches++; |
358 |
|
|
} |
359 |
|
|
//mismatch_toks.push_back(pch); |
360 |
|
|
pch = strtok (NULL, ","); |
361 |
|
|
} |
362 |
|
|
|
363 |
|
|
bh = create_hit(name, |
364 |
|
|
text_name, |
365 |
|
|
text_offset, |
366 |
|
|
(int)read_len, |
367 |
|
|
orientation == '-', |
368 |
|
|
num_mismatches, |
369 |
|
|
end); |
370 |
|
|
|
371 |
|
|
return true; |
372 |
|
|
} |
373 |
|
|
|
374 |
|
|
int anchor_mismatch = 0; |
375 |
|
|
|
376 |
gpertea |
70 |
|
377 |
|
|
void parseSegReadName(char* name, char*& name_tags, bool strip_slash, |
378 |
|
|
bool &end, unsigned int &seg_offset, unsigned int& seg_num, |
379 |
|
|
unsigned int & num_segs) { |
380 |
|
|
char* pipe = strrchr(name, '|'); |
381 |
|
|
if (pipe) |
382 |
|
|
{ |
383 |
|
|
if (name_tags) |
384 |
|
|
strcpy(name_tags, pipe); |
385 |
|
|
|
386 |
|
|
char* tag_buf = pipe + 1; |
387 |
|
|
if (strchr(tag_buf, ':')) |
388 |
|
|
{ |
389 |
|
|
sscanf(tag_buf, "%u:%u:%u", &seg_offset, &seg_num, &num_segs); |
390 |
|
|
if (seg_num + 1 == num_segs) |
391 |
|
|
end = true; |
392 |
|
|
else |
393 |
|
|
end = false; |
394 |
|
|
} |
395 |
|
|
|
396 |
|
|
*pipe = 0; |
397 |
|
|
} |
398 |
|
|
|
399 |
|
|
// Stripping the slash and number following it gives the insert name |
400 |
|
|
char* slash = strrchr(name, '/'); |
401 |
|
|
if (strip_slash && slash) |
402 |
|
|
*slash = 0; |
403 |
|
|
} |
404 |
|
|
|
405 |
|
|
|
406 |
gpertea |
29 |
bool SplicedBowtieHitFactory::get_hit_from_buf(const char* orig_bwt_buf, |
407 |
|
|
BowtieHit& bh, |
408 |
|
|
bool strip_slash, |
409 |
|
|
char* name_out, |
410 |
|
|
char* name_tags, |
411 |
|
|
char* seq, |
412 |
|
|
char* qual) |
413 |
|
|
{ |
414 |
|
|
if (!orig_bwt_buf || !*orig_bwt_buf) |
415 |
|
|
return false; |
416 |
|
|
|
417 |
|
|
static const int buf_size = 2048; |
418 |
|
|
|
419 |
|
|
char bwt_buf[buf_size]; |
420 |
|
|
strcpy(bwt_buf, orig_bwt_buf); |
421 |
|
|
//const char* bwt_fmt_str = "%s %c %s %d %s %s %d %s"; |
422 |
|
|
|
423 |
|
|
char orientation; |
424 |
|
|
char text_name[buf_size]; |
425 |
|
|
unsigned int text_offset; |
426 |
|
|
char mismatches[buf_size]; |
427 |
|
|
//memset(mismatches, 0, sizeof(mismatches)); |
428 |
|
|
|
429 |
|
|
char* buf = bwt_buf; |
430 |
|
|
char* name = get_token((char**)&buf,"\t"); |
431 |
|
|
char* orientation_str = get_token((char**)&buf,"\t"); |
432 |
|
|
char* text_name_str = get_token((char**)&buf,"\t"); |
433 |
|
|
strcpy(text_name, text_name_str); |
434 |
|
|
|
435 |
|
|
char* text_offset_str = get_token((char**)&buf,"\t"); |
436 |
|
|
char* seq_str = get_token((char**)&buf,"\t"); |
437 |
|
|
if (seq) |
438 |
|
|
strcpy(seq, seq_str); |
439 |
|
|
|
440 |
|
|
const char* qual_str = get_token((char**)&buf,"\t"); |
441 |
|
|
if (qual) |
442 |
|
|
strcpy(qual, qual_str); |
443 |
|
|
|
444 |
|
|
/*const char* other_occs_str =*/ get_token((char**)&buf, "\t"); |
445 |
|
|
mismatches[0] = 0; |
446 |
|
|
char* mismatches_str = get_token((char**)&buf, "\t"); |
447 |
|
|
if (mismatches_str) |
448 |
|
|
strcpy(mismatches, mismatches_str); |
449 |
|
|
|
450 |
|
|
orientation = orientation_str[0]; |
451 |
|
|
text_offset = atoi(text_offset_str); |
452 |
|
|
|
453 |
|
|
bool end = true; |
454 |
|
|
unsigned int seg_offset = 0; |
455 |
|
|
unsigned int seg_num = 0; |
456 |
|
|
unsigned int num_segs = 0; |
457 |
|
|
|
458 |
|
|
// Copy the tag out of the name field before we might wipe it out |
459 |
gpertea |
70 |
parseSegReadName(name, name_tags, strip_slash, end, seg_offset, seg_num, num_segs); |
460 |
gpertea |
29 |
|
461 |
|
|
// Add this alignment to the table of hits for this half of the |
462 |
|
|
// Bowtie map |
463 |
|
|
|
464 |
|
|
// Parse the text_name field to recover the splice coords |
465 |
|
|
vector<string> toks; |
466 |
|
|
|
467 |
|
|
tokenize_strict(text_name, "|", toks); |
468 |
|
|
|
469 |
|
|
int num_extra_toks = (int)toks.size() - 6; |
470 |
|
|
|
471 |
|
|
if (num_extra_toks >= 0) |
472 |
|
|
{ |
473 |
|
|
static const uint8_t left_window_edge_field = 1; |
474 |
|
|
static const uint8_t splice_field = 2; |
475 |
|
|
//static const uint8_t right_window_edge_field = 3; |
476 |
|
|
static const uint8_t junction_type_field = 4; |
477 |
|
|
static const uint8_t strand_field = 5; |
478 |
|
|
|
479 |
|
|
string contig = toks[0]; |
480 |
|
|
for (int t = 1; t <= num_extra_toks; ++t) |
481 |
|
|
{ |
482 |
|
|
contig += "|"; |
483 |
|
|
contig += toks[t]; |
484 |
|
|
} |
485 |
|
|
|
486 |
|
|
vector<string> splice_toks; |
487 |
|
|
tokenize(toks[num_extra_toks + splice_field], "-", splice_toks); |
488 |
|
|
if (splice_toks.size() != 2) |
489 |
|
|
{ |
490 |
|
|
fprintf(stderr, "Warning: found malformed splice record, skipping:\n"); |
491 |
|
|
//fprintf(stderr, "%s (token: %s)\n", text_name, |
492 |
|
|
// toks[num_extra_toks + splice_field].c_str()); |
493 |
|
|
return false; |
494 |
|
|
} |
495 |
|
|
|
496 |
|
|
// |
497 |
|
|
// check for an insertion hit |
498 |
|
|
// |
499 |
|
|
if(toks[num_extra_toks + junction_type_field] == "ins") |
500 |
|
|
{ |
501 |
|
|
int8_t spliced_read_len = strlen(seq_str); |
502 |
|
|
/* |
503 |
|
|
* The 0-based position of the left edge of the alignment. Note that this |
504 |
|
|
* value may need to be futher corrected to account for the presence of |
505 |
|
|
* of the insertion. |
506 |
|
|
*/ |
507 |
|
|
uint32_t left = atoi(toks[num_extra_toks + left_window_edge_field].c_str()) + text_offset; |
508 |
|
|
uint32_t right = left + spliced_read_len - 1; |
509 |
|
|
|
510 |
|
|
/* |
511 |
|
|
* The 0-based position of the last genomic sequence before the insertion |
512 |
|
|
*/ |
513 |
|
|
uint32_t left_splice_pos = atoi(splice_toks[0].c_str()); |
514 |
|
|
|
515 |
|
|
string insertedSequence = splice_toks[1]; |
516 |
|
|
/* |
517 |
|
|
* The 0-based position of the first genomic sequence after teh insertion |
518 |
|
|
*/ |
519 |
|
|
uint32_t right_splice_pos = left_splice_pos + 1; |
520 |
|
|
if(left > left_splice_pos){ |
521 |
|
|
/* |
522 |
|
|
* The genomic position of the left edge of the alignment needs to be corrected |
523 |
|
|
* If the alignment does not extend into the insertion, simply subtract the length |
524 |
|
|
* of the inserted sequence, otherwise, just set it equal to the right edge |
525 |
|
|
*/ |
526 |
|
|
left = left - insertedSequence.length(); |
527 |
|
|
if(left < right_splice_pos){ |
528 |
|
|
left = right_splice_pos; |
529 |
|
|
} |
530 |
|
|
} |
531 |
|
|
if(right > left_splice_pos){ |
532 |
|
|
right = right - insertedSequence.length(); |
533 |
|
|
if(right < left_splice_pos){ |
534 |
|
|
right = left_splice_pos; |
535 |
|
|
} |
536 |
|
|
} |
537 |
|
|
/* |
538 |
|
|
* Now, right and left should be properly transformed into genomic coordinates |
539 |
|
|
* We should be able to deduce how much the alignment matches the insertion |
540 |
|
|
* simply based on the length of the read |
541 |
|
|
*/ |
542 |
|
|
int left_match_length = 0; |
543 |
|
|
if(left <= left_splice_pos){ |
544 |
|
|
left_match_length = left_splice_pos - left + 1; |
545 |
|
|
} |
546 |
|
|
int right_match_length = 0; |
547 |
|
|
if(right >= right_splice_pos){ |
548 |
|
|
right_match_length = right - right_splice_pos + 1; |
549 |
|
|
} |
550 |
|
|
int insertion_match_length = spliced_read_len - left_match_length - right_match_length; |
551 |
|
|
|
552 |
|
|
if(left_match_length <= 0 || right_match_length <= 0 || insertion_match_length <= 0) |
553 |
|
|
return false; |
554 |
|
|
|
555 |
|
|
string junction_strand = toks[num_extra_toks + strand_field]; |
556 |
|
|
if(junction_strand != "rev" && junction_strand != "fwd"){ |
557 |
|
|
fprintf(stderr, "Malformed insertion record\n"); |
558 |
|
|
return false; |
559 |
|
|
} |
560 |
|
|
|
561 |
|
|
char* pch = strtok( mismatches, ","); |
562 |
|
|
unsigned char num_mismatches = 0; |
563 |
|
|
|
564 |
|
|
/* |
565 |
|
|
* remember that text_offset holds the left end of the |
566 |
|
|
* alignment, relative to the start of the contig |
567 |
|
|
*/ |
568 |
|
|
|
569 |
|
|
/* |
570 |
|
|
* The 0-based relative position of the left-most character |
571 |
|
|
* before the insertion in the contig |
572 |
|
|
*/ |
573 |
|
|
int relative_splice_pos = left_splice_pos - atoi(toks[num_extra_toks + left_window_edge_field].c_str()); |
574 |
|
|
while (pch != NULL) |
575 |
|
|
{ |
576 |
|
|
char* colon = strchr(pch, ':'); |
577 |
|
|
if (colon) |
578 |
|
|
{ |
579 |
|
|
*colon = 0; |
580 |
|
|
int mismatch_pos = atoi(pch); |
581 |
|
|
|
582 |
|
|
/* |
583 |
|
|
* for reversely mapped reads, |
584 |
|
|
* find the correct mismatched position. |
585 |
|
|
*/ |
586 |
|
|
if(orientation == '-'){ |
587 |
|
|
mismatch_pos = spliced_read_len - mismatch_pos - 1; |
588 |
|
|
} |
589 |
|
|
|
590 |
|
|
/* |
591 |
|
|
* Only count mismatches outside of the insertion region |
592 |
|
|
* If there is a mismatch within the insertion, |
593 |
|
|
* disallow this hit |
594 |
|
|
*/ |
595 |
|
|
if(mismatch_pos + text_offset <= relative_splice_pos || mismatch_pos + text_offset > relative_splice_pos + insertedSequence.length()){ |
596 |
|
|
num_mismatches++; |
597 |
|
|
}else{ |
598 |
|
|
return false; |
599 |
|
|
} |
600 |
|
|
} |
601 |
|
|
pch = strtok (NULL, ","); |
602 |
|
|
} |
603 |
|
|
|
604 |
|
|
|
605 |
|
|
vector<CigarOp> cigar; |
606 |
|
|
cigar.push_back(CigarOp(MATCH, left_match_length)); |
607 |
|
|
cigar.push_back(CigarOp(INS, insertion_match_length)); |
608 |
|
|
cigar.push_back(CigarOp(MATCH, right_match_length)); |
609 |
|
|
|
610 |
|
|
/* |
611 |
|
|
* For now, disallow hits that don't span |
612 |
|
|
* the insertion. If we allow these types of hits, |
613 |
|
|
* then long_spanning.cpp needs to be updated |
614 |
|
|
* in order to intelligently merge these kinds |
615 |
|
|
* of reads back together |
616 |
|
|
* |
617 |
|
|
* Following code has been changed to allow segment that end |
618 |
|
|
* in an insertion |
619 |
|
|
*/ |
620 |
|
|
bh = create_hit(name, |
621 |
|
|
contig, |
622 |
|
|
left, |
623 |
|
|
cigar, |
624 |
|
|
orientation == '-', |
625 |
|
|
junction_strand == "rev", |
626 |
|
|
num_mismatches, |
627 |
|
|
0, |
628 |
|
|
end); |
629 |
|
|
return true; |
630 |
|
|
} |
631 |
|
|
|
632 |
|
|
else |
633 |
|
|
{ |
634 |
|
|
uint32_t left = atoi(toks[num_extra_toks + left_window_edge_field].c_str()) + text_offset; |
635 |
|
|
int spliced_read_len = strlen(seq_str); |
636 |
|
|
int8_t left_splice_pos = atoi(splice_toks[0].c_str()) - left + 1; |
637 |
|
|
if(left_splice_pos > spliced_read_len) left_splice_pos = spliced_read_len; |
638 |
|
|
int8_t right_splice_pos = spliced_read_len - left_splice_pos; |
639 |
|
|
|
640 |
|
|
if (right_splice_pos <= 0 || left_splice_pos <= 0) |
641 |
|
|
return false; |
642 |
|
|
|
643 |
|
|
if (orientation == '+') |
644 |
|
|
{ |
645 |
|
|
if (left_splice_pos + seg_offset < _anchor_length){ |
646 |
|
|
return false; |
647 |
|
|
} |
648 |
|
|
} |
649 |
|
|
else |
650 |
|
|
{ |
651 |
|
|
if (right_splice_pos + seg_offset < _anchor_length) |
652 |
|
|
return false; |
653 |
|
|
} |
654 |
|
|
//uint32_t right = atoi(splice_toks[1].c_str()) + right_splice_pos; |
655 |
|
|
//atoi(toks[right_window_edge_field].c_str()); |
656 |
|
|
int gap_len = atoi(splice_toks[1].c_str()) - atoi(splice_toks[0].c_str()) - 1; |
657 |
|
|
|
658 |
|
|
string junction_strand = toks[num_extra_toks + strand_field]; |
659 |
|
|
if (!(junction_strand == "rev" || junction_strand == "fwd")|| |
660 |
|
|
!(orientation == '-' || orientation == '+')) |
661 |
|
|
{ |
662 |
|
|
fprintf(stderr, "Warning: found malformed splice record, skipping\n"); |
663 |
|
|
//fprintf(stderr, "junction_strand=%s, orientation='%c'\n", |
664 |
|
|
// junction_strand.c_str(), orientation); |
665 |
|
|
return false; |
666 |
|
|
} |
667 |
|
|
|
668 |
|
|
//vector<string> mismatch_toks; |
669 |
|
|
char* pch = strtok (mismatches,","); |
670 |
|
|
int mismatches_in_anchor = 0; |
671 |
|
|
unsigned char num_mismatches = 0; |
672 |
|
|
while (pch != NULL) |
673 |
|
|
{ |
674 |
|
|
char* colon = strchr(pch, ':'); |
675 |
|
|
if (colon) |
676 |
|
|
{ |
677 |
|
|
*colon = 0; |
678 |
|
|
num_mismatches++; |
679 |
|
|
int mismatch_pos = atoi(pch); |
680 |
|
|
if ((orientation == '+' && abs(mismatch_pos - left_splice_pos) < (int)min_anchor_len) || |
681 |
|
|
(orientation == '-' && abs(((int)spliced_read_len - left_splice_pos + 1) - mismatch_pos)) < (int)min_anchor_len) |
682 |
|
|
mismatches_in_anchor++; |
683 |
|
|
} |
684 |
|
|
//mismatch_toks.push_back(pch); |
685 |
|
|
pch = strtok (NULL, ","); |
686 |
|
|
} |
687 |
|
|
|
688 |
|
|
// FIXME: we probably should exclude these hits somewhere, but this |
689 |
|
|
// isn't the right place |
690 |
|
|
vector<CigarOp> cigar; |
691 |
|
|
cigar.push_back(CigarOp(MATCH, left_splice_pos)); |
692 |
|
|
if(toks[num_extra_toks + junction_type_field] == "del"){ |
693 |
|
|
cigar.push_back(CigarOp(DEL, gap_len)); |
694 |
|
|
}else{ |
695 |
|
|
cigar.push_back(CigarOp(REF_SKIP, gap_len)); |
696 |
|
|
} |
697 |
|
|
cigar.push_back(CigarOp(MATCH, right_splice_pos)); |
698 |
|
|
|
699 |
|
|
bh = create_hit(name, |
700 |
|
|
contig, |
701 |
|
|
left, |
702 |
|
|
cigar, |
703 |
|
|
orientation == '-', |
704 |
|
|
junction_strand == "rev", |
705 |
|
|
num_mismatches, |
706 |
|
|
mismatches_in_anchor, |
707 |
|
|
end); |
708 |
|
|
return true; |
709 |
|
|
} |
710 |
|
|
} |
711 |
|
|
else |
712 |
|
|
{ |
713 |
|
|
fprintf(stderr, "Warning: found malformed splice record, skipping\n"); |
714 |
|
|
//fprintf(stderr, "%s\n", orig_bwt_buf); |
715 |
|
|
// continue; |
716 |
|
|
return false; |
717 |
|
|
} |
718 |
|
|
|
719 |
|
|
return false; |
720 |
|
|
} |
721 |
|
|
|
722 |
gpertea |
70 |
|
723 |
|
|
char parseCigar(vector<CigarOp>& cigar, const char* cigar_str, |
724 |
|
|
bool &spliced_alignment) { |
725 |
|
|
// Mostly pilfered direct from the SAM tools: |
726 |
|
|
const char* p_cig = cigar_str; |
727 |
|
|
while (*p_cig) |
728 |
|
|
{ |
729 |
|
|
char* t; |
730 |
|
|
int length = (int)strtol(p_cig, &t, 10); |
731 |
|
|
if (length <= 0) |
732 |
|
|
{ |
733 |
|
|
//fprintf (stderr, "CIGAR op has zero length\n"); |
734 |
|
|
return false; |
735 |
|
|
} |
736 |
|
|
char op_char = toupper(*t); |
737 |
|
|
CigarOpCode opcode; |
738 |
|
|
if (op_char == 'M') opcode = MATCH; |
739 |
|
|
else if (op_char == 'I') opcode = INS; |
740 |
|
|
else if (op_char == 'D') opcode = DEL; |
741 |
|
|
else if (op_char == 'N') |
742 |
|
|
{ |
743 |
|
|
if (length > max_report_intron_length) |
744 |
|
|
return false; |
745 |
|
|
opcode = REF_SKIP; |
746 |
|
|
spliced_alignment = true; |
747 |
|
|
} |
748 |
|
|
else if (op_char == 'S') opcode = SOFT_CLIP; |
749 |
|
|
else if (op_char == 'H') opcode = HARD_CLIP; |
750 |
|
|
else if (op_char == 'P') opcode = PAD; |
751 |
|
|
else |
752 |
|
|
{ |
753 |
|
|
fprintf (stderr, "invalid CIGAR operation\n"); |
754 |
|
|
return false; |
755 |
|
|
} |
756 |
|
|
p_cig = t + 1; |
757 |
|
|
//i += length; |
758 |
|
|
cigar.push_back(CigarOp(opcode, length)); |
759 |
|
|
} |
760 |
|
|
if (*p_cig) { |
761 |
|
|
fprintf (stderr, "unmatched CIGAR operation\n"); |
762 |
|
|
return *p_cig; |
763 |
|
|
} |
764 |
|
|
return 0; |
765 |
|
|
} |
766 |
|
|
|
767 |
|
|
void getSAMmismatches(char* &buf, vector<CigarOp>& cigar, |
768 |
|
|
int* mismatches, int& num_mismatches, int& sam_nm, bool& antisense_splice) { |
769 |
|
|
const char* tag_buf = buf; |
770 |
|
|
sam_nm=0; |
771 |
|
|
num_mismatches=0; |
772 |
|
|
while((tag_buf = get_token((char**)&buf,"\t"))) |
773 |
|
|
{ |
774 |
|
|
vector<string> tuple_fields; |
775 |
|
|
tokenize(tag_buf,":", tuple_fields); |
776 |
|
|
if (tuple_fields.size() == 3) |
777 |
|
|
{ |
778 |
|
|
if (tuple_fields[0] == "XS") |
779 |
|
|
{ |
780 |
|
|
if (tuple_fields[2] == "-") |
781 |
|
|
antisense_splice = true; |
782 |
|
|
} |
783 |
|
|
else if (tuple_fields[0] == "NM") |
784 |
|
|
{ |
785 |
|
|
sam_nm = atoi(tuple_fields[2].c_str()); |
786 |
|
|
} |
787 |
|
|
else if (tuple_fields[0] == "NS") |
788 |
|
|
{ |
789 |
|
|
//ignored for now |
790 |
|
|
} |
791 |
|
|
else if (tuple_fields[0] == "MD") |
792 |
|
|
{ |
793 |
|
|
const char* p=tuple_fields[2].c_str(); |
794 |
|
|
int bi=0; //base offset position in the read |
795 |
|
|
while (*p != 0) { |
796 |
|
|
if (isdigit(*p)) { |
797 |
|
|
int v=atoi(p); |
798 |
|
|
do { p++; } while (isdigit(*p)); |
799 |
|
|
bi+=v; |
800 |
|
|
} |
801 |
|
|
while (isupper(*p)) { |
802 |
|
|
p++; |
803 |
|
|
mismatches[num_mismatches]=bi; |
804 |
|
|
num_mismatches++; |
805 |
|
|
bi++; |
806 |
|
|
} |
807 |
|
|
if (*p=='^') { //reference deletion |
808 |
|
|
p++; |
809 |
|
|
while (isupper(*p)) { //insert read bases |
810 |
|
|
p++; bi++; |
811 |
|
|
} |
812 |
|
|
} |
813 |
|
|
} |
814 |
|
|
} |
815 |
|
|
else |
816 |
|
|
{ |
817 |
|
|
//fprintf(stderr, "%s attribute not supported\n", tuple_fields[0].c_str()); |
818 |
|
|
//return false; |
819 |
|
|
} |
820 |
|
|
} |
821 |
|
|
} |
822 |
|
|
/* By convention,the NM field of the SAM record |
823 |
|
|
* counts an insertion or deletion. I dont' think |
824 |
|
|
* we want the mismatch count in the BowtieHit |
825 |
|
|
* record to reflect this. Therefore, subtract out |
826 |
|
|
* the mismatches due to in/dels |
827 |
|
|
*/ |
828 |
|
|
for(vector<CigarOp>::const_iterator itr = cigar.begin(); itr != cigar.end(); ++itr){ |
829 |
|
|
if(itr->opcode == INS){ |
830 |
|
|
sam_nm -= itr->length; |
831 |
|
|
} |
832 |
|
|
if(itr->opcode == DEL){ |
833 |
|
|
sam_nm -= itr->length; |
834 |
|
|
} |
835 |
|
|
} |
836 |
|
|
} |
837 |
|
|
|
838 |
gpertea |
29 |
bool SAMHitFactory::get_hit_from_buf(const char* orig_bwt_buf, |
839 |
|
|
BowtieHit& bh, |
840 |
|
|
bool strip_slash, |
841 |
|
|
char* name_out, |
842 |
|
|
char* name_tags, |
843 |
|
|
char* seq, |
844 |
|
|
char* qual) |
845 |
|
|
{ |
846 |
|
|
if (!orig_bwt_buf || !*orig_bwt_buf) |
847 |
|
|
return false; |
848 |
|
|
char bwt_buf[2048]; |
849 |
|
|
strcpy(bwt_buf, orig_bwt_buf); |
850 |
|
|
// Are we still in the header region? |
851 |
|
|
if (bwt_buf[0] == '@') |
852 |
|
|
return false; |
853 |
|
|
|
854 |
|
|
char* buf = bwt_buf; |
855 |
|
|
char* name = get_token((char**)&buf,"\t"); |
856 |
|
|
char* sam_flag_str = get_token((char**)&buf,"\t"); |
857 |
|
|
char* text_name = get_token((char**)&buf,"\t"); |
858 |
|
|
char* text_offset_str = get_token((char**)&buf,"\t"); |
859 |
|
|
const char* map_qual_str = get_token((char**)&buf,"\t"); |
860 |
|
|
char* cigar_str = get_token((char**)&buf,"\t"); |
861 |
|
|
const char* mate_ref_str = get_token((char**)&buf,"\t"); |
862 |
|
|
const char* mate_pos_str = get_token((char**)&buf,"\t"); |
863 |
|
|
const char* inferred_insert_sz_str = get_token((char**)&buf,"\t"); |
864 |
|
|
|
865 |
|
|
const char* seq_str = get_token((char**)&buf,"\t"); |
866 |
|
|
if (seq) |
867 |
|
|
strcpy(seq, seq_str); |
868 |
|
|
|
869 |
|
|
const char* qual_str = get_token((char**)&buf,"\t"); |
870 |
|
|
if (qual) |
871 |
|
|
strcpy(qual, qual_str); |
872 |
|
|
|
873 |
|
|
if (!name || |
874 |
|
|
!sam_flag_str || |
875 |
|
|
!text_name || |
876 |
|
|
!text_offset_str || |
877 |
|
|
!map_qual_str || |
878 |
|
|
!cigar_str || |
879 |
|
|
!mate_ref_str || |
880 |
|
|
!mate_pos_str || |
881 |
|
|
!inferred_insert_sz_str || |
882 |
|
|
!seq_str || |
883 |
|
|
!qual_str) |
884 |
|
|
{ |
885 |
|
|
// truncated or malformed SAM record |
886 |
|
|
return false; |
887 |
|
|
} |
888 |
|
|
|
889 |
|
|
|
890 |
|
|
int sam_flag = atoi(sam_flag_str); |
891 |
|
|
int text_offset = atoi(text_offset_str); |
892 |
|
|
|
893 |
|
|
bool end = true; |
894 |
|
|
unsigned int seg_offset = 0; |
895 |
|
|
unsigned int seg_num = 0; |
896 |
|
|
unsigned int num_segs = 0; |
897 |
|
|
|
898 |
|
|
// Copy the tag out of the name field before we might wipe it out |
899 |
gpertea |
70 |
parseSegReadName(name, name_tags, strip_slash, end, seg_offset, seg_num, num_segs); |
900 |
gpertea |
29 |
|
901 |
|
|
vector<CigarOp> cigar; |
902 |
|
|
bool spliced_alignment = false; |
903 |
gpertea |
70 |
if (parseCigar(cigar, cigar_str, spliced_alignment)) { |
904 |
|
|
return false; |
905 |
|
|
} |
906 |
gpertea |
29 |
|
907 |
|
|
//vector<string> attributes; |
908 |
|
|
//tokenize(tag_buf, " \t",attributes); |
909 |
gpertea |
70 |
|
910 |
gpertea |
29 |
bool antisense_splice = false; |
911 |
gpertea |
70 |
int num_mismatches = 0; |
912 |
|
|
int sam_nm = 0; //the value of the NM tag (edit distance) |
913 |
|
|
int mismatches[1024]; |
914 |
|
|
getSAMmismatches(buf, cigar, mismatches, num_mismatches, sam_nm, antisense_splice); |
915 |
gpertea |
29 |
|
916 |
|
|
if (spliced_alignment) |
917 |
|
|
{ |
918 |
|
|
bh = create_hit(name, |
919 |
|
|
text_name, |
920 |
|
|
text_offset - 1, |
921 |
|
|
cigar, |
922 |
|
|
sam_flag & 0x0010, |
923 |
|
|
antisense_splice, |
924 |
|
|
num_mismatches, |
925 |
gpertea |
70 |
0, |
926 |
gpertea |
29 |
end); |
927 |
|
|
} |
928 |
|
|
else |
929 |
|
|
{ |
930 |
|
|
//assert(cigar.size() == 1 && cigar[0].opcode == MATCH); |
931 |
|
|
bh = create_hit(name, |
932 |
|
|
text_name, |
933 |
|
|
text_offset - 1, // SAM files are 1-indexed |
934 |
|
|
cigar, |
935 |
|
|
sam_flag & 0x0010, |
936 |
|
|
false, |
937 |
|
|
num_mismatches, |
938 |
|
|
0, |
939 |
|
|
end); |
940 |
|
|
} |
941 |
gpertea |
70 |
return true; |
942 |
gpertea |
29 |
} |
943 |
|
|
|
944 |
gpertea |
70 |
|
945 |
|
|
void spliceCigar(vector<CigarOp>& splcigar, vector<CigarOp>& cigar, |
946 |
|
|
int8_t l_len, int mlen, int8_t r_len, CigarOpCode opcode) { |
947 |
|
|
//merge the original 'cigar' with the new insert/gap operation |
948 |
|
|
// at position l_len into splcigar; |
949 |
|
|
//find the original opcode that croses l_len location |
950 |
|
|
int ofs=0; |
951 |
|
|
bool found=false; |
952 |
|
|
for (size_t c = 0 ; c < cigar.size(); ++c) |
953 |
|
|
{ |
954 |
|
|
ofs+=cigar[c].length; |
955 |
|
|
if (!found) { |
956 |
|
|
if (ofs>=l_len) { |
957 |
|
|
found=true; |
958 |
gpertea |
97 |
//TODO:inject new code here, splitting existing code if necessary |
959 |
|
|
|
960 |
|
|
continue; |
961 |
gpertea |
70 |
} |
962 |
|
|
else { |
963 |
|
|
//not found yet, just copy these codes |
964 |
|
|
splcigar.push_back(cigar[c]); |
965 |
|
|
} |
966 |
|
|
} |
967 |
gpertea |
97 |
else { //found already |
968 |
|
|
//TODO: check this |
969 |
|
|
splcigar.push_back(cigar[c]); |
970 |
gpertea |
70 |
} |
971 |
|
|
} |
972 |
|
|
} |
973 |
|
|
|
974 |
gpertea |
69 |
bool SplicedSAMHitFactory::get_hit_from_buf(const char* orig_bwt_buf, |
975 |
|
|
BowtieHit& bh, |
976 |
|
|
bool strip_slash, |
977 |
|
|
char* name_out, |
978 |
|
|
char* name_tags, |
979 |
|
|
char* seq, |
980 |
|
|
char* qual) |
981 |
|
|
{ |
982 |
|
|
if (!orig_bwt_buf || !*orig_bwt_buf) |
983 |
|
|
return false; |
984 |
gpertea |
29 |
|
985 |
gpertea |
69 |
char bwt_buf[2048]; |
986 |
|
|
strcpy(bwt_buf, orig_bwt_buf); |
987 |
|
|
// Are we still in the header region? |
988 |
|
|
if (bwt_buf[0] == '@') |
989 |
|
|
return false; |
990 |
gpertea |
29 |
|
991 |
gpertea |
69 |
char* buf = bwt_buf; |
992 |
|
|
char* name = get_token((char**)&buf,"\t"); |
993 |
|
|
char* sam_flag_str = get_token((char**)&buf,"\t"); |
994 |
|
|
char* text_name = get_token((char**)&buf,"\t"); |
995 |
|
|
char* text_offset_str = get_token((char**)&buf,"\t"); |
996 |
|
|
const char* map_qual_str = get_token((char**)&buf,"\t"); |
997 |
|
|
char* cigar_str = get_token((char**)&buf,"\t"); |
998 |
|
|
const char* mate_ref_str = get_token((char**)&buf,"\t"); |
999 |
|
|
const char* mate_pos_str = get_token((char**)&buf,"\t"); |
1000 |
|
|
const char* inferred_insert_sz_str = get_token((char**)&buf,"\t"); |
1001 |
gpertea |
70 |
int num_mismatches=0; |
1002 |
|
|
int mismatches[1024]; //list of 0-based mismatch positions in this read |
1003 |
|
|
//parsed from SAM's MD:Z: tag |
1004 |
gpertea |
69 |
const char* seq_str = get_token((char**)&buf,"\t"); |
1005 |
|
|
if (seq) |
1006 |
|
|
strcpy(seq, seq_str); |
1007 |
|
|
|
1008 |
|
|
const char* qual_str = get_token((char**)&buf,"\t"); |
1009 |
|
|
if (qual) |
1010 |
|
|
strcpy(qual, qual_str); |
1011 |
|
|
|
1012 |
|
|
if (!name || |
1013 |
|
|
!sam_flag_str || |
1014 |
|
|
!text_name || |
1015 |
|
|
!text_offset_str || |
1016 |
|
|
!map_qual_str || |
1017 |
|
|
!cigar_str || |
1018 |
|
|
!mate_ref_str || |
1019 |
|
|
!mate_pos_str || |
1020 |
|
|
!inferred_insert_sz_str || |
1021 |
|
|
!seq_str || |
1022 |
|
|
!qual_str) |
1023 |
|
|
{ |
1024 |
|
|
// truncated or malformed SAM record |
1025 |
|
|
return false; |
1026 |
|
|
} |
1027 |
|
|
|
1028 |
|
|
|
1029 |
|
|
int sam_flag = atoi(sam_flag_str); |
1030 |
|
|
int text_offset = atoi(text_offset_str); |
1031 |
gpertea |
70 |
text_offset--; //SAM is 1-based, Bowtie is 0-based |
1032 |
gpertea |
69 |
bool end = true; |
1033 |
|
|
unsigned int seg_offset = 0; |
1034 |
|
|
unsigned int seg_num = 0; |
1035 |
|
|
unsigned int num_segs = 0; |
1036 |
|
|
|
1037 |
|
|
// Copy the tag out of the name field before we might wipe it out |
1038 |
gpertea |
70 |
parseSegReadName(name, name_tags, strip_slash, end, seg_offset, seg_num, num_segs); |
1039 |
gpertea |
69 |
|
1040 |
gpertea |
70 |
vector<CigarOp> samcigar; |
1041 |
|
|
bool spliced_alignment = false; |
1042 |
|
|
if (parseCigar(samcigar, cigar_str, spliced_alignment)) { |
1043 |
|
|
return false; |
1044 |
|
|
} |
1045 |
gpertea |
69 |
|
1046 |
gpertea |
70 |
bool antisense_splice = false; |
1047 |
|
|
int sam_nm = 0; |
1048 |
gpertea |
135 |
getSAMmismatches(buf, samcigar, mismatches, num_mismatches, sam_nm, antisense_splice); |
1049 |
gpertea |
69 |
|
1050 |
gpertea |
70 |
//############################################## |
1051 |
gpertea |
69 |
|
1052 |
|
|
// Add this alignment to the table of hits for this half of the |
1053 |
|
|
// Bowtie map |
1054 |
|
|
|
1055 |
|
|
// Parse the text_name field to recover the splice coords |
1056 |
|
|
vector<string> toks; |
1057 |
|
|
|
1058 |
|
|
tokenize_strict(text_name, "|", toks); |
1059 |
|
|
|
1060 |
|
|
int num_extra_toks = (int)toks.size() - 6; |
1061 |
|
|
|
1062 |
|
|
if (num_extra_toks >= 0) |
1063 |
|
|
{ |
1064 |
|
|
static const uint8_t left_window_edge_field = 1; |
1065 |
|
|
static const uint8_t splice_field = 2; |
1066 |
|
|
//static const uint8_t right_window_edge_field = 3; |
1067 |
|
|
static const uint8_t junction_type_field = 4; |
1068 |
|
|
static const uint8_t strand_field = 5; |
1069 |
|
|
|
1070 |
|
|
string contig = toks[0]; |
1071 |
|
|
for (int t = 1; t <= num_extra_toks; ++t) |
1072 |
|
|
{ |
1073 |
|
|
contig += "|"; |
1074 |
|
|
contig += toks[t]; |
1075 |
|
|
} |
1076 |
|
|
|
1077 |
|
|
vector<string> splice_toks; |
1078 |
|
|
tokenize(toks[num_extra_toks + splice_field], "-", splice_toks); |
1079 |
|
|
if (splice_toks.size() != 2) |
1080 |
|
|
{ |
1081 |
|
|
fprintf(stderr, "Warning: found malformed splice record, skipping:\n"); |
1082 |
gpertea |
70 |
//fprintf(stderr, "\t%s (token: %s)\n", text_name, |
1083 |
gpertea |
69 |
// toks[num_extra_toks + splice_field].c_str()); |
1084 |
|
|
return false; |
1085 |
|
|
} |
1086 |
gpertea |
70 |
int spl_num_mismatches=0; |
1087 |
gpertea |
69 |
// |
1088 |
|
|
// check for an insertion hit |
1089 |
|
|
// |
1090 |
|
|
if(toks[num_extra_toks + junction_type_field] == "ins") |
1091 |
|
|
{ |
1092 |
|
|
int8_t spliced_read_len = strlen(seq_str); |
1093 |
gpertea |
70 |
// The 0-based position of the left edge of the alignment. Note that this |
1094 |
|
|
// value may need to be futher corrected to account for the presence of |
1095 |
|
|
// of the insertion. |
1096 |
gpertea |
69 |
uint32_t left = atoi(toks[num_extra_toks + left_window_edge_field].c_str()) + text_offset; |
1097 |
|
|
uint32_t right = left + spliced_read_len - 1; |
1098 |
|
|
|
1099 |
gpertea |
70 |
// The 0-based position of the last genomic sequence before the insertion |
1100 |
gpertea |
69 |
uint32_t left_splice_pos = atoi(splice_toks[0].c_str()); |
1101 |
|
|
|
1102 |
|
|
string insertedSequence = splice_toks[1]; |
1103 |
gpertea |
70 |
// The 0-based position of the first genomic sequence after the insertion |
1104 |
|
|
|
1105 |
gpertea |
69 |
uint32_t right_splice_pos = left_splice_pos + 1; |
1106 |
|
|
if(left > left_splice_pos){ |
1107 |
|
|
/* |
1108 |
|
|
* The genomic position of the left edge of the alignment needs to be corrected |
1109 |
|
|
* If the alignment does not extend into the insertion, simply subtract the length |
1110 |
|
|
* of the inserted sequence, otherwise, just set it equal to the right edge |
1111 |
|
|
*/ |
1112 |
|
|
left = left - insertedSequence.length(); |
1113 |
|
|
if(left < right_splice_pos){ |
1114 |
|
|
left = right_splice_pos; |
1115 |
|
|
} |
1116 |
|
|
} |
1117 |
|
|
if(right > left_splice_pos){ |
1118 |
|
|
right = right - insertedSequence.length(); |
1119 |
|
|
if(right < left_splice_pos){ |
1120 |
|
|
right = left_splice_pos; |
1121 |
|
|
} |
1122 |
|
|
} |
1123 |
|
|
/* |
1124 |
|
|
* Now, right and left should be properly transformed into genomic coordinates |
1125 |
|
|
* We should be able to deduce how much the alignment matches the insertion |
1126 |
|
|
* simply based on the length of the read |
1127 |
|
|
*/ |
1128 |
|
|
int left_match_length = 0; |
1129 |
|
|
if(left <= left_splice_pos){ |
1130 |
|
|
left_match_length = left_splice_pos - left + 1; |
1131 |
|
|
} |
1132 |
|
|
int right_match_length = 0; |
1133 |
|
|
if(right >= right_splice_pos){ |
1134 |
|
|
right_match_length = right - right_splice_pos + 1; |
1135 |
|
|
} |
1136 |
|
|
int insertion_match_length = spliced_read_len - left_match_length - right_match_length; |
1137 |
|
|
|
1138 |
|
|
if(left_match_length <= 0 || right_match_length <= 0 || insertion_match_length <= 0) |
1139 |
|
|
return false; |
1140 |
|
|
|
1141 |
|
|
string junction_strand = toks[num_extra_toks + strand_field]; |
1142 |
|
|
if(junction_strand != "rev" && junction_strand != "fwd"){ |
1143 |
|
|
fprintf(stderr, "Malformed insertion record\n"); |
1144 |
|
|
return false; |
1145 |
|
|
} |
1146 |
|
|
|
1147 |
gpertea |
70 |
//char* pch = strtok( mismatches, ","); |
1148 |
|
|
//unsigned char num_mismatches = 0; |
1149 |
|
|
//text_offset holds the left end of the alignment, |
1150 |
|
|
//RELATIVE TO the start of the contig |
1151 |
gpertea |
69 |
|
1152 |
gpertea |
70 |
//The 0-based relative position of the left-most character |
1153 |
|
|
//before the insertion in the contig |
1154 |
gpertea |
69 |
int relative_splice_pos = left_splice_pos - atoi(toks[num_extra_toks + left_window_edge_field].c_str()); |
1155 |
gpertea |
70 |
for (int i=0;i<num_mismatches;i++) { |
1156 |
|
|
int mismatch_pos = mismatches[i]; |
1157 |
|
|
// for reversely mapped reads, |
1158 |
|
|
//find the correct mismatched position. |
1159 |
|
|
if (sam_flag & 0x0010){ |
1160 |
|
|
mismatch_pos = spliced_read_len - mismatch_pos - 1; |
1161 |
|
|
} |
1162 |
gpertea |
69 |
|
1163 |
gpertea |
70 |
//Only count mismatches outside of the insertion region |
1164 |
|
|
// If there is a mismatch within the insertion, |
1165 |
|
|
// disallow this hit |
1166 |
|
|
if(mismatch_pos + text_offset <= relative_splice_pos || mismatch_pos + text_offset > relative_splice_pos + insertedSequence.length()){ |
1167 |
|
|
spl_num_mismatches++; |
1168 |
|
|
}else{ |
1169 |
|
|
return false; |
1170 |
gpertea |
69 |
} |
1171 |
|
|
} |
1172 |
|
|
|
1173 |
gpertea |
135 |
//TODO: merge with the original cigar string |
1174 |
gpertea |
70 |
vector<CigarOp> splcigar; |
1175 |
|
|
spliceCigar(splcigar, samcigar, left_match_length, insertion_match_length, right_match_length, INS); |
1176 |
gpertea |
69 |
/* |
1177 |
gpertea |
70 |
splcigar.push_back(CigarOp(MATCH, left_match_length)); |
1178 |
|
|
splcigar.push_back(CigarOp(INS, insertion_match_length)); |
1179 |
|
|
splcigar.push_back(CigarOp(MATCH, right_match_length)); |
1180 |
|
|
*/ |
1181 |
|
|
/* |
1182 |
gpertea |
69 |
* For now, disallow hits that don't span |
1183 |
|
|
* the insertion. If we allow these types of hits, |
1184 |
|
|
* then long_spanning.cpp needs to be updated |
1185 |
|
|
* in order to intelligently merge these kinds |
1186 |
|
|
* of reads back together |
1187 |
|
|
* |
1188 |
|
|
* Following code has been changed to allow segment that end |
1189 |
|
|
* in an insertion |
1190 |
|
|
*/ |
1191 |
|
|
bh = create_hit(name, |
1192 |
|
|
contig, |
1193 |
|
|
left, |
1194 |
gpertea |
70 |
//splcigar, |
1195 |
|
|
splcigar, |
1196 |
|
|
sam_flag & 0x0010, |
1197 |
gpertea |
69 |
junction_strand == "rev", |
1198 |
gpertea |
70 |
spl_num_mismatches, |
1199 |
gpertea |
69 |
0, |
1200 |
|
|
end); |
1201 |
|
|
return true; |
1202 |
gpertea |
70 |
} //"ins" |
1203 |
gpertea |
69 |
else |
1204 |
|
|
{ |
1205 |
|
|
uint32_t left = atoi(toks[num_extra_toks + left_window_edge_field].c_str()) + text_offset; |
1206 |
|
|
int spliced_read_len = strlen(seq_str); |
1207 |
|
|
int8_t left_splice_pos = atoi(splice_toks[0].c_str()) - left + 1; |
1208 |
|
|
if(left_splice_pos > spliced_read_len) left_splice_pos = spliced_read_len; |
1209 |
|
|
int8_t right_splice_pos = spliced_read_len - left_splice_pos; |
1210 |
|
|
|
1211 |
|
|
if (right_splice_pos <= 0 || left_splice_pos <= 0) |
1212 |
|
|
return false; |
1213 |
|
|
|
1214 |
|
|
if ((sam_flag & 0x0010) == 0) //###### |
1215 |
|
|
{ |
1216 |
gpertea |
135 |
if (left_splice_pos + seg_offset < _anchor_length) |
1217 |
|
|
return false; |
1218 |
|
|
} |
1219 |
|
|
else |
1220 |
|
|
{ |
1221 |
|
|
if (right_splice_pos + seg_offset < _anchor_length) |
1222 |
|
|
return false; |
1223 |
|
|
} |
1224 |
gpertea |
69 |
//uint32_t right = atoi(splice_toks[1].c_str()) + right_splice_pos; |
1225 |
|
|
//atoi(toks[right_window_edge_field].c_str()); |
1226 |
|
|
int gap_len = atoi(splice_toks[1].c_str()) - atoi(splice_toks[0].c_str()) - 1; |
1227 |
|
|
|
1228 |
|
|
string junction_strand = toks[num_extra_toks + strand_field]; |
1229 |
gpertea |
70 |
if (!(junction_strand == "rev" || junction_strand == "fwd")) |
1230 |
|
|
// || !(orientation == '-' || orientation == '+')) |
1231 |
gpertea |
69 |
{ |
1232 |
gpertea |
135 |
fprintf(stderr, "Warning: found malformed splice record, skipping\n"); |
1233 |
|
|
//fprintf(stderr, "junction_strand=%s, orientation='%c'\n", |
1234 |
|
|
// junction_strand.c_str(), orientation); |
1235 |
|
|
return false; |
1236 |
gpertea |
69 |
} |
1237 |
|
|
|
1238 |
|
|
int mismatches_in_anchor = 0; |
1239 |
gpertea |
70 |
for (int i=0;i<num_mismatches;i++) { |
1240 |
|
|
spl_num_mismatches++; |
1241 |
|
|
int mismatch_pos = mismatches[i]; |
1242 |
|
|
if (((sam_flag & 0x0010) == 0 && abs(mismatch_pos - left_splice_pos) < (int)min_anchor_len) || |
1243 |
|
|
((sam_flag & 0x0010) != 0 && |
1244 |
|
|
abs(((int)spliced_read_len - left_splice_pos + 1) - mismatch_pos)) < (int)min_anchor_len) |
1245 |
|
|
mismatches_in_anchor++; |
1246 |
|
|
} |
1247 |
gpertea |
69 |
|
1248 |
|
|
// FIXME: we probably should exclude these hits somewhere, but this |
1249 |
|
|
// isn't the right place |
1250 |
gpertea |
70 |
vector<CigarOp> splcigar; |
1251 |
|
|
CigarOpCode opcode=(toks[num_extra_toks + junction_type_field] == "del")? DEL : REF_SKIP ; |
1252 |
gpertea |
135 |
spliceCigar(splcigar, samcigar, left_splice_pos, gap_len, right_splice_pos, opcode); |
1253 |
gpertea |
70 |
/* |
1254 |
|
|
splcigar.push_back(CigarOp(MATCH, left_splice_pos)); |
1255 |
gpertea |
69 |
if(toks[num_extra_toks + junction_type_field] == "del"){ |
1256 |
gpertea |
70 |
splcigar.push_back(CigarOp(DEL, gap_len)); |
1257 |
gpertea |
69 |
}else{ |
1258 |
gpertea |
70 |
splcigar.push_back(CigarOp(REF_SKIP, gap_len)); |
1259 |
gpertea |
69 |
} |
1260 |
gpertea |
70 |
splcigar.push_back(CigarOp(MATCH, right_splice_pos)); |
1261 |
|
|
*/ |
1262 |
gpertea |
69 |
bh = create_hit(name, |
1263 |
|
|
contig, |
1264 |
|
|
left, |
1265 |
gpertea |
70 |
splcigar, |
1266 |
|
|
(sam_flag & 0x0010), |
1267 |
gpertea |
69 |
junction_strand == "rev", |
1268 |
gpertea |
70 |
spl_num_mismatches, |
1269 |
gpertea |
69 |
mismatches_in_anchor, |
1270 |
|
|
end); |
1271 |
|
|
return true; |
1272 |
|
|
} |
1273 |
gpertea |
70 |
} //parse splice data |
1274 |
gpertea |
69 |
else |
1275 |
|
|
{ |
1276 |
|
|
fprintf(stderr, "Warning: found malformed splice record, skipping\n"); |
1277 |
|
|
//fprintf(stderr, "%s\n", orig_bwt_buf); |
1278 |
|
|
// continue; |
1279 |
|
|
return false; |
1280 |
|
|
} |
1281 |
|
|
|
1282 |
|
|
return false; |
1283 |
|
|
} |
1284 |
|
|
|
1285 |
|
|
|
1286 |
gpertea |
135 |
|
1287 |
gpertea |
29 |
bool BAMHitFactory::get_hit_from_buf(const char* orig_bwt_buf, |
1288 |
|
|
BowtieHit& bh, bool strip_slash, |
1289 |
|
|
char* name_out, char* name_tags, |
1290 |
|
|
char* seq, char* qual) { |
1291 |
|
|
if (_sam_header==NULL) |
1292 |
|
|
err_die("Error: no SAM header when BAMHitFactory::get_hit_from_buf()!"); |
1293 |
|
|
const bam1_t* hit_buf = (const bam1_t*)orig_bwt_buf; |
1294 |
|
|
|
1295 |
|
|
|
1296 |
|
|
uint32_t sam_flag = hit_buf->core.flag; |
1297 |
|
|
|
1298 |
|
|
int text_offset = hit_buf->core.pos; |
1299 |
|
|
int text_mate_pos = hit_buf->core.mpos; |
1300 |
|
|
int target_id = hit_buf->core.tid; |
1301 |
|
|
int mate_target_id = hit_buf->core.mtid; |
1302 |
|
|
|
1303 |
|
|
vector<CigarOp> cigar; |
1304 |
|
|
bool spliced_alignment = false; |
1305 |
|
|
int num_hits = 1; |
1306 |
|
|
|
1307 |
|
|
double mapQ = hit_buf->core.qual; |
1308 |
|
|
long double error_prob; |
1309 |
|
|
if (mapQ > 0) |
1310 |
|
|
{ |
1311 |
|
|
long double p = (-1.0 * mapQ) / 10.0; |
1312 |
|
|
error_prob = pow(10.0L, p); |
1313 |
|
|
} |
1314 |
|
|
else |
1315 |
|
|
{ |
1316 |
|
|
error_prob = 1.0; |
1317 |
|
|
} |
1318 |
|
|
|
1319 |
|
|
//header->target_name[c->tid] |
1320 |
|
|
|
1321 |
|
|
bool end = true; |
1322 |
|
|
unsigned int seg_offset = 0; |
1323 |
|
|
unsigned int seg_num = 0; |
1324 |
|
|
unsigned int num_segs = 0; |
1325 |
|
|
// Copy the tag out of the name field before we might wipe it out |
1326 |
|
|
char* qname = bam1_qname(hit_buf); |
1327 |
|
|
char* pipe = strrchr(qname, '|'); |
1328 |
|
|
if (pipe) |
1329 |
|
|
{ |
1330 |
|
|
if (name_tags) |
1331 |
|
|
strcpy(name_tags, pipe); |
1332 |
|
|
|
1333 |
|
|
char* tag_buf = pipe + 1; |
1334 |
|
|
if (strchr(tag_buf, ':')) |
1335 |
|
|
{ |
1336 |
|
|
sscanf(tag_buf, "%u:%u:%u", &seg_offset, &seg_num, &num_segs); |
1337 |
|
|
if (seg_num + 1 == num_segs) |
1338 |
|
|
end = true; |
1339 |
|
|
else |
1340 |
|
|
end = false; |
1341 |
|
|
} |
1342 |
|
|
|
1343 |
|
|
*pipe = 0; |
1344 |
|
|
} |
1345 |
|
|
|
1346 |
|
|
|
1347 |
|
|
if (target_id < 0) { |
1348 |
|
|
//assert(cigar.size() == 1 && cigar[0].opcode == MATCH); |
1349 |
|
|
bh = create_hit(qname, |
1350 |
|
|
"*", //ref_name |
1351 |
|
|
0, //left coord |
1352 |
|
|
0, //read_len |
1353 |
|
|
false, //antisense_aln |
1354 |
|
|
0, //edit_dist |
1355 |
|
|
end); |
1356 |
|
|
return true; |
1357 |
|
|
} |
1358 |
|
|
|
1359 |
|
|
//string text_name = ((samfile_t*)(hs._hit_file))->header->target_name[target_id]; |
1360 |
|
|
string text_name = _sam_header->target_name[target_id]; |
1361 |
|
|
for (int i = 0; i < hit_buf->core.n_cigar; ++i) |
1362 |
|
|
{ |
1363 |
|
|
//char* t; |
1364 |
|
|
|
1365 |
|
|
int length = bam1_cigar(hit_buf)[i] >> BAM_CIGAR_SHIFT; |
1366 |
|
|
if (length <= 0) |
1367 |
|
|
{ |
1368 |
|
|
fprintf (stderr, "BAM error: CIGAR op has zero length\n"); |
1369 |
|
|
return false; |
1370 |
|
|
} |
1371 |
|
|
|
1372 |
|
|
CigarOpCode opcode; |
1373 |
|
|
switch(bam1_cigar(hit_buf)[i] & BAM_CIGAR_MASK) |
1374 |
|
|
{ |
1375 |
|
|
case BAM_CMATCH: opcode = MATCH; break; |
1376 |
|
|
case BAM_CINS: opcode = INS; break; |
1377 |
|
|
case BAM_CDEL: opcode = DEL; break; |
1378 |
|
|
case BAM_CSOFT_CLIP: opcode = SOFT_CLIP; break; |
1379 |
|
|
case BAM_CHARD_CLIP: opcode = HARD_CLIP; break; |
1380 |
|
|
case BAM_CPAD: opcode = PAD; break; |
1381 |
|
|
case BAM_CREF_SKIP: |
1382 |
|
|
opcode = REF_SKIP; |
1383 |
|
|
spliced_alignment = true; |
1384 |
|
|
if (length > (int)max_report_intron_length) |
1385 |
|
|
{ |
1386 |
|
|
//fprintf(stderr, "Encounter REF_SKIP > max_gene_length, skipping\n"); |
1387 |
|
|
return false; |
1388 |
|
|
} |
1389 |
|
|
break; |
1390 |
|
|
default: |
1391 |
|
|
fprintf (stderr, "BAM read: invalid CIGAR operation\n"); |
1392 |
|
|
return false; |
1393 |
|
|
} |
1394 |
|
|
if (opcode != HARD_CLIP) |
1395 |
|
|
cigar.push_back(CigarOp(opcode, length)); |
1396 |
|
|
} |
1397 |
|
|
|
1398 |
|
|
string mrnm; |
1399 |
|
|
if (mate_target_id >= 0) { |
1400 |
|
|
if (mate_target_id == target_id) { |
1401 |
|
|
//mrnm = ((samfile_t*)(hs._hit_file))->header->target_name[mate_target_id]; |
1402 |
|
|
mrnm = _sam_header->target_name[mate_target_id]; |
1403 |
|
|
} |
1404 |
|
|
else { |
1405 |
|
|
//fprintf(stderr, "Trans-spliced mates are not currently supported, skipping\n"); |
1406 |
|
|
return false; |
1407 |
|
|
} |
1408 |
|
|
} |
1409 |
|
|
else { |
1410 |
|
|
text_mate_pos = 0; |
1411 |
|
|
} |
1412 |
|
|
//CuffStrand source_strand = CUFF_STRAND_UNKNOWN; |
1413 |
|
|
bool antisense_splice=false; |
1414 |
|
|
unsigned char num_mismatches = 0; |
1415 |
|
|
unsigned char num_splice_anchor_mismatches = 0; |
1416 |
|
|
|
1417 |
|
|
uint8_t* ptr = bam_aux_get(hit_buf, "XS"); |
1418 |
|
|
if (ptr) { |
1419 |
|
|
char src_strand_char = bam_aux2A(ptr); |
1420 |
|
|
if (src_strand_char == '-') |
1421 |
|
|
antisense_splice = true; |
1422 |
|
|
// else if (src_strand_char == '+') |
1423 |
|
|
// source_strand = CUFF_FWD; |
1424 |
|
|
} |
1425 |
|
|
|
1426 |
|
|
ptr = bam_aux_get(hit_buf, "NM"); |
1427 |
|
|
if (ptr) { |
1428 |
|
|
num_mismatches = bam_aux2i(ptr); |
1429 |
|
|
} |
1430 |
|
|
|
1431 |
|
|
ptr = bam_aux_get(hit_buf, "NH"); |
1432 |
|
|
if (ptr) { |
1433 |
|
|
num_hits = bam_aux2i(ptr); |
1434 |
|
|
} |
1435 |
|
|
|
1436 |
|
|
//bool antisense_aln = bam1_strand(hit_buf); |
1437 |
|
|
|
1438 |
|
|
//if (_rg_props.strandedness() == STRANDED_PROTOCOL && source_strand == CUFF_STRAND_UNKNOWN) |
1439 |
|
|
// source_strand = use_stranded_protocol(sam_flag, antisense_aln, _rg_props.mate_strand_mapping()); |
1440 |
|
|
if (spliced_alignment) { |
1441 |
|
|
//if (source_strand == CUFF_STRAND_UNKNOWN) { |
1442 |
|
|
// fprintf(stderr, "BAM record error: found spliced alignment without XS attribute\n"); |
1443 |
|
|
// } |
1444 |
|
|
bh = create_hit(qname, |
1445 |
|
|
text_name, |
1446 |
|
|
text_offset, // BAM files are 0-indexed |
1447 |
|
|
cigar, |
1448 |
|
|
sam_flag & 0x0010, |
1449 |
|
|
antisense_splice, |
1450 |
|
|
num_mismatches, |
1451 |
|
|
num_splice_anchor_mismatches, |
1452 |
|
|
end); |
1453 |
|
|
|
1454 |
|
|
} |
1455 |
|
|
else { |
1456 |
|
|
//assert(_rg_props.strandedness() == STRANDED_PROTOCOL || source_strand == CUFF_STRAND_UNKNOWN); |
1457 |
|
|
//assert(cigar.size() == 1 && cigar[0].opcode == MATCH); |
1458 |
|
|
bh = create_hit(qname, |
1459 |
|
|
text_name, |
1460 |
|
|
text_offset, // BAM files are 0-indexed |
1461 |
|
|
cigar, |
1462 |
|
|
sam_flag & 0x0010, |
1463 |
|
|
false, |
1464 |
|
|
num_mismatches, |
1465 |
|
|
0, |
1466 |
|
|
end); |
1467 |
|
|
} |
1468 |
|
|
if (seq!=NULL) { |
1469 |
|
|
char *bseq = (char*)bam1_seq(hit_buf); |
1470 |
|
|
for(int i=0;i<(hit_buf->core.l_qseq);i++) { |
1471 |
|
|
char v = bam1_seqi(bseq,i); |
1472 |
|
|
seq[i]=bam_nt16_rev_table[v]; |
1473 |
|
|
} |
1474 |
|
|
seq[hit_buf->core.l_qseq]=0; |
1475 |
|
|
} |
1476 |
|
|
if (qual!=NULL) { |
1477 |
|
|
char *bq = (char*)bam1_qual(hit_buf); |
1478 |
|
|
for(int i=0;i<(hit_buf->core.l_qseq);i++) { |
1479 |
|
|
qual[i]=bq[i]+33; |
1480 |
|
|
} |
1481 |
|
|
qual[hit_buf->core.l_qseq]=0; |
1482 |
|
|
} |
1483 |
|
|
return true; |
1484 |
|
|
} |
1485 |
|
|
|
1486 |
|
|
bool BAMHitFactory::inspect_header(HitStream& hs) |
1487 |
|
|
{ |
1488 |
|
|
bam_header_t* header = ((samfile_t*)(hs._hit_file))->header; |
1489 |
|
|
|
1490 |
|
|
if (header == NULL) { |
1491 |
|
|
fprintf(stderr, "Warning: No BAM header\n"); |
1492 |
|
|
return false; |
1493 |
|
|
} |
1494 |
|
|
if (header->l_text == 0) { |
1495 |
|
|
fprintf(stderr, "Warning: BAM header has 0 length or is corrupted. Try using 'samtools reheader'.\n"); |
1496 |
|
|
return false; |
1497 |
|
|
} |
1498 |
|
|
return true; |
1499 |
|
|
} |
1500 |
|
|
|
1501 |
|
|
|
1502 |
|
|
void get_mapped_reads(FILE* bwtf, |
1503 |
|
|
HitTable& hits, |
1504 |
|
|
HitFactory& hit_factory, |
1505 |
|
|
bool strip_slash, |
1506 |
|
|
bool verbose) |
1507 |
|
|
{ |
1508 |
|
|
|
1509 |
|
|
|
1510 |
|
|
char bwt_buf[2048]; |
1511 |
|
|
uint32_t reads_extracted = 0; |
1512 |
|
|
|
1513 |
|
|
while (fgets(bwt_buf, 2048, bwtf)) |
1514 |
|
|
{ |
1515 |
|
|
// Chomp the newline |
1516 |
|
|
char* nl = strrchr(bwt_buf, '\n'); |
1517 |
|
|
if (nl) *nl = 0; |
1518 |
|
|
if (*bwt_buf == 0) |
1519 |
|
|
continue; |
1520 |
|
|
// Get a new record from the tab-delimited Bowtie map |
1521 |
|
|
BowtieHit bh; |
1522 |
|
|
if (hit_factory.get_hit_from_buf(bwt_buf, bh, strip_slash)) |
1523 |
|
|
{ |
1524 |
|
|
// Only check uniqueness if these hits are spliced |
1525 |
|
|
hits.add_hit(bh, true); |
1526 |
|
|
} |
1527 |
|
|
reads_extracted++; |
1528 |
|
|
} |
1529 |
|
|
|
1530 |
|
|
// This will sort the map by insert id. |
1531 |
|
|
hits.finalize(); |
1532 |
|
|
fprintf(stderr, "Extracted %d alignments from Bowtie map\n", reads_extracted); |
1533 |
|
|
} |
1534 |
|
|
|
1535 |
|
|
|
1536 |
|
|
/* |
1537 |
|
|
AlignStatus status(const BowtieHit* align) |
1538 |
|
|
{ |
1539 |
|
|
if (!align) |
1540 |
|
|
return UNALIGNED; |
1541 |
|
|
if (align->contiguous()) |
1542 |
|
|
return CONTIGUOUS; |
1543 |
|
|
return SPLICED; |
1544 |
|
|
} |
1545 |
|
|
*/ |
1546 |
|
|
|
1547 |
|
|
void add_hits_to_coverage(const HitList& hits, vector<unsigned short>& DoC) |
1548 |
|
|
{ |
1549 |
|
|
int max_hit_pos = -1; |
1550 |
|
|
for (size_t i = 0; i < hits.size(); ++i) |
1551 |
|
|
{ |
1552 |
|
|
max_hit_pos = max((int)hits[i].right(),max_hit_pos); |
1553 |
|
|
} |
1554 |
|
|
|
1555 |
|
|
if ((int)DoC.size() < max_hit_pos) |
1556 |
|
|
DoC.resize(max_hit_pos); |
1557 |
|
|
|
1558 |
|
|
for (size_t i = 0; i < hits.size(); ++i) |
1559 |
|
|
{ |
1560 |
|
|
const BowtieHit& bh = hits[i]; |
1561 |
|
|
|
1562 |
|
|
// split up the coverage contibution for this reads |
1563 |
|
|
size_t j = bh.left(); |
1564 |
|
|
const vector<CigarOp>& cigar = bh.cigar(); |
1565 |
|
|
|
1566 |
|
|
for (size_t c = 0 ; c < cigar.size(); ++c) |
1567 |
|
|
{ |
1568 |
|
|
switch(cigar[c].opcode) |
1569 |
|
|
{ |
1570 |
|
|
case MATCH: |
1571 |
|
|
for (size_t m = 0; m < cigar[c].length; ++m) |
1572 |
|
|
{ |
1573 |
|
|
if (DoC[j + m] < 0xFFFF) |
1574 |
|
|
DoC[j + m]++; |
1575 |
|
|
} |
1576 |
|
|
//fall through this case to REF_SKIP is intentional |
1577 |
|
|
case REF_SKIP: |
1578 |
|
|
j += cigar[c].length; |
1579 |
|
|
break; |
1580 |
|
|
default: |
1581 |
|
|
break; |
1582 |
|
|
} |
1583 |
|
|
|
1584 |
|
|
} |
1585 |
|
|
} |
1586 |
|
|
} |
1587 |
|
|
|
1588 |
|
|
void add_hit_to_coverage(const BowtieHit& bh, vector<unsigned int>& DoC) |
1589 |
|
|
{ |
1590 |
|
|
if ((int)DoC.size() < bh.right()) |
1591 |
|
|
DoC.resize(bh.right()); |
1592 |
|
|
|
1593 |
|
|
// split up the coverage contibution for this reads |
1594 |
|
|
size_t j = bh.left(); |
1595 |
|
|
const vector<CigarOp>& cigar = bh.cigar(); |
1596 |
|
|
|
1597 |
|
|
for (size_t c = 0 ; c < cigar.size(); ++c) |
1598 |
|
|
{ |
1599 |
|
|
switch(cigar[c].opcode) |
1600 |
|
|
{ |
1601 |
|
|
case MATCH: |
1602 |
|
|
for (size_t m = 0; m < cigar[c].length; ++m) |
1603 |
|
|
{ |
1604 |
gpertea |
135 |
if (DoC[j + m] < VMAXINT32) |
1605 |
gpertea |
29 |
DoC[j + m]++; |
1606 |
|
|
} |
1607 |
|
|
//fall through this case to REF_SKIP is intentional |
1608 |
|
|
case REF_SKIP: |
1609 |
|
|
j += cigar[c].length; |
1610 |
|
|
break; |
1611 |
|
|
default: |
1612 |
|
|
break; |
1613 |
|
|
} |
1614 |
|
|
|
1615 |
|
|
} |
1616 |
|
|
} |
1617 |
|
|
|
1618 |
|
|
void print_hit(FILE* fout, |
1619 |
|
|
const char* read_name, |
1620 |
|
|
const BowtieHit& bh, |
1621 |
|
|
const char* ref_name, |
1622 |
|
|
const char* sequence, |
1623 |
|
|
const char* qualities, |
1624 |
|
|
bool from_bowtie) |
1625 |
|
|
{ |
1626 |
|
|
string seq; |
1627 |
|
|
string quals; |
1628 |
|
|
if (sequence) |
1629 |
|
|
{ |
1630 |
|
|
seq = sequence; |
1631 |
|
|
quals = qualities; |
1632 |
|
|
seq.resize(bh.read_len()); |
1633 |
|
|
quals.resize(bh.read_len()); |
1634 |
|
|
} |
1635 |
|
|
else |
1636 |
|
|
{ |
1637 |
|
|
seq = "*"; |
1638 |
|
|
} |
1639 |
|
|
|
1640 |
|
|
if (qualities) |
1641 |
|
|
{ |
1642 |
|
|
quals = qualities; |
1643 |
|
|
quals.resize(bh.read_len()); |
1644 |
|
|
} |
1645 |
|
|
else |
1646 |
|
|
{ |
1647 |
|
|
quals = "*"; |
1648 |
|
|
} |
1649 |
|
|
|
1650 |
|
|
uint32_t sam_flag = 0; |
1651 |
|
|
if (bh.antisense_align()) |
1652 |
|
|
{ |
1653 |
|
|
sam_flag |= 0x0010; // BAM_FREVERSE |
1654 |
|
|
if (sequence && !from_bowtie) // if it is from bowtie hit, it's already reversed. |
1655 |
|
|
{ |
1656 |
|
|
reverse_complement(seq); |
1657 |
|
|
reverse(quals.begin(), quals.end()); |
1658 |
|
|
} |
1659 |
|
|
} |
1660 |
|
|
|
1661 |
|
|
uint32_t sam_pos = bh.left() + 1; |
1662 |
|
|
uint32_t map_quality = 255; |
1663 |
|
|
char cigar[256]; |
1664 |
|
|
cigar[0] = 0; |
1665 |
|
|
string mate_ref_name = "*"; |
1666 |
|
|
uint32_t mate_pos = 0; |
1667 |
|
|
uint32_t insert_size = 0; |
1668 |
|
|
//string qualities = "*"; |
1669 |
|
|
|
1670 |
|
|
const vector<CigarOp>& bh_cigar = bh.cigar(); |
1671 |
|
|
|
1672 |
|
|
/* |
1673 |
|
|
* In addition to calculating the cigar string, |
1674 |
|
|
* we need to figure out how many in/dels are in the |
1675 |
|
|
* sequence, so that we can give the correct |
1676 |
|
|
* value for the NM tag |
1677 |
|
|
*/ |
1678 |
|
|
int indel_distance = 0; |
1679 |
|
|
for (size_t c = 0; c < bh_cigar.size(); ++c) |
1680 |
|
|
{ |
1681 |
|
|
char ibuf[64]; |
1682 |
|
|
sprintf(ibuf, "%d", bh_cigar[c].length); |
1683 |
|
|
switch(bh_cigar[c].opcode) |
1684 |
|
|
{ |
1685 |
|
|
case MATCH: |
1686 |
|
|
strcat(cigar, ibuf); |
1687 |
|
|
strcat(cigar, "M"); |
1688 |
|
|
break; |
1689 |
|
|
case INS: |
1690 |
|
|
strcat(cigar, ibuf); |
1691 |
|
|
strcat(cigar, "I"); |
1692 |
|
|
indel_distance += bh_cigar[c].length; |
1693 |
|
|
break; |
1694 |
|
|
case DEL: |
1695 |
|
|
strcat(cigar, ibuf); |
1696 |
|
|
strcat(cigar, "D"); |
1697 |
|
|
indel_distance += bh_cigar[c].length; |
1698 |
|
|
break; |
1699 |
|
|
case REF_SKIP: |
1700 |
|
|
strcat(cigar, ibuf); |
1701 |
|
|
strcat(cigar, "N"); |
1702 |
|
|
break; |
1703 |
|
|
default: |
1704 |
|
|
break; |
1705 |
|
|
} |
1706 |
|
|
} |
1707 |
|
|
|
1708 |
|
|
//string q = string(bh.read_len(), '!'); |
1709 |
|
|
//string s = string(bh.read_len(), 'N'); |
1710 |
|
|
|
1711 |
|
|
fprintf(fout, |
1712 |
|
|
"%s\t%d\t%s\t%d\t%d\t%s\t%s\t%d\t%d\t%s\t%s", |
1713 |
|
|
read_name, |
1714 |
|
|
sam_flag, |
1715 |
|
|
ref_name, |
1716 |
|
|
sam_pos, |
1717 |
|
|
map_quality, |
1718 |
|
|
cigar, |
1719 |
|
|
mate_ref_name.c_str(), |
1720 |
|
|
mate_pos, |
1721 |
|
|
insert_size, |
1722 |
|
|
seq.c_str(), |
1723 |
|
|
quals.c_str()); |
1724 |
|
|
|
1725 |
|
|
if (!sam_readgroup_id.empty()) |
1726 |
|
|
fprintf(fout, "\tRG:Z:%s", sam_readgroup_id.c_str()); |
1727 |
|
|
|
1728 |
|
|
fprintf(fout, "\tNM:i:%d", bh.edit_dist() + indel_distance); |
1729 |
|
|
|
1730 |
|
|
bool containsSplice = false; |
1731 |
|
|
for(vector<CigarOp>::const_iterator itr = bh.cigar().begin(); itr != bh.cigar().end(); ++itr){ |
1732 |
|
|
if(itr->opcode == REF_SKIP){ |
1733 |
|
|
containsSplice = true; |
1734 |
|
|
break; |
1735 |
|
|
} |
1736 |
|
|
} |
1737 |
|
|
|
1738 |
|
|
if (containsSplice) |
1739 |
|
|
fprintf(fout, "\tXS:A:%c", bh.antisense_splice() ? '-' : '+'); |
1740 |
|
|
|
1741 |
|
|
fprintf(fout, "\n"); |
1742 |
|
|
} |
1743 |
|
|
|
1744 |
|
|
void print_bamhit(GBamWriter& wbam, |
1745 |
|
|
const char* read_name, |
1746 |
|
|
const BowtieHit& bh, |
1747 |
|
|
const char* ref_name, |
1748 |
|
|
const char* sequence, |
1749 |
|
|
const char* qualities, |
1750 |
|
|
bool from_bowtie) |
1751 |
|
|
{ |
1752 |
|
|
string seq; |
1753 |
|
|
string quals; |
1754 |
|
|
if (sequence) { |
1755 |
|
|
seq = sequence; |
1756 |
|
|
quals = qualities; |
1757 |
|
|
seq.resize(bh.read_len()); |
1758 |
|
|
quals.resize(bh.read_len()); |
1759 |
|
|
} |
1760 |
|
|
else { |
1761 |
|
|
seq = "*"; |
1762 |
|
|
} |
1763 |
|
|
if (qualities) { |
1764 |
|
|
quals = qualities; |
1765 |
|
|
quals.resize(bh.read_len()); |
1766 |
|
|
} |
1767 |
|
|
else |
1768 |
|
|
{ |
1769 |
|
|
quals = "*"; |
1770 |
|
|
} |
1771 |
|
|
|
1772 |
|
|
uint32_t sam_flag = 0; |
1773 |
|
|
if (bh.antisense_align()) |
1774 |
|
|
{ |
1775 |
|
|
sam_flag |= 0x0010; // BAM_FREVERSE |
1776 |
|
|
if (sequence && !from_bowtie) // if it is from bowtie hit, it's already reversed. |
1777 |
|
|
{ |
1778 |
|
|
reverse_complement(seq); |
1779 |
|
|
reverse(quals.begin(), quals.end()); |
1780 |
|
|
} |
1781 |
|
|
} |
1782 |
|
|
|
1783 |
|
|
uint32_t sam_pos = bh.left() + 1; |
1784 |
|
|
uint32_t map_quality = 255; |
1785 |
|
|
char cigar[256]; |
1786 |
|
|
cigar[0] = 0; |
1787 |
|
|
string mate_ref_name = "*"; |
1788 |
|
|
uint32_t mate_pos = 0; |
1789 |
|
|
uint32_t insert_size = 0; |
1790 |
|
|
//string qualities = "*"; |
1791 |
|
|
|
1792 |
|
|
const vector<CigarOp>& bh_cigar = bh.cigar(); |
1793 |
|
|
/* |
1794 |
|
|
* In addition to calculating the cigar string, |
1795 |
|
|
* we need to figure out how many in/dels are in the |
1796 |
|
|
* sequence, so that we can give the correct |
1797 |
|
|
* value for the NM tag |
1798 |
|
|
*/ |
1799 |
|
|
int indel_distance = 0; |
1800 |
|
|
for (size_t c = 0; c < bh_cigar.size(); ++c) |
1801 |
|
|
{ |
1802 |
|
|
char ibuf[64]; |
1803 |
|
|
sprintf(ibuf, "%d", bh_cigar[c].length); |
1804 |
|
|
switch(bh_cigar[c].opcode) |
1805 |
|
|
{ |
1806 |
|
|
case MATCH: |
1807 |
|
|
strcat(cigar, ibuf); |
1808 |
|
|
strcat(cigar, "M"); |
1809 |
|
|
break; |
1810 |
|
|
case INS: |
1811 |
|
|
strcat(cigar, ibuf); |
1812 |
|
|
strcat(cigar, "I"); |
1813 |
|
|
indel_distance += bh_cigar[c].length; |
1814 |
|
|
break; |
1815 |
|
|
case DEL: |
1816 |
|
|
strcat(cigar, ibuf); |
1817 |
|
|
strcat(cigar, "D"); |
1818 |
|
|
indel_distance += bh_cigar[c].length; |
1819 |
|
|
break; |
1820 |
|
|
case REF_SKIP: |
1821 |
|
|
strcat(cigar, ibuf); |
1822 |
|
|
strcat(cigar, "N"); |
1823 |
|
|
break; |
1824 |
|
|
default: |
1825 |
|
|
break; |
1826 |
|
|
} |
1827 |
|
|
} |
1828 |
|
|
|
1829 |
|
|
//string q = string(bh.read_len(), '!'); |
1830 |
|
|
//string s = string(bh.read_len(), 'N'); |
1831 |
|
|
/* |
1832 |
|
|
fprintf(fout, |
1833 |
|
|
"%s\t%d\t%s\t%d\t%d\t%s\t%s\t%d\t%d\t%s\t%s", |
1834 |
|
|
read_name, |
1835 |
|
|
sam_flag, |
1836 |
|
|
ref_name, |
1837 |
|
|
sam_pos, |
1838 |
|
|
map_quality, |
1839 |
|
|
cigar, |
1840 |
|
|
mate_ref_name.c_str(), |
1841 |
|
|
mate_pos, |
1842 |
|
|
insert_size, |
1843 |
|
|
seq.c_str(), |
1844 |
|
|
quals.c_str()); |
1845 |
|
|
|
1846 |
|
|
fprintf(fout, "\tNM:i:%d", bh.edit_dist() + indel_distance); |
1847 |
|
|
*/ |
1848 |
|
|
vector<string> auxdata; |
1849 |
|
|
|
1850 |
|
|
if (!sam_readgroup_id.empty()) |
1851 |
|
|
{ |
1852 |
|
|
string nm("RG:Z:"); |
1853 |
|
|
nm += sam_readgroup_id; |
1854 |
|
|
auxdata.push_back(nm); |
1855 |
|
|
} |
1856 |
|
|
|
1857 |
|
|
string nm("NM:i:"); |
1858 |
|
|
str_appendInt(nm, bh.edit_dist() + indel_distance); |
1859 |
|
|
auxdata.push_back(nm); |
1860 |
|
|
bool containsSplice = false; |
1861 |
|
|
for(vector<CigarOp>::const_iterator itr = bh.cigar().begin(); itr != bh.cigar().end(); ++itr) |
1862 |
|
|
if(itr->opcode == REF_SKIP){ |
1863 |
|
|
containsSplice = true; |
1864 |
|
|
break; |
1865 |
|
|
} |
1866 |
|
|
|
1867 |
|
|
if (containsSplice) { |
1868 |
|
|
//fprintf(fout, "\tXS:A:%c", bh.antisense_splice() ? '-' : '+'); |
1869 |
|
|
nm="XS:A:"; |
1870 |
|
|
nm+=(char)(bh.antisense_splice() ? '-' : '+'); |
1871 |
|
|
auxdata.push_back(nm); |
1872 |
|
|
} |
1873 |
|
|
|
1874 |
|
|
GBamRecord *brec = wbam.new_record(read_name, sam_flag, ref_name, sam_pos, map_quality, |
1875 |
|
|
cigar, mate_ref_name.c_str(), mate_pos, |
1876 |
|
|
insert_size, seq.c_str(), quals.c_str(), &auxdata); |
1877 |
|
|
|
1878 |
|
|
|
1879 |
|
|
|
1880 |
|
|
wbam.write(brec); |
1881 |
|
|
delete brec; |
1882 |
|
|
} |
1883 |
|
|
|
1884 |
|
|
/** |
1885 |
|
|
* Print a vector of cigar operations to a file. |
1886 |
|
|
* @param bh_cigar A vector of CigarOps |
1887 |
|
|
* @return a string representation of the cigar string |
1888 |
|
|
*/ |
1889 |
|
|
std::string print_cigar(vector<CigarOp>& bh_cigar){ |
1890 |
|
|
char cigar[256]; |
1891 |
|
|
cigar[0] = 0; |
1892 |
|
|
for (size_t c = 0; c < bh_cigar.size(); ++c) |
1893 |
|
|
{ |
1894 |
|
|
char ibuf[64]; |
1895 |
|
|
sprintf(ibuf, "%d", bh_cigar[c].length); |
1896 |
|
|
switch(bh_cigar[c].opcode) |
1897 |
|
|
{ |
1898 |
|
|
case MATCH: |
1899 |
|
|
strcat(cigar, ibuf); |
1900 |
|
|
strcat(cigar, "M"); |
1901 |
|
|
break; |
1902 |
|
|
case INS: |
1903 |
|
|
strcat(cigar, ibuf); |
1904 |
|
|
strcat(cigar, "I"); |
1905 |
|
|
break; |
1906 |
|
|
case DEL: |
1907 |
|
|
strcat(cigar, ibuf); |
1908 |
|
|
strcat(cigar, "D"); |
1909 |
|
|
break; |
1910 |
|
|
case REF_SKIP: |
1911 |
|
|
strcat(cigar, ibuf); |
1912 |
|
|
strcat(cigar, "N"); |
1913 |
|
|
break; |
1914 |
|
|
default: |
1915 |
|
|
break; |
1916 |
|
|
} |
1917 |
|
|
} |
1918 |
|
|
string result(cigar); |
1919 |
|
|
return result; |
1920 |
|
|
} |
1921 |
|
|
|
1922 |
|
|
bool BowtieHit::check_editdist_consistency(const RefSequenceTable& rt) |
1923 |
|
|
{ |
1924 |
|
|
RefSequenceTable::Sequence* ref_str = rt.get_seq(_ref_id); |
1925 |
|
|
if (!ref_str) |
1926 |
|
|
return false; |
1927 |
|
|
|
1928 |
|
|
const seqan::Dna5String ref_seq = seqan::infix(*ref_str, _left, right()); |
1929 |
|
|
|
1930 |
|
|
size_t pos_seq = 0; |
1931 |
|
|
size_t pos_ref = 0; |
1932 |
|
|
size_t mismatch = 0; |
1933 |
|
|
for (size_t i = 0; i < _cigar.size(); ++i) |
1934 |
|
|
{ |
1935 |
|
|
CigarOp cigar = _cigar[i]; |
1936 |
|
|
switch(cigar.opcode) |
1937 |
|
|
{ |
1938 |
|
|
case MATCH: |
1939 |
|
|
{ |
1940 |
|
|
for (size_t j = 0; j < cigar.length; ++j) |
1941 |
|
|
{ |
1942 |
|
|
if (_seq[pos_seq++] != ref_seq[pos_ref++]) |
1943 |
|
|
++mismatch; |
1944 |
|
|
} |
1945 |
|
|
} |
1946 |
|
|
break; |
1947 |
|
|
case INS: |
1948 |
|
|
{ |
1949 |
|
|
pos_seq += cigar.length; |
1950 |
|
|
} |
1951 |
|
|
break; |
1952 |
|
|
|
1953 |
|
|
case DEL: |
1954 |
|
|
case REF_SKIP: |
1955 |
|
|
{ |
1956 |
|
|
pos_ref += cigar.length; |
1957 |
|
|
} |
1958 |
|
|
break; |
1959 |
|
|
|
1960 |
|
|
default: |
1961 |
|
|
break; |
1962 |
|
|
} |
1963 |
|
|
} |
1964 |
|
|
|
1965 |
|
|
return mismatch == _edit_dist; |
1966 |
|
|
} |