1 |
/* |
2 |
* bwt_map.cpp |
3 |
* TopHat |
4 |
* |
5 |
* Created by Cole Trapnell on 11/17/08. |
6 |
* Copyright 2008 Cole Trapnell. All rights reserved. |
7 |
* |
8 |
*/ |
9 |
|
10 |
#ifdef HAVE_CONFIG_H |
11 |
#include <config.h> |
12 |
#endif |
13 |
|
14 |
#include <cassert> |
15 |
#include <cstdlib> |
16 |
#include <iostream> |
17 |
#include <set> |
18 |
#include <vector> |
19 |
#include <cmath> |
20 |
|
21 |
#include "common.h" |
22 |
#include "bwt_map.h" |
23 |
#include "tokenize.h" |
24 |
#include "reads.h" |
25 |
|
26 |
using namespace std; |
27 |
|
28 |
void HitTable::add_hit(const BowtieHit& bh, bool check_uniqueness) |
29 |
{ |
30 |
uint32_t reference_id = bh.ref_id(); |
31 |
|
32 |
pair<RefHits::iterator, bool> ret = |
33 |
_hits_for_ref.insert(make_pair(reference_id, HitList())); |
34 |
HitList& hl = ret.first->second; |
35 |
|
36 |
if (check_uniqueness) |
37 |
{ |
38 |
// Check uniqueness, in case we are adding spliced hits from |
39 |
// several spliced alignment sources (e.g. de novo hashing + Bowtie |
40 |
// against a user-supplied index). We don't want to count the same |
41 |
// alignment twice if it happened to be found by more than one method |
42 |
HitList::const_iterator lb = lower_bound(hl.begin(), hl.end(), bh, hit_insert_id_lt); |
43 |
HitList::const_iterator ub = upper_bound(hl.begin(), hl.end(), bh, hit_insert_id_lt); |
44 |
for (; lb != ub && lb != hl.end(); ++lb) |
45 |
{ |
46 |
if (*lb == bh) |
47 |
{ |
48 |
//fprintf(stderr, "Chucking duplicate read %d by identity\n", bh.insert_id()); |
49 |
return; |
50 |
} |
51 |
|
52 |
if (lb->insert_id() == bh.insert_id() && |
53 |
lb->ref_id() == bh.ref_id() && |
54 |
lb->antisense_align() == bh.antisense_align()) |
55 |
{ |
56 |
// If we get here, we may be looking at the same alignment |
57 |
// However, spanning_reads may report a shorter, trimmed alignment |
58 |
// so not all fields will be equal. If they just disagree on the |
59 |
// ends, and don't indicate a different junction coord, the |
60 |
// alignments are the same. |
61 |
|
62 |
if ((lb->left() <= bh.left() && lb->right() >= bh.right()) || |
63 |
(bh.left() <= lb->left() && bh.right() >= lb->right())) |
64 |
{ |
65 |
vector<pair<int,int> > lb_gaps, bh_gaps; |
66 |
lb->gaps(lb_gaps); |
67 |
bh.gaps(bh_gaps); |
68 |
if (lb_gaps == bh_gaps) |
69 |
{ |
70 |
// One alignment is contained in the other, they agree on |
71 |
// where the gaps, if any, are, and they share an id |
72 |
// => this is a redundant aligment, so toss it |
73 |
//fprintf(stderr, "Chucking duplicate read %d by gap agreement\n", bh.insert_id()); |
74 |
return; |
75 |
} |
76 |
} |
77 |
} |
78 |
} |
79 |
} |
80 |
_total_hits++; |
81 |
hl.push_back(bh); |
82 |
} |
83 |
|
84 |
bool hit_insert_id_lt(const BowtieHit& h1, const BowtieHit& h2) |
85 |
{ |
86 |
return h1.insert_id() < h2.insert_id(); |
87 |
} |
88 |
|
89 |
void LineHitFactory::openStream(HitStream& hs) |
90 |
{ |
91 |
if (hs._hit_file==NULL && !hs._hit_file_name.empty()) { |
92 |
//open the file for HitStream here |
93 |
hs._hit_file=fopen(hs._hit_file_name.c_str(),"r"); |
94 |
if (hs._hit_file==NULL) |
95 |
err_die("Error opening HitStream file %s\n",hs._hit_file_name.c_str()); |
96 |
return; |
97 |
} |
98 |
if (hs._fzpipe!=NULL) { |
99 |
hs._hit_file=hs._fzpipe->file; |
100 |
} |
101 |
} |
102 |
|
103 |
void LineHitFactory::rewind(HitStream& hs) |
104 |
{ |
105 |
if (hs._fzpipe!=NULL) { |
106 |
hs._fzpipe->rewind(); |
107 |
hs._hit_file=hs._fzpipe->file; |
108 |
} |
109 |
else if (hs._hit_file) ::rewind((FILE*)(hs._hit_file)); |
110 |
} |
111 |
|
112 |
bool LineHitFactory::next_record(HitStream& hs, const char*& buf, size_t& buf_size) { |
113 |
FILE* f=(FILE *)(hs._hit_file); |
114 |
bool new_rec = (fgets(_hit_buf, _hit_buf_max_sz - 1, f)!=NULL); |
115 |
if (!new_rec || feof(f)) { |
116 |
hs._eof=true; |
117 |
return false; |
118 |
} |
119 |
++_line_num; |
120 |
char* nl = strrchr(_hit_buf, '\n'); |
121 |
if (nl) *nl = 0; |
122 |
buf = _hit_buf; |
123 |
buf_size = _hit_buf_max_sz - 1; |
124 |
return true; |
125 |
} |
126 |
|
127 |
void LineHitFactory::closeStream(HitStream& hs) { |
128 |
if (hs._fzpipe!=NULL) { |
129 |
hs._fzpipe->close(); |
130 |
return; |
131 |
} |
132 |
if (hs._hit_file!=NULL) { |
133 |
fclose((FILE*)(hs._hit_file)); |
134 |
hs._hit_file=NULL; |
135 |
} |
136 |
} |
137 |
void BAMHitFactory::openStream(HitStream& hs) { |
138 |
if (hs._hit_file==NULL) { |
139 |
if (hs._hit_file_name.empty()) |
140 |
//err_die("Error: invalid HitStream set for BAMHitFactory(file name missing)\n"); |
141 |
return; //invalid stream, could be just a place holder |
142 |
//open the file here if not already open |
143 |
string fext=getFext(hs._hit_file_name); |
144 |
if (fext=="sam") |
145 |
hs._hit_file = samopen(hs._hit_file_name.c_str(), "r", 0); |
146 |
else |
147 |
hs._hit_file = samopen(hs._hit_file_name.c_str(), "rb", 0); |
148 |
|
149 |
samfile_t* sam_file=(samfile_t*)(hs._hit_file); |
150 |
|
151 |
if (sam_file == NULL) |
152 |
err_die("Error opening SAM file %s\n", hs._hit_file_name.c_str()); |
153 |
if (sam_file->header == NULL) |
154 |
err_die("Error: no SAM header found for file %s\n", hs._hit_file_name.c_str()); |
155 |
memset(&_next_hit, 0, sizeof(_next_hit)); |
156 |
//_beginning = bgzf_tell(sam_file->x.bam); |
157 |
_sam_header=sam_file->header; |
158 |
if (inspect_header(hs) == false) |
159 |
err_die("Error: invalid SAM header for file %s\n", |
160 |
hs._hit_file_name.c_str()); |
161 |
} |
162 |
} |
163 |
|
164 |
void BAMHitFactory::closeStream(HitStream& hs) { |
165 |
if (hs._hit_file) { |
166 |
samclose((samfile_t*)(hs._hit_file)); |
167 |
} |
168 |
hs._hit_file=NULL; |
169 |
_sam_header=NULL; |
170 |
} |
171 |
|
172 |
void BAMHitFactory::rewind(HitStream& hs) |
173 |
{ |
174 |
/* |
175 |
if (_hit_file && ((samfile_t*)_hit_file)->x.bam) |
176 |
{ |
177 |
bgzf_seek(((samfile_t*)_hit_file)->x.bam, _beginning, SEEK_SET); |
178 |
_eof = false; |
179 |
} |
180 |
*/ |
181 |
this->closeStream(hs); |
182 |
this->openStream(hs); |
183 |
} |
184 |
|
185 |
string BAMHitFactory::hitfile_rec(HitStream& hs, const char* hit_buf) { |
186 |
const bam1_t* bamrec=(const bam1_t*)hit_buf; |
187 |
char* tamline=bam_format1(((samfile_t*)(hs._hit_file))->header, bamrec); |
188 |
string sam_line(tamline); |
189 |
free(tamline); |
190 |
return sam_line; |
191 |
} |
192 |
|
193 |
bool BAMHitFactory::next_record(HitStream& hs, const char*& buf, size_t& buf_size) { |
194 |
if (_next_hit.data) { |
195 |
free(_next_hit.data); |
196 |
_next_hit.data = NULL; |
197 |
} |
198 |
_sam_header=((samfile_t*)(hs._hit_file))->header; //needed by get_hit_from_buf later on |
199 |
if (hs.eof() || !hs.ready()) return false; |
200 |
|
201 |
//mark_curr_pos(); |
202 |
|
203 |
memset(&_next_hit, 0, sizeof(_next_hit)); |
204 |
|
205 |
int bytes_read = samread((samfile_t*)(hs._hit_file), &_next_hit); |
206 |
if (bytes_read <= 0) { |
207 |
hs._eof = true; |
208 |
return false; |
209 |
} |
210 |
buf = (const char*)&_next_hit; |
211 |
buf_size = bytes_read; |
212 |
return true; |
213 |
} |
214 |
|
215 |
|
216 |
BowtieHit HitFactory::create_hit(const string& insert_name, |
217 |
const string& ref_name, |
218 |
int left, |
219 |
const vector<CigarOp>& cigar, |
220 |
bool antisense_aln, |
221 |
bool antisense_splice, |
222 |
unsigned char edit_dist, |
223 |
unsigned char splice_mms, |
224 |
bool end) |
225 |
{ |
226 |
uint64_t insert_id = _insert_table.get_id(insert_name); |
227 |
uint32_t reference_id = _ref_table.get_id(ref_name, NULL, 0); |
228 |
|
229 |
return BowtieHit(reference_id, |
230 |
insert_id, |
231 |
left, |
232 |
cigar, |
233 |
antisense_aln, |
234 |
antisense_splice, |
235 |
edit_dist, |
236 |
splice_mms, |
237 |
end); |
238 |
} |
239 |
|
240 |
BowtieHit HitFactory::create_hit(const string& insert_name, |
241 |
const string& ref_name, |
242 |
uint32_t left, |
243 |
uint32_t read_len, |
244 |
bool antisense_aln, |
245 |
unsigned char edit_dist, |
246 |
bool end) |
247 |
{ |
248 |
uint64_t insert_id = _insert_table.get_id(insert_name); |
249 |
uint32_t reference_id = _ref_table.get_id(ref_name, NULL, 0); |
250 |
|
251 |
return BowtieHit(reference_id, |
252 |
insert_id, |
253 |
left, |
254 |
read_len, |
255 |
antisense_aln, |
256 |
edit_dist, |
257 |
end); |
258 |
} |
259 |
|
260 |
bool BowtieHitFactory::get_hit_from_buf(const char* orig_bwt_buf, |
261 |
BowtieHit& bh, |
262 |
bool strip_slash, |
263 |
char* name_out, |
264 |
char* name_tags, |
265 |
char* seq, |
266 |
char* qual) |
267 |
{ |
268 |
if (!orig_bwt_buf || !*orig_bwt_buf) |
269 |
return false; |
270 |
|
271 |
static const int buf_size = 2048; |
272 |
|
273 |
char bwt_buf[buf_size]; |
274 |
strcpy(bwt_buf, orig_bwt_buf); |
275 |
//const char* bwt_fmt_str = "%s %c %s %d %s %s %d %s"; |
276 |
|
277 |
char orientation; |
278 |
|
279 |
//int bwtf_ret = 0; |
280 |
//uint32_t seqid = 0; |
281 |
|
282 |
char text_name[buf_size]; |
283 |
unsigned int text_offset; |
284 |
|
285 |
|
286 |
//unsigned int other_occs; |
287 |
char mismatches[buf_size]; |
288 |
//memset(mismatches, 0, sizeof(mismatches)); |
289 |
|
290 |
const char* buf = bwt_buf; |
291 |
char* name = get_token((char**)&buf,"\t"); |
292 |
char* orientation_str = get_token((char**)&buf,"\t"); |
293 |
char* text_name_str = get_token((char**)&buf,"\t"); |
294 |
|
295 |
strcpy(text_name, text_name_str); |
296 |
|
297 |
char* text_offset_str = get_token((char**)&buf,"\t"); |
298 |
char* seq_str = get_token((char**)&buf,"\t"); |
299 |
if (seq) |
300 |
strcpy(seq, seq_str); |
301 |
|
302 |
const char* qual_str = get_token((char**)&buf,"\t"); |
303 |
if (qual) |
304 |
strcpy(qual, qual_str); |
305 |
|
306 |
/*const char* other_occs_str =*/ get_token((char**)&buf, "\t"); |
307 |
mismatches[0] = 0; |
308 |
char* mismatches_str = get_token((char**)&buf, "\t"); |
309 |
if (mismatches_str) |
310 |
strcpy(mismatches, mismatches_str); |
311 |
|
312 |
orientation = orientation_str[0]; |
313 |
text_offset = atoi(text_offset_str); |
314 |
|
315 |
bool end = true; |
316 |
unsigned int seg_offset = 0; |
317 |
unsigned int seg_num = 0; |
318 |
unsigned int num_segs = 0; |
319 |
|
320 |
// Copy the tag out of the name field before we might wipe it out |
321 |
char* pipe = strrchr(name, '|'); |
322 |
if (pipe) |
323 |
{ |
324 |
if (name_tags) |
325 |
strcpy(name_tags, pipe); |
326 |
|
327 |
char* tag_buf = pipe + 1; |
328 |
if (strchr(tag_buf, ':')) |
329 |
{ |
330 |
sscanf(tag_buf, "%u:%u:%u", &seg_offset, &seg_num, &num_segs); |
331 |
if (seg_num + 1 == num_segs) |
332 |
end = true; |
333 |
else |
334 |
end = false; |
335 |
} |
336 |
|
337 |
*pipe = 0; |
338 |
} |
339 |
// Stripping the slash and number following it gives the insert name |
340 |
char* slash = strrchr(name, '/'); |
341 |
if (strip_slash && slash) |
342 |
*slash = 0; |
343 |
|
344 |
size_t read_len = strlen(seq_str); |
345 |
|
346 |
// Add this alignment to the table of hits for this half of the |
347 |
// Bowtie map |
348 |
|
349 |
//vector<string> mismatch_toks; |
350 |
char* pch = strtok (mismatches,","); |
351 |
unsigned char num_mismatches = 0; |
352 |
while (pch != NULL) |
353 |
{ |
354 |
char* colon = strchr(pch, ':'); |
355 |
if (colon) |
356 |
{ |
357 |
num_mismatches++; |
358 |
} |
359 |
//mismatch_toks.push_back(pch); |
360 |
pch = strtok (NULL, ","); |
361 |
} |
362 |
|
363 |
bh = create_hit(name, |
364 |
text_name, |
365 |
text_offset, |
366 |
(int)read_len, |
367 |
orientation == '-', |
368 |
num_mismatches, |
369 |
end); |
370 |
|
371 |
return true; |
372 |
} |
373 |
|
374 |
int anchor_mismatch = 0; |
375 |
|
376 |
|
377 |
void parseSegReadName(char* name, char*& name_tags, bool strip_slash, |
378 |
bool &end, unsigned int &seg_offset, unsigned int& seg_num, |
379 |
unsigned int & num_segs) { |
380 |
char* pipe = strrchr(name, '|'); |
381 |
if (pipe) |
382 |
{ |
383 |
if (name_tags) |
384 |
strcpy(name_tags, pipe); |
385 |
|
386 |
char* tag_buf = pipe + 1; |
387 |
if (strchr(tag_buf, ':')) |
388 |
{ |
389 |
sscanf(tag_buf, "%u:%u:%u", &seg_offset, &seg_num, &num_segs); |
390 |
if (seg_num + 1 == num_segs) |
391 |
end = true; |
392 |
else |
393 |
end = false; |
394 |
} |
395 |
|
396 |
*pipe = 0; |
397 |
} |
398 |
|
399 |
// Stripping the slash and number following it gives the insert name |
400 |
char* slash = strrchr(name, '/'); |
401 |
if (strip_slash && slash) |
402 |
*slash = 0; |
403 |
} |
404 |
|
405 |
|
406 |
bool SplicedBowtieHitFactory::get_hit_from_buf(const char* orig_bwt_buf, |
407 |
BowtieHit& bh, |
408 |
bool strip_slash, |
409 |
char* name_out, |
410 |
char* name_tags, |
411 |
char* seq, |
412 |
char* qual) |
413 |
{ |
414 |
if (!orig_bwt_buf || !*orig_bwt_buf) |
415 |
return false; |
416 |
|
417 |
static const int buf_size = 2048; |
418 |
|
419 |
char bwt_buf[buf_size]; |
420 |
strcpy(bwt_buf, orig_bwt_buf); |
421 |
//const char* bwt_fmt_str = "%s %c %s %d %s %s %d %s"; |
422 |
|
423 |
char orientation; |
424 |
char text_name[buf_size]; |
425 |
unsigned int text_offset; |
426 |
char mismatches[buf_size]; |
427 |
//memset(mismatches, 0, sizeof(mismatches)); |
428 |
|
429 |
char* buf = bwt_buf; |
430 |
char* name = get_token((char**)&buf,"\t"); |
431 |
char* orientation_str = get_token((char**)&buf,"\t"); |
432 |
char* text_name_str = get_token((char**)&buf,"\t"); |
433 |
strcpy(text_name, text_name_str); |
434 |
|
435 |
char* text_offset_str = get_token((char**)&buf,"\t"); |
436 |
char* seq_str = get_token((char**)&buf,"\t"); |
437 |
if (seq) |
438 |
strcpy(seq, seq_str); |
439 |
|
440 |
const char* qual_str = get_token((char**)&buf,"\t"); |
441 |
if (qual) |
442 |
strcpy(qual, qual_str); |
443 |
|
444 |
/*const char* other_occs_str =*/ get_token((char**)&buf, "\t"); |
445 |
mismatches[0] = 0; |
446 |
char* mismatches_str = get_token((char**)&buf, "\t"); |
447 |
if (mismatches_str) |
448 |
strcpy(mismatches, mismatches_str); |
449 |
|
450 |
orientation = orientation_str[0]; |
451 |
text_offset = atoi(text_offset_str); |
452 |
|
453 |
bool end = true; |
454 |
unsigned int seg_offset = 0; |
455 |
unsigned int seg_num = 0; |
456 |
unsigned int num_segs = 0; |
457 |
|
458 |
// Copy the tag out of the name field before we might wipe it out |
459 |
parseSegReadName(name, name_tags, strip_slash, end, seg_offset, seg_num, num_segs); |
460 |
|
461 |
// Add this alignment to the table of hits for this half of the |
462 |
// Bowtie map |
463 |
|
464 |
// Parse the text_name field to recover the splice coords |
465 |
vector<string> toks; |
466 |
|
467 |
tokenize_strict(text_name, "|", toks); |
468 |
|
469 |
int num_extra_toks = (int)toks.size() - 6; |
470 |
|
471 |
if (num_extra_toks >= 0) |
472 |
{ |
473 |
static const uint8_t left_window_edge_field = 1; |
474 |
static const uint8_t splice_field = 2; |
475 |
//static const uint8_t right_window_edge_field = 3; |
476 |
static const uint8_t junction_type_field = 4; |
477 |
static const uint8_t strand_field = 5; |
478 |
|
479 |
string contig = toks[0]; |
480 |
for (int t = 1; t <= num_extra_toks; ++t) |
481 |
{ |
482 |
contig += "|"; |
483 |
contig += toks[t]; |
484 |
} |
485 |
|
486 |
vector<string> splice_toks; |
487 |
tokenize(toks[num_extra_toks + splice_field], "-", splice_toks); |
488 |
if (splice_toks.size() != 2) |
489 |
{ |
490 |
fprintf(stderr, "Warning: found malformed splice record, skipping:\n"); |
491 |
//fprintf(stderr, "%s (token: %s)\n", text_name, |
492 |
// toks[num_extra_toks + splice_field].c_str()); |
493 |
return false; |
494 |
} |
495 |
|
496 |
// |
497 |
// check for an insertion hit |
498 |
// |
499 |
if(toks[num_extra_toks + junction_type_field] == "ins") |
500 |
{ |
501 |
int8_t spliced_read_len = strlen(seq_str); |
502 |
/* |
503 |
* The 0-based position of the left edge of the alignment. Note that this |
504 |
* value may need to be futher corrected to account for the presence of |
505 |
* of the insertion. |
506 |
*/ |
507 |
uint32_t left = atoi(toks[num_extra_toks + left_window_edge_field].c_str()) + text_offset; |
508 |
uint32_t right = left + spliced_read_len - 1; |
509 |
|
510 |
|
511 |
/* |
512 |
* The 0-based position of the last genomic sequence before the insertion |
513 |
*/ |
514 |
uint32_t left_splice_pos = atoi(splice_toks[0].c_str()); |
515 |
|
516 |
string insertedSequence = splice_toks[1]; |
517 |
/* |
518 |
* The 0-based position of the first genomic sequence after teh insertion |
519 |
*/ |
520 |
uint32_t right_splice_pos = left_splice_pos + 1; |
521 |
if(left > left_splice_pos){ |
522 |
/* |
523 |
* The genomic position of the left edge of the alignment needs to be corrected |
524 |
* If the alignment does not extend into the insertion, simply subtract the length |
525 |
* of the inserted sequence, otherwise, just set it equal to the right edge |
526 |
*/ |
527 |
left = left - insertedSequence.length(); |
528 |
if(left < right_splice_pos){ |
529 |
left = right_splice_pos; |
530 |
} |
531 |
} |
532 |
if(right > left_splice_pos){ |
533 |
right = right - insertedSequence.length(); |
534 |
if(right < left_splice_pos){ |
535 |
right = left_splice_pos; |
536 |
} |
537 |
} |
538 |
/* |
539 |
* Now, right and left should be properly transformed into genomic coordinates |
540 |
* We should be able to deduce how much the alignment matches the insertion |
541 |
* simply based on the length of the read |
542 |
*/ |
543 |
int left_match_length = 0; |
544 |
if(left <= left_splice_pos){ |
545 |
left_match_length = left_splice_pos - left + 1; |
546 |
} |
547 |
int right_match_length = 0; |
548 |
if(right >= right_splice_pos){ |
549 |
right_match_length = right - right_splice_pos + 1; |
550 |
} |
551 |
int insertion_match_length = spliced_read_len - left_match_length - right_match_length; |
552 |
|
553 |
if(left_match_length <= 0 || right_match_length <= 0 || insertion_match_length <= 0) |
554 |
return false; |
555 |
|
556 |
string junction_strand = toks[num_extra_toks + strand_field]; |
557 |
if(junction_strand != "rev" && junction_strand != "fwd"){ |
558 |
fprintf(stderr, "Malformed insertion record\n"); |
559 |
return false; |
560 |
} |
561 |
|
562 |
char* pch = strtok( mismatches, ","); |
563 |
unsigned char num_mismatches = 0; |
564 |
|
565 |
/* |
566 |
* remember that text_offset holds the left end of the |
567 |
* alignment, relative to the start of the contig |
568 |
*/ |
569 |
|
570 |
/* |
571 |
* The 0-based relative position of the left-most character |
572 |
* before the insertion in the contig |
573 |
*/ |
574 |
int relative_splice_pos = left_splice_pos - atoi(toks[num_extra_toks + left_window_edge_field].c_str()); |
575 |
while (pch != NULL) |
576 |
{ |
577 |
char* colon = strchr(pch, ':'); |
578 |
if (colon) |
579 |
{ |
580 |
*colon = 0; |
581 |
int mismatch_pos = atoi(pch); |
582 |
|
583 |
/* |
584 |
* for reversely mapped reads, |
585 |
* find the correct mismatched position. |
586 |
*/ |
587 |
if(orientation == '-'){ |
588 |
mismatch_pos = spliced_read_len - mismatch_pos - 1; |
589 |
} |
590 |
|
591 |
/* |
592 |
* Only count mismatches outside of the insertion region |
593 |
* If there is a mismatch within the insertion, |
594 |
* disallow this hit |
595 |
*/ |
596 |
if(mismatch_pos + text_offset <= relative_splice_pos || mismatch_pos + text_offset > relative_splice_pos + insertedSequence.length()){ |
597 |
num_mismatches++; |
598 |
}else{ |
599 |
return false; |
600 |
} |
601 |
} |
602 |
pch = strtok (NULL, ","); |
603 |
} |
604 |
|
605 |
|
606 |
vector<CigarOp> cigar; |
607 |
cigar.push_back(CigarOp(MATCH, left_match_length)); |
608 |
cigar.push_back(CigarOp(INS, insertion_match_length)); |
609 |
cigar.push_back(CigarOp(MATCH, right_match_length)); |
610 |
|
611 |
/* |
612 |
* For now, disallow hits that don't span |
613 |
* the insertion. If we allow these types of hits, |
614 |
* then long_spanning.cpp needs to be updated |
615 |
* in order to intelligently merge these kinds |
616 |
* of reads back together |
617 |
* |
618 |
* Following code has been changed to allow segment that end |
619 |
* in an insertion |
620 |
*/ |
621 |
bh = create_hit(name, |
622 |
contig, |
623 |
left, |
624 |
cigar, |
625 |
orientation == '-', |
626 |
junction_strand == "rev", |
627 |
num_mismatches, |
628 |
0, |
629 |
end); |
630 |
return true; |
631 |
} |
632 |
|
633 |
else |
634 |
{ |
635 |
uint32_t left = atoi(toks[num_extra_toks + left_window_edge_field].c_str()) + text_offset; |
636 |
int spliced_read_len = strlen(seq_str); |
637 |
int8_t left_splice_pos = atoi(splice_toks[0].c_str()) - left + 1; |
638 |
if(left_splice_pos > spliced_read_len) left_splice_pos = spliced_read_len; |
639 |
int8_t right_splice_pos = spliced_read_len - left_splice_pos; |
640 |
|
641 |
if (right_splice_pos <= 0 || left_splice_pos <= 0) |
642 |
return false; |
643 |
|
644 |
if (orientation == '+') |
645 |
{ |
646 |
if (left_splice_pos + seg_offset < _anchor_length){ |
647 |
return false; |
648 |
} |
649 |
} |
650 |
else |
651 |
{ |
652 |
if (right_splice_pos + seg_offset < _anchor_length) |
653 |
return false; |
654 |
} |
655 |
//uint32_t right = atoi(splice_toks[1].c_str()) + right_splice_pos; |
656 |
//atoi(toks[right_window_edge_field].c_str()); |
657 |
int gap_len = atoi(splice_toks[1].c_str()) - atoi(splice_toks[0].c_str()) - 1; |
658 |
|
659 |
string junction_strand = toks[num_extra_toks + strand_field]; |
660 |
if (!(junction_strand == "rev" || junction_strand == "fwd")|| |
661 |
!(orientation == '-' || orientation == '+')) |
662 |
{ |
663 |
fprintf(stderr, "Warning: found malformed splice record, skipping\n"); |
664 |
//fprintf(stderr, "junction_strand=%s, orientation='%c'\n", |
665 |
// junction_strand.c_str(), orientation); |
666 |
return false; |
667 |
} |
668 |
|
669 |
//vector<string> mismatch_toks; |
670 |
char* pch = strtok (mismatches,","); |
671 |
int mismatches_in_anchor = 0; |
672 |
unsigned char num_mismatches = 0; |
673 |
while (pch != NULL) |
674 |
{ |
675 |
char* colon = strchr(pch, ':'); |
676 |
if (colon) |
677 |
{ |
678 |
*colon = 0; |
679 |
num_mismatches++; |
680 |
int mismatch_pos = atoi(pch); |
681 |
if ((orientation == '+' && abs(mismatch_pos - left_splice_pos) < (int)min_anchor_len) || |
682 |
(orientation == '-' && abs(((int)spliced_read_len - left_splice_pos + 1) - mismatch_pos)) < (int)min_anchor_len) |
683 |
mismatches_in_anchor++; |
684 |
} |
685 |
//mismatch_toks.push_back(pch); |
686 |
pch = strtok (NULL, ","); |
687 |
} |
688 |
|
689 |
// FIXME: we probably should exclude these hits somewhere, but this |
690 |
// isn't the right place |
691 |
vector<CigarOp> cigar; |
692 |
cigar.push_back(CigarOp(MATCH, left_splice_pos)); |
693 |
if(toks[num_extra_toks + junction_type_field] == "del"){ |
694 |
cigar.push_back(CigarOp(DEL, gap_len)); |
695 |
}else{ |
696 |
cigar.push_back(CigarOp(REF_SKIP, gap_len)); |
697 |
} |
698 |
cigar.push_back(CigarOp(MATCH, right_splice_pos)); |
699 |
|
700 |
bh = create_hit(name, |
701 |
contig, |
702 |
left, |
703 |
cigar, |
704 |
orientation == '-', |
705 |
junction_strand == "rev", |
706 |
num_mismatches, |
707 |
mismatches_in_anchor, |
708 |
end); |
709 |
return true; |
710 |
} |
711 |
} |
712 |
else |
713 |
{ |
714 |
fprintf(stderr, "Warning: found malformed splice record, skipping\n"); |
715 |
//fprintf(stderr, "%s\n", orig_bwt_buf); |
716 |
// continue; |
717 |
return false; |
718 |
} |
719 |
|
720 |
return false; |
721 |
} |
722 |
|
723 |
|
724 |
int parseCigar(vector<CigarOp>& cigar, const char* cigar_str, |
725 |
bool &spliced_alignment) { |
726 |
const char* p_cig = cigar_str; |
727 |
int refspan=0; //alignment span on reference sequence |
728 |
while (*p_cig) |
729 |
{ |
730 |
char* t; |
731 |
int op_len = (int)strtol(p_cig, &t, 10); |
732 |
if (op_len <= 0) |
733 |
{ |
734 |
fprintf (stderr, "Error: CIGAR op has zero length\n"); |
735 |
return 0; |
736 |
} |
737 |
char op_char = toupper(*t); |
738 |
CigarOpCode opcode; |
739 |
switch (op_char) { |
740 |
case '=': |
741 |
case 'X': |
742 |
case 'M': opcode = MATCH; |
743 |
refspan+=op_len; |
744 |
break; |
745 |
case 'I': opcode = INS; |
746 |
break; |
747 |
case 'D': opcode = DEL; |
748 |
refspan+=op_len; |
749 |
break; |
750 |
case 'N': if (op_len > max_report_intron_length) |
751 |
return 0; |
752 |
opcode = REF_SKIP; |
753 |
spliced_alignment = true; |
754 |
refspan+=op_len; |
755 |
break; |
756 |
case 'S': opcode = SOFT_CLIP; |
757 |
break; |
758 |
case 'H': opcode = HARD_CLIP; |
759 |
break; |
760 |
case 'P': opcode = PAD; |
761 |
break; |
762 |
default: fprintf (stderr, "Error: invalid CIGAR operation\n"); |
763 |
return 0; |
764 |
} |
765 |
p_cig = t + 1; |
766 |
cigar.push_back(CigarOp(opcode, op_len)); |
767 |
} //while cigar codes |
768 |
if (*p_cig) { |
769 |
fprintf (stderr, "Error: unmatched CIGAR operation (%s)\n",p_cig); |
770 |
return 0; |
771 |
} |
772 |
return refspan; |
773 |
} |
774 |
|
775 |
int getSAMmismatches(char* &buf, vector<CigarOp>& cigar, |
776 |
vector<int>& mismatches, int& sam_nm, bool& antisense_splice) { |
777 |
int gspan=0;//genomic span of the alignment |
778 |
const char* tag_buf = buf; |
779 |
sam_nm=0; |
780 |
//num_mismatches=0; |
781 |
while((tag_buf = get_token((char**)&buf,"\t"))) |
782 |
{ |
783 |
vector<string> tuple_fields; |
784 |
tokenize(tag_buf,":", tuple_fields); |
785 |
if (tuple_fields.size() == 3) |
786 |
{ |
787 |
if (tuple_fields[0] == "XS") |
788 |
{ |
789 |
if (tuple_fields[2] == "-") |
790 |
antisense_splice = true; |
791 |
} |
792 |
else if (tuple_fields[0] == "NM") |
793 |
{ |
794 |
sam_nm = atoi(tuple_fields[2].c_str()); |
795 |
} |
796 |
else if (tuple_fields[0] == "NS") |
797 |
{ |
798 |
//ignored for now |
799 |
} |
800 |
else if (tuple_fields[0] == "MD") |
801 |
{ |
802 |
const char* p=tuple_fields[2].c_str(); |
803 |
int bi=0; //base offset position in the read |
804 |
while (*p != 0) { |
805 |
if (isdigit(*p)) { |
806 |
int v=atoi(p); |
807 |
do { p++; } while (isdigit(*p)); |
808 |
bi+=v; |
809 |
} |
810 |
while (isalpha(*p)) { |
811 |
p++; |
812 |
mismatches.push_back(bi); |
813 |
bi++; |
814 |
} |
815 |
if (*p=='^') { //reference deletion |
816 |
p++; |
817 |
while (isalpha(*p)) { //insert read bases |
818 |
p++; bi++; |
819 |
} |
820 |
} |
821 |
} |
822 |
} |
823 |
else |
824 |
{ |
825 |
//fprintf(stderr, "%s attribute not supported\n", tuple_fields[0].c_str()); |
826 |
//return false; |
827 |
} |
828 |
} |
829 |
} |
830 |
/* By convention,the NM field of the SAM record |
831 |
* counts an insertion or deletion. I dont' think |
832 |
* we want the mismatch count in the BowtieHit |
833 |
* record to reflect this. Therefore, subtract out |
834 |
* the mismatches due to in/dels |
835 |
*/ |
836 |
for(vector<CigarOp>::const_iterator itr = cigar.begin(); itr != cigar.end(); ++itr){ |
837 |
switch (itr->opcode) |
838 |
{ |
839 |
case MATCH: |
840 |
case REF_SKIP: |
841 |
case PAD: |
842 |
gspan += itr->length; |
843 |
break; |
844 |
case DEL: |
845 |
gspan += itr->length; |
846 |
sam_nm -= itr->length; |
847 |
break; |
848 |
case INS: |
849 |
sam_nm -= itr->length; |
850 |
break; |
851 |
} |
852 |
} |
853 |
return gspan; |
854 |
} |
855 |
|
856 |
bool SAMHitFactory::get_hit_from_buf(const char* orig_bwt_buf, |
857 |
BowtieHit& bh, |
858 |
bool strip_slash, |
859 |
char* name_out, |
860 |
char* name_tags, |
861 |
char* seq, |
862 |
char* qual) |
863 |
{ |
864 |
if (!orig_bwt_buf || !*orig_bwt_buf) |
865 |
return false; |
866 |
char bwt_buf[2048]; |
867 |
strcpy(bwt_buf, orig_bwt_buf); |
868 |
// Are we still in the header region? |
869 |
if (bwt_buf[0] == '@') |
870 |
return false; |
871 |
|
872 |
char* buf = bwt_buf; |
873 |
char* name = get_token((char**)&buf,"\t"); |
874 |
char* sam_flag_str = get_token((char**)&buf,"\t"); |
875 |
char* text_name = get_token((char**)&buf,"\t"); |
876 |
char* text_offset_str = get_token((char**)&buf,"\t"); |
877 |
const char* map_qual_str = get_token((char**)&buf,"\t"); |
878 |
char* cigar_str = get_token((char**)&buf,"\t"); |
879 |
const char* mate_ref_str = get_token((char**)&buf,"\t"); |
880 |
const char* mate_pos_str = get_token((char**)&buf,"\t"); |
881 |
const char* inferred_insert_sz_str = get_token((char**)&buf,"\t"); |
882 |
|
883 |
const char* seq_str = get_token((char**)&buf,"\t"); |
884 |
if (seq) |
885 |
strcpy(seq, seq_str); |
886 |
|
887 |
const char* qual_str = get_token((char**)&buf,"\t"); |
888 |
if (qual) |
889 |
strcpy(qual, qual_str); |
890 |
|
891 |
if (!name || |
892 |
!sam_flag_str || |
893 |
!text_name || |
894 |
!text_offset_str || |
895 |
!map_qual_str || |
896 |
!cigar_str || |
897 |
!mate_ref_str || |
898 |
!mate_pos_str || |
899 |
!inferred_insert_sz_str || |
900 |
!seq_str || |
901 |
!qual_str) |
902 |
{ |
903 |
// truncated or malformed SAM record |
904 |
return false; |
905 |
} |
906 |
|
907 |
|
908 |
int sam_flag = atoi(sam_flag_str); |
909 |
int text_offset = atoi(text_offset_str); |
910 |
|
911 |
bool end = true; |
912 |
unsigned int seg_offset = 0; |
913 |
unsigned int seg_num = 0; |
914 |
unsigned int num_segs = 0; |
915 |
|
916 |
// Copy the tag out of the name field before we might wipe it out |
917 |
parseSegReadName(name, name_tags, strip_slash, end, seg_offset, seg_num, num_segs); |
918 |
|
919 |
vector<CigarOp> cigar; |
920 |
bool spliced_alignment = false; |
921 |
int refspan=parseCigar(cigar, cigar_str, spliced_alignment); |
922 |
if (refspan==0) |
923 |
return false; |
924 |
//vector<string> attributes; |
925 |
//tokenize(tag_buf, " \t",attributes); |
926 |
|
927 |
bool antisense_splice = false; |
928 |
int sam_nm = 0; //the value of the NM tag (edit distance) |
929 |
//int mismatches[1024];//array with mismatch positions on the read (0-based from the left aligned end of the read) |
930 |
vector<int> mismatches; |
931 |
getSAMmismatches(buf, cigar, mismatches, sam_nm, antisense_splice); |
932 |
|
933 |
if (spliced_alignment) |
934 |
{ |
935 |
bh = create_hit(name, |
936 |
text_name, |
937 |
text_offset - 1, |
938 |
cigar, |
939 |
sam_flag & 0x0010, |
940 |
antisense_splice, |
941 |
mismatches.size(), |
942 |
0, |
943 |
end); |
944 |
} |
945 |
else |
946 |
{ |
947 |
//assert(cigar.size() == 1 && cigar[0].opcode == MATCH); |
948 |
bh = create_hit(name, |
949 |
text_name, |
950 |
text_offset - 1, // SAM files are 1-indexed |
951 |
cigar, |
952 |
sam_flag & 0x0010, |
953 |
false, |
954 |
mismatches.size(), |
955 |
0, |
956 |
end); |
957 |
} |
958 |
return true; |
959 |
} |
960 |
|
961 |
void cigar_add(vector<CigarOp>& cigar, CigarOp& op) { |
962 |
if (op.length<=0) return; |
963 |
if (cigar.size()>0 && cigar.back().opcode==op.opcode) { |
964 |
cigar.back().length+=op.length; |
965 |
} |
966 |
cigar.push_back(op); |
967 |
} |
968 |
|
969 |
int spliceCigar(vector<CigarOp>& splcigar, vector<CigarOp>& cigar, vector<int> mismatches, |
970 |
int &left, int spl_start, int spl_len, CigarOpCode spl_code) { |
971 |
//merge the original 'cigar' with the new insert/gap operation |
972 |
//at position spl_start and place the result into splcigar; |
973 |
int num_mismatches=mismatches.size(); |
974 |
//these offsets are relative to the beginning of alignment on reference |
975 |
int spl_ofs=spl_start-left; //relative position of splice op |
976 |
int spl_ofs_end=spl_ofs; //relative position of first ref base AFTER splice op |
977 |
CigarOp gapop(spl_code, spl_len); //for DEL or REF_SKIP |
978 |
if (spl_code==INS) |
979 |
spl_ofs_end += spl_len; |
980 |
int ref_ofs=0; //working offset on reference |
981 |
int read_ofs=0; //working offset on the read, relative to the leftmost aligned base |
982 |
bool xfound=false; |
983 |
//if (left<=spl_start+spl_len) { |
984 |
if (spl_ofs_end>0) { |
985 |
int prev_opcode=0; |
986 |
int prev_oplen=0; |
987 |
for (size_t c = 0 ; c < cigar.size(); ++c) |
988 |
{ |
989 |
int prev_read_ofs=read_ofs; |
990 |
int cur_op_ofs=ref_ofs; |
991 |
int cur_opcode=cigar[c].opcode; |
992 |
int cur_oplen=cigar[c].length; |
993 |
switch (cur_opcode) { |
994 |
case MATCH: |
995 |
ref_ofs+=cur_oplen; |
996 |
read_ofs+=cur_oplen; |
997 |
break; |
998 |
case DEL: |
999 |
case REF_SKIP: |
1000 |
case PAD: |
1001 |
ref_ofs+=cur_oplen; |
1002 |
break; |
1003 |
case SOFT_CLIP: |
1004 |
case INS: |
1005 |
read_ofs+=cur_oplen; |
1006 |
break; |
1007 |
//case HARD_CLIP: |
1008 |
} |
1009 |
if (cur_op_ofs>=spl_ofs_end || ref_ofs<=spl_ofs) { |
1010 |
if (spl_code!=INS && cur_op_ofs==spl_ofs_end) |
1011 |
{ |
1012 |
//we have to insert the gap here first |
1013 |
cigar_add(splcigar, gapop); |
1014 |
} |
1015 |
cigar_add(splcigar, cigar[c]); |
1016 |
} |
1017 |
else //if (ref_ofs>spl_ofs) { |
1018 |
{ //op intersection |
1019 |
xfound=true; |
1020 |
if (spl_code==INS) { |
1021 |
//we have to shorten cur_opcode |
1022 |
// find the overlap between current range |
1023 |
int ovl_start = (cur_op_ofs>spl_ofs) ? cur_op_ofs : spl_ofs; |
1024 |
int ovl_end = (ref_ofs>spl_ofs_end) ? spl_ofs_end : ref_ofs; |
1025 |
CigarOp op(cigar[c]); |
1026 |
int len=op.length; |
1027 |
len-=(ovl_end-ovl_start); |
1028 |
if (len>0) { |
1029 |
op.length=len; |
1030 |
cigar_add(splcigar, op); |
1031 |
} |
1032 |
} |
1033 |
else {//DEL or REF_SKIP |
1034 |
//spl_ofs == spl_ofs_end |
1035 |
//we have to split cur_opcode |
1036 |
CigarOp op(cigar[c]); |
1037 |
op.length=spl_ofs-cur_op_ofs; |
1038 |
cigar_add(splcigar, op); |
1039 |
cigar_add(splcigar, gapop); |
1040 |
op.length=ref_ofs-spl_ofs; |
1041 |
cigar_add(splcigar,op); |
1042 |
} |
1043 |
} //op intersection |
1044 |
prev_opcode=cur_opcode; |
1045 |
prev_oplen=cur_oplen; |
1046 |
} //for each cigar opcode |
1047 |
} //intersection possible |
1048 |
if (!xfound) {//no intersection found between splice event and alignment |
1049 |
if (spl_ofs_end<=0) { |
1050 |
//alignment starts after the splice event |
1051 |
if (spl_code==INS) left-=spl_len; |
1052 |
else left+=spl_len; |
1053 |
} |
1054 |
//else { |
1055 |
//alignment ends before the splice event |
1056 |
//nothing to do |
1057 |
// } |
1058 |
} |
1059 |
|
1060 |
//TODO: adjust mismatches if opcode==INS and they fall in the removed part |
1061 |
//TODO: what about anchor mismatches for REF_SKIP |
1062 |
return num_mismatches; |
1063 |
} |
1064 |
|
1065 |
bool SplicedSAMHitFactory::get_hit_from_buf(const char* orig_bwt_buf, |
1066 |
BowtieHit& bh, |
1067 |
bool strip_slash, |
1068 |
char* name_out, |
1069 |
char* name_tags, |
1070 |
char* seq, |
1071 |
char* qual) |
1072 |
{ |
1073 |
if (!orig_bwt_buf || !*orig_bwt_buf) |
1074 |
return false; |
1075 |
|
1076 |
char bwt_buf[2048]; |
1077 |
strcpy(bwt_buf, orig_bwt_buf); |
1078 |
// Are we still in the header region? |
1079 |
if (bwt_buf[0] == '@') |
1080 |
return false; |
1081 |
|
1082 |
char* buf = bwt_buf; |
1083 |
char* name = get_token((char**)&buf,"\t"); |
1084 |
char* sam_flag_str = get_token((char**)&buf,"\t"); |
1085 |
char* text_name = get_token((char**)&buf,"\t"); |
1086 |
char* text_offset_str = get_token((char**)&buf,"\t"); |
1087 |
const char* map_qual_str = get_token((char**)&buf,"\t"); |
1088 |
char* cigar_str = get_token((char**)&buf,"\t"); |
1089 |
const char* mate_ref_str = get_token((char**)&buf,"\t"); |
1090 |
const char* mate_pos_str = get_token((char**)&buf,"\t"); |
1091 |
const char* inferred_insert_sz_str = get_token((char**)&buf,"\t"); |
1092 |
//int num_mismatches=0; |
1093 |
//int mismatches[1024]; //list of 0-based mismatch positions in this read |
1094 |
//parsed from SAM's MD:Z: tag |
1095 |
const char* seq_str = get_token((char**)&buf,"\t"); |
1096 |
if (seq) |
1097 |
strcpy(seq, seq_str); |
1098 |
|
1099 |
const char* qual_str = get_token((char**)&buf,"\t"); |
1100 |
if (qual) |
1101 |
strcpy(qual, qual_str); |
1102 |
|
1103 |
if (!name || |
1104 |
!sam_flag_str || |
1105 |
!text_name || |
1106 |
!text_offset_str || |
1107 |
!map_qual_str || |
1108 |
!cigar_str || |
1109 |
!mate_ref_str || |
1110 |
!mate_pos_str || |
1111 |
!inferred_insert_sz_str || |
1112 |
!seq_str || |
1113 |
!qual_str) |
1114 |
{ |
1115 |
// truncated or malformed SAM record |
1116 |
return false; |
1117 |
} |
1118 |
|
1119 |
|
1120 |
int sam_flag = atoi(sam_flag_str); |
1121 |
int text_offset = atoi(text_offset_str); |
1122 |
text_offset--; //make it 0-based (SAM is 1-based, Bowtie is 0-based) |
1123 |
bool end = true; |
1124 |
unsigned int seg_offset = 0; |
1125 |
unsigned int seg_num = 0; |
1126 |
unsigned int num_segs = 0; |
1127 |
|
1128 |
// Copy the tag out of the name field before we might wipe it out |
1129 |
parseSegReadName(name, name_tags, strip_slash, end, seg_offset, seg_num, num_segs); |
1130 |
|
1131 |
vector<CigarOp> samcigar; |
1132 |
bool spliced_alignment = false; |
1133 |
int refspan=parseCigar(samcigar, cigar_str, spliced_alignment); |
1134 |
if (refspan==0) |
1135 |
return false; |
1136 |
bool antisense_splice = false; |
1137 |
int sam_nm = 0; |
1138 |
vector<int> mismatches; |
1139 |
getSAMmismatches(buf, samcigar, mismatches, sam_nm, antisense_splice); |
1140 |
|
1141 |
//############################################## |
1142 |
|
1143 |
// Add this alignment to the table of hits for this half of the |
1144 |
// Bowtie map |
1145 |
|
1146 |
// Parse the text_name field to recover the splice coords |
1147 |
vector<string> toks; |
1148 |
tokenize_strict(text_name, "|", toks); |
1149 |
|
1150 |
int num_extra_toks = (int)toks.size() - 6; |
1151 |
|
1152 |
if (num_extra_toks >= 0) |
1153 |
{ |
1154 |
static const uint8_t left_window_edge_field = 1; |
1155 |
static const uint8_t splice_field = 2; |
1156 |
//static const uint8_t right_window_edge_field = 3; |
1157 |
static const uint8_t junction_type_field = 4; |
1158 |
static const uint8_t strand_field = 5; |
1159 |
|
1160 |
string contig = toks[0]; |
1161 |
for (int t = 1; t <= num_extra_toks; ++t) |
1162 |
{ |
1163 |
contig += "|"; |
1164 |
contig += toks[t]; |
1165 |
} |
1166 |
|
1167 |
vector<string> splice_toks; |
1168 |
tokenize(toks[num_extra_toks + splice_field], "-", splice_toks); |
1169 |
if (splice_toks.size() != 2) |
1170 |
{ |
1171 |
fprintf(stderr, "Warning: found malformed splice record, skipping:\n"); |
1172 |
//fprintf(stderr, "\t%s (token: %s)\n", text_name, |
1173 |
// toks[num_extra_toks + splice_field].c_str()); |
1174 |
return false; |
1175 |
} |
1176 |
|
1177 |
string junction_strand = toks[num_extra_toks + strand_field]; |
1178 |
if(junction_strand != "rev" && junction_strand != "fwd"){ |
1179 |
fprintf(stderr, "Malformed insertion record\n"); |
1180 |
return false; |
1181 |
} |
1182 |
|
1183 |
int spl_num_mismatches=0; |
1184 |
|
1185 |
// |
1186 |
// check for an insertion hit |
1187 |
// |
1188 |
if(toks[num_extra_toks + junction_type_field] == "ins") |
1189 |
{ |
1190 |
//int8_t spliced_read_len = strlen(seq_str); |
1191 |
//TODO FIXME: use the CIGAR instead of seq length! |
1192 |
// The 0-based position of the left edge of the alignment. Note that this |
1193 |
// value may need to be further corrected to account for the presence of |
1194 |
// of the insertion. |
1195 |
int left = atoi(toks[num_extra_toks + left_window_edge_field].c_str()) + text_offset; |
1196 |
|
1197 |
// The 0-based position of the last genomic sequence before the insertion |
1198 |
int left_splice_pos = atoi(splice_toks[0].c_str()); |
1199 |
|
1200 |
string insertedSequence = splice_toks[1]; |
1201 |
// The 0-based position of the first genomic sequence after the insertion |
1202 |
|
1203 |
vector<CigarOp> splcigar; |
1204 |
|
1205 |
//TODO: implement this |
1206 |
//this also updates left to the adjusted genomic coordinates |
1207 |
spl_num_mismatches=spliceCigar(splcigar, samcigar, mismatches, |
1208 |
left, left_splice_pos, insertedSequence.length(), INS); |
1209 |
/* |
1210 |
uint32_t right_splice_pos = left_splice_pos + 1; |
1211 |
|
1212 |
//uint32_t right = left + spliced_read_len - 1; |
1213 |
int right = left + refspan - 1; |
1214 |
|
1215 |
if(left > left_splice_pos){ |
1216 |
//The genomic position of the left edge of the alignment needs to be corrected |
1217 |
//If the alignment does not extend into the insertion, simply subtract the length |
1218 |
//of the inserted sequence, otherwise, just set it equal to the right edge |
1219 |
left = left - insertedSequence.length(); |
1220 |
if(left < right_splice_pos){ |
1221 |
left = right_splice_pos; |
1222 |
} |
1223 |
} |
1224 |
if(right > left_splice_pos){ |
1225 |
right = right - insertedSequence.length(); |
1226 |
if(right < left_splice_pos){ |
1227 |
right = left_splice_pos; |
1228 |
} |
1229 |
} |
1230 |
// Now, right and left should be properly transformed into genomic coordinates |
1231 |
// We should be able to deduce how much the alignment matches the insertion |
1232 |
// simply based on the length of the read |
1233 |
int left_match_length = 0; |
1234 |
if(left <= left_splice_pos){ |
1235 |
left_match_length = left_splice_pos - left + 1; |
1236 |
} |
1237 |
int right_match_length = 0; |
1238 |
if(right >= right_splice_pos){ |
1239 |
right_match_length = right - right_splice_pos + 1; |
1240 |
} |
1241 |
int insertion_match_length = spliced_read_len - left_match_length - right_match_length; |
1242 |
|
1243 |
if(left_match_length <= 0 || right_match_length <= 0 || insertion_match_length <= 0) |
1244 |
return false; |
1245 |
|
1246 |
//char* pch = strtok( mismatches, ","); |
1247 |
//unsigned char num_mismatches = 0; |
1248 |
//text_offset holds the left end of the alignment, |
1249 |
//RELATIVE TO the start of the contig |
1250 |
|
1251 |
//The 0-based relative position of the left-most character |
1252 |
//before the insertion in the contig |
1253 |
int relative_splice_pos = left_splice_pos - atoi(toks[num_extra_toks + left_window_edge_field].c_str()); |
1254 |
for (size_t i=0;i<mismatches.size();++i) { |
1255 |
int mismatch_pos = mismatches[i]; |
1256 |
// for reversely mapped reads, |
1257 |
//find the correct mismatched position. |
1258 |
if (sam_flag & 0x0010){ |
1259 |
mismatch_pos = spliced_read_len - mismatch_pos - 1; |
1260 |
} |
1261 |
|
1262 |
//Only count mismatches outside of the insertion region |
1263 |
// If there is a mismatch within the insertion, |
1264 |
// disallow this hit |
1265 |
if(mismatch_pos + text_offset <= relative_splice_pos || |
1266 |
mismatch_pos + text_offset > relative_splice_pos + insertedSequence.length()){ |
1267 |
spl_num_mismatches++; |
1268 |
}else{ |
1269 |
return false; |
1270 |
} |
1271 |
} |
1272 |
*/ |
1273 |
//vector<CigarOp> splcigar; |
1274 |
//spliceCigar(splcigar, samcigar, left_match_length, insertion_match_length, right_match_length, INS); |
1275 |
//splcigar.push_back(CigarOp(MATCH, left_match_length)); |
1276 |
//splcigar.push_back(CigarOp(INS, insertion_match_length)); |
1277 |
//splcigar.push_back(CigarOp(MATCH, right_match_length)); |
1278 |
|
1279 |
bh = create_hit(name, |
1280 |
contig, |
1281 |
left, |
1282 |
//splcigar, |
1283 |
splcigar, |
1284 |
sam_flag & 0x0010, |
1285 |
junction_strand == "rev", |
1286 |
spl_num_mismatches, |
1287 |
0, |
1288 |
end); |
1289 |
return true; |
1290 |
} //"ins" |
1291 |
else //"del" or intron |
1292 |
{ |
1293 |
int left = atoi(toks[num_extra_toks + left_window_edge_field].c_str()) + text_offset; |
1294 |
int spliced_read_len = strlen(seq_str); //FIXME TODO: use CIGAR instead |
1295 |
int left_splice_pos = atoi(splice_toks[0].c_str()) - left + 1; |
1296 |
if(left_splice_pos > spliced_read_len) left_splice_pos = spliced_read_len; |
1297 |
int right_splice_pos = spliced_read_len - left_splice_pos; |
1298 |
|
1299 |
if (right_splice_pos <= 0 || left_splice_pos <= 0) |
1300 |
return false; |
1301 |
|
1302 |
if ((sam_flag & 0x0010) == 0) //###### |
1303 |
{ |
1304 |
if (left_splice_pos + seg_offset < _anchor_length) |
1305 |
return false; |
1306 |
} |
1307 |
else |
1308 |
{ |
1309 |
if (right_splice_pos + seg_offset < _anchor_length) |
1310 |
return false; |
1311 |
} |
1312 |
//uint32_t right = atoi(splice_toks[1].c_str()) + right_splice_pos; |
1313 |
//atoi(toks[right_window_edge_field].c_str()); |
1314 |
int gap_len = atoi(splice_toks[1].c_str()) - atoi(splice_toks[0].c_str()) - 1; |
1315 |
|
1316 |
int mismatches_in_anchor = 0; |
1317 |
for (size_t i=0;i<mismatches.size();++i) { |
1318 |
spl_num_mismatches++; |
1319 |
int mismatch_pos = mismatches[i]; |
1320 |
if (((sam_flag & 0x0010) == 0 && abs(mismatch_pos - left_splice_pos) < (int)min_anchor_len) || |
1321 |
((sam_flag & 0x0010) != 0 && |
1322 |
abs(((int)spliced_read_len - left_splice_pos + 1) - mismatch_pos)) < (int)min_anchor_len) |
1323 |
mismatches_in_anchor++; |
1324 |
} |
1325 |
|
1326 |
// FIXME: we probably should exclude these hits somewhere, but this |
1327 |
// isn't the right place |
1328 |
vector<CigarOp> splcigar; |
1329 |
CigarOpCode opcode=(toks[num_extra_toks + junction_type_field] == "del")? DEL : REF_SKIP ; |
1330 |
//TODO: FIXME: use relative offsets for this call instead, like for the INS case |
1331 |
spl_num_mismatches=spliceCigar(splcigar, samcigar, mismatches, left, left_splice_pos, gap_len, opcode); |
1332 |
/* |
1333 |
splcigar.push_back(CigarOp(MATCH, left_splice_pos)); |
1334 |
if(toks[num_extra_toks + junction_type_field] == "del"){ |
1335 |
splcigar.push_back(CigarOp(DEL, gap_len)); |
1336 |
}else{ |
1337 |
splcigar.push_back(CigarOp(REF_SKIP, gap_len)); |
1338 |
} |
1339 |
splcigar.push_back(CigarOp(MATCH, right_splice_pos)); |
1340 |
*/ |
1341 |
bh = create_hit(name, |
1342 |
contig, |
1343 |
left, |
1344 |
splcigar, |
1345 |
(sam_flag & 0x0010), |
1346 |
junction_strand == "rev", |
1347 |
spl_num_mismatches, |
1348 |
mismatches_in_anchor, |
1349 |
end); |
1350 |
return true; |
1351 |
} |
1352 |
} //parse splice data |
1353 |
else |
1354 |
{ |
1355 |
fprintf(stderr, "Warning: found malformed splice record, skipping\n"); |
1356 |
//fprintf(stderr, "%s\n", orig_bwt_buf); |
1357 |
// continue; |
1358 |
return false; |
1359 |
} |
1360 |
|
1361 |
return false; |
1362 |
} |
1363 |
|
1364 |
|
1365 |
|
1366 |
bool BAMHitFactory::get_hit_from_buf(const char* orig_bwt_buf, |
1367 |
BowtieHit& bh, bool strip_slash, |
1368 |
char* name_out, char* name_tags, |
1369 |
char* seq, char* qual) { |
1370 |
if (_sam_header==NULL) |
1371 |
err_die("Error: no SAM header when BAMHitFactory::get_hit_from_buf()!"); |
1372 |
const bam1_t* hit_buf = (const bam1_t*)orig_bwt_buf; |
1373 |
|
1374 |
|
1375 |
uint32_t sam_flag = hit_buf->core.flag; |
1376 |
|
1377 |
int text_offset = hit_buf->core.pos; |
1378 |
int text_mate_pos = hit_buf->core.mpos; |
1379 |
int target_id = hit_buf->core.tid; |
1380 |
int mate_target_id = hit_buf->core.mtid; |
1381 |
|
1382 |
vector<CigarOp> cigar; |
1383 |
bool spliced_alignment = false; |
1384 |
int num_hits = 1; |
1385 |
|
1386 |
double mapQ = hit_buf->core.qual; |
1387 |
long double error_prob; |
1388 |
if (mapQ > 0) |
1389 |
{ |
1390 |
long double p = (-1.0 * mapQ) / 10.0; |
1391 |
error_prob = pow(10.0L, p); |
1392 |
} |
1393 |
else |
1394 |
{ |
1395 |
error_prob = 1.0; |
1396 |
} |
1397 |
|
1398 |
//header->target_name[c->tid] |
1399 |
|
1400 |
bool end = true; |
1401 |
unsigned int seg_offset = 0; |
1402 |
unsigned int seg_num = 0; |
1403 |
unsigned int num_segs = 0; |
1404 |
// Copy the tag out of the name field before we might wipe it out |
1405 |
char* qname = bam1_qname(hit_buf); |
1406 |
char* pipe = strrchr(qname, '|'); |
1407 |
if (pipe) |
1408 |
{ |
1409 |
if (name_tags) |
1410 |
strcpy(name_tags, pipe); |
1411 |
|
1412 |
char* tag_buf = pipe + 1; |
1413 |
if (strchr(tag_buf, ':')) |
1414 |
{ |
1415 |
sscanf(tag_buf, "%u:%u:%u", &seg_offset, &seg_num, &num_segs); |
1416 |
if (seg_num + 1 == num_segs) |
1417 |
end = true; |
1418 |
else |
1419 |
end = false; |
1420 |
} |
1421 |
|
1422 |
*pipe = 0; |
1423 |
} |
1424 |
|
1425 |
|
1426 |
if (target_id < 0) { |
1427 |
//assert(cigar.size() == 1 && cigar[0].opcode == MATCH); |
1428 |
bh = create_hit(qname, |
1429 |
"*", //ref_name |
1430 |
0, //left coord |
1431 |
0, //read_len |
1432 |
false, //antisense_aln |
1433 |
0, //edit_dist |
1434 |
end); |
1435 |
return true; |
1436 |
} |
1437 |
|
1438 |
//string text_name = ((samfile_t*)(hs._hit_file))->header->target_name[target_id]; |
1439 |
string text_name = _sam_header->target_name[target_id]; |
1440 |
for (int i = 0; i < hit_buf->core.n_cigar; ++i) |
1441 |
{ |
1442 |
//char* t; |
1443 |
|
1444 |
int length = bam1_cigar(hit_buf)[i] >> BAM_CIGAR_SHIFT; |
1445 |
if (length <= 0) |
1446 |
{ |
1447 |
fprintf (stderr, "BAM error: CIGAR op has zero length\n"); |
1448 |
return false; |
1449 |
} |
1450 |
|
1451 |
CigarOpCode opcode; |
1452 |
switch(bam1_cigar(hit_buf)[i] & BAM_CIGAR_MASK) |
1453 |
{ |
1454 |
case BAM_CMATCH: opcode = MATCH; break; |
1455 |
case BAM_CINS: opcode = INS; break; |
1456 |
case BAM_CDEL: opcode = DEL; break; |
1457 |
case BAM_CSOFT_CLIP: opcode = SOFT_CLIP; break; |
1458 |
case BAM_CHARD_CLIP: opcode = HARD_CLIP; break; |
1459 |
case BAM_CPAD: opcode = PAD; break; |
1460 |
case BAM_CREF_SKIP: |
1461 |
opcode = REF_SKIP; |
1462 |
spliced_alignment = true; |
1463 |
if (length > (int)max_report_intron_length) |
1464 |
{ |
1465 |
//fprintf(stderr, "Encounter REF_SKIP > max_gene_length, skipping\n"); |
1466 |
return false; |
1467 |
} |
1468 |
break; |
1469 |
default: |
1470 |
fprintf (stderr, "BAM read: invalid CIGAR operation\n"); |
1471 |
return false; |
1472 |
} |
1473 |
if (opcode != HARD_CLIP) |
1474 |
cigar.push_back(CigarOp(opcode, length)); |
1475 |
} |
1476 |
|
1477 |
string mrnm; |
1478 |
if (mate_target_id >= 0) { |
1479 |
if (mate_target_id == target_id) { |
1480 |
//mrnm = ((samfile_t*)(hs._hit_file))->header->target_name[mate_target_id]; |
1481 |
mrnm = _sam_header->target_name[mate_target_id]; |
1482 |
} |
1483 |
else { |
1484 |
//fprintf(stderr, "Trans-spliced mates are not currently supported, skipping\n"); |
1485 |
return false; |
1486 |
} |
1487 |
} |
1488 |
else { |
1489 |
text_mate_pos = 0; |
1490 |
} |
1491 |
//CuffStrand source_strand = CUFF_STRAND_UNKNOWN; |
1492 |
bool antisense_splice=false; |
1493 |
unsigned char num_mismatches = 0; |
1494 |
unsigned char num_splice_anchor_mismatches = 0; |
1495 |
|
1496 |
uint8_t* ptr = bam_aux_get(hit_buf, "XS"); |
1497 |
if (ptr) { |
1498 |
char src_strand_char = bam_aux2A(ptr); |
1499 |
if (src_strand_char == '-') |
1500 |
antisense_splice = true; |
1501 |
// else if (src_strand_char == '+') |
1502 |
// source_strand = CUFF_FWD; |
1503 |
} |
1504 |
|
1505 |
ptr = bam_aux_get(hit_buf, "NM"); |
1506 |
if (ptr) { |
1507 |
num_mismatches = bam_aux2i(ptr); |
1508 |
} |
1509 |
|
1510 |
ptr = bam_aux_get(hit_buf, "NH"); |
1511 |
if (ptr) { |
1512 |
num_hits = bam_aux2i(ptr); |
1513 |
} |
1514 |
|
1515 |
//bool antisense_aln = bam1_strand(hit_buf); |
1516 |
|
1517 |
//if (_rg_props.strandedness() == STRANDED_PROTOCOL && source_strand == CUFF_STRAND_UNKNOWN) |
1518 |
// source_strand = use_stranded_protocol(sam_flag, antisense_aln, _rg_props.mate_strand_mapping()); |
1519 |
if (spliced_alignment) { |
1520 |
//if (source_strand == CUFF_STRAND_UNKNOWN) { |
1521 |
// fprintf(stderr, "BAM record error: found spliced alignment without XS attribute\n"); |
1522 |
// } |
1523 |
bh = create_hit(qname, |
1524 |
text_name, |
1525 |
text_offset, // BAM files are 0-indexed |
1526 |
cigar, |
1527 |
sam_flag & 0x0010, |
1528 |
antisense_splice, |
1529 |
num_mismatches, |
1530 |
num_splice_anchor_mismatches, |
1531 |
end); |
1532 |
|
1533 |
} |
1534 |
else { |
1535 |
//assert(_rg_props.strandedness() == STRANDED_PROTOCOL || source_strand == CUFF_STRAND_UNKNOWN); |
1536 |
//assert(cigar.size() == 1 && cigar[0].opcode == MATCH); |
1537 |
bh = create_hit(qname, |
1538 |
text_name, |
1539 |
text_offset, // BAM files are 0-indexed |
1540 |
cigar, |
1541 |
sam_flag & 0x0010, |
1542 |
false, |
1543 |
num_mismatches, |
1544 |
0, |
1545 |
end); |
1546 |
} |
1547 |
if (seq!=NULL) { |
1548 |
char *bseq = (char*)bam1_seq(hit_buf); |
1549 |
for(int i=0;i<(hit_buf->core.l_qseq);i++) { |
1550 |
char v = bam1_seqi(bseq,i); |
1551 |
seq[i]=bam_nt16_rev_table[v]; |
1552 |
} |
1553 |
seq[hit_buf->core.l_qseq]=0; |
1554 |
} |
1555 |
if (qual!=NULL) { |
1556 |
char *bq = (char*)bam1_qual(hit_buf); |
1557 |
for(int i=0;i<(hit_buf->core.l_qseq);i++) { |
1558 |
qual[i]=bq[i]+33; |
1559 |
} |
1560 |
qual[hit_buf->core.l_qseq]=0; |
1561 |
} |
1562 |
return true; |
1563 |
} |
1564 |
|
1565 |
bool BAMHitFactory::inspect_header(HitStream& hs) |
1566 |
{ |
1567 |
bam_header_t* header = ((samfile_t*)(hs._hit_file))->header; |
1568 |
|
1569 |
if (header == NULL) { |
1570 |
fprintf(stderr, "Warning: No BAM header\n"); |
1571 |
return false; |
1572 |
} |
1573 |
if (header->l_text == 0) { |
1574 |
fprintf(stderr, "Warning: BAM header has 0 length or is corrupted. Try using 'samtools reheader'.\n"); |
1575 |
return false; |
1576 |
} |
1577 |
return true; |
1578 |
} |
1579 |
|
1580 |
|
1581 |
void get_mapped_reads(FILE* bwtf, |
1582 |
HitTable& hits, |
1583 |
HitFactory& hit_factory, |
1584 |
bool strip_slash, |
1585 |
bool verbose) |
1586 |
{ |
1587 |
|
1588 |
|
1589 |
char bwt_buf[2048]; |
1590 |
uint32_t reads_extracted = 0; |
1591 |
|
1592 |
while (fgets(bwt_buf, 2048, bwtf)) |
1593 |
{ |
1594 |
// Chomp the newline |
1595 |
char* nl = strrchr(bwt_buf, '\n'); |
1596 |
if (nl) *nl = 0; |
1597 |
if (*bwt_buf == 0) |
1598 |
continue; |
1599 |
// Get a new record from the tab-delimited Bowtie map |
1600 |
BowtieHit bh; |
1601 |
if (hit_factory.get_hit_from_buf(bwt_buf, bh, strip_slash)) |
1602 |
{ |
1603 |
// Only check uniqueness if these hits are spliced |
1604 |
hits.add_hit(bh, true); |
1605 |
} |
1606 |
reads_extracted++; |
1607 |
} |
1608 |
|
1609 |
// This will sort the map by insert id. |
1610 |
hits.finalize(); |
1611 |
fprintf(stderr, "Extracted %d alignments from Bowtie map\n", reads_extracted); |
1612 |
} |
1613 |
|
1614 |
|
1615 |
/* |
1616 |
AlignStatus status(const BowtieHit* align) |
1617 |
{ |
1618 |
if (!align) |
1619 |
return UNALIGNED; |
1620 |
if (align->contiguous()) |
1621 |
return CONTIGUOUS; |
1622 |
return SPLICED; |
1623 |
} |
1624 |
*/ |
1625 |
|
1626 |
void add_hits_to_coverage(const HitList& hits, vector<unsigned short>& DoC) |
1627 |
{ |
1628 |
int max_hit_pos = -1; |
1629 |
for (size_t i = 0; i < hits.size(); ++i) |
1630 |
{ |
1631 |
max_hit_pos = max((int)hits[i].right(),max_hit_pos); |
1632 |
} |
1633 |
|
1634 |
if ((int)DoC.size() < max_hit_pos) |
1635 |
DoC.resize(max_hit_pos); |
1636 |
|
1637 |
for (size_t i = 0; i < hits.size(); ++i) |
1638 |
{ |
1639 |
const BowtieHit& bh = hits[i]; |
1640 |
|
1641 |
// split up the coverage contibution for this reads |
1642 |
size_t j = bh.left(); |
1643 |
const vector<CigarOp>& cigar = bh.cigar(); |
1644 |
|
1645 |
for (size_t c = 0 ; c < cigar.size(); ++c) |
1646 |
{ |
1647 |
switch(cigar[c].opcode) |
1648 |
{ |
1649 |
case MATCH: |
1650 |
for (size_t m = 0; m < cigar[c].length; ++m) |
1651 |
{ |
1652 |
if (DoC[j + m] < 0xFFFF) |
1653 |
DoC[j + m]++; |
1654 |
} |
1655 |
//fall through this case to REF_SKIP is intentional |
1656 |
case REF_SKIP: |
1657 |
j += cigar[c].length; |
1658 |
break; |
1659 |
default: |
1660 |
break; |
1661 |
} |
1662 |
|
1663 |
} |
1664 |
} |
1665 |
} |
1666 |
|
1667 |
void add_hit_to_coverage(const BowtieHit& bh, vector<unsigned int>& DoC) |
1668 |
{ |
1669 |
if ((int)DoC.size() < bh.right()) |
1670 |
DoC.resize(bh.right()); |
1671 |
|
1672 |
// split up the coverage contibution for this reads |
1673 |
size_t j = bh.left(); |
1674 |
const vector<CigarOp>& cigar = bh.cigar(); |
1675 |
|
1676 |
for (size_t c = 0 ; c < cigar.size(); ++c) |
1677 |
{ |
1678 |
switch(cigar[c].opcode) |
1679 |
{ |
1680 |
case MATCH: |
1681 |
for (size_t m = 0; m < cigar[c].length; ++m) |
1682 |
{ |
1683 |
if (DoC[j + m] < VMAXINT32) |
1684 |
DoC[j + m]++; |
1685 |
} |
1686 |
//fall through this case to REF_SKIP is intentional |
1687 |
case REF_SKIP: |
1688 |
j += cigar[c].length; |
1689 |
break; |
1690 |
default: |
1691 |
break; |
1692 |
} |
1693 |
|
1694 |
} |
1695 |
} |
1696 |
|
1697 |
void print_hit(FILE* fout, |
1698 |
const char* read_name, |
1699 |
const BowtieHit& bh, |
1700 |
const char* ref_name, |
1701 |
const char* sequence, |
1702 |
const char* qualities, |
1703 |
bool from_bowtie) |
1704 |
{ |
1705 |
string seq; |
1706 |
string quals; |
1707 |
if (sequence) |
1708 |
{ |
1709 |
seq = sequence; |
1710 |
quals = qualities; |
1711 |
seq.resize(bh.read_len()); |
1712 |
quals.resize(bh.read_len()); |
1713 |
} |
1714 |
else |
1715 |
{ |
1716 |
seq = "*"; |
1717 |
} |
1718 |
|
1719 |
if (qualities) |
1720 |
{ |
1721 |
quals = qualities; |
1722 |
quals.resize(bh.read_len()); |
1723 |
} |
1724 |
else |
1725 |
{ |
1726 |
quals = "*"; |
1727 |
} |
1728 |
|
1729 |
uint32_t sam_flag = 0; |
1730 |
if (bh.antisense_align()) |
1731 |
{ |
1732 |
sam_flag |= 0x0010; // BAM_FREVERSE |
1733 |
if (sequence && !from_bowtie) // if it is from bowtie hit, it's already reversed. |
1734 |
{ |
1735 |
reverse_complement(seq); |
1736 |
reverse(quals.begin(), quals.end()); |
1737 |
} |
1738 |
} |
1739 |
|
1740 |
uint32_t sam_pos = bh.left() + 1; |
1741 |
uint32_t map_quality = 255; |
1742 |
char cigar[256]; |
1743 |
cigar[0] = 0; |
1744 |
string mate_ref_name = "*"; |
1745 |
uint32_t mate_pos = 0; |
1746 |
uint32_t insert_size = 0; |
1747 |
//string qualities = "*"; |
1748 |
|
1749 |
const vector<CigarOp>& bh_cigar = bh.cigar(); |
1750 |
|
1751 |
/* |
1752 |
* In addition to calculating the cigar string, |
1753 |
* we need to figure out how many in/dels are in the |
1754 |
* sequence, so that we can give the correct |
1755 |
* value for the NM tag |
1756 |
*/ |
1757 |
int indel_distance = 0; |
1758 |
for (size_t c = 0; c < bh_cigar.size(); ++c) |
1759 |
{ |
1760 |
char ibuf[64]; |
1761 |
sprintf(ibuf, "%d", bh_cigar[c].length); |
1762 |
switch(bh_cigar[c].opcode) |
1763 |
{ |
1764 |
case MATCH: |
1765 |
strcat(cigar, ibuf); |
1766 |
strcat(cigar, "M"); |
1767 |
break; |
1768 |
case INS: |
1769 |
strcat(cigar, ibuf); |
1770 |
strcat(cigar, "I"); |
1771 |
indel_distance += bh_cigar[c].length; |
1772 |
break; |
1773 |
case DEL: |
1774 |
strcat(cigar, ibuf); |
1775 |
strcat(cigar, "D"); |
1776 |
indel_distance += bh_cigar[c].length; |
1777 |
break; |
1778 |
case REF_SKIP: |
1779 |
strcat(cigar, ibuf); |
1780 |
strcat(cigar, "N"); |
1781 |
break; |
1782 |
default: |
1783 |
break; |
1784 |
} |
1785 |
} |
1786 |
|
1787 |
//string q = string(bh.read_len(), '!'); |
1788 |
//string s = string(bh.read_len(), 'N'); |
1789 |
|
1790 |
fprintf(fout, |
1791 |
"%s\t%d\t%s\t%d\t%d\t%s\t%s\t%d\t%d\t%s\t%s", |
1792 |
read_name, |
1793 |
sam_flag, |
1794 |
ref_name, |
1795 |
sam_pos, |
1796 |
map_quality, |
1797 |
cigar, |
1798 |
mate_ref_name.c_str(), |
1799 |
mate_pos, |
1800 |
insert_size, |
1801 |
seq.c_str(), |
1802 |
quals.c_str()); |
1803 |
|
1804 |
if (!sam_readgroup_id.empty()) |
1805 |
fprintf(fout, "\tRG:Z:%s", sam_readgroup_id.c_str()); |
1806 |
|
1807 |
fprintf(fout, "\tNM:i:%d", bh.edit_dist() + indel_distance); |
1808 |
|
1809 |
bool containsSplice = false; |
1810 |
for(vector<CigarOp>::const_iterator itr = bh.cigar().begin(); itr != bh.cigar().end(); ++itr){ |
1811 |
if(itr->opcode == REF_SKIP){ |
1812 |
containsSplice = true; |
1813 |
break; |
1814 |
} |
1815 |
} |
1816 |
|
1817 |
if (containsSplice) |
1818 |
fprintf(fout, "\tXS:A:%c", bh.antisense_splice() ? '-' : '+'); |
1819 |
|
1820 |
fprintf(fout, "\n"); |
1821 |
} |
1822 |
|
1823 |
void print_bamhit(GBamWriter& wbam, |
1824 |
const char* read_name, |
1825 |
const BowtieHit& bh, |
1826 |
const char* ref_name, |
1827 |
const char* sequence, |
1828 |
const char* qualities, |
1829 |
bool from_bowtie) |
1830 |
{ |
1831 |
string seq; |
1832 |
string quals; |
1833 |
if (sequence) { |
1834 |
seq = sequence; |
1835 |
quals = qualities; |
1836 |
seq.resize(bh.read_len()); |
1837 |
quals.resize(bh.read_len()); |
1838 |
} |
1839 |
else { |
1840 |
seq = "*"; |
1841 |
} |
1842 |
if (qualities) { |
1843 |
quals = qualities; |
1844 |
quals.resize(bh.read_len()); |
1845 |
} |
1846 |
else |
1847 |
{ |
1848 |
quals = "*"; |
1849 |
} |
1850 |
|
1851 |
uint32_t sam_flag = 0; |
1852 |
if (bh.antisense_align()) |
1853 |
{ |
1854 |
sam_flag |= 0x0010; // BAM_FREVERSE |
1855 |
if (sequence && !from_bowtie) // if it is from bowtie hit, it's already reversed. |
1856 |
{ |
1857 |
reverse_complement(seq); |
1858 |
reverse(quals.begin(), quals.end()); |
1859 |
} |
1860 |
} |
1861 |
|
1862 |
uint32_t sam_pos = bh.left() + 1; |
1863 |
uint32_t map_quality = 255; |
1864 |
char cigar[256]; |
1865 |
cigar[0] = 0; |
1866 |
string mate_ref_name = "*"; |
1867 |
uint32_t mate_pos = 0; |
1868 |
uint32_t insert_size = 0; |
1869 |
//string qualities = "*"; |
1870 |
|
1871 |
const vector<CigarOp>& bh_cigar = bh.cigar(); |
1872 |
/* |
1873 |
* In addition to calculating the cigar string, |
1874 |
* we need to figure out how many in/dels are in the |
1875 |
* sequence, so that we can give the correct |
1876 |
* value for the NM tag |
1877 |
*/ |
1878 |
int indel_distance = 0; |
1879 |
for (size_t c = 0; c < bh_cigar.size(); ++c) |
1880 |
{ |
1881 |
char ibuf[64]; |
1882 |
sprintf(ibuf, "%d", bh_cigar[c].length); |
1883 |
switch(bh_cigar[c].opcode) |
1884 |
{ |
1885 |
case MATCH: |
1886 |
strcat(cigar, ibuf); |
1887 |
strcat(cigar, "M"); |
1888 |
break; |
1889 |
case INS: |
1890 |
strcat(cigar, ibuf); |
1891 |
strcat(cigar, "I"); |
1892 |
indel_distance += bh_cigar[c].length; |
1893 |
break; |
1894 |
case DEL: |
1895 |
strcat(cigar, ibuf); |
1896 |
strcat(cigar, "D"); |
1897 |
indel_distance += bh_cigar[c].length; |
1898 |
break; |
1899 |
case REF_SKIP: |
1900 |
strcat(cigar, ibuf); |
1901 |
strcat(cigar, "N"); |
1902 |
break; |
1903 |
default: |
1904 |
break; |
1905 |
} |
1906 |
} |
1907 |
|
1908 |
//string q = string(bh.read_len(), '!'); |
1909 |
//string s = string(bh.read_len(), 'N'); |
1910 |
/* |
1911 |
fprintf(fout, |
1912 |
"%s\t%d\t%s\t%d\t%d\t%s\t%s\t%d\t%d\t%s\t%s", |
1913 |
read_name, |
1914 |
sam_flag, |
1915 |
ref_name, |
1916 |
sam_pos, |
1917 |
map_quality, |
1918 |
cigar, |
1919 |
mate_ref_name.c_str(), |
1920 |
mate_pos, |
1921 |
insert_size, |
1922 |
seq.c_str(), |
1923 |
quals.c_str()); |
1924 |
|
1925 |
fprintf(fout, "\tNM:i:%d", bh.edit_dist() + indel_distance); |
1926 |
*/ |
1927 |
vector<string> auxdata; |
1928 |
|
1929 |
if (!sam_readgroup_id.empty()) |
1930 |
{ |
1931 |
string nm("RG:Z:"); |
1932 |
nm += sam_readgroup_id; |
1933 |
auxdata.push_back(nm); |
1934 |
} |
1935 |
|
1936 |
string nm("NM:i:"); |
1937 |
str_appendInt(nm, bh.edit_dist() + indel_distance); |
1938 |
auxdata.push_back(nm); |
1939 |
bool containsSplice = false; |
1940 |
for(vector<CigarOp>::const_iterator itr = bh.cigar().begin(); itr != bh.cigar().end(); ++itr) |
1941 |
if(itr->opcode == REF_SKIP){ |
1942 |
containsSplice = true; |
1943 |
break; |
1944 |
} |
1945 |
|
1946 |
if (containsSplice) { |
1947 |
//fprintf(fout, "\tXS:A:%c", bh.antisense_splice() ? '-' : '+'); |
1948 |
nm="XS:A:"; |
1949 |
nm+=(char)(bh.antisense_splice() ? '-' : '+'); |
1950 |
auxdata.push_back(nm); |
1951 |
} |
1952 |
|
1953 |
GBamRecord *brec = wbam.new_record(read_name, sam_flag, ref_name, sam_pos, map_quality, |
1954 |
cigar, mate_ref_name.c_str(), mate_pos, |
1955 |
insert_size, seq.c_str(), quals.c_str(), &auxdata); |
1956 |
|
1957 |
|
1958 |
|
1959 |
wbam.write(brec); |
1960 |
delete brec; |
1961 |
} |
1962 |
|
1963 |
/** |
1964 |
* Print a vector of cigar operations to a file. |
1965 |
* @param bh_cigar A vector of CigarOps |
1966 |
* @return a string representation of the cigar string |
1967 |
*/ |
1968 |
std::string print_cigar(vector<CigarOp>& bh_cigar){ |
1969 |
char cigar[256]; |
1970 |
cigar[0] = 0; |
1971 |
for (size_t c = 0; c < bh_cigar.size(); ++c) |
1972 |
{ |
1973 |
char ibuf[64]; |
1974 |
sprintf(ibuf, "%d", bh_cigar[c].length); |
1975 |
switch(bh_cigar[c].opcode) |
1976 |
{ |
1977 |
case MATCH: |
1978 |
strcat(cigar, ibuf); |
1979 |
strcat(cigar, "M"); |
1980 |
break; |
1981 |
case INS: |
1982 |
strcat(cigar, ibuf); |
1983 |
strcat(cigar, "I"); |
1984 |
break; |
1985 |
case DEL: |
1986 |
strcat(cigar, ibuf); |
1987 |
strcat(cigar, "D"); |
1988 |
break; |
1989 |
case REF_SKIP: |
1990 |
strcat(cigar, ibuf); |
1991 |
strcat(cigar, "N"); |
1992 |
break; |
1993 |
default: |
1994 |
break; |
1995 |
} |
1996 |
} |
1997 |
string result(cigar); |
1998 |
return result; |
1999 |
} |
2000 |
|
2001 |
bool BowtieHit::check_editdist_consistency(const RefSequenceTable& rt) |
2002 |
{ |
2003 |
RefSequenceTable::Sequence* ref_str = rt.get_seq(_ref_id); |
2004 |
if (!ref_str) |
2005 |
return false; |
2006 |
|
2007 |
const seqan::Dna5String ref_seq = seqan::infix(*ref_str, _left, right()); |
2008 |
|
2009 |
size_t pos_seq = 0; |
2010 |
size_t pos_ref = 0; |
2011 |
size_t mismatch = 0; |
2012 |
for (size_t i = 0; i < _cigar.size(); ++i) |
2013 |
{ |
2014 |
CigarOp cigar = _cigar[i]; |
2015 |
switch(cigar.opcode) |
2016 |
{ |
2017 |
case MATCH: |
2018 |
{ |
2019 |
for (size_t j = 0; j < cigar.length; ++j) |
2020 |
{ |
2021 |
if (_seq[pos_seq++] != ref_seq[pos_ref++]) |
2022 |
++mismatch; |
2023 |
} |
2024 |
} |
2025 |
break; |
2026 |
case INS: |
2027 |
{ |
2028 |
pos_seq += cigar.length; |
2029 |
} |
2030 |
break; |
2031 |
|
2032 |
case DEL: |
2033 |
case REF_SKIP: |
2034 |
{ |
2035 |
pos_ref += cigar.length; |
2036 |
} |
2037 |
break; |
2038 |
|
2039 |
default: |
2040 |
break; |
2041 |
} |
2042 |
} |
2043 |
|
2044 |
return mismatch == _edit_dist; |
2045 |
} |