ViewVC Help
View File | Revision Log | Show Annotations | View Changeset | Root Listing
root/gclib/tophat_cpp/bwt_map.cpp
Revision: 69
Committed: Mon Sep 19 21:00:41 2011 UTC (8 years, 8 months ago) by gpertea
File size: 52149 byte(s)
Log Message:
wip - implementing SplicedSAMHitFactory::get_hit_from_buf()

Line File contents
1 /*
2 * bwt_map.cpp
3 * TopHat
4 *
5 * Created by Cole Trapnell on 11/17/08.
6 * Copyright 2008 Cole Trapnell. All rights reserved.
7 *
8 */
9
10 #ifdef HAVE_CONFIG_H
11 #include <config.h>
12 #endif
13
14 #include <cassert>
15 #include <cstdlib>
16 #include <iostream>
17 #include <set>
18 #include <vector>
19 #include <cmath>
20
21 #include "common.h"
22 #include "bwt_map.h"
23 #include "tokenize.h"
24 #include "reads.h"
25
26 using namespace std;
27
28 void HitTable::add_hit(const BowtieHit& bh, bool check_uniqueness)
29 {
30 uint32_t reference_id = bh.ref_id();
31
32 pair<RefHits::iterator, bool> ret =
33 _hits_for_ref.insert(make_pair(reference_id, HitList()));
34 HitList& hl = ret.first->second;
35
36 if (check_uniqueness)
37 {
38 // Check uniqueness, in case we are adding spliced hits from
39 // several spliced alignment sources (e.g. de novo hashing + Bowtie
40 // against a user-supplied index). We don't want to count the same
41 // alignment twice if it happened to be found by more than one method
42 HitList::const_iterator lb = lower_bound(hl.begin(), hl.end(), bh, hit_insert_id_lt);
43 HitList::const_iterator ub = upper_bound(hl.begin(), hl.end(), bh, hit_insert_id_lt);
44 for (; lb != ub && lb != hl.end(); ++lb)
45 {
46 if (*lb == bh)
47 {
48 //fprintf(stderr, "Chucking duplicate read %d by identity\n", bh.insert_id());
49 return;
50 }
51
52 if (lb->insert_id() == bh.insert_id() &&
53 lb->ref_id() == bh.ref_id() &&
54 lb->antisense_align() == bh.antisense_align())
55 {
56 // If we get here, we may be looking at the same alignment
57 // However, spanning_reads may report a shorter, trimmed alignment
58 // so not all fields will be equal. If they just disagree on the
59 // ends, and don't indicate a different junction coord, the
60 // alignments are the same.
61
62 if ((lb->left() <= bh.left() && lb->right() >= bh.right()) ||
63 (bh.left() <= lb->left() && bh.right() >= lb->right()))
64 {
65 vector<pair<int,int> > lb_gaps, bh_gaps;
66 lb->gaps(lb_gaps);
67 bh.gaps(bh_gaps);
68 if (lb_gaps == bh_gaps)
69 {
70 // One alignment is contained in the other, they agree on
71 // where the gaps, if any, are, and they share an id
72 // => this is a redundant aligment, so toss it
73 //fprintf(stderr, "Chucking duplicate read %d by gap agreement\n", bh.insert_id());
74 return;
75 }
76 }
77 }
78 }
79 }
80 _total_hits++;
81 hl.push_back(bh);
82 }
83
84 bool hit_insert_id_lt(const BowtieHit& h1, const BowtieHit& h2)
85 {
86 return h1.insert_id() < h2.insert_id();
87 }
88
89 void LineHitFactory::openStream(HitStream& hs)
90 {
91 if (hs._hit_file==NULL && !hs._hit_file_name.empty()) {
92 //open the file for HitStream here
93 hs._hit_file=fopen(hs._hit_file_name.c_str(),"r");
94 if (hs._hit_file==NULL)
95 err_die("Error opening HitStream file %s\n",hs._hit_file_name.c_str());
96 return;
97 }
98 if (hs._fzpipe!=NULL) {
99 hs._hit_file=hs._fzpipe->file;
100 }
101 }
102
103 void LineHitFactory::rewind(HitStream& hs)
104 {
105 if (hs._fzpipe!=NULL) {
106 hs._fzpipe->rewind();
107 hs._hit_file=hs._fzpipe->file;
108 }
109 else if (hs._hit_file) ::rewind((FILE*)(hs._hit_file));
110 }
111
112 bool LineHitFactory::next_record(HitStream& hs, const char*& buf, size_t& buf_size) {
113 FILE* f=(FILE *)(hs._hit_file);
114 bool new_rec = (fgets(_hit_buf, _hit_buf_max_sz - 1, f)!=NULL);
115 if (!new_rec || feof(f)) {
116 hs._eof=true;
117 return false;
118 }
119 ++_line_num;
120 char* nl = strrchr(_hit_buf, '\n');
121 if (nl) *nl = 0;
122 buf = _hit_buf;
123 buf_size = _hit_buf_max_sz - 1;
124 return true;
125 }
126
127 void LineHitFactory::closeStream(HitStream& hs) {
128 if (hs._fzpipe!=NULL) {
129 hs._fzpipe->close();
130 return;
131 }
132 if (hs._hit_file!=NULL) {
133 fclose((FILE*)(hs._hit_file));
134 hs._hit_file=NULL;
135 }
136 }
137 void BAMHitFactory::openStream(HitStream& hs) {
138 if (hs._hit_file==NULL) {
139 if (hs._hit_file_name.empty())
140 //err_die("Error: invalid HitStream set for BAMHitFactory(file name missing)\n");
141 return; //invalid stream, could be just a place holder
142 //open the file here if not already open
143 string fext=getFext(hs._hit_file_name);
144 if (fext=="sam")
145 hs._hit_file = samopen(hs._hit_file_name.c_str(), "r", 0);
146 else
147 hs._hit_file = samopen(hs._hit_file_name.c_str(), "rb", 0);
148
149 samfile_t* sam_file=(samfile_t*)(hs._hit_file);
150
151 if (sam_file == NULL)
152 err_die("Error opening SAM file %s\n", hs._hit_file_name.c_str());
153 if (sam_file->header == NULL)
154 err_die("Error: no SAM header found for file %s\n", hs._hit_file_name.c_str());
155 memset(&_next_hit, 0, sizeof(_next_hit));
156 //_beginning = bgzf_tell(sam_file->x.bam);
157 _sam_header=sam_file->header;
158 if (inspect_header(hs) == false)
159 err_die("Error: invalid SAM header for file %s\n",
160 hs._hit_file_name.c_str());
161 }
162 }
163
164 void BAMHitFactory::closeStream(HitStream& hs) {
165 if (hs._hit_file) {
166 samclose((samfile_t*)(hs._hit_file));
167 }
168 hs._hit_file=NULL;
169 _sam_header=NULL;
170 }
171
172 void BAMHitFactory::rewind(HitStream& hs)
173 {
174 /*
175 if (_hit_file && ((samfile_t*)_hit_file)->x.bam)
176 {
177 bgzf_seek(((samfile_t*)_hit_file)->x.bam, _beginning, SEEK_SET);
178 _eof = false;
179 }
180 */
181 this->closeStream(hs);
182 this->openStream(hs);
183 }
184
185 string BAMHitFactory::hitfile_rec(HitStream& hs, const char* hit_buf) {
186 const bam1_t* bamrec=(const bam1_t*)hit_buf;
187 char* tamline=bam_format1(((samfile_t*)(hs._hit_file))->header, bamrec);
188 string sam_line(tamline);
189 free(tamline);
190 return sam_line;
191 }
192
193 bool BAMHitFactory::next_record(HitStream& hs, const char*& buf, size_t& buf_size) {
194 if (_next_hit.data) {
195 free(_next_hit.data);
196 _next_hit.data = NULL;
197 }
198 _sam_header=((samfile_t*)(hs._hit_file))->header; //needed by get_hit_from_buf later on
199 if (hs.eof() || !hs.ready()) return false;
200
201 //mark_curr_pos();
202
203 memset(&_next_hit, 0, sizeof(_next_hit));
204
205 int bytes_read = samread((samfile_t*)(hs._hit_file), &_next_hit);
206 if (bytes_read <= 0) {
207 hs._eof = true;
208 return false;
209 }
210 buf = (const char*)&_next_hit;
211 buf_size = bytes_read;
212 return true;
213 }
214
215
216 BowtieHit HitFactory::create_hit(const string& insert_name,
217 const string& ref_name,
218 int left,
219 const vector<CigarOp>& cigar,
220 bool antisense_aln,
221 bool antisense_splice,
222 unsigned char edit_dist,
223 unsigned char splice_mms,
224 bool end)
225 {
226 uint64_t insert_id = _insert_table.get_id(insert_name);
227 uint32_t reference_id = _ref_table.get_id(ref_name, NULL, 0);
228
229 return BowtieHit(reference_id,
230 insert_id,
231 left,
232 cigar,
233 antisense_aln,
234 antisense_splice,
235 edit_dist,
236 splice_mms,
237 end);
238 }
239
240 BowtieHit HitFactory::create_hit(const string& insert_name,
241 const string& ref_name,
242 uint32_t left,
243 uint32_t read_len,
244 bool antisense_aln,
245 unsigned char edit_dist,
246 bool end)
247 {
248 uint64_t insert_id = _insert_table.get_id(insert_name);
249 uint32_t reference_id = _ref_table.get_id(ref_name, NULL, 0);
250
251 return BowtieHit(reference_id,
252 insert_id,
253 left,
254 read_len,
255 antisense_aln,
256 edit_dist,
257 end);
258 }
259
260 bool BowtieHitFactory::get_hit_from_buf(const char* orig_bwt_buf,
261 BowtieHit& bh,
262 bool strip_slash,
263 char* name_out,
264 char* name_tags,
265 char* seq,
266 char* qual)
267 {
268 if (!orig_bwt_buf || !*orig_bwt_buf)
269 return false;
270
271 static const int buf_size = 2048;
272
273 char bwt_buf[buf_size];
274 strcpy(bwt_buf, orig_bwt_buf);
275 //const char* bwt_fmt_str = "%s %c %s %d %s %s %d %s";
276
277 char orientation;
278
279 //int bwtf_ret = 0;
280 //uint32_t seqid = 0;
281
282 char text_name[buf_size];
283 unsigned int text_offset;
284
285
286 //unsigned int other_occs;
287 char mismatches[buf_size];
288 //memset(mismatches, 0, sizeof(mismatches));
289
290 const char* buf = bwt_buf;
291 char* name = get_token((char**)&buf,"\t");
292 char* orientation_str = get_token((char**)&buf,"\t");
293 char* text_name_str = get_token((char**)&buf,"\t");
294
295 strcpy(text_name, text_name_str);
296
297 char* text_offset_str = get_token((char**)&buf,"\t");
298 char* seq_str = get_token((char**)&buf,"\t");
299 if (seq)
300 strcpy(seq, seq_str);
301
302 const char* qual_str = get_token((char**)&buf,"\t");
303 if (qual)
304 strcpy(qual, qual_str);
305
306 /*const char* other_occs_str =*/ get_token((char**)&buf, "\t");
307 mismatches[0] = 0;
308 char* mismatches_str = get_token((char**)&buf, "\t");
309 if (mismatches_str)
310 strcpy(mismatches, mismatches_str);
311
312 orientation = orientation_str[0];
313 text_offset = atoi(text_offset_str);
314
315 bool end = true;
316 unsigned int seg_offset = 0;
317 unsigned int seg_num = 0;
318 unsigned int num_segs = 0;
319
320 // Copy the tag out of the name field before we might wipe it out
321 char* pipe = strrchr(name, '|');
322 if (pipe)
323 {
324 if (name_tags)
325 strcpy(name_tags, pipe);
326
327 char* tag_buf = pipe + 1;
328 if (strchr(tag_buf, ':'))
329 {
330 sscanf(tag_buf, "%u:%u:%u", &seg_offset, &seg_num, &num_segs);
331 if (seg_num + 1 == num_segs)
332 end = true;
333 else
334 end = false;
335 }
336
337 *pipe = 0;
338 }
339 // Stripping the slash and number following it gives the insert name
340 char* slash = strrchr(name, '/');
341 if (strip_slash && slash)
342 *slash = 0;
343
344 size_t read_len = strlen(seq_str);
345
346 // Add this alignment to the table of hits for this half of the
347 // Bowtie map
348
349 //vector<string> mismatch_toks;
350 char* pch = strtok (mismatches,",");
351 unsigned char num_mismatches = 0;
352 while (pch != NULL)
353 {
354 char* colon = strchr(pch, ':');
355 if (colon)
356 {
357 num_mismatches++;
358 }
359 //mismatch_toks.push_back(pch);
360 pch = strtok (NULL, ",");
361 }
362
363 bh = create_hit(name,
364 text_name,
365 text_offset,
366 (int)read_len,
367 orientation == '-',
368 num_mismatches,
369 end);
370
371 return true;
372 }
373
374 int anchor_mismatch = 0;
375
376 bool SplicedBowtieHitFactory::get_hit_from_buf(const char* orig_bwt_buf,
377 BowtieHit& bh,
378 bool strip_slash,
379 char* name_out,
380 char* name_tags,
381 char* seq,
382 char* qual)
383 {
384 if (!orig_bwt_buf || !*orig_bwt_buf)
385 return false;
386
387 static const int buf_size = 2048;
388
389 char bwt_buf[buf_size];
390 strcpy(bwt_buf, orig_bwt_buf);
391 //const char* bwt_fmt_str = "%s %c %s %d %s %s %d %s";
392
393 char orientation;
394 char text_name[buf_size];
395 unsigned int text_offset;
396 char mismatches[buf_size];
397 //memset(mismatches, 0, sizeof(mismatches));
398
399 char* buf = bwt_buf;
400 char* name = get_token((char**)&buf,"\t");
401 char* orientation_str = get_token((char**)&buf,"\t");
402 char* text_name_str = get_token((char**)&buf,"\t");
403 strcpy(text_name, text_name_str);
404
405 char* text_offset_str = get_token((char**)&buf,"\t");
406 char* seq_str = get_token((char**)&buf,"\t");
407 if (seq)
408 strcpy(seq, seq_str);
409
410 const char* qual_str = get_token((char**)&buf,"\t");
411 if (qual)
412 strcpy(qual, qual_str);
413
414 /*const char* other_occs_str =*/ get_token((char**)&buf, "\t");
415 mismatches[0] = 0;
416 char* mismatches_str = get_token((char**)&buf, "\t");
417 if (mismatches_str)
418 strcpy(mismatches, mismatches_str);
419
420 orientation = orientation_str[0];
421 text_offset = atoi(text_offset_str);
422
423 bool end = true;
424 unsigned int seg_offset = 0;
425 unsigned int seg_num = 0;
426 unsigned int num_segs = 0;
427
428 // Copy the tag out of the name field before we might wipe it out
429 char* pipe = strrchr(name, '|');
430 if (pipe)
431 {
432 if (name_tags)
433 strcpy(name_tags, pipe);
434
435 char* tag_buf = pipe + 1;
436 if (strchr(tag_buf, ':'))
437 {
438 sscanf(tag_buf, "%u:%u:%u", &seg_offset, &seg_num, &num_segs);
439 if (seg_num + 1 == num_segs)
440 end = true;
441 else
442 end = false;
443 }
444
445 *pipe = 0;
446 }
447 // Stripping the slash and number following it gives the insert name
448 char* slash = strrchr(name, '/');
449 if (strip_slash && slash)
450 *slash = 0;
451
452 //int read_len = strlen(sequence);
453
454 // Add this alignment to the table of hits for this half of the
455 // Bowtie map
456
457 // Parse the text_name field to recover the splice coords
458 vector<string> toks;
459
460 tokenize_strict(text_name, "|", toks);
461
462 int num_extra_toks = (int)toks.size() - 6;
463
464 if (num_extra_toks >= 0)
465 {
466 static const uint8_t left_window_edge_field = 1;
467 static const uint8_t splice_field = 2;
468 //static const uint8_t right_window_edge_field = 3;
469 static const uint8_t junction_type_field = 4;
470 static const uint8_t strand_field = 5;
471
472 string contig = toks[0];
473 for (int t = 1; t <= num_extra_toks; ++t)
474 {
475 contig += "|";
476 contig += toks[t];
477 }
478
479 vector<string> splice_toks;
480 tokenize(toks[num_extra_toks + splice_field], "-", splice_toks);
481 if (splice_toks.size() != 2)
482 {
483 fprintf(stderr, "Warning: found malformed splice record, skipping:\n");
484 //fprintf(stderr, "%s (token: %s)\n", text_name,
485 // toks[num_extra_toks + splice_field].c_str());
486 return false;
487 }
488
489 //
490 // check for an insertion hit
491 //
492 if(toks[num_extra_toks + junction_type_field] == "ins")
493 {
494 int8_t spliced_read_len = strlen(seq_str);
495 /*
496 * The 0-based position of the left edge of the alignment. Note that this
497 * value may need to be futher corrected to account for the presence of
498 * of the insertion.
499 */
500 uint32_t left = atoi(toks[num_extra_toks + left_window_edge_field].c_str()) + text_offset;
501 uint32_t right = left + spliced_read_len - 1;
502
503 /*
504 * The 0-based position of the last genomic sequence before the insertion
505 */
506 uint32_t left_splice_pos = atoi(splice_toks[0].c_str());
507
508 string insertedSequence = splice_toks[1];
509 /*
510 * The 0-based position of the first genomic sequence after teh insertion
511 */
512 uint32_t right_splice_pos = left_splice_pos + 1;
513 if(left > left_splice_pos){
514 /*
515 * The genomic position of the left edge of the alignment needs to be corrected
516 * If the alignment does not extend into the insertion, simply subtract the length
517 * of the inserted sequence, otherwise, just set it equal to the right edge
518 */
519 left = left - insertedSequence.length();
520 if(left < right_splice_pos){
521 left = right_splice_pos;
522 }
523 }
524 if(right > left_splice_pos){
525 right = right - insertedSequence.length();
526 if(right < left_splice_pos){
527 right = left_splice_pos;
528 }
529 }
530 /*
531 * Now, right and left should be properly transformed into genomic coordinates
532 * We should be able to deduce how much the alignment matches the insertion
533 * simply based on the length of the read
534 */
535 int left_match_length = 0;
536 if(left <= left_splice_pos){
537 left_match_length = left_splice_pos - left + 1;
538 }
539 int right_match_length = 0;
540 if(right >= right_splice_pos){
541 right_match_length = right - right_splice_pos + 1;
542 }
543 int insertion_match_length = spliced_read_len - left_match_length - right_match_length;
544
545 if(left_match_length <= 0 || right_match_length <= 0 || insertion_match_length <= 0)
546 return false;
547
548 string junction_strand = toks[num_extra_toks + strand_field];
549 if(junction_strand != "rev" && junction_strand != "fwd"){
550 fprintf(stderr, "Malformed insertion record\n");
551 return false;
552 }
553
554 char* pch = strtok( mismatches, ",");
555 unsigned char num_mismatches = 0;
556
557 /*
558 * remember that text_offset holds the left end of the
559 * alignment, relative to the start of the contig
560 */
561
562 /*
563 * The 0-based relative position of the left-most character
564 * before the insertion in the contig
565 */
566 int relative_splice_pos = left_splice_pos - atoi(toks[num_extra_toks + left_window_edge_field].c_str());
567 while (pch != NULL)
568 {
569 char* colon = strchr(pch, ':');
570 if (colon)
571 {
572 *colon = 0;
573 int mismatch_pos = atoi(pch);
574
575 /*
576 * for reversely mapped reads,
577 * find the correct mismatched position.
578 */
579 if(orientation == '-'){
580 mismatch_pos = spliced_read_len - mismatch_pos - 1;
581 }
582
583 /*
584 * Only count mismatches outside of the insertion region
585 * If there is a mismatch within the insertion,
586 * disallow this hit
587 */
588 if(mismatch_pos + text_offset <= relative_splice_pos || mismatch_pos + text_offset > relative_splice_pos + insertedSequence.length()){
589 num_mismatches++;
590 }else{
591 return false;
592 }
593 }
594 pch = strtok (NULL, ",");
595 }
596
597
598 vector<CigarOp> cigar;
599 cigar.push_back(CigarOp(MATCH, left_match_length));
600 cigar.push_back(CigarOp(INS, insertion_match_length));
601 cigar.push_back(CigarOp(MATCH, right_match_length));
602
603 /*
604 * For now, disallow hits that don't span
605 * the insertion. If we allow these types of hits,
606 * then long_spanning.cpp needs to be updated
607 * in order to intelligently merge these kinds
608 * of reads back together
609 *
610 * Following code has been changed to allow segment that end
611 * in an insertion
612 */
613 bh = create_hit(name,
614 contig,
615 left,
616 cigar,
617 orientation == '-',
618 junction_strand == "rev",
619 num_mismatches,
620 0,
621 end);
622 return true;
623 }
624
625 else
626 {
627 uint32_t left = atoi(toks[num_extra_toks + left_window_edge_field].c_str()) + text_offset;
628 int spliced_read_len = strlen(seq_str);
629 int8_t left_splice_pos = atoi(splice_toks[0].c_str()) - left + 1;
630 if(left_splice_pos > spliced_read_len) left_splice_pos = spliced_read_len;
631 int8_t right_splice_pos = spliced_read_len - left_splice_pos;
632
633 if (right_splice_pos <= 0 || left_splice_pos <= 0)
634 return false;
635
636 if (orientation == '+')
637 {
638 if (left_splice_pos + seg_offset < _anchor_length){
639 return false;
640 }
641 }
642 else
643 {
644 if (right_splice_pos + seg_offset < _anchor_length)
645 return false;
646 }
647 //uint32_t right = atoi(splice_toks[1].c_str()) + right_splice_pos;
648 //atoi(toks[right_window_edge_field].c_str());
649 int gap_len = atoi(splice_toks[1].c_str()) - atoi(splice_toks[0].c_str()) - 1;
650
651 string junction_strand = toks[num_extra_toks + strand_field];
652 if (!(junction_strand == "rev" || junction_strand == "fwd")||
653 !(orientation == '-' || orientation == '+'))
654 {
655 fprintf(stderr, "Warning: found malformed splice record, skipping\n");
656 //fprintf(stderr, "junction_strand=%s, orientation='%c'\n",
657 // junction_strand.c_str(), orientation);
658 return false;
659 }
660
661 //vector<string> mismatch_toks;
662 char* pch = strtok (mismatches,",");
663 int mismatches_in_anchor = 0;
664 unsigned char num_mismatches = 0;
665 while (pch != NULL)
666 {
667 char* colon = strchr(pch, ':');
668 if (colon)
669 {
670 *colon = 0;
671 num_mismatches++;
672 int mismatch_pos = atoi(pch);
673 if ((orientation == '+' && abs(mismatch_pos - left_splice_pos) < (int)min_anchor_len) ||
674 (orientation == '-' && abs(((int)spliced_read_len - left_splice_pos + 1) - mismatch_pos)) < (int)min_anchor_len)
675 mismatches_in_anchor++;
676 }
677 //mismatch_toks.push_back(pch);
678 pch = strtok (NULL, ",");
679 }
680
681 // FIXME: we probably should exclude these hits somewhere, but this
682 // isn't the right place
683 vector<CigarOp> cigar;
684 cigar.push_back(CigarOp(MATCH, left_splice_pos));
685 if(toks[num_extra_toks + junction_type_field] == "del"){
686 cigar.push_back(CigarOp(DEL, gap_len));
687 }else{
688 cigar.push_back(CigarOp(REF_SKIP, gap_len));
689 }
690 cigar.push_back(CigarOp(MATCH, right_splice_pos));
691
692 bh = create_hit(name,
693 contig,
694 left,
695 cigar,
696 orientation == '-',
697 junction_strand == "rev",
698 num_mismatches,
699 mismatches_in_anchor,
700 end);
701 return true;
702 }
703 }
704 else
705 {
706 fprintf(stderr, "Warning: found malformed splice record, skipping\n");
707 //fprintf(stderr, "%s\n", orig_bwt_buf);
708 // continue;
709 return false;
710 }
711
712 return false;
713 }
714
715 bool SAMHitFactory::get_hit_from_buf(const char* orig_bwt_buf,
716 BowtieHit& bh,
717 bool strip_slash,
718 char* name_out,
719 char* name_tags,
720 char* seq,
721 char* qual)
722 {
723 if (!orig_bwt_buf || !*orig_bwt_buf)
724 return false;
725 char bwt_buf[2048];
726 strcpy(bwt_buf, orig_bwt_buf);
727 // Are we still in the header region?
728 if (bwt_buf[0] == '@')
729 return false;
730
731 char* buf = bwt_buf;
732 char* name = get_token((char**)&buf,"\t");
733 char* sam_flag_str = get_token((char**)&buf,"\t");
734 char* text_name = get_token((char**)&buf,"\t");
735 char* text_offset_str = get_token((char**)&buf,"\t");
736 const char* map_qual_str = get_token((char**)&buf,"\t");
737 char* cigar_str = get_token((char**)&buf,"\t");
738 const char* mate_ref_str = get_token((char**)&buf,"\t");
739 const char* mate_pos_str = get_token((char**)&buf,"\t");
740 const char* inferred_insert_sz_str = get_token((char**)&buf,"\t");
741
742 const char* seq_str = get_token((char**)&buf,"\t");
743 if (seq)
744 strcpy(seq, seq_str);
745
746 const char* qual_str = get_token((char**)&buf,"\t");
747 if (qual)
748 strcpy(qual, qual_str);
749
750 if (!name ||
751 !sam_flag_str ||
752 !text_name ||
753 !text_offset_str ||
754 !map_qual_str ||
755 !cigar_str ||
756 !mate_ref_str ||
757 !mate_pos_str ||
758 !inferred_insert_sz_str ||
759 !seq_str ||
760 !qual_str)
761 {
762 // truncated or malformed SAM record
763 return false;
764 }
765
766
767 int sam_flag = atoi(sam_flag_str);
768 int text_offset = atoi(text_offset_str);
769
770 bool end = true;
771 unsigned int seg_offset = 0;
772 unsigned int seg_num = 0;
773 unsigned int num_segs = 0;
774
775 // Copy the tag out of the name field before we might wipe it out
776 char* pipe = strrchr(name, '|');
777 if (pipe)
778 {
779 if (name_tags)
780 strcpy(name_tags, pipe);
781
782 char* tag_buf = pipe + 1;
783 if (strchr(tag_buf, ':'))
784 {
785 sscanf(tag_buf, "%u:%u:%u", &seg_offset, &seg_num, &num_segs);
786 if (seg_num + 1 == num_segs)
787 end = true;
788 else
789 end = false;
790 }
791
792 *pipe = 0;
793 }
794
795 // Stripping the slash and number following it gives the insert name
796 char* slash = strrchr(name, '/');
797 if (strip_slash && slash)
798 *slash = 0;
799
800 char* p_cig = cigar_str;
801 //int len = strlen(sequence);
802 vector<CigarOp> cigar;
803 bool spliced_alignment = false;
804 // Mostly pilfered direct from the SAM tools:
805 while (*p_cig)
806 {
807 char* t;
808 int length = (int)strtol(p_cig, &t, 10);
809 if (length <= 0)
810 {
811 //fprintf (stderr, "CIGAR op has zero length\n");
812 return false;
813 }
814 char op_char = toupper(*t);
815 CigarOpCode opcode;
816 if (op_char == 'M') opcode = MATCH;
817 else if (op_char == 'I') opcode = INS;
818 else if (op_char == 'D') opcode = DEL;
819 else if (op_char == 'N')
820 {
821 if (length > max_report_intron_length)
822 return false;
823 opcode = REF_SKIP;
824 spliced_alignment = true;
825 }
826 else if (op_char == 'S') opcode = SOFT_CLIP;
827 else if (op_char == 'H') opcode = HARD_CLIP;
828 else if (op_char == 'P') opcode = PAD;
829 else
830 {
831 fprintf (stderr, "invalid CIGAR operation\n");
832 return false;
833 }
834 p_cig = t + 1;
835 //i += length;
836 cigar.push_back(CigarOp(opcode, length));
837 }
838 if (*p_cig)
839 {
840 fprintf (stderr, "unmatched CIGAR operation\n");
841 return false;
842 }
843
844 //vector<string> attributes;
845 //tokenize(tag_buf, " \t",attributes);
846
847 bool antisense_splice = false;
848 unsigned char num_mismatches = 0;
849 unsigned char num_splice_anchor_mismatches = 0;
850 const char* tag_buf = buf;
851
852 // while((tag_buf = get_token((char**)&buf,"\t")))
853 // {
854 // vector<string> tuple_fields;
855 // tokenize(tag_buf,":", tuple_fields);
856 // if (tuple_fields.size() == 3)
857 // {
858 // if (tuple_fields[0] == "XS")
859 // {
860 // if (tuple_fields[2] == "-")
861 // antisense_splice = true;
862 // }
863 // else if (tuple_fields[0] == "NM")
864 // {
865 // num_mismatches = atoi(tuple_fields[2].c_str());
866 // }
867 // else if (tuple_fields[0] == "NS")
868 // {
869 // num_splice_anchor_mismatches = atoi(tuple_fields[2].c_str());
870 // }
871 // else
872 // {
873 // fprintf(stderr, "%s attribute not supported\n", tuple_fields[0].c_str());
874 // return false;
875 // }
876 // }
877 // }
878
879 while((tag_buf = get_token((char**)&buf,"\t")))
880 {
881 vector<string> tuple_fields;
882 tokenize(tag_buf,":", tuple_fields);
883 if (tuple_fields.size() == 3)
884 {
885 if (tuple_fields[0] == "XS")
886 {
887 if (tuple_fields[2] == "-")
888 antisense_splice = true;
889 }
890 else if (tuple_fields[0] == "NM")
891 {
892 num_mismatches = atoi(tuple_fields[2].c_str());
893 }
894 else if (tuple_fields[0] == "NS")
895 {
896 //ignored for now
897 }
898 else
899 {
900 //fprintf(stderr, "%s attribute not supported\n", tuple_fields[0].c_str());
901 //return false;
902 }
903 }
904 }
905
906
907 /*
908 * By convention,the NM field of the SAM record
909 * counts an insertion or deletion. I dont' think
910 * we want the mismatch count in the BowtieHit
911 * record to reflect this. Therefore, subtract out
912 * the mismatches due to in/dels
913 */
914 for(vector<CigarOp>::const_iterator itr = cigar.begin(); itr != cigar.end(); ++itr){
915 if(itr->opcode == INS){
916 num_mismatches -= itr->length;
917 }
918 if(itr->opcode == DEL){
919 num_mismatches -= itr->length;
920 }
921 }
922 // vector<string> toks;
923 // tokenize(tag_buf, ":", toks);
924 if (spliced_alignment)
925 {
926 bh = create_hit(name,
927 text_name,
928 text_offset - 1,
929 cigar,
930 sam_flag & 0x0010,
931 antisense_splice,
932 num_mismatches,
933 num_splice_anchor_mismatches,
934 end);
935 return true;
936
937 }
938 else
939 {
940 //assert(cigar.size() == 1 && cigar[0].opcode == MATCH);
941
942 bh = create_hit(name,
943 text_name,
944 text_offset - 1, // SAM files are 1-indexed
945 cigar,
946 sam_flag & 0x0010,
947 false,
948 num_mismatches,
949 0,
950 end);
951 return true;
952 }
953
954 return false;
955 }
956
957 bool SplicedSAMHitFactory::get_hit_from_buf(const char* orig_bwt_buf,
958 BowtieHit& bh,
959 bool strip_slash,
960 char* name_out,
961 char* name_tags,
962 char* seq,
963 char* qual)
964 {
965 if (!orig_bwt_buf || !*orig_bwt_buf)
966 return false;
967
968 char bwt_buf[2048];
969 strcpy(bwt_buf, orig_bwt_buf);
970 // Are we still in the header region?
971 if (bwt_buf[0] == '@')
972 return false;
973
974 //char orientation;
975 /*
976 char mismatches[buf_size];
977
978 //char* orientation_str = get_token((char**)&buf,"\t");
979
980 mismatches[0] = 0;
981 char* mismatches_str = get_token((char**)&buf, "\t");
982 if (mismatches_str)
983 strcpy(mismatches, mismatches_str);
984
985 orientation = orientation_str[0];
986 text_offset = atoi(text_offset_str);
987 */
988
989 char* buf = bwt_buf;
990 char* name = get_token((char**)&buf,"\t");
991 char* sam_flag_str = get_token((char**)&buf,"\t");
992 char* text_name = get_token((char**)&buf,"\t");
993 char* text_offset_str = get_token((char**)&buf,"\t");
994 const char* map_qual_str = get_token((char**)&buf,"\t");
995 char* cigar_str = get_token((char**)&buf,"\t");
996 const char* mate_ref_str = get_token((char**)&buf,"\t");
997 const char* mate_pos_str = get_token((char**)&buf,"\t");
998 const char* inferred_insert_sz_str = get_token((char**)&buf,"\t");
999
1000 const char* seq_str = get_token((char**)&buf,"\t");
1001 if (seq)
1002 strcpy(seq, seq_str);
1003
1004 const char* qual_str = get_token((char**)&buf,"\t");
1005 if (qual)
1006 strcpy(qual, qual_str);
1007
1008 if (!name ||
1009 !sam_flag_str ||
1010 !text_name ||
1011 !text_offset_str ||
1012 !map_qual_str ||
1013 !cigar_str ||
1014 !mate_ref_str ||
1015 !mate_pos_str ||
1016 !inferred_insert_sz_str ||
1017 !seq_str ||
1018 !qual_str)
1019 {
1020 // truncated or malformed SAM record
1021 return false;
1022 }
1023
1024
1025 int sam_flag = atoi(sam_flag_str);
1026 int text_offset = atoi(text_offset_str);
1027
1028 //##############################################
1029 bool end = true;
1030 unsigned int seg_offset = 0;
1031 unsigned int seg_num = 0;
1032 unsigned int num_segs = 0;
1033
1034 // Copy the tag out of the name field before we might wipe it out
1035 char* pipe = strrchr(name, '|');
1036 if (pipe)
1037 {
1038 if (name_tags)
1039 strcpy(name_tags, pipe);
1040
1041 char* tag_buf = pipe + 1;
1042 if (strchr(tag_buf, ':'))
1043 {
1044 sscanf(tag_buf, "%u:%u:%u", &seg_offset, &seg_num, &num_segs);
1045 if (seg_num + 1 == num_segs)
1046 end = true;
1047 else
1048 end = false;
1049 }
1050
1051 *pipe = 0;
1052 }
1053 // Stripping the slash and number following it gives the insert name
1054 char* slash = strrchr(name, '/');
1055 if (strip_slash && slash)
1056 *slash = 0;
1057
1058 //int read_len = strlen(sequence);
1059
1060 // Add this alignment to the table of hits for this half of the
1061 // Bowtie map
1062
1063 // Parse the text_name field to recover the splice coords
1064 vector<string> toks;
1065
1066 tokenize_strict(text_name, "|", toks);
1067
1068 int num_extra_toks = (int)toks.size() - 6;
1069
1070 if (num_extra_toks >= 0)
1071 {
1072 static const uint8_t left_window_edge_field = 1;
1073 static const uint8_t splice_field = 2;
1074 //static const uint8_t right_window_edge_field = 3;
1075 static const uint8_t junction_type_field = 4;
1076 static const uint8_t strand_field = 5;
1077
1078 string contig = toks[0];
1079 for (int t = 1; t <= num_extra_toks; ++t)
1080 {
1081 contig += "|";
1082 contig += toks[t];
1083 }
1084
1085 vector<string> splice_toks;
1086 tokenize(toks[num_extra_toks + splice_field], "-", splice_toks);
1087 if (splice_toks.size() != 2)
1088 {
1089 fprintf(stderr, "Warning: found malformed splice record, skipping:\n");
1090 //fprintf(stderr, "%s (token: %s)\n", text_name,
1091 // toks[num_extra_toks + splice_field].c_str());
1092 return false;
1093 }
1094
1095 //
1096 // check for an insertion hit
1097 //
1098 if(toks[num_extra_toks + junction_type_field] == "ins")
1099 {
1100 int8_t spliced_read_len = strlen(seq_str);
1101 /*
1102 * The 0-based position of the left edge of the alignment. Note that this
1103 * value may need to be futher corrected to account for the presence of
1104 * of the insertion.
1105 */
1106 uint32_t left = atoi(toks[num_extra_toks + left_window_edge_field].c_str()) + text_offset;
1107 uint32_t right = left + spliced_read_len - 1;
1108
1109 /*
1110 * The 0-based position of the last genomic sequence before the insertion
1111 */
1112 uint32_t left_splice_pos = atoi(splice_toks[0].c_str());
1113
1114 string insertedSequence = splice_toks[1];
1115 /*
1116 * The 0-based position of the first genomic sequence after teh insertion
1117 */
1118 uint32_t right_splice_pos = left_splice_pos + 1;
1119 if(left > left_splice_pos){
1120 /*
1121 * The genomic position of the left edge of the alignment needs to be corrected
1122 * If the alignment does not extend into the insertion, simply subtract the length
1123 * of the inserted sequence, otherwise, just set it equal to the right edge
1124 */
1125 left = left - insertedSequence.length();
1126 if(left < right_splice_pos){
1127 left = right_splice_pos;
1128 }
1129 }
1130 if(right > left_splice_pos){
1131 right = right - insertedSequence.length();
1132 if(right < left_splice_pos){
1133 right = left_splice_pos;
1134 }
1135 }
1136 /*
1137 * Now, right and left should be properly transformed into genomic coordinates
1138 * We should be able to deduce how much the alignment matches the insertion
1139 * simply based on the length of the read
1140 */
1141 int left_match_length = 0;
1142 if(left <= left_splice_pos){
1143 left_match_length = left_splice_pos - left + 1;
1144 }
1145 int right_match_length = 0;
1146 if(right >= right_splice_pos){
1147 right_match_length = right - right_splice_pos + 1;
1148 }
1149 int insertion_match_length = spliced_read_len - left_match_length - right_match_length;
1150
1151 if(left_match_length <= 0 || right_match_length <= 0 || insertion_match_length <= 0)
1152 return false;
1153
1154 string junction_strand = toks[num_extra_toks + strand_field];
1155 if(junction_strand != "rev" && junction_strand != "fwd"){
1156 fprintf(stderr, "Malformed insertion record\n");
1157 return false;
1158 }
1159
1160 char* pch = strtok( mismatches, ",");
1161 unsigned char num_mismatches = 0;
1162
1163 /*
1164 * remember that text_offset holds the left end of the
1165 * alignment, relative to the start of the contig
1166 */
1167
1168 /*
1169 * The 0-based relative position of the left-most character
1170 * before the insertion in the contig
1171 */
1172 int relative_splice_pos = left_splice_pos - atoi(toks[num_extra_toks + left_window_edge_field].c_str());
1173 while (pch != NULL)
1174 {
1175 char* colon = strchr(pch, ':');
1176 if (colon)
1177 {
1178 *colon = 0;
1179 int mismatch_pos = atoi(pch);
1180
1181 /*
1182 * for reversely mapped reads,
1183 * find the correct mismatched position.
1184 */
1185 if (sam_flag & 0x0010){
1186 mismatch_pos = spliced_read_len - mismatch_pos - 1;
1187 }
1188
1189 /*
1190 * Only count mismatches outside of the insertion region
1191 * If there is a mismatch within the insertion,
1192 * disallow this hit
1193 */
1194 if(mismatch_pos + text_offset <= relative_splice_pos || mismatch_pos + text_offset > relative_splice_pos + insertedSequence.length()){
1195 num_mismatches++;
1196 }else{
1197 return false;
1198 }
1199 }
1200 pch = strtok (NULL, ",");
1201 }
1202
1203
1204 vector<CigarOp> cigar;
1205 cigar.push_back(CigarOp(MATCH, left_match_length));
1206 cigar.push_back(CigarOp(INS, insertion_match_length));
1207 cigar.push_back(CigarOp(MATCH, right_match_length));
1208
1209 /*
1210 * For now, disallow hits that don't span
1211 * the insertion. If we allow these types of hits,
1212 * then long_spanning.cpp needs to be updated
1213 * in order to intelligently merge these kinds
1214 * of reads back together
1215 *
1216 * Following code has been changed to allow segment that end
1217 * in an insertion
1218 */
1219 bh = create_hit(name,
1220 contig,
1221 left,
1222 cigar,
1223 sam_flag & 0x0010, //#######
1224 junction_strand == "rev",
1225 num_mismatches,
1226 0,
1227 end);
1228 return true;
1229 }
1230
1231 else
1232 {
1233 uint32_t left = atoi(toks[num_extra_toks + left_window_edge_field].c_str()) + text_offset;
1234 int spliced_read_len = strlen(seq_str);
1235 int8_t left_splice_pos = atoi(splice_toks[0].c_str()) - left + 1;
1236 if(left_splice_pos > spliced_read_len) left_splice_pos = spliced_read_len;
1237 int8_t right_splice_pos = spliced_read_len - left_splice_pos;
1238
1239 if (right_splice_pos <= 0 || left_splice_pos <= 0)
1240 return false;
1241
1242 if ((sam_flag & 0x0010) == 0) //######
1243 {
1244 if (left_splice_pos + seg_offset < _anchor_length){
1245 return false;
1246 }
1247 }
1248 else
1249 {
1250 if (right_splice_pos + seg_offset < _anchor_length)
1251 return false;
1252 }
1253 //uint32_t right = atoi(splice_toks[1].c_str()) + right_splice_pos;
1254 //atoi(toks[right_window_edge_field].c_str());
1255 int gap_len = atoi(splice_toks[1].c_str()) - atoi(splice_toks[0].c_str()) - 1;
1256
1257 string junction_strand = toks[num_extra_toks + strand_field];
1258 if (!(junction_strand == "rev" || junction_strand == "fwd")||
1259 !(orientation == '-' || orientation == '+'))
1260 {
1261 fprintf(stderr, "Warning: found malformed splice record, skipping\n");
1262 //fprintf(stderr, "junction_strand=%s, orientation='%c'\n",
1263 // junction_strand.c_str(), orientation);
1264 return false;
1265 }
1266
1267 //vector<string> mismatch_toks;
1268 char* pch = strtok (mismatches,",");
1269 int mismatches_in_anchor = 0;
1270 unsigned char num_mismatches = 0;
1271 while (pch != NULL)
1272 {
1273 char* colon = strchr(pch, ':');
1274 if (colon)
1275 {
1276 *colon = 0;
1277 num_mismatches++;
1278 int mismatch_pos = atoi(pch);
1279 if ((orientation == '+' && abs(mismatch_pos - left_splice_pos) < (int)min_anchor_len) ||
1280 (orientation == '-' && abs(((int)spliced_read_len - left_splice_pos + 1) - mismatch_pos)) < (int)min_anchor_len)
1281 mismatches_in_anchor++;
1282 }
1283 //mismatch_toks.push_back(pch);
1284 pch = strtok (NULL, ",");
1285 }
1286
1287 // FIXME: we probably should exclude these hits somewhere, but this
1288 // isn't the right place
1289 vector<CigarOp> cigar;
1290 cigar.push_back(CigarOp(MATCH, left_splice_pos));
1291 if(toks[num_extra_toks + junction_type_field] == "del"){
1292 cigar.push_back(CigarOp(DEL, gap_len));
1293 }else{
1294 cigar.push_back(CigarOp(REF_SKIP, gap_len));
1295 }
1296 cigar.push_back(CigarOp(MATCH, right_splice_pos));
1297
1298 bh = create_hit(name,
1299 contig,
1300 left,
1301 cigar,
1302 orientation == '-',
1303 junction_strand == "rev",
1304 num_mismatches,
1305 mismatches_in_anchor,
1306 end);
1307 return true;
1308 }
1309 }
1310 else
1311 {
1312 fprintf(stderr, "Warning: found malformed splice record, skipping\n");
1313 //fprintf(stderr, "%s\n", orig_bwt_buf);
1314 // continue;
1315 return false;
1316 }
1317
1318 return false;
1319 }
1320
1321
1322 bool BAMHitFactory::get_hit_from_buf(const char* orig_bwt_buf,
1323 BowtieHit& bh, bool strip_slash,
1324 char* name_out, char* name_tags,
1325 char* seq, char* qual) {
1326 if (_sam_header==NULL)
1327 err_die("Error: no SAM header when BAMHitFactory::get_hit_from_buf()!");
1328 const bam1_t* hit_buf = (const bam1_t*)orig_bwt_buf;
1329
1330
1331 uint32_t sam_flag = hit_buf->core.flag;
1332
1333 int text_offset = hit_buf->core.pos;
1334 int text_mate_pos = hit_buf->core.mpos;
1335 int target_id = hit_buf->core.tid;
1336 int mate_target_id = hit_buf->core.mtid;
1337
1338 vector<CigarOp> cigar;
1339 bool spliced_alignment = false;
1340 int num_hits = 1;
1341
1342 double mapQ = hit_buf->core.qual;
1343 long double error_prob;
1344 if (mapQ > 0)
1345 {
1346 long double p = (-1.0 * mapQ) / 10.0;
1347 error_prob = pow(10.0L, p);
1348 }
1349 else
1350 {
1351 error_prob = 1.0;
1352 }
1353
1354 //header->target_name[c->tid]
1355
1356 bool end = true;
1357 unsigned int seg_offset = 0;
1358 unsigned int seg_num = 0;
1359 unsigned int num_segs = 0;
1360 // Copy the tag out of the name field before we might wipe it out
1361 char* qname = bam1_qname(hit_buf);
1362 char* pipe = strrchr(qname, '|');
1363 if (pipe)
1364 {
1365 if (name_tags)
1366 strcpy(name_tags, pipe);
1367
1368 char* tag_buf = pipe + 1;
1369 if (strchr(tag_buf, ':'))
1370 {
1371 sscanf(tag_buf, "%u:%u:%u", &seg_offset, &seg_num, &num_segs);
1372 if (seg_num + 1 == num_segs)
1373 end = true;
1374 else
1375 end = false;
1376 }
1377
1378 *pipe = 0;
1379 }
1380
1381
1382 if (target_id < 0) {
1383 //assert(cigar.size() == 1 && cigar[0].opcode == MATCH);
1384 bh = create_hit(qname,
1385 "*", //ref_name
1386 0, //left coord
1387 0, //read_len
1388 false, //antisense_aln
1389 0, //edit_dist
1390 end);
1391 return true;
1392 }
1393
1394 //string text_name = ((samfile_t*)(hs._hit_file))->header->target_name[target_id];
1395 string text_name = _sam_header->target_name[target_id];
1396 for (int i = 0; i < hit_buf->core.n_cigar; ++i)
1397 {
1398 //char* t;
1399
1400 int length = bam1_cigar(hit_buf)[i] >> BAM_CIGAR_SHIFT;
1401 if (length <= 0)
1402 {
1403 fprintf (stderr, "BAM error: CIGAR op has zero length\n");
1404 return false;
1405 }
1406
1407 CigarOpCode opcode;
1408 switch(bam1_cigar(hit_buf)[i] & BAM_CIGAR_MASK)
1409 {
1410 case BAM_CMATCH: opcode = MATCH; break;
1411 case BAM_CINS: opcode = INS; break;
1412 case BAM_CDEL: opcode = DEL; break;
1413 case BAM_CSOFT_CLIP: opcode = SOFT_CLIP; break;
1414 case BAM_CHARD_CLIP: opcode = HARD_CLIP; break;
1415 case BAM_CPAD: opcode = PAD; break;
1416 case BAM_CREF_SKIP:
1417 opcode = REF_SKIP;
1418 spliced_alignment = true;
1419 if (length > (int)max_report_intron_length)
1420 {
1421 //fprintf(stderr, "Encounter REF_SKIP > max_gene_length, skipping\n");
1422 return false;
1423 }
1424 break;
1425 default:
1426 fprintf (stderr, "BAM read: invalid CIGAR operation\n");
1427 return false;
1428 }
1429 if (opcode != HARD_CLIP)
1430 cigar.push_back(CigarOp(opcode, length));
1431 }
1432
1433 string mrnm;
1434 if (mate_target_id >= 0) {
1435 if (mate_target_id == target_id) {
1436 //mrnm = ((samfile_t*)(hs._hit_file))->header->target_name[mate_target_id];
1437 mrnm = _sam_header->target_name[mate_target_id];
1438 }
1439 else {
1440 //fprintf(stderr, "Trans-spliced mates are not currently supported, skipping\n");
1441 return false;
1442 }
1443 }
1444 else {
1445 text_mate_pos = 0;
1446 }
1447 //CuffStrand source_strand = CUFF_STRAND_UNKNOWN;
1448 bool antisense_splice=false;
1449 unsigned char num_mismatches = 0;
1450 unsigned char num_splice_anchor_mismatches = 0;
1451
1452 uint8_t* ptr = bam_aux_get(hit_buf, "XS");
1453 if (ptr) {
1454 char src_strand_char = bam_aux2A(ptr);
1455 if (src_strand_char == '-')
1456 antisense_splice = true;
1457 // else if (src_strand_char == '+')
1458 // source_strand = CUFF_FWD;
1459 }
1460
1461 ptr = bam_aux_get(hit_buf, "NM");
1462 if (ptr) {
1463 num_mismatches = bam_aux2i(ptr);
1464 }
1465
1466 ptr = bam_aux_get(hit_buf, "NH");
1467 if (ptr) {
1468 num_hits = bam_aux2i(ptr);
1469 }
1470
1471 //bool antisense_aln = bam1_strand(hit_buf);
1472
1473 //if (_rg_props.strandedness() == STRANDED_PROTOCOL && source_strand == CUFF_STRAND_UNKNOWN)
1474 // source_strand = use_stranded_protocol(sam_flag, antisense_aln, _rg_props.mate_strand_mapping());
1475 if (spliced_alignment) {
1476 //if (source_strand == CUFF_STRAND_UNKNOWN) {
1477 // fprintf(stderr, "BAM record error: found spliced alignment without XS attribute\n");
1478 // }
1479 bh = create_hit(qname,
1480 text_name,
1481 text_offset, // BAM files are 0-indexed
1482 cigar,
1483 sam_flag & 0x0010,
1484 antisense_splice,
1485 num_mismatches,
1486 num_splice_anchor_mismatches,
1487 end);
1488
1489 }
1490 else {
1491 //assert(_rg_props.strandedness() == STRANDED_PROTOCOL || source_strand == CUFF_STRAND_UNKNOWN);
1492 //assert(cigar.size() == 1 && cigar[0].opcode == MATCH);
1493 bh = create_hit(qname,
1494 text_name,
1495 text_offset, // BAM files are 0-indexed
1496 cigar,
1497 sam_flag & 0x0010,
1498 false,
1499 num_mismatches,
1500 0,
1501 end);
1502 }
1503 if (seq!=NULL) {
1504 char *bseq = (char*)bam1_seq(hit_buf);
1505 for(int i=0;i<(hit_buf->core.l_qseq);i++) {
1506 char v = bam1_seqi(bseq,i);
1507 seq[i]=bam_nt16_rev_table[v];
1508 }
1509 seq[hit_buf->core.l_qseq]=0;
1510 }
1511 if (qual!=NULL) {
1512 char *bq = (char*)bam1_qual(hit_buf);
1513 for(int i=0;i<(hit_buf->core.l_qseq);i++) {
1514 qual[i]=bq[i]+33;
1515 }
1516 qual[hit_buf->core.l_qseq]=0;
1517 }
1518 return true;
1519 }
1520
1521 bool BAMHitFactory::inspect_header(HitStream& hs)
1522 {
1523 bam_header_t* header = ((samfile_t*)(hs._hit_file))->header;
1524
1525 if (header == NULL) {
1526 fprintf(stderr, "Warning: No BAM header\n");
1527 return false;
1528 }
1529 if (header->l_text == 0) {
1530 fprintf(stderr, "Warning: BAM header has 0 length or is corrupted. Try using 'samtools reheader'.\n");
1531 return false;
1532 }
1533 return true;
1534 }
1535
1536
1537 void get_mapped_reads(FILE* bwtf,
1538 HitTable& hits,
1539 HitFactory& hit_factory,
1540 bool strip_slash,
1541 bool verbose)
1542 {
1543
1544
1545 char bwt_buf[2048];
1546 uint32_t reads_extracted = 0;
1547
1548 while (fgets(bwt_buf, 2048, bwtf))
1549 {
1550 // Chomp the newline
1551 char* nl = strrchr(bwt_buf, '\n');
1552 if (nl) *nl = 0;
1553 if (*bwt_buf == 0)
1554 continue;
1555 // Get a new record from the tab-delimited Bowtie map
1556 BowtieHit bh;
1557 if (hit_factory.get_hit_from_buf(bwt_buf, bh, strip_slash))
1558 {
1559 // Only check uniqueness if these hits are spliced
1560 hits.add_hit(bh, true);
1561 }
1562 reads_extracted++;
1563 }
1564
1565 // This will sort the map by insert id.
1566 hits.finalize();
1567 fprintf(stderr, "Extracted %d alignments from Bowtie map\n", reads_extracted);
1568 }
1569
1570
1571 /*
1572 AlignStatus status(const BowtieHit* align)
1573 {
1574 if (!align)
1575 return UNALIGNED;
1576 if (align->contiguous())
1577 return CONTIGUOUS;
1578 return SPLICED;
1579 }
1580 */
1581
1582 void add_hits_to_coverage(const HitList& hits, vector<unsigned short>& DoC)
1583 {
1584 int max_hit_pos = -1;
1585 for (size_t i = 0; i < hits.size(); ++i)
1586 {
1587 max_hit_pos = max((int)hits[i].right(),max_hit_pos);
1588 }
1589
1590 if ((int)DoC.size() < max_hit_pos)
1591 DoC.resize(max_hit_pos);
1592
1593 for (size_t i = 0; i < hits.size(); ++i)
1594 {
1595 const BowtieHit& bh = hits[i];
1596
1597 // split up the coverage contibution for this reads
1598 size_t j = bh.left();
1599 const vector<CigarOp>& cigar = bh.cigar();
1600
1601 for (size_t c = 0 ; c < cigar.size(); ++c)
1602 {
1603 switch(cigar[c].opcode)
1604 {
1605 case MATCH:
1606 for (size_t m = 0; m < cigar[c].length; ++m)
1607 {
1608 if (DoC[j + m] < 0xFFFF)
1609 DoC[j + m]++;
1610 }
1611 //fall through this case to REF_SKIP is intentional
1612 case REF_SKIP:
1613 j += cigar[c].length;
1614 break;
1615 default:
1616 break;
1617 }
1618
1619 }
1620 }
1621 }
1622
1623 void add_hit_to_coverage(const BowtieHit& bh, vector<unsigned int>& DoC)
1624 {
1625 if ((int)DoC.size() < bh.right())
1626 DoC.resize(bh.right());
1627
1628 // split up the coverage contibution for this reads
1629 size_t j = bh.left();
1630 const vector<CigarOp>& cigar = bh.cigar();
1631
1632 for (size_t c = 0 ; c < cigar.size(); ++c)
1633 {
1634 switch(cigar[c].opcode)
1635 {
1636 case MATCH:
1637 for (size_t m = 0; m < cigar[c].length; ++m)
1638 {
1639 if (DoC[j + m] < 0xFFFFFFFF)
1640 DoC[j + m]++;
1641 }
1642 //fall through this case to REF_SKIP is intentional
1643 case REF_SKIP:
1644 j += cigar[c].length;
1645 break;
1646 default:
1647 break;
1648 }
1649
1650 }
1651 }
1652
1653 void print_hit(FILE* fout,
1654 const char* read_name,
1655 const BowtieHit& bh,
1656 const char* ref_name,
1657 const char* sequence,
1658 const char* qualities,
1659 bool from_bowtie)
1660 {
1661 string seq;
1662 string quals;
1663 if (sequence)
1664 {
1665 seq = sequence;
1666 quals = qualities;
1667 seq.resize(bh.read_len());
1668 quals.resize(bh.read_len());
1669 }
1670 else
1671 {
1672 seq = "*";
1673 }
1674
1675 if (qualities)
1676 {
1677 quals = qualities;
1678 quals.resize(bh.read_len());
1679 }
1680 else
1681 {
1682 quals = "*";
1683 }
1684
1685 uint32_t sam_flag = 0;
1686 if (bh.antisense_align())
1687 {
1688 sam_flag |= 0x0010; // BAM_FREVERSE
1689 if (sequence && !from_bowtie) // if it is from bowtie hit, it's already reversed.
1690 {
1691 reverse_complement(seq);
1692 reverse(quals.begin(), quals.end());
1693 }
1694 }
1695
1696 uint32_t sam_pos = bh.left() + 1;
1697 uint32_t map_quality = 255;
1698 char cigar[256];
1699 cigar[0] = 0;
1700 string mate_ref_name = "*";
1701 uint32_t mate_pos = 0;
1702 uint32_t insert_size = 0;
1703 //string qualities = "*";
1704
1705 const vector<CigarOp>& bh_cigar = bh.cigar();
1706
1707 /*
1708 * In addition to calculating the cigar string,
1709 * we need to figure out how many in/dels are in the
1710 * sequence, so that we can give the correct
1711 * value for the NM tag
1712 */
1713 int indel_distance = 0;
1714 for (size_t c = 0; c < bh_cigar.size(); ++c)
1715 {
1716 char ibuf[64];
1717 sprintf(ibuf, "%d", bh_cigar[c].length);
1718 switch(bh_cigar[c].opcode)
1719 {
1720 case MATCH:
1721 strcat(cigar, ibuf);
1722 strcat(cigar, "M");
1723 break;
1724 case INS:
1725 strcat(cigar, ibuf);
1726 strcat(cigar, "I");
1727 indel_distance += bh_cigar[c].length;
1728 break;
1729 case DEL:
1730 strcat(cigar, ibuf);
1731 strcat(cigar, "D");
1732 indel_distance += bh_cigar[c].length;
1733 break;
1734 case REF_SKIP:
1735 strcat(cigar, ibuf);
1736 strcat(cigar, "N");
1737 break;
1738 default:
1739 break;
1740 }
1741 }
1742
1743 //string q = string(bh.read_len(), '!');
1744 //string s = string(bh.read_len(), 'N');
1745
1746 fprintf(fout,
1747 "%s\t%d\t%s\t%d\t%d\t%s\t%s\t%d\t%d\t%s\t%s",
1748 read_name,
1749 sam_flag,
1750 ref_name,
1751 sam_pos,
1752 map_quality,
1753 cigar,
1754 mate_ref_name.c_str(),
1755 mate_pos,
1756 insert_size,
1757 seq.c_str(),
1758 quals.c_str());
1759
1760 if (!sam_readgroup_id.empty())
1761 fprintf(fout, "\tRG:Z:%s", sam_readgroup_id.c_str());
1762
1763 fprintf(fout, "\tNM:i:%d", bh.edit_dist() + indel_distance);
1764
1765 bool containsSplice = false;
1766 for(vector<CigarOp>::const_iterator itr = bh.cigar().begin(); itr != bh.cigar().end(); ++itr){
1767 if(itr->opcode == REF_SKIP){
1768 containsSplice = true;
1769 break;
1770 }
1771 }
1772
1773 if (containsSplice)
1774 fprintf(fout, "\tXS:A:%c", bh.antisense_splice() ? '-' : '+');
1775
1776 fprintf(fout, "\n");
1777 }
1778
1779 void print_bamhit(GBamWriter& wbam,
1780 const char* read_name,
1781 const BowtieHit& bh,
1782 const char* ref_name,
1783 const char* sequence,
1784 const char* qualities,
1785 bool from_bowtie)
1786 {
1787 string seq;
1788 string quals;
1789 if (sequence) {
1790 seq = sequence;
1791 quals = qualities;
1792 seq.resize(bh.read_len());
1793 quals.resize(bh.read_len());
1794 }
1795 else {
1796 seq = "*";
1797 }
1798 if (qualities) {
1799 quals = qualities;
1800 quals.resize(bh.read_len());
1801 }
1802 else
1803 {
1804 quals = "*";
1805 }
1806
1807 uint32_t sam_flag = 0;
1808 if (bh.antisense_align())
1809 {
1810 sam_flag |= 0x0010; // BAM_FREVERSE
1811 if (sequence && !from_bowtie) // if it is from bowtie hit, it's already reversed.
1812 {
1813 reverse_complement(seq);
1814 reverse(quals.begin(), quals.end());
1815 }
1816 }
1817
1818 uint32_t sam_pos = bh.left() + 1;
1819 uint32_t map_quality = 255;
1820 char cigar[256];
1821 cigar[0] = 0;
1822 string mate_ref_name = "*";
1823 uint32_t mate_pos = 0;
1824 uint32_t insert_size = 0;
1825 //string qualities = "*";
1826
1827 const vector<CigarOp>& bh_cigar = bh.cigar();
1828 /*
1829 * In addition to calculating the cigar string,
1830 * we need to figure out how many in/dels are in the
1831 * sequence, so that we can give the correct
1832 * value for the NM tag
1833 */
1834 int indel_distance = 0;
1835 for (size_t c = 0; c < bh_cigar.size(); ++c)
1836 {
1837 char ibuf[64];
1838 sprintf(ibuf, "%d", bh_cigar[c].length);
1839 switch(bh_cigar[c].opcode)
1840 {
1841 case MATCH:
1842 strcat(cigar, ibuf);
1843 strcat(cigar, "M");
1844 break;
1845 case INS:
1846 strcat(cigar, ibuf);
1847 strcat(cigar, "I");
1848 indel_distance += bh_cigar[c].length;
1849 break;
1850 case DEL:
1851 strcat(cigar, ibuf);
1852 strcat(cigar, "D");
1853 indel_distance += bh_cigar[c].length;
1854 break;
1855 case REF_SKIP:
1856 strcat(cigar, ibuf);
1857 strcat(cigar, "N");
1858 break;
1859 default:
1860 break;
1861 }
1862 }
1863
1864 //string q = string(bh.read_len(), '!');
1865 //string s = string(bh.read_len(), 'N');
1866 /*
1867 fprintf(fout,
1868 "%s\t%d\t%s\t%d\t%d\t%s\t%s\t%d\t%d\t%s\t%s",
1869 read_name,
1870 sam_flag,
1871 ref_name,
1872 sam_pos,
1873 map_quality,
1874 cigar,
1875 mate_ref_name.c_str(),
1876 mate_pos,
1877 insert_size,
1878 seq.c_str(),
1879 quals.c_str());
1880
1881 fprintf(fout, "\tNM:i:%d", bh.edit_dist() + indel_distance);
1882 */
1883 vector<string> auxdata;
1884
1885 if (!sam_readgroup_id.empty())
1886 {
1887 string nm("RG:Z:");
1888 nm += sam_readgroup_id;
1889 auxdata.push_back(nm);
1890 }
1891
1892 string nm("NM:i:");
1893 str_appendInt(nm, bh.edit_dist() + indel_distance);
1894 auxdata.push_back(nm);
1895 bool containsSplice = false;
1896 for(vector<CigarOp>::const_iterator itr = bh.cigar().begin(); itr != bh.cigar().end(); ++itr)
1897 if(itr->opcode == REF_SKIP){
1898 containsSplice = true;
1899 break;
1900 }
1901
1902 if (containsSplice) {
1903 //fprintf(fout, "\tXS:A:%c", bh.antisense_splice() ? '-' : '+');
1904 nm="XS:A:";
1905 nm+=(char)(bh.antisense_splice() ? '-' : '+');
1906 auxdata.push_back(nm);
1907 }
1908
1909 GBamRecord *brec = wbam.new_record(read_name, sam_flag, ref_name, sam_pos, map_quality,
1910 cigar, mate_ref_name.c_str(), mate_pos,
1911 insert_size, seq.c_str(), quals.c_str(), &auxdata);
1912
1913
1914
1915 wbam.write(brec);
1916 delete brec;
1917 }
1918
1919 /**
1920 * Print a vector of cigar operations to a file.
1921 * @param bh_cigar A vector of CigarOps
1922 * @return a string representation of the cigar string
1923 */
1924 std::string print_cigar(vector<CigarOp>& bh_cigar){
1925 char cigar[256];
1926 cigar[0] = 0;
1927 for (size_t c = 0; c < bh_cigar.size(); ++c)
1928 {
1929 char ibuf[64];
1930 sprintf(ibuf, "%d", bh_cigar[c].length);
1931 switch(bh_cigar[c].opcode)
1932 {
1933 case MATCH:
1934 strcat(cigar, ibuf);
1935 strcat(cigar, "M");
1936 break;
1937 case INS:
1938 strcat(cigar, ibuf);
1939 strcat(cigar, "I");
1940 break;
1941 case DEL:
1942 strcat(cigar, ibuf);
1943 strcat(cigar, "D");
1944 break;
1945 case REF_SKIP:
1946 strcat(cigar, ibuf);
1947 strcat(cigar, "N");
1948 break;
1949 default:
1950 break;
1951 }
1952 }
1953 string result(cigar);
1954 return result;
1955 }
1956
1957 bool BowtieHit::check_editdist_consistency(const RefSequenceTable& rt)
1958 {
1959 RefSequenceTable::Sequence* ref_str = rt.get_seq(_ref_id);
1960 if (!ref_str)
1961 return false;
1962
1963 const seqan::Dna5String ref_seq = seqan::infix(*ref_str, _left, right());
1964
1965 size_t pos_seq = 0;
1966 size_t pos_ref = 0;
1967 size_t mismatch = 0;
1968 for (size_t i = 0; i < _cigar.size(); ++i)
1969 {
1970 CigarOp cigar = _cigar[i];
1971 switch(cigar.opcode)
1972 {
1973 case MATCH:
1974 {
1975 for (size_t j = 0; j < cigar.length; ++j)
1976 {
1977 if (_seq[pos_seq++] != ref_seq[pos_ref++])
1978 ++mismatch;
1979 }
1980 }
1981 break;
1982 case INS:
1983 {
1984 pos_seq += cigar.length;
1985 }
1986 break;
1987
1988 case DEL:
1989 case REF_SKIP:
1990 {
1991 pos_ref += cigar.length;
1992 }
1993 break;
1994
1995 default:
1996 break;
1997 }
1998 }
1999
2000 return mismatch == _edit_dist;
2001 }