ViewVC Help
View File | Revision Log | Show Annotations | View Changeset | Root Listing
root/gclib/tophat_cpp/bwt_map.cpp
Revision: 29
Committed: Tue Aug 2 21:24:54 2011 UTC (8 years, 10 months ago) by gpertea
File size: 41661 byte(s)
Log Message:
adding tophat source work

Line File contents
1 /*
2 * bwt_map.cpp
3 * TopHat
4 *
5 * Created by Cole Trapnell on 11/17/08.
6 * Copyright 2008 Cole Trapnell. All rights reserved.
7 *
8 */
9
10 #ifdef HAVE_CONFIG_H
11 #include <config.h>
12 #endif
13
14 #include <cassert>
15 #include <cstdlib>
16 #include <iostream>
17 #include <set>
18 #include <vector>
19 #include <cmath>
20
21 #include "common.h"
22 #include "bwt_map.h"
23 #include "tokenize.h"
24 #include "reads.h"
25
26 using namespace std;
27
28 void HitTable::add_hit(const BowtieHit& bh, bool check_uniqueness)
29 {
30 uint32_t reference_id = bh.ref_id();
31
32 pair<RefHits::iterator, bool> ret =
33 _hits_for_ref.insert(make_pair(reference_id, HitList()));
34 HitList& hl = ret.first->second;
35
36 if (check_uniqueness)
37 {
38 // Check uniqueness, in case we are adding spliced hits from
39 // several spliced alignment sources (e.g. de novo hashing + Bowtie
40 // against a user-supplied index). We don't want to count the same
41 // alignment twice if it happened to be found by more than one method
42 HitList::const_iterator lb = lower_bound(hl.begin(), hl.end(), bh, hit_insert_id_lt);
43 HitList::const_iterator ub = upper_bound(hl.begin(), hl.end(), bh, hit_insert_id_lt);
44 for (; lb != ub && lb != hl.end(); ++lb)
45 {
46 if (*lb == bh)
47 {
48 //fprintf(stderr, "Chucking duplicate read %d by identity\n", bh.insert_id());
49 return;
50 }
51
52 if (lb->insert_id() == bh.insert_id() &&
53 lb->ref_id() == bh.ref_id() &&
54 lb->antisense_align() == bh.antisense_align())
55 {
56 // If we get here, we may be looking at the same alignment
57 // However, spanning_reads may report a shorter, trimmed alignment
58 // so not all fields will be equal. If they just disagree on the
59 // ends, and don't indicate a different junction coord, the
60 // alignments are the same.
61
62 if ((lb->left() <= bh.left() && lb->right() >= bh.right()) ||
63 (bh.left() <= lb->left() && bh.right() >= lb->right()))
64 {
65 vector<pair<int,int> > lb_gaps, bh_gaps;
66 lb->gaps(lb_gaps);
67 bh.gaps(bh_gaps);
68 if (lb_gaps == bh_gaps)
69 {
70 // One alignment is contained in the other, they agree on
71 // where the gaps, if any, are, and they share an id
72 // => this is a redundant aligment, so toss it
73 //fprintf(stderr, "Chucking duplicate read %d by gap agreement\n", bh.insert_id());
74 return;
75 }
76 }
77 }
78 }
79 }
80 _total_hits++;
81 hl.push_back(bh);
82 }
83
84 bool hit_insert_id_lt(const BowtieHit& h1, const BowtieHit& h2)
85 {
86 return h1.insert_id() < h2.insert_id();
87 }
88
89 void LineHitFactory::openStream(HitStream& hs)
90 {
91 if (hs._hit_file==NULL && !hs._hit_file_name.empty()) {
92 //open the file for HitStream here
93 hs._hit_file=fopen(hs._hit_file_name.c_str(),"r");
94 if (hs._hit_file==NULL)
95 err_die("Error opening HitStream file %s\n",hs._hit_file_name.c_str());
96 return;
97 }
98 if (hs._fzpipe!=NULL) {
99 hs._hit_file=hs._fzpipe->file;
100 }
101 }
102
103 void LineHitFactory::rewind(HitStream& hs)
104 {
105 if (hs._fzpipe!=NULL) {
106 hs._fzpipe->rewind();
107 hs._hit_file=hs._fzpipe->file;
108 }
109 else if (hs._hit_file) ::rewind((FILE*)(hs._hit_file));
110 }
111
112 bool LineHitFactory::next_record(HitStream& hs, const char*& buf, size_t& buf_size) {
113 FILE* f=(FILE *)(hs._hit_file);
114 bool new_rec = (fgets(_hit_buf, _hit_buf_max_sz - 1, f)!=NULL);
115 if (!new_rec || feof(f)) {
116 hs._eof=true;
117 return false;
118 }
119 ++_line_num;
120 char* nl = strrchr(_hit_buf, '\n');
121 if (nl) *nl = 0;
122 buf = _hit_buf;
123 buf_size = _hit_buf_max_sz - 1;
124 return true;
125 }
126
127 void LineHitFactory::closeStream(HitStream& hs) {
128 if (hs._fzpipe!=NULL) {
129 hs._fzpipe->close();
130 return;
131 }
132 if (hs._hit_file!=NULL) {
133 fclose((FILE*)(hs._hit_file));
134 hs._hit_file=NULL;
135 }
136 }
137 void BAMHitFactory::openStream(HitStream& hs) {
138 if (hs._hit_file==NULL) {
139 if (hs._hit_file_name.empty())
140 //err_die("Error: invalid HitStream set for BAMHitFactory(file name missing)\n");
141 return; //invalid stream, could be just a place holder
142 //open the file here if not already open
143 string fext=getFext(hs._hit_file_name);
144 if (fext=="sam")
145 hs._hit_file = samopen(hs._hit_file_name.c_str(), "r", 0);
146 else
147 hs._hit_file = samopen(hs._hit_file_name.c_str(), "rb", 0);
148
149 samfile_t* sam_file=(samfile_t*)(hs._hit_file);
150
151 if (sam_file == NULL)
152 err_die("Error opening SAM file %s\n", hs._hit_file_name.c_str());
153 if (sam_file->header == NULL)
154 err_die("Error: no SAM header found for file %s\n", hs._hit_file_name.c_str());
155 memset(&_next_hit, 0, sizeof(_next_hit));
156 //_beginning = bgzf_tell(sam_file->x.bam);
157 _sam_header=sam_file->header;
158 if (inspect_header(hs) == false)
159 err_die("Error: invalid SAM header for file %s\n",
160 hs._hit_file_name.c_str());
161 }
162 }
163
164 void BAMHitFactory::closeStream(HitStream& hs) {
165 if (hs._hit_file) {
166 samclose((samfile_t*)(hs._hit_file));
167 }
168 hs._hit_file=NULL;
169 _sam_header=NULL;
170 }
171
172 void BAMHitFactory::rewind(HitStream& hs)
173 {
174 /*
175 if (_hit_file && ((samfile_t*)_hit_file)->x.bam)
176 {
177 bgzf_seek(((samfile_t*)_hit_file)->x.bam, _beginning, SEEK_SET);
178 _eof = false;
179 }
180 */
181 this->closeStream(hs);
182 this->openStream(hs);
183 }
184
185 string BAMHitFactory::hitfile_rec(HitStream& hs, const char* hit_buf) {
186 const bam1_t* bamrec=(const bam1_t*)hit_buf;
187 char* tamline=bam_format1(((samfile_t*)(hs._hit_file))->header, bamrec);
188 string sam_line(tamline);
189 free(tamline);
190 return sam_line;
191 }
192
193 bool BAMHitFactory::next_record(HitStream& hs, const char*& buf, size_t& buf_size) {
194 if (_next_hit.data) {
195 free(_next_hit.data);
196 _next_hit.data = NULL;
197 }
198 _sam_header=((samfile_t*)(hs._hit_file))->header; //needed by get_hit_from_buf later on
199 if (hs.eof() || !hs.ready()) return false;
200
201 //mark_curr_pos();
202
203 memset(&_next_hit, 0, sizeof(_next_hit));
204
205 int bytes_read = samread((samfile_t*)(hs._hit_file), &_next_hit);
206 if (bytes_read <= 0) {
207 hs._eof = true;
208 return false;
209 }
210 buf = (const char*)&_next_hit;
211 buf_size = bytes_read;
212 return true;
213 }
214
215
216 BowtieHit HitFactory::create_hit(const string& insert_name,
217 const string& ref_name,
218 int left,
219 const vector<CigarOp>& cigar,
220 bool antisense_aln,
221 bool antisense_splice,
222 unsigned char edit_dist,
223 unsigned char splice_mms,
224 bool end)
225 {
226 uint64_t insert_id = _insert_table.get_id(insert_name);
227 uint32_t reference_id = _ref_table.get_id(ref_name, NULL, 0);
228
229 return BowtieHit(reference_id,
230 insert_id,
231 left,
232 cigar,
233 antisense_aln,
234 antisense_splice,
235 edit_dist,
236 splice_mms,
237 end);
238 }
239
240 BowtieHit HitFactory::create_hit(const string& insert_name,
241 const string& ref_name,
242 uint32_t left,
243 uint32_t read_len,
244 bool antisense_aln,
245 unsigned char edit_dist,
246 bool end)
247 {
248 uint64_t insert_id = _insert_table.get_id(insert_name);
249 uint32_t reference_id = _ref_table.get_id(ref_name, NULL, 0);
250
251 return BowtieHit(reference_id,
252 insert_id,
253 left,
254 read_len,
255 antisense_aln,
256 edit_dist,
257 end);
258 }
259
260 bool BowtieHitFactory::get_hit_from_buf(const char* orig_bwt_buf,
261 BowtieHit& bh,
262 bool strip_slash,
263 char* name_out,
264 char* name_tags,
265 char* seq,
266 char* qual)
267 {
268 if (!orig_bwt_buf || !*orig_bwt_buf)
269 return false;
270
271 static const int buf_size = 2048;
272
273 char bwt_buf[buf_size];
274 strcpy(bwt_buf, orig_bwt_buf);
275 //const char* bwt_fmt_str = "%s %c %s %d %s %s %d %s";
276
277 char orientation;
278
279 //int bwtf_ret = 0;
280 //uint32_t seqid = 0;
281
282 char text_name[buf_size];
283 unsigned int text_offset;
284
285
286 //unsigned int other_occs;
287 char mismatches[buf_size];
288 //memset(mismatches, 0, sizeof(mismatches));
289
290 const char* buf = bwt_buf;
291 char* name = get_token((char**)&buf,"\t");
292 char* orientation_str = get_token((char**)&buf,"\t");
293 char* text_name_str = get_token((char**)&buf,"\t");
294
295 strcpy(text_name, text_name_str);
296
297 char* text_offset_str = get_token((char**)&buf,"\t");
298 char* seq_str = get_token((char**)&buf,"\t");
299 if (seq)
300 strcpy(seq, seq_str);
301
302 const char* qual_str = get_token((char**)&buf,"\t");
303 if (qual)
304 strcpy(qual, qual_str);
305
306 /*const char* other_occs_str =*/ get_token((char**)&buf, "\t");
307 mismatches[0] = 0;
308 char* mismatches_str = get_token((char**)&buf, "\t");
309 if (mismatches_str)
310 strcpy(mismatches, mismatches_str);
311
312 orientation = orientation_str[0];
313 text_offset = atoi(text_offset_str);
314
315 bool end = true;
316 unsigned int seg_offset = 0;
317 unsigned int seg_num = 0;
318 unsigned int num_segs = 0;
319
320 // Copy the tag out of the name field before we might wipe it out
321 char* pipe = strrchr(name, '|');
322 if (pipe)
323 {
324 if (name_tags)
325 strcpy(name_tags, pipe);
326
327 char* tag_buf = pipe + 1;
328 if (strchr(tag_buf, ':'))
329 {
330 sscanf(tag_buf, "%u:%u:%u", &seg_offset, &seg_num, &num_segs);
331 if (seg_num + 1 == num_segs)
332 end = true;
333 else
334 end = false;
335 }
336
337 *pipe = 0;
338 }
339 // Stripping the slash and number following it gives the insert name
340 char* slash = strrchr(name, '/');
341 if (strip_slash && slash)
342 *slash = 0;
343
344 size_t read_len = strlen(seq_str);
345
346 // Add this alignment to the table of hits for this half of the
347 // Bowtie map
348
349 //vector<string> mismatch_toks;
350 char* pch = strtok (mismatches,",");
351 unsigned char num_mismatches = 0;
352 while (pch != NULL)
353 {
354 char* colon = strchr(pch, ':');
355 if (colon)
356 {
357 num_mismatches++;
358 }
359 //mismatch_toks.push_back(pch);
360 pch = strtok (NULL, ",");
361 }
362
363 bh = create_hit(name,
364 text_name,
365 text_offset,
366 (int)read_len,
367 orientation == '-',
368 num_mismatches,
369 end);
370
371 return true;
372 }
373
374 int anchor_mismatch = 0;
375
376 bool SplicedBowtieHitFactory::get_hit_from_buf(const char* orig_bwt_buf,
377 BowtieHit& bh,
378 bool strip_slash,
379 char* name_out,
380 char* name_tags,
381 char* seq,
382 char* qual)
383 {
384 if (!orig_bwt_buf || !*orig_bwt_buf)
385 return false;
386
387 static const int buf_size = 2048;
388
389 char bwt_buf[buf_size];
390 strcpy(bwt_buf, orig_bwt_buf);
391 //const char* bwt_fmt_str = "%s %c %s %d %s %s %d %s";
392
393 char orientation;
394 char text_name[buf_size];
395 unsigned int text_offset;
396 char mismatches[buf_size];
397 //memset(mismatches, 0, sizeof(mismatches));
398
399 char* buf = bwt_buf;
400 char* name = get_token((char**)&buf,"\t");
401 char* orientation_str = get_token((char**)&buf,"\t");
402 char* text_name_str = get_token((char**)&buf,"\t");
403 strcpy(text_name, text_name_str);
404
405 char* text_offset_str = get_token((char**)&buf,"\t");
406 char* seq_str = get_token((char**)&buf,"\t");
407 if (seq)
408 strcpy(seq, seq_str);
409
410 const char* qual_str = get_token((char**)&buf,"\t");
411 if (qual)
412 strcpy(qual, qual_str);
413
414 /*const char* other_occs_str =*/ get_token((char**)&buf, "\t");
415 mismatches[0] = 0;
416 char* mismatches_str = get_token((char**)&buf, "\t");
417 if (mismatches_str)
418 strcpy(mismatches, mismatches_str);
419
420 orientation = orientation_str[0];
421 text_offset = atoi(text_offset_str);
422
423 bool end = true;
424 unsigned int seg_offset = 0;
425 unsigned int seg_num = 0;
426 unsigned int num_segs = 0;
427
428 // Copy the tag out of the name field before we might wipe it out
429 char* pipe = strrchr(name, '|');
430 if (pipe)
431 {
432 if (name_tags)
433 strcpy(name_tags, pipe);
434
435 char* tag_buf = pipe + 1;
436 if (strchr(tag_buf, ':'))
437 {
438 sscanf(tag_buf, "%u:%u:%u", &seg_offset, &seg_num, &num_segs);
439 if (seg_num + 1 == num_segs)
440 end = true;
441 else
442 end = false;
443 }
444
445 *pipe = 0;
446 }
447 // Stripping the slash and number following it gives the insert name
448 char* slash = strrchr(name, '/');
449 if (strip_slash && slash)
450 *slash = 0;
451
452 //int read_len = strlen(sequence);
453
454 // Add this alignment to the table of hits for this half of the
455 // Bowtie map
456
457 // Parse the text_name field to recover the splice coords
458 vector<string> toks;
459
460 tokenize_strict(text_name, "|", toks);
461
462 int num_extra_toks = (int)toks.size() - 6;
463
464 if (num_extra_toks >= 0)
465 {
466 static const uint8_t left_window_edge_field = 1;
467 static const uint8_t splice_field = 2;
468 //static const uint8_t right_window_edge_field = 3;
469 static const uint8_t junction_type_field = 4;
470 static const uint8_t strand_field = 5;
471
472 string contig = toks[0];
473 for (int t = 1; t <= num_extra_toks; ++t)
474 {
475 contig += "|";
476 contig += toks[t];
477 }
478
479 vector<string> splice_toks;
480 tokenize(toks[num_extra_toks + splice_field], "-", splice_toks);
481 if (splice_toks.size() != 2)
482 {
483 fprintf(stderr, "Warning: found malformed splice record, skipping:\n");
484 //fprintf(stderr, "%s (token: %s)\n", text_name,
485 // toks[num_extra_toks + splice_field].c_str());
486 return false;
487 }
488
489 //
490 // check for an insertion hit
491 //
492 if(toks[num_extra_toks + junction_type_field] == "ins")
493 {
494 int8_t spliced_read_len = strlen(seq_str);
495 /*
496 * The 0-based position of the left edge of the alignment. Note that this
497 * value may need to be futher corrected to account for the presence of
498 * of the insertion.
499 */
500 uint32_t left = atoi(toks[num_extra_toks + left_window_edge_field].c_str()) + text_offset;
501 uint32_t right = left + spliced_read_len - 1;
502
503 /*
504 * The 0-based position of the last genomic sequence before the insertion
505 */
506 uint32_t left_splice_pos = atoi(splice_toks[0].c_str());
507
508 string insertedSequence = splice_toks[1];
509 /*
510 * The 0-based position of the first genomic sequence after teh insertion
511 */
512 uint32_t right_splice_pos = left_splice_pos + 1;
513 if(left > left_splice_pos){
514 /*
515 * The genomic position of the left edge of the alignment needs to be corrected
516 * If the alignment does not extend into the insertion, simply subtract the length
517 * of the inserted sequence, otherwise, just set it equal to the right edge
518 */
519 left = left - insertedSequence.length();
520 if(left < right_splice_pos){
521 left = right_splice_pos;
522 }
523 }
524 if(right > left_splice_pos){
525 right = right - insertedSequence.length();
526 if(right < left_splice_pos){
527 right = left_splice_pos;
528 }
529 }
530 /*
531 * Now, right and left should be properly transformed into genomic coordinates
532 * We should be able to deduce how much the alignment matches the insertion
533 * simply based on the length of the read
534 */
535 int left_match_length = 0;
536 if(left <= left_splice_pos){
537 left_match_length = left_splice_pos - left + 1;
538 }
539 int right_match_length = 0;
540 if(right >= right_splice_pos){
541 right_match_length = right - right_splice_pos + 1;
542 }
543 int insertion_match_length = spliced_read_len - left_match_length - right_match_length;
544
545 if(left_match_length <= 0 || right_match_length <= 0 || insertion_match_length <= 0)
546 return false;
547
548 string junction_strand = toks[num_extra_toks + strand_field];
549 if(junction_strand != "rev" && junction_strand != "fwd"){
550 fprintf(stderr, "Malformed insertion record\n");
551 return false;
552 }
553
554 char* pch = strtok( mismatches, ",");
555 unsigned char num_mismatches = 0;
556
557 /*
558 * remember that text_offset holds the left end of the
559 * alignment, relative to the start of the contig
560 */
561
562 /*
563 * The 0-based relative position of the left-most character
564 * before the insertion in the contig
565 */
566 int relative_splice_pos = left_splice_pos - atoi(toks[num_extra_toks + left_window_edge_field].c_str());
567 while (pch != NULL)
568 {
569 char* colon = strchr(pch, ':');
570 if (colon)
571 {
572 *colon = 0;
573 int mismatch_pos = atoi(pch);
574
575 /*
576 * for reversely mapped reads,
577 * find the correct mismatched position.
578 */
579 if(orientation == '-'){
580 mismatch_pos = spliced_read_len - mismatch_pos - 1;
581 }
582
583 /*
584 * Only count mismatches outside of the insertion region
585 * If there is a mismatch within the insertion,
586 * disallow this hit
587 */
588 if(mismatch_pos + text_offset <= relative_splice_pos || mismatch_pos + text_offset > relative_splice_pos + insertedSequence.length()){
589 num_mismatches++;
590 }else{
591 return false;
592 }
593 }
594 pch = strtok (NULL, ",");
595 }
596
597
598 vector<CigarOp> cigar;
599 cigar.push_back(CigarOp(MATCH, left_match_length));
600 cigar.push_back(CigarOp(INS, insertion_match_length));
601 cigar.push_back(CigarOp(MATCH, right_match_length));
602
603 /*
604 * For now, disallow hits that don't span
605 * the insertion. If we allow these types of hits,
606 * then long_spanning.cpp needs to be updated
607 * in order to intelligently merge these kinds
608 * of reads back together
609 *
610 * Following code has been changed to allow segment that end
611 * in an insertion
612 */
613 bh = create_hit(name,
614 contig,
615 left,
616 cigar,
617 orientation == '-',
618 junction_strand == "rev",
619 num_mismatches,
620 0,
621 end);
622 return true;
623 }
624
625 else
626 {
627 uint32_t left = atoi(toks[num_extra_toks + left_window_edge_field].c_str()) + text_offset;
628 int spliced_read_len = strlen(seq_str);
629 int8_t left_splice_pos = atoi(splice_toks[0].c_str()) - left + 1;
630 if(left_splice_pos > spliced_read_len) left_splice_pos = spliced_read_len;
631 int8_t right_splice_pos = spliced_read_len - left_splice_pos;
632
633 if (right_splice_pos <= 0 || left_splice_pos <= 0)
634 return false;
635
636 if (orientation == '+')
637 {
638 if (left_splice_pos + seg_offset < _anchor_length){
639 return false;
640 }
641 }
642 else
643 {
644 if (right_splice_pos + seg_offset < _anchor_length)
645 return false;
646 }
647 //uint32_t right = atoi(splice_toks[1].c_str()) + right_splice_pos;
648 //atoi(toks[right_window_edge_field].c_str());
649 int gap_len = atoi(splice_toks[1].c_str()) - atoi(splice_toks[0].c_str()) - 1;
650
651 string junction_strand = toks[num_extra_toks + strand_field];
652 if (!(junction_strand == "rev" || junction_strand == "fwd")||
653 !(orientation == '-' || orientation == '+'))
654 {
655 fprintf(stderr, "Warning: found malformed splice record, skipping\n");
656 //fprintf(stderr, "junction_strand=%s, orientation='%c'\n",
657 // junction_strand.c_str(), orientation);
658 return false;
659 }
660
661 //vector<string> mismatch_toks;
662 char* pch = strtok (mismatches,",");
663 int mismatches_in_anchor = 0;
664 unsigned char num_mismatches = 0;
665 while (pch != NULL)
666 {
667 char* colon = strchr(pch, ':');
668 if (colon)
669 {
670 *colon = 0;
671 num_mismatches++;
672 int mismatch_pos = atoi(pch);
673 if ((orientation == '+' && abs(mismatch_pos - left_splice_pos) < (int)min_anchor_len) ||
674 (orientation == '-' && abs(((int)spliced_read_len - left_splice_pos + 1) - mismatch_pos)) < (int)min_anchor_len)
675 mismatches_in_anchor++;
676 }
677 //mismatch_toks.push_back(pch);
678 pch = strtok (NULL, ",");
679 }
680
681 // FIXME: we probably should exclude these hits somewhere, but this
682 // isn't the right place
683 vector<CigarOp> cigar;
684 cigar.push_back(CigarOp(MATCH, left_splice_pos));
685 if(toks[num_extra_toks + junction_type_field] == "del"){
686 cigar.push_back(CigarOp(DEL, gap_len));
687 }else{
688 cigar.push_back(CigarOp(REF_SKIP, gap_len));
689 }
690 cigar.push_back(CigarOp(MATCH, right_splice_pos));
691
692 bh = create_hit(name,
693 contig,
694 left,
695 cigar,
696 orientation == '-',
697 junction_strand == "rev",
698 num_mismatches,
699 mismatches_in_anchor,
700 end);
701 return true;
702 }
703 }
704 else
705 {
706 fprintf(stderr, "Warning: found malformed splice record, skipping\n");
707 //fprintf(stderr, "%s\n", orig_bwt_buf);
708 // continue;
709 return false;
710 }
711
712 return false;
713 }
714
715 bool SAMHitFactory::get_hit_from_buf(const char* orig_bwt_buf,
716 BowtieHit& bh,
717 bool strip_slash,
718 char* name_out,
719 char* name_tags,
720 char* seq,
721 char* qual)
722 {
723 if (!orig_bwt_buf || !*orig_bwt_buf)
724 return false;
725 char bwt_buf[2048];
726 strcpy(bwt_buf, orig_bwt_buf);
727 // Are we still in the header region?
728 if (bwt_buf[0] == '@')
729 return false;
730
731 char* buf = bwt_buf;
732 char* name = get_token((char**)&buf,"\t");
733 char* sam_flag_str = get_token((char**)&buf,"\t");
734 char* text_name = get_token((char**)&buf,"\t");
735 char* text_offset_str = get_token((char**)&buf,"\t");
736 const char* map_qual_str = get_token((char**)&buf,"\t");
737 char* cigar_str = get_token((char**)&buf,"\t");
738 const char* mate_ref_str = get_token((char**)&buf,"\t");
739 const char* mate_pos_str = get_token((char**)&buf,"\t");
740 const char* inferred_insert_sz_str = get_token((char**)&buf,"\t");
741
742 const char* seq_str = get_token((char**)&buf,"\t");
743 if (seq)
744 strcpy(seq, seq_str);
745
746 const char* qual_str = get_token((char**)&buf,"\t");
747 if (qual)
748 strcpy(qual, qual_str);
749
750 if (!name ||
751 !sam_flag_str ||
752 !text_name ||
753 !text_offset_str ||
754 !map_qual_str ||
755 !cigar_str ||
756 !mate_ref_str ||
757 !mate_pos_str ||
758 !inferred_insert_sz_str ||
759 !seq_str ||
760 !qual_str)
761 {
762 // truncated or malformed SAM record
763 return false;
764 }
765
766
767 int sam_flag = atoi(sam_flag_str);
768 int text_offset = atoi(text_offset_str);
769
770 bool end = true;
771 unsigned int seg_offset = 0;
772 unsigned int seg_num = 0;
773 unsigned int num_segs = 0;
774
775 // Copy the tag out of the name field before we might wipe it out
776 char* pipe = strrchr(name, '|');
777 if (pipe)
778 {
779 if (name_tags)
780 strcpy(name_tags, pipe);
781
782 char* tag_buf = pipe + 1;
783 if (strchr(tag_buf, ':'))
784 {
785 sscanf(tag_buf, "%u:%u:%u", &seg_offset, &seg_num, &num_segs);
786 if (seg_num + 1 == num_segs)
787 end = true;
788 else
789 end = false;
790 }
791
792 *pipe = 0;
793 }
794
795 // Stripping the slash and number following it gives the insert name
796 char* slash = strrchr(name, '/');
797 if (strip_slash && slash)
798 *slash = 0;
799
800 char* p_cig = cigar_str;
801 //int len = strlen(sequence);
802 vector<CigarOp> cigar;
803 bool spliced_alignment = false;
804 // Mostly pilfered direct from the SAM tools:
805 while (*p_cig)
806 {
807 char* t;
808 int length = (int)strtol(p_cig, &t, 10);
809 if (length <= 0)
810 {
811 //fprintf (stderr, "CIGAR op has zero length\n");
812 return false;
813 }
814 char op_char = toupper(*t);
815 CigarOpCode opcode;
816 if (op_char == 'M') opcode = MATCH;
817 else if (op_char == 'I') opcode = INS;
818 else if (op_char == 'D') opcode = DEL;
819 else if (op_char == 'N')
820 {
821 if (length > max_report_intron_length)
822 return false;
823 opcode = REF_SKIP;
824 spliced_alignment = true;
825 }
826 else if (op_char == 'S') opcode = SOFT_CLIP;
827 else if (op_char == 'H') opcode = HARD_CLIP;
828 else if (op_char == 'P') opcode = PAD;
829 else
830 {
831 fprintf (stderr, "invalid CIGAR operation\n");
832 return false;
833 }
834 p_cig = t + 1;
835 //i += length;
836 cigar.push_back(CigarOp(opcode, length));
837 }
838 if (*p_cig)
839 {
840 fprintf (stderr, "unmatched CIGAR operation\n");
841 return false;
842 }
843
844 //vector<string> attributes;
845 //tokenize(tag_buf, " \t",attributes);
846
847 bool antisense_splice = false;
848 unsigned char num_mismatches = 0;
849 unsigned char num_splice_anchor_mismatches = 0;
850 const char* tag_buf = buf;
851
852 // while((tag_buf = get_token((char**)&buf,"\t")))
853 // {
854 // vector<string> tuple_fields;
855 // tokenize(tag_buf,":", tuple_fields);
856 // if (tuple_fields.size() == 3)
857 // {
858 // if (tuple_fields[0] == "XS")
859 // {
860 // if (tuple_fields[2] == "-")
861 // antisense_splice = true;
862 // }
863 // else if (tuple_fields[0] == "NM")
864 // {
865 // num_mismatches = atoi(tuple_fields[2].c_str());
866 // }
867 // else if (tuple_fields[0] == "NS")
868 // {
869 // num_splice_anchor_mismatches = atoi(tuple_fields[2].c_str());
870 // }
871 // else
872 // {
873 // fprintf(stderr, "%s attribute not supported\n", tuple_fields[0].c_str());
874 // return false;
875 // }
876 // }
877 // }
878
879 while((tag_buf = get_token((char**)&buf,"\t")))
880 {
881 vector<string> tuple_fields;
882 tokenize(tag_buf,":", tuple_fields);
883 if (tuple_fields.size() == 3)
884 {
885 if (tuple_fields[0] == "XS")
886 {
887 if (tuple_fields[2] == "-")
888 antisense_splice = true;
889 }
890 else if (tuple_fields[0] == "NM")
891 {
892 num_mismatches = atoi(tuple_fields[2].c_str());
893 }
894 else if (tuple_fields[0] == "NS")
895 {
896 //ignored for now
897 }
898 else
899 {
900 //fprintf(stderr, "%s attribute not supported\n", tuple_fields[0].c_str());
901 //return false;
902 }
903 }
904 }
905
906
907 /*
908 * By convention,the NM field of the SAM record
909 * counts an insertion or deletion. I dont' think
910 * we want the mismatch count in the BowtieHit
911 * record to reflect this. Therefore, subtract out
912 * the mismatches due to in/dels
913 */
914 for(vector<CigarOp>::const_iterator itr = cigar.begin(); itr != cigar.end(); ++itr){
915 if(itr->opcode == INS){
916 num_mismatches -= itr->length;
917 }
918 if(itr->opcode == DEL){
919 num_mismatches -= itr->length;
920 }
921 }
922 // vector<string> toks;
923 // tokenize(tag_buf, ":", toks);
924 if (spliced_alignment)
925 {
926 bh = create_hit(name,
927 text_name,
928 text_offset - 1,
929 cigar,
930 sam_flag & 0x0010,
931 antisense_splice,
932 num_mismatches,
933 num_splice_anchor_mismatches,
934 end);
935 return true;
936
937 }
938 else
939 {
940 //assert(cigar.size() == 1 && cigar[0].opcode == MATCH);
941
942 bh = create_hit(name,
943 text_name,
944 text_offset - 1, // SAM files are 1-indexed
945 cigar,
946 sam_flag & 0x0010,
947 false,
948 num_mismatches,
949 0,
950 end);
951 return true;
952 }
953
954 return false;
955 }
956
957
958
959 bool BAMHitFactory::get_hit_from_buf(const char* orig_bwt_buf,
960 BowtieHit& bh, bool strip_slash,
961 char* name_out, char* name_tags,
962 char* seq, char* qual) {
963 if (_sam_header==NULL)
964 err_die("Error: no SAM header when BAMHitFactory::get_hit_from_buf()!");
965 const bam1_t* hit_buf = (const bam1_t*)orig_bwt_buf;
966
967
968 uint32_t sam_flag = hit_buf->core.flag;
969
970 int text_offset = hit_buf->core.pos;
971 int text_mate_pos = hit_buf->core.mpos;
972 int target_id = hit_buf->core.tid;
973 int mate_target_id = hit_buf->core.mtid;
974
975 vector<CigarOp> cigar;
976 bool spliced_alignment = false;
977 int num_hits = 1;
978
979 double mapQ = hit_buf->core.qual;
980 long double error_prob;
981 if (mapQ > 0)
982 {
983 long double p = (-1.0 * mapQ) / 10.0;
984 error_prob = pow(10.0L, p);
985 }
986 else
987 {
988 error_prob = 1.0;
989 }
990
991 //header->target_name[c->tid]
992
993 bool end = true;
994 unsigned int seg_offset = 0;
995 unsigned int seg_num = 0;
996 unsigned int num_segs = 0;
997 // Copy the tag out of the name field before we might wipe it out
998 char* qname = bam1_qname(hit_buf);
999 char* pipe = strrchr(qname, '|');
1000 if (pipe)
1001 {
1002 if (name_tags)
1003 strcpy(name_tags, pipe);
1004
1005 char* tag_buf = pipe + 1;
1006 if (strchr(tag_buf, ':'))
1007 {
1008 sscanf(tag_buf, "%u:%u:%u", &seg_offset, &seg_num, &num_segs);
1009 if (seg_num + 1 == num_segs)
1010 end = true;
1011 else
1012 end = false;
1013 }
1014
1015 *pipe = 0;
1016 }
1017
1018
1019 if (target_id < 0) {
1020 //assert(cigar.size() == 1 && cigar[0].opcode == MATCH);
1021 bh = create_hit(qname,
1022 "*", //ref_name
1023 0, //left coord
1024 0, //read_len
1025 false, //antisense_aln
1026 0, //edit_dist
1027 end);
1028 return true;
1029 }
1030
1031 //string text_name = ((samfile_t*)(hs._hit_file))->header->target_name[target_id];
1032 string text_name = _sam_header->target_name[target_id];
1033 for (int i = 0; i < hit_buf->core.n_cigar; ++i)
1034 {
1035 //char* t;
1036
1037 int length = bam1_cigar(hit_buf)[i] >> BAM_CIGAR_SHIFT;
1038 if (length <= 0)
1039 {
1040 fprintf (stderr, "BAM error: CIGAR op has zero length\n");
1041 return false;
1042 }
1043
1044 CigarOpCode opcode;
1045 switch(bam1_cigar(hit_buf)[i] & BAM_CIGAR_MASK)
1046 {
1047 case BAM_CMATCH: opcode = MATCH; break;
1048 case BAM_CINS: opcode = INS; break;
1049 case BAM_CDEL: opcode = DEL; break;
1050 case BAM_CSOFT_CLIP: opcode = SOFT_CLIP; break;
1051 case BAM_CHARD_CLIP: opcode = HARD_CLIP; break;
1052 case BAM_CPAD: opcode = PAD; break;
1053 case BAM_CREF_SKIP:
1054 opcode = REF_SKIP;
1055 spliced_alignment = true;
1056 if (length > (int)max_report_intron_length)
1057 {
1058 //fprintf(stderr, "Encounter REF_SKIP > max_gene_length, skipping\n");
1059 return false;
1060 }
1061 break;
1062 default:
1063 fprintf (stderr, "BAM read: invalid CIGAR operation\n");
1064 return false;
1065 }
1066 if (opcode != HARD_CLIP)
1067 cigar.push_back(CigarOp(opcode, length));
1068 }
1069
1070 string mrnm;
1071 if (mate_target_id >= 0) {
1072 if (mate_target_id == target_id) {
1073 //mrnm = ((samfile_t*)(hs._hit_file))->header->target_name[mate_target_id];
1074 mrnm = _sam_header->target_name[mate_target_id];
1075 }
1076 else {
1077 //fprintf(stderr, "Trans-spliced mates are not currently supported, skipping\n");
1078 return false;
1079 }
1080 }
1081 else {
1082 text_mate_pos = 0;
1083 }
1084 //CuffStrand source_strand = CUFF_STRAND_UNKNOWN;
1085 bool antisense_splice=false;
1086 unsigned char num_mismatches = 0;
1087 unsigned char num_splice_anchor_mismatches = 0;
1088
1089 uint8_t* ptr = bam_aux_get(hit_buf, "XS");
1090 if (ptr) {
1091 char src_strand_char = bam_aux2A(ptr);
1092 if (src_strand_char == '-')
1093 antisense_splice = true;
1094 // else if (src_strand_char == '+')
1095 // source_strand = CUFF_FWD;
1096 }
1097
1098 ptr = bam_aux_get(hit_buf, "NM");
1099 if (ptr) {
1100 num_mismatches = bam_aux2i(ptr);
1101 }
1102
1103 ptr = bam_aux_get(hit_buf, "NH");
1104 if (ptr) {
1105 num_hits = bam_aux2i(ptr);
1106 }
1107
1108 //bool antisense_aln = bam1_strand(hit_buf);
1109
1110 //if (_rg_props.strandedness() == STRANDED_PROTOCOL && source_strand == CUFF_STRAND_UNKNOWN)
1111 // source_strand = use_stranded_protocol(sam_flag, antisense_aln, _rg_props.mate_strand_mapping());
1112 if (spliced_alignment) {
1113 //if (source_strand == CUFF_STRAND_UNKNOWN) {
1114 // fprintf(stderr, "BAM record error: found spliced alignment without XS attribute\n");
1115 // }
1116 bh = create_hit(qname,
1117 text_name,
1118 text_offset, // BAM files are 0-indexed
1119 cigar,
1120 sam_flag & 0x0010,
1121 antisense_splice,
1122 num_mismatches,
1123 num_splice_anchor_mismatches,
1124 end);
1125
1126 }
1127 else {
1128 //assert(_rg_props.strandedness() == STRANDED_PROTOCOL || source_strand == CUFF_STRAND_UNKNOWN);
1129 //assert(cigar.size() == 1 && cigar[0].opcode == MATCH);
1130 bh = create_hit(qname,
1131 text_name,
1132 text_offset, // BAM files are 0-indexed
1133 cigar,
1134 sam_flag & 0x0010,
1135 false,
1136 num_mismatches,
1137 0,
1138 end);
1139 }
1140 if (seq!=NULL) {
1141 char *bseq = (char*)bam1_seq(hit_buf);
1142 for(int i=0;i<(hit_buf->core.l_qseq);i++) {
1143 char v = bam1_seqi(bseq,i);
1144 seq[i]=bam_nt16_rev_table[v];
1145 }
1146 seq[hit_buf->core.l_qseq]=0;
1147 }
1148 if (qual!=NULL) {
1149 char *bq = (char*)bam1_qual(hit_buf);
1150 for(int i=0;i<(hit_buf->core.l_qseq);i++) {
1151 qual[i]=bq[i]+33;
1152 }
1153 qual[hit_buf->core.l_qseq]=0;
1154 }
1155 return true;
1156 }
1157
1158 bool BAMHitFactory::inspect_header(HitStream& hs)
1159 {
1160 bam_header_t* header = ((samfile_t*)(hs._hit_file))->header;
1161
1162 if (header == NULL) {
1163 fprintf(stderr, "Warning: No BAM header\n");
1164 return false;
1165 }
1166 if (header->l_text == 0) {
1167 fprintf(stderr, "Warning: BAM header has 0 length or is corrupted. Try using 'samtools reheader'.\n");
1168 return false;
1169 }
1170 return true;
1171 }
1172
1173
1174 void get_mapped_reads(FILE* bwtf,
1175 HitTable& hits,
1176 HitFactory& hit_factory,
1177 bool strip_slash,
1178 bool verbose)
1179 {
1180
1181
1182 char bwt_buf[2048];
1183 uint32_t reads_extracted = 0;
1184
1185 while (fgets(bwt_buf, 2048, bwtf))
1186 {
1187 // Chomp the newline
1188 char* nl = strrchr(bwt_buf, '\n');
1189 if (nl) *nl = 0;
1190 if (*bwt_buf == 0)
1191 continue;
1192 // Get a new record from the tab-delimited Bowtie map
1193 BowtieHit bh;
1194 if (hit_factory.get_hit_from_buf(bwt_buf, bh, strip_slash))
1195 {
1196 // Only check uniqueness if these hits are spliced
1197 hits.add_hit(bh, true);
1198 }
1199 reads_extracted++;
1200 }
1201
1202 // This will sort the map by insert id.
1203 hits.finalize();
1204 fprintf(stderr, "Extracted %d alignments from Bowtie map\n", reads_extracted);
1205 }
1206
1207
1208 /*
1209 AlignStatus status(const BowtieHit* align)
1210 {
1211 if (!align)
1212 return UNALIGNED;
1213 if (align->contiguous())
1214 return CONTIGUOUS;
1215 return SPLICED;
1216 }
1217 */
1218
1219 void add_hits_to_coverage(const HitList& hits, vector<unsigned short>& DoC)
1220 {
1221 int max_hit_pos = -1;
1222 for (size_t i = 0; i < hits.size(); ++i)
1223 {
1224 max_hit_pos = max((int)hits[i].right(),max_hit_pos);
1225 }
1226
1227 if ((int)DoC.size() < max_hit_pos)
1228 DoC.resize(max_hit_pos);
1229
1230 for (size_t i = 0; i < hits.size(); ++i)
1231 {
1232 const BowtieHit& bh = hits[i];
1233
1234 // split up the coverage contibution for this reads
1235 size_t j = bh.left();
1236 const vector<CigarOp>& cigar = bh.cigar();
1237
1238 for (size_t c = 0 ; c < cigar.size(); ++c)
1239 {
1240 switch(cigar[c].opcode)
1241 {
1242 case MATCH:
1243 for (size_t m = 0; m < cigar[c].length; ++m)
1244 {
1245 if (DoC[j + m] < 0xFFFF)
1246 DoC[j + m]++;
1247 }
1248 //fall through this case to REF_SKIP is intentional
1249 case REF_SKIP:
1250 j += cigar[c].length;
1251 break;
1252 default:
1253 break;
1254 }
1255
1256 }
1257 }
1258 }
1259
1260 void add_hit_to_coverage(const BowtieHit& bh, vector<unsigned int>& DoC)
1261 {
1262 if ((int)DoC.size() < bh.right())
1263 DoC.resize(bh.right());
1264
1265 // split up the coverage contibution for this reads
1266 size_t j = bh.left();
1267 const vector<CigarOp>& cigar = bh.cigar();
1268
1269 for (size_t c = 0 ; c < cigar.size(); ++c)
1270 {
1271 switch(cigar[c].opcode)
1272 {
1273 case MATCH:
1274 for (size_t m = 0; m < cigar[c].length; ++m)
1275 {
1276 if (DoC[j + m] < 0xFFFFFFFF)
1277 DoC[j + m]++;
1278 }
1279 //fall through this case to REF_SKIP is intentional
1280 case REF_SKIP:
1281 j += cigar[c].length;
1282 break;
1283 default:
1284 break;
1285 }
1286
1287 }
1288 }
1289
1290 void print_hit(FILE* fout,
1291 const char* read_name,
1292 const BowtieHit& bh,
1293 const char* ref_name,
1294 const char* sequence,
1295 const char* qualities,
1296 bool from_bowtie)
1297 {
1298 string seq;
1299 string quals;
1300 if (sequence)
1301 {
1302 seq = sequence;
1303 quals = qualities;
1304 seq.resize(bh.read_len());
1305 quals.resize(bh.read_len());
1306 }
1307 else
1308 {
1309 seq = "*";
1310 }
1311
1312 if (qualities)
1313 {
1314 quals = qualities;
1315 quals.resize(bh.read_len());
1316 }
1317 else
1318 {
1319 quals = "*";
1320 }
1321
1322 uint32_t sam_flag = 0;
1323 if (bh.antisense_align())
1324 {
1325 sam_flag |= 0x0010; // BAM_FREVERSE
1326 if (sequence && !from_bowtie) // if it is from bowtie hit, it's already reversed.
1327 {
1328 reverse_complement(seq);
1329 reverse(quals.begin(), quals.end());
1330 }
1331 }
1332
1333 uint32_t sam_pos = bh.left() + 1;
1334 uint32_t map_quality = 255;
1335 char cigar[256];
1336 cigar[0] = 0;
1337 string mate_ref_name = "*";
1338 uint32_t mate_pos = 0;
1339 uint32_t insert_size = 0;
1340 //string qualities = "*";
1341
1342 const vector<CigarOp>& bh_cigar = bh.cigar();
1343
1344 /*
1345 * In addition to calculating the cigar string,
1346 * we need to figure out how many in/dels are in the
1347 * sequence, so that we can give the correct
1348 * value for the NM tag
1349 */
1350 int indel_distance = 0;
1351 for (size_t c = 0; c < bh_cigar.size(); ++c)
1352 {
1353 char ibuf[64];
1354 sprintf(ibuf, "%d", bh_cigar[c].length);
1355 switch(bh_cigar[c].opcode)
1356 {
1357 case MATCH:
1358 strcat(cigar, ibuf);
1359 strcat(cigar, "M");
1360 break;
1361 case INS:
1362 strcat(cigar, ibuf);
1363 strcat(cigar, "I");
1364 indel_distance += bh_cigar[c].length;
1365 break;
1366 case DEL:
1367 strcat(cigar, ibuf);
1368 strcat(cigar, "D");
1369 indel_distance += bh_cigar[c].length;
1370 break;
1371 case REF_SKIP:
1372 strcat(cigar, ibuf);
1373 strcat(cigar, "N");
1374 break;
1375 default:
1376 break;
1377 }
1378 }
1379
1380 //string q = string(bh.read_len(), '!');
1381 //string s = string(bh.read_len(), 'N');
1382
1383 fprintf(fout,
1384 "%s\t%d\t%s\t%d\t%d\t%s\t%s\t%d\t%d\t%s\t%s",
1385 read_name,
1386 sam_flag,
1387 ref_name,
1388 sam_pos,
1389 map_quality,
1390 cigar,
1391 mate_ref_name.c_str(),
1392 mate_pos,
1393 insert_size,
1394 seq.c_str(),
1395 quals.c_str());
1396
1397 if (!sam_readgroup_id.empty())
1398 fprintf(fout, "\tRG:Z:%s", sam_readgroup_id.c_str());
1399
1400 fprintf(fout, "\tNM:i:%d", bh.edit_dist() + indel_distance);
1401
1402 bool containsSplice = false;
1403 for(vector<CigarOp>::const_iterator itr = bh.cigar().begin(); itr != bh.cigar().end(); ++itr){
1404 if(itr->opcode == REF_SKIP){
1405 containsSplice = true;
1406 break;
1407 }
1408 }
1409
1410 if (containsSplice)
1411 fprintf(fout, "\tXS:A:%c", bh.antisense_splice() ? '-' : '+');
1412
1413 fprintf(fout, "\n");
1414 }
1415
1416 void print_bamhit(GBamWriter& wbam,
1417 const char* read_name,
1418 const BowtieHit& bh,
1419 const char* ref_name,
1420 const char* sequence,
1421 const char* qualities,
1422 bool from_bowtie)
1423 {
1424 string seq;
1425 string quals;
1426 if (sequence) {
1427 seq = sequence;
1428 quals = qualities;
1429 seq.resize(bh.read_len());
1430 quals.resize(bh.read_len());
1431 }
1432 else {
1433 seq = "*";
1434 }
1435 if (qualities) {
1436 quals = qualities;
1437 quals.resize(bh.read_len());
1438 }
1439 else
1440 {
1441 quals = "*";
1442 }
1443
1444 uint32_t sam_flag = 0;
1445 if (bh.antisense_align())
1446 {
1447 sam_flag |= 0x0010; // BAM_FREVERSE
1448 if (sequence && !from_bowtie) // if it is from bowtie hit, it's already reversed.
1449 {
1450 reverse_complement(seq);
1451 reverse(quals.begin(), quals.end());
1452 }
1453 }
1454
1455 uint32_t sam_pos = bh.left() + 1;
1456 uint32_t map_quality = 255;
1457 char cigar[256];
1458 cigar[0] = 0;
1459 string mate_ref_name = "*";
1460 uint32_t mate_pos = 0;
1461 uint32_t insert_size = 0;
1462 //string qualities = "*";
1463
1464 const vector<CigarOp>& bh_cigar = bh.cigar();
1465 /*
1466 * In addition to calculating the cigar string,
1467 * we need to figure out how many in/dels are in the
1468 * sequence, so that we can give the correct
1469 * value for the NM tag
1470 */
1471 int indel_distance = 0;
1472 for (size_t c = 0; c < bh_cigar.size(); ++c)
1473 {
1474 char ibuf[64];
1475 sprintf(ibuf, "%d", bh_cigar[c].length);
1476 switch(bh_cigar[c].opcode)
1477 {
1478 case MATCH:
1479 strcat(cigar, ibuf);
1480 strcat(cigar, "M");
1481 break;
1482 case INS:
1483 strcat(cigar, ibuf);
1484 strcat(cigar, "I");
1485 indel_distance += bh_cigar[c].length;
1486 break;
1487 case DEL:
1488 strcat(cigar, ibuf);
1489 strcat(cigar, "D");
1490 indel_distance += bh_cigar[c].length;
1491 break;
1492 case REF_SKIP:
1493 strcat(cigar, ibuf);
1494 strcat(cigar, "N");
1495 break;
1496 default:
1497 break;
1498 }
1499 }
1500
1501 //string q = string(bh.read_len(), '!');
1502 //string s = string(bh.read_len(), 'N');
1503 /*
1504 fprintf(fout,
1505 "%s\t%d\t%s\t%d\t%d\t%s\t%s\t%d\t%d\t%s\t%s",
1506 read_name,
1507 sam_flag,
1508 ref_name,
1509 sam_pos,
1510 map_quality,
1511 cigar,
1512 mate_ref_name.c_str(),
1513 mate_pos,
1514 insert_size,
1515 seq.c_str(),
1516 quals.c_str());
1517
1518 fprintf(fout, "\tNM:i:%d", bh.edit_dist() + indel_distance);
1519 */
1520 vector<string> auxdata;
1521
1522 if (!sam_readgroup_id.empty())
1523 {
1524 string nm("RG:Z:");
1525 nm += sam_readgroup_id;
1526 auxdata.push_back(nm);
1527 }
1528
1529 string nm("NM:i:");
1530 str_appendInt(nm, bh.edit_dist() + indel_distance);
1531 auxdata.push_back(nm);
1532 bool containsSplice = false;
1533 for(vector<CigarOp>::const_iterator itr = bh.cigar().begin(); itr != bh.cigar().end(); ++itr)
1534 if(itr->opcode == REF_SKIP){
1535 containsSplice = true;
1536 break;
1537 }
1538
1539 if (containsSplice) {
1540 //fprintf(fout, "\tXS:A:%c", bh.antisense_splice() ? '-' : '+');
1541 nm="XS:A:";
1542 nm+=(char)(bh.antisense_splice() ? '-' : '+');
1543 auxdata.push_back(nm);
1544 }
1545
1546 GBamRecord *brec = wbam.new_record(read_name, sam_flag, ref_name, sam_pos, map_quality,
1547 cigar, mate_ref_name.c_str(), mate_pos,
1548 insert_size, seq.c_str(), quals.c_str(), &auxdata);
1549
1550
1551
1552 wbam.write(brec);
1553 delete brec;
1554 }
1555
1556 /**
1557 * Print a vector of cigar operations to a file.
1558 * @param bh_cigar A vector of CigarOps
1559 * @return a string representation of the cigar string
1560 */
1561 std::string print_cigar(vector<CigarOp>& bh_cigar){
1562 char cigar[256];
1563 cigar[0] = 0;
1564 for (size_t c = 0; c < bh_cigar.size(); ++c)
1565 {
1566 char ibuf[64];
1567 sprintf(ibuf, "%d", bh_cigar[c].length);
1568 switch(bh_cigar[c].opcode)
1569 {
1570 case MATCH:
1571 strcat(cigar, ibuf);
1572 strcat(cigar, "M");
1573 break;
1574 case INS:
1575 strcat(cigar, ibuf);
1576 strcat(cigar, "I");
1577 break;
1578 case DEL:
1579 strcat(cigar, ibuf);
1580 strcat(cigar, "D");
1581 break;
1582 case REF_SKIP:
1583 strcat(cigar, ibuf);
1584 strcat(cigar, "N");
1585 break;
1586 default:
1587 break;
1588 }
1589 }
1590 string result(cigar);
1591 return result;
1592 }
1593
1594 bool BowtieHit::check_editdist_consistency(const RefSequenceTable& rt)
1595 {
1596 RefSequenceTable::Sequence* ref_str = rt.get_seq(_ref_id);
1597 if (!ref_str)
1598 return false;
1599
1600 const seqan::Dna5String ref_seq = seqan::infix(*ref_str, _left, right());
1601
1602 size_t pos_seq = 0;
1603 size_t pos_ref = 0;
1604 size_t mismatch = 0;
1605 for (size_t i = 0; i < _cigar.size(); ++i)
1606 {
1607 CigarOp cigar = _cigar[i];
1608 switch(cigar.opcode)
1609 {
1610 case MATCH:
1611 {
1612 for (size_t j = 0; j < cigar.length; ++j)
1613 {
1614 if (_seq[pos_seq++] != ref_seq[pos_ref++])
1615 ++mismatch;
1616 }
1617 }
1618 break;
1619 case INS:
1620 {
1621 pos_seq += cigar.length;
1622 }
1623 break;
1624
1625 case DEL:
1626 case REF_SKIP:
1627 {
1628 pos_ref += cigar.length;
1629 }
1630 break;
1631
1632 default:
1633 break;
1634 }
1635 }
1636
1637 return mismatch == _edit_dist;
1638 }