ViewVC Help
View File | Revision Log | Show Annotations | View Changeset | Root Listing
root/gclib/tophat_cpp/bwt_map.cpp
Revision: 97
Committed: Thu Oct 6 19:45:36 2011 UTC (8 years, 8 months ago) by gpertea
File size: 52897 byte(s)
Log Message:
by default bam2fastx shows unmapped only

Line File contents
1 /*
2 * bwt_map.cpp
3 * TopHat
4 *
5 * Created by Cole Trapnell on 11/17/08.
6 * Copyright 2008 Cole Trapnell. All rights reserved.
7 *
8 */
9
10 #ifdef HAVE_CONFIG_H
11 #include <config.h>
12 #endif
13
14 #include <cassert>
15 #include <cstdlib>
16 #include <iostream>
17 #include <set>
18 #include <vector>
19 #include <cmath>
20
21 #include "common.h"
22 #include "bwt_map.h"
23 #include "tokenize.h"
24 #include "reads.h"
25
26 using namespace std;
27
28 void HitTable::add_hit(const BowtieHit& bh, bool check_uniqueness)
29 {
30 uint32_t reference_id = bh.ref_id();
31
32 pair<RefHits::iterator, bool> ret =
33 _hits_for_ref.insert(make_pair(reference_id, HitList()));
34 HitList& hl = ret.first->second;
35
36 if (check_uniqueness)
37 {
38 // Check uniqueness, in case we are adding spliced hits from
39 // several spliced alignment sources (e.g. de novo hashing + Bowtie
40 // against a user-supplied index). We don't want to count the same
41 // alignment twice if it happened to be found by more than one method
42 HitList::const_iterator lb = lower_bound(hl.begin(), hl.end(), bh, hit_insert_id_lt);
43 HitList::const_iterator ub = upper_bound(hl.begin(), hl.end(), bh, hit_insert_id_lt);
44 for (; lb != ub && lb != hl.end(); ++lb)
45 {
46 if (*lb == bh)
47 {
48 //fprintf(stderr, "Chucking duplicate read %d by identity\n", bh.insert_id());
49 return;
50 }
51
52 if (lb->insert_id() == bh.insert_id() &&
53 lb->ref_id() == bh.ref_id() &&
54 lb->antisense_align() == bh.antisense_align())
55 {
56 // If we get here, we may be looking at the same alignment
57 // However, spanning_reads may report a shorter, trimmed alignment
58 // so not all fields will be equal. If they just disagree on the
59 // ends, and don't indicate a different junction coord, the
60 // alignments are the same.
61
62 if ((lb->left() <= bh.left() && lb->right() >= bh.right()) ||
63 (bh.left() <= lb->left() && bh.right() >= lb->right()))
64 {
65 vector<pair<int,int> > lb_gaps, bh_gaps;
66 lb->gaps(lb_gaps);
67 bh.gaps(bh_gaps);
68 if (lb_gaps == bh_gaps)
69 {
70 // One alignment is contained in the other, they agree on
71 // where the gaps, if any, are, and they share an id
72 // => this is a redundant aligment, so toss it
73 //fprintf(stderr, "Chucking duplicate read %d by gap agreement\n", bh.insert_id());
74 return;
75 }
76 }
77 }
78 }
79 }
80 _total_hits++;
81 hl.push_back(bh);
82 }
83
84 bool hit_insert_id_lt(const BowtieHit& h1, const BowtieHit& h2)
85 {
86 return h1.insert_id() < h2.insert_id();
87 }
88
89 void LineHitFactory::openStream(HitStream& hs)
90 {
91 if (hs._hit_file==NULL && !hs._hit_file_name.empty()) {
92 //open the file for HitStream here
93 hs._hit_file=fopen(hs._hit_file_name.c_str(),"r");
94 if (hs._hit_file==NULL)
95 err_die("Error opening HitStream file %s\n",hs._hit_file_name.c_str());
96 return;
97 }
98 if (hs._fzpipe!=NULL) {
99 hs._hit_file=hs._fzpipe->file;
100 }
101 }
102
103 void LineHitFactory::rewind(HitStream& hs)
104 {
105 if (hs._fzpipe!=NULL) {
106 hs._fzpipe->rewind();
107 hs._hit_file=hs._fzpipe->file;
108 }
109 else if (hs._hit_file) ::rewind((FILE*)(hs._hit_file));
110 }
111
112 bool LineHitFactory::next_record(HitStream& hs, const char*& buf, size_t& buf_size) {
113 FILE* f=(FILE *)(hs._hit_file);
114 bool new_rec = (fgets(_hit_buf, _hit_buf_max_sz - 1, f)!=NULL);
115 if (!new_rec || feof(f)) {
116 hs._eof=true;
117 return false;
118 }
119 ++_line_num;
120 char* nl = strrchr(_hit_buf, '\n');
121 if (nl) *nl = 0;
122 buf = _hit_buf;
123 buf_size = _hit_buf_max_sz - 1;
124 return true;
125 }
126
127 void LineHitFactory::closeStream(HitStream& hs) {
128 if (hs._fzpipe!=NULL) {
129 hs._fzpipe->close();
130 return;
131 }
132 if (hs._hit_file!=NULL) {
133 fclose((FILE*)(hs._hit_file));
134 hs._hit_file=NULL;
135 }
136 }
137 void BAMHitFactory::openStream(HitStream& hs) {
138 if (hs._hit_file==NULL) {
139 if (hs._hit_file_name.empty())
140 //err_die("Error: invalid HitStream set for BAMHitFactory(file name missing)\n");
141 return; //invalid stream, could be just a place holder
142 //open the file here if not already open
143 string fext=getFext(hs._hit_file_name);
144 if (fext=="sam")
145 hs._hit_file = samopen(hs._hit_file_name.c_str(), "r", 0);
146 else
147 hs._hit_file = samopen(hs._hit_file_name.c_str(), "rb", 0);
148
149 samfile_t* sam_file=(samfile_t*)(hs._hit_file);
150
151 if (sam_file == NULL)
152 err_die("Error opening SAM file %s\n", hs._hit_file_name.c_str());
153 if (sam_file->header == NULL)
154 err_die("Error: no SAM header found for file %s\n", hs._hit_file_name.c_str());
155 memset(&_next_hit, 0, sizeof(_next_hit));
156 //_beginning = bgzf_tell(sam_file->x.bam);
157 _sam_header=sam_file->header;
158 if (inspect_header(hs) == false)
159 err_die("Error: invalid SAM header for file %s\n",
160 hs._hit_file_name.c_str());
161 }
162 }
163
164 void BAMHitFactory::closeStream(HitStream& hs) {
165 if (hs._hit_file) {
166 samclose((samfile_t*)(hs._hit_file));
167 }
168 hs._hit_file=NULL;
169 _sam_header=NULL;
170 }
171
172 void BAMHitFactory::rewind(HitStream& hs)
173 {
174 /*
175 if (_hit_file && ((samfile_t*)_hit_file)->x.bam)
176 {
177 bgzf_seek(((samfile_t*)_hit_file)->x.bam, _beginning, SEEK_SET);
178 _eof = false;
179 }
180 */
181 this->closeStream(hs);
182 this->openStream(hs);
183 }
184
185 string BAMHitFactory::hitfile_rec(HitStream& hs, const char* hit_buf) {
186 const bam1_t* bamrec=(const bam1_t*)hit_buf;
187 char* tamline=bam_format1(((samfile_t*)(hs._hit_file))->header, bamrec);
188 string sam_line(tamline);
189 free(tamline);
190 return sam_line;
191 }
192
193 bool BAMHitFactory::next_record(HitStream& hs, const char*& buf, size_t& buf_size) {
194 if (_next_hit.data) {
195 free(_next_hit.data);
196 _next_hit.data = NULL;
197 }
198 _sam_header=((samfile_t*)(hs._hit_file))->header; //needed by get_hit_from_buf later on
199 if (hs.eof() || !hs.ready()) return false;
200
201 //mark_curr_pos();
202
203 memset(&_next_hit, 0, sizeof(_next_hit));
204
205 int bytes_read = samread((samfile_t*)(hs._hit_file), &_next_hit);
206 if (bytes_read <= 0) {
207 hs._eof = true;
208 return false;
209 }
210 buf = (const char*)&_next_hit;
211 buf_size = bytes_read;
212 return true;
213 }
214
215
216 BowtieHit HitFactory::create_hit(const string& insert_name,
217 const string& ref_name,
218 int left,
219 const vector<CigarOp>& cigar,
220 bool antisense_aln,
221 bool antisense_splice,
222 unsigned char edit_dist,
223 unsigned char splice_mms,
224 bool end)
225 {
226 uint64_t insert_id = _insert_table.get_id(insert_name);
227 uint32_t reference_id = _ref_table.get_id(ref_name, NULL, 0);
228
229 return BowtieHit(reference_id,
230 insert_id,
231 left,
232 cigar,
233 antisense_aln,
234 antisense_splice,
235 edit_dist,
236 splice_mms,
237 end);
238 }
239
240 BowtieHit HitFactory::create_hit(const string& insert_name,
241 const string& ref_name,
242 uint32_t left,
243 uint32_t read_len,
244 bool antisense_aln,
245 unsigned char edit_dist,
246 bool end)
247 {
248 uint64_t insert_id = _insert_table.get_id(insert_name);
249 uint32_t reference_id = _ref_table.get_id(ref_name, NULL, 0);
250
251 return BowtieHit(reference_id,
252 insert_id,
253 left,
254 read_len,
255 antisense_aln,
256 edit_dist,
257 end);
258 }
259
260 bool BowtieHitFactory::get_hit_from_buf(const char* orig_bwt_buf,
261 BowtieHit& bh,
262 bool strip_slash,
263 char* name_out,
264 char* name_tags,
265 char* seq,
266 char* qual)
267 {
268 if (!orig_bwt_buf || !*orig_bwt_buf)
269 return false;
270
271 static const int buf_size = 2048;
272
273 char bwt_buf[buf_size];
274 strcpy(bwt_buf, orig_bwt_buf);
275 //const char* bwt_fmt_str = "%s %c %s %d %s %s %d %s";
276
277 char orientation;
278
279 //int bwtf_ret = 0;
280 //uint32_t seqid = 0;
281
282 char text_name[buf_size];
283 unsigned int text_offset;
284
285
286 //unsigned int other_occs;
287 char mismatches[buf_size];
288 //memset(mismatches, 0, sizeof(mismatches));
289
290 const char* buf = bwt_buf;
291 char* name = get_token((char**)&buf,"\t");
292 char* orientation_str = get_token((char**)&buf,"\t");
293 char* text_name_str = get_token((char**)&buf,"\t");
294
295 strcpy(text_name, text_name_str);
296
297 char* text_offset_str = get_token((char**)&buf,"\t");
298 char* seq_str = get_token((char**)&buf,"\t");
299 if (seq)
300 strcpy(seq, seq_str);
301
302 const char* qual_str = get_token((char**)&buf,"\t");
303 if (qual)
304 strcpy(qual, qual_str);
305
306 /*const char* other_occs_str =*/ get_token((char**)&buf, "\t");
307 mismatches[0] = 0;
308 char* mismatches_str = get_token((char**)&buf, "\t");
309 if (mismatches_str)
310 strcpy(mismatches, mismatches_str);
311
312 orientation = orientation_str[0];
313 text_offset = atoi(text_offset_str);
314
315 bool end = true;
316 unsigned int seg_offset = 0;
317 unsigned int seg_num = 0;
318 unsigned int num_segs = 0;
319
320 // Copy the tag out of the name field before we might wipe it out
321 char* pipe = strrchr(name, '|');
322 if (pipe)
323 {
324 if (name_tags)
325 strcpy(name_tags, pipe);
326
327 char* tag_buf = pipe + 1;
328 if (strchr(tag_buf, ':'))
329 {
330 sscanf(tag_buf, "%u:%u:%u", &seg_offset, &seg_num, &num_segs);
331 if (seg_num + 1 == num_segs)
332 end = true;
333 else
334 end = false;
335 }
336
337 *pipe = 0;
338 }
339 // Stripping the slash and number following it gives the insert name
340 char* slash = strrchr(name, '/');
341 if (strip_slash && slash)
342 *slash = 0;
343
344 size_t read_len = strlen(seq_str);
345
346 // Add this alignment to the table of hits for this half of the
347 // Bowtie map
348
349 //vector<string> mismatch_toks;
350 char* pch = strtok (mismatches,",");
351 unsigned char num_mismatches = 0;
352 while (pch != NULL)
353 {
354 char* colon = strchr(pch, ':');
355 if (colon)
356 {
357 num_mismatches++;
358 }
359 //mismatch_toks.push_back(pch);
360 pch = strtok (NULL, ",");
361 }
362
363 bh = create_hit(name,
364 text_name,
365 text_offset,
366 (int)read_len,
367 orientation == '-',
368 num_mismatches,
369 end);
370
371 return true;
372 }
373
374 int anchor_mismatch = 0;
375
376
377 void parseSegReadName(char* name, char*& name_tags, bool strip_slash,
378 bool &end, unsigned int &seg_offset, unsigned int& seg_num,
379 unsigned int & num_segs) {
380 char* pipe = strrchr(name, '|');
381 if (pipe)
382 {
383 if (name_tags)
384 strcpy(name_tags, pipe);
385
386 char* tag_buf = pipe + 1;
387 if (strchr(tag_buf, ':'))
388 {
389 sscanf(tag_buf, "%u:%u:%u", &seg_offset, &seg_num, &num_segs);
390 if (seg_num + 1 == num_segs)
391 end = true;
392 else
393 end = false;
394 }
395
396 *pipe = 0;
397 }
398
399 // Stripping the slash and number following it gives the insert name
400 char* slash = strrchr(name, '/');
401 if (strip_slash && slash)
402 *slash = 0;
403 }
404
405
406 bool SplicedBowtieHitFactory::get_hit_from_buf(const char* orig_bwt_buf,
407 BowtieHit& bh,
408 bool strip_slash,
409 char* name_out,
410 char* name_tags,
411 char* seq,
412 char* qual)
413 {
414 if (!orig_bwt_buf || !*orig_bwt_buf)
415 return false;
416
417 static const int buf_size = 2048;
418
419 char bwt_buf[buf_size];
420 strcpy(bwt_buf, orig_bwt_buf);
421 //const char* bwt_fmt_str = "%s %c %s %d %s %s %d %s";
422
423 char orientation;
424 char text_name[buf_size];
425 unsigned int text_offset;
426 char mismatches[buf_size];
427 //memset(mismatches, 0, sizeof(mismatches));
428
429 char* buf = bwt_buf;
430 char* name = get_token((char**)&buf,"\t");
431 char* orientation_str = get_token((char**)&buf,"\t");
432 char* text_name_str = get_token((char**)&buf,"\t");
433 strcpy(text_name, text_name_str);
434
435 char* text_offset_str = get_token((char**)&buf,"\t");
436 char* seq_str = get_token((char**)&buf,"\t");
437 if (seq)
438 strcpy(seq, seq_str);
439
440 const char* qual_str = get_token((char**)&buf,"\t");
441 if (qual)
442 strcpy(qual, qual_str);
443
444 /*const char* other_occs_str =*/ get_token((char**)&buf, "\t");
445 mismatches[0] = 0;
446 char* mismatches_str = get_token((char**)&buf, "\t");
447 if (mismatches_str)
448 strcpy(mismatches, mismatches_str);
449
450 orientation = orientation_str[0];
451 text_offset = atoi(text_offset_str);
452
453 bool end = true;
454 unsigned int seg_offset = 0;
455 unsigned int seg_num = 0;
456 unsigned int num_segs = 0;
457
458 // Copy the tag out of the name field before we might wipe it out
459 parseSegReadName(name, name_tags, strip_slash, end, seg_offset, seg_num, num_segs);
460
461 // Add this alignment to the table of hits for this half of the
462 // Bowtie map
463
464 // Parse the text_name field to recover the splice coords
465 vector<string> toks;
466
467 tokenize_strict(text_name, "|", toks);
468
469 int num_extra_toks = (int)toks.size() - 6;
470
471 if (num_extra_toks >= 0)
472 {
473 static const uint8_t left_window_edge_field = 1;
474 static const uint8_t splice_field = 2;
475 //static const uint8_t right_window_edge_field = 3;
476 static const uint8_t junction_type_field = 4;
477 static const uint8_t strand_field = 5;
478
479 string contig = toks[0];
480 for (int t = 1; t <= num_extra_toks; ++t)
481 {
482 contig += "|";
483 contig += toks[t];
484 }
485
486 vector<string> splice_toks;
487 tokenize(toks[num_extra_toks + splice_field], "-", splice_toks);
488 if (splice_toks.size() != 2)
489 {
490 fprintf(stderr, "Warning: found malformed splice record, skipping:\n");
491 //fprintf(stderr, "%s (token: %s)\n", text_name,
492 // toks[num_extra_toks + splice_field].c_str());
493 return false;
494 }
495
496 //
497 // check for an insertion hit
498 //
499 if(toks[num_extra_toks + junction_type_field] == "ins")
500 {
501 int8_t spliced_read_len = strlen(seq_str);
502 /*
503 * The 0-based position of the left edge of the alignment. Note that this
504 * value may need to be futher corrected to account for the presence of
505 * of the insertion.
506 */
507 uint32_t left = atoi(toks[num_extra_toks + left_window_edge_field].c_str()) + text_offset;
508 uint32_t right = left + spliced_read_len - 1;
509
510 /*
511 * The 0-based position of the last genomic sequence before the insertion
512 */
513 uint32_t left_splice_pos = atoi(splice_toks[0].c_str());
514
515 string insertedSequence = splice_toks[1];
516 /*
517 * The 0-based position of the first genomic sequence after teh insertion
518 */
519 uint32_t right_splice_pos = left_splice_pos + 1;
520 if(left > left_splice_pos){
521 /*
522 * The genomic position of the left edge of the alignment needs to be corrected
523 * If the alignment does not extend into the insertion, simply subtract the length
524 * of the inserted sequence, otherwise, just set it equal to the right edge
525 */
526 left = left - insertedSequence.length();
527 if(left < right_splice_pos){
528 left = right_splice_pos;
529 }
530 }
531 if(right > left_splice_pos){
532 right = right - insertedSequence.length();
533 if(right < left_splice_pos){
534 right = left_splice_pos;
535 }
536 }
537 /*
538 * Now, right and left should be properly transformed into genomic coordinates
539 * We should be able to deduce how much the alignment matches the insertion
540 * simply based on the length of the read
541 */
542 int left_match_length = 0;
543 if(left <= left_splice_pos){
544 left_match_length = left_splice_pos - left + 1;
545 }
546 int right_match_length = 0;
547 if(right >= right_splice_pos){
548 right_match_length = right - right_splice_pos + 1;
549 }
550 int insertion_match_length = spliced_read_len - left_match_length - right_match_length;
551
552 if(left_match_length <= 0 || right_match_length <= 0 || insertion_match_length <= 0)
553 return false;
554
555 string junction_strand = toks[num_extra_toks + strand_field];
556 if(junction_strand != "rev" && junction_strand != "fwd"){
557 fprintf(stderr, "Malformed insertion record\n");
558 return false;
559 }
560
561 char* pch = strtok( mismatches, ",");
562 unsigned char num_mismatches = 0;
563
564 /*
565 * remember that text_offset holds the left end of the
566 * alignment, relative to the start of the contig
567 */
568
569 /*
570 * The 0-based relative position of the left-most character
571 * before the insertion in the contig
572 */
573 int relative_splice_pos = left_splice_pos - atoi(toks[num_extra_toks + left_window_edge_field].c_str());
574 while (pch != NULL)
575 {
576 char* colon = strchr(pch, ':');
577 if (colon)
578 {
579 *colon = 0;
580 int mismatch_pos = atoi(pch);
581
582 /*
583 * for reversely mapped reads,
584 * find the correct mismatched position.
585 */
586 if(orientation == '-'){
587 mismatch_pos = spliced_read_len - mismatch_pos - 1;
588 }
589
590 /*
591 * Only count mismatches outside of the insertion region
592 * If there is a mismatch within the insertion,
593 * disallow this hit
594 */
595 if(mismatch_pos + text_offset <= relative_splice_pos || mismatch_pos + text_offset > relative_splice_pos + insertedSequence.length()){
596 num_mismatches++;
597 }else{
598 return false;
599 }
600 }
601 pch = strtok (NULL, ",");
602 }
603
604
605 vector<CigarOp> cigar;
606 cigar.push_back(CigarOp(MATCH, left_match_length));
607 cigar.push_back(CigarOp(INS, insertion_match_length));
608 cigar.push_back(CigarOp(MATCH, right_match_length));
609
610 /*
611 * For now, disallow hits that don't span
612 * the insertion. If we allow these types of hits,
613 * then long_spanning.cpp needs to be updated
614 * in order to intelligently merge these kinds
615 * of reads back together
616 *
617 * Following code has been changed to allow segment that end
618 * in an insertion
619 */
620 bh = create_hit(name,
621 contig,
622 left,
623 cigar,
624 orientation == '-',
625 junction_strand == "rev",
626 num_mismatches,
627 0,
628 end);
629 return true;
630 }
631
632 else
633 {
634 uint32_t left = atoi(toks[num_extra_toks + left_window_edge_field].c_str()) + text_offset;
635 int spliced_read_len = strlen(seq_str);
636 int8_t left_splice_pos = atoi(splice_toks[0].c_str()) - left + 1;
637 if(left_splice_pos > spliced_read_len) left_splice_pos = spliced_read_len;
638 int8_t right_splice_pos = spliced_read_len - left_splice_pos;
639
640 if (right_splice_pos <= 0 || left_splice_pos <= 0)
641 return false;
642
643 if (orientation == '+')
644 {
645 if (left_splice_pos + seg_offset < _anchor_length){
646 return false;
647 }
648 }
649 else
650 {
651 if (right_splice_pos + seg_offset < _anchor_length)
652 return false;
653 }
654 //uint32_t right = atoi(splice_toks[1].c_str()) + right_splice_pos;
655 //atoi(toks[right_window_edge_field].c_str());
656 int gap_len = atoi(splice_toks[1].c_str()) - atoi(splice_toks[0].c_str()) - 1;
657
658 string junction_strand = toks[num_extra_toks + strand_field];
659 if (!(junction_strand == "rev" || junction_strand == "fwd")||
660 !(orientation == '-' || orientation == '+'))
661 {
662 fprintf(stderr, "Warning: found malformed splice record, skipping\n");
663 //fprintf(stderr, "junction_strand=%s, orientation='%c'\n",
664 // junction_strand.c_str(), orientation);
665 return false;
666 }
667
668 //vector<string> mismatch_toks;
669 char* pch = strtok (mismatches,",");
670 int mismatches_in_anchor = 0;
671 unsigned char num_mismatches = 0;
672 while (pch != NULL)
673 {
674 char* colon = strchr(pch, ':');
675 if (colon)
676 {
677 *colon = 0;
678 num_mismatches++;
679 int mismatch_pos = atoi(pch);
680 if ((orientation == '+' && abs(mismatch_pos - left_splice_pos) < (int)min_anchor_len) ||
681 (orientation == '-' && abs(((int)spliced_read_len - left_splice_pos + 1) - mismatch_pos)) < (int)min_anchor_len)
682 mismatches_in_anchor++;
683 }
684 //mismatch_toks.push_back(pch);
685 pch = strtok (NULL, ",");
686 }
687
688 // FIXME: we probably should exclude these hits somewhere, but this
689 // isn't the right place
690 vector<CigarOp> cigar;
691 cigar.push_back(CigarOp(MATCH, left_splice_pos));
692 if(toks[num_extra_toks + junction_type_field] == "del"){
693 cigar.push_back(CigarOp(DEL, gap_len));
694 }else{
695 cigar.push_back(CigarOp(REF_SKIP, gap_len));
696 }
697 cigar.push_back(CigarOp(MATCH, right_splice_pos));
698
699 bh = create_hit(name,
700 contig,
701 left,
702 cigar,
703 orientation == '-',
704 junction_strand == "rev",
705 num_mismatches,
706 mismatches_in_anchor,
707 end);
708 return true;
709 }
710 }
711 else
712 {
713 fprintf(stderr, "Warning: found malformed splice record, skipping\n");
714 //fprintf(stderr, "%s\n", orig_bwt_buf);
715 // continue;
716 return false;
717 }
718
719 return false;
720 }
721
722
723 char parseCigar(vector<CigarOp>& cigar, const char* cigar_str,
724 bool &spliced_alignment) {
725 // Mostly pilfered direct from the SAM tools:
726 const char* p_cig = cigar_str;
727 while (*p_cig)
728 {
729 char* t;
730 int length = (int)strtol(p_cig, &t, 10);
731 if (length <= 0)
732 {
733 //fprintf (stderr, "CIGAR op has zero length\n");
734 return false;
735 }
736 char op_char = toupper(*t);
737 CigarOpCode opcode;
738 if (op_char == 'M') opcode = MATCH;
739 else if (op_char == 'I') opcode = INS;
740 else if (op_char == 'D') opcode = DEL;
741 else if (op_char == 'N')
742 {
743 if (length > max_report_intron_length)
744 return false;
745 opcode = REF_SKIP;
746 spliced_alignment = true;
747 }
748 else if (op_char == 'S') opcode = SOFT_CLIP;
749 else if (op_char == 'H') opcode = HARD_CLIP;
750 else if (op_char == 'P') opcode = PAD;
751 else
752 {
753 fprintf (stderr, "invalid CIGAR operation\n");
754 return false;
755 }
756 p_cig = t + 1;
757 //i += length;
758 cigar.push_back(CigarOp(opcode, length));
759 }
760 if (*p_cig) {
761 fprintf (stderr, "unmatched CIGAR operation\n");
762 return *p_cig;
763 }
764 return 0;
765 }
766
767 void getSAMmismatches(char* &buf, vector<CigarOp>& cigar,
768 int* mismatches, int& num_mismatches, int& sam_nm, bool& antisense_splice) {
769 const char* tag_buf = buf;
770 sam_nm=0;
771 num_mismatches=0;
772 while((tag_buf = get_token((char**)&buf,"\t")))
773 {
774 vector<string> tuple_fields;
775 tokenize(tag_buf,":", tuple_fields);
776 if (tuple_fields.size() == 3)
777 {
778 if (tuple_fields[0] == "XS")
779 {
780 if (tuple_fields[2] == "-")
781 antisense_splice = true;
782 }
783 else if (tuple_fields[0] == "NM")
784 {
785 sam_nm = atoi(tuple_fields[2].c_str());
786 }
787 else if (tuple_fields[0] == "NS")
788 {
789 //ignored for now
790 }
791 else if (tuple_fields[0] == "MD")
792 {
793 const char* p=tuple_fields[2].c_str();
794 int bi=0; //base offset position in the read
795 while (*p != 0) {
796 if (isdigit(*p)) {
797 int v=atoi(p);
798 do { p++; } while (isdigit(*p));
799 bi+=v;
800 }
801 while (isupper(*p)) {
802 p++;
803 mismatches[num_mismatches]=bi;
804 num_mismatches++;
805 bi++;
806 }
807 if (*p=='^') { //reference deletion
808 p++;
809 while (isupper(*p)) { //insert read bases
810 p++; bi++;
811 }
812 }
813 }
814 }
815 else
816 {
817 //fprintf(stderr, "%s attribute not supported\n", tuple_fields[0].c_str());
818 //return false;
819 }
820 }
821 }
822 /* By convention,the NM field of the SAM record
823 * counts an insertion or deletion. I dont' think
824 * we want the mismatch count in the BowtieHit
825 * record to reflect this. Therefore, subtract out
826 * the mismatches due to in/dels
827 */
828 for(vector<CigarOp>::const_iterator itr = cigar.begin(); itr != cigar.end(); ++itr){
829 if(itr->opcode == INS){
830 sam_nm -= itr->length;
831 }
832 if(itr->opcode == DEL){
833 sam_nm -= itr->length;
834 }
835 }
836 }
837
838 bool SAMHitFactory::get_hit_from_buf(const char* orig_bwt_buf,
839 BowtieHit& bh,
840 bool strip_slash,
841 char* name_out,
842 char* name_tags,
843 char* seq,
844 char* qual)
845 {
846 if (!orig_bwt_buf || !*orig_bwt_buf)
847 return false;
848 char bwt_buf[2048];
849 strcpy(bwt_buf, orig_bwt_buf);
850 // Are we still in the header region?
851 if (bwt_buf[0] == '@')
852 return false;
853
854 char* buf = bwt_buf;
855 char* name = get_token((char**)&buf,"\t");
856 char* sam_flag_str = get_token((char**)&buf,"\t");
857 char* text_name = get_token((char**)&buf,"\t");
858 char* text_offset_str = get_token((char**)&buf,"\t");
859 const char* map_qual_str = get_token((char**)&buf,"\t");
860 char* cigar_str = get_token((char**)&buf,"\t");
861 const char* mate_ref_str = get_token((char**)&buf,"\t");
862 const char* mate_pos_str = get_token((char**)&buf,"\t");
863 const char* inferred_insert_sz_str = get_token((char**)&buf,"\t");
864
865 const char* seq_str = get_token((char**)&buf,"\t");
866 if (seq)
867 strcpy(seq, seq_str);
868
869 const char* qual_str = get_token((char**)&buf,"\t");
870 if (qual)
871 strcpy(qual, qual_str);
872
873 if (!name ||
874 !sam_flag_str ||
875 !text_name ||
876 !text_offset_str ||
877 !map_qual_str ||
878 !cigar_str ||
879 !mate_ref_str ||
880 !mate_pos_str ||
881 !inferred_insert_sz_str ||
882 !seq_str ||
883 !qual_str)
884 {
885 // truncated or malformed SAM record
886 return false;
887 }
888
889
890 int sam_flag = atoi(sam_flag_str);
891 int text_offset = atoi(text_offset_str);
892
893 bool end = true;
894 unsigned int seg_offset = 0;
895 unsigned int seg_num = 0;
896 unsigned int num_segs = 0;
897
898 // Copy the tag out of the name field before we might wipe it out
899 parseSegReadName(name, name_tags, strip_slash, end, seg_offset, seg_num, num_segs);
900
901 vector<CigarOp> cigar;
902 bool spliced_alignment = false;
903 if (parseCigar(cigar, cigar_str, spliced_alignment)) {
904 return false;
905 }
906
907 //vector<string> attributes;
908 //tokenize(tag_buf, " \t",attributes);
909
910 bool antisense_splice = false;
911 int num_mismatches = 0;
912 int sam_nm = 0; //the value of the NM tag (edit distance)
913 int mismatches[1024];
914 getSAMmismatches(buf, cigar, mismatches, num_mismatches, sam_nm, antisense_splice);
915
916 if (spliced_alignment)
917 {
918 bh = create_hit(name,
919 text_name,
920 text_offset - 1,
921 cigar,
922 sam_flag & 0x0010,
923 antisense_splice,
924 num_mismatches,
925 0,
926 end);
927 }
928 else
929 {
930 //assert(cigar.size() == 1 && cigar[0].opcode == MATCH);
931 bh = create_hit(name,
932 text_name,
933 text_offset - 1, // SAM files are 1-indexed
934 cigar,
935 sam_flag & 0x0010,
936 false,
937 num_mismatches,
938 0,
939 end);
940 }
941 return true;
942 }
943
944
945 void spliceCigar(vector<CigarOp>& splcigar, vector<CigarOp>& cigar,
946 int8_t l_len, int mlen, int8_t r_len, CigarOpCode opcode) {
947 //merge the original 'cigar' with the new insert/gap operation
948 // at position l_len into splcigar;
949 //find the original opcode that croses l_len location
950 int ofs=0;
951 bool found=false;
952 for (size_t c = 0 ; c < cigar.size(); ++c)
953 {
954 ofs+=cigar[c].length;
955 if (!found) {
956 if (ofs>=l_len) {
957 found=true;
958 //TODO:inject new code here, splitting existing code if necessary
959
960 continue;
961 }
962 else {
963 //not found yet, just copy these codes
964 splcigar.push_back(cigar[c]);
965 }
966 }
967 else { //found already
968 //TODO: check this
969 splcigar.push_back(cigar[c]);
970 }
971 }
972 }
973
974 bool SplicedSAMHitFactory::get_hit_from_buf(const char* orig_bwt_buf,
975 BowtieHit& bh,
976 bool strip_slash,
977 char* name_out,
978 char* name_tags,
979 char* seq,
980 char* qual)
981 {
982 if (!orig_bwt_buf || !*orig_bwt_buf)
983 return false;
984
985 char bwt_buf[2048];
986 strcpy(bwt_buf, orig_bwt_buf);
987 // Are we still in the header region?
988 if (bwt_buf[0] == '@')
989 return false;
990
991 char* buf = bwt_buf;
992 char* name = get_token((char**)&buf,"\t");
993 char* sam_flag_str = get_token((char**)&buf,"\t");
994 char* text_name = get_token((char**)&buf,"\t");
995 char* text_offset_str = get_token((char**)&buf,"\t");
996 const char* map_qual_str = get_token((char**)&buf,"\t");
997 char* cigar_str = get_token((char**)&buf,"\t");
998 const char* mate_ref_str = get_token((char**)&buf,"\t");
999 const char* mate_pos_str = get_token((char**)&buf,"\t");
1000 const char* inferred_insert_sz_str = get_token((char**)&buf,"\t");
1001 int num_mismatches=0;
1002 int mismatches[1024]; //list of 0-based mismatch positions in this read
1003 //parsed from SAM's MD:Z: tag
1004 const char* seq_str = get_token((char**)&buf,"\t");
1005 if (seq)
1006 strcpy(seq, seq_str);
1007
1008 const char* qual_str = get_token((char**)&buf,"\t");
1009 if (qual)
1010 strcpy(qual, qual_str);
1011
1012 if (!name ||
1013 !sam_flag_str ||
1014 !text_name ||
1015 !text_offset_str ||
1016 !map_qual_str ||
1017 !cigar_str ||
1018 !mate_ref_str ||
1019 !mate_pos_str ||
1020 !inferred_insert_sz_str ||
1021 !seq_str ||
1022 !qual_str)
1023 {
1024 // truncated or malformed SAM record
1025 return false;
1026 }
1027
1028
1029 int sam_flag = atoi(sam_flag_str);
1030 int text_offset = atoi(text_offset_str);
1031 text_offset--; //SAM is 1-based, Bowtie is 0-based
1032 bool end = true;
1033 unsigned int seg_offset = 0;
1034 unsigned int seg_num = 0;
1035 unsigned int num_segs = 0;
1036
1037 // Copy the tag out of the name field before we might wipe it out
1038 parseSegReadName(name, name_tags, strip_slash, end, seg_offset, seg_num, num_segs);
1039
1040 vector<CigarOp> samcigar;
1041 bool spliced_alignment = false;
1042 if (parseCigar(samcigar, cigar_str, spliced_alignment)) {
1043 return false;
1044 }
1045
1046 bool antisense_splice = false;
1047 int sam_nm = 0;
1048 getSAMmismatches(buf, samcigar, mismatches, num_mismatches, sam_nm, antisense_splice);
1049
1050 //##############################################
1051
1052 // Add this alignment to the table of hits for this half of the
1053 // Bowtie map
1054
1055 // Parse the text_name field to recover the splice coords
1056 vector<string> toks;
1057
1058 tokenize_strict(text_name, "|", toks);
1059
1060 int num_extra_toks = (int)toks.size() - 6;
1061
1062 if (num_extra_toks >= 0)
1063 {
1064 static const uint8_t left_window_edge_field = 1;
1065 static const uint8_t splice_field = 2;
1066 //static const uint8_t right_window_edge_field = 3;
1067 static const uint8_t junction_type_field = 4;
1068 static const uint8_t strand_field = 5;
1069
1070 string contig = toks[0];
1071 for (int t = 1; t <= num_extra_toks; ++t)
1072 {
1073 contig += "|";
1074 contig += toks[t];
1075 }
1076
1077 vector<string> splice_toks;
1078 tokenize(toks[num_extra_toks + splice_field], "-", splice_toks);
1079 if (splice_toks.size() != 2)
1080 {
1081 fprintf(stderr, "Warning: found malformed splice record, skipping:\n");
1082 //fprintf(stderr, "\t%s (token: %s)\n", text_name,
1083 // toks[num_extra_toks + splice_field].c_str());
1084 return false;
1085 }
1086 int spl_num_mismatches=0;
1087 //
1088 // check for an insertion hit
1089 //
1090 if(toks[num_extra_toks + junction_type_field] == "ins")
1091 {
1092 int8_t spliced_read_len = strlen(seq_str);
1093 // The 0-based position of the left edge of the alignment. Note that this
1094 // value may need to be futher corrected to account for the presence of
1095 // of the insertion.
1096 uint32_t left = atoi(toks[num_extra_toks + left_window_edge_field].c_str()) + text_offset;
1097 uint32_t right = left + spliced_read_len - 1;
1098
1099 // The 0-based position of the last genomic sequence before the insertion
1100 uint32_t left_splice_pos = atoi(splice_toks[0].c_str());
1101
1102 string insertedSequence = splice_toks[1];
1103 // The 0-based position of the first genomic sequence after the insertion
1104
1105 uint32_t right_splice_pos = left_splice_pos + 1;
1106 if(left > left_splice_pos){
1107 /*
1108 * The genomic position of the left edge of the alignment needs to be corrected
1109 * If the alignment does not extend into the insertion, simply subtract the length
1110 * of the inserted sequence, otherwise, just set it equal to the right edge
1111 */
1112 left = left - insertedSequence.length();
1113 if(left < right_splice_pos){
1114 left = right_splice_pos;
1115 }
1116 }
1117 if(right > left_splice_pos){
1118 right = right - insertedSequence.length();
1119 if(right < left_splice_pos){
1120 right = left_splice_pos;
1121 }
1122 }
1123 /*
1124 * Now, right and left should be properly transformed into genomic coordinates
1125 * We should be able to deduce how much the alignment matches the insertion
1126 * simply based on the length of the read
1127 */
1128 int left_match_length = 0;
1129 if(left <= left_splice_pos){
1130 left_match_length = left_splice_pos - left + 1;
1131 }
1132 int right_match_length = 0;
1133 if(right >= right_splice_pos){
1134 right_match_length = right - right_splice_pos + 1;
1135 }
1136 int insertion_match_length = spliced_read_len - left_match_length - right_match_length;
1137
1138 if(left_match_length <= 0 || right_match_length <= 0 || insertion_match_length <= 0)
1139 return false;
1140
1141 string junction_strand = toks[num_extra_toks + strand_field];
1142 if(junction_strand != "rev" && junction_strand != "fwd"){
1143 fprintf(stderr, "Malformed insertion record\n");
1144 return false;
1145 }
1146
1147 //char* pch = strtok( mismatches, ",");
1148 //unsigned char num_mismatches = 0;
1149 //text_offset holds the left end of the alignment,
1150 //RELATIVE TO the start of the contig
1151
1152 //The 0-based relative position of the left-most character
1153 //before the insertion in the contig
1154 int relative_splice_pos = left_splice_pos - atoi(toks[num_extra_toks + left_window_edge_field].c_str());
1155 for (int i=0;i<num_mismatches;i++) {
1156 int mismatch_pos = mismatches[i];
1157 // for reversely mapped reads,
1158 //find the correct mismatched position.
1159 if (sam_flag & 0x0010){
1160 mismatch_pos = spliced_read_len - mismatch_pos - 1;
1161 }
1162
1163 //Only count mismatches outside of the insertion region
1164 // If there is a mismatch within the insertion,
1165 // disallow this hit
1166 if(mismatch_pos + text_offset <= relative_splice_pos || mismatch_pos + text_offset > relative_splice_pos + insertedSequence.length()){
1167 spl_num_mismatches++;
1168 }else{
1169 return false;
1170 }
1171 }
1172
1173 //TODO: merge with the original cigar string
1174 vector<CigarOp> splcigar;
1175 spliceCigar(splcigar, samcigar, left_match_length, insertion_match_length, right_match_length, INS);
1176 /*
1177 splcigar.push_back(CigarOp(MATCH, left_match_length));
1178 splcigar.push_back(CigarOp(INS, insertion_match_length));
1179 splcigar.push_back(CigarOp(MATCH, right_match_length));
1180 */
1181 /*
1182 * For now, disallow hits that don't span
1183 * the insertion. If we allow these types of hits,
1184 * then long_spanning.cpp needs to be updated
1185 * in order to intelligently merge these kinds
1186 * of reads back together
1187 *
1188 * Following code has been changed to allow segment that end
1189 * in an insertion
1190 */
1191 bh = create_hit(name,
1192 contig,
1193 left,
1194 //splcigar,
1195 splcigar,
1196 sam_flag & 0x0010,
1197 junction_strand == "rev",
1198 spl_num_mismatches,
1199 0,
1200 end);
1201 return true;
1202 } //"ins"
1203 else
1204 {
1205 uint32_t left = atoi(toks[num_extra_toks + left_window_edge_field].c_str()) + text_offset;
1206 int spliced_read_len = strlen(seq_str);
1207 int8_t left_splice_pos = atoi(splice_toks[0].c_str()) - left + 1;
1208 if(left_splice_pos > spliced_read_len) left_splice_pos = spliced_read_len;
1209 int8_t right_splice_pos = spliced_read_len - left_splice_pos;
1210
1211 if (right_splice_pos <= 0 || left_splice_pos <= 0)
1212 return false;
1213
1214 if ((sam_flag & 0x0010) == 0) //######
1215 {
1216 if (left_splice_pos + seg_offset < _anchor_length)
1217 return false;
1218 }
1219 else
1220 {
1221 if (right_splice_pos + seg_offset < _anchor_length)
1222 return false;
1223 }
1224 //uint32_t right = atoi(splice_toks[1].c_str()) + right_splice_pos;
1225 //atoi(toks[right_window_edge_field].c_str());
1226 int gap_len = atoi(splice_toks[1].c_str()) - atoi(splice_toks[0].c_str()) - 1;
1227
1228 string junction_strand = toks[num_extra_toks + strand_field];
1229 if (!(junction_strand == "rev" || junction_strand == "fwd"))
1230 // || !(orientation == '-' || orientation == '+'))
1231 {
1232 fprintf(stderr, "Warning: found malformed splice record, skipping\n");
1233 //fprintf(stderr, "junction_strand=%s, orientation='%c'\n",
1234 // junction_strand.c_str(), orientation);
1235 return false;
1236 }
1237
1238 int mismatches_in_anchor = 0;
1239 for (int i=0;i<num_mismatches;i++) {
1240 spl_num_mismatches++;
1241 int mismatch_pos = mismatches[i];
1242 if (((sam_flag & 0x0010) == 0 && abs(mismatch_pos - left_splice_pos) < (int)min_anchor_len) ||
1243 ((sam_flag & 0x0010) != 0 &&
1244 abs(((int)spliced_read_len - left_splice_pos + 1) - mismatch_pos)) < (int)min_anchor_len)
1245 mismatches_in_anchor++;
1246 }
1247
1248 // FIXME: we probably should exclude these hits somewhere, but this
1249 // isn't the right place
1250 vector<CigarOp> splcigar;
1251 CigarOpCode opcode=(toks[num_extra_toks + junction_type_field] == "del")? DEL : REF_SKIP ;
1252 spliceCigar(splcigar, samcigar, left_splice_pos, gap_len, right_splice_pos, INS);
1253 /*
1254 splcigar.push_back(CigarOp(MATCH, left_splice_pos));
1255 if(toks[num_extra_toks + junction_type_field] == "del"){
1256 splcigar.push_back(CigarOp(DEL, gap_len));
1257 }else{
1258 splcigar.push_back(CigarOp(REF_SKIP, gap_len));
1259 }
1260 splcigar.push_back(CigarOp(MATCH, right_splice_pos));
1261 */
1262 bh = create_hit(name,
1263 contig,
1264 left,
1265 splcigar,
1266 (sam_flag & 0x0010),
1267 junction_strand == "rev",
1268 spl_num_mismatches,
1269 mismatches_in_anchor,
1270 end);
1271 return true;
1272 }
1273 } //parse splice data
1274 else
1275 {
1276 fprintf(stderr, "Warning: found malformed splice record, skipping\n");
1277 //fprintf(stderr, "%s\n", orig_bwt_buf);
1278 // continue;
1279 return false;
1280 }
1281
1282 return false;
1283 }
1284
1285
1286 bool BAMHitFactory::get_hit_from_buf(const char* orig_bwt_buf,
1287 BowtieHit& bh, bool strip_slash,
1288 char* name_out, char* name_tags,
1289 char* seq, char* qual) {
1290 if (_sam_header==NULL)
1291 err_die("Error: no SAM header when BAMHitFactory::get_hit_from_buf()!");
1292 const bam1_t* hit_buf = (const bam1_t*)orig_bwt_buf;
1293
1294
1295 uint32_t sam_flag = hit_buf->core.flag;
1296
1297 int text_offset = hit_buf->core.pos;
1298 int text_mate_pos = hit_buf->core.mpos;
1299 int target_id = hit_buf->core.tid;
1300 int mate_target_id = hit_buf->core.mtid;
1301
1302 vector<CigarOp> cigar;
1303 bool spliced_alignment = false;
1304 int num_hits = 1;
1305
1306 double mapQ = hit_buf->core.qual;
1307 long double error_prob;
1308 if (mapQ > 0)
1309 {
1310 long double p = (-1.0 * mapQ) / 10.0;
1311 error_prob = pow(10.0L, p);
1312 }
1313 else
1314 {
1315 error_prob = 1.0;
1316 }
1317
1318 //header->target_name[c->tid]
1319
1320 bool end = true;
1321 unsigned int seg_offset = 0;
1322 unsigned int seg_num = 0;
1323 unsigned int num_segs = 0;
1324 // Copy the tag out of the name field before we might wipe it out
1325 char* qname = bam1_qname(hit_buf);
1326 char* pipe = strrchr(qname, '|');
1327 if (pipe)
1328 {
1329 if (name_tags)
1330 strcpy(name_tags, pipe);
1331
1332 char* tag_buf = pipe + 1;
1333 if (strchr(tag_buf, ':'))
1334 {
1335 sscanf(tag_buf, "%u:%u:%u", &seg_offset, &seg_num, &num_segs);
1336 if (seg_num + 1 == num_segs)
1337 end = true;
1338 else
1339 end = false;
1340 }
1341
1342 *pipe = 0;
1343 }
1344
1345
1346 if (target_id < 0) {
1347 //assert(cigar.size() == 1 && cigar[0].opcode == MATCH);
1348 bh = create_hit(qname,
1349 "*", //ref_name
1350 0, //left coord
1351 0, //read_len
1352 false, //antisense_aln
1353 0, //edit_dist
1354 end);
1355 return true;
1356 }
1357
1358 //string text_name = ((samfile_t*)(hs._hit_file))->header->target_name[target_id];
1359 string text_name = _sam_header->target_name[target_id];
1360 for (int i = 0; i < hit_buf->core.n_cigar; ++i)
1361 {
1362 //char* t;
1363
1364 int length = bam1_cigar(hit_buf)[i] >> BAM_CIGAR_SHIFT;
1365 if (length <= 0)
1366 {
1367 fprintf (stderr, "BAM error: CIGAR op has zero length\n");
1368 return false;
1369 }
1370
1371 CigarOpCode opcode;
1372 switch(bam1_cigar(hit_buf)[i] & BAM_CIGAR_MASK)
1373 {
1374 case BAM_CMATCH: opcode = MATCH; break;
1375 case BAM_CINS: opcode = INS; break;
1376 case BAM_CDEL: opcode = DEL; break;
1377 case BAM_CSOFT_CLIP: opcode = SOFT_CLIP; break;
1378 case BAM_CHARD_CLIP: opcode = HARD_CLIP; break;
1379 case BAM_CPAD: opcode = PAD; break;
1380 case BAM_CREF_SKIP:
1381 opcode = REF_SKIP;
1382 spliced_alignment = true;
1383 if (length > (int)max_report_intron_length)
1384 {
1385 //fprintf(stderr, "Encounter REF_SKIP > max_gene_length, skipping\n");
1386 return false;
1387 }
1388 break;
1389 default:
1390 fprintf (stderr, "BAM read: invalid CIGAR operation\n");
1391 return false;
1392 }
1393 if (opcode != HARD_CLIP)
1394 cigar.push_back(CigarOp(opcode, length));
1395 }
1396
1397 string mrnm;
1398 if (mate_target_id >= 0) {
1399 if (mate_target_id == target_id) {
1400 //mrnm = ((samfile_t*)(hs._hit_file))->header->target_name[mate_target_id];
1401 mrnm = _sam_header->target_name[mate_target_id];
1402 }
1403 else {
1404 //fprintf(stderr, "Trans-spliced mates are not currently supported, skipping\n");
1405 return false;
1406 }
1407 }
1408 else {
1409 text_mate_pos = 0;
1410 }
1411 //CuffStrand source_strand = CUFF_STRAND_UNKNOWN;
1412 bool antisense_splice=false;
1413 unsigned char num_mismatches = 0;
1414 unsigned char num_splice_anchor_mismatches = 0;
1415
1416 uint8_t* ptr = bam_aux_get(hit_buf, "XS");
1417 if (ptr) {
1418 char src_strand_char = bam_aux2A(ptr);
1419 if (src_strand_char == '-')
1420 antisense_splice = true;
1421 // else if (src_strand_char == '+')
1422 // source_strand = CUFF_FWD;
1423 }
1424
1425 ptr = bam_aux_get(hit_buf, "NM");
1426 if (ptr) {
1427 num_mismatches = bam_aux2i(ptr);
1428 }
1429
1430 ptr = bam_aux_get(hit_buf, "NH");
1431 if (ptr) {
1432 num_hits = bam_aux2i(ptr);
1433 }
1434
1435 //bool antisense_aln = bam1_strand(hit_buf);
1436
1437 //if (_rg_props.strandedness() == STRANDED_PROTOCOL && source_strand == CUFF_STRAND_UNKNOWN)
1438 // source_strand = use_stranded_protocol(sam_flag, antisense_aln, _rg_props.mate_strand_mapping());
1439 if (spliced_alignment) {
1440 //if (source_strand == CUFF_STRAND_UNKNOWN) {
1441 // fprintf(stderr, "BAM record error: found spliced alignment without XS attribute\n");
1442 // }
1443 bh = create_hit(qname,
1444 text_name,
1445 text_offset, // BAM files are 0-indexed
1446 cigar,
1447 sam_flag & 0x0010,
1448 antisense_splice,
1449 num_mismatches,
1450 num_splice_anchor_mismatches,
1451 end);
1452
1453 }
1454 else {
1455 //assert(_rg_props.strandedness() == STRANDED_PROTOCOL || source_strand == CUFF_STRAND_UNKNOWN);
1456 //assert(cigar.size() == 1 && cigar[0].opcode == MATCH);
1457 bh = create_hit(qname,
1458 text_name,
1459 text_offset, // BAM files are 0-indexed
1460 cigar,
1461 sam_flag & 0x0010,
1462 false,
1463 num_mismatches,
1464 0,
1465 end);
1466 }
1467 if (seq!=NULL) {
1468 char *bseq = (char*)bam1_seq(hit_buf);
1469 for(int i=0;i<(hit_buf->core.l_qseq);i++) {
1470 char v = bam1_seqi(bseq,i);
1471 seq[i]=bam_nt16_rev_table[v];
1472 }
1473 seq[hit_buf->core.l_qseq]=0;
1474 }
1475 if (qual!=NULL) {
1476 char *bq = (char*)bam1_qual(hit_buf);
1477 for(int i=0;i<(hit_buf->core.l_qseq);i++) {
1478 qual[i]=bq[i]+33;
1479 }
1480 qual[hit_buf->core.l_qseq]=0;
1481 }
1482 return true;
1483 }
1484
1485 bool BAMHitFactory::inspect_header(HitStream& hs)
1486 {
1487 bam_header_t* header = ((samfile_t*)(hs._hit_file))->header;
1488
1489 if (header == NULL) {
1490 fprintf(stderr, "Warning: No BAM header\n");
1491 return false;
1492 }
1493 if (header->l_text == 0) {
1494 fprintf(stderr, "Warning: BAM header has 0 length or is corrupted. Try using 'samtools reheader'.\n");
1495 return false;
1496 }
1497 return true;
1498 }
1499
1500
1501 void get_mapped_reads(FILE* bwtf,
1502 HitTable& hits,
1503 HitFactory& hit_factory,
1504 bool strip_slash,
1505 bool verbose)
1506 {
1507
1508
1509 char bwt_buf[2048];
1510 uint32_t reads_extracted = 0;
1511
1512 while (fgets(bwt_buf, 2048, bwtf))
1513 {
1514 // Chomp the newline
1515 char* nl = strrchr(bwt_buf, '\n');
1516 if (nl) *nl = 0;
1517 if (*bwt_buf == 0)
1518 continue;
1519 // Get a new record from the tab-delimited Bowtie map
1520 BowtieHit bh;
1521 if (hit_factory.get_hit_from_buf(bwt_buf, bh, strip_slash))
1522 {
1523 // Only check uniqueness if these hits are spliced
1524 hits.add_hit(bh, true);
1525 }
1526 reads_extracted++;
1527 }
1528
1529 // This will sort the map by insert id.
1530 hits.finalize();
1531 fprintf(stderr, "Extracted %d alignments from Bowtie map\n", reads_extracted);
1532 }
1533
1534
1535 /*
1536 AlignStatus status(const BowtieHit* align)
1537 {
1538 if (!align)
1539 return UNALIGNED;
1540 if (align->contiguous())
1541 return CONTIGUOUS;
1542 return SPLICED;
1543 }
1544 */
1545
1546 void add_hits_to_coverage(const HitList& hits, vector<unsigned short>& DoC)
1547 {
1548 int max_hit_pos = -1;
1549 for (size_t i = 0; i < hits.size(); ++i)
1550 {
1551 max_hit_pos = max((int)hits[i].right(),max_hit_pos);
1552 }
1553
1554 if ((int)DoC.size() < max_hit_pos)
1555 DoC.resize(max_hit_pos);
1556
1557 for (size_t i = 0; i < hits.size(); ++i)
1558 {
1559 const BowtieHit& bh = hits[i];
1560
1561 // split up the coverage contibution for this reads
1562 size_t j = bh.left();
1563 const vector<CigarOp>& cigar = bh.cigar();
1564
1565 for (size_t c = 0 ; c < cigar.size(); ++c)
1566 {
1567 switch(cigar[c].opcode)
1568 {
1569 case MATCH:
1570 for (size_t m = 0; m < cigar[c].length; ++m)
1571 {
1572 if (DoC[j + m] < 0xFFFF)
1573 DoC[j + m]++;
1574 }
1575 //fall through this case to REF_SKIP is intentional
1576 case REF_SKIP:
1577 j += cigar[c].length;
1578 break;
1579 default:
1580 break;
1581 }
1582
1583 }
1584 }
1585 }
1586
1587 void add_hit_to_coverage(const BowtieHit& bh, vector<unsigned int>& DoC)
1588 {
1589 if ((int)DoC.size() < bh.right())
1590 DoC.resize(bh.right());
1591
1592 // split up the coverage contibution for this reads
1593 size_t j = bh.left();
1594 const vector<CigarOp>& cigar = bh.cigar();
1595
1596 for (size_t c = 0 ; c < cigar.size(); ++c)
1597 {
1598 switch(cigar[c].opcode)
1599 {
1600 case MATCH:
1601 for (size_t m = 0; m < cigar[c].length; ++m)
1602 {
1603 if (DoC[j + m] < 0xFFFFFFFF)
1604 DoC[j + m]++;
1605 }
1606 //fall through this case to REF_SKIP is intentional
1607 case REF_SKIP:
1608 j += cigar[c].length;
1609 break;
1610 default:
1611 break;
1612 }
1613
1614 }
1615 }
1616
1617 void print_hit(FILE* fout,
1618 const char* read_name,
1619 const BowtieHit& bh,
1620 const char* ref_name,
1621 const char* sequence,
1622 const char* qualities,
1623 bool from_bowtie)
1624 {
1625 string seq;
1626 string quals;
1627 if (sequence)
1628 {
1629 seq = sequence;
1630 quals = qualities;
1631 seq.resize(bh.read_len());
1632 quals.resize(bh.read_len());
1633 }
1634 else
1635 {
1636 seq = "*";
1637 }
1638
1639 if (qualities)
1640 {
1641 quals = qualities;
1642 quals.resize(bh.read_len());
1643 }
1644 else
1645 {
1646 quals = "*";
1647 }
1648
1649 uint32_t sam_flag = 0;
1650 if (bh.antisense_align())
1651 {
1652 sam_flag |= 0x0010; // BAM_FREVERSE
1653 if (sequence && !from_bowtie) // if it is from bowtie hit, it's already reversed.
1654 {
1655 reverse_complement(seq);
1656 reverse(quals.begin(), quals.end());
1657 }
1658 }
1659
1660 uint32_t sam_pos = bh.left() + 1;
1661 uint32_t map_quality = 255;
1662 char cigar[256];
1663 cigar[0] = 0;
1664 string mate_ref_name = "*";
1665 uint32_t mate_pos = 0;
1666 uint32_t insert_size = 0;
1667 //string qualities = "*";
1668
1669 const vector<CigarOp>& bh_cigar = bh.cigar();
1670
1671 /*
1672 * In addition to calculating the cigar string,
1673 * we need to figure out how many in/dels are in the
1674 * sequence, so that we can give the correct
1675 * value for the NM tag
1676 */
1677 int indel_distance = 0;
1678 for (size_t c = 0; c < bh_cigar.size(); ++c)
1679 {
1680 char ibuf[64];
1681 sprintf(ibuf, "%d", bh_cigar[c].length);
1682 switch(bh_cigar[c].opcode)
1683 {
1684 case MATCH:
1685 strcat(cigar, ibuf);
1686 strcat(cigar, "M");
1687 break;
1688 case INS:
1689 strcat(cigar, ibuf);
1690 strcat(cigar, "I");
1691 indel_distance += bh_cigar[c].length;
1692 break;
1693 case DEL:
1694 strcat(cigar, ibuf);
1695 strcat(cigar, "D");
1696 indel_distance += bh_cigar[c].length;
1697 break;
1698 case REF_SKIP:
1699 strcat(cigar, ibuf);
1700 strcat(cigar, "N");
1701 break;
1702 default:
1703 break;
1704 }
1705 }
1706
1707 //string q = string(bh.read_len(), '!');
1708 //string s = string(bh.read_len(), 'N');
1709
1710 fprintf(fout,
1711 "%s\t%d\t%s\t%d\t%d\t%s\t%s\t%d\t%d\t%s\t%s",
1712 read_name,
1713 sam_flag,
1714 ref_name,
1715 sam_pos,
1716 map_quality,
1717 cigar,
1718 mate_ref_name.c_str(),
1719 mate_pos,
1720 insert_size,
1721 seq.c_str(),
1722 quals.c_str());
1723
1724 if (!sam_readgroup_id.empty())
1725 fprintf(fout, "\tRG:Z:%s", sam_readgroup_id.c_str());
1726
1727 fprintf(fout, "\tNM:i:%d", bh.edit_dist() + indel_distance);
1728
1729 bool containsSplice = false;
1730 for(vector<CigarOp>::const_iterator itr = bh.cigar().begin(); itr != bh.cigar().end(); ++itr){
1731 if(itr->opcode == REF_SKIP){
1732 containsSplice = true;
1733 break;
1734 }
1735 }
1736
1737 if (containsSplice)
1738 fprintf(fout, "\tXS:A:%c", bh.antisense_splice() ? '-' : '+');
1739
1740 fprintf(fout, "\n");
1741 }
1742
1743 void print_bamhit(GBamWriter& wbam,
1744 const char* read_name,
1745 const BowtieHit& bh,
1746 const char* ref_name,
1747 const char* sequence,
1748 const char* qualities,
1749 bool from_bowtie)
1750 {
1751 string seq;
1752 string quals;
1753 if (sequence) {
1754 seq = sequence;
1755 quals = qualities;
1756 seq.resize(bh.read_len());
1757 quals.resize(bh.read_len());
1758 }
1759 else {
1760 seq = "*";
1761 }
1762 if (qualities) {
1763 quals = qualities;
1764 quals.resize(bh.read_len());
1765 }
1766 else
1767 {
1768 quals = "*";
1769 }
1770
1771 uint32_t sam_flag = 0;
1772 if (bh.antisense_align())
1773 {
1774 sam_flag |= 0x0010; // BAM_FREVERSE
1775 if (sequence && !from_bowtie) // if it is from bowtie hit, it's already reversed.
1776 {
1777 reverse_complement(seq);
1778 reverse(quals.begin(), quals.end());
1779 }
1780 }
1781
1782 uint32_t sam_pos = bh.left() + 1;
1783 uint32_t map_quality = 255;
1784 char cigar[256];
1785 cigar[0] = 0;
1786 string mate_ref_name = "*";
1787 uint32_t mate_pos = 0;
1788 uint32_t insert_size = 0;
1789 //string qualities = "*";
1790
1791 const vector<CigarOp>& bh_cigar = bh.cigar();
1792 /*
1793 * In addition to calculating the cigar string,
1794 * we need to figure out how many in/dels are in the
1795 * sequence, so that we can give the correct
1796 * value for the NM tag
1797 */
1798 int indel_distance = 0;
1799 for (size_t c = 0; c < bh_cigar.size(); ++c)
1800 {
1801 char ibuf[64];
1802 sprintf(ibuf, "%d", bh_cigar[c].length);
1803 switch(bh_cigar[c].opcode)
1804 {
1805 case MATCH:
1806 strcat(cigar, ibuf);
1807 strcat(cigar, "M");
1808 break;
1809 case INS:
1810 strcat(cigar, ibuf);
1811 strcat(cigar, "I");
1812 indel_distance += bh_cigar[c].length;
1813 break;
1814 case DEL:
1815 strcat(cigar, ibuf);
1816 strcat(cigar, "D");
1817 indel_distance += bh_cigar[c].length;
1818 break;
1819 case REF_SKIP:
1820 strcat(cigar, ibuf);
1821 strcat(cigar, "N");
1822 break;
1823 default:
1824 break;
1825 }
1826 }
1827
1828 //string q = string(bh.read_len(), '!');
1829 //string s = string(bh.read_len(), 'N');
1830 /*
1831 fprintf(fout,
1832 "%s\t%d\t%s\t%d\t%d\t%s\t%s\t%d\t%d\t%s\t%s",
1833 read_name,
1834 sam_flag,
1835 ref_name,
1836 sam_pos,
1837 map_quality,
1838 cigar,
1839 mate_ref_name.c_str(),
1840 mate_pos,
1841 insert_size,
1842 seq.c_str(),
1843 quals.c_str());
1844
1845 fprintf(fout, "\tNM:i:%d", bh.edit_dist() + indel_distance);
1846 */
1847 vector<string> auxdata;
1848
1849 if (!sam_readgroup_id.empty())
1850 {
1851 string nm("RG:Z:");
1852 nm += sam_readgroup_id;
1853 auxdata.push_back(nm);
1854 }
1855
1856 string nm("NM:i:");
1857 str_appendInt(nm, bh.edit_dist() + indel_distance);
1858 auxdata.push_back(nm);
1859 bool containsSplice = false;
1860 for(vector<CigarOp>::const_iterator itr = bh.cigar().begin(); itr != bh.cigar().end(); ++itr)
1861 if(itr->opcode == REF_SKIP){
1862 containsSplice = true;
1863 break;
1864 }
1865
1866 if (containsSplice) {
1867 //fprintf(fout, "\tXS:A:%c", bh.antisense_splice() ? '-' : '+');
1868 nm="XS:A:";
1869 nm+=(char)(bh.antisense_splice() ? '-' : '+');
1870 auxdata.push_back(nm);
1871 }
1872
1873 GBamRecord *brec = wbam.new_record(read_name, sam_flag, ref_name, sam_pos, map_quality,
1874 cigar, mate_ref_name.c_str(), mate_pos,
1875 insert_size, seq.c_str(), quals.c_str(), &auxdata);
1876
1877
1878
1879 wbam.write(brec);
1880 delete brec;
1881 }
1882
1883 /**
1884 * Print a vector of cigar operations to a file.
1885 * @param bh_cigar A vector of CigarOps
1886 * @return a string representation of the cigar string
1887 */
1888 std::string print_cigar(vector<CigarOp>& bh_cigar){
1889 char cigar[256];
1890 cigar[0] = 0;
1891 for (size_t c = 0; c < bh_cigar.size(); ++c)
1892 {
1893 char ibuf[64];
1894 sprintf(ibuf, "%d", bh_cigar[c].length);
1895 switch(bh_cigar[c].opcode)
1896 {
1897 case MATCH:
1898 strcat(cigar, ibuf);
1899 strcat(cigar, "M");
1900 break;
1901 case INS:
1902 strcat(cigar, ibuf);
1903 strcat(cigar, "I");
1904 break;
1905 case DEL:
1906 strcat(cigar, ibuf);
1907 strcat(cigar, "D");
1908 break;
1909 case REF_SKIP:
1910 strcat(cigar, ibuf);
1911 strcat(cigar, "N");
1912 break;
1913 default:
1914 break;
1915 }
1916 }
1917 string result(cigar);
1918 return result;
1919 }
1920
1921 bool BowtieHit::check_editdist_consistency(const RefSequenceTable& rt)
1922 {
1923 RefSequenceTable::Sequence* ref_str = rt.get_seq(_ref_id);
1924 if (!ref_str)
1925 return false;
1926
1927 const seqan::Dna5String ref_seq = seqan::infix(*ref_str, _left, right());
1928
1929 size_t pos_seq = 0;
1930 size_t pos_ref = 0;
1931 size_t mismatch = 0;
1932 for (size_t i = 0; i < _cigar.size(); ++i)
1933 {
1934 CigarOp cigar = _cigar[i];
1935 switch(cigar.opcode)
1936 {
1937 case MATCH:
1938 {
1939 for (size_t j = 0; j < cigar.length; ++j)
1940 {
1941 if (_seq[pos_seq++] != ref_seq[pos_ref++])
1942 ++mismatch;
1943 }
1944 }
1945 break;
1946 case INS:
1947 {
1948 pos_seq += cigar.length;
1949 }
1950 break;
1951
1952 case DEL:
1953 case REF_SKIP:
1954 {
1955 pos_ref += cigar.length;
1956 }
1957 break;
1958
1959 default:
1960 break;
1961 }
1962 }
1963
1964 return mismatch == _edit_dist;
1965 }