ViewVC Help
View File | Revision Log | Show Annotations | View Changeset | Root Listing
root/gclib/tophat_cpp/bwt_map.cpp
Revision: 135
Committed: Mon Dec 12 22:28:38 2011 UTC (8 years, 5 months ago) by gpertea
File size: 52959 byte(s)
Log Message:
wip - SplicedSAMHitFactory() still not implemented

Line File contents
1 /*
2 * bwt_map.cpp
3 * TopHat
4 *
5 * Created by Cole Trapnell on 11/17/08.
6 * Copyright 2008 Cole Trapnell. All rights reserved.
7 *
8 */
9
10 #ifdef HAVE_CONFIG_H
11 #include <config.h>
12 #endif
13
14 #include <cassert>
15 #include <cstdlib>
16 #include <iostream>
17 #include <set>
18 #include <vector>
19 #include <cmath>
20
21 #include "common.h"
22 #include "bwt_map.h"
23 #include "tokenize.h"
24 #include "reads.h"
25
26 using namespace std;
27
28 void HitTable::add_hit(const BowtieHit& bh, bool check_uniqueness)
29 {
30 uint32_t reference_id = bh.ref_id();
31
32 pair<RefHits::iterator, bool> ret =
33 _hits_for_ref.insert(make_pair(reference_id, HitList()));
34 HitList& hl = ret.first->second;
35
36 if (check_uniqueness)
37 {
38 // Check uniqueness, in case we are adding spliced hits from
39 // several spliced alignment sources (e.g. de novo hashing + Bowtie
40 // against a user-supplied index). We don't want to count the same
41 // alignment twice if it happened to be found by more than one method
42 HitList::const_iterator lb = lower_bound(hl.begin(), hl.end(), bh, hit_insert_id_lt);
43 HitList::const_iterator ub = upper_bound(hl.begin(), hl.end(), bh, hit_insert_id_lt);
44 for (; lb != ub && lb != hl.end(); ++lb)
45 {
46 if (*lb == bh)
47 {
48 //fprintf(stderr, "Chucking duplicate read %d by identity\n", bh.insert_id());
49 return;
50 }
51
52 if (lb->insert_id() == bh.insert_id() &&
53 lb->ref_id() == bh.ref_id() &&
54 lb->antisense_align() == bh.antisense_align())
55 {
56 // If we get here, we may be looking at the same alignment
57 // However, spanning_reads may report a shorter, trimmed alignment
58 // so not all fields will be equal. If they just disagree on the
59 // ends, and don't indicate a different junction coord, the
60 // alignments are the same.
61
62 if ((lb->left() <= bh.left() && lb->right() >= bh.right()) ||
63 (bh.left() <= lb->left() && bh.right() >= lb->right()))
64 {
65 vector<pair<int,int> > lb_gaps, bh_gaps;
66 lb->gaps(lb_gaps);
67 bh.gaps(bh_gaps);
68 if (lb_gaps == bh_gaps)
69 {
70 // One alignment is contained in the other, they agree on
71 // where the gaps, if any, are, and they share an id
72 // => this is a redundant aligment, so toss it
73 //fprintf(stderr, "Chucking duplicate read %d by gap agreement\n", bh.insert_id());
74 return;
75 }
76 }
77 }
78 }
79 }
80 _total_hits++;
81 hl.push_back(bh);
82 }
83
84 bool hit_insert_id_lt(const BowtieHit& h1, const BowtieHit& h2)
85 {
86 return h1.insert_id() < h2.insert_id();
87 }
88
89 void LineHitFactory::openStream(HitStream& hs)
90 {
91 if (hs._hit_file==NULL && !hs._hit_file_name.empty()) {
92 //open the file for HitStream here
93 hs._hit_file=fopen(hs._hit_file_name.c_str(),"r");
94 if (hs._hit_file==NULL)
95 err_die("Error opening HitStream file %s\n",hs._hit_file_name.c_str());
96 return;
97 }
98 if (hs._fzpipe!=NULL) {
99 hs._hit_file=hs._fzpipe->file;
100 }
101 }
102
103 void LineHitFactory::rewind(HitStream& hs)
104 {
105 if (hs._fzpipe!=NULL) {
106 hs._fzpipe->rewind();
107 hs._hit_file=hs._fzpipe->file;
108 }
109 else if (hs._hit_file) ::rewind((FILE*)(hs._hit_file));
110 }
111
112 bool LineHitFactory::next_record(HitStream& hs, const char*& buf, size_t& buf_size) {
113 FILE* f=(FILE *)(hs._hit_file);
114 bool new_rec = (fgets(_hit_buf, _hit_buf_max_sz - 1, f)!=NULL);
115 if (!new_rec || feof(f)) {
116 hs._eof=true;
117 return false;
118 }
119 ++_line_num;
120 char* nl = strrchr(_hit_buf, '\n');
121 if (nl) *nl = 0;
122 buf = _hit_buf;
123 buf_size = _hit_buf_max_sz - 1;
124 return true;
125 }
126
127 void LineHitFactory::closeStream(HitStream& hs) {
128 if (hs._fzpipe!=NULL) {
129 hs._fzpipe->close();
130 return;
131 }
132 if (hs._hit_file!=NULL) {
133 fclose((FILE*)(hs._hit_file));
134 hs._hit_file=NULL;
135 }
136 }
137 void BAMHitFactory::openStream(HitStream& hs) {
138 if (hs._hit_file==NULL) {
139 if (hs._hit_file_name.empty())
140 //err_die("Error: invalid HitStream set for BAMHitFactory(file name missing)\n");
141 return; //invalid stream, could be just a place holder
142 //open the file here if not already open
143 string fext=getFext(hs._hit_file_name);
144 if (fext=="sam")
145 hs._hit_file = samopen(hs._hit_file_name.c_str(), "r", 0);
146 else
147 hs._hit_file = samopen(hs._hit_file_name.c_str(), "rb", 0);
148
149 samfile_t* sam_file=(samfile_t*)(hs._hit_file);
150
151 if (sam_file == NULL)
152 err_die("Error opening SAM file %s\n", hs._hit_file_name.c_str());
153 if (sam_file->header == NULL)
154 err_die("Error: no SAM header found for file %s\n", hs._hit_file_name.c_str());
155 memset(&_next_hit, 0, sizeof(_next_hit));
156 //_beginning = bgzf_tell(sam_file->x.bam);
157 _sam_header=sam_file->header;
158 if (inspect_header(hs) == false)
159 err_die("Error: invalid SAM header for file %s\n",
160 hs._hit_file_name.c_str());
161 }
162 }
163
164 void BAMHitFactory::closeStream(HitStream& hs) {
165 if (hs._hit_file) {
166 samclose((samfile_t*)(hs._hit_file));
167 }
168 hs._hit_file=NULL;
169 _sam_header=NULL;
170 }
171
172 void BAMHitFactory::rewind(HitStream& hs)
173 {
174 /*
175 if (_hit_file && ((samfile_t*)_hit_file)->x.bam)
176 {
177 bgzf_seek(((samfile_t*)_hit_file)->x.bam, _beginning, SEEK_SET);
178 _eof = false;
179 }
180 */
181 this->closeStream(hs);
182 this->openStream(hs);
183 }
184
185 string BAMHitFactory::hitfile_rec(HitStream& hs, const char* hit_buf) {
186 const bam1_t* bamrec=(const bam1_t*)hit_buf;
187 char* tamline=bam_format1(((samfile_t*)(hs._hit_file))->header, bamrec);
188 string sam_line(tamline);
189 free(tamline);
190 return sam_line;
191 }
192
193 bool BAMHitFactory::next_record(HitStream& hs, const char*& buf, size_t& buf_size) {
194 if (_next_hit.data) {
195 free(_next_hit.data);
196 _next_hit.data = NULL;
197 }
198 _sam_header=((samfile_t*)(hs._hit_file))->header; //needed by get_hit_from_buf later on
199 if (hs.eof() || !hs.ready()) return false;
200
201 //mark_curr_pos();
202
203 memset(&_next_hit, 0, sizeof(_next_hit));
204
205 int bytes_read = samread((samfile_t*)(hs._hit_file), &_next_hit);
206 if (bytes_read <= 0) {
207 hs._eof = true;
208 return false;
209 }
210 buf = (const char*)&_next_hit;
211 buf_size = bytes_read;
212 return true;
213 }
214
215
216 BowtieHit HitFactory::create_hit(const string& insert_name,
217 const string& ref_name,
218 int left,
219 const vector<CigarOp>& cigar,
220 bool antisense_aln,
221 bool antisense_splice,
222 unsigned char edit_dist,
223 unsigned char splice_mms,
224 bool end)
225 {
226 uint64_t insert_id = _insert_table.get_id(insert_name);
227 uint32_t reference_id = _ref_table.get_id(ref_name, NULL, 0);
228
229 return BowtieHit(reference_id,
230 insert_id,
231 left,
232 cigar,
233 antisense_aln,
234 antisense_splice,
235 edit_dist,
236 splice_mms,
237 end);
238 }
239
240 BowtieHit HitFactory::create_hit(const string& insert_name,
241 const string& ref_name,
242 uint32_t left,
243 uint32_t read_len,
244 bool antisense_aln,
245 unsigned char edit_dist,
246 bool end)
247 {
248 uint64_t insert_id = _insert_table.get_id(insert_name);
249 uint32_t reference_id = _ref_table.get_id(ref_name, NULL, 0);
250
251 return BowtieHit(reference_id,
252 insert_id,
253 left,
254 read_len,
255 antisense_aln,
256 edit_dist,
257 end);
258 }
259
260 bool BowtieHitFactory::get_hit_from_buf(const char* orig_bwt_buf,
261 BowtieHit& bh,
262 bool strip_slash,
263 char* name_out,
264 char* name_tags,
265 char* seq,
266 char* qual)
267 {
268 if (!orig_bwt_buf || !*orig_bwt_buf)
269 return false;
270
271 static const int buf_size = 2048;
272
273 char bwt_buf[buf_size];
274 strcpy(bwt_buf, orig_bwt_buf);
275 //const char* bwt_fmt_str = "%s %c %s %d %s %s %d %s";
276
277 char orientation;
278
279 //int bwtf_ret = 0;
280 //uint32_t seqid = 0;
281
282 char text_name[buf_size];
283 unsigned int text_offset;
284
285
286 //unsigned int other_occs;
287 char mismatches[buf_size];
288 //memset(mismatches, 0, sizeof(mismatches));
289
290 const char* buf = bwt_buf;
291 char* name = get_token((char**)&buf,"\t");
292 char* orientation_str = get_token((char**)&buf,"\t");
293 char* text_name_str = get_token((char**)&buf,"\t");
294
295 strcpy(text_name, text_name_str);
296
297 char* text_offset_str = get_token((char**)&buf,"\t");
298 char* seq_str = get_token((char**)&buf,"\t");
299 if (seq)
300 strcpy(seq, seq_str);
301
302 const char* qual_str = get_token((char**)&buf,"\t");
303 if (qual)
304 strcpy(qual, qual_str);
305
306 /*const char* other_occs_str =*/ get_token((char**)&buf, "\t");
307 mismatches[0] = 0;
308 char* mismatches_str = get_token((char**)&buf, "\t");
309 if (mismatches_str)
310 strcpy(mismatches, mismatches_str);
311
312 orientation = orientation_str[0];
313 text_offset = atoi(text_offset_str);
314
315 bool end = true;
316 unsigned int seg_offset = 0;
317 unsigned int seg_num = 0;
318 unsigned int num_segs = 0;
319
320 // Copy the tag out of the name field before we might wipe it out
321 char* pipe = strrchr(name, '|');
322 if (pipe)
323 {
324 if (name_tags)
325 strcpy(name_tags, pipe);
326
327 char* tag_buf = pipe + 1;
328 if (strchr(tag_buf, ':'))
329 {
330 sscanf(tag_buf, "%u:%u:%u", &seg_offset, &seg_num, &num_segs);
331 if (seg_num + 1 == num_segs)
332 end = true;
333 else
334 end = false;
335 }
336
337 *pipe = 0;
338 }
339 // Stripping the slash and number following it gives the insert name
340 char* slash = strrchr(name, '/');
341 if (strip_slash && slash)
342 *slash = 0;
343
344 size_t read_len = strlen(seq_str);
345
346 // Add this alignment to the table of hits for this half of the
347 // Bowtie map
348
349 //vector<string> mismatch_toks;
350 char* pch = strtok (mismatches,",");
351 unsigned char num_mismatches = 0;
352 while (pch != NULL)
353 {
354 char* colon = strchr(pch, ':');
355 if (colon)
356 {
357 num_mismatches++;
358 }
359 //mismatch_toks.push_back(pch);
360 pch = strtok (NULL, ",");
361 }
362
363 bh = create_hit(name,
364 text_name,
365 text_offset,
366 (int)read_len,
367 orientation == '-',
368 num_mismatches,
369 end);
370
371 return true;
372 }
373
374 int anchor_mismatch = 0;
375
376
377 void parseSegReadName(char* name, char*& name_tags, bool strip_slash,
378 bool &end, unsigned int &seg_offset, unsigned int& seg_num,
379 unsigned int & num_segs) {
380 char* pipe = strrchr(name, '|');
381 if (pipe)
382 {
383 if (name_tags)
384 strcpy(name_tags, pipe);
385
386 char* tag_buf = pipe + 1;
387 if (strchr(tag_buf, ':'))
388 {
389 sscanf(tag_buf, "%u:%u:%u", &seg_offset, &seg_num, &num_segs);
390 if (seg_num + 1 == num_segs)
391 end = true;
392 else
393 end = false;
394 }
395
396 *pipe = 0;
397 }
398
399 // Stripping the slash and number following it gives the insert name
400 char* slash = strrchr(name, '/');
401 if (strip_slash && slash)
402 *slash = 0;
403 }
404
405
406 bool SplicedBowtieHitFactory::get_hit_from_buf(const char* orig_bwt_buf,
407 BowtieHit& bh,
408 bool strip_slash,
409 char* name_out,
410 char* name_tags,
411 char* seq,
412 char* qual)
413 {
414 if (!orig_bwt_buf || !*orig_bwt_buf)
415 return false;
416
417 static const int buf_size = 2048;
418
419 char bwt_buf[buf_size];
420 strcpy(bwt_buf, orig_bwt_buf);
421 //const char* bwt_fmt_str = "%s %c %s %d %s %s %d %s";
422
423 char orientation;
424 char text_name[buf_size];
425 unsigned int text_offset;
426 char mismatches[buf_size];
427 //memset(mismatches, 0, sizeof(mismatches));
428
429 char* buf = bwt_buf;
430 char* name = get_token((char**)&buf,"\t");
431 char* orientation_str = get_token((char**)&buf,"\t");
432 char* text_name_str = get_token((char**)&buf,"\t");
433 strcpy(text_name, text_name_str);
434
435 char* text_offset_str = get_token((char**)&buf,"\t");
436 char* seq_str = get_token((char**)&buf,"\t");
437 if (seq)
438 strcpy(seq, seq_str);
439
440 const char* qual_str = get_token((char**)&buf,"\t");
441 if (qual)
442 strcpy(qual, qual_str);
443
444 /*const char* other_occs_str =*/ get_token((char**)&buf, "\t");
445 mismatches[0] = 0;
446 char* mismatches_str = get_token((char**)&buf, "\t");
447 if (mismatches_str)
448 strcpy(mismatches, mismatches_str);
449
450 orientation = orientation_str[0];
451 text_offset = atoi(text_offset_str);
452
453 bool end = true;
454 unsigned int seg_offset = 0;
455 unsigned int seg_num = 0;
456 unsigned int num_segs = 0;
457
458 // Copy the tag out of the name field before we might wipe it out
459 parseSegReadName(name, name_tags, strip_slash, end, seg_offset, seg_num, num_segs);
460
461 // Add this alignment to the table of hits for this half of the
462 // Bowtie map
463
464 // Parse the text_name field to recover the splice coords
465 vector<string> toks;
466
467 tokenize_strict(text_name, "|", toks);
468
469 int num_extra_toks = (int)toks.size() - 6;
470
471 if (num_extra_toks >= 0)
472 {
473 static const uint8_t left_window_edge_field = 1;
474 static const uint8_t splice_field = 2;
475 //static const uint8_t right_window_edge_field = 3;
476 static const uint8_t junction_type_field = 4;
477 static const uint8_t strand_field = 5;
478
479 string contig = toks[0];
480 for (int t = 1; t <= num_extra_toks; ++t)
481 {
482 contig += "|";
483 contig += toks[t];
484 }
485
486 vector<string> splice_toks;
487 tokenize(toks[num_extra_toks + splice_field], "-", splice_toks);
488 if (splice_toks.size() != 2)
489 {
490 fprintf(stderr, "Warning: found malformed splice record, skipping:\n");
491 //fprintf(stderr, "%s (token: %s)\n", text_name,
492 // toks[num_extra_toks + splice_field].c_str());
493 return false;
494 }
495
496 //
497 // check for an insertion hit
498 //
499 if(toks[num_extra_toks + junction_type_field] == "ins")
500 {
501 int8_t spliced_read_len = strlen(seq_str);
502 /*
503 * The 0-based position of the left edge of the alignment. Note that this
504 * value may need to be futher corrected to account for the presence of
505 * of the insertion.
506 */
507 uint32_t left = atoi(toks[num_extra_toks + left_window_edge_field].c_str()) + text_offset;
508 uint32_t right = left + spliced_read_len - 1;
509
510 /*
511 * The 0-based position of the last genomic sequence before the insertion
512 */
513 uint32_t left_splice_pos = atoi(splice_toks[0].c_str());
514
515 string insertedSequence = splice_toks[1];
516 /*
517 * The 0-based position of the first genomic sequence after teh insertion
518 */
519 uint32_t right_splice_pos = left_splice_pos + 1;
520 if(left > left_splice_pos){
521 /*
522 * The genomic position of the left edge of the alignment needs to be corrected
523 * If the alignment does not extend into the insertion, simply subtract the length
524 * of the inserted sequence, otherwise, just set it equal to the right edge
525 */
526 left = left - insertedSequence.length();
527 if(left < right_splice_pos){
528 left = right_splice_pos;
529 }
530 }
531 if(right > left_splice_pos){
532 right = right - insertedSequence.length();
533 if(right < left_splice_pos){
534 right = left_splice_pos;
535 }
536 }
537 /*
538 * Now, right and left should be properly transformed into genomic coordinates
539 * We should be able to deduce how much the alignment matches the insertion
540 * simply based on the length of the read
541 */
542 int left_match_length = 0;
543 if(left <= left_splice_pos){
544 left_match_length = left_splice_pos - left + 1;
545 }
546 int right_match_length = 0;
547 if(right >= right_splice_pos){
548 right_match_length = right - right_splice_pos + 1;
549 }
550 int insertion_match_length = spliced_read_len - left_match_length - right_match_length;
551
552 if(left_match_length <= 0 || right_match_length <= 0 || insertion_match_length <= 0)
553 return false;
554
555 string junction_strand = toks[num_extra_toks + strand_field];
556 if(junction_strand != "rev" && junction_strand != "fwd"){
557 fprintf(stderr, "Malformed insertion record\n");
558 return false;
559 }
560
561 char* pch = strtok( mismatches, ",");
562 unsigned char num_mismatches = 0;
563
564 /*
565 * remember that text_offset holds the left end of the
566 * alignment, relative to the start of the contig
567 */
568
569 /*
570 * The 0-based relative position of the left-most character
571 * before the insertion in the contig
572 */
573 int relative_splice_pos = left_splice_pos - atoi(toks[num_extra_toks + left_window_edge_field].c_str());
574 while (pch != NULL)
575 {
576 char* colon = strchr(pch, ':');
577 if (colon)
578 {
579 *colon = 0;
580 int mismatch_pos = atoi(pch);
581
582 /*
583 * for reversely mapped reads,
584 * find the correct mismatched position.
585 */
586 if(orientation == '-'){
587 mismatch_pos = spliced_read_len - mismatch_pos - 1;
588 }
589
590 /*
591 * Only count mismatches outside of the insertion region
592 * If there is a mismatch within the insertion,
593 * disallow this hit
594 */
595 if(mismatch_pos + text_offset <= relative_splice_pos || mismatch_pos + text_offset > relative_splice_pos + insertedSequence.length()){
596 num_mismatches++;
597 }else{
598 return false;
599 }
600 }
601 pch = strtok (NULL, ",");
602 }
603
604
605 vector<CigarOp> cigar;
606 cigar.push_back(CigarOp(MATCH, left_match_length));
607 cigar.push_back(CigarOp(INS, insertion_match_length));
608 cigar.push_back(CigarOp(MATCH, right_match_length));
609
610 /*
611 * For now, disallow hits that don't span
612 * the insertion. If we allow these types of hits,
613 * then long_spanning.cpp needs to be updated
614 * in order to intelligently merge these kinds
615 * of reads back together
616 *
617 * Following code has been changed to allow segment that end
618 * in an insertion
619 */
620 bh = create_hit(name,
621 contig,
622 left,
623 cigar,
624 orientation == '-',
625 junction_strand == "rev",
626 num_mismatches,
627 0,
628 end);
629 return true;
630 }
631
632 else
633 {
634 uint32_t left = atoi(toks[num_extra_toks + left_window_edge_field].c_str()) + text_offset;
635 int spliced_read_len = strlen(seq_str);
636 int8_t left_splice_pos = atoi(splice_toks[0].c_str()) - left + 1;
637 if(left_splice_pos > spliced_read_len) left_splice_pos = spliced_read_len;
638 int8_t right_splice_pos = spliced_read_len - left_splice_pos;
639
640 if (right_splice_pos <= 0 || left_splice_pos <= 0)
641 return false;
642
643 if (orientation == '+')
644 {
645 if (left_splice_pos + seg_offset < _anchor_length){
646 return false;
647 }
648 }
649 else
650 {
651 if (right_splice_pos + seg_offset < _anchor_length)
652 return false;
653 }
654 //uint32_t right = atoi(splice_toks[1].c_str()) + right_splice_pos;
655 //atoi(toks[right_window_edge_field].c_str());
656 int gap_len = atoi(splice_toks[1].c_str()) - atoi(splice_toks[0].c_str()) - 1;
657
658 string junction_strand = toks[num_extra_toks + strand_field];
659 if (!(junction_strand == "rev" || junction_strand == "fwd")||
660 !(orientation == '-' || orientation == '+'))
661 {
662 fprintf(stderr, "Warning: found malformed splice record, skipping\n");
663 //fprintf(stderr, "junction_strand=%s, orientation='%c'\n",
664 // junction_strand.c_str(), orientation);
665 return false;
666 }
667
668 //vector<string> mismatch_toks;
669 char* pch = strtok (mismatches,",");
670 int mismatches_in_anchor = 0;
671 unsigned char num_mismatches = 0;
672 while (pch != NULL)
673 {
674 char* colon = strchr(pch, ':');
675 if (colon)
676 {
677 *colon = 0;
678 num_mismatches++;
679 int mismatch_pos = atoi(pch);
680 if ((orientation == '+' && abs(mismatch_pos - left_splice_pos) < (int)min_anchor_len) ||
681 (orientation == '-' && abs(((int)spliced_read_len - left_splice_pos + 1) - mismatch_pos)) < (int)min_anchor_len)
682 mismatches_in_anchor++;
683 }
684 //mismatch_toks.push_back(pch);
685 pch = strtok (NULL, ",");
686 }
687
688 // FIXME: we probably should exclude these hits somewhere, but this
689 // isn't the right place
690 vector<CigarOp> cigar;
691 cigar.push_back(CigarOp(MATCH, left_splice_pos));
692 if(toks[num_extra_toks + junction_type_field] == "del"){
693 cigar.push_back(CigarOp(DEL, gap_len));
694 }else{
695 cigar.push_back(CigarOp(REF_SKIP, gap_len));
696 }
697 cigar.push_back(CigarOp(MATCH, right_splice_pos));
698
699 bh = create_hit(name,
700 contig,
701 left,
702 cigar,
703 orientation == '-',
704 junction_strand == "rev",
705 num_mismatches,
706 mismatches_in_anchor,
707 end);
708 return true;
709 }
710 }
711 else
712 {
713 fprintf(stderr, "Warning: found malformed splice record, skipping\n");
714 //fprintf(stderr, "%s\n", orig_bwt_buf);
715 // continue;
716 return false;
717 }
718
719 return false;
720 }
721
722
723 char parseCigar(vector<CigarOp>& cigar, const char* cigar_str,
724 bool &spliced_alignment) {
725 // Mostly pilfered direct from the SAM tools:
726 const char* p_cig = cigar_str;
727 while (*p_cig)
728 {
729 char* t;
730 int length = (int)strtol(p_cig, &t, 10);
731 if (length <= 0)
732 {
733 //fprintf (stderr, "CIGAR op has zero length\n");
734 return false;
735 }
736 char op_char = toupper(*t);
737 CigarOpCode opcode;
738 if (op_char == 'M') opcode = MATCH;
739 else if (op_char == 'I') opcode = INS;
740 else if (op_char == 'D') opcode = DEL;
741 else if (op_char == 'N')
742 {
743 if (length > max_report_intron_length)
744 return false;
745 opcode = REF_SKIP;
746 spliced_alignment = true;
747 }
748 else if (op_char == 'S') opcode = SOFT_CLIP;
749 else if (op_char == 'H') opcode = HARD_CLIP;
750 else if (op_char == 'P') opcode = PAD;
751 else
752 {
753 fprintf (stderr, "invalid CIGAR operation\n");
754 return false;
755 }
756 p_cig = t + 1;
757 //i += length;
758 cigar.push_back(CigarOp(opcode, length));
759 }
760 if (*p_cig) {
761 fprintf (stderr, "unmatched CIGAR operation\n");
762 return *p_cig;
763 }
764 return 0;
765 }
766
767 void getSAMmismatches(char* &buf, vector<CigarOp>& cigar,
768 int* mismatches, int& num_mismatches, int& sam_nm, bool& antisense_splice) {
769 const char* tag_buf = buf;
770 sam_nm=0;
771 num_mismatches=0;
772 while((tag_buf = get_token((char**)&buf,"\t")))
773 {
774 vector<string> tuple_fields;
775 tokenize(tag_buf,":", tuple_fields);
776 if (tuple_fields.size() == 3)
777 {
778 if (tuple_fields[0] == "XS")
779 {
780 if (tuple_fields[2] == "-")
781 antisense_splice = true;
782 }
783 else if (tuple_fields[0] == "NM")
784 {
785 sam_nm = atoi(tuple_fields[2].c_str());
786 }
787 else if (tuple_fields[0] == "NS")
788 {
789 //ignored for now
790 }
791 else if (tuple_fields[0] == "MD")
792 {
793 const char* p=tuple_fields[2].c_str();
794 int bi=0; //base offset position in the read
795 while (*p != 0) {
796 if (isdigit(*p)) {
797 int v=atoi(p);
798 do { p++; } while (isdigit(*p));
799 bi+=v;
800 }
801 while (isupper(*p)) {
802 p++;
803 mismatches[num_mismatches]=bi;
804 num_mismatches++;
805 bi++;
806 }
807 if (*p=='^') { //reference deletion
808 p++;
809 while (isupper(*p)) { //insert read bases
810 p++; bi++;
811 }
812 }
813 }
814 }
815 else
816 {
817 //fprintf(stderr, "%s attribute not supported\n", tuple_fields[0].c_str());
818 //return false;
819 }
820 }
821 }
822 /* By convention,the NM field of the SAM record
823 * counts an insertion or deletion. I dont' think
824 * we want the mismatch count in the BowtieHit
825 * record to reflect this. Therefore, subtract out
826 * the mismatches due to in/dels
827 */
828 for(vector<CigarOp>::const_iterator itr = cigar.begin(); itr != cigar.end(); ++itr){
829 if(itr->opcode == INS){
830 sam_nm -= itr->length;
831 }
832 if(itr->opcode == DEL){
833 sam_nm -= itr->length;
834 }
835 }
836 }
837
838 bool SAMHitFactory::get_hit_from_buf(const char* orig_bwt_buf,
839 BowtieHit& bh,
840 bool strip_slash,
841 char* name_out,
842 char* name_tags,
843 char* seq,
844 char* qual)
845 {
846 if (!orig_bwt_buf || !*orig_bwt_buf)
847 return false;
848 char bwt_buf[2048];
849 strcpy(bwt_buf, orig_bwt_buf);
850 // Are we still in the header region?
851 if (bwt_buf[0] == '@')
852 return false;
853
854 char* buf = bwt_buf;
855 char* name = get_token((char**)&buf,"\t");
856 char* sam_flag_str = get_token((char**)&buf,"\t");
857 char* text_name = get_token((char**)&buf,"\t");
858 char* text_offset_str = get_token((char**)&buf,"\t");
859 const char* map_qual_str = get_token((char**)&buf,"\t");
860 char* cigar_str = get_token((char**)&buf,"\t");
861 const char* mate_ref_str = get_token((char**)&buf,"\t");
862 const char* mate_pos_str = get_token((char**)&buf,"\t");
863 const char* inferred_insert_sz_str = get_token((char**)&buf,"\t");
864
865 const char* seq_str = get_token((char**)&buf,"\t");
866 if (seq)
867 strcpy(seq, seq_str);
868
869 const char* qual_str = get_token((char**)&buf,"\t");
870 if (qual)
871 strcpy(qual, qual_str);
872
873 if (!name ||
874 !sam_flag_str ||
875 !text_name ||
876 !text_offset_str ||
877 !map_qual_str ||
878 !cigar_str ||
879 !mate_ref_str ||
880 !mate_pos_str ||
881 !inferred_insert_sz_str ||
882 !seq_str ||
883 !qual_str)
884 {
885 // truncated or malformed SAM record
886 return false;
887 }
888
889
890 int sam_flag = atoi(sam_flag_str);
891 int text_offset = atoi(text_offset_str);
892
893 bool end = true;
894 unsigned int seg_offset = 0;
895 unsigned int seg_num = 0;
896 unsigned int num_segs = 0;
897
898 // Copy the tag out of the name field before we might wipe it out
899 parseSegReadName(name, name_tags, strip_slash, end, seg_offset, seg_num, num_segs);
900
901 vector<CigarOp> cigar;
902 bool spliced_alignment = false;
903 if (parseCigar(cigar, cigar_str, spliced_alignment)) {
904 return false;
905 }
906
907 //vector<string> attributes;
908 //tokenize(tag_buf, " \t",attributes);
909
910 bool antisense_splice = false;
911 int num_mismatches = 0;
912 int sam_nm = 0; //the value of the NM tag (edit distance)
913 int mismatches[1024];
914 getSAMmismatches(buf, cigar, mismatches, num_mismatches, sam_nm, antisense_splice);
915
916 if (spliced_alignment)
917 {
918 bh = create_hit(name,
919 text_name,
920 text_offset - 1,
921 cigar,
922 sam_flag & 0x0010,
923 antisense_splice,
924 num_mismatches,
925 0,
926 end);
927 }
928 else
929 {
930 //assert(cigar.size() == 1 && cigar[0].opcode == MATCH);
931 bh = create_hit(name,
932 text_name,
933 text_offset - 1, // SAM files are 1-indexed
934 cigar,
935 sam_flag & 0x0010,
936 false,
937 num_mismatches,
938 0,
939 end);
940 }
941 return true;
942 }
943
944
945 void spliceCigar(vector<CigarOp>& splcigar, vector<CigarOp>& cigar,
946 int8_t l_len, int mlen, int8_t r_len, CigarOpCode opcode) {
947 //merge the original 'cigar' with the new insert/gap operation
948 // at position l_len into splcigar;
949 //find the original opcode that croses l_len location
950 int ofs=0;
951 bool found=false;
952 for (size_t c = 0 ; c < cigar.size(); ++c)
953 {
954 ofs+=cigar[c].length;
955 if (!found) {
956 if (ofs>=l_len) {
957 found=true;
958 //TODO:inject new code here, splitting existing code if necessary
959
960 continue;
961 }
962 else {
963 //not found yet, just copy these codes
964 splcigar.push_back(cigar[c]);
965 }
966 }
967 else { //found already
968 //TODO: check this
969 splcigar.push_back(cigar[c]);
970 }
971 }
972 }
973
974 bool SplicedSAMHitFactory::get_hit_from_buf(const char* orig_bwt_buf,
975 BowtieHit& bh,
976 bool strip_slash,
977 char* name_out,
978 char* name_tags,
979 char* seq,
980 char* qual)
981 {
982 if (!orig_bwt_buf || !*orig_bwt_buf)
983 return false;
984
985 char bwt_buf[2048];
986 strcpy(bwt_buf, orig_bwt_buf);
987 // Are we still in the header region?
988 if (bwt_buf[0] == '@')
989 return false;
990
991 char* buf = bwt_buf;
992 char* name = get_token((char**)&buf,"\t");
993 char* sam_flag_str = get_token((char**)&buf,"\t");
994 char* text_name = get_token((char**)&buf,"\t");
995 char* text_offset_str = get_token((char**)&buf,"\t");
996 const char* map_qual_str = get_token((char**)&buf,"\t");
997 char* cigar_str = get_token((char**)&buf,"\t");
998 const char* mate_ref_str = get_token((char**)&buf,"\t");
999 const char* mate_pos_str = get_token((char**)&buf,"\t");
1000 const char* inferred_insert_sz_str = get_token((char**)&buf,"\t");
1001 int num_mismatches=0;
1002 int mismatches[1024]; //list of 0-based mismatch positions in this read
1003 //parsed from SAM's MD:Z: tag
1004 const char* seq_str = get_token((char**)&buf,"\t");
1005 if (seq)
1006 strcpy(seq, seq_str);
1007
1008 const char* qual_str = get_token((char**)&buf,"\t");
1009 if (qual)
1010 strcpy(qual, qual_str);
1011
1012 if (!name ||
1013 !sam_flag_str ||
1014 !text_name ||
1015 !text_offset_str ||
1016 !map_qual_str ||
1017 !cigar_str ||
1018 !mate_ref_str ||
1019 !mate_pos_str ||
1020 !inferred_insert_sz_str ||
1021 !seq_str ||
1022 !qual_str)
1023 {
1024 // truncated or malformed SAM record
1025 return false;
1026 }
1027
1028
1029 int sam_flag = atoi(sam_flag_str);
1030 int text_offset = atoi(text_offset_str);
1031 text_offset--; //SAM is 1-based, Bowtie is 0-based
1032 bool end = true;
1033 unsigned int seg_offset = 0;
1034 unsigned int seg_num = 0;
1035 unsigned int num_segs = 0;
1036
1037 // Copy the tag out of the name field before we might wipe it out
1038 parseSegReadName(name, name_tags, strip_slash, end, seg_offset, seg_num, num_segs);
1039
1040 vector<CigarOp> samcigar;
1041 bool spliced_alignment = false;
1042 if (parseCigar(samcigar, cigar_str, spliced_alignment)) {
1043 return false;
1044 }
1045
1046 bool antisense_splice = false;
1047 int sam_nm = 0;
1048 getSAMmismatches(buf, samcigar, mismatches, num_mismatches, sam_nm, antisense_splice);
1049
1050 //##############################################
1051
1052 // Add this alignment to the table of hits for this half of the
1053 // Bowtie map
1054
1055 // Parse the text_name field to recover the splice coords
1056 vector<string> toks;
1057
1058 tokenize_strict(text_name, "|", toks);
1059
1060 int num_extra_toks = (int)toks.size() - 6;
1061
1062 if (num_extra_toks >= 0)
1063 {
1064 static const uint8_t left_window_edge_field = 1;
1065 static const uint8_t splice_field = 2;
1066 //static const uint8_t right_window_edge_field = 3;
1067 static const uint8_t junction_type_field = 4;
1068 static const uint8_t strand_field = 5;
1069
1070 string contig = toks[0];
1071 for (int t = 1; t <= num_extra_toks; ++t)
1072 {
1073 contig += "|";
1074 contig += toks[t];
1075 }
1076
1077 vector<string> splice_toks;
1078 tokenize(toks[num_extra_toks + splice_field], "-", splice_toks);
1079 if (splice_toks.size() != 2)
1080 {
1081 fprintf(stderr, "Warning: found malformed splice record, skipping:\n");
1082 //fprintf(stderr, "\t%s (token: %s)\n", text_name,
1083 // toks[num_extra_toks + splice_field].c_str());
1084 return false;
1085 }
1086 int spl_num_mismatches=0;
1087 //
1088 // check for an insertion hit
1089 //
1090 if(toks[num_extra_toks + junction_type_field] == "ins")
1091 {
1092 int8_t spliced_read_len = strlen(seq_str);
1093 // The 0-based position of the left edge of the alignment. Note that this
1094 // value may need to be futher corrected to account for the presence of
1095 // of the insertion.
1096 uint32_t left = atoi(toks[num_extra_toks + left_window_edge_field].c_str()) + text_offset;
1097 uint32_t right = left + spliced_read_len - 1;
1098
1099 // The 0-based position of the last genomic sequence before the insertion
1100 uint32_t left_splice_pos = atoi(splice_toks[0].c_str());
1101
1102 string insertedSequence = splice_toks[1];
1103 // The 0-based position of the first genomic sequence after the insertion
1104
1105 uint32_t right_splice_pos = left_splice_pos + 1;
1106 if(left > left_splice_pos){
1107 /*
1108 * The genomic position of the left edge of the alignment needs to be corrected
1109 * If the alignment does not extend into the insertion, simply subtract the length
1110 * of the inserted sequence, otherwise, just set it equal to the right edge
1111 */
1112 left = left - insertedSequence.length();
1113 if(left < right_splice_pos){
1114 left = right_splice_pos;
1115 }
1116 }
1117 if(right > left_splice_pos){
1118 right = right - insertedSequence.length();
1119 if(right < left_splice_pos){
1120 right = left_splice_pos;
1121 }
1122 }
1123 /*
1124 * Now, right and left should be properly transformed into genomic coordinates
1125 * We should be able to deduce how much the alignment matches the insertion
1126 * simply based on the length of the read
1127 */
1128 int left_match_length = 0;
1129 if(left <= left_splice_pos){
1130 left_match_length = left_splice_pos - left + 1;
1131 }
1132 int right_match_length = 0;
1133 if(right >= right_splice_pos){
1134 right_match_length = right - right_splice_pos + 1;
1135 }
1136 int insertion_match_length = spliced_read_len - left_match_length - right_match_length;
1137
1138 if(left_match_length <= 0 || right_match_length <= 0 || insertion_match_length <= 0)
1139 return false;
1140
1141 string junction_strand = toks[num_extra_toks + strand_field];
1142 if(junction_strand != "rev" && junction_strand != "fwd"){
1143 fprintf(stderr, "Malformed insertion record\n");
1144 return false;
1145 }
1146
1147 //char* pch = strtok( mismatches, ",");
1148 //unsigned char num_mismatches = 0;
1149 //text_offset holds the left end of the alignment,
1150 //RELATIVE TO the start of the contig
1151
1152 //The 0-based relative position of the left-most character
1153 //before the insertion in the contig
1154 int relative_splice_pos = left_splice_pos - atoi(toks[num_extra_toks + left_window_edge_field].c_str());
1155 for (int i=0;i<num_mismatches;i++) {
1156 int mismatch_pos = mismatches[i];
1157 // for reversely mapped reads,
1158 //find the correct mismatched position.
1159 if (sam_flag & 0x0010){
1160 mismatch_pos = spliced_read_len - mismatch_pos - 1;
1161 }
1162
1163 //Only count mismatches outside of the insertion region
1164 // If there is a mismatch within the insertion,
1165 // disallow this hit
1166 if(mismatch_pos + text_offset <= relative_splice_pos || mismatch_pos + text_offset > relative_splice_pos + insertedSequence.length()){
1167 spl_num_mismatches++;
1168 }else{
1169 return false;
1170 }
1171 }
1172
1173 //TODO: merge with the original cigar string
1174 vector<CigarOp> splcigar;
1175 spliceCigar(splcigar, samcigar, left_match_length, insertion_match_length, right_match_length, INS);
1176 /*
1177 splcigar.push_back(CigarOp(MATCH, left_match_length));
1178 splcigar.push_back(CigarOp(INS, insertion_match_length));
1179 splcigar.push_back(CigarOp(MATCH, right_match_length));
1180 */
1181 /*
1182 * For now, disallow hits that don't span
1183 * the insertion. If we allow these types of hits,
1184 * then long_spanning.cpp needs to be updated
1185 * in order to intelligently merge these kinds
1186 * of reads back together
1187 *
1188 * Following code has been changed to allow segment that end
1189 * in an insertion
1190 */
1191 bh = create_hit(name,
1192 contig,
1193 left,
1194 //splcigar,
1195 splcigar,
1196 sam_flag & 0x0010,
1197 junction_strand == "rev",
1198 spl_num_mismatches,
1199 0,
1200 end);
1201 return true;
1202 } //"ins"
1203 else
1204 {
1205 uint32_t left = atoi(toks[num_extra_toks + left_window_edge_field].c_str()) + text_offset;
1206 int spliced_read_len = strlen(seq_str);
1207 int8_t left_splice_pos = atoi(splice_toks[0].c_str()) - left + 1;
1208 if(left_splice_pos > spliced_read_len) left_splice_pos = spliced_read_len;
1209 int8_t right_splice_pos = spliced_read_len - left_splice_pos;
1210
1211 if (right_splice_pos <= 0 || left_splice_pos <= 0)
1212 return false;
1213
1214 if ((sam_flag & 0x0010) == 0) //######
1215 {
1216 if (left_splice_pos + seg_offset < _anchor_length)
1217 return false;
1218 }
1219 else
1220 {
1221 if (right_splice_pos + seg_offset < _anchor_length)
1222 return false;
1223 }
1224 //uint32_t right = atoi(splice_toks[1].c_str()) + right_splice_pos;
1225 //atoi(toks[right_window_edge_field].c_str());
1226 int gap_len = atoi(splice_toks[1].c_str()) - atoi(splice_toks[0].c_str()) - 1;
1227
1228 string junction_strand = toks[num_extra_toks + strand_field];
1229 if (!(junction_strand == "rev" || junction_strand == "fwd"))
1230 // || !(orientation == '-' || orientation == '+'))
1231 {
1232 fprintf(stderr, "Warning: found malformed splice record, skipping\n");
1233 //fprintf(stderr, "junction_strand=%s, orientation='%c'\n",
1234 // junction_strand.c_str(), orientation);
1235 return false;
1236 }
1237
1238 int mismatches_in_anchor = 0;
1239 for (int i=0;i<num_mismatches;i++) {
1240 spl_num_mismatches++;
1241 int mismatch_pos = mismatches[i];
1242 if (((sam_flag & 0x0010) == 0 && abs(mismatch_pos - left_splice_pos) < (int)min_anchor_len) ||
1243 ((sam_flag & 0x0010) != 0 &&
1244 abs(((int)spliced_read_len - left_splice_pos + 1) - mismatch_pos)) < (int)min_anchor_len)
1245 mismatches_in_anchor++;
1246 }
1247
1248 // FIXME: we probably should exclude these hits somewhere, but this
1249 // isn't the right place
1250 vector<CigarOp> splcigar;
1251 CigarOpCode opcode=(toks[num_extra_toks + junction_type_field] == "del")? DEL : REF_SKIP ;
1252 spliceCigar(splcigar, samcigar, left_splice_pos, gap_len, right_splice_pos, opcode);
1253 /*
1254 splcigar.push_back(CigarOp(MATCH, left_splice_pos));
1255 if(toks[num_extra_toks + junction_type_field] == "del"){
1256 splcigar.push_back(CigarOp(DEL, gap_len));
1257 }else{
1258 splcigar.push_back(CigarOp(REF_SKIP, gap_len));
1259 }
1260 splcigar.push_back(CigarOp(MATCH, right_splice_pos));
1261 */
1262 bh = create_hit(name,
1263 contig,
1264 left,
1265 splcigar,
1266 (sam_flag & 0x0010),
1267 junction_strand == "rev",
1268 spl_num_mismatches,
1269 mismatches_in_anchor,
1270 end);
1271 return true;
1272 }
1273 } //parse splice data
1274 else
1275 {
1276 fprintf(stderr, "Warning: found malformed splice record, skipping\n");
1277 //fprintf(stderr, "%s\n", orig_bwt_buf);
1278 // continue;
1279 return false;
1280 }
1281
1282 return false;
1283 }
1284
1285
1286
1287 bool BAMHitFactory::get_hit_from_buf(const char* orig_bwt_buf,
1288 BowtieHit& bh, bool strip_slash,
1289 char* name_out, char* name_tags,
1290 char* seq, char* qual) {
1291 if (_sam_header==NULL)
1292 err_die("Error: no SAM header when BAMHitFactory::get_hit_from_buf()!");
1293 const bam1_t* hit_buf = (const bam1_t*)orig_bwt_buf;
1294
1295
1296 uint32_t sam_flag = hit_buf->core.flag;
1297
1298 int text_offset = hit_buf->core.pos;
1299 int text_mate_pos = hit_buf->core.mpos;
1300 int target_id = hit_buf->core.tid;
1301 int mate_target_id = hit_buf->core.mtid;
1302
1303 vector<CigarOp> cigar;
1304 bool spliced_alignment = false;
1305 int num_hits = 1;
1306
1307 double mapQ = hit_buf->core.qual;
1308 long double error_prob;
1309 if (mapQ > 0)
1310 {
1311 long double p = (-1.0 * mapQ) / 10.0;
1312 error_prob = pow(10.0L, p);
1313 }
1314 else
1315 {
1316 error_prob = 1.0;
1317 }
1318
1319 //header->target_name[c->tid]
1320
1321 bool end = true;
1322 unsigned int seg_offset = 0;
1323 unsigned int seg_num = 0;
1324 unsigned int num_segs = 0;
1325 // Copy the tag out of the name field before we might wipe it out
1326 char* qname = bam1_qname(hit_buf);
1327 char* pipe = strrchr(qname, '|');
1328 if (pipe)
1329 {
1330 if (name_tags)
1331 strcpy(name_tags, pipe);
1332
1333 char* tag_buf = pipe + 1;
1334 if (strchr(tag_buf, ':'))
1335 {
1336 sscanf(tag_buf, "%u:%u:%u", &seg_offset, &seg_num, &num_segs);
1337 if (seg_num + 1 == num_segs)
1338 end = true;
1339 else
1340 end = false;
1341 }
1342
1343 *pipe = 0;
1344 }
1345
1346
1347 if (target_id < 0) {
1348 //assert(cigar.size() == 1 && cigar[0].opcode == MATCH);
1349 bh = create_hit(qname,
1350 "*", //ref_name
1351 0, //left coord
1352 0, //read_len
1353 false, //antisense_aln
1354 0, //edit_dist
1355 end);
1356 return true;
1357 }
1358
1359 //string text_name = ((samfile_t*)(hs._hit_file))->header->target_name[target_id];
1360 string text_name = _sam_header->target_name[target_id];
1361 for (int i = 0; i < hit_buf->core.n_cigar; ++i)
1362 {
1363 //char* t;
1364
1365 int length = bam1_cigar(hit_buf)[i] >> BAM_CIGAR_SHIFT;
1366 if (length <= 0)
1367 {
1368 fprintf (stderr, "BAM error: CIGAR op has zero length\n");
1369 return false;
1370 }
1371
1372 CigarOpCode opcode;
1373 switch(bam1_cigar(hit_buf)[i] & BAM_CIGAR_MASK)
1374 {
1375 case BAM_CMATCH: opcode = MATCH; break;
1376 case BAM_CINS: opcode = INS; break;
1377 case BAM_CDEL: opcode = DEL; break;
1378 case BAM_CSOFT_CLIP: opcode = SOFT_CLIP; break;
1379 case BAM_CHARD_CLIP: opcode = HARD_CLIP; break;
1380 case BAM_CPAD: opcode = PAD; break;
1381 case BAM_CREF_SKIP:
1382 opcode = REF_SKIP;
1383 spliced_alignment = true;
1384 if (length > (int)max_report_intron_length)
1385 {
1386 //fprintf(stderr, "Encounter REF_SKIP > max_gene_length, skipping\n");
1387 return false;
1388 }
1389 break;
1390 default:
1391 fprintf (stderr, "BAM read: invalid CIGAR operation\n");
1392 return false;
1393 }
1394 if (opcode != HARD_CLIP)
1395 cigar.push_back(CigarOp(opcode, length));
1396 }
1397
1398 string mrnm;
1399 if (mate_target_id >= 0) {
1400 if (mate_target_id == target_id) {
1401 //mrnm = ((samfile_t*)(hs._hit_file))->header->target_name[mate_target_id];
1402 mrnm = _sam_header->target_name[mate_target_id];
1403 }
1404 else {
1405 //fprintf(stderr, "Trans-spliced mates are not currently supported, skipping\n");
1406 return false;
1407 }
1408 }
1409 else {
1410 text_mate_pos = 0;
1411 }
1412 //CuffStrand source_strand = CUFF_STRAND_UNKNOWN;
1413 bool antisense_splice=false;
1414 unsigned char num_mismatches = 0;
1415 unsigned char num_splice_anchor_mismatches = 0;
1416
1417 uint8_t* ptr = bam_aux_get(hit_buf, "XS");
1418 if (ptr) {
1419 char src_strand_char = bam_aux2A(ptr);
1420 if (src_strand_char == '-')
1421 antisense_splice = true;
1422 // else if (src_strand_char == '+')
1423 // source_strand = CUFF_FWD;
1424 }
1425
1426 ptr = bam_aux_get(hit_buf, "NM");
1427 if (ptr) {
1428 num_mismatches = bam_aux2i(ptr);
1429 }
1430
1431 ptr = bam_aux_get(hit_buf, "NH");
1432 if (ptr) {
1433 num_hits = bam_aux2i(ptr);
1434 }
1435
1436 //bool antisense_aln = bam1_strand(hit_buf);
1437
1438 //if (_rg_props.strandedness() == STRANDED_PROTOCOL && source_strand == CUFF_STRAND_UNKNOWN)
1439 // source_strand = use_stranded_protocol(sam_flag, antisense_aln, _rg_props.mate_strand_mapping());
1440 if (spliced_alignment) {
1441 //if (source_strand == CUFF_STRAND_UNKNOWN) {
1442 // fprintf(stderr, "BAM record error: found spliced alignment without XS attribute\n");
1443 // }
1444 bh = create_hit(qname,
1445 text_name,
1446 text_offset, // BAM files are 0-indexed
1447 cigar,
1448 sam_flag & 0x0010,
1449 antisense_splice,
1450 num_mismatches,
1451 num_splice_anchor_mismatches,
1452 end);
1453
1454 }
1455 else {
1456 //assert(_rg_props.strandedness() == STRANDED_PROTOCOL || source_strand == CUFF_STRAND_UNKNOWN);
1457 //assert(cigar.size() == 1 && cigar[0].opcode == MATCH);
1458 bh = create_hit(qname,
1459 text_name,
1460 text_offset, // BAM files are 0-indexed
1461 cigar,
1462 sam_flag & 0x0010,
1463 false,
1464 num_mismatches,
1465 0,
1466 end);
1467 }
1468 if (seq!=NULL) {
1469 char *bseq = (char*)bam1_seq(hit_buf);
1470 for(int i=0;i<(hit_buf->core.l_qseq);i++) {
1471 char v = bam1_seqi(bseq,i);
1472 seq[i]=bam_nt16_rev_table[v];
1473 }
1474 seq[hit_buf->core.l_qseq]=0;
1475 }
1476 if (qual!=NULL) {
1477 char *bq = (char*)bam1_qual(hit_buf);
1478 for(int i=0;i<(hit_buf->core.l_qseq);i++) {
1479 qual[i]=bq[i]+33;
1480 }
1481 qual[hit_buf->core.l_qseq]=0;
1482 }
1483 return true;
1484 }
1485
1486 bool BAMHitFactory::inspect_header(HitStream& hs)
1487 {
1488 bam_header_t* header = ((samfile_t*)(hs._hit_file))->header;
1489
1490 if (header == NULL) {
1491 fprintf(stderr, "Warning: No BAM header\n");
1492 return false;
1493 }
1494 if (header->l_text == 0) {
1495 fprintf(stderr, "Warning: BAM header has 0 length or is corrupted. Try using 'samtools reheader'.\n");
1496 return false;
1497 }
1498 return true;
1499 }
1500
1501
1502 void get_mapped_reads(FILE* bwtf,
1503 HitTable& hits,
1504 HitFactory& hit_factory,
1505 bool strip_slash,
1506 bool verbose)
1507 {
1508
1509
1510 char bwt_buf[2048];
1511 uint32_t reads_extracted = 0;
1512
1513 while (fgets(bwt_buf, 2048, bwtf))
1514 {
1515 // Chomp the newline
1516 char* nl = strrchr(bwt_buf, '\n');
1517 if (nl) *nl = 0;
1518 if (*bwt_buf == 0)
1519 continue;
1520 // Get a new record from the tab-delimited Bowtie map
1521 BowtieHit bh;
1522 if (hit_factory.get_hit_from_buf(bwt_buf, bh, strip_slash))
1523 {
1524 // Only check uniqueness if these hits are spliced
1525 hits.add_hit(bh, true);
1526 }
1527 reads_extracted++;
1528 }
1529
1530 // This will sort the map by insert id.
1531 hits.finalize();
1532 fprintf(stderr, "Extracted %d alignments from Bowtie map\n", reads_extracted);
1533 }
1534
1535
1536 /*
1537 AlignStatus status(const BowtieHit* align)
1538 {
1539 if (!align)
1540 return UNALIGNED;
1541 if (align->contiguous())
1542 return CONTIGUOUS;
1543 return SPLICED;
1544 }
1545 */
1546
1547 void add_hits_to_coverage(const HitList& hits, vector<unsigned short>& DoC)
1548 {
1549 int max_hit_pos = -1;
1550 for (size_t i = 0; i < hits.size(); ++i)
1551 {
1552 max_hit_pos = max((int)hits[i].right(),max_hit_pos);
1553 }
1554
1555 if ((int)DoC.size() < max_hit_pos)
1556 DoC.resize(max_hit_pos);
1557
1558 for (size_t i = 0; i < hits.size(); ++i)
1559 {
1560 const BowtieHit& bh = hits[i];
1561
1562 // split up the coverage contibution for this reads
1563 size_t j = bh.left();
1564 const vector<CigarOp>& cigar = bh.cigar();
1565
1566 for (size_t c = 0 ; c < cigar.size(); ++c)
1567 {
1568 switch(cigar[c].opcode)
1569 {
1570 case MATCH:
1571 for (size_t m = 0; m < cigar[c].length; ++m)
1572 {
1573 if (DoC[j + m] < 0xFFFF)
1574 DoC[j + m]++;
1575 }
1576 //fall through this case to REF_SKIP is intentional
1577 case REF_SKIP:
1578 j += cigar[c].length;
1579 break;
1580 default:
1581 break;
1582 }
1583
1584 }
1585 }
1586 }
1587
1588 void add_hit_to_coverage(const BowtieHit& bh, vector<unsigned int>& DoC)
1589 {
1590 if ((int)DoC.size() < bh.right())
1591 DoC.resize(bh.right());
1592
1593 // split up the coverage contibution for this reads
1594 size_t j = bh.left();
1595 const vector<CigarOp>& cigar = bh.cigar();
1596
1597 for (size_t c = 0 ; c < cigar.size(); ++c)
1598 {
1599 switch(cigar[c].opcode)
1600 {
1601 case MATCH:
1602 for (size_t m = 0; m < cigar[c].length; ++m)
1603 {
1604 if (DoC[j + m] < VMAXINT32)
1605 DoC[j + m]++;
1606 }
1607 //fall through this case to REF_SKIP is intentional
1608 case REF_SKIP:
1609 j += cigar[c].length;
1610 break;
1611 default:
1612 break;
1613 }
1614
1615 }
1616 }
1617
1618 void print_hit(FILE* fout,
1619 const char* read_name,
1620 const BowtieHit& bh,
1621 const char* ref_name,
1622 const char* sequence,
1623 const char* qualities,
1624 bool from_bowtie)
1625 {
1626 string seq;
1627 string quals;
1628 if (sequence)
1629 {
1630 seq = sequence;
1631 quals = qualities;
1632 seq.resize(bh.read_len());
1633 quals.resize(bh.read_len());
1634 }
1635 else
1636 {
1637 seq = "*";
1638 }
1639
1640 if (qualities)
1641 {
1642 quals = qualities;
1643 quals.resize(bh.read_len());
1644 }
1645 else
1646 {
1647 quals = "*";
1648 }
1649
1650 uint32_t sam_flag = 0;
1651 if (bh.antisense_align())
1652 {
1653 sam_flag |= 0x0010; // BAM_FREVERSE
1654 if (sequence && !from_bowtie) // if it is from bowtie hit, it's already reversed.
1655 {
1656 reverse_complement(seq);
1657 reverse(quals.begin(), quals.end());
1658 }
1659 }
1660
1661 uint32_t sam_pos = bh.left() + 1;
1662 uint32_t map_quality = 255;
1663 char cigar[256];
1664 cigar[0] = 0;
1665 string mate_ref_name = "*";
1666 uint32_t mate_pos = 0;
1667 uint32_t insert_size = 0;
1668 //string qualities = "*";
1669
1670 const vector<CigarOp>& bh_cigar = bh.cigar();
1671
1672 /*
1673 * In addition to calculating the cigar string,
1674 * we need to figure out how many in/dels are in the
1675 * sequence, so that we can give the correct
1676 * value for the NM tag
1677 */
1678 int indel_distance = 0;
1679 for (size_t c = 0; c < bh_cigar.size(); ++c)
1680 {
1681 char ibuf[64];
1682 sprintf(ibuf, "%d", bh_cigar[c].length);
1683 switch(bh_cigar[c].opcode)
1684 {
1685 case MATCH:
1686 strcat(cigar, ibuf);
1687 strcat(cigar, "M");
1688 break;
1689 case INS:
1690 strcat(cigar, ibuf);
1691 strcat(cigar, "I");
1692 indel_distance += bh_cigar[c].length;
1693 break;
1694 case DEL:
1695 strcat(cigar, ibuf);
1696 strcat(cigar, "D");
1697 indel_distance += bh_cigar[c].length;
1698 break;
1699 case REF_SKIP:
1700 strcat(cigar, ibuf);
1701 strcat(cigar, "N");
1702 break;
1703 default:
1704 break;
1705 }
1706 }
1707
1708 //string q = string(bh.read_len(), '!');
1709 //string s = string(bh.read_len(), 'N');
1710
1711 fprintf(fout,
1712 "%s\t%d\t%s\t%d\t%d\t%s\t%s\t%d\t%d\t%s\t%s",
1713 read_name,
1714 sam_flag,
1715 ref_name,
1716 sam_pos,
1717 map_quality,
1718 cigar,
1719 mate_ref_name.c_str(),
1720 mate_pos,
1721 insert_size,
1722 seq.c_str(),
1723 quals.c_str());
1724
1725 if (!sam_readgroup_id.empty())
1726 fprintf(fout, "\tRG:Z:%s", sam_readgroup_id.c_str());
1727
1728 fprintf(fout, "\tNM:i:%d", bh.edit_dist() + indel_distance);
1729
1730 bool containsSplice = false;
1731 for(vector<CigarOp>::const_iterator itr = bh.cigar().begin(); itr != bh.cigar().end(); ++itr){
1732 if(itr->opcode == REF_SKIP){
1733 containsSplice = true;
1734 break;
1735 }
1736 }
1737
1738 if (containsSplice)
1739 fprintf(fout, "\tXS:A:%c", bh.antisense_splice() ? '-' : '+');
1740
1741 fprintf(fout, "\n");
1742 }
1743
1744 void print_bamhit(GBamWriter& wbam,
1745 const char* read_name,
1746 const BowtieHit& bh,
1747 const char* ref_name,
1748 const char* sequence,
1749 const char* qualities,
1750 bool from_bowtie)
1751 {
1752 string seq;
1753 string quals;
1754 if (sequence) {
1755 seq = sequence;
1756 quals = qualities;
1757 seq.resize(bh.read_len());
1758 quals.resize(bh.read_len());
1759 }
1760 else {
1761 seq = "*";
1762 }
1763 if (qualities) {
1764 quals = qualities;
1765 quals.resize(bh.read_len());
1766 }
1767 else
1768 {
1769 quals = "*";
1770 }
1771
1772 uint32_t sam_flag = 0;
1773 if (bh.antisense_align())
1774 {
1775 sam_flag |= 0x0010; // BAM_FREVERSE
1776 if (sequence && !from_bowtie) // if it is from bowtie hit, it's already reversed.
1777 {
1778 reverse_complement(seq);
1779 reverse(quals.begin(), quals.end());
1780 }
1781 }
1782
1783 uint32_t sam_pos = bh.left() + 1;
1784 uint32_t map_quality = 255;
1785 char cigar[256];
1786 cigar[0] = 0;
1787 string mate_ref_name = "*";
1788 uint32_t mate_pos = 0;
1789 uint32_t insert_size = 0;
1790 //string qualities = "*";
1791
1792 const vector<CigarOp>& bh_cigar = bh.cigar();
1793 /*
1794 * In addition to calculating the cigar string,
1795 * we need to figure out how many in/dels are in the
1796 * sequence, so that we can give the correct
1797 * value for the NM tag
1798 */
1799 int indel_distance = 0;
1800 for (size_t c = 0; c < bh_cigar.size(); ++c)
1801 {
1802 char ibuf[64];
1803 sprintf(ibuf, "%d", bh_cigar[c].length);
1804 switch(bh_cigar[c].opcode)
1805 {
1806 case MATCH:
1807 strcat(cigar, ibuf);
1808 strcat(cigar, "M");
1809 break;
1810 case INS:
1811 strcat(cigar, ibuf);
1812 strcat(cigar, "I");
1813 indel_distance += bh_cigar[c].length;
1814 break;
1815 case DEL:
1816 strcat(cigar, ibuf);
1817 strcat(cigar, "D");
1818 indel_distance += bh_cigar[c].length;
1819 break;
1820 case REF_SKIP:
1821 strcat(cigar, ibuf);
1822 strcat(cigar, "N");
1823 break;
1824 default:
1825 break;
1826 }
1827 }
1828
1829 //string q = string(bh.read_len(), '!');
1830 //string s = string(bh.read_len(), 'N');
1831 /*
1832 fprintf(fout,
1833 "%s\t%d\t%s\t%d\t%d\t%s\t%s\t%d\t%d\t%s\t%s",
1834 read_name,
1835 sam_flag,
1836 ref_name,
1837 sam_pos,
1838 map_quality,
1839 cigar,
1840 mate_ref_name.c_str(),
1841 mate_pos,
1842 insert_size,
1843 seq.c_str(),
1844 quals.c_str());
1845
1846 fprintf(fout, "\tNM:i:%d", bh.edit_dist() + indel_distance);
1847 */
1848 vector<string> auxdata;
1849
1850 if (!sam_readgroup_id.empty())
1851 {
1852 string nm("RG:Z:");
1853 nm += sam_readgroup_id;
1854 auxdata.push_back(nm);
1855 }
1856
1857 string nm("NM:i:");
1858 str_appendInt(nm, bh.edit_dist() + indel_distance);
1859 auxdata.push_back(nm);
1860 bool containsSplice = false;
1861 for(vector<CigarOp>::const_iterator itr = bh.cigar().begin(); itr != bh.cigar().end(); ++itr)
1862 if(itr->opcode == REF_SKIP){
1863 containsSplice = true;
1864 break;
1865 }
1866
1867 if (containsSplice) {
1868 //fprintf(fout, "\tXS:A:%c", bh.antisense_splice() ? '-' : '+');
1869 nm="XS:A:";
1870 nm+=(char)(bh.antisense_splice() ? '-' : '+');
1871 auxdata.push_back(nm);
1872 }
1873
1874 GBamRecord *brec = wbam.new_record(read_name, sam_flag, ref_name, sam_pos, map_quality,
1875 cigar, mate_ref_name.c_str(), mate_pos,
1876 insert_size, seq.c_str(), quals.c_str(), &auxdata);
1877
1878
1879
1880 wbam.write(brec);
1881 delete brec;
1882 }
1883
1884 /**
1885 * Print a vector of cigar operations to a file.
1886 * @param bh_cigar A vector of CigarOps
1887 * @return a string representation of the cigar string
1888 */
1889 std::string print_cigar(vector<CigarOp>& bh_cigar){
1890 char cigar[256];
1891 cigar[0] = 0;
1892 for (size_t c = 0; c < bh_cigar.size(); ++c)
1893 {
1894 char ibuf[64];
1895 sprintf(ibuf, "%d", bh_cigar[c].length);
1896 switch(bh_cigar[c].opcode)
1897 {
1898 case MATCH:
1899 strcat(cigar, ibuf);
1900 strcat(cigar, "M");
1901 break;
1902 case INS:
1903 strcat(cigar, ibuf);
1904 strcat(cigar, "I");
1905 break;
1906 case DEL:
1907 strcat(cigar, ibuf);
1908 strcat(cigar, "D");
1909 break;
1910 case REF_SKIP:
1911 strcat(cigar, ibuf);
1912 strcat(cigar, "N");
1913 break;
1914 default:
1915 break;
1916 }
1917 }
1918 string result(cigar);
1919 return result;
1920 }
1921
1922 bool BowtieHit::check_editdist_consistency(const RefSequenceTable& rt)
1923 {
1924 RefSequenceTable::Sequence* ref_str = rt.get_seq(_ref_id);
1925 if (!ref_str)
1926 return false;
1927
1928 const seqan::Dna5String ref_seq = seqan::infix(*ref_str, _left, right());
1929
1930 size_t pos_seq = 0;
1931 size_t pos_ref = 0;
1932 size_t mismatch = 0;
1933 for (size_t i = 0; i < _cigar.size(); ++i)
1934 {
1935 CigarOp cigar = _cigar[i];
1936 switch(cigar.opcode)
1937 {
1938 case MATCH:
1939 {
1940 for (size_t j = 0; j < cigar.length; ++j)
1941 {
1942 if (_seq[pos_seq++] != ref_seq[pos_ref++])
1943 ++mismatch;
1944 }
1945 }
1946 break;
1947 case INS:
1948 {
1949 pos_seq += cigar.length;
1950 }
1951 break;
1952
1953 case DEL:
1954 case REF_SKIP:
1955 {
1956 pos_ref += cigar.length;
1957 }
1958 break;
1959
1960 default:
1961 break;
1962 }
1963 }
1964
1965 return mismatch == _edit_dist;
1966 }