ViewVC Help
View File | Revision Log | Show Annotations | View Changeset | Root Listing
root/gclib/tophat_cpp/bwt_map.cpp
Revision: 139
Committed: Thu Dec 15 22:36:13 2011 UTC (8 years, 5 months ago) by gpertea
File size: 57033 byte(s)
Log Message:
spliceCigar() implemented, needs testing

Line File contents
1 /*
2 * bwt_map.cpp
3 * TopHat
4 *
5 * Created by Cole Trapnell on 11/17/08.
6 * Copyright 2008 Cole Trapnell. All rights reserved.
7 *
8 */
9
10 #ifdef HAVE_CONFIG_H
11 #include <config.h>
12 #endif
13
14 #include <cassert>
15 #include <cstdlib>
16 #include <iostream>
17 #include <set>
18 #include <vector>
19 #include <cmath>
20
21 #include "common.h"
22 #include "bwt_map.h"
23 #include "tokenize.h"
24 #include "reads.h"
25
26 using namespace std;
27
28 void HitTable::add_hit(const BowtieHit& bh, bool check_uniqueness)
29 {
30 uint32_t reference_id = bh.ref_id();
31
32 pair<RefHits::iterator, bool> ret =
33 _hits_for_ref.insert(make_pair(reference_id, HitList()));
34 HitList& hl = ret.first->second;
35
36 if (check_uniqueness)
37 {
38 // Check uniqueness, in case we are adding spliced hits from
39 // several spliced alignment sources (e.g. de novo hashing + Bowtie
40 // against a user-supplied index). We don't want to count the same
41 // alignment twice if it happened to be found by more than one method
42 HitList::const_iterator lb = lower_bound(hl.begin(), hl.end(), bh, hit_insert_id_lt);
43 HitList::const_iterator ub = upper_bound(hl.begin(), hl.end(), bh, hit_insert_id_lt);
44 for (; lb != ub && lb != hl.end(); ++lb)
45 {
46 if (*lb == bh)
47 {
48 //fprintf(stderr, "Chucking duplicate read %d by identity\n", bh.insert_id());
49 return;
50 }
51
52 if (lb->insert_id() == bh.insert_id() &&
53 lb->ref_id() == bh.ref_id() &&
54 lb->antisense_align() == bh.antisense_align())
55 {
56 // If we get here, we may be looking at the same alignment
57 // However, spanning_reads may report a shorter, trimmed alignment
58 // so not all fields will be equal. If they just disagree on the
59 // ends, and don't indicate a different junction coord, the
60 // alignments are the same.
61
62 if ((lb->left() <= bh.left() && lb->right() >= bh.right()) ||
63 (bh.left() <= lb->left() && bh.right() >= lb->right()))
64 {
65 vector<pair<int,int> > lb_gaps, bh_gaps;
66 lb->gaps(lb_gaps);
67 bh.gaps(bh_gaps);
68 if (lb_gaps == bh_gaps)
69 {
70 // One alignment is contained in the other, they agree on
71 // where the gaps, if any, are, and they share an id
72 // => this is a redundant aligment, so toss it
73 //fprintf(stderr, "Chucking duplicate read %d by gap agreement\n", bh.insert_id());
74 return;
75 }
76 }
77 }
78 }
79 }
80 _total_hits++;
81 hl.push_back(bh);
82 }
83
84 bool hit_insert_id_lt(const BowtieHit& h1, const BowtieHit& h2)
85 {
86 return h1.insert_id() < h2.insert_id();
87 }
88
89 void LineHitFactory::openStream(HitStream& hs)
90 {
91 if (hs._hit_file==NULL && !hs._hit_file_name.empty()) {
92 //open the file for HitStream here
93 hs._hit_file=fopen(hs._hit_file_name.c_str(),"r");
94 if (hs._hit_file==NULL)
95 err_die("Error opening HitStream file %s\n",hs._hit_file_name.c_str());
96 return;
97 }
98 if (hs._fzpipe!=NULL) {
99 hs._hit_file=hs._fzpipe->file;
100 }
101 }
102
103 void LineHitFactory::rewind(HitStream& hs)
104 {
105 if (hs._fzpipe!=NULL) {
106 hs._fzpipe->rewind();
107 hs._hit_file=hs._fzpipe->file;
108 }
109 else if (hs._hit_file) ::rewind((FILE*)(hs._hit_file));
110 }
111
112 bool LineHitFactory::next_record(HitStream& hs, const char*& buf, size_t& buf_size) {
113 FILE* f=(FILE *)(hs._hit_file);
114 bool new_rec = (fgets(_hit_buf, _hit_buf_max_sz - 1, f)!=NULL);
115 if (!new_rec || feof(f)) {
116 hs._eof=true;
117 return false;
118 }
119 ++_line_num;
120 char* nl = strrchr(_hit_buf, '\n');
121 if (nl) *nl = 0;
122 buf = _hit_buf;
123 buf_size = _hit_buf_max_sz - 1;
124 return true;
125 }
126
127 void LineHitFactory::closeStream(HitStream& hs) {
128 if (hs._fzpipe!=NULL) {
129 hs._fzpipe->close();
130 return;
131 }
132 if (hs._hit_file!=NULL) {
133 fclose((FILE*)(hs._hit_file));
134 hs._hit_file=NULL;
135 }
136 }
137 void BAMHitFactory::openStream(HitStream& hs) {
138 if (hs._hit_file==NULL) {
139 if (hs._hit_file_name.empty())
140 //err_die("Error: invalid HitStream set for BAMHitFactory(file name missing)\n");
141 return; //invalid stream, could be just a place holder
142 //open the file here if not already open
143 string fext=getFext(hs._hit_file_name);
144 if (fext=="sam")
145 hs._hit_file = samopen(hs._hit_file_name.c_str(), "r", 0);
146 else
147 hs._hit_file = samopen(hs._hit_file_name.c_str(), "rb", 0);
148
149 samfile_t* sam_file=(samfile_t*)(hs._hit_file);
150
151 if (sam_file == NULL)
152 err_die("Error opening SAM file %s\n", hs._hit_file_name.c_str());
153 if (sam_file->header == NULL)
154 err_die("Error: no SAM header found for file %s\n", hs._hit_file_name.c_str());
155 memset(&_next_hit, 0, sizeof(_next_hit));
156 //_beginning = bgzf_tell(sam_file->x.bam);
157 _sam_header=sam_file->header;
158 if (inspect_header(hs) == false)
159 err_die("Error: invalid SAM header for file %s\n",
160 hs._hit_file_name.c_str());
161 }
162 }
163
164 void BAMHitFactory::closeStream(HitStream& hs) {
165 if (hs._hit_file) {
166 samclose((samfile_t*)(hs._hit_file));
167 }
168 hs._hit_file=NULL;
169 _sam_header=NULL;
170 }
171
172 void BAMHitFactory::rewind(HitStream& hs)
173 {
174 /*
175 if (_hit_file && ((samfile_t*)_hit_file)->x.bam)
176 {
177 bgzf_seek(((samfile_t*)_hit_file)->x.bam, _beginning, SEEK_SET);
178 _eof = false;
179 }
180 */
181 this->closeStream(hs);
182 this->openStream(hs);
183 }
184
185 string BAMHitFactory::hitfile_rec(HitStream& hs, const char* hit_buf) {
186 const bam1_t* bamrec=(const bam1_t*)hit_buf;
187 char* tamline=bam_format1(((samfile_t*)(hs._hit_file))->header, bamrec);
188 string sam_line(tamline);
189 free(tamline);
190 return sam_line;
191 }
192
193 bool BAMHitFactory::next_record(HitStream& hs, const char*& buf, size_t& buf_size) {
194 if (_next_hit.data) {
195 free(_next_hit.data);
196 _next_hit.data = NULL;
197 }
198 _sam_header=((samfile_t*)(hs._hit_file))->header; //needed by get_hit_from_buf later on
199 if (hs.eof() || !hs.ready()) return false;
200
201 //mark_curr_pos();
202
203 memset(&_next_hit, 0, sizeof(_next_hit));
204
205 int bytes_read = samread((samfile_t*)(hs._hit_file), &_next_hit);
206 if (bytes_read <= 0) {
207 hs._eof = true;
208 return false;
209 }
210 buf = (const char*)&_next_hit;
211 buf_size = bytes_read;
212 return true;
213 }
214
215
216 BowtieHit HitFactory::create_hit(const string& insert_name,
217 const string& ref_name,
218 int left,
219 const vector<CigarOp>& cigar,
220 bool antisense_aln,
221 bool antisense_splice,
222 unsigned char edit_dist,
223 unsigned char splice_mms,
224 bool end)
225 {
226 uint64_t insert_id = _insert_table.get_id(insert_name);
227 uint32_t reference_id = _ref_table.get_id(ref_name, NULL, 0);
228
229 return BowtieHit(reference_id,
230 insert_id,
231 left,
232 cigar,
233 antisense_aln,
234 antisense_splice,
235 edit_dist,
236 splice_mms,
237 end);
238 }
239
240 BowtieHit HitFactory::create_hit(const string& insert_name,
241 const string& ref_name,
242 uint32_t left,
243 uint32_t read_len,
244 bool antisense_aln,
245 unsigned char edit_dist,
246 bool end)
247 {
248 uint64_t insert_id = _insert_table.get_id(insert_name);
249 uint32_t reference_id = _ref_table.get_id(ref_name, NULL, 0);
250
251 return BowtieHit(reference_id,
252 insert_id,
253 left,
254 read_len,
255 antisense_aln,
256 edit_dist,
257 end);
258 }
259
260 bool BowtieHitFactory::get_hit_from_buf(const char* orig_bwt_buf,
261 BowtieHit& bh,
262 bool strip_slash,
263 char* name_out,
264 char* name_tags,
265 char* seq,
266 char* qual)
267 {
268 if (!orig_bwt_buf || !*orig_bwt_buf)
269 return false;
270
271 static const int buf_size = 2048;
272
273 char bwt_buf[buf_size];
274 strcpy(bwt_buf, orig_bwt_buf);
275 //const char* bwt_fmt_str = "%s %c %s %d %s %s %d %s";
276
277 char orientation;
278
279 //int bwtf_ret = 0;
280 //uint32_t seqid = 0;
281
282 char text_name[buf_size];
283 unsigned int text_offset;
284
285
286 //unsigned int other_occs;
287 char mismatches[buf_size];
288 //memset(mismatches, 0, sizeof(mismatches));
289
290 const char* buf = bwt_buf;
291 char* name = get_token((char**)&buf,"\t");
292 char* orientation_str = get_token((char**)&buf,"\t");
293 char* text_name_str = get_token((char**)&buf,"\t");
294
295 strcpy(text_name, text_name_str);
296
297 char* text_offset_str = get_token((char**)&buf,"\t");
298 char* seq_str = get_token((char**)&buf,"\t");
299 if (seq)
300 strcpy(seq, seq_str);
301
302 const char* qual_str = get_token((char**)&buf,"\t");
303 if (qual)
304 strcpy(qual, qual_str);
305
306 /*const char* other_occs_str =*/ get_token((char**)&buf, "\t");
307 mismatches[0] = 0;
308 char* mismatches_str = get_token((char**)&buf, "\t");
309 if (mismatches_str)
310 strcpy(mismatches, mismatches_str);
311
312 orientation = orientation_str[0];
313 text_offset = atoi(text_offset_str);
314
315 bool end = true;
316 unsigned int seg_offset = 0;
317 unsigned int seg_num = 0;
318 unsigned int num_segs = 0;
319
320 // Copy the tag out of the name field before we might wipe it out
321 char* pipe = strrchr(name, '|');
322 if (pipe)
323 {
324 if (name_tags)
325 strcpy(name_tags, pipe);
326
327 char* tag_buf = pipe + 1;
328 if (strchr(tag_buf, ':'))
329 {
330 sscanf(tag_buf, "%u:%u:%u", &seg_offset, &seg_num, &num_segs);
331 if (seg_num + 1 == num_segs)
332 end = true;
333 else
334 end = false;
335 }
336
337 *pipe = 0;
338 }
339 // Stripping the slash and number following it gives the insert name
340 char* slash = strrchr(name, '/');
341 if (strip_slash && slash)
342 *slash = 0;
343
344 size_t read_len = strlen(seq_str);
345
346 // Add this alignment to the table of hits for this half of the
347 // Bowtie map
348
349 //vector<string> mismatch_toks;
350 char* pch = strtok (mismatches,",");
351 unsigned char num_mismatches = 0;
352 while (pch != NULL)
353 {
354 char* colon = strchr(pch, ':');
355 if (colon)
356 {
357 num_mismatches++;
358 }
359 //mismatch_toks.push_back(pch);
360 pch = strtok (NULL, ",");
361 }
362
363 bh = create_hit(name,
364 text_name,
365 text_offset,
366 (int)read_len,
367 orientation == '-',
368 num_mismatches,
369 end);
370
371 return true;
372 }
373
374 int anchor_mismatch = 0;
375
376
377 void parseSegReadName(char* name, char*& name_tags, bool strip_slash,
378 bool &end, unsigned int &seg_offset, unsigned int& seg_num,
379 unsigned int & num_segs) {
380 char* pipe = strrchr(name, '|');
381 if (pipe)
382 {
383 if (name_tags)
384 strcpy(name_tags, pipe);
385
386 char* tag_buf = pipe + 1;
387 if (strchr(tag_buf, ':'))
388 {
389 sscanf(tag_buf, "%u:%u:%u", &seg_offset, &seg_num, &num_segs);
390 if (seg_num + 1 == num_segs)
391 end = true;
392 else
393 end = false;
394 }
395
396 *pipe = 0;
397 }
398
399 // Stripping the slash and number following it gives the insert name
400 char* slash = strrchr(name, '/');
401 if (strip_slash && slash)
402 *slash = 0;
403 }
404
405
406 bool SplicedBowtieHitFactory::get_hit_from_buf(const char* orig_bwt_buf,
407 BowtieHit& bh,
408 bool strip_slash,
409 char* name_out,
410 char* name_tags,
411 char* seq,
412 char* qual)
413 {
414 if (!orig_bwt_buf || !*orig_bwt_buf)
415 return false;
416
417 static const int buf_size = 2048;
418
419 char bwt_buf[buf_size];
420 strcpy(bwt_buf, orig_bwt_buf);
421 //const char* bwt_fmt_str = "%s %c %s %d %s %s %d %s";
422
423 char orientation;
424 char text_name[buf_size];
425 unsigned int text_offset;
426 char mismatches[buf_size];
427 //memset(mismatches, 0, sizeof(mismatches));
428
429 char* buf = bwt_buf;
430 char* name = get_token((char**)&buf,"\t");
431 char* orientation_str = get_token((char**)&buf,"\t");
432 char* text_name_str = get_token((char**)&buf,"\t");
433 strcpy(text_name, text_name_str);
434
435 char* text_offset_str = get_token((char**)&buf,"\t");
436 char* seq_str = get_token((char**)&buf,"\t");
437 if (seq)
438 strcpy(seq, seq_str);
439
440 const char* qual_str = get_token((char**)&buf,"\t");
441 if (qual)
442 strcpy(qual, qual_str);
443
444 /*const char* other_occs_str =*/ get_token((char**)&buf, "\t");
445 mismatches[0] = 0;
446 char* mismatches_str = get_token((char**)&buf, "\t");
447 if (mismatches_str)
448 strcpy(mismatches, mismatches_str);
449
450 orientation = orientation_str[0];
451 text_offset = atoi(text_offset_str);
452
453 bool end = true;
454 unsigned int seg_offset = 0;
455 unsigned int seg_num = 0;
456 unsigned int num_segs = 0;
457
458 // Copy the tag out of the name field before we might wipe it out
459 parseSegReadName(name, name_tags, strip_slash, end, seg_offset, seg_num, num_segs);
460
461 // Add this alignment to the table of hits for this half of the
462 // Bowtie map
463
464 // Parse the text_name field to recover the splice coords
465 vector<string> toks;
466
467 tokenize_strict(text_name, "|", toks);
468
469 int num_extra_toks = (int)toks.size() - 6;
470
471 if (num_extra_toks >= 0)
472 {
473 static const uint8_t left_window_edge_field = 1;
474 static const uint8_t splice_field = 2;
475 //static const uint8_t right_window_edge_field = 3;
476 static const uint8_t junction_type_field = 4;
477 static const uint8_t strand_field = 5;
478
479 string contig = toks[0];
480 for (int t = 1; t <= num_extra_toks; ++t)
481 {
482 contig += "|";
483 contig += toks[t];
484 }
485
486 vector<string> splice_toks;
487 tokenize(toks[num_extra_toks + splice_field], "-", splice_toks);
488 if (splice_toks.size() != 2)
489 {
490 fprintf(stderr, "Warning: found malformed splice record, skipping:\n");
491 //fprintf(stderr, "%s (token: %s)\n", text_name,
492 // toks[num_extra_toks + splice_field].c_str());
493 return false;
494 }
495
496 //
497 // check for an insertion hit
498 //
499 if(toks[num_extra_toks + junction_type_field] == "ins")
500 {
501 int8_t spliced_read_len = strlen(seq_str);
502 /*
503 * The 0-based position of the left edge of the alignment. Note that this
504 * value may need to be futher corrected to account for the presence of
505 * of the insertion.
506 */
507 uint32_t left = atoi(toks[num_extra_toks + left_window_edge_field].c_str()) + text_offset;
508 uint32_t right = left + spliced_read_len - 1;
509
510
511 /*
512 * The 0-based position of the last genomic sequence before the insertion
513 */
514 uint32_t left_splice_pos = atoi(splice_toks[0].c_str());
515
516 string insertedSequence = splice_toks[1];
517 /*
518 * The 0-based position of the first genomic sequence after teh insertion
519 */
520 uint32_t right_splice_pos = left_splice_pos + 1;
521 if(left > left_splice_pos){
522 /*
523 * The genomic position of the left edge of the alignment needs to be corrected
524 * If the alignment does not extend into the insertion, simply subtract the length
525 * of the inserted sequence, otherwise, just set it equal to the right edge
526 */
527 left = left - insertedSequence.length();
528 if(left < right_splice_pos){
529 left = right_splice_pos;
530 }
531 }
532 if(right > left_splice_pos){
533 right = right - insertedSequence.length();
534 if(right < left_splice_pos){
535 right = left_splice_pos;
536 }
537 }
538 /*
539 * Now, right and left should be properly transformed into genomic coordinates
540 * We should be able to deduce how much the alignment matches the insertion
541 * simply based on the length of the read
542 */
543 int left_match_length = 0;
544 if(left <= left_splice_pos){
545 left_match_length = left_splice_pos - left + 1;
546 }
547 int right_match_length = 0;
548 if(right >= right_splice_pos){
549 right_match_length = right - right_splice_pos + 1;
550 }
551 int insertion_match_length = spliced_read_len - left_match_length - right_match_length;
552
553 if(left_match_length <= 0 || right_match_length <= 0 || insertion_match_length <= 0)
554 return false;
555
556 string junction_strand = toks[num_extra_toks + strand_field];
557 if(junction_strand != "rev" && junction_strand != "fwd"){
558 fprintf(stderr, "Malformed insertion record\n");
559 return false;
560 }
561
562 char* pch = strtok( mismatches, ",");
563 unsigned char num_mismatches = 0;
564
565 /*
566 * remember that text_offset holds the left end of the
567 * alignment, relative to the start of the contig
568 */
569
570 /*
571 * The 0-based relative position of the left-most character
572 * before the insertion in the contig
573 */
574 int relative_splice_pos = left_splice_pos - atoi(toks[num_extra_toks + left_window_edge_field].c_str());
575 while (pch != NULL)
576 {
577 char* colon = strchr(pch, ':');
578 if (colon)
579 {
580 *colon = 0;
581 int mismatch_pos = atoi(pch);
582
583 /*
584 * for reversely mapped reads,
585 * find the correct mismatched position.
586 */
587 if(orientation == '-'){
588 mismatch_pos = spliced_read_len - mismatch_pos - 1;
589 }
590
591 /*
592 * Only count mismatches outside of the insertion region
593 * If there is a mismatch within the insertion,
594 * disallow this hit
595 */
596 if(mismatch_pos + text_offset <= relative_splice_pos || mismatch_pos + text_offset > relative_splice_pos + insertedSequence.length()){
597 num_mismatches++;
598 }else{
599 return false;
600 }
601 }
602 pch = strtok (NULL, ",");
603 }
604
605
606 vector<CigarOp> cigar;
607 cigar.push_back(CigarOp(MATCH, left_match_length));
608 cigar.push_back(CigarOp(INS, insertion_match_length));
609 cigar.push_back(CigarOp(MATCH, right_match_length));
610
611 /*
612 * For now, disallow hits that don't span
613 * the insertion. If we allow these types of hits,
614 * then long_spanning.cpp needs to be updated
615 * in order to intelligently merge these kinds
616 * of reads back together
617 *
618 * Following code has been changed to allow segment that end
619 * in an insertion
620 */
621 bh = create_hit(name,
622 contig,
623 left,
624 cigar,
625 orientation == '-',
626 junction_strand == "rev",
627 num_mismatches,
628 0,
629 end);
630 return true;
631 }
632
633 else
634 {
635 uint32_t left = atoi(toks[num_extra_toks + left_window_edge_field].c_str()) + text_offset;
636 int spliced_read_len = strlen(seq_str);
637 int8_t left_splice_pos = atoi(splice_toks[0].c_str()) - left + 1;
638 if(left_splice_pos > spliced_read_len) left_splice_pos = spliced_read_len;
639 int8_t right_splice_pos = spliced_read_len - left_splice_pos;
640
641 if (right_splice_pos <= 0 || left_splice_pos <= 0)
642 return false;
643
644 if (orientation == '+')
645 {
646 if (left_splice_pos + seg_offset < _anchor_length){
647 return false;
648 }
649 }
650 else
651 {
652 if (right_splice_pos + seg_offset < _anchor_length)
653 return false;
654 }
655 //uint32_t right = atoi(splice_toks[1].c_str()) + right_splice_pos;
656 //atoi(toks[right_window_edge_field].c_str());
657 int gap_len = atoi(splice_toks[1].c_str()) - atoi(splice_toks[0].c_str()) - 1;
658
659 string junction_strand = toks[num_extra_toks + strand_field];
660 if (!(junction_strand == "rev" || junction_strand == "fwd")||
661 !(orientation == '-' || orientation == '+'))
662 {
663 fprintf(stderr, "Warning: found malformed splice record, skipping\n");
664 //fprintf(stderr, "junction_strand=%s, orientation='%c'\n",
665 // junction_strand.c_str(), orientation);
666 return false;
667 }
668
669 //vector<string> mismatch_toks;
670 char* pch = strtok (mismatches,",");
671 int mismatches_in_anchor = 0;
672 unsigned char num_mismatches = 0;
673 while (pch != NULL)
674 {
675 char* colon = strchr(pch, ':');
676 if (colon)
677 {
678 *colon = 0;
679 num_mismatches++;
680 int mismatch_pos = atoi(pch);
681 if ((orientation == '+' && abs(mismatch_pos - left_splice_pos) < (int)min_anchor_len) ||
682 (orientation == '-' && abs(((int)spliced_read_len - left_splice_pos + 1) - mismatch_pos)) < (int)min_anchor_len)
683 mismatches_in_anchor++;
684 }
685 //mismatch_toks.push_back(pch);
686 pch = strtok (NULL, ",");
687 }
688
689 // FIXME: we probably should exclude these hits somewhere, but this
690 // isn't the right place
691 vector<CigarOp> cigar;
692 cigar.push_back(CigarOp(MATCH, left_splice_pos));
693 if(toks[num_extra_toks + junction_type_field] == "del"){
694 cigar.push_back(CigarOp(DEL, gap_len));
695 }else{
696 cigar.push_back(CigarOp(REF_SKIP, gap_len));
697 }
698 cigar.push_back(CigarOp(MATCH, right_splice_pos));
699
700 bh = create_hit(name,
701 contig,
702 left,
703 cigar,
704 orientation == '-',
705 junction_strand == "rev",
706 num_mismatches,
707 mismatches_in_anchor,
708 end);
709 return true;
710 }
711 }
712 else
713 {
714 fprintf(stderr, "Warning: found malformed splice record, skipping\n");
715 //fprintf(stderr, "%s\n", orig_bwt_buf);
716 // continue;
717 return false;
718 }
719
720 return false;
721 }
722
723
724 int parseCigar(vector<CigarOp>& cigar, const char* cigar_str,
725 bool &spliced_alignment) {
726 const char* p_cig = cigar_str;
727 int refspan=0; //alignment span on reference sequence
728 while (*p_cig)
729 {
730 char* t;
731 int op_len = (int)strtol(p_cig, &t, 10);
732 if (op_len <= 0)
733 {
734 fprintf (stderr, "Error: CIGAR op has zero length\n");
735 return 0;
736 }
737 char op_char = toupper(*t);
738 CigarOpCode opcode;
739 switch (op_char) {
740 case '=':
741 case 'X':
742 case 'M': opcode = MATCH;
743 refspan+=op_len;
744 break;
745 case 'I': opcode = INS;
746 break;
747 case 'D': opcode = DEL;
748 refspan+=op_len;
749 break;
750 case 'N': if (op_len > max_report_intron_length)
751 return 0;
752 opcode = REF_SKIP;
753 spliced_alignment = true;
754 refspan+=op_len;
755 break;
756 case 'S': opcode = SOFT_CLIP;
757 break;
758 case 'H': opcode = HARD_CLIP;
759 break;
760 case 'P': opcode = PAD;
761 break;
762 default: fprintf (stderr, "Error: invalid CIGAR operation\n");
763 return 0;
764 }
765 p_cig = t + 1;
766 cigar.push_back(CigarOp(opcode, op_len));
767 } //while cigar codes
768 if (*p_cig) {
769 fprintf (stderr, "Error: unmatched CIGAR operation (%s)\n",p_cig);
770 return 0;
771 }
772 return refspan;
773 }
774
775 int getSAMmismatches(char* &buf, vector<CigarOp>& cigar,
776 vector<bool>& mismatches, int& sam_nm, bool& antisense_splice) {
777 int gspan=0;//genomic span of the alignment
778 const char* tag_buf = buf;
779 sam_nm=0;
780 int num_mismatches=0;
781 while((tag_buf = get_token((char**)&buf,"\t")))
782 {
783 vector<string> tuple_fields;
784 tokenize(tag_buf,":", tuple_fields);
785 if (tuple_fields.size() == 3)
786 {
787 if (tuple_fields[0] == "XS")
788 {
789 if (tuple_fields[2] == "-")
790 antisense_splice = true;
791 }
792 else if (tuple_fields[0] == "NM")
793 {
794 sam_nm = atoi(tuple_fields[2].c_str());
795 }
796 else if (tuple_fields[0] == "NS")
797 {
798 //ignored for now
799 }
800 else if (tuple_fields[0] == "MD")
801 {
802 const char* p=tuple_fields[2].c_str();
803 int bi=0; //base offset position in the read
804 while (*p != 0) {
805 if (isdigit(*p)) {
806 int v=atoi(p);
807 do { p++; } while (isdigit(*p));
808 bi+=v;
809 }
810 while (isalpha(*p)) {
811 p++;
812 num_mismatches++;
813 //mismatches.push_back(bi);
814 mismatches[bi]=true;
815 bi++;
816 }
817 if (*p=='^') { //reference deletion
818 p++;
819 while (isalpha(*p)) { //insert read bases
820 p++; bi++;
821 }
822 }
823 }
824 }
825 //else
826 //{
827 //fprintf(stderr, "%s attribute not supported\n", tuple_fields[0].c_str());
828 //return false;
829 //}
830 }
831 }
832 /* By convention,the NM field of the SAM record
833 * counts an insertion or deletion. I dont' think
834 * we want the mismatch count in the BowtieHit
835 * record to reflect this. Therefore, subtract out
836 * the mismatches due to in/dels
837 */
838 for(vector<CigarOp>::const_iterator itr = cigar.begin(); itr != cigar.end(); ++itr){
839 switch (itr->opcode)
840 {
841 case MATCH:
842 case REF_SKIP:
843 case PAD:
844 gspan += itr->length;
845 break;
846 case DEL:
847 gspan += itr->length;
848 sam_nm -= itr->length;
849 break;
850 case INS:
851 sam_nm -= itr->length;
852 break;
853 }
854 }
855 return num_mismatches;
856 }
857
858 bool SAMHitFactory::get_hit_from_buf(const char* orig_bwt_buf,
859 BowtieHit& bh,
860 bool strip_slash,
861 char* name_out,
862 char* name_tags,
863 char* seq,
864 char* qual)
865 {
866 if (!orig_bwt_buf || !*orig_bwt_buf)
867 return false;
868 char bwt_buf[2048];
869 strcpy(bwt_buf, orig_bwt_buf);
870 // Are we still in the header region?
871 if (bwt_buf[0] == '@')
872 return false;
873
874 char* buf = bwt_buf;
875 char* name = get_token((char**)&buf,"\t");
876 char* sam_flag_str = get_token((char**)&buf,"\t");
877 char* text_name = get_token((char**)&buf,"\t");
878 char* text_offset_str = get_token((char**)&buf,"\t");
879 const char* map_qual_str = get_token((char**)&buf,"\t");
880 char* cigar_str = get_token((char**)&buf,"\t");
881 const char* mate_ref_str = get_token((char**)&buf,"\t");
882 const char* mate_pos_str = get_token((char**)&buf,"\t");
883 const char* inferred_insert_sz_str = get_token((char**)&buf,"\t");
884
885 const char* seq_str = get_token((char**)&buf,"\t");
886 if (seq)
887 strcpy(seq, seq_str);
888
889 const char* qual_str = get_token((char**)&buf,"\t");
890 if (qual)
891 strcpy(qual, qual_str);
892
893 if (!name ||
894 !sam_flag_str ||
895 !text_name ||
896 !text_offset_str ||
897 !map_qual_str ||
898 !cigar_str ||
899 !mate_ref_str ||
900 !mate_pos_str ||
901 !inferred_insert_sz_str ||
902 !seq_str ||
903 !qual_str)
904 {
905 // truncated or malformed SAM record
906 return false;
907 }
908
909
910 int sam_flag = atoi(sam_flag_str);
911 int text_offset = atoi(text_offset_str);
912
913 bool end = true;
914 unsigned int seg_offset = 0;
915 unsigned int seg_num = 0;
916 unsigned int num_segs = 0;
917
918 // Copy the tag out of the name field before we might wipe it out
919 parseSegReadName(name, name_tags, strip_slash, end, seg_offset, seg_num, num_segs);
920
921 vector<CigarOp> cigar;
922 bool spliced_alignment = false;
923 int refspan=parseCigar(cigar, cigar_str, spliced_alignment);
924 if (refspan==0)
925 return false;
926 //vector<string> attributes;
927 //tokenize(tag_buf, " \t",attributes);
928
929 bool antisense_splice = false;
930 int sam_nm = 0; //the value of the NM tag (edit distance)
931 //int mismatches[1024];//array with mismatch positions on the read (0-based from the left aligned end of the read)
932 vector<bool> mismatches;
933 mismatches.resize(strlen(seq_str), false);
934 int num_mismatches=getSAMmismatches(buf, cigar, mismatches, sam_nm, antisense_splice);
935
936 if (spliced_alignment)
937 {
938 bh = create_hit(name,
939 text_name,
940 text_offset - 1,
941 cigar,
942 sam_flag & 0x0010,
943 antisense_splice,
944 num_mismatches,
945 0,
946 end);
947 }
948 else
949 {
950 //assert(cigar.size() == 1 && cigar[0].opcode == MATCH);
951 bh = create_hit(name,
952 text_name,
953 text_offset - 1, // SAM files are 1-indexed
954 cigar,
955 sam_flag & 0x0010,
956 false,
957 num_mismatches,
958 0,
959 end);
960 }
961 return true;
962 }
963
964 void cigar_add(vector<CigarOp>& cigar, CigarOp& op) {
965 if (op.length<=0) return;
966 if (cigar.size()>0 && cigar.back().opcode==op.opcode) {
967 cigar.back().length+=op.length;
968 }
969 cigar.push_back(op);
970 }
971
972 int spliceCigar(vector<CigarOp>& splcigar, vector<CigarOp>& cigar, vector<bool> mismatches,
973 int &left, int spl_start, int spl_len, CigarOpCode spl_code) {
974 //merge the original 'cigar' with the new insert/gap operation
975 //at position spl_start and place the result into splcigar;
976 //TODO: ideally this should also get and rebuild the MD string (alignment mismatches)
977 int spl_mismatches=0;
978 //return value: mismatches in the insert region for INS case,
979 //or number of mismatches in the anchor region
980 //return -1 if somehow the hit seems bad
981
982 //these offsets are relative to the beginning of alignment on reference
983 int spl_ofs=spl_start-left; //relative position of splice op
984 int spl_ofs_end=spl_ofs; //relative position of first ref base AFTER splice op
985 CigarOp gapop(spl_code, spl_len); //for DEL or REF_SKIP
986 if (spl_code==INS)
987 spl_ofs_end += spl_len;
988 int ref_ofs=0; //working offset on reference
989 int read_ofs=0; //working offset on the read, relative to the leftmost aligned base
990 bool xfound=false;
991 //if (left<=spl_start+spl_len) {
992 if (spl_ofs_end>0) {
993 int prev_opcode=0;
994 int prev_oplen=0;
995 for (size_t c = 0 ; c < cigar.size(); ++c)
996 {
997 int prev_read_ofs=read_ofs;
998 int cur_op_ofs=ref_ofs;
999 int cur_opcode=cigar[c].opcode;
1000 int cur_oplen=cigar[c].length;
1001 switch (cur_opcode) {
1002 case MATCH:
1003 ref_ofs+=cur_oplen;
1004 read_ofs+=cur_oplen;
1005 if (spl_code==REF_SKIP)
1006 for (int o=cur_op_ofs;o<ref_ofs;o++) {
1007 int rofs=prev_read_ofs+(cur_op_ofs-o);
1008 if (abs(spl_ofs-o)<min_anchor_len && mismatches[rofs])
1009 spl_mismatches++;
1010 }
1011 break;
1012 case DEL:
1013 case REF_SKIP:
1014 case PAD:
1015 ref_ofs+=cur_oplen;
1016 break;
1017 case SOFT_CLIP:
1018 case INS:
1019 read_ofs+=cur_oplen;
1020 break;
1021 //case HARD_CLIP:
1022 }
1023
1024 if (cur_op_ofs>=spl_ofs_end || ref_ofs<=spl_ofs) {
1025 if (cur_op_ofs==spl_ofs_end && spl_code!=INS)
1026 {
1027 xfound=true;
1028 //we have to insert the gap here first
1029 cigar_add(splcigar, gapop);
1030 //also, check
1031 }
1032 cigar_add(splcigar, cigar[c]);
1033 }
1034 else //if (ref_ofs>spl_ofs) {
1035 { //op intersection
1036 xfound=true;
1037 if (spl_code==INS) {
1038 //we have to shorten cur_opcode
1039 // find the overlap between current range
1040 int ovl_start = (cur_op_ofs>spl_ofs) ? cur_op_ofs : spl_ofs;
1041 int ovl_end = (ref_ofs>spl_ofs_end) ? spl_ofs_end : ref_ofs;
1042 CigarOp op(cigar[c]);
1043 int len=op.length;
1044 len-=(ovl_end-ovl_start);
1045 if (len>0) {
1046 op.length=len;
1047 cigar_add(splcigar, op);
1048 }
1049 }
1050 else {//DEL or REF_SKIP
1051 //spl_ofs == spl_ofs_end
1052 //we have to split cur_opcode
1053 //look for mismatches within min_anchor_len distance from splice point
1054 CigarOp op(cigar[c]);
1055 op.length=spl_ofs-cur_op_ofs;
1056 cigar_add(splcigar, op);
1057 cigar_add(splcigar, gapop);
1058 op.length=ref_ofs-spl_ofs;
1059 cigar_add(splcigar,op);
1060 }
1061 } //op intersection
1062 prev_opcode=cur_opcode;
1063 prev_oplen=cur_oplen;
1064 } //for each cigar opcode
1065 } //intersection possible
1066
1067 //if (!xfound) {//no intersection found between splice event and alignment
1068 if (spl_ofs_end<=0) {
1069 //alignment starts after the splice event
1070 if (spl_code==INS) left-=spl_len;
1071 else left+=spl_len;
1072 }
1073 //else {
1074 //alignment ends before the splice event
1075 //nothing to do
1076 // }
1077 //return spl_mismatches;
1078 // }
1079
1080 return spl_mismatches;
1081 }
1082
1083 bool SplicedSAMHitFactory::get_hit_from_buf(const char* orig_bwt_buf,
1084 BowtieHit& bh,
1085 bool strip_slash,
1086 char* name_out,
1087 char* name_tags,
1088 char* seq,
1089 char* qual)
1090 {
1091 if (!orig_bwt_buf || !*orig_bwt_buf)
1092 return false;
1093
1094 char bwt_buf[2048];
1095 strcpy(bwt_buf, orig_bwt_buf);
1096 // Are we still in the header region?
1097 if (bwt_buf[0] == '@')
1098 return false;
1099
1100 char* buf = bwt_buf;
1101 char* name = get_token((char**)&buf,"\t");
1102 char* sam_flag_str = get_token((char**)&buf,"\t");
1103 char* text_name = get_token((char**)&buf,"\t");
1104 char* text_offset_str = get_token((char**)&buf,"\t");
1105 const char* map_qual_str = get_token((char**)&buf,"\t");
1106 char* cigar_str = get_token((char**)&buf,"\t");
1107 const char* mate_ref_str = get_token((char**)&buf,"\t");
1108 const char* mate_pos_str = get_token((char**)&buf,"\t");
1109 const char* inferred_insert_sz_str = get_token((char**)&buf,"\t");
1110 //int num_mismatches=0;
1111 //int mismatches[1024]; //list of 0-based mismatch positions in this read
1112 //parsed from SAM's MD:Z: tag
1113 const char* seq_str = get_token((char**)&buf,"\t");
1114 if (seq)
1115 strcpy(seq, seq_str);
1116
1117 const char* qual_str = get_token((char**)&buf,"\t");
1118 if (qual)
1119 strcpy(qual, qual_str);
1120
1121 if (!name ||
1122 !sam_flag_str ||
1123 !text_name ||
1124 !text_offset_str ||
1125 !map_qual_str ||
1126 !cigar_str ||
1127 !mate_ref_str ||
1128 !mate_pos_str ||
1129 !inferred_insert_sz_str ||
1130 !seq_str ||
1131 !qual_str)
1132 {
1133 // truncated or malformed SAM record
1134 return false;
1135 }
1136
1137
1138 int sam_flag = atoi(sam_flag_str);
1139 int text_offset = atoi(text_offset_str);
1140 text_offset--; //make it 0-based (SAM is 1-based, Bowtie is 0-based)
1141 bool end = true;
1142 unsigned int seg_offset = 0;
1143 unsigned int seg_num = 0;
1144 unsigned int num_segs = 0;
1145
1146 // Copy the tag out of the name field before we might wipe it out
1147 parseSegReadName(name, name_tags, strip_slash, end, seg_offset, seg_num, num_segs);
1148
1149 vector<CigarOp> samcigar;
1150 bool spliced_alignment = false;
1151 int refspan=parseCigar(samcigar, cigar_str, spliced_alignment);
1152 if (refspan==0)
1153 return false;
1154 bool antisense_splice = false;
1155 int sam_nm = 0;
1156 vector<bool> mismatches;
1157 mismatches.resize(strlen(seq_str), false);
1158
1159 int num_mismatches=getSAMmismatches(buf, samcigar, mismatches, sam_nm, antisense_splice);
1160
1161 //##############################################
1162
1163 // Add this alignment to the table of hits for this half of the
1164 // Bowtie map
1165
1166 // Parse the text_name field to recover the splice coords
1167 vector<string> toks;
1168 tokenize_strict(text_name, "|", toks);
1169
1170 int num_extra_toks = (int)toks.size() - 6;
1171
1172 if (num_extra_toks >= 0)
1173 {
1174 static const uint8_t left_window_edge_field = 1;
1175 static const uint8_t splice_field = 2;
1176 //static const uint8_t right_window_edge_field = 3;
1177 static const uint8_t junction_type_field = 4;
1178 static const uint8_t strand_field = 5;
1179
1180 string contig = toks[0];
1181 for (int t = 1; t <= num_extra_toks; ++t)
1182 {
1183 contig += "|";
1184 contig += toks[t];
1185 }
1186
1187 vector<string> splice_toks;
1188 tokenize(toks[num_extra_toks + splice_field], "-", splice_toks);
1189 if (splice_toks.size() != 2)
1190 {
1191 fprintf(stderr, "Warning: found malformed splice record, skipping:\n");
1192 //fprintf(stderr, "\t%s (token: %s)\n", text_name,
1193 // toks[num_extra_toks + splice_field].c_str());
1194 return false;
1195 }
1196
1197 string junction_strand = toks[num_extra_toks + strand_field];
1198 if(junction_strand != "rev" && junction_strand != "fwd"){
1199 fprintf(stderr, "Malformed insertion record\n");
1200 return false;
1201 }
1202
1203 //
1204 // check for an insertion hit
1205 //
1206 if(toks[num_extra_toks + junction_type_field] == "ins")
1207 {
1208 //int8_t spliced_read_len = strlen(seq_str);
1209 //TODO FIXME: use the CIGAR instead of seq length!
1210 // The 0-based position of the left edge of the alignment. Note that this
1211 // value may need to be further corrected to account for the presence of
1212 // of the insertion.
1213 int left = atoi(toks[num_extra_toks + left_window_edge_field].c_str()) + text_offset;
1214
1215 // The 0-based position of the last genomic sequence before the insertion
1216 int left_splice_pos = atoi(splice_toks[0].c_str());
1217
1218 string insertedSequence = splice_toks[1];
1219 // The 0-based position of the first genomic sequence after the insertion
1220
1221 vector<CigarOp> splcigar;
1222
1223 //TODO: implement this
1224 //this also updates left to the adjusted genomic coordinates
1225 int spl_num_mismatches=spliceCigar(splcigar, samcigar, mismatches,
1226 left, left_splice_pos+1, insertedSequence.length(), INS);
1227 if (spl_num_mismatches<0) return false;
1228 num_mismatches-=spl_num_mismatches;
1229 /*
1230 uint32_t right_splice_pos = left_splice_pos + 1;
1231
1232 //uint32_t right = left + spliced_read_len - 1;
1233 int right = left + refspan - 1;
1234
1235 if(left > left_splice_pos){
1236 //The genomic position of the left edge of the alignment needs to be corrected
1237 //If the alignment does not extend into the insertion, simply subtract the length
1238 //of the inserted sequence, otherwise, just set it equal to the right edge
1239 left = left - insertedSequence.length();
1240 if(left < right_splice_pos){
1241 left = right_splice_pos;
1242 }
1243 }
1244 if(right > left_splice_pos){
1245 right = right - insertedSequence.length();
1246 if(right < left_splice_pos){
1247 right = left_splice_pos;
1248 }
1249 }
1250 // Now, right and left should be properly transformed into genomic coordinates
1251 // We should be able to deduce how much the alignment matches the insertion
1252 // simply based on the length of the read
1253 int left_match_length = 0;
1254 if(left <= left_splice_pos){
1255 left_match_length = left_splice_pos - left + 1;
1256 }
1257 int right_match_length = 0;
1258 if(right >= right_splice_pos){
1259 right_match_length = right - right_splice_pos + 1;
1260 }
1261 int insertion_match_length = spliced_read_len - left_match_length - right_match_length;
1262
1263 if(left_match_length <= 0 || right_match_length <= 0 || insertion_match_length <= 0)
1264 return false;
1265
1266 //char* pch = strtok( mismatches, ",");
1267 //unsigned char num_mismatches = 0;
1268 //text_offset holds the left end of the alignment,
1269 //RELATIVE TO the start of the contig
1270
1271 //The 0-based relative position of the left-most character
1272 //before the insertion in the contig
1273 int relative_splice_pos = left_splice_pos - atoi(toks[num_extra_toks + left_window_edge_field].c_str());
1274 for (size_t i=0;i<mismatches.size();++i) {
1275 int mismatch_pos = mismatches[i];
1276 // for reversely mapped reads,
1277 //find the correct mismatched position.
1278 if (sam_flag & 0x0010){
1279 mismatch_pos = spliced_read_len - mismatch_pos - 1;
1280 }
1281
1282 //Only count mismatches outside of the insertion region
1283 // If there is a mismatch within the insertion,
1284 // disallow this hit
1285 if(mismatch_pos + text_offset <= relative_splice_pos ||
1286 mismatch_pos + text_offset > relative_splice_pos + insertedSequence.length()){
1287 spl_num_mismatches++;
1288 }else{
1289 return false;
1290 }
1291 }
1292 */
1293 //vector<CigarOp> splcigar;
1294 //spliceCigar(splcigar, samcigar, left_match_length, insertion_match_length, right_match_length, INS);
1295 //splcigar.push_back(CigarOp(MATCH, left_match_length));
1296 //splcigar.push_back(CigarOp(INS, insertion_match_length));
1297 //splcigar.push_back(CigarOp(MATCH, right_match_length));
1298
1299 bh = create_hit(name,
1300 contig,
1301 left,
1302 //splcigar,
1303 splcigar,
1304 sam_flag & 0x0010,
1305 junction_strand == "rev",
1306 num_mismatches,
1307 0,
1308 end);
1309 return true;
1310 } //"ins"
1311 else //"del" or intron
1312 {
1313 // The 0-based position of the left edge of the alignment.
1314 int left = atoi(toks[num_extra_toks + left_window_edge_field].c_str()) + text_offset;
1315
1316 // The 0-based position of the last genomic sequence before the deletion
1317 int left_splice_pos = atoi(splice_toks[0].c_str());
1318
1319 int gap_len = atoi(splice_toks[1].c_str()) - left_splice_pos - 1;
1320 /*
1321 if ((sam_flag & 0x0010) == 0) //######
1322 {
1323 if (left_splice_ofs + seg_offset < _anchor_length)
1324 return false;
1325 }
1326 else
1327 {
1328 if (right_splice_ofs + seg_offset < _anchor_length)
1329 return false;
1330 }
1331 */
1332 //uint32_t right = atoi(splice_toks[1].c_str()) + right_splice_pos;
1333 //atoi(toks[right_window_edge_field].c_str());
1334
1335 /*
1336 //offset of deletion point, relative to the beginning of the alignment
1337 int left_splice_ofs = left_splice_pos-left+1;
1338
1339 int mismatches_in_anchor = 0;
1340 for (size_t i=0;i<mismatches.size();++i) {
1341 spl_num_mismatches++;
1342 int mismatch_pos = mismatches[i];
1343 if (((sam_flag & 0x0010) == 0 && abs(mismatch_pos - left_splice_ofs) < (int)min_anchor_len) ||
1344 ((sam_flag & 0x0010) != 0 &&
1345 abs(((int)refspan - left_splice_ofs + 1) - mismatch_pos)) < (int)min_anchor_len)
1346 mismatches_in_anchor++;
1347 }
1348 */
1349 vector<CigarOp> splcigar;
1350 CigarOpCode opcode=(toks[num_extra_toks + junction_type_field] == "del")? DEL : REF_SKIP ;
1351 //TODO: FIXME: use relative offsets for this call instead, like for the INS case
1352 int spl_num_mismatches=spliceCigar(splcigar, samcigar, mismatches, left,
1353 left_splice_pos+1, gap_len, opcode);
1354 if (spl_num_mismatches<0) // || spl_num_mismatches>max_anchor_mismatches)
1355 return false;
1356 /*
1357 splcigar.push_back(CigarOp(MATCH, left_splice_pos));
1358 if(toks[num_extra_toks + junction_type_field] == "del"){
1359 splcigar.push_back(CigarOp(DEL, gap_len));
1360 }else{
1361 splcigar.push_back(CigarOp(REF_SKIP, gap_len));
1362 }
1363 splcigar.push_back(CigarOp(MATCH, right_splice_pos));
1364 */
1365 bh = create_hit(name,
1366 contig,
1367 left,
1368 splcigar,
1369 (sam_flag & 0x0010),
1370 junction_strand == "rev",
1371 num_mismatches,
1372 spl_num_mismatches,
1373 end);
1374 return true;
1375 }
1376 } //parse splice data
1377 else
1378 {
1379 fprintf(stderr, "Warning: found malformed splice record, skipping\n");
1380 //fprintf(stderr, "%s\n", orig_bwt_buf);
1381 // continue;
1382 return false;
1383 }
1384
1385 return false;
1386 }
1387
1388
1389
1390 bool BAMHitFactory::get_hit_from_buf(const char* orig_bwt_buf,
1391 BowtieHit& bh, bool strip_slash,
1392 char* name_out, char* name_tags,
1393 char* seq, char* qual) {
1394 if (_sam_header==NULL)
1395 err_die("Error: no SAM header when BAMHitFactory::get_hit_from_buf()!");
1396 const bam1_t* hit_buf = (const bam1_t*)orig_bwt_buf;
1397
1398
1399 uint32_t sam_flag = hit_buf->core.flag;
1400
1401 int text_offset = hit_buf->core.pos;
1402 int text_mate_pos = hit_buf->core.mpos;
1403 int target_id = hit_buf->core.tid;
1404 int mate_target_id = hit_buf->core.mtid;
1405
1406 vector<CigarOp> cigar;
1407 bool spliced_alignment = false;
1408 int num_hits = 1;
1409
1410 double mapQ = hit_buf->core.qual;
1411 long double error_prob;
1412 if (mapQ > 0)
1413 {
1414 long double p = (-1.0 * mapQ) / 10.0;
1415 error_prob = pow(10.0L, p);
1416 }
1417 else
1418 {
1419 error_prob = 1.0;
1420 }
1421
1422 //header->target_name[c->tid]
1423
1424 bool end = true;
1425 unsigned int seg_offset = 0;
1426 unsigned int seg_num = 0;
1427 unsigned int num_segs = 0;
1428 // Copy the tag out of the name field before we might wipe it out
1429 char* qname = bam1_qname(hit_buf);
1430 char* pipe = strrchr(qname, '|');
1431 if (pipe)
1432 {
1433 if (name_tags)
1434 strcpy(name_tags, pipe);
1435
1436 char* tag_buf = pipe + 1;
1437 if (strchr(tag_buf, ':'))
1438 {
1439 sscanf(tag_buf, "%u:%u:%u", &seg_offset, &seg_num, &num_segs);
1440 if (seg_num + 1 == num_segs)
1441 end = true;
1442 else
1443 end = false;
1444 }
1445
1446 *pipe = 0;
1447 }
1448
1449
1450 if (target_id < 0) {
1451 //assert(cigar.size() == 1 && cigar[0].opcode == MATCH);
1452 bh = create_hit(qname,
1453 "*", //ref_name
1454 0, //left coord
1455 0, //read_len
1456 false, //antisense_aln
1457 0, //edit_dist
1458 end);
1459 return true;
1460 }
1461
1462 //string text_name = ((samfile_t*)(hs._hit_file))->header->target_name[target_id];
1463 string text_name = _sam_header->target_name[target_id];
1464 for (int i = 0; i < hit_buf->core.n_cigar; ++i)
1465 {
1466 //char* t;
1467
1468 int length = bam1_cigar(hit_buf)[i] >> BAM_CIGAR_SHIFT;
1469 if (length <= 0)
1470 {
1471 fprintf (stderr, "BAM error: CIGAR op has zero length\n");
1472 return false;
1473 }
1474
1475 CigarOpCode opcode;
1476 switch(bam1_cigar(hit_buf)[i] & BAM_CIGAR_MASK)
1477 {
1478 case BAM_CMATCH: opcode = MATCH; break;
1479 case BAM_CINS: opcode = INS; break;
1480 case BAM_CDEL: opcode = DEL; break;
1481 case BAM_CSOFT_CLIP: opcode = SOFT_CLIP; break;
1482 case BAM_CHARD_CLIP: opcode = HARD_CLIP; break;
1483 case BAM_CPAD: opcode = PAD; break;
1484 case BAM_CREF_SKIP:
1485 opcode = REF_SKIP;
1486 spliced_alignment = true;
1487 if (length > (int)max_report_intron_length)
1488 {
1489 //fprintf(stderr, "Encounter REF_SKIP > max_gene_length, skipping\n");
1490 return false;
1491 }
1492 break;
1493 default:
1494 fprintf (stderr, "BAM read: invalid CIGAR operation\n");
1495 return false;
1496 }
1497 if (opcode != HARD_CLIP)
1498 cigar.push_back(CigarOp(opcode, length));
1499 }
1500
1501 string mrnm;
1502 if (mate_target_id >= 0) {
1503 if (mate_target_id == target_id) {
1504 //mrnm = ((samfile_t*)(hs._hit_file))->header->target_name[mate_target_id];
1505 mrnm = _sam_header->target_name[mate_target_id];
1506 }
1507 else {
1508 //fprintf(stderr, "Trans-spliced mates are not currently supported, skipping\n");
1509 return false;
1510 }
1511 }
1512 else {
1513 text_mate_pos = 0;
1514 }
1515 //CuffStrand source_strand = CUFF_STRAND_UNKNOWN;
1516 bool antisense_splice=false;
1517 unsigned char num_mismatches = 0;
1518 unsigned char num_splice_anchor_mismatches = 0;
1519
1520 uint8_t* ptr = bam_aux_get(hit_buf, "XS");
1521 if (ptr) {
1522 char src_strand_char = bam_aux2A(ptr);
1523 if (src_strand_char == '-')
1524 antisense_splice = true;
1525 // else if (src_strand_char == '+')
1526 // source_strand = CUFF_FWD;
1527 }
1528
1529 ptr = bam_aux_get(hit_buf, "NM");
1530 if (ptr) {
1531 num_mismatches = bam_aux2i(ptr);
1532 }
1533
1534 ptr = bam_aux_get(hit_buf, "NH");
1535 if (ptr) {
1536 num_hits = bam_aux2i(ptr);
1537 }
1538
1539 //bool antisense_aln = bam1_strand(hit_buf);
1540
1541 //if (_rg_props.strandedness() == STRANDED_PROTOCOL && source_strand == CUFF_STRAND_UNKNOWN)
1542 // source_strand = use_stranded_protocol(sam_flag, antisense_aln, _rg_props.mate_strand_mapping());
1543 if (spliced_alignment) {
1544 //if (source_strand == CUFF_STRAND_UNKNOWN) {
1545 // fprintf(stderr, "BAM record error: found spliced alignment without XS attribute\n");
1546 // }
1547 bh = create_hit(qname,
1548 text_name,
1549 text_offset, // BAM files are 0-indexed
1550 cigar,
1551 sam_flag & 0x0010,
1552 antisense_splice,
1553 num_mismatches,
1554 num_splice_anchor_mismatches,
1555 end);
1556
1557 }
1558 else {
1559 //assert(_rg_props.strandedness() == STRANDED_PROTOCOL || source_strand == CUFF_STRAND_UNKNOWN);
1560 //assert(cigar.size() == 1 && cigar[0].opcode == MATCH);
1561 bh = create_hit(qname,
1562 text_name,
1563 text_offset, // BAM files are 0-indexed
1564 cigar,
1565 sam_flag & 0x0010,
1566 false,
1567 num_mismatches,
1568 0,
1569 end);
1570 }
1571 if (seq!=NULL) {
1572 char *bseq = (char*)bam1_seq(hit_buf);
1573 for(int i=0;i<(hit_buf->core.l_qseq);i++) {
1574 char v = bam1_seqi(bseq,i);
1575 seq[i]=bam_nt16_rev_table[v];
1576 }
1577 seq[hit_buf->core.l_qseq]=0;
1578 }
1579 if (qual!=NULL) {
1580 char *bq = (char*)bam1_qual(hit_buf);
1581 for(int i=0;i<(hit_buf->core.l_qseq);i++) {
1582 qual[i]=bq[i]+33;
1583 }
1584 qual[hit_buf->core.l_qseq]=0;
1585 }
1586 return true;
1587 }
1588
1589 bool BAMHitFactory::inspect_header(HitStream& hs)
1590 {
1591 bam_header_t* header = ((samfile_t*)(hs._hit_file))->header;
1592
1593 if (header == NULL) {
1594 fprintf(stderr, "Warning: No BAM header\n");
1595 return false;
1596 }
1597 if (header->l_text == 0) {
1598 fprintf(stderr, "Warning: BAM header has 0 length or is corrupted. Try using 'samtools reheader'.\n");
1599 return false;
1600 }
1601 return true;
1602 }
1603
1604
1605 void get_mapped_reads(FILE* bwtf,
1606 HitTable& hits,
1607 HitFactory& hit_factory,
1608 bool strip_slash,
1609 bool verbose)
1610 {
1611
1612
1613 char bwt_buf[2048];
1614 uint32_t reads_extracted = 0;
1615
1616 while (fgets(bwt_buf, 2048, bwtf))
1617 {
1618 // Chomp the newline
1619 char* nl = strrchr(bwt_buf, '\n');
1620 if (nl) *nl = 0;
1621 if (*bwt_buf == 0)
1622 continue;
1623 // Get a new record from the tab-delimited Bowtie map
1624 BowtieHit bh;
1625 if (hit_factory.get_hit_from_buf(bwt_buf, bh, strip_slash))
1626 {
1627 // Only check uniqueness if these hits are spliced
1628 hits.add_hit(bh, true);
1629 }
1630 reads_extracted++;
1631 }
1632
1633 // This will sort the map by insert id.
1634 hits.finalize();
1635 fprintf(stderr, "Extracted %d alignments from Bowtie map\n", reads_extracted);
1636 }
1637
1638
1639 /*
1640 AlignStatus status(const BowtieHit* align)
1641 {
1642 if (!align)
1643 return UNALIGNED;
1644 if (align->contiguous())
1645 return CONTIGUOUS;
1646 return SPLICED;
1647 }
1648 */
1649
1650 void add_hits_to_coverage(const HitList& hits, vector<unsigned short>& DoC)
1651 {
1652 int max_hit_pos = -1;
1653 for (size_t i = 0; i < hits.size(); ++i)
1654 {
1655 max_hit_pos = max((int)hits[i].right(),max_hit_pos);
1656 }
1657
1658 if ((int)DoC.size() < max_hit_pos)
1659 DoC.resize(max_hit_pos);
1660
1661 for (size_t i = 0; i < hits.size(); ++i)
1662 {
1663 const BowtieHit& bh = hits[i];
1664
1665 // split up the coverage contibution for this reads
1666 size_t j = bh.left();
1667 const vector<CigarOp>& cigar = bh.cigar();
1668
1669 for (size_t c = 0 ; c < cigar.size(); ++c)
1670 {
1671 switch(cigar[c].opcode)
1672 {
1673 case MATCH:
1674 for (size_t m = 0; m < cigar[c].length; ++m)
1675 {
1676 if (DoC[j + m] < 0xFFFF)
1677 DoC[j + m]++;
1678 }
1679 //fall through this case to REF_SKIP is intentional
1680 case REF_SKIP:
1681 j += cigar[c].length;
1682 break;
1683 default:
1684 break;
1685 }
1686
1687 }
1688 }
1689 }
1690
1691 void add_hit_to_coverage(const BowtieHit& bh, vector<unsigned int>& DoC)
1692 {
1693 if ((int)DoC.size() < bh.right())
1694 DoC.resize(bh.right());
1695
1696 // split up the coverage contibution for this reads
1697 size_t j = bh.left();
1698 const vector<CigarOp>& cigar = bh.cigar();
1699
1700 for (size_t c = 0 ; c < cigar.size(); ++c)
1701 {
1702 switch(cigar[c].opcode)
1703 {
1704 case MATCH:
1705 for (size_t m = 0; m < cigar[c].length; ++m)
1706 {
1707 if (DoC[j + m] < VMAXINT32)
1708 DoC[j + m]++;
1709 }
1710 //fall through this case to REF_SKIP is intentional
1711 case REF_SKIP:
1712 j += cigar[c].length;
1713 break;
1714 default:
1715 break;
1716 }
1717
1718 }
1719 }
1720
1721 void print_hit(FILE* fout,
1722 const char* read_name,
1723 const BowtieHit& bh,
1724 const char* ref_name,
1725 const char* sequence,
1726 const char* qualities,
1727 bool from_bowtie)
1728 {
1729 string seq;
1730 string quals;
1731 if (sequence)
1732 {
1733 seq = sequence;
1734 quals = qualities;
1735 seq.resize(bh.read_len());
1736 quals.resize(bh.read_len());
1737 }
1738 else
1739 {
1740 seq = "*";
1741 }
1742
1743 if (qualities)
1744 {
1745 quals = qualities;
1746 quals.resize(bh.read_len());
1747 }
1748 else
1749 {
1750 quals = "*";
1751 }
1752
1753 uint32_t sam_flag = 0;
1754 if (bh.antisense_align())
1755 {
1756 sam_flag |= 0x0010; // BAM_FREVERSE
1757 if (sequence && !from_bowtie) // if it is from bowtie hit, it's already reversed.
1758 {
1759 reverse_complement(seq);
1760 reverse(quals.begin(), quals.end());
1761 }
1762 }
1763
1764 uint32_t sam_pos = bh.left() + 1;
1765 uint32_t map_quality = 255;
1766 char cigar[256];
1767 cigar[0] = 0;
1768 string mate_ref_name = "*";
1769 uint32_t mate_pos = 0;
1770 uint32_t insert_size = 0;
1771 //string qualities = "*";
1772
1773 const vector<CigarOp>& bh_cigar = bh.cigar();
1774
1775 /*
1776 * In addition to calculating the cigar string,
1777 * we need to figure out how many in/dels are in the
1778 * sequence, so that we can give the correct
1779 * value for the NM tag
1780 */
1781 int indel_distance = 0;
1782 for (size_t c = 0; c < bh_cigar.size(); ++c)
1783 {
1784 char ibuf[64];
1785 sprintf(ibuf, "%d", bh_cigar[c].length);
1786 switch(bh_cigar[c].opcode)
1787 {
1788 case MATCH:
1789 strcat(cigar, ibuf);
1790 strcat(cigar, "M");
1791 break;
1792 case INS:
1793 strcat(cigar, ibuf);
1794 strcat(cigar, "I");
1795 indel_distance += bh_cigar[c].length;
1796 break;
1797 case DEL:
1798 strcat(cigar, ibuf);
1799 strcat(cigar, "D");
1800 indel_distance += bh_cigar[c].length;
1801 break;
1802 case REF_SKIP:
1803 strcat(cigar, ibuf);
1804 strcat(cigar, "N");
1805 break;
1806 default:
1807 break;
1808 }
1809 }
1810
1811 //string q = string(bh.read_len(), '!');
1812 //string s = string(bh.read_len(), 'N');
1813
1814 fprintf(fout,
1815 "%s\t%d\t%s\t%d\t%d\t%s\t%s\t%d\t%d\t%s\t%s",
1816 read_name,
1817 sam_flag,
1818 ref_name,
1819 sam_pos,
1820 map_quality,
1821 cigar,
1822 mate_ref_name.c_str(),
1823 mate_pos,
1824 insert_size,
1825 seq.c_str(),
1826 quals.c_str());
1827
1828 if (!sam_readgroup_id.empty())
1829 fprintf(fout, "\tRG:Z:%s", sam_readgroup_id.c_str());
1830
1831 fprintf(fout, "\tNM:i:%d", bh.edit_dist() + indel_distance);
1832
1833 bool containsSplice = false;
1834 for(vector<CigarOp>::const_iterator itr = bh.cigar().begin(); itr != bh.cigar().end(); ++itr){
1835 if(itr->opcode == REF_SKIP){
1836 containsSplice = true;
1837 break;
1838 }
1839 }
1840
1841 if (containsSplice)
1842 fprintf(fout, "\tXS:A:%c", bh.antisense_splice() ? '-' : '+');
1843
1844 fprintf(fout, "\n");
1845 }
1846
1847 void print_bamhit(GBamWriter& wbam,
1848 const char* read_name,
1849 const BowtieHit& bh,
1850 const char* ref_name,
1851 const char* sequence,
1852 const char* qualities,
1853 bool from_bowtie)
1854 {
1855 string seq;
1856 string quals;
1857 if (sequence) {
1858 seq = sequence;
1859 quals = qualities;
1860 seq.resize(bh.read_len());
1861 quals.resize(bh.read_len());
1862 }
1863 else {
1864 seq = "*";
1865 }
1866 if (qualities) {
1867 quals = qualities;
1868 quals.resize(bh.read_len());
1869 }
1870 else
1871 {
1872 quals = "*";
1873 }
1874
1875 uint32_t sam_flag = 0;
1876 if (bh.antisense_align())
1877 {
1878 sam_flag |= 0x0010; // BAM_FREVERSE
1879 if (sequence && !from_bowtie) // if it is from bowtie hit, it's already reversed.
1880 {
1881 reverse_complement(seq);
1882 reverse(quals.begin(), quals.end());
1883 }
1884 }
1885
1886 uint32_t sam_pos = bh.left() + 1;
1887 uint32_t map_quality = 255;
1888 char cigar[256];
1889 cigar[0] = 0;
1890 string mate_ref_name = "*";
1891 uint32_t mate_pos = 0;
1892 uint32_t insert_size = 0;
1893 //string qualities = "*";
1894
1895 const vector<CigarOp>& bh_cigar = bh.cigar();
1896 /*
1897 * In addition to calculating the cigar string,
1898 * we need to figure out how many in/dels are in the
1899 * sequence, so that we can give the correct
1900 * value for the NM tag
1901 */
1902 int indel_distance = 0;
1903 for (size_t c = 0; c < bh_cigar.size(); ++c)
1904 {
1905 char ibuf[64];
1906 sprintf(ibuf, "%d", bh_cigar[c].length);
1907 switch(bh_cigar[c].opcode)
1908 {
1909 case MATCH:
1910 strcat(cigar, ibuf);
1911 strcat(cigar, "M");
1912 break;
1913 case INS:
1914 strcat(cigar, ibuf);
1915 strcat(cigar, "I");
1916 indel_distance += bh_cigar[c].length;
1917 break;
1918 case DEL:
1919 strcat(cigar, ibuf);
1920 strcat(cigar, "D");
1921 indel_distance += bh_cigar[c].length;
1922 break;
1923 case REF_SKIP:
1924 strcat(cigar, ibuf);
1925 strcat(cigar, "N");
1926 break;
1927 default:
1928 break;
1929 }
1930 }
1931
1932 //string q = string(bh.read_len(), '!');
1933 //string s = string(bh.read_len(), 'N');
1934 /*
1935 fprintf(fout,
1936 "%s\t%d\t%s\t%d\t%d\t%s\t%s\t%d\t%d\t%s\t%s",
1937 read_name,
1938 sam_flag,
1939 ref_name,
1940 sam_pos,
1941 map_quality,
1942 cigar,
1943 mate_ref_name.c_str(),
1944 mate_pos,
1945 insert_size,
1946 seq.c_str(),
1947 quals.c_str());
1948
1949 fprintf(fout, "\tNM:i:%d", bh.edit_dist() + indel_distance);
1950 */
1951 vector<string> auxdata;
1952
1953 if (!sam_readgroup_id.empty())
1954 {
1955 string nm("RG:Z:");
1956 nm += sam_readgroup_id;
1957 auxdata.push_back(nm);
1958 }
1959
1960 string nm("NM:i:");
1961 str_appendInt(nm, bh.edit_dist() + indel_distance);
1962 auxdata.push_back(nm);
1963 bool containsSplice = false;
1964 for(vector<CigarOp>::const_iterator itr = bh.cigar().begin(); itr != bh.cigar().end(); ++itr)
1965 if(itr->opcode == REF_SKIP){
1966 containsSplice = true;
1967 break;
1968 }
1969
1970 if (containsSplice) {
1971 //fprintf(fout, "\tXS:A:%c", bh.antisense_splice() ? '-' : '+');
1972 nm="XS:A:";
1973 nm+=(char)(bh.antisense_splice() ? '-' : '+');
1974 auxdata.push_back(nm);
1975 }
1976
1977 GBamRecord *brec = wbam.new_record(read_name, sam_flag, ref_name, sam_pos, map_quality,
1978 cigar, mate_ref_name.c_str(), mate_pos,
1979 insert_size, seq.c_str(), quals.c_str(), &auxdata);
1980
1981
1982
1983 wbam.write(brec);
1984 delete brec;
1985 }
1986
1987 /**
1988 * Print a vector of cigar operations to a file.
1989 * @param bh_cigar A vector of CigarOps
1990 * @return a string representation of the cigar string
1991 */
1992 std::string print_cigar(vector<CigarOp>& bh_cigar){
1993 char cigar[256];
1994 cigar[0] = 0;
1995 for (size_t c = 0; c < bh_cigar.size(); ++c)
1996 {
1997 char ibuf[64];
1998 sprintf(ibuf, "%d", bh_cigar[c].length);
1999 switch(bh_cigar[c].opcode)
2000 {
2001 case MATCH:
2002 strcat(cigar, ibuf);
2003 strcat(cigar, "M");
2004 break;
2005 case INS:
2006 strcat(cigar, ibuf);
2007 strcat(cigar, "I");
2008 break;
2009 case DEL:
2010 strcat(cigar, ibuf);
2011 strcat(cigar, "D");
2012 break;
2013 case REF_SKIP:
2014 strcat(cigar, ibuf);
2015 strcat(cigar, "N");
2016 break;
2017 default:
2018 break;
2019 }
2020 }
2021 string result(cigar);
2022 return result;
2023 }
2024
2025 bool BowtieHit::check_editdist_consistency(const RefSequenceTable& rt)
2026 {
2027 RefSequenceTable::Sequence* ref_str = rt.get_seq(_ref_id);
2028 if (!ref_str)
2029 return false;
2030
2031 const seqan::Dna5String ref_seq = seqan::infix(*ref_str, _left, right());
2032
2033 size_t pos_seq = 0;
2034 size_t pos_ref = 0;
2035 size_t mismatch = 0;
2036 for (size_t i = 0; i < _cigar.size(); ++i)
2037 {
2038 CigarOp cigar = _cigar[i];
2039 switch(cigar.opcode)
2040 {
2041 case MATCH:
2042 {
2043 for (size_t j = 0; j < cigar.length; ++j)
2044 {
2045 if (_seq[pos_seq++] != ref_seq[pos_ref++])
2046 ++mismatch;
2047 }
2048 }
2049 break;
2050 case INS:
2051 {
2052 pos_seq += cigar.length;
2053 }
2054 break;
2055
2056 case DEL:
2057 case REF_SKIP:
2058 {
2059 pos_ref += cigar.length;
2060 }
2061 break;
2062
2063 default:
2064 break;
2065 }
2066 }
2067
2068 return mismatch == _edit_dist;
2069 }