ViewVC Help
View File | Revision Log | Show Annotations | View Changeset | Root Listing
root/gclib/tophat_cpp/bwt_map.cpp
Revision: 140
Committed: Thu Dec 15 22:40:48 2011 UTC (8 years, 5 months ago) by gpertea
File size: 56917 byte(s)
Log Message:
Line File contents
1 /*
2 * bwt_map.cpp
3 * TopHat
4 *
5 * Created by Cole Trapnell on 11/17/08.
6 * Copyright 2008 Cole Trapnell. All rights reserved.
7 *
8 */
9
10 #ifdef HAVE_CONFIG_H
11 #include <config.h>
12 #endif
13
14 #include <cassert>
15 #include <cstdlib>
16 #include <iostream>
17 #include <set>
18 #include <vector>
19 #include <cmath>
20
21 #include "common.h"
22 #include "bwt_map.h"
23 #include "tokenize.h"
24 #include "reads.h"
25
26 using namespace std;
27
28 void HitTable::add_hit(const BowtieHit& bh, bool check_uniqueness)
29 {
30 uint32_t reference_id = bh.ref_id();
31
32 pair<RefHits::iterator, bool> ret =
33 _hits_for_ref.insert(make_pair(reference_id, HitList()));
34 HitList& hl = ret.first->second;
35
36 if (check_uniqueness)
37 {
38 // Check uniqueness, in case we are adding spliced hits from
39 // several spliced alignment sources (e.g. de novo hashing + Bowtie
40 // against a user-supplied index). We don't want to count the same
41 // alignment twice if it happened to be found by more than one method
42 HitList::const_iterator lb = lower_bound(hl.begin(), hl.end(), bh, hit_insert_id_lt);
43 HitList::const_iterator ub = upper_bound(hl.begin(), hl.end(), bh, hit_insert_id_lt);
44 for (; lb != ub && lb != hl.end(); ++lb)
45 {
46 if (*lb == bh)
47 {
48 //fprintf(stderr, "Chucking duplicate read %d by identity\n", bh.insert_id());
49 return;
50 }
51
52 if (lb->insert_id() == bh.insert_id() &&
53 lb->ref_id() == bh.ref_id() &&
54 lb->antisense_align() == bh.antisense_align())
55 {
56 // If we get here, we may be looking at the same alignment
57 // However, spanning_reads may report a shorter, trimmed alignment
58 // so not all fields will be equal. If they just disagree on the
59 // ends, and don't indicate a different junction coord, the
60 // alignments are the same.
61
62 if ((lb->left() <= bh.left() && lb->right() >= bh.right()) ||
63 (bh.left() <= lb->left() && bh.right() >= lb->right()))
64 {
65 vector<pair<int,int> > lb_gaps, bh_gaps;
66 lb->gaps(lb_gaps);
67 bh.gaps(bh_gaps);
68 if (lb_gaps == bh_gaps)
69 {
70 // One alignment is contained in the other, they agree on
71 // where the gaps, if any, are, and they share an id
72 // => this is a redundant aligment, so toss it
73 //fprintf(stderr, "Chucking duplicate read %d by gap agreement\n", bh.insert_id());
74 return;
75 }
76 }
77 }
78 }
79 }
80 _total_hits++;
81 hl.push_back(bh);
82 }
83
84 bool hit_insert_id_lt(const BowtieHit& h1, const BowtieHit& h2)
85 {
86 return h1.insert_id() < h2.insert_id();
87 }
88
89 void LineHitFactory::openStream(HitStream& hs)
90 {
91 if (hs._hit_file==NULL && !hs._hit_file_name.empty()) {
92 //open the file for HitStream here
93 hs._hit_file=fopen(hs._hit_file_name.c_str(),"r");
94 if (hs._hit_file==NULL)
95 err_die("Error opening HitStream file %s\n",hs._hit_file_name.c_str());
96 return;
97 }
98 if (hs._fzpipe!=NULL) {
99 hs._hit_file=hs._fzpipe->file;
100 }
101 }
102
103 void LineHitFactory::rewind(HitStream& hs)
104 {
105 if (hs._fzpipe!=NULL) {
106 hs._fzpipe->rewind();
107 hs._hit_file=hs._fzpipe->file;
108 }
109 else if (hs._hit_file) ::rewind((FILE*)(hs._hit_file));
110 }
111
112 bool LineHitFactory::next_record(HitStream& hs, const char*& buf, size_t& buf_size) {
113 FILE* f=(FILE *)(hs._hit_file);
114 bool new_rec = (fgets(_hit_buf, _hit_buf_max_sz - 1, f)!=NULL);
115 if (!new_rec || feof(f)) {
116 hs._eof=true;
117 return false;
118 }
119 ++_line_num;
120 char* nl = strrchr(_hit_buf, '\n');
121 if (nl) *nl = 0;
122 buf = _hit_buf;
123 buf_size = _hit_buf_max_sz - 1;
124 return true;
125 }
126
127 void LineHitFactory::closeStream(HitStream& hs) {
128 if (hs._fzpipe!=NULL) {
129 hs._fzpipe->close();
130 return;
131 }
132 if (hs._hit_file!=NULL) {
133 fclose((FILE*)(hs._hit_file));
134 hs._hit_file=NULL;
135 }
136 }
137 void BAMHitFactory::openStream(HitStream& hs) {
138 if (hs._hit_file==NULL) {
139 if (hs._hit_file_name.empty())
140 //err_die("Error: invalid HitStream set for BAMHitFactory(file name missing)\n");
141 return; //invalid stream, could be just a place holder
142 //open the file here if not already open
143 string fext=getFext(hs._hit_file_name);
144 if (fext=="sam")
145 hs._hit_file = samopen(hs._hit_file_name.c_str(), "r", 0);
146 else
147 hs._hit_file = samopen(hs._hit_file_name.c_str(), "rb", 0);
148
149 samfile_t* sam_file=(samfile_t*)(hs._hit_file);
150
151 if (sam_file == NULL)
152 err_die("Error opening SAM file %s\n", hs._hit_file_name.c_str());
153 if (sam_file->header == NULL)
154 err_die("Error: no SAM header found for file %s\n", hs._hit_file_name.c_str());
155 memset(&_next_hit, 0, sizeof(_next_hit));
156 //_beginning = bgzf_tell(sam_file->x.bam);
157 _sam_header=sam_file->header;
158 if (inspect_header(hs) == false)
159 err_die("Error: invalid SAM header for file %s\n",
160 hs._hit_file_name.c_str());
161 }
162 }
163
164 void BAMHitFactory::closeStream(HitStream& hs) {
165 if (hs._hit_file) {
166 samclose((samfile_t*)(hs._hit_file));
167 }
168 hs._hit_file=NULL;
169 _sam_header=NULL;
170 }
171
172 void BAMHitFactory::rewind(HitStream& hs)
173 {
174 /*
175 if (_hit_file && ((samfile_t*)_hit_file)->x.bam)
176 {
177 bgzf_seek(((samfile_t*)_hit_file)->x.bam, _beginning, SEEK_SET);
178 _eof = false;
179 }
180 */
181 this->closeStream(hs);
182 this->openStream(hs);
183 }
184
185 string BAMHitFactory::hitfile_rec(HitStream& hs, const char* hit_buf) {
186 const bam1_t* bamrec=(const bam1_t*)hit_buf;
187 char* tamline=bam_format1(((samfile_t*)(hs._hit_file))->header, bamrec);
188 string sam_line(tamline);
189 free(tamline);
190 return sam_line;
191 }
192
193 bool BAMHitFactory::next_record(HitStream& hs, const char*& buf, size_t& buf_size) {
194 if (_next_hit.data) {
195 free(_next_hit.data);
196 _next_hit.data = NULL;
197 }
198 _sam_header=((samfile_t*)(hs._hit_file))->header; //needed by get_hit_from_buf later on
199 if (hs.eof() || !hs.ready()) return false;
200
201 //mark_curr_pos();
202
203 memset(&_next_hit, 0, sizeof(_next_hit));
204
205 int bytes_read = samread((samfile_t*)(hs._hit_file), &_next_hit);
206 if (bytes_read <= 0) {
207 hs._eof = true;
208 return false;
209 }
210 buf = (const char*)&_next_hit;
211 buf_size = bytes_read;
212 return true;
213 }
214
215
216 BowtieHit HitFactory::create_hit(const string& insert_name,
217 const string& ref_name,
218 int left,
219 const vector<CigarOp>& cigar,
220 bool antisense_aln,
221 bool antisense_splice,
222 unsigned char edit_dist,
223 unsigned char splice_mms,
224 bool end)
225 {
226 uint64_t insert_id = _insert_table.get_id(insert_name);
227 uint32_t reference_id = _ref_table.get_id(ref_name, NULL, 0);
228
229 return BowtieHit(reference_id,
230 insert_id,
231 left,
232 cigar,
233 antisense_aln,
234 antisense_splice,
235 edit_dist,
236 splice_mms,
237 end);
238 }
239
240 BowtieHit HitFactory::create_hit(const string& insert_name,
241 const string& ref_name,
242 uint32_t left,
243 uint32_t read_len,
244 bool antisense_aln,
245 unsigned char edit_dist,
246 bool end)
247 {
248 uint64_t insert_id = _insert_table.get_id(insert_name);
249 uint32_t reference_id = _ref_table.get_id(ref_name, NULL, 0);
250
251 return BowtieHit(reference_id,
252 insert_id,
253 left,
254 read_len,
255 antisense_aln,
256 edit_dist,
257 end);
258 }
259
260 bool BowtieHitFactory::get_hit_from_buf(const char* orig_bwt_buf,
261 BowtieHit& bh,
262 bool strip_slash,
263 char* name_out,
264 char* name_tags,
265 char* seq,
266 char* qual)
267 {
268 if (!orig_bwt_buf || !*orig_bwt_buf)
269 return false;
270
271 static const int buf_size = 2048;
272
273 char bwt_buf[buf_size];
274 strcpy(bwt_buf, orig_bwt_buf);
275 //const char* bwt_fmt_str = "%s %c %s %d %s %s %d %s";
276
277 char orientation;
278
279 //int bwtf_ret = 0;
280 //uint32_t seqid = 0;
281
282 char text_name[buf_size];
283 unsigned int text_offset;
284
285
286 //unsigned int other_occs;
287 char mismatches[buf_size];
288 //memset(mismatches, 0, sizeof(mismatches));
289
290 const char* buf = bwt_buf;
291 char* name = get_token((char**)&buf,"\t");
292 char* orientation_str = get_token((char**)&buf,"\t");
293 char* text_name_str = get_token((char**)&buf,"\t");
294
295 strcpy(text_name, text_name_str);
296
297 char* text_offset_str = get_token((char**)&buf,"\t");
298 char* seq_str = get_token((char**)&buf,"\t");
299 if (seq)
300 strcpy(seq, seq_str);
301
302 const char* qual_str = get_token((char**)&buf,"\t");
303 if (qual)
304 strcpy(qual, qual_str);
305
306 /*const char* other_occs_str =*/ get_token((char**)&buf, "\t");
307 mismatches[0] = 0;
308 char* mismatches_str = get_token((char**)&buf, "\t");
309 if (mismatches_str)
310 strcpy(mismatches, mismatches_str);
311
312 orientation = orientation_str[0];
313 text_offset = atoi(text_offset_str);
314
315 bool end = true;
316 unsigned int seg_offset = 0;
317 unsigned int seg_num = 0;
318 unsigned int num_segs = 0;
319
320 // Copy the tag out of the name field before we might wipe it out
321 char* pipe = strrchr(name, '|');
322 if (pipe)
323 {
324 if (name_tags)
325 strcpy(name_tags, pipe);
326
327 char* tag_buf = pipe + 1;
328 if (strchr(tag_buf, ':'))
329 {
330 sscanf(tag_buf, "%u:%u:%u", &seg_offset, &seg_num, &num_segs);
331 if (seg_num + 1 == num_segs)
332 end = true;
333 else
334 end = false;
335 }
336
337 *pipe = 0;
338 }
339 // Stripping the slash and number following it gives the insert name
340 char* slash = strrchr(name, '/');
341 if (strip_slash && slash)
342 *slash = 0;
343
344 size_t read_len = strlen(seq_str);
345
346 // Add this alignment to the table of hits for this half of the
347 // Bowtie map
348
349 //vector<string> mismatch_toks;
350 char* pch = strtok (mismatches,",");
351 unsigned char num_mismatches = 0;
352 while (pch != NULL)
353 {
354 char* colon = strchr(pch, ':');
355 if (colon)
356 {
357 num_mismatches++;
358 }
359 //mismatch_toks.push_back(pch);
360 pch = strtok (NULL, ",");
361 }
362
363 bh = create_hit(name,
364 text_name,
365 text_offset,
366 (int)read_len,
367 orientation == '-',
368 num_mismatches,
369 end);
370
371 return true;
372 }
373
374 int anchor_mismatch = 0;
375
376
377 void parseSegReadName(char* name, char*& name_tags, bool strip_slash,
378 bool &end, unsigned int &seg_offset, unsigned int& seg_num,
379 unsigned int & num_segs) {
380 char* pipe = strrchr(name, '|');
381 if (pipe)
382 {
383 if (name_tags)
384 strcpy(name_tags, pipe);
385
386 char* tag_buf = pipe + 1;
387 if (strchr(tag_buf, ':'))
388 {
389 sscanf(tag_buf, "%u:%u:%u", &seg_offset, &seg_num, &num_segs);
390 if (seg_num + 1 == num_segs)
391 end = true;
392 else
393 end = false;
394 }
395
396 *pipe = 0;
397 }
398
399 // Stripping the slash and number following it gives the insert name
400 char* slash = strrchr(name, '/');
401 if (strip_slash && slash)
402 *slash = 0;
403 }
404
405
406 bool SplicedBowtieHitFactory::get_hit_from_buf(const char* orig_bwt_buf,
407 BowtieHit& bh,
408 bool strip_slash,
409 char* name_out,
410 char* name_tags,
411 char* seq,
412 char* qual)
413 {
414 if (!orig_bwt_buf || !*orig_bwt_buf)
415 return false;
416
417 static const int buf_size = 2048;
418
419 char bwt_buf[buf_size];
420 strcpy(bwt_buf, orig_bwt_buf);
421 //const char* bwt_fmt_str = "%s %c %s %d %s %s %d %s";
422
423 char orientation;
424 char text_name[buf_size];
425 unsigned int text_offset;
426 char mismatches[buf_size];
427 //memset(mismatches, 0, sizeof(mismatches));
428
429 char* buf = bwt_buf;
430 char* name = get_token((char**)&buf,"\t");
431 char* orientation_str = get_token((char**)&buf,"\t");
432 char* text_name_str = get_token((char**)&buf,"\t");
433 strcpy(text_name, text_name_str);
434
435 char* text_offset_str = get_token((char**)&buf,"\t");
436 char* seq_str = get_token((char**)&buf,"\t");
437 if (seq)
438 strcpy(seq, seq_str);
439
440 const char* qual_str = get_token((char**)&buf,"\t");
441 if (qual)
442 strcpy(qual, qual_str);
443
444 /*const char* other_occs_str =*/ get_token((char**)&buf, "\t");
445 mismatches[0] = 0;
446 char* mismatches_str = get_token((char**)&buf, "\t");
447 if (mismatches_str)
448 strcpy(mismatches, mismatches_str);
449
450 orientation = orientation_str[0];
451 text_offset = atoi(text_offset_str);
452
453 bool end = true;
454 unsigned int seg_offset = 0;
455 unsigned int seg_num = 0;
456 unsigned int num_segs = 0;
457
458 // Copy the tag out of the name field before we might wipe it out
459 parseSegReadName(name, name_tags, strip_slash, end, seg_offset, seg_num, num_segs);
460
461 // Add this alignment to the table of hits for this half of the
462 // Bowtie map
463
464 // Parse the text_name field to recover the splice coords
465 vector<string> toks;
466
467 tokenize_strict(text_name, "|", toks);
468
469 int num_extra_toks = (int)toks.size() - 6;
470
471 if (num_extra_toks >= 0)
472 {
473 static const uint8_t left_window_edge_field = 1;
474 static const uint8_t splice_field = 2;
475 //static const uint8_t right_window_edge_field = 3;
476 static const uint8_t junction_type_field = 4;
477 static const uint8_t strand_field = 5;
478
479 string contig = toks[0];
480 for (int t = 1; t <= num_extra_toks; ++t)
481 {
482 contig += "|";
483 contig += toks[t];
484 }
485
486 vector<string> splice_toks;
487 tokenize(toks[num_extra_toks + splice_field], "-", splice_toks);
488 if (splice_toks.size() != 2)
489 {
490 fprintf(stderr, "Warning: found malformed splice record, skipping:\n");
491 //fprintf(stderr, "%s (token: %s)\n", text_name,
492 // toks[num_extra_toks + splice_field].c_str());
493 return false;
494 }
495
496 //
497 // check for an insertion hit
498 //
499 if(toks[num_extra_toks + junction_type_field] == "ins")
500 {
501 int8_t spliced_read_len = strlen(seq_str);
502 /*
503 * The 0-based position of the left edge of the alignment. Note that this
504 * value may need to be futher corrected to account for the presence of
505 * of the insertion.
506 */
507 uint32_t left = atoi(toks[num_extra_toks + left_window_edge_field].c_str()) + text_offset;
508 uint32_t right = left + spliced_read_len - 1;
509
510
511 /*
512 * The 0-based position of the last genomic sequence before the insertion
513 */
514 uint32_t left_splice_pos = atoi(splice_toks[0].c_str());
515
516 string insertedSequence = splice_toks[1];
517 /*
518 * The 0-based position of the first genomic sequence after teh insertion
519 */
520 uint32_t right_splice_pos = left_splice_pos + 1;
521 if(left > left_splice_pos){
522 /*
523 * The genomic position of the left edge of the alignment needs to be corrected
524 * If the alignment does not extend into the insertion, simply subtract the length
525 * of the inserted sequence, otherwise, just set it equal to the right edge
526 */
527 left = left - insertedSequence.length();
528 if(left < right_splice_pos){
529 left = right_splice_pos;
530 }
531 }
532 if(right > left_splice_pos){
533 right = right - insertedSequence.length();
534 if(right < left_splice_pos){
535 right = left_splice_pos;
536 }
537 }
538 /*
539 * Now, right and left should be properly transformed into genomic coordinates
540 * We should be able to deduce how much the alignment matches the insertion
541 * simply based on the length of the read
542 */
543 int left_match_length = 0;
544 if(left <= left_splice_pos){
545 left_match_length = left_splice_pos - left + 1;
546 }
547 int right_match_length = 0;
548 if(right >= right_splice_pos){
549 right_match_length = right - right_splice_pos + 1;
550 }
551 int insertion_match_length = spliced_read_len - left_match_length - right_match_length;
552
553 if(left_match_length <= 0 || right_match_length <= 0 || insertion_match_length <= 0)
554 return false;
555
556 string junction_strand = toks[num_extra_toks + strand_field];
557 if(junction_strand != "rev" && junction_strand != "fwd"){
558 fprintf(stderr, "Malformed insertion record\n");
559 return false;
560 }
561
562 char* pch = strtok( mismatches, ",");
563 unsigned char num_mismatches = 0;
564
565 /*
566 * remember that text_offset holds the left end of the
567 * alignment, relative to the start of the contig
568 */
569
570 /*
571 * The 0-based relative position of the left-most character
572 * before the insertion in the contig
573 */
574 int relative_splice_pos = left_splice_pos - atoi(toks[num_extra_toks + left_window_edge_field].c_str());
575 while (pch != NULL)
576 {
577 char* colon = strchr(pch, ':');
578 if (colon)
579 {
580 *colon = 0;
581 int mismatch_pos = atoi(pch);
582
583 /*
584 * for reversely mapped reads,
585 * find the correct mismatched position.
586 */
587 if(orientation == '-'){
588 mismatch_pos = spliced_read_len - mismatch_pos - 1;
589 }
590
591 /*
592 * Only count mismatches outside of the insertion region
593 * If there is a mismatch within the insertion,
594 * disallow this hit
595 */
596 if(mismatch_pos + text_offset <= relative_splice_pos || mismatch_pos + text_offset > relative_splice_pos + insertedSequence.length()){
597 num_mismatches++;
598 }else{
599 return false;
600 }
601 }
602 pch = strtok (NULL, ",");
603 }
604
605
606 vector<CigarOp> cigar;
607 cigar.push_back(CigarOp(MATCH, left_match_length));
608 cigar.push_back(CigarOp(INS, insertion_match_length));
609 cigar.push_back(CigarOp(MATCH, right_match_length));
610
611 /*
612 * For now, disallow hits that don't span
613 * the insertion. If we allow these types of hits,
614 * then long_spanning.cpp needs to be updated
615 * in order to intelligently merge these kinds
616 * of reads back together
617 *
618 * Following code has been changed to allow segment that end
619 * in an insertion
620 */
621 bh = create_hit(name,
622 contig,
623 left,
624 cigar,
625 orientation == '-',
626 junction_strand == "rev",
627 num_mismatches,
628 0,
629 end);
630 return true;
631 }
632
633 else
634 {
635 uint32_t left = atoi(toks[num_extra_toks + left_window_edge_field].c_str()) + text_offset;
636 int spliced_read_len = strlen(seq_str);
637 int8_t left_splice_pos = atoi(splice_toks[0].c_str()) - left + 1;
638 if(left_splice_pos > spliced_read_len) left_splice_pos = spliced_read_len;
639 int8_t right_splice_pos = spliced_read_len - left_splice_pos;
640
641 if (right_splice_pos <= 0 || left_splice_pos <= 0)
642 return false;
643
644 if (orientation == '+')
645 {
646 if (left_splice_pos + seg_offset < _anchor_length){
647 return false;
648 }
649 }
650 else
651 {
652 if (right_splice_pos + seg_offset < _anchor_length)
653 return false;
654 }
655 //uint32_t right = atoi(splice_toks[1].c_str()) + right_splice_pos;
656 //atoi(toks[right_window_edge_field].c_str());
657 int gap_len = atoi(splice_toks[1].c_str()) - atoi(splice_toks[0].c_str()) - 1;
658
659 string junction_strand = toks[num_extra_toks + strand_field];
660 if (!(junction_strand == "rev" || junction_strand == "fwd")||
661 !(orientation == '-' || orientation == '+'))
662 {
663 fprintf(stderr, "Warning: found malformed splice record, skipping\n");
664 //fprintf(stderr, "junction_strand=%s, orientation='%c'\n",
665 // junction_strand.c_str(), orientation);
666 return false;
667 }
668
669 //vector<string> mismatch_toks;
670 char* pch = strtok (mismatches,",");
671 int mismatches_in_anchor = 0;
672 unsigned char num_mismatches = 0;
673 while (pch != NULL)
674 {
675 char* colon = strchr(pch, ':');
676 if (colon)
677 {
678 *colon = 0;
679 num_mismatches++;
680 int mismatch_pos = atoi(pch);
681 if ((orientation == '+' && abs(mismatch_pos - left_splice_pos) < (int)min_anchor_len) ||
682 (orientation == '-' && abs(((int)spliced_read_len - left_splice_pos + 1) - mismatch_pos)) < (int)min_anchor_len)
683 mismatches_in_anchor++;
684 }
685 //mismatch_toks.push_back(pch);
686 pch = strtok (NULL, ",");
687 }
688
689 // FIXME: we probably should exclude these hits somewhere, but this
690 // isn't the right place
691 vector<CigarOp> cigar;
692 cigar.push_back(CigarOp(MATCH, left_splice_pos));
693 if(toks[num_extra_toks + junction_type_field] == "del"){
694 cigar.push_back(CigarOp(DEL, gap_len));
695 }else{
696 cigar.push_back(CigarOp(REF_SKIP, gap_len));
697 }
698 cigar.push_back(CigarOp(MATCH, right_splice_pos));
699
700 bh = create_hit(name,
701 contig,
702 left,
703 cigar,
704 orientation == '-',
705 junction_strand == "rev",
706 num_mismatches,
707 mismatches_in_anchor,
708 end);
709 return true;
710 }
711 }
712 else
713 {
714 fprintf(stderr, "Warning: found malformed splice record, skipping\n");
715 //fprintf(stderr, "%s\n", orig_bwt_buf);
716 // continue;
717 return false;
718 }
719
720 return false;
721 }
722
723
724 int parseCigar(vector<CigarOp>& cigar, const char* cigar_str,
725 bool &spliced_alignment) {
726 const char* p_cig = cigar_str;
727 int refspan=0; //alignment span on reference sequence
728 while (*p_cig)
729 {
730 char* t;
731 int op_len = (int)strtol(p_cig, &t, 10);
732 if (op_len <= 0)
733 {
734 fprintf (stderr, "Error: CIGAR op has zero length\n");
735 return 0;
736 }
737 char op_char = toupper(*t);
738 CigarOpCode opcode;
739 switch (op_char) {
740 case '=':
741 case 'X':
742 case 'M': opcode = MATCH;
743 refspan+=op_len;
744 break;
745 case 'I': opcode = INS;
746 break;
747 case 'D': opcode = DEL;
748 refspan+=op_len;
749 break;
750 case 'N': if (op_len > max_report_intron_length)
751 return 0;
752 opcode = REF_SKIP;
753 spliced_alignment = true;
754 refspan+=op_len;
755 break;
756 case 'S': opcode = SOFT_CLIP;
757 break;
758 case 'H': opcode = HARD_CLIP;
759 break;
760 case 'P': opcode = PAD;
761 break;
762 default: fprintf (stderr, "Error: invalid CIGAR operation\n");
763 return 0;
764 }
765 p_cig = t + 1;
766 cigar.push_back(CigarOp(opcode, op_len));
767 } //while cigar codes
768 if (*p_cig) {
769 fprintf (stderr, "Error: unmatched CIGAR operation (%s)\n",p_cig);
770 return 0;
771 }
772 return refspan;
773 }
774
775 int getSAMmismatches(char* &buf, vector<CigarOp>& cigar,
776 vector<bool>& mismatches, int& sam_nm, bool& antisense_splice) {
777 int gspan=0;//genomic span of the alignment
778 const char* tag_buf = buf;
779 sam_nm=0;
780 int num_mismatches=0;
781 while((tag_buf = get_token((char**)&buf,"\t")))
782 {
783 vector<string> tuple_fields;
784 tokenize(tag_buf,":", tuple_fields);
785 if (tuple_fields.size() == 3)
786 {
787 if (tuple_fields[0] == "XS")
788 {
789 if (tuple_fields[2] == "-")
790 antisense_splice = true;
791 }
792 else if (tuple_fields[0] == "NM")
793 {
794 sam_nm = atoi(tuple_fields[2].c_str());
795 }
796 else if (tuple_fields[0] == "NS")
797 {
798 //ignored for now
799 }
800 else if (tuple_fields[0] == "MD")
801 {
802 const char* p=tuple_fields[2].c_str();
803 int bi=0; //base offset position in the read
804 while (*p != 0) {
805 if (isdigit(*p)) {
806 int v=atoi(p);
807 do { p++; } while (isdigit(*p));
808 bi+=v;
809 }
810 while (isalpha(*p)) {
811 p++;
812 num_mismatches++;
813 //mismatches.push_back(bi);
814 mismatches[bi]=true;
815 bi++;
816 }
817 if (*p=='^') { //reference deletion
818 p++;
819 while (isalpha(*p)) { //insert read bases
820 p++; bi++;
821 }
822 }
823 }
824 }
825 //else
826 //{
827 //fprintf(stderr, "%s attribute not supported\n", tuple_fields[0].c_str());
828 //return false;
829 //}
830 }
831 }
832 /* By convention,the NM field of the SAM record
833 * counts an insertion or deletion. I dont' think
834 * we want the mismatch count in the BowtieHit
835 * record to reflect this. Therefore, subtract out
836 * the mismatches due to in/dels
837 */
838 for(vector<CigarOp>::const_iterator itr = cigar.begin(); itr != cigar.end(); ++itr){
839 switch (itr->opcode)
840 {
841 case MATCH:
842 case REF_SKIP:
843 case PAD:
844 gspan += itr->length;
845 break;
846 case DEL:
847 gspan += itr->length;
848 sam_nm -= itr->length;
849 break;
850 case INS:
851 sam_nm -= itr->length;
852 break;
853 }
854 }
855 return num_mismatches;
856 }
857
858 bool SAMHitFactory::get_hit_from_buf(const char* orig_bwt_buf,
859 BowtieHit& bh,
860 bool strip_slash,
861 char* name_out,
862 char* name_tags,
863 char* seq,
864 char* qual)
865 {
866 if (!orig_bwt_buf || !*orig_bwt_buf)
867 return false;
868 char bwt_buf[2048];
869 strcpy(bwt_buf, orig_bwt_buf);
870 // Are we still in the header region?
871 if (bwt_buf[0] == '@')
872 return false;
873
874 char* buf = bwt_buf;
875 char* name = get_token((char**)&buf,"\t");
876 char* sam_flag_str = get_token((char**)&buf,"\t");
877 char* text_name = get_token((char**)&buf,"\t");
878 char* text_offset_str = get_token((char**)&buf,"\t");
879 const char* map_qual_str = get_token((char**)&buf,"\t");
880 char* cigar_str = get_token((char**)&buf,"\t");
881 const char* mate_ref_str = get_token((char**)&buf,"\t");
882 const char* mate_pos_str = get_token((char**)&buf,"\t");
883 const char* inferred_insert_sz_str = get_token((char**)&buf,"\t");
884
885 const char* seq_str = get_token((char**)&buf,"\t");
886 if (seq)
887 strcpy(seq, seq_str);
888
889 const char* qual_str = get_token((char**)&buf,"\t");
890 if (qual)
891 strcpy(qual, qual_str);
892
893 if (!name ||
894 !sam_flag_str ||
895 !text_name ||
896 !text_offset_str ||
897 !map_qual_str ||
898 !cigar_str ||
899 !mate_ref_str ||
900 !mate_pos_str ||
901 !inferred_insert_sz_str ||
902 !seq_str ||
903 !qual_str)
904 {
905 // truncated or malformed SAM record
906 return false;
907 }
908
909
910 int sam_flag = atoi(sam_flag_str);
911 int text_offset = atoi(text_offset_str);
912
913 bool end = true;
914 unsigned int seg_offset = 0;
915 unsigned int seg_num = 0;
916 unsigned int num_segs = 0;
917
918 // Copy the tag out of the name field before we might wipe it out
919 parseSegReadName(name, name_tags, strip_slash, end, seg_offset, seg_num, num_segs);
920
921 vector<CigarOp> cigar;
922 bool spliced_alignment = false;
923 int refspan=parseCigar(cigar, cigar_str, spliced_alignment);
924 if (refspan==0)
925 return false;
926 //vector<string> attributes;
927 //tokenize(tag_buf, " \t",attributes);
928
929 bool antisense_splice = false;
930 int sam_nm = 0; //the value of the NM tag (edit distance)
931 //int mismatches[1024];//array with mismatch positions on the read (0-based from the left aligned end of the read)
932 vector<bool> mismatches;
933 mismatches.resize(strlen(seq_str), false);
934 int num_mismatches=getSAMmismatches(buf, cigar, mismatches, sam_nm, antisense_splice);
935
936 if (spliced_alignment)
937 {
938 bh = create_hit(name,
939 text_name,
940 text_offset - 1,
941 cigar,
942 sam_flag & 0x0010,
943 antisense_splice,
944 num_mismatches,
945 0,
946 end);
947 }
948 else
949 {
950 //assert(cigar.size() == 1 && cigar[0].opcode == MATCH);
951 bh = create_hit(name,
952 text_name,
953 text_offset - 1, // SAM files are 1-indexed
954 cigar,
955 sam_flag & 0x0010,
956 false,
957 num_mismatches,
958 0,
959 end);
960 }
961 return true;
962 }
963
964 void cigar_add(vector<CigarOp>& cigar, CigarOp& op) {
965 if (op.length<=0) return;
966 if (cigar.size()>0 && cigar.back().opcode==op.opcode) {
967 cigar.back().length+=op.length;
968 }
969 cigar.push_back(op);
970 }
971
972 int spliceCigar(vector<CigarOp>& splcigar, vector<CigarOp>& cigar, vector<bool> mismatches,
973 int &left, int spl_start, int spl_len, CigarOpCode spl_code) {
974 //merge the original 'cigar' with the new insert/gap operation
975 //at position spl_start and place the result into splcigar;
976 //TODO: ideally this should also get and rebuild the MD string (alignment mismatches)
977 int spl_mismatches=0;
978 //return value: mismatches in the insert region for INS case,
979 //or number of mismatches in the anchor region
980 //return -1 if somehow the hit seems bad
981
982 //these offsets are relative to the beginning of alignment on reference
983 int spl_ofs=spl_start-left; //relative position of splice op
984 int spl_ofs_end=spl_ofs; //relative position of first ref base AFTER splice op
985 CigarOp gapop(spl_code, spl_len); //for DEL or REF_SKIP
986 if (spl_code==INS)
987 spl_ofs_end += spl_len;
988 int ref_ofs=0; //working offset on reference
989 int read_ofs=0; //working offset on the read, relative to the leftmost aligned base
990 bool xfound=false;
991 //if (left<=spl_start+spl_len) {
992 if (spl_ofs_end>0) {
993 int prev_opcode=0;
994 int prev_oplen=0;
995 for (size_t c = 0 ; c < cigar.size(); ++c)
996 {
997 int prev_read_ofs=read_ofs;
998 int cur_op_ofs=ref_ofs;
999 int cur_opcode=cigar[c].opcode;
1000 int cur_oplen=cigar[c].length;
1001 switch (cur_opcode) {
1002 case MATCH:
1003 ref_ofs+=cur_oplen;
1004 read_ofs+=cur_oplen;
1005 if (spl_code==REF_SKIP)
1006 for (int o=cur_op_ofs;o<ref_ofs;o++) {
1007 int rofs=prev_read_ofs+(cur_op_ofs-o);
1008 if (abs(spl_ofs-o)<min_anchor_len && mismatches[rofs])
1009 spl_mismatches++;
1010 }
1011 break;
1012 case DEL:
1013 case REF_SKIP:
1014 case PAD:
1015 ref_ofs+=cur_oplen;
1016 break;
1017 case SOFT_CLIP:
1018 case INS:
1019 read_ofs+=cur_oplen;
1020 break;
1021 //case HARD_CLIP:
1022 }
1023
1024 if (cur_op_ofs>=spl_ofs_end || ref_ofs<=spl_ofs) {
1025 if (cur_op_ofs==spl_ofs_end && spl_code!=INS)
1026 {
1027 xfound=true;
1028 //we have to insert the gap here first
1029 cigar_add(splcigar, gapop);
1030 //also, check
1031 }
1032 cigar_add(splcigar, cigar[c]);
1033 }
1034 else //if (ref_ofs>spl_ofs) {
1035 { //op intersection
1036 xfound=true;
1037 if (spl_code==INS) {
1038 //we have to shorten cur_opcode
1039 // find the overlap between current range
1040 int ovl_start = (cur_op_ofs>spl_ofs) ? cur_op_ofs : spl_ofs;
1041 int ovl_end = (ref_ofs>spl_ofs_end) ? spl_ofs_end : ref_ofs;
1042 CigarOp op(cigar[c]);
1043 int len=op.length;
1044 len-=(ovl_end-ovl_start);
1045 if (len>0) {
1046 op.length=len;
1047 cigar_add(splcigar, op);
1048 }
1049 }
1050 else {//DEL or REF_SKIP
1051 //spl_ofs == spl_ofs_end
1052 //we have to split cur_opcode
1053 //look for mismatches within min_anchor_len distance from splice point
1054 CigarOp op(cigar[c]);
1055 op.length=spl_ofs-cur_op_ofs;
1056 cigar_add(splcigar, op);
1057 cigar_add(splcigar, gapop);
1058 op.length=ref_ofs-spl_ofs;
1059 cigar_add(splcigar,op);
1060 }
1061 } //op intersection
1062 prev_opcode=cur_opcode;
1063 prev_oplen=cur_oplen;
1064 } //for each cigar opcode
1065 } //intersection possible
1066
1067 //if (!xfound) {//no intersection found between splice event and alignment
1068 if (spl_ofs_end<=0) {
1069 //alignment starts after the splice event
1070 if (spl_code==INS) left-=spl_len;
1071 else left+=spl_len;
1072 }
1073 //else {
1074 //alignment ends before the splice event
1075 //nothing to do
1076 // }
1077 //return spl_mismatches;
1078 // }
1079
1080 return spl_mismatches;
1081 }
1082
1083 bool SplicedSAMHitFactory::get_hit_from_buf(const char* orig_bwt_buf,
1084 BowtieHit& bh,
1085 bool strip_slash,
1086 char* name_out,
1087 char* name_tags,
1088 char* seq,
1089 char* qual)
1090 {
1091 if (!orig_bwt_buf || !*orig_bwt_buf)
1092 return false;
1093
1094 char bwt_buf[2048];
1095 strcpy(bwt_buf, orig_bwt_buf);
1096 // Are we still in the header region?
1097 if (bwt_buf[0] == '@')
1098 return false;
1099
1100 char* buf = bwt_buf;
1101 char* name = get_token((char**)&buf,"\t");
1102 char* sam_flag_str = get_token((char**)&buf,"\t");
1103 char* text_name = get_token((char**)&buf,"\t");
1104 char* text_offset_str = get_token((char**)&buf,"\t");
1105 const char* map_qual_str = get_token((char**)&buf,"\t");
1106 char* cigar_str = get_token((char**)&buf,"\t");
1107 const char* mate_ref_str = get_token((char**)&buf,"\t");
1108 const char* mate_pos_str = get_token((char**)&buf,"\t");
1109 const char* inferred_insert_sz_str = get_token((char**)&buf,"\t");
1110 //int num_mismatches=0;
1111 //int mismatches[1024]; //list of 0-based mismatch positions in this read
1112 //parsed from SAM's MD:Z: tag
1113 const char* seq_str = get_token((char**)&buf,"\t");
1114 if (seq)
1115 strcpy(seq, seq_str);
1116
1117 const char* qual_str = get_token((char**)&buf,"\t");
1118 if (qual)
1119 strcpy(qual, qual_str);
1120
1121 if (!name ||
1122 !sam_flag_str ||
1123 !text_name ||
1124 !text_offset_str ||
1125 !map_qual_str ||
1126 !cigar_str ||
1127 !mate_ref_str ||
1128 !mate_pos_str ||
1129 !inferred_insert_sz_str ||
1130 !seq_str ||
1131 !qual_str)
1132 {
1133 // truncated or malformed SAM record
1134 return false;
1135 }
1136
1137
1138 int sam_flag = atoi(sam_flag_str);
1139 int text_offset = atoi(text_offset_str);
1140 text_offset--; //make it 0-based (SAM is 1-based, Bowtie is 0-based)
1141 bool end = true;
1142 unsigned int seg_offset = 0;
1143 unsigned int seg_num = 0;
1144 unsigned int num_segs = 0;
1145
1146 // Copy the tag out of the name field before we might wipe it out
1147 parseSegReadName(name, name_tags, strip_slash, end, seg_offset, seg_num, num_segs);
1148
1149 vector<CigarOp> samcigar;
1150 bool spliced_alignment = false;
1151 int refspan=parseCigar(samcigar, cigar_str, spliced_alignment);
1152 if (refspan==0)
1153 return false;
1154 bool antisense_splice = false;
1155 int sam_nm = 0;
1156 vector<bool> mismatches;
1157 mismatches.resize(strlen(seq_str), false);
1158
1159 int num_mismatches=getSAMmismatches(buf, samcigar, mismatches, sam_nm, antisense_splice);
1160
1161 //##############################################
1162
1163 // Add this alignment to the table of hits for this half of the
1164 // Bowtie map
1165
1166 // Parse the text_name field to recover the splice coords
1167 vector<string> toks;
1168 tokenize_strict(text_name, "|", toks);
1169
1170 int num_extra_toks = (int)toks.size() - 6;
1171
1172 if (num_extra_toks >= 0)
1173 {
1174 static const uint8_t left_window_edge_field = 1;
1175 static const uint8_t splice_field = 2;
1176 //static const uint8_t right_window_edge_field = 3;
1177 static const uint8_t junction_type_field = 4;
1178 static const uint8_t strand_field = 5;
1179
1180 string contig = toks[0];
1181 for (int t = 1; t <= num_extra_toks; ++t)
1182 {
1183 contig += "|";
1184 contig += toks[t];
1185 }
1186
1187 vector<string> splice_toks;
1188 tokenize(toks[num_extra_toks + splice_field], "-", splice_toks);
1189 if (splice_toks.size() != 2)
1190 {
1191 fprintf(stderr, "Warning: found malformed splice record, skipping:\n");
1192 //fprintf(stderr, "\t%s (token: %s)\n", text_name,
1193 // toks[num_extra_toks + splice_field].c_str());
1194 return false;
1195 }
1196
1197 string junction_strand = toks[num_extra_toks + strand_field];
1198 if(junction_strand != "rev" && junction_strand != "fwd"){
1199 fprintf(stderr, "Malformed insertion record\n");
1200 return false;
1201 }
1202
1203 //
1204 // check for an insertion hit
1205 //
1206 if(toks[num_extra_toks + junction_type_field] == "ins")
1207 {
1208 //int8_t spliced_read_len = strlen(seq_str);
1209 //TODO FIXME: use the CIGAR instead of seq length!
1210 // The 0-based position of the left edge of the alignment. Note that this
1211 // value may need to be further corrected to account for the presence of
1212 // of the insertion.
1213 int left = atoi(toks[num_extra_toks + left_window_edge_field].c_str()) + text_offset;
1214
1215 // The 0-based position of the last genomic sequence before the insertion
1216 int left_splice_pos = atoi(splice_toks[0].c_str());
1217
1218 string insertedSequence = splice_toks[1];
1219 // The 0-based position of the first genomic sequence after the insertion
1220
1221 vector<CigarOp> splcigar;
1222 //this also updates left to the adjusted genomic coordinates
1223 int spl_num_mismatches=spliceCigar(splcigar, samcigar, mismatches,
1224 left, left_splice_pos+1, insertedSequence.length(), INS);
1225 if (spl_num_mismatches<0) return false;
1226 num_mismatches-=spl_num_mismatches;
1227 /*
1228 uint32_t right_splice_pos = left_splice_pos + 1;
1229
1230 //uint32_t right = left + spliced_read_len - 1;
1231 int right = left + refspan - 1;
1232
1233 if(left > left_splice_pos){
1234 //The genomic position of the left edge of the alignment needs to be corrected
1235 //If the alignment does not extend into the insertion, simply subtract the length
1236 //of the inserted sequence, otherwise, just set it equal to the right edge
1237 left = left - insertedSequence.length();
1238 if(left < right_splice_pos){
1239 left = right_splice_pos;
1240 }
1241 }
1242 if(right > left_splice_pos){
1243 right = right - insertedSequence.length();
1244 if(right < left_splice_pos){
1245 right = left_splice_pos;
1246 }
1247 }
1248 // Now, right and left should be properly transformed into genomic coordinates
1249 // We should be able to deduce how much the alignment matches the insertion
1250 // simply based on the length of the read
1251 int left_match_length = 0;
1252 if(left <= left_splice_pos){
1253 left_match_length = left_splice_pos - left + 1;
1254 }
1255 int right_match_length = 0;
1256 if(right >= right_splice_pos){
1257 right_match_length = right - right_splice_pos + 1;
1258 }
1259 int insertion_match_length = spliced_read_len - left_match_length - right_match_length;
1260
1261 if(left_match_length <= 0 || right_match_length <= 0 || insertion_match_length <= 0)
1262 return false;
1263
1264 //char* pch = strtok( mismatches, ",");
1265 //unsigned char num_mismatches = 0;
1266 //text_offset holds the left end of the alignment,
1267 //RELATIVE TO the start of the contig
1268
1269 //The 0-based relative position of the left-most character
1270 //before the insertion in the contig
1271 int relative_splice_pos = left_splice_pos - atoi(toks[num_extra_toks + left_window_edge_field].c_str());
1272 for (size_t i=0;i<mismatches.size();++i) {
1273 int mismatch_pos = mismatches[i];
1274 // for reversely mapped reads,
1275 //find the correct mismatched position.
1276 if (sam_flag & 0x0010){
1277 mismatch_pos = spliced_read_len - mismatch_pos - 1;
1278 }
1279
1280 //Only count mismatches outside of the insertion region
1281 // If there is a mismatch within the insertion,
1282 // disallow this hit
1283 if(mismatch_pos + text_offset <= relative_splice_pos ||
1284 mismatch_pos + text_offset > relative_splice_pos + insertedSequence.length()){
1285 spl_num_mismatches++;
1286 }else{
1287 return false;
1288 }
1289 }
1290 */
1291 //vector<CigarOp> splcigar;
1292 //spliceCigar(splcigar, samcigar, left_match_length, insertion_match_length, right_match_length, INS);
1293 //splcigar.push_back(CigarOp(MATCH, left_match_length));
1294 //splcigar.push_back(CigarOp(INS, insertion_match_length));
1295 //splcigar.push_back(CigarOp(MATCH, right_match_length));
1296
1297 bh = create_hit(name,
1298 contig,
1299 left,
1300 //splcigar,
1301 splcigar,
1302 sam_flag & 0x0010,
1303 junction_strand == "rev",
1304 num_mismatches,
1305 0,
1306 end);
1307 return true;
1308 } //"ins"
1309 else //"del" or intron
1310 {
1311 // The 0-based position of the left edge of the alignment.
1312 int left = atoi(toks[num_extra_toks + left_window_edge_field].c_str()) + text_offset;
1313
1314 // The 0-based position of the last genomic sequence before the deletion
1315 int left_splice_pos = atoi(splice_toks[0].c_str());
1316
1317 int gap_len = atoi(splice_toks[1].c_str()) - left_splice_pos - 1;
1318 /*
1319 if ((sam_flag & 0x0010) == 0) //######
1320 {
1321 if (left_splice_ofs + seg_offset < _anchor_length)
1322 return false;
1323 }
1324 else
1325 {
1326 if (right_splice_ofs + seg_offset < _anchor_length)
1327 return false;
1328 }
1329 */
1330 //uint32_t right = atoi(splice_toks[1].c_str()) + right_splice_pos;
1331 //atoi(toks[right_window_edge_field].c_str());
1332
1333 /*
1334 //offset of deletion point, relative to the beginning of the alignment
1335 int left_splice_ofs = left_splice_pos-left+1;
1336
1337 int mismatches_in_anchor = 0;
1338 for (size_t i=0;i<mismatches.size();++i) {
1339 spl_num_mismatches++;
1340 int mismatch_pos = mismatches[i];
1341 if (((sam_flag & 0x0010) == 0 && abs(mismatch_pos - left_splice_ofs) < (int)min_anchor_len) ||
1342 ((sam_flag & 0x0010) != 0 &&
1343 abs(((int)refspan - left_splice_ofs + 1) - mismatch_pos)) < (int)min_anchor_len)
1344 mismatches_in_anchor++;
1345 }
1346 */
1347 vector<CigarOp> splcigar;
1348 CigarOpCode opcode=(toks[num_extra_toks + junction_type_field] == "del")? DEL : REF_SKIP ;
1349
1350 int spl_num_mismatches=spliceCigar(splcigar, samcigar, mismatches, left,
1351 left_splice_pos+1, gap_len, opcode);
1352 if (spl_num_mismatches<0) // || spl_num_mismatches>max_anchor_mismatches)
1353 return false;
1354 /*
1355 splcigar.push_back(CigarOp(MATCH, left_splice_pos));
1356 if(toks[num_extra_toks + junction_type_field] == "del"){
1357 splcigar.push_back(CigarOp(DEL, gap_len));
1358 }else{
1359 splcigar.push_back(CigarOp(REF_SKIP, gap_len));
1360 }
1361 splcigar.push_back(CigarOp(MATCH, right_splice_pos));
1362 */
1363 bh = create_hit(name,
1364 contig,
1365 left,
1366 splcigar,
1367 (sam_flag & 0x0010),
1368 junction_strand == "rev",
1369 num_mismatches,
1370 spl_num_mismatches,
1371 end);
1372 return true;
1373 }
1374 } //parse splice data
1375 else
1376 {
1377 fprintf(stderr, "Warning: found malformed splice record, skipping\n");
1378 //fprintf(stderr, "%s\n", orig_bwt_buf);
1379 // continue;
1380 return false;
1381 }
1382
1383 return false;
1384 }
1385
1386
1387
1388 bool BAMHitFactory::get_hit_from_buf(const char* orig_bwt_buf,
1389 BowtieHit& bh, bool strip_slash,
1390 char* name_out, char* name_tags,
1391 char* seq, char* qual) {
1392 if (_sam_header==NULL)
1393 err_die("Error: no SAM header when BAMHitFactory::get_hit_from_buf()!");
1394 const bam1_t* hit_buf = (const bam1_t*)orig_bwt_buf;
1395
1396
1397 uint32_t sam_flag = hit_buf->core.flag;
1398
1399 int text_offset = hit_buf->core.pos;
1400 int text_mate_pos = hit_buf->core.mpos;
1401 int target_id = hit_buf->core.tid;
1402 int mate_target_id = hit_buf->core.mtid;
1403
1404 vector<CigarOp> cigar;
1405 bool spliced_alignment = false;
1406 int num_hits = 1;
1407
1408 double mapQ = hit_buf->core.qual;
1409 long double error_prob;
1410 if (mapQ > 0)
1411 {
1412 long double p = (-1.0 * mapQ) / 10.0;
1413 error_prob = pow(10.0L, p);
1414 }
1415 else
1416 {
1417 error_prob = 1.0;
1418 }
1419
1420 //header->target_name[c->tid]
1421
1422 bool end = true;
1423 unsigned int seg_offset = 0;
1424 unsigned int seg_num = 0;
1425 unsigned int num_segs = 0;
1426 // Copy the tag out of the name field before we might wipe it out
1427 char* qname = bam1_qname(hit_buf);
1428 char* pipe = strrchr(qname, '|');
1429 if (pipe)
1430 {
1431 if (name_tags)
1432 strcpy(name_tags, pipe);
1433
1434 char* tag_buf = pipe + 1;
1435 if (strchr(tag_buf, ':'))
1436 {
1437 sscanf(tag_buf, "%u:%u:%u", &seg_offset, &seg_num, &num_segs);
1438 if (seg_num + 1 == num_segs)
1439 end = true;
1440 else
1441 end = false;
1442 }
1443
1444 *pipe = 0;
1445 }
1446
1447
1448 if (target_id < 0) {
1449 //assert(cigar.size() == 1 && cigar[0].opcode == MATCH);
1450 bh = create_hit(qname,
1451 "*", //ref_name
1452 0, //left coord
1453 0, //read_len
1454 false, //antisense_aln
1455 0, //edit_dist
1456 end);
1457 return true;
1458 }
1459
1460 //string text_name = ((samfile_t*)(hs._hit_file))->header->target_name[target_id];
1461 string text_name = _sam_header->target_name[target_id];
1462 for (int i = 0; i < hit_buf->core.n_cigar; ++i)
1463 {
1464 //char* t;
1465
1466 int length = bam1_cigar(hit_buf)[i] >> BAM_CIGAR_SHIFT;
1467 if (length <= 0)
1468 {
1469 fprintf (stderr, "BAM error: CIGAR op has zero length\n");
1470 return false;
1471 }
1472
1473 CigarOpCode opcode;
1474 switch(bam1_cigar(hit_buf)[i] & BAM_CIGAR_MASK)
1475 {
1476 case BAM_CMATCH: opcode = MATCH; break;
1477 case BAM_CINS: opcode = INS; break;
1478 case BAM_CDEL: opcode = DEL; break;
1479 case BAM_CSOFT_CLIP: opcode = SOFT_CLIP; break;
1480 case BAM_CHARD_CLIP: opcode = HARD_CLIP; break;
1481 case BAM_CPAD: opcode = PAD; break;
1482 case BAM_CREF_SKIP:
1483 opcode = REF_SKIP;
1484 spliced_alignment = true;
1485 if (length > (int)max_report_intron_length)
1486 {
1487 //fprintf(stderr, "Encounter REF_SKIP > max_gene_length, skipping\n");
1488 return false;
1489 }
1490 break;
1491 default:
1492 fprintf (stderr, "BAM read: invalid CIGAR operation\n");
1493 return false;
1494 }
1495 if (opcode != HARD_CLIP)
1496 cigar.push_back(CigarOp(opcode, length));
1497 }
1498
1499 string mrnm;
1500 if (mate_target_id >= 0) {
1501 if (mate_target_id == target_id) {
1502 //mrnm = ((samfile_t*)(hs._hit_file))->header->target_name[mate_target_id];
1503 mrnm = _sam_header->target_name[mate_target_id];
1504 }
1505 else {
1506 //fprintf(stderr, "Trans-spliced mates are not currently supported, skipping\n");
1507 return false;
1508 }
1509 }
1510 else {
1511 text_mate_pos = 0;
1512 }
1513 //CuffStrand source_strand = CUFF_STRAND_UNKNOWN;
1514 bool antisense_splice=false;
1515 unsigned char num_mismatches = 0;
1516 unsigned char num_splice_anchor_mismatches = 0;
1517
1518 uint8_t* ptr = bam_aux_get(hit_buf, "XS");
1519 if (ptr) {
1520 char src_strand_char = bam_aux2A(ptr);
1521 if (src_strand_char == '-')
1522 antisense_splice = true;
1523 // else if (src_strand_char == '+')
1524 // source_strand = CUFF_FWD;
1525 }
1526
1527 ptr = bam_aux_get(hit_buf, "NM");
1528 if (ptr) {
1529 num_mismatches = bam_aux2i(ptr);
1530 }
1531
1532 ptr = bam_aux_get(hit_buf, "NH");
1533 if (ptr) {
1534 num_hits = bam_aux2i(ptr);
1535 }
1536
1537 //bool antisense_aln = bam1_strand(hit_buf);
1538
1539 //if (_rg_props.strandedness() == STRANDED_PROTOCOL && source_strand == CUFF_STRAND_UNKNOWN)
1540 // source_strand = use_stranded_protocol(sam_flag, antisense_aln, _rg_props.mate_strand_mapping());
1541 if (spliced_alignment) {
1542 //if (source_strand == CUFF_STRAND_UNKNOWN) {
1543 // fprintf(stderr, "BAM record error: found spliced alignment without XS attribute\n");
1544 // }
1545 bh = create_hit(qname,
1546 text_name,
1547 text_offset, // BAM files are 0-indexed
1548 cigar,
1549 sam_flag & 0x0010,
1550 antisense_splice,
1551 num_mismatches,
1552 num_splice_anchor_mismatches,
1553 end);
1554
1555 }
1556 else {
1557 //assert(_rg_props.strandedness() == STRANDED_PROTOCOL || source_strand == CUFF_STRAND_UNKNOWN);
1558 //assert(cigar.size() == 1 && cigar[0].opcode == MATCH);
1559 bh = create_hit(qname,
1560 text_name,
1561 text_offset, // BAM files are 0-indexed
1562 cigar,
1563 sam_flag & 0x0010,
1564 false,
1565 num_mismatches,
1566 0,
1567 end);
1568 }
1569 if (seq!=NULL) {
1570 char *bseq = (char*)bam1_seq(hit_buf);
1571 for(int i=0;i<(hit_buf->core.l_qseq);i++) {
1572 char v = bam1_seqi(bseq,i);
1573 seq[i]=bam_nt16_rev_table[v];
1574 }
1575 seq[hit_buf->core.l_qseq]=0;
1576 }
1577 if (qual!=NULL) {
1578 char *bq = (char*)bam1_qual(hit_buf);
1579 for(int i=0;i<(hit_buf->core.l_qseq);i++) {
1580 qual[i]=bq[i]+33;
1581 }
1582 qual[hit_buf->core.l_qseq]=0;
1583 }
1584 return true;
1585 }
1586
1587 bool BAMHitFactory::inspect_header(HitStream& hs)
1588 {
1589 bam_header_t* header = ((samfile_t*)(hs._hit_file))->header;
1590
1591 if (header == NULL) {
1592 fprintf(stderr, "Warning: No BAM header\n");
1593 return false;
1594 }
1595 if (header->l_text == 0) {
1596 fprintf(stderr, "Warning: BAM header has 0 length or is corrupted. Try using 'samtools reheader'.\n");
1597 return false;
1598 }
1599 return true;
1600 }
1601
1602
1603 void get_mapped_reads(FILE* bwtf,
1604 HitTable& hits,
1605 HitFactory& hit_factory,
1606 bool strip_slash,
1607 bool verbose)
1608 {
1609
1610
1611 char bwt_buf[2048];
1612 uint32_t reads_extracted = 0;
1613
1614 while (fgets(bwt_buf, 2048, bwtf))
1615 {
1616 // Chomp the newline
1617 char* nl = strrchr(bwt_buf, '\n');
1618 if (nl) *nl = 0;
1619 if (*bwt_buf == 0)
1620 continue;
1621 // Get a new record from the tab-delimited Bowtie map
1622 BowtieHit bh;
1623 if (hit_factory.get_hit_from_buf(bwt_buf, bh, strip_slash))
1624 {
1625 // Only check uniqueness if these hits are spliced
1626 hits.add_hit(bh, true);
1627 }
1628 reads_extracted++;
1629 }
1630
1631 // This will sort the map by insert id.
1632 hits.finalize();
1633 fprintf(stderr, "Extracted %d alignments from Bowtie map\n", reads_extracted);
1634 }
1635
1636
1637 /*
1638 AlignStatus status(const BowtieHit* align)
1639 {
1640 if (!align)
1641 return UNALIGNED;
1642 if (align->contiguous())
1643 return CONTIGUOUS;
1644 return SPLICED;
1645 }
1646 */
1647
1648 void add_hits_to_coverage(const HitList& hits, vector<unsigned short>& DoC)
1649 {
1650 int max_hit_pos = -1;
1651 for (size_t i = 0; i < hits.size(); ++i)
1652 {
1653 max_hit_pos = max((int)hits[i].right(),max_hit_pos);
1654 }
1655
1656 if ((int)DoC.size() < max_hit_pos)
1657 DoC.resize(max_hit_pos);
1658
1659 for (size_t i = 0; i < hits.size(); ++i)
1660 {
1661 const BowtieHit& bh = hits[i];
1662
1663 // split up the coverage contibution for this reads
1664 size_t j = bh.left();
1665 const vector<CigarOp>& cigar = bh.cigar();
1666
1667 for (size_t c = 0 ; c < cigar.size(); ++c)
1668 {
1669 switch(cigar[c].opcode)
1670 {
1671 case MATCH:
1672 for (size_t m = 0; m < cigar[c].length; ++m)
1673 {
1674 if (DoC[j + m] < 0xFFFF)
1675 DoC[j + m]++;
1676 }
1677 //fall through this case to REF_SKIP is intentional
1678 case REF_SKIP:
1679 j += cigar[c].length;
1680 break;
1681 default:
1682 break;
1683 }
1684
1685 }
1686 }
1687 }
1688
1689 void add_hit_to_coverage(const BowtieHit& bh, vector<unsigned int>& DoC)
1690 {
1691 if ((int)DoC.size() < bh.right())
1692 DoC.resize(bh.right());
1693
1694 // split up the coverage contibution for this reads
1695 size_t j = bh.left();
1696 const vector<CigarOp>& cigar = bh.cigar();
1697
1698 for (size_t c = 0 ; c < cigar.size(); ++c)
1699 {
1700 switch(cigar[c].opcode)
1701 {
1702 case MATCH:
1703 for (size_t m = 0; m < cigar[c].length; ++m)
1704 {
1705 if (DoC[j + m] < VMAXINT32)
1706 DoC[j + m]++;
1707 }
1708 //fall through this case to REF_SKIP is intentional
1709 case REF_SKIP:
1710 j += cigar[c].length;
1711 break;
1712 default:
1713 break;
1714 }
1715
1716 }
1717 }
1718
1719 void print_hit(FILE* fout,
1720 const char* read_name,
1721 const BowtieHit& bh,
1722 const char* ref_name,
1723 const char* sequence,
1724 const char* qualities,
1725 bool from_bowtie)
1726 {
1727 string seq;
1728 string quals;
1729 if (sequence)
1730 {
1731 seq = sequence;
1732 quals = qualities;
1733 seq.resize(bh.read_len());
1734 quals.resize(bh.read_len());
1735 }
1736 else
1737 {
1738 seq = "*";
1739 }
1740
1741 if (qualities)
1742 {
1743 quals = qualities;
1744 quals.resize(bh.read_len());
1745 }
1746 else
1747 {
1748 quals = "*";
1749 }
1750
1751 uint32_t sam_flag = 0;
1752 if (bh.antisense_align())
1753 {
1754 sam_flag |= 0x0010; // BAM_FREVERSE
1755 if (sequence && !from_bowtie) // if it is from bowtie hit, it's already reversed.
1756 {
1757 reverse_complement(seq);
1758 reverse(quals.begin(), quals.end());
1759 }
1760 }
1761
1762 uint32_t sam_pos = bh.left() + 1;
1763 uint32_t map_quality = 255;
1764 char cigar[256];
1765 cigar[0] = 0;
1766 string mate_ref_name = "*";
1767 uint32_t mate_pos = 0;
1768 uint32_t insert_size = 0;
1769 //string qualities = "*";
1770
1771 const vector<CigarOp>& bh_cigar = bh.cigar();
1772
1773 /*
1774 * In addition to calculating the cigar string,
1775 * we need to figure out how many in/dels are in the
1776 * sequence, so that we can give the correct
1777 * value for the NM tag
1778 */
1779 int indel_distance = 0;
1780 for (size_t c = 0; c < bh_cigar.size(); ++c)
1781 {
1782 char ibuf[64];
1783 sprintf(ibuf, "%d", bh_cigar[c].length);
1784 switch(bh_cigar[c].opcode)
1785 {
1786 case MATCH:
1787 strcat(cigar, ibuf);
1788 strcat(cigar, "M");
1789 break;
1790 case INS:
1791 strcat(cigar, ibuf);
1792 strcat(cigar, "I");
1793 indel_distance += bh_cigar[c].length;
1794 break;
1795 case DEL:
1796 strcat(cigar, ibuf);
1797 strcat(cigar, "D");
1798 indel_distance += bh_cigar[c].length;
1799 break;
1800 case REF_SKIP:
1801 strcat(cigar, ibuf);
1802 strcat(cigar, "N");
1803 break;
1804 default:
1805 break;
1806 }
1807 }
1808
1809 //string q = string(bh.read_len(), '!');
1810 //string s = string(bh.read_len(), 'N');
1811
1812 fprintf(fout,
1813 "%s\t%d\t%s\t%d\t%d\t%s\t%s\t%d\t%d\t%s\t%s",
1814 read_name,
1815 sam_flag,
1816 ref_name,
1817 sam_pos,
1818 map_quality,
1819 cigar,
1820 mate_ref_name.c_str(),
1821 mate_pos,
1822 insert_size,
1823 seq.c_str(),
1824 quals.c_str());
1825
1826 if (!sam_readgroup_id.empty())
1827 fprintf(fout, "\tRG:Z:%s", sam_readgroup_id.c_str());
1828
1829 fprintf(fout, "\tNM:i:%d", bh.edit_dist() + indel_distance);
1830
1831 bool containsSplice = false;
1832 for(vector<CigarOp>::const_iterator itr = bh.cigar().begin(); itr != bh.cigar().end(); ++itr){
1833 if(itr->opcode == REF_SKIP){
1834 containsSplice = true;
1835 break;
1836 }
1837 }
1838
1839 if (containsSplice)
1840 fprintf(fout, "\tXS:A:%c", bh.antisense_splice() ? '-' : '+');
1841
1842 fprintf(fout, "\n");
1843 }
1844
1845 void print_bamhit(GBamWriter& wbam,
1846 const char* read_name,
1847 const BowtieHit& bh,
1848 const char* ref_name,
1849 const char* sequence,
1850 const char* qualities,
1851 bool from_bowtie)
1852 {
1853 string seq;
1854 string quals;
1855 if (sequence) {
1856 seq = sequence;
1857 quals = qualities;
1858 seq.resize(bh.read_len());
1859 quals.resize(bh.read_len());
1860 }
1861 else {
1862 seq = "*";
1863 }
1864 if (qualities) {
1865 quals = qualities;
1866 quals.resize(bh.read_len());
1867 }
1868 else
1869 {
1870 quals = "*";
1871 }
1872
1873 uint32_t sam_flag = 0;
1874 if (bh.antisense_align())
1875 {
1876 sam_flag |= 0x0010; // BAM_FREVERSE
1877 if (sequence && !from_bowtie) // if it is from bowtie hit, it's already reversed.
1878 {
1879 reverse_complement(seq);
1880 reverse(quals.begin(), quals.end());
1881 }
1882 }
1883
1884 uint32_t sam_pos = bh.left() + 1;
1885 uint32_t map_quality = 255;
1886 char cigar[256];
1887 cigar[0] = 0;
1888 string mate_ref_name = "*";
1889 uint32_t mate_pos = 0;
1890 uint32_t insert_size = 0;
1891 //string qualities = "*";
1892
1893 const vector<CigarOp>& bh_cigar = bh.cigar();
1894 /*
1895 * In addition to calculating the cigar string,
1896 * we need to figure out how many in/dels are in the
1897 * sequence, so that we can give the correct
1898 * value for the NM tag
1899 */
1900 int indel_distance = 0;
1901 for (size_t c = 0; c < bh_cigar.size(); ++c)
1902 {
1903 char ibuf[64];
1904 sprintf(ibuf, "%d", bh_cigar[c].length);
1905 switch(bh_cigar[c].opcode)
1906 {
1907 case MATCH:
1908 strcat(cigar, ibuf);
1909 strcat(cigar, "M");
1910 break;
1911 case INS:
1912 strcat(cigar, ibuf);
1913 strcat(cigar, "I");
1914 indel_distance += bh_cigar[c].length;
1915 break;
1916 case DEL:
1917 strcat(cigar, ibuf);
1918 strcat(cigar, "D");
1919 indel_distance += bh_cigar[c].length;
1920 break;
1921 case REF_SKIP:
1922 strcat(cigar, ibuf);
1923 strcat(cigar, "N");
1924 break;
1925 default:
1926 break;
1927 }
1928 }
1929
1930 //string q = string(bh.read_len(), '!');
1931 //string s = string(bh.read_len(), 'N');
1932 /*
1933 fprintf(fout,
1934 "%s\t%d\t%s\t%d\t%d\t%s\t%s\t%d\t%d\t%s\t%s",
1935 read_name,
1936 sam_flag,
1937 ref_name,
1938 sam_pos,
1939 map_quality,
1940 cigar,
1941 mate_ref_name.c_str(),
1942 mate_pos,
1943 insert_size,
1944 seq.c_str(),
1945 quals.c_str());
1946
1947 fprintf(fout, "\tNM:i:%d", bh.edit_dist() + indel_distance);
1948 */
1949 vector<string> auxdata;
1950
1951 if (!sam_readgroup_id.empty())
1952 {
1953 string nm("RG:Z:");
1954 nm += sam_readgroup_id;
1955 auxdata.push_back(nm);
1956 }
1957
1958 string nm("NM:i:");
1959 str_appendInt(nm, bh.edit_dist() + indel_distance);
1960 auxdata.push_back(nm);
1961 bool containsSplice = false;
1962 for(vector<CigarOp>::const_iterator itr = bh.cigar().begin(); itr != bh.cigar().end(); ++itr)
1963 if(itr->opcode == REF_SKIP){
1964 containsSplice = true;
1965 break;
1966 }
1967
1968 if (containsSplice) {
1969 //fprintf(fout, "\tXS:A:%c", bh.antisense_splice() ? '-' : '+');
1970 nm="XS:A:";
1971 nm+=(char)(bh.antisense_splice() ? '-' : '+');
1972 auxdata.push_back(nm);
1973 }
1974
1975 GBamRecord *brec = wbam.new_record(read_name, sam_flag, ref_name, sam_pos, map_quality,
1976 cigar, mate_ref_name.c_str(), mate_pos,
1977 insert_size, seq.c_str(), quals.c_str(), &auxdata);
1978
1979
1980
1981 wbam.write(brec);
1982 delete brec;
1983 }
1984
1985 /**
1986 * Print a vector of cigar operations to a file.
1987 * @param bh_cigar A vector of CigarOps
1988 * @return a string representation of the cigar string
1989 */
1990 std::string print_cigar(vector<CigarOp>& bh_cigar){
1991 char cigar[256];
1992 cigar[0] = 0;
1993 for (size_t c = 0; c < bh_cigar.size(); ++c)
1994 {
1995 char ibuf[64];
1996 sprintf(ibuf, "%d", bh_cigar[c].length);
1997 switch(bh_cigar[c].opcode)
1998 {
1999 case MATCH:
2000 strcat(cigar, ibuf);
2001 strcat(cigar, "M");
2002 break;
2003 case INS:
2004 strcat(cigar, ibuf);
2005 strcat(cigar, "I");
2006 break;
2007 case DEL:
2008 strcat(cigar, ibuf);
2009 strcat(cigar, "D");
2010 break;
2011 case REF_SKIP:
2012 strcat(cigar, ibuf);
2013 strcat(cigar, "N");
2014 break;
2015 default:
2016 break;
2017 }
2018 }
2019 string result(cigar);
2020 return result;
2021 }
2022
2023 bool BowtieHit::check_editdist_consistency(const RefSequenceTable& rt)
2024 {
2025 RefSequenceTable::Sequence* ref_str = rt.get_seq(_ref_id);
2026 if (!ref_str)
2027 return false;
2028
2029 const seqan::Dna5String ref_seq = seqan::infix(*ref_str, _left, right());
2030
2031 size_t pos_seq = 0;
2032 size_t pos_ref = 0;
2033 size_t mismatch = 0;
2034 for (size_t i = 0; i < _cigar.size(); ++i)
2035 {
2036 CigarOp cigar = _cigar[i];
2037 switch(cigar.opcode)
2038 {
2039 case MATCH:
2040 {
2041 for (size_t j = 0; j < cigar.length; ++j)
2042 {
2043 if (_seq[pos_seq++] != ref_seq[pos_ref++])
2044 ++mismatch;
2045 }
2046 }
2047 break;
2048 case INS:
2049 {
2050 pos_seq += cigar.length;
2051 }
2052 break;
2053
2054 case DEL:
2055 case REF_SKIP:
2056 {
2057 pos_ref += cigar.length;
2058 }
2059 break;
2060
2061 default:
2062 break;
2063 }
2064 }
2065
2066 return mismatch == _edit_dist;
2067 }