ViewVC Help
View File | Revision Log | Show Annotations | View Changeset | Root Listing
root/gclib/tophat_cpp/bwt_map.cpp
Revision: 159
Committed: Mon Jan 30 22:38:08 2012 UTC (7 years, 8 months ago) by gpertea
File size: 82253 byte(s)
Log Message:
wip

Line File contents
1 /*
2 * bwt_map.cpp
3 * TopHat
4 *
5 * Created by Cole Trapnell on 11/17/08.
6 * Copyright 2008 Cole Trapnell. All rights reserved.
7 *
8 */
9
10 #ifdef HAVE_CONFIG_H
11 #include <config.h>
12 #endif
13
14 #include <cassert>
15 #include <cstdlib>
16 #include <iostream>
17 #include <set>
18 #include <vector>
19 #include <cmath>
20
21 #include <seqan/sequence.h>
22 #include <seqan/find.h>
23 #include <seqan/file.h>
24 #include <seqan/modifier.h>
25
26 #include "common.h"
27 #include "bwt_map.h"
28 #include "tokenize.h"
29 #include "reads.h"
30
31 using namespace std;
32
33 void HitTable::add_hit(const BowtieHit& bh, bool check_uniqueness)
34 {
35 uint32_t reference_id = bh.ref_id();
36
37 pair<RefHits::iterator, bool> ret =
38 _hits_for_ref.insert(make_pair(reference_id, HitList()));
39 HitList& hl = ret.first->second;
40
41 if (check_uniqueness)
42 {
43 // Check uniqueness, in case we are adding spliced hits from
44 // several spliced alignment sources (e.g. de novo hashing + Bowtie
45 // against a user-supplied index). We don't want to count the same
46 // alignment twice if it happened to be found by more than one method
47 HitList::const_iterator lb = lower_bound(hl.begin(), hl.end(), bh, hit_insert_id_lt);
48 HitList::const_iterator ub = upper_bound(hl.begin(), hl.end(), bh, hit_insert_id_lt);
49 for (; lb != ub && lb != hl.end(); ++lb)
50 {
51 if (*lb == bh)
52 {
53 //fprintf(stderr, "Chucking duplicate read %d by identity\n", bh.insert_id());
54 return;
55 }
56
57 if (lb->insert_id() == bh.insert_id() &&
58 lb->ref_id() == bh.ref_id() &&
59 lb->antisense_align() == bh.antisense_align())
60 {
61 // If we get here, we may be looking at the same alignment
62 // However, spanning_reads may report a shorter, trimmed alignment
63 // so not all fields will be equal. If they just disagree on the
64 // ends, and don't indicate a different junction coord, the
65 // alignments are the same.
66
67 if ((lb->left() <= bh.left() && lb->right() >= bh.right()) ||
68 (bh.left() <= lb->left() && bh.right() >= lb->right()))
69 {
70 vector<pair<int,int> > lb_gaps, bh_gaps;
71 lb->gaps(lb_gaps);
72 bh.gaps(bh_gaps);
73 if (lb_gaps == bh_gaps)
74 {
75 // One alignment is contained in the other, they agree on
76 // where the gaps, if any, are, and they share an id
77 // => this is a redundant aligment, so toss it
78 //fprintf(stderr, "Chucking duplicate read %d by gap agreement\n", bh.insert_id());
79 return;
80 }
81 }
82 }
83 }
84 }
85 _total_hits++;
86 hl.push_back(bh);
87 }
88
89 bool hit_insert_id_lt(const BowtieHit& h1, const BowtieHit& h2)
90 {
91 return h1.insert_id() < h2.insert_id();
92 }
93
94 void LineHitFactory::openStream(HitStream& hs)
95 {
96 if (hs._hit_file==NULL && !hs._hit_file_name.empty()) {
97 //open the file for HitStream here
98 hs._hit_file=fopen(hs._hit_file_name.c_str(),"r");
99 if (hs._hit_file==NULL)
100 err_die("Error opening HitStream file %s\n",hs._hit_file_name.c_str());
101 return;
102 }
103 if (hs._fzpipe!=NULL) {
104 hs._hit_file=hs._fzpipe->file;
105 }
106 }
107
108 void LineHitFactory::rewind(HitStream& hs)
109 {
110 if (hs._fzpipe!=NULL) {
111 hs._fzpipe->rewind();
112 hs._hit_file=hs._fzpipe->file;
113 }
114 else if (hs._hit_file) ::rewind((FILE*)(hs._hit_file));
115 }
116
117 void LineHitFactory::seek(HitStream& hs, int64_t offset)
118 {
119 // daehwan - implement this later
120 if (hs._fzpipe != NULL) {
121 hs._fzpipe->seek(offset);
122 hs._hit_file=hs._fzpipe->file;
123 }
124 // else if (hs._hit_file) ::seek((FILE*)(hs._hit_file));
125 }
126
127 bool LineHitFactory::next_record(HitStream& hs, const char*& buf, size_t& buf_size) {
128 FILE* f=(FILE *)(hs._hit_file);
129 bool new_rec = (fgets(_hit_buf, _hit_buf_max_sz - 1, f)!=NULL);
130 if (!new_rec || feof(f)) {
131 hs._eof=true;
132 return false;
133 }
134 ++_line_num;
135 char* nl = strrchr(_hit_buf, '\n');
136 if (nl) *nl = 0;
137 buf = _hit_buf;
138 buf_size = _hit_buf_max_sz - 1;
139 return true;
140 }
141
142 void LineHitFactory::closeStream(HitStream& hs) {
143 if (hs._fzpipe!=NULL) {
144 hs._fzpipe->close();
145 return;
146 }
147 if (hs._hit_file!=NULL) {
148 fclose((FILE*)(hs._hit_file));
149 hs._hit_file=NULL;
150 }
151 }
152 void BAMHitFactory::openStream(HitStream& hs) {
153 if (hs._hit_file==NULL) {
154 if (hs._hit_file_name.empty())
155 //err_die("Error: invalid HitStream set for BAMHitFactory(file name missing)\n");
156 return; //invalid stream, could be just a place holder
157 //open the file here if not already open
158 string fext=getFext(hs._hit_file_name);
159 if (fext=="sam")
160 hs._hit_file = samopen(hs._hit_file_name.c_str(), "r", 0);
161 else
162 hs._hit_file = samopen(hs._hit_file_name.c_str(), "rb", 0);
163
164 samfile_t* sam_file=(samfile_t*)(hs._hit_file);
165
166 if (sam_file == NULL)
167 err_die("Error opening SAM file %s\n", hs._hit_file_name.c_str());
168 if (sam_file->header == NULL)
169 err_die("Error: no SAM header found for file %s\n", hs._hit_file_name.c_str());
170 memset(&_next_hit, 0, sizeof(_next_hit));
171 //_beginning = bgzf_tell(sam_file->x.bam);
172 _sam_header=sam_file->header;
173 if (inspect_header(hs) == false)
174 err_die("Error: invalid SAM header for file %s\n",
175 hs._hit_file_name.c_str());
176 }
177 }
178
179 void BAMHitFactory::closeStream(HitStream& hs) {
180 if (hs._hit_file) {
181 samclose((samfile_t*)(hs._hit_file));
182 }
183 hs._hit_file=NULL;
184 _sam_header=NULL;
185 }
186
187 void BAMHitFactory::rewind(HitStream& hs)
188 {
189 /*
190 if (_hit_file && ((samfile_t*)_hit_file)->x.bam)
191 {
192 bgzf_seek(((samfile_t*)_hit_file)->x.bam, _beginning, SEEK_SET);
193 _eof = false;
194 }
195 */
196 this->closeStream(hs);
197 this->openStream(hs);
198 }
199
200 void BAMHitFactory::seek(HitStream& hs, int64_t offset)
201 {
202 if (hs._hit_file) {
203 bgzf_seek(((samfile_t*)hs._hit_file)->x.bam, offset, SEEK_SET);
204 }
205 }
206
207 string BAMHitFactory::hitfile_rec(HitStream& hs, const char* hit_buf) {
208 const bam1_t* bamrec=(const bam1_t*)hit_buf;
209 char* tamline=bam_format1(((samfile_t*)(hs._hit_file))->header, bamrec);
210 string sam_line(tamline);
211 free(tamline);
212 return sam_line;
213 }
214
215 bool BAMHitFactory::next_record(HitStream& hs, const char*& buf, size_t& buf_size) {
216 if (_next_hit.data) {
217 free(_next_hit.data);
218 _next_hit.data = NULL;
219 }
220 _sam_header=((samfile_t*)(hs._hit_file))->header; //needed by get_hit_from_buf later on
221 if (hs.eof() || !hs.ready()) return false;
222
223 //mark_curr_pos();
224
225 memset(&_next_hit, 0, sizeof(_next_hit));
226
227 int bytes_read = samread((samfile_t*)(hs._hit_file), &_next_hit);
228 if (bytes_read <= 0) {
229 hs._eof = true;
230 return false;
231 }
232 buf = (const char*)&_next_hit;
233 buf_size = bytes_read;
234 return true;
235 }
236
237
238 BowtieHit HitFactory::create_hit(const string& insert_name,
239 const string& ref_name,
240 const string& ref_name2,
241 int left,
242 const vector<CigarOp>& cigar,
243 bool antisense_aln,
244 bool antisense_splice,
245 unsigned char edit_dist,
246 unsigned char splice_mms,
247 bool end)
248 {
249 uint64_t insert_id = _insert_table.get_id(insert_name);
250
251 uint32_t reference_id = _ref_table.get_id(ref_name, NULL, 0);
252 uint32_t reference_id2 = reference_id;
253
254 if (ref_name2.length() > 0)
255 reference_id2 = _ref_table.get_id(ref_name2, NULL, 0);
256
257 return BowtieHit(reference_id,
258 reference_id2,
259 insert_id,
260 left,
261 cigar,
262 antisense_aln,
263 antisense_splice,
264 edit_dist,
265 splice_mms,
266 end);
267 }
268
269 BowtieHit HitFactory::create_hit(const string& insert_name,
270 const string& ref_name,
271 uint32_t left,
272 uint32_t read_len,
273 bool antisense_aln,
274 unsigned char edit_dist,
275 bool end)
276 {
277 uint64_t insert_id = _insert_table.get_id(insert_name);
278 uint32_t reference_id = _ref_table.get_id(ref_name, NULL, 0);
279
280 return BowtieHit(reference_id,
281 reference_id,
282 insert_id,
283 left,
284 read_len,
285 antisense_aln,
286 edit_dist,
287 end);
288 }
289
290 bool BowtieHitFactory::get_hit_from_buf(const char* orig_bwt_buf,
291 BowtieHit& bh,
292 bool strip_slash,
293 char* name_out,
294 char* name_tags,
295 char* seq,
296 char* qual)
297 {
298 if (!orig_bwt_buf || !*orig_bwt_buf)
299 return false;
300
301 static const int buf_size = 2048;
302
303 char bwt_buf[buf_size];
304 strcpy(bwt_buf, orig_bwt_buf);
305 //const char* bwt_fmt_str = "%s %c %s %d %s %s %d %s";
306
307 char orientation;
308
309 //int bwtf_ret = 0;
310 //uint32_t seqid = 0;
311
312 char text_name[buf_size];
313 unsigned int text_offset;
314
315
316 //unsigned int other_occs;
317 char mismatches[buf_size];
318 //memset(mismatches, 0, sizeof(mismatches));
319
320 const char* buf = bwt_buf;
321 char* name = get_token((char**)&buf,"\t");
322 char* orientation_str = get_token((char**)&buf,"\t");
323 char* text_name_str = get_token((char**)&buf,"\t");
324
325 strcpy(text_name, text_name_str);
326
327 char* text_offset_str = get_token((char**)&buf,"\t");
328 char* seq_str = get_token((char**)&buf,"\t");
329 if (seq)
330 strcpy(seq, seq_str);
331
332 const char* qual_str = get_token((char**)&buf,"\t");
333 if (qual)
334 strcpy(qual, qual_str);
335
336 /*const char* other_occs_str =*/ get_token((char**)&buf, "\t");
337 mismatches[0] = 0;
338 char* mismatches_str = get_token((char**)&buf, "\t");
339 if (mismatches_str)
340 strcpy(mismatches, mismatches_str);
341
342 orientation = orientation_str[0];
343 text_offset = atoi(text_offset_str);
344
345 bool end = true;
346 unsigned int seg_offset = 0;
347 unsigned int seg_num = 0;
348 unsigned int num_segs = 0;
349
350 // Copy the tag out of the name field before we might wipe it out
351 char* pipe = strrchr(name, '|');
352 if (pipe)
353 {
354 if (name_tags)
355 strcpy(name_tags, pipe);
356
357 char* tag_buf = pipe + 1;
358 if (strchr(tag_buf, ':'))
359 {
360 sscanf(tag_buf, "%u:%u:%u", &seg_offset, &seg_num, &num_segs);
361 if (seg_num + 1 == num_segs)
362 end = true;
363 else
364 end = false;
365 }
366
367 *pipe = 0;
368 }
369 // Stripping the slash and number following it gives the insert name
370 char* slash = strrchr(name, '/');
371 if (strip_slash && slash)
372 *slash = 0;
373
374 size_t read_len = strlen(seq_str);
375
376 // Add this alignment to the table of hits for this half of the
377 // Bowtie map
378
379 //vector<string> mismatch_toks;
380 char* pch = strtok (mismatches,",");
381 unsigned char num_mismatches = 0;
382 while (pch != NULL)
383 {
384 char* colon = strchr(pch, ':');
385 if (colon)
386 {
387 num_mismatches++;
388 }
389 //mismatch_toks.push_back(pch);
390 pch = strtok (NULL, ",");
391 }
392
393 bh = create_hit(name,
394 text_name,
395 text_offset,
396 (int)read_len,
397 orientation == '-',
398 num_mismatches,
399 end);
400
401 return true;
402 }
403
404 int anchor_mismatch = 0;
405
406
407 void parseSegReadName(char* name, char*& name_tags, bool strip_slash,
408 bool &end, unsigned int &seg_offset, unsigned int& seg_num,
409 unsigned int & num_segs) {
410 char* pipe = strrchr(name, '|');
411 if (pipe)
412 {
413 if (name_tags)
414 strcpy(name_tags, pipe);
415
416 char* tag_buf = pipe + 1;
417 if (strchr(tag_buf, ':'))
418 {
419 sscanf(tag_buf, "%u:%u:%u", &seg_offset, &seg_num, &num_segs);
420 if (seg_num + 1 == num_segs)
421 end = true;
422 else
423 end = false;
424 }
425
426 *pipe = 0;
427 }
428
429 // Stripping the slash and number following it gives the insert name
430 char* slash = strrchr(name, '/');
431 if (strip_slash && slash)
432 *slash = 0;
433 }
434
435
436 bool SplicedBowtieHitFactory::get_hit_from_buf(const char* orig_bwt_buf,
437 BowtieHit& bh,
438 bool strip_slash,
439 char* name_out,
440 char* name_tags,
441 char* seq,
442 char* qual)
443 {
444 if (!orig_bwt_buf || !*orig_bwt_buf)
445 return false;
446
447 static const int buf_size = 2048;
448
449 char bwt_buf[buf_size];
450 strcpy(bwt_buf, orig_bwt_buf);
451 //const char* bwt_fmt_str = "%s %c %s %d %s %s %d %s";
452
453 char orientation;
454 char text_name[buf_size];
455 unsigned int text_offset;
456 char mismatches[buf_size];
457 //memset(mismatches, 0, sizeof(mismatches));
458
459 char* buf = bwt_buf;
460 char* name = get_token((char**)&buf,"\t");
461 char* orientation_str = get_token((char**)&buf,"\t");
462 char* text_name_str = get_token((char**)&buf,"\t");
463 strcpy(text_name, text_name_str);
464
465 char* text_offset_str = get_token((char**)&buf,"\t");
466 char* seq_str = get_token((char**)&buf,"\t");
467 if (seq)
468 strcpy(seq, seq_str);
469
470 const char* qual_str = get_token((char**)&buf,"\t");
471 if (qual)
472 strcpy(qual, qual_str);
473
474 /*const char* other_occs_str =*/ get_token((char**)&buf, "\t");
475 mismatches[0] = 0;
476 char* mismatches_str = get_token((char**)&buf, "\t");
477 if (mismatches_str)
478 strcpy(mismatches, mismatches_str);
479
480 orientation = orientation_str[0];
481 text_offset = atoi(text_offset_str);
482
483 bool end = true;
484 unsigned int seg_offset = 0;
485 unsigned int seg_num = 0;
486 unsigned int num_segs = 0;
487
488 // Copy the tag out of the name field before we might wipe it out
489 parseSegReadName(name, name_tags, strip_slash, end, seg_offset, seg_num, num_segs);
490
491 // Add this alignment to the table of hits for this half of the
492 // Bowtie map
493
494 // Parse the text_name field to recover the splice coords
495 vector<string> toks;
496
497 tokenize_strict(text_name, "|", toks);
498
499 int num_extra_toks = (int)toks.size() - 6;
500
501 if (num_extra_toks >= 0)
502 {
503 static const uint8_t left_window_edge_field = 1;
504 static const uint8_t splice_field = 2;
505 //static const uint8_t right_window_edge_field = 3;
506 static const uint8_t junction_type_field = 4;
507 static const uint8_t strand_field = 5;
508
509 string contig = toks[0];
510 for (int t = 1; t <= num_extra_toks; ++t)
511 {
512 contig += "|";
513 contig += toks[t];
514 }
515
516 vector<string> splice_toks;
517 tokenize(toks[num_extra_toks + splice_field], "-", splice_toks);
518 if (splice_toks.size() != 2)
519 {
520 fprintf(stderr, "Warning: found malformed splice record, skipping:\n");
521 //fprintf(stderr, "%s (token: %s)\n", text_name,
522 // toks[num_extra_toks + splice_field].c_str());
523 return false;
524 }
525
526 //
527 // check for an insertion hit
528 //
529 if(toks[num_extra_toks + junction_type_field] == "ins")
530 {
531 int8_t spliced_read_len = strlen(seq_str);
532 /*
533 * The 0-based position of the left edge of the alignment. Note that this
534 * value may need to be futher corrected to account for the presence of
535 * of the insertion.
536 */
537 uint32_t left = atoi(toks[num_extra_toks + left_window_edge_field].c_str()) + text_offset;
538 uint32_t right = left + spliced_read_len - 1;
539
540
541 /*
542 * The 0-based position of the last genomic sequence before the insertion
543 */
544 uint32_t left_splice_pos = atoi(splice_toks[0].c_str());
545
546 string insertedSequence = splice_toks[1];
547 /*
548 * The 0-based position of the first genomic sequence after teh insertion
549 */
550 uint32_t right_splice_pos = left_splice_pos + 1;
551 if(left > left_splice_pos){
552 /*
553 * The genomic position of the left edge of the alignment needs to be corrected
554 * If the alignment does not extend into the insertion, simply subtract the length
555 * of the inserted sequence, otherwise, just set it equal to the right edge
556 */
557 left = left - insertedSequence.length();
558 if(left < right_splice_pos){
559 left = right_splice_pos;
560 }
561 }
562 if(right > left_splice_pos){
563 right = right - insertedSequence.length();
564 if(right < left_splice_pos){
565 right = left_splice_pos;
566 }
567 }
568 /*
569 * Now, right and left should be properly transformed into genomic coordinates
570 * We should be able to deduce how much the alignment matches the insertion
571 * simply based on the length of the read
572 */
573 int left_match_length = 0;
574 if(left <= left_splice_pos){
575 left_match_length = left_splice_pos - left + 1;
576 }
577 int right_match_length = 0;
578 if(right >= right_splice_pos){
579 right_match_length = right - right_splice_pos + 1;
580 }
581 int insertion_match_length = spliced_read_len - left_match_length - right_match_length;
582
583 if(left_match_length <= 0 || right_match_length <= 0 || insertion_match_length <= 0)
584 return false;
585
586 string junction_strand = toks[num_extra_toks + strand_field];
587 if(junction_strand != "rev" && junction_strand != "fwd"){
588 fprintf(stderr, "Malformed insertion record\n");
589 return false;
590 }
591
592 char* pch = strtok( mismatches, ",");
593 unsigned char num_mismatches = 0;
594
595 /*
596 * remember that text_offset holds the left end of the
597 * alignment, relative to the start of the contig
598 */
599
600 /*
601 * The 0-based relative position of the left-most character
602 * before the insertion in the contig
603 */
604 int relative_splice_pos = left_splice_pos - atoi(toks[num_extra_toks + left_window_edge_field].c_str());
605 while (pch != NULL)
606 {
607 char* colon = strchr(pch, ':');
608 if (colon)
609 {
610 *colon = 0;
611 int mismatch_pos = atoi(pch);
612
613 /*
614 * for reversely mapped reads,
615 * find the correct mismatched position.
616 */
617 if(orientation == '-'){
618 mismatch_pos = spliced_read_len - mismatch_pos - 1;
619 }
620
621 /*
622 * Only count mismatches outside of the insertion region
623 * If there is a mismatch within the insertion,
624 * disallow this hit
625 */
626 if(mismatch_pos + text_offset <= relative_splice_pos || mismatch_pos + text_offset > relative_splice_pos + insertedSequence.length()){
627 num_mismatches++;
628 }else{
629 return false;
630 }
631 }
632 pch = strtok (NULL, ",");
633 }
634
635
636 vector<CigarOp> cigar;
637 cigar.push_back(CigarOp(MATCH, left_match_length));
638 cigar.push_back(CigarOp(INS, insertion_match_length));
639 cigar.push_back(CigarOp(MATCH, right_match_length));
640
641 /*
642 * For now, disallow hits that don't span
643 * the insertion. If we allow these types of hits,
644 * then long_spanning.cpp needs to be updated
645 * in order to intelligently merge these kinds
646 * of reads back together
647 *
648 * Following code has been changed to allow segment that end
649 * in an insertion
650 */
651 bh = create_hit(name,
652 contig,
653 "",
654 left,
655 cigar,
656 orientation == '-',
657 junction_strand == "rev",
658 num_mismatches,
659 0,
660 end);
661 return true;
662 }
663
664 else
665 {
666 const string& junction_type = toks[num_extra_toks + junction_type_field];
667 string junction_strand = toks[num_extra_toks + strand_field];
668
669 int spliced_read_len = strlen(seq_str);
670 uint32_t left = atoi(toks[num_extra_toks + left_window_edge_field].c_str());
671 int8_t left_splice_pos = atoi(splice_toks[0].c_str());
672 if (junction_type != "fus" || (junction_strand != "rf" && junction_strand != "rr"))
673 {
674 left += text_offset;
675 left_splice_pos = left_splice_pos - left + 1;
676 }
677 else
678 {
679 left -= text_offset;
680 left_splice_pos = left - left_splice_pos + 1;
681 }
682
683 if(left_splice_pos > spliced_read_len) left_splice_pos = spliced_read_len;
684 int8_t right_splice_pos = spliced_read_len - left_splice_pos;
685
686 int gap_len = 0;
687 if (junction_type == "fus")
688 gap_len = atoi(splice_toks[1].c_str());
689 else
690 gap_len = atoi(splice_toks[1].c_str()) - atoi(splice_toks[0].c_str()) - 1;
691
692 if (right_splice_pos <= 0 || left_splice_pos <= 0)
693 return false;
694
695 if (orientation == '+')
696 {
697 if (left_splice_pos + seg_offset < _anchor_length){
698 return false;
699 }
700 }
701 else
702 {
703 if (right_splice_pos + seg_offset < _anchor_length)
704 return false;
705 }
706
707 if (!(junction_strand == "ff" || junction_strand == "fr" || junction_strand == "rf" || junction_strand == "rr" || junction_strand == "rev" || junction_strand == "fwd")||
708 !(orientation == '-' || orientation == '+'))
709 {
710 fprintf(stderr, "Warning: found malformed splice record, skipping\n");
711 //fprintf(stderr, "junction_strand=%s, orientation='%c'\n",
712 // junction_strand.c_str(), orientation);
713 return false;
714 }
715
716 char* pch = strtok (mismatches,",");
717 int mismatches_in_anchor = 0;
718 unsigned char num_mismatches = 0;
719 while (pch != NULL)
720 {
721 char* colon = strchr(pch, ':');
722 if (colon)
723 {
724 *colon = 0;
725 num_mismatches++;
726 int mismatch_pos = atoi(pch);
727 if ((orientation == '+' && abs(mismatch_pos - left_splice_pos) < (int)min_anchor_len) ||
728 (orientation == '-' && abs(((int)spliced_read_len - left_splice_pos + 1) - mismatch_pos)) < (int)min_anchor_len)
729 mismatches_in_anchor++;
730 }
731 pch = strtok (NULL, ",");
732 }
733
734 // FIXME: we probably should exclude these hits somewhere, but this
735 // isn't the right place
736 vector<CigarOp> cigar;
737 if (junction_type != "fus" || (junction_strand != "rf" && junction_strand != "rr"))
738 cigar.push_back(CigarOp(MATCH, left_splice_pos));
739 else
740 cigar.push_back(CigarOp(mATCH, left_splice_pos));
741
742 if(junction_type == "del")
743 cigar.push_back(CigarOp(DEL, gap_len));
744 else if(junction_type == "fus")
745 {
746 if (junction_strand == "ff")
747 cigar.push_back(CigarOp(FUSION_FF, gap_len));
748 else if (junction_strand == "fr")
749 cigar.push_back(CigarOp(FUSION_FR, gap_len));
750 else if (junction_strand == "rf")
751 cigar.push_back(CigarOp(FUSION_RF, gap_len));
752 else
753 cigar.push_back(CigarOp(FUSION_RR, gap_len));
754 }
755 else
756 cigar.push_back(CigarOp(REF_SKIP, gap_len));
757
758 if (junction_type != "fus" || (junction_strand != "fr" && junction_strand != "rr"))
759 cigar.push_back(CigarOp(MATCH, right_splice_pos));
760 else
761 cigar.push_back(CigarOp(mATCH, right_splice_pos));
762
763 string contig2 = "";
764 if (junction_type == "fus")
765 {
766 vector<string> contigs;
767 tokenize(contig, "-", contigs);
768 if (contigs.size() != 2)
769 return false;
770
771 contig = contigs[0];
772 contig2 = contigs[1];
773
774 if (junction_strand == "rf" || junction_strand == "rr")
775 orientation = (orientation == '+' ? '-' : '+');
776 }
777
778 bh = create_hit(name,
779 contig,
780 contig2,
781 left,
782 cigar,
783 orientation == '-',
784 junction_strand == "rev",
785 num_mismatches,
786 mismatches_in_anchor,
787 end);
788 return true;
789 }
790 }
791 else
792 {
793 fprintf(stderr, "Warning: found malformed splice record, skipping\n");
794 //fprintf(stderr, "%s\n", orig_bwt_buf);
795 // continue;
796 return false;
797 }
798
799 return false;
800 }
801
802
803 int parseCigar(vector<CigarOp>& cigar, const char* cigar_str,
804 bool &spliced_alignment) {
805 const char* p_cig = cigar_str;
806 int refspan=0; //alignment span on reference sequence
807
808 while (*p_cig)
809 {
810 char* t;
811 int op_len = (int)strtol(p_cig, &t, 10);
812 if (op_len <= 0)
813 {
814 fprintf (stderr, "Error: CIGAR op has zero length\n");
815 return 0;
816 }
817 char op_char = toupper(*t);
818 CigarOpCode opcode;
819 switch (op_char) {
820 case '=':
821 case 'X':
822 case 'M': opcode = MATCH;
823 refspan+=op_len;
824 break;
825 case 'I': opcode = INS;
826 break;
827 case 'D': opcode = DEL;
828 refspan+=op_len;
829 break;
830 case 'N': if (op_len > max_report_intron_length)
831 return 0;
832 opcode = REF_SKIP;
833 spliced_alignment = true;
834 refspan+=op_len;
835 break;
836 case 'S': opcode = SOFT_CLIP;
837 break;
838 case 'H': opcode = HARD_CLIP;
839 break;
840 case 'P': opcode = PAD;
841 break;
842 default: fprintf (stderr, "Error: invalid CIGAR operation\n");
843 return 0;
844 }
845 p_cig = t + 1;
846 cigar.push_back(CigarOp(opcode, op_len));
847 } //while cigar codes
848 if (*p_cig) {
849 fprintf (stderr, "Error: unmatched CIGAR operation (%s)\n",p_cig);
850 return 0;
851 }
852 return refspan;
853 }
854
855 int getBAMmismatches(const bam1_t* buf, vector<CigarOp>& cigar,
856 vector<bool>& mismatches, int& sam_nm, bool& antisense_splice) {
857 int gspan=0;//genomic span of the alignment
858 sam_nm=0;
859 int num_mismatches=0;
860
861 uint8_t* ptr = bam_aux_get(buf, "XS");
862 if (ptr) {
863 char src_strand_char = bam_aux2A(ptr);
864 if (src_strand_char == '-')
865 antisense_splice = true;
866 }
867
868 ptr = bam_aux_get(buf, "MD");
869 if (ptr) {
870 const char* p = bam_aux2Z(ptr);
871 int bi=0; //base offset position in the read
872 while (*p != 0) {
873 if (isdigit(*p)) {
874 int v=atoi(p);
875 do { p++; } while (isdigit(*p));
876 bi+=v;
877 }
878 while (isalpha(*p)) {
879 p++;
880 num_mismatches++;
881 //mismatches.push_back(bi);
882 mismatches[bi]=true;
883 bi++;
884 }
885 if (*p=='^') { //reference deletion
886 p++;
887 while (isalpha(*p)) { //insert read bases
888 p++; bi++;
889 }
890 }
891 }
892 }
893
894 /* By convention,the NM field of the SAM record
895 * counts an insertion or deletion. I dont' think
896 * we want the mismatch count in the BowtieHit
897 * record to reflect this. Therefore, subtract out
898 * the mismatches due to in/dels
899 */
900 for(vector<CigarOp>::const_iterator itr = cigar.begin(); itr != cigar.end(); ++itr){
901 switch (itr->opcode)
902 {
903 case MATCH:
904 case REF_SKIP:
905 case PAD:
906 gspan += itr->length;
907 break;
908 case DEL:
909 gspan += itr->length;
910 sam_nm -= itr->length;
911 break;
912 case INS:
913 sam_nm -= itr->length;
914 break;
915 default:
916 break;
917 }
918 }
919 return num_mismatches;
920 }
921
922 int getSAMmismatches(char* &buf, vector<CigarOp>& cigar,
923 vector<bool>& mismatches, int& sam_nm, bool& antisense_splice) {
924 int gspan=0;//genomic span of the alignment
925 const char* tag_buf = buf;
926 sam_nm=0;
927 int num_mismatches=0;
928 while((tag_buf = get_token((char**)&buf,"\t")))
929 {
930 vector<string> tuple_fields;
931 tokenize(tag_buf,":", tuple_fields);
932 if (tuple_fields.size() == 3)
933 {
934 if (tuple_fields[0] == "XS")
935 {
936 if (tuple_fields[2] == "-")
937 antisense_splice = true;
938 }
939 else if (tuple_fields[0] == "NM")
940 {
941 sam_nm = atoi(tuple_fields[2].c_str());
942 }
943 else if (tuple_fields[0] == "NS")
944 {
945 //ignored for now
946 }
947 else if (tuple_fields[0] == "MD")
948 {
949 const char* p=tuple_fields[2].c_str();
950 int bi=0; //base offset position in the read
951 while (*p != 0) {
952 if (isdigit(*p)) {
953 int v=atoi(p);
954 do { p++; } while (isdigit(*p));
955 bi+=v;
956 }
957 while (isalpha(*p)) {
958 p++;
959 num_mismatches++;
960 //mismatches.push_back(bi);
961 mismatches[bi]=true;
962 bi++;
963 }
964 if (*p=='^') { //reference deletion
965 p++;
966 while (isalpha(*p)) { //insert read bases
967 p++; bi++;
968 }
969 }
970 }
971 }
972 //else
973 //{
974 //fprintf(stderr, "%s attribute not supported\n", tuple_fields[0].c_str());
975 //return false;
976 //}
977 }
978 }
979 /* By convention,the NM field of the SAM record
980 * counts an insertion or deletion. I dont' think
981 * we want the mismatch count in the BowtieHit
982 * record to reflect this. Therefore, subtract out
983 * the mismatches due to in/dels
984 */
985 for(vector<CigarOp>::const_iterator itr = cigar.begin(); itr != cigar.end(); ++itr){
986 switch (itr->opcode)
987 {
988 case MATCH:
989 case REF_SKIP:
990 case PAD:
991 gspan += itr->length;
992 break;
993 case DEL:
994 gspan += itr->length;
995 sam_nm -= itr->length;
996 break;
997 case INS:
998 sam_nm -= itr->length;
999 break;
1000 default:
1001 break;
1002 }
1003 }
1004 return num_mismatches;
1005 }
1006
1007 bool SAMHitFactory::get_hit_from_buf(const char* orig_bwt_buf,
1008 BowtieHit& bh,
1009 bool strip_slash,
1010 char* name_out,
1011 char* name_tags,
1012 char* seq,
1013 char* qual)
1014 {
1015 if (!orig_bwt_buf || !*orig_bwt_buf)
1016 return false;
1017 char bwt_buf[2048];
1018 strcpy(bwt_buf, orig_bwt_buf);
1019 // Are we still in the header region?
1020 if (bwt_buf[0] == '@')
1021 return false;
1022
1023 char* buf = bwt_buf;
1024 char* name = get_token((char**)&buf,"\t");
1025 char* sam_flag_str = get_token((char**)&buf,"\t");
1026 char* text_name = get_token((char**)&buf,"\t");
1027 char* text_offset_str = get_token((char**)&buf,"\t");
1028 const char* map_qual_str = get_token((char**)&buf,"\t");
1029 char* cigar_str = get_token((char**)&buf,"\t");
1030 const char* mate_ref_str = get_token((char**)&buf,"\t");
1031 const char* mate_pos_str = get_token((char**)&buf,"\t");
1032 const char* inferred_insert_sz_str = get_token((char**)&buf,"\t");
1033
1034 const char* seq_str = get_token((char**)&buf,"\t");
1035 if (seq)
1036 strcpy(seq, seq_str);
1037 const char* qual_str = get_token((char**)&buf,"\t");
1038 if (qual)
1039 strcpy(qual, qual_str);
1040
1041 if (!name ||
1042 !sam_flag_str ||
1043 !text_name ||
1044 !text_offset_str ||
1045 !map_qual_str ||
1046 !cigar_str ||
1047 !mate_ref_str ||
1048 !mate_pos_str ||
1049 !inferred_insert_sz_str ||
1050 !seq_str ||
1051 !qual_str)
1052 {
1053 // truncated or malformed SAM record
1054 return false;
1055 }
1056
1057 int sam_flag = atoi(sam_flag_str);
1058 string ref_name = text_name, ref_name2 = "";
1059 int text_offset = atoi(text_offset_str);
1060
1061 bool end = true;
1062 unsigned int seg_offset = 0;
1063 unsigned int seg_num = 0;
1064 unsigned int num_segs = 0;
1065
1066 // Copy the tag out of the name field before we might wipe it out
1067 parseSegReadName(name, name_tags, strip_slash, end, seg_offset, seg_num, num_segs);
1068
1069 vector<CigarOp> cigar;
1070 bool spliced_alignment = false;
1071
1072 int refspan=parseCigar(cigar, cigar_str, spliced_alignment);
1073 if (refspan==0)
1074 return false;
1075 //vector<string> attributes;
1076 //tokenize(tag_buf, " \t",attributes);
1077
1078 bool antisense_splice = false;
1079 int sam_nm = 0; //the value of the NM tag (edit distance)
1080 //int mismatches[1024];//array with mismatch positions on the read (0-based from the left aligned end of the read)
1081 vector<bool> mismatches;
1082 mismatches.resize(strlen(seq_str), false);
1083 int num_mismatches=getSAMmismatches(buf, cigar, mismatches, sam_nm, antisense_splice);
1084
1085 if (spliced_alignment)
1086 {
1087 bh = create_hit(name,
1088 ref_name,
1089 ref_name2,
1090 text_offset - 1,
1091 cigar,
1092 sam_flag & 0x0010,
1093 antisense_splice,
1094 num_mismatches,
1095 0,
1096 end);
1097 }
1098 else
1099 {
1100 //assert(cigar.size() == 1 && cigar[0].opcode == MATCH);
1101 bh = create_hit(name,
1102 ref_name,
1103 ref_name2,
1104 text_offset - 1, // SAM files are 1-indexed
1105 cigar,
1106 sam_flag & 0x0010,
1107 false,
1108 num_mismatches,
1109 0,
1110 end);
1111 }
1112 return true;
1113 }
1114
1115 void cigar_add(vector<CigarOp>& cigar, CigarOp& op) {
1116 if (op.length<=0) return;
1117 if (cigar.size()>0 && cigar.back().opcode==op.opcode) {
1118 cigar.back().length+=op.length;
1119 }
1120 cigar.push_back(op);
1121 }
1122
1123 bool spliceCigar(vector<CigarOp>& splcigar, vector<CigarOp>& cigar, vector<bool> mismatches,
1124 int &left, int spl_start, int spl_len, CigarOpCode spl_code, int& spl_mismatches) {
1125 //merge the original 'cigar' with the new insert/gap operation
1126 //at position spl_start and place the result into splcigar;
1127 //TODO: ideally this should also get and rebuild the MD string (alignment mismatches)
1128
1129 //return value: mismatches in the insert region for INS case,
1130 //or number of mismatches in the anchor region
1131 //return -1 if somehow the hit seems bad
1132
1133 //these offsets are relative to the beginning of alignment on reference
1134 int spl_ofs=spl_start-left; //relative position of splice op
1135 if (spl_code == FUSION_FF || spl_code == FUSION_FR || spl_code == FUSION_RF || spl_code == FUSION_RR)
1136 spl_ofs = abs(spl_ofs);
1137 int spl_ofs_end=spl_ofs; //relative position of first ref base AFTER splice op
1138 CigarOp gapop(spl_code, spl_len); //for DEL, REF_SKIP, FUSIONS
1139 if (spl_code==INS)
1140 spl_ofs_end += spl_len;
1141
1142 int ref_ofs=0; //working offset on reference
1143 int read_ofs=0; //working offset on the read, relative to the leftmost aligned base
1144 bool xfound=false;
1145 //if (left<=spl_start+spl_len) {
1146 if (spl_ofs_end>0) {
1147 int prev_opcode=0;
1148 int prev_oplen=0;
1149 for (size_t c = 0 ; c < cigar.size(); ++c)
1150 {
1151 int prev_read_ofs=read_ofs;
1152 int cur_op_ofs=ref_ofs;
1153 int cur_opcode=cigar[c].opcode;
1154 int cur_oplen=cigar[c].length;
1155
1156 switch (cur_opcode) {
1157 case MATCH:
1158 ref_ofs+=cur_oplen;
1159 read_ofs+=cur_oplen;
1160 if (spl_code==REF_SKIP || spl_code==DEL ||
1161 spl_code==FUSION_FF || spl_code==FUSION_FR ||
1162 spl_code==FUSION_RF || spl_code==FUSION_RR) {
1163 for (int o=cur_op_ofs;o<ref_ofs;o++) {
1164 int rofs=prev_read_ofs+(o-cur_op_ofs);
1165 if (abs(spl_ofs-o)<min_anchor_len && mismatches[rofs])
1166 spl_mismatches++;
1167 }
1168 }
1169 else if (spl_code==INS) {
1170 for (int o=cur_op_ofs;o<ref_ofs;o++) {
1171 int rofs=prev_read_ofs+(o-cur_op_ofs);
1172 if (o>=spl_ofs && o<spl_ofs_end && mismatches[rofs])
1173 spl_mismatches++;
1174 }
1175 }
1176 break;
1177 case DEL:
1178 case REF_SKIP:
1179 case PAD:
1180 ref_ofs+=cur_oplen;
1181 break;
1182 case SOFT_CLIP:
1183 case INS:
1184 read_ofs+=cur_oplen;
1185 break;
1186 //case HARD_CLIP:
1187 }
1188
1189 if (cur_op_ofs>=spl_ofs_end || ref_ofs<=spl_ofs) {
1190 if (cur_op_ofs==spl_ofs_end) {
1191 if (spl_code!=INS) {
1192 if (cur_opcode!=INS) {
1193 xfound=true;
1194 //we have to insert the gap here first
1195 cigar_add(splcigar, gapop);
1196 //also, check
1197 }
1198 }
1199 }
1200
1201 CigarOp op(cigar[c]);
1202 if (xfound)
1203 {
1204 if (spl_code == FUSION_FR || spl_code == FUSION_RR)
1205 {
1206 if (op.opcode == MATCH)
1207 op.opcode = mATCH;
1208 else if (op.opcode == INS)
1209 op.opcode = iNS;
1210 else if (op.opcode == DEL)
1211 op.opcode = dEL;
1212 else if (op.opcode == REF_SKIP)
1213 op.opcode = rEF_SKIP;
1214 }
1215 }
1216 else
1217 {
1218 if (spl_code == FUSION_RF || spl_code == FUSION_RR)
1219 {
1220 if (op.opcode == MATCH)
1221 op.opcode = mATCH;
1222 else if (op.opcode == INS)
1223 op.opcode = iNS;
1224 else if (op.opcode == DEL)
1225 op.opcode = dEL;
1226 else if (op.opcode == REF_SKIP)
1227 op.opcode = rEF_SKIP;
1228 }
1229 }
1230
1231 cigar_add(splcigar, op);
1232 }
1233 else //if (ref_ofs>spl_ofs) {
1234 { //op intersection
1235 xfound=true;
1236 if (spl_code==INS) {
1237 //we have to shorten cur_opcode
1238 // find the overlap between current range
1239 int ovl_start = (cur_op_ofs>spl_ofs) ? cur_op_ofs : spl_ofs;
1240 int ovl_end = (ref_ofs>spl_ofs_end) ? spl_ofs_end : ref_ofs;
1241
1242 CigarOp op(cigar[c]);
1243 op.length=spl_ofs-cur_op_ofs;
1244 if (spl_ofs>cur_op_ofs)
1245 cigar_add(splcigar, op);
1246 if (spl_ofs<0)
1247 {
1248 CigarOp temp = gapop;
1249 temp.length += spl_ofs;
1250 if (temp.length>0)
1251 cigar_add(splcigar, temp);
1252 }
1253 else
1254 cigar_add(splcigar, gapop);
1255 op.length=ref_ofs-spl_ofs_end;
1256 if (ref_ofs>spl_ofs_end)
1257 cigar_add(splcigar,op);
1258 }
1259 else {//DEL or REF_SKIP or FUSION_[FR][FR]
1260 //spl_ofs == spl_ofs_end
1261 //we have to split cur_opcode
1262 //look for mismatches within min_anchor_len distance from splice point
1263 CigarOp op(cigar[c]);
1264 CigarOpCode opcode = op.opcode;
1265 op.length=spl_ofs-cur_op_ofs;
1266 if (spl_code == FUSION_RF || spl_code == FUSION_RR)
1267 {
1268 if (opcode == MATCH)
1269 op.opcode = mATCH;
1270 else if (opcode == INS)
1271 op.opcode = iNS;
1272 else if (opcode == DEL)
1273 op.opcode = dEL;
1274 else if (opcode == REF_SKIP)
1275 op.opcode = rEF_SKIP;
1276 }
1277 cigar_add(splcigar, op);
1278
1279 cigar_add(splcigar, gapop);
1280
1281 op.opcode = opcode;
1282 if (spl_code == FUSION_FR || spl_code == FUSION_RR)
1283 {
1284 if (opcode == MATCH)
1285 op.opcode = mATCH;
1286 else if (opcode == INS)
1287 op.opcode = iNS;
1288 else if (opcode == DEL)
1289 op.opcode = dEL;
1290 else if (opcode == REF_SKIP)
1291 op.opcode = rEF_SKIP;
1292 }
1293 op.length=ref_ofs-spl_ofs;
1294 cigar_add(splcigar,op);
1295 }
1296 } //op intersection
1297 prev_opcode=cur_opcode;
1298 prev_oplen=cur_oplen;
1299 } //for each cigar opcode
1300 } //intersection possible
1301
1302 //if (!xfound) {//no intersection found between splice event and alignment
1303 if (spl_ofs_end<=0) {
1304 //alignment starts after the splice event
1305 if (spl_code==INS) left-=spl_len;
1306 else left+=spl_len;
1307
1308 splcigar = cigar;
1309 }
1310 //else {
1311 //alignment ends before the splice event
1312 //nothing to do
1313 // }
1314 //return spl_mismatches;
1315 // }
1316
1317 return xfound;
1318 }
1319
1320 bool SplicedSAMHitFactory::get_hit_from_buf(const char* orig_bwt_buf,
1321 BowtieHit& bh,
1322 bool strip_slash,
1323 char* name_out,
1324 char* name_tags,
1325 char* seq,
1326 char* qual)
1327 {
1328 if (!orig_bwt_buf || !*orig_bwt_buf)
1329 return false;
1330
1331 char bwt_buf[2048];
1332 strcpy(bwt_buf, orig_bwt_buf);
1333 // Are we still in the header region?
1334 if (bwt_buf[0] == '@')
1335 return false;
1336
1337 char* buf = bwt_buf;
1338 char* name = get_token((char**)&buf,"\t");
1339 char* sam_flag_str = get_token((char**)&buf,"\t");
1340 char* text_name = get_token((char**)&buf,"\t");
1341 char* text_offset_str = get_token((char**)&buf,"\t");
1342 const char* map_qual_str = get_token((char**)&buf,"\t");
1343 char* cigar_str = get_token((char**)&buf,"\t");
1344 const char* mate_ref_str = get_token((char**)&buf,"\t");
1345 const char* mate_pos_str = get_token((char**)&buf,"\t");
1346 const char* inferred_insert_sz_str = get_token((char**)&buf,"\t");
1347 //int num_mismatches=0;
1348 //int mismatches[1024]; //list of 0-based mismatch positions in this read
1349 //parsed from SAM's MD:Z: tag
1350 const char* seq_str = get_token((char**)&buf,"\t");
1351 if (seq)
1352 strcpy(seq, seq_str);
1353
1354 const char* qual_str = get_token((char**)&buf,"\t");
1355 if (qual)
1356 strcpy(qual, qual_str);
1357
1358 if (!name ||
1359 !sam_flag_str ||
1360 !text_name ||
1361 !text_offset_str ||
1362 !map_qual_str ||
1363 !cigar_str ||
1364 !mate_ref_str ||
1365 !mate_pos_str ||
1366 !inferred_insert_sz_str ||
1367 !seq_str ||
1368 !qual_str)
1369 {
1370 // truncated or malformed SAM record
1371 return false;
1372 }
1373
1374 int sam_flag = atoi(sam_flag_str);
1375 int text_offset = atoi(text_offset_str);
1376 text_offset--; //make it 0-based (SAM is 1-based, Bowtie is 0-based)
1377 bool end = true;
1378 unsigned int seg_offset = 0;
1379 unsigned int seg_num = 0;
1380 unsigned int num_segs = 0;
1381
1382 // Copy the tag out of the name field before we might wipe it out
1383 parseSegReadName(name, name_tags, strip_slash, end, seg_offset, seg_num, num_segs);
1384
1385 vector<CigarOp> samcigar;
1386 bool spliced_alignment = false;
1387 int refspan=parseCigar(samcigar, cigar_str, spliced_alignment);
1388
1389 if (refspan==0)
1390 return false;
1391 bool antisense_splice = false;
1392 int sam_nm = 0;
1393 vector<bool> mismatches;
1394 mismatches.resize(strlen(seq_str), false);
1395
1396 int num_mismatches=getSAMmismatches(buf, samcigar, mismatches, sam_nm, antisense_splice);
1397
1398 //##############################################
1399
1400 // Add this alignment to the table of hits for this half of the
1401 // Bowtie map
1402
1403 // Parse the text_name field to recover the splice coords
1404 vector<string> toks;
1405 tokenize_strict(text_name, "|", toks);
1406
1407 int num_extra_toks = (int)toks.size() - 6;
1408
1409 if (num_extra_toks >= 0)
1410 {
1411 static const uint8_t left_window_edge_field = 1;
1412 static const uint8_t splice_field = 2;
1413 //static const uint8_t right_window_edge_field = 3;
1414 static const uint8_t junction_type_field = 4;
1415 static const uint8_t strand_field = 5;
1416
1417 string contig = toks[0];
1418 for (int t = 1; t <= num_extra_toks; ++t)
1419 {
1420 contig += "|";
1421 contig += toks[t];
1422 }
1423
1424 vector<string> splice_toks;
1425 tokenize(toks[num_extra_toks + splice_field], "-", splice_toks);
1426 if (splice_toks.size() != 2)
1427 {
1428 fprintf(stderr, "Warning: found malformed splice record, skipping:\n");
1429 //fprintf(stderr, "\t%s (token: %s)\n", text_name,
1430 // toks[num_extra_toks + splice_field].c_str());
1431 return false;
1432 }
1433
1434 string junction_strand = toks[num_extra_toks + strand_field];
1435 if(junction_strand != "rev" && junction_strand != "fwd"){
1436 fprintf(stderr, "Malformed insertion record\n");
1437 return false;
1438 }
1439
1440 //
1441 // check for an insertion hit
1442 //
1443 if(toks[num_extra_toks + junction_type_field] == "ins")
1444 {
1445 //int8_t spliced_read_len = strlen(seq_str);
1446 //TODO FIXME: use the CIGAR instead of seq length!
1447 // The 0-based position of the left edge of the alignment. Note that this
1448 // value may need to be further corrected to account for the presence of
1449 // of the insertion.
1450 int left = atoi(toks[num_extra_toks + left_window_edge_field].c_str()) + text_offset;
1451
1452 // The 0-based position of the last genomic sequence before the insertion
1453 int left_splice_pos = atoi(splice_toks[0].c_str());
1454
1455 string insertedSequence = splice_toks[1];
1456 // The 0-based position of the first genomic sequence after the insertion
1457
1458 vector<CigarOp> splcigar;
1459 //this also updates left to the adjusted genomic coordinates
1460 int spl_num_mismatches=0;
1461 bool overlapped = spliceCigar(splcigar, samcigar, mismatches,
1462 left, left_splice_pos+1, insertedSequence.length(),
1463 INS, spl_num_mismatches);
1464
1465 if (!overlapped)
1466 return false;
1467
1468 if (spl_num_mismatches<0) return false;
1469 num_mismatches-=spl_num_mismatches;
1470 /*
1471 uint32_t right_splice_pos = left_splice_pos + 1;
1472
1473 //uint32_t right = left + spliced_read_len - 1;
1474 int right = left + refspan - 1;
1475
1476 if(left > left_splice_pos){
1477 //The genomic position of the left edge of the alignment needs to be corrected
1478 //If the alignment does not extend into the insertion, simply subtract the length
1479 //of the inserted sequence, otherwise, just set it equal to the right edge
1480 left = left - insertedSequence.length();
1481 if(left < right_splice_pos){
1482 left = right_splice_pos;
1483 }
1484 }
1485 if(right > left_splice_pos){
1486 right = right - insertedSequence.length();
1487 if(right < left_splice_pos){
1488 right = left_splice_pos;
1489 }
1490 }
1491 // Now, right and left should be properly transformed into genomic coordinates
1492 // We should be able to deduce how much the alignment matches the insertion
1493 // simply based on the length of the read
1494 int left_match_length = 0;
1495 if(left <= left_splice_pos){
1496 left_match_length = left_splice_pos - left + 1;
1497 }
1498 int right_match_length = 0;
1499 if(right >= right_splice_pos){
1500 right_match_length = right - right_splice_pos + 1;
1501 }
1502 int insertion_match_length = spliced_read_len - left_match_length - right_match_length;
1503
1504 if(left_match_length <= 0 || right_match_length <= 0 || insertion_match_length <= 0)
1505 return false;
1506
1507 //char* pch = strtok( mismatches, ",");
1508 //unsigned char num_mismatches = 0;
1509 //text_offset holds the left end of the alignment,
1510 //RELATIVE TO the start of the contig
1511
1512 //The 0-based relative position of the left-most character
1513 //before the insertion in the contig
1514 int relative_splice_pos = left_splice_pos - atoi(toks[num_extra_toks + left_window_edge_field].c_str());
1515 for (size_t i=0;i<mismatches.size();++i) {
1516 int mismatch_pos = mismatches[i];
1517 // for reversely mapped reads,
1518 //find the correct mismatched position.
1519 if (sam_flag & 0x0010){
1520 mismatch_pos = spliced_read_len - mismatch_pos - 1;
1521 }
1522
1523 //Only count mismatches outside of the insertion region
1524 // If there is a mismatch within the insertion,
1525 // disallow this hit
1526 if(mismatch_pos + text_offset <= relative_splice_pos ||
1527 mismatch_pos + text_offset > relative_splice_pos + insertedSequence.length()){
1528 spl_num_mismatches++;
1529 }else{
1530 return false;
1531 }
1532 }
1533 */
1534 //vector<CigarOp> splcigar;
1535 //spliceCigar(splcigar, samcigar, left_match_length, insertion_match_length, right_match_length, INS);
1536 //splcigar.push_back(CigarOp(MATCH, left_match_length));
1537 //splcigar.push_back(CigarOp(INS, insertion_match_length));
1538 //splcigar.push_back(CigarOp(MATCH, right_match_length));
1539
1540 bh = create_hit(name,
1541 contig,
1542 "",
1543 left,
1544 //splcigar,
1545 splcigar,
1546 sam_flag & 0x0010,
1547 junction_strand == "rev",
1548 num_mismatches,
1549 0,
1550 end);
1551
1552 return true;
1553 } //"ins"
1554 else //"del" or intron
1555 {
1556 // The 0-based position of the left edge of the alignment.
1557 int left = atoi(toks[num_extra_toks + left_window_edge_field].c_str()) + text_offset;
1558
1559 // The 0-based position of the last genomic sequence before the deletion
1560 int left_splice_pos = atoi(splice_toks[0].c_str());
1561
1562 int gap_len = atoi(splice_toks[1].c_str()) - left_splice_pos - 1;
1563 /*
1564 if ((sam_flag & 0x0010) == 0) //######
1565 {
1566 if (left_splice_ofs + seg_offset < _anchor_length)
1567 return false;
1568 }
1569 else
1570 {
1571 if (right_splice_ofs + seg_offset < _anchor_length)
1572 return false;
1573 }
1574 */
1575 //uint32_t right = atoi(splice_toks[1].c_str()) + right_splice_pos;
1576 //atoi(toks[right_window_edge_field].c_str());
1577
1578 /*
1579 //offset of deletion point, relative to the beginning of the alignment
1580 int left_splice_ofs = left_splice_pos-left+1;
1581
1582 int mismatches_in_anchor = 0;
1583 for (size_t i=0;i<mismatches.size();++i) {
1584 spl_num_mismatches++;
1585 int mismatch_pos = mismatches[i];
1586 if (((sam_flag & 0x0010) == 0 && abs(mismatch_pos - left_splice_ofs) < (int)min_anchor_len) ||
1587 ((sam_flag & 0x0010) != 0 &&
1588 abs(((int)refspan - left_splice_ofs + 1) - mismatch_pos)) < (int)min_anchor_len)
1589 mismatches_in_anchor++;
1590 }
1591 */
1592 vector<CigarOp> splcigar;
1593
1594 CigarOpCode opcode=(toks[num_extra_toks + junction_type_field] == "del")? DEL : REF_SKIP;
1595
1596 int spl_num_mismatches=0;
1597 bool overlapped = spliceCigar(splcigar, samcigar, mismatches, left,
1598 left_splice_pos+1, gap_len, opcode, spl_num_mismatches);
1599
1600 if (!overlapped)
1601 return false;
1602
1603 if (spl_num_mismatches<0) // || spl_num_mismatches>max_anchor_mismatches)
1604 return false;
1605 /*
1606 splcigar.push_back(CigarOp(MATCH, left_splice_pos));
1607 if(toks[num_extra_toks + junction_type_field] == "del"){
1608 splcigar.push_back(CigarOp(DEL, gap_len));
1609 }else{
1610 splcigar.push_back(CigarOp(REF_SKIP, gap_len));
1611 }
1612 splcigar.push_back(CigarOp(MATCH, right_splice_pos));
1613 */
1614 bh = create_hit(name,
1615 contig,
1616 "",
1617 left,
1618 splcigar,
1619 (sam_flag & 0x0010),
1620 junction_strand == "rev",
1621 num_mismatches,
1622 spl_num_mismatches,
1623 end);
1624
1625 return true;
1626 }
1627 } //parse splice data
1628 else
1629 {
1630 fprintf(stderr, "Warning: found malformed splice record, skipping\n");
1631 //fprintf(stderr, "%s\n", orig_bwt_buf);
1632 // continue;
1633 return false;
1634 }
1635
1636 return false;
1637 }
1638
1639
1640
1641 bool BAMHitFactory::get_hit_from_buf(const char* orig_bwt_buf,
1642 BowtieHit& bh, bool strip_slash,
1643 char* name_out, char* name_tags,
1644 char* seq, char* qual) {
1645 if (_sam_header==NULL)
1646 err_die("Error: no SAM header when BAMHitFactory::get_hit_from_buf()!");
1647 const bam1_t* hit_buf = (const bam1_t*)orig_bwt_buf;
1648
1649 uint32_t sam_flag = hit_buf->core.flag;
1650
1651 int text_offset = hit_buf->core.pos;
1652 int text_mate_pos = hit_buf->core.mpos;
1653 int target_id = hit_buf->core.tid;
1654 int mate_target_id = hit_buf->core.mtid;
1655
1656 vector<CigarOp> cigar;
1657 bool spliced_alignment = false;
1658 int num_hits = 1;
1659
1660 double mapQ = hit_buf->core.qual;
1661 long double error_prob;
1662 if (mapQ > 0)
1663 {
1664 long double p = (-1.0 * mapQ) / 10.0;
1665 error_prob = pow(10.0L, p);
1666 }
1667 else
1668 {
1669 error_prob = 1.0;
1670 }
1671
1672 bool end = true;
1673 unsigned int seg_offset = 0;
1674 unsigned int seg_num = 0;
1675 unsigned int num_segs = 0;
1676 // Copy the tag out of the name field before we might wipe it out
1677 char* qname = bam1_qname(hit_buf);
1678 char* pipe = strrchr(qname, '|');
1679 if (pipe)
1680 {
1681 if (name_tags)
1682 strcpy(name_tags, pipe);
1683
1684 char* tag_buf = pipe + 1;
1685 if (strchr(tag_buf, ':'))
1686 {
1687 sscanf(tag_buf, "%u:%u:%u", &seg_offset, &seg_num, &num_segs);
1688 if (seg_num + 1 == num_segs)
1689 end = true;
1690 else
1691 end = false;
1692 }
1693
1694 *pipe = 0;
1695 }
1696
1697 if (target_id < 0) {
1698 //assert(cigar.size() == 1 && cigar[0].opcode == MATCH);
1699 bh = create_hit(qname,
1700 "*", //ref_name
1701 0, //left coord
1702 0, //read_len
1703 false, //antisense_aln
1704 0, //edit_dist
1705 end);
1706 return true;
1707 }
1708
1709 if (seq!=NULL) {
1710 char *bseq = (char*)bam1_seq(hit_buf);
1711 for(int i=0;i<(hit_buf->core.l_qseq);i++) {
1712 char v = bam1_seqi(bseq,i);
1713 seq[i]=bam_nt16_rev_table[v];
1714 }
1715 seq[hit_buf->core.l_qseq]=0;
1716 }
1717 if (qual!=NULL) {
1718 char *bq = (char*)bam1_qual(hit_buf);
1719 for(int i=0;i<(hit_buf->core.l_qseq);i++) {
1720 qual[i]=bq[i]+33;
1721 }
1722 qual[hit_buf->core.l_qseq]=0;
1723 }
1724
1725 bool antisense_splice=false;
1726 unsigned char num_mismatches = 0;
1727 unsigned char num_splice_anchor_mismatches = 0;
1728
1729 uint8_t* ptr = bam_aux_get(hit_buf, "XS");
1730 if (ptr) {
1731 char src_strand_char = bam_aux2A(ptr);
1732 if (src_strand_char == '-')
1733 antisense_splice = true;
1734 }
1735
1736 ptr = bam_aux_get(hit_buf, "NM");
1737 if (ptr) {
1738 num_mismatches = bam_aux2i(ptr);
1739 }
1740
1741 ptr = bam_aux_get(hit_buf, "NH");
1742 if (ptr) {
1743 num_hits = bam_aux2i(ptr);
1744 }
1745
1746 string text_name = _sam_header->target_name[target_id];
1747 string text_name2 = "";
1748
1749 bool fusion_alignment = false;
1750 string fusion_cigar_str;
1751 ptr = bam_aux_get(hit_buf, "XF");
1752 if (ptr) {
1753 fusion_alignment = true;
1754 char* xf = bam_aux2Z(ptr);
1755
1756 // ignore the second part of a fusion alignment
1757 if (xf[0] == '2')
1758 return false;
1759
1760 vector<string> fields;
1761 tokenize(xf, " ", fields);
1762
1763 vector<string> contigs;
1764 tokenize(fields[1], "-", contigs);
1765 if (contigs.size() >= 2)
1766 {
1767 text_name = contigs[0];
1768 text_name2 = contigs[1];
1769 }
1770
1771 text_offset = atoi(fields[2].c_str()) - 1;
1772 fusion_cigar_str = fields[3].c_str();
1773
1774 if (seq)
1775 strcpy(seq, fields[4].c_str());
1776 if (qual)
1777 strcpy(qual, fields[5].c_str());
1778 }
1779
1780 if (fusion_alignment) {
1781 const char* p_cig = fusion_cigar_str.c_str();
1782 while (*p_cig) {
1783 char* t;
1784 int length = (int)strtol(p_cig, &t, 10);
1785 if (length <= 0)
1786 {
1787 //fprintf (stderr, "CIGAR op has zero length\n");
1788 return false;
1789 }
1790 char op_char = *t;
1791 CigarOpCode opcode;
1792 if (op_char == 'M') opcode = MATCH;
1793 else if(op_char == 'm') opcode = mATCH;
1794 else if (op_char == 'I') opcode = INS;
1795 else if (op_char == 'i') opcode = iNS;
1796 else if (op_char == 'D') opcode = DEL;
1797 else if (op_char == 'd') opcode = dEL;
1798 else if (op_char == 'N' || op_char == 'n')
1799 {
1800 if (length > max_report_intron_length)
1801 return false;
1802
1803 if (op_char == 'N')
1804 opcode = REF_SKIP;
1805 else
1806 opcode = rEF_SKIP;
1807 spliced_alignment = true;
1808 }
1809 else if (op_char == 'F')
1810 {
1811 opcode = FUSION_FF;
1812 length = length - 1;
1813 }
1814 else if (op_char == 'S') opcode = SOFT_CLIP;
1815 else if (op_char == 'H') opcode = HARD_CLIP;
1816 else if (op_char == 'P') opcode = PAD;
1817 else
1818 {
1819 fprintf (stderr, "(%d-%d) invalid CIGAR operation\n", length, (int)op_char);
1820 return false;
1821 }
1822 p_cig = t + 1;
1823 cigar.push_back(CigarOp(opcode, length));
1824
1825 if(opcode == INS) {
1826 num_mismatches -= length;
1827 }
1828 else if(opcode == DEL) {
1829 num_mismatches -= length;
1830 }
1831
1832 /*
1833 * update fusion direction.
1834 */
1835 size_t cigar_size = cigar.size();
1836 if (cigar_size >= 3 && cigar[cigar_size - 2].opcode == FUSION_FF)
1837 {
1838 CigarOpCode prev = cigar[cigar_size - 3].opcode;
1839 CigarOpCode next = cigar[cigar_size - 1].opcode;
1840
1841 bool increase1 = false, increase2 = false;
1842 if (prev == MATCH || prev == DEL || prev == INS || prev == REF_SKIP)
1843 increase1 = true;
1844 if (next == MATCH || next == DEL || next == INS || next == REF_SKIP)
1845 increase2 = true;
1846
1847 if (increase1 && !increase2)
1848 cigar[cigar_size - 2].opcode = FUSION_FR;
1849 else if (!increase1 && increase2)
1850 cigar[cigar_size - 2].opcode = FUSION_RF;
1851 else if (!increase1 && !increase2)
1852 cigar[cigar_size - 2].opcode = FUSION_RR;
1853 }
1854 }
1855 }
1856 else {
1857 for (int i = 0; i < hit_buf->core.n_cigar; ++i)
1858 {
1859 int length = bam1_cigar(hit_buf)[i] >> BAM_CIGAR_SHIFT;
1860 if (length <= 0)
1861 {
1862 fprintf (stderr, "insert_id: %s - BAM error: CIGAR op has zero length\n", qname);
1863
1864 // daehwan - remove this
1865 exit(1);
1866
1867 return false;
1868 }
1869
1870 CigarOpCode opcode;
1871 switch(bam1_cigar(hit_buf)[i] & BAM_CIGAR_MASK)
1872 {
1873 case BAM_CMATCH: opcode = MATCH; break;
1874 case BAM_CINS: opcode = INS; break;
1875 case BAM_CDEL: opcode = DEL; break;
1876 case BAM_CSOFT_CLIP: opcode = SOFT_CLIP; break;
1877 case BAM_CHARD_CLIP: opcode = HARD_CLIP; break;
1878 case BAM_CPAD: opcode = PAD; break;
1879 case BAM_CREF_SKIP:
1880 opcode = REF_SKIP;
1881 spliced_alignment = true;
1882 if (length > (int)max_report_intron_length)
1883 {
1884 //fprintf(stderr, "Encounter REF_SKIP > max_gene_length, skipping\n");
1885 return false;
1886 }
1887 break;
1888 default:
1889 fprintf (stderr, "BAM read: invalid CIGAR operation\n");
1890 return false;
1891 }
1892
1893 if (opcode != HARD_CLIP)
1894 cigar.push_back(CigarOp(opcode, length));
1895
1896 /*
1897 * By convention,the NM field of the SAM record
1898 * counts an insertion or deletion. I dont' think
1899 * we want the mismatch count in the BowtieHit
1900 * record to reflect this. Therefore, subtract out
1901 * the mismatches due to in/dels
1902 */
1903 if(opcode == INS){
1904 num_mismatches -= length;
1905 }
1906 else if(opcode == DEL){
1907 num_mismatches -= length;
1908 }
1909 }
1910 }
1911
1912
1913 string mrnm;
1914 if (mate_target_id >= 0) {
1915 if (mate_target_id == target_id) {
1916 //mrnm = ((samfile_t*)(hs._hit_file))->header->target_name[mate_target_id];
1917 mrnm = _sam_header->target_name[mate_target_id];
1918 }
1919 else {
1920 //fprintf(stderr, "Trans-spliced mates are not currently supported, skipping\n");
1921 return false;
1922 }
1923 }
1924 else {
1925 text_mate_pos = 0;
1926 }
1927
1928 if (spliced_alignment) {
1929 bh = create_hit(qname,
1930 text_name,
1931 text_name2,
1932 text_offset, // BAM files are 0-indexed
1933 cigar,
1934 sam_flag & 0x0010,
1935 antisense_splice,
1936 num_mismatches,
1937 num_splice_anchor_mismatches,
1938 end);
1939
1940 }
1941 else {
1942 bh = create_hit(qname,
1943 text_name,
1944 text_name2,
1945 text_offset, // BAM files are 0-indexed
1946 cigar,
1947 sam_flag & 0x0010,
1948 false,
1949 num_mismatches,
1950 0,
1951 end);
1952 }
1953
1954 return true;
1955 }
1956
1957 bool BAMHitFactory::inspect_header(HitStream& hs)
1958 {
1959 bam_header_t* header = ((samfile_t*)(hs._hit_file))->header;
1960
1961 if (header == NULL) {
1962 fprintf(stderr, "Warning: No BAM header\n");
1963 return false;
1964 }
1965 if (header->l_text == 0) {
1966 fprintf(stderr, "Warning: BAM header has 0 length or is corrupted. Try using 'samtools reheader'.\n");
1967 return false;
1968 }
1969 return true;
1970 }
1971
1972 bool SplicedBAMHitFactory::get_hit_from_buf(const char* orig_bwt_buf,
1973 BowtieHit& bh, bool strip_slash,
1974 char* name_out, char* name_tags,
1975 char* seq, char* qual) {
1976 if (_sam_header==NULL)
1977 err_die("Error: no SAM header when BAMHitFactory::get_hit_from_buf()!");
1978
1979 const bam1_t* hit_buf = (const bam1_t*)orig_bwt_buf;
1980 uint32_t sam_flag = hit_buf->core.flag;
1981
1982 int text_offset = hit_buf->core.pos;
1983 int text_mate_pos = hit_buf->core.mpos;
1984 int target_id = hit_buf->core.tid;
1985 int mate_target_id = hit_buf->core.mtid;
1986
1987 vector<CigarOp> samcigar;
1988 bool spliced_alignment = false;
1989 int num_hits = 1;
1990
1991 double mapQ = hit_buf->core.qual;
1992 long double error_prob;
1993 if (mapQ > 0)
1994 {
1995 long double p = (-1.0 * mapQ) / 10.0;
1996 error_prob = pow(10.0L, p);
1997 }
1998 else
1999 {
2000 error_prob = 1.0;
2001 }
2002
2003 if (seq!=NULL) {
2004 char *bseq = (char*)bam1_seq(hit_buf);
2005 for(int i=0;i<(hit_buf->core.l_qseq);i++) {
2006 char v = bam1_seqi(bseq,i);
2007 seq[i]=bam_nt16_rev_table[v];
2008 }
2009 seq[hit_buf->core.l_qseq]=0;
2010 }
2011 if (qual!=NULL) {
2012 char *bq = (char*)bam1_qual(hit_buf);
2013 for(int i=0;i<(hit_buf->core.l_qseq);i++) {
2014 qual[i]=bq[i]+33;
2015 }
2016 qual[hit_buf->core.l_qseq]=0;
2017 }
2018
2019 bool end = true;
2020 unsigned int seg_offset = 0;
2021 unsigned int seg_num = 0;
2022 unsigned int num_segs = 0;
2023 // Copy the tag out of the name field before we might wipe it out
2024 char* name = bam1_qname(hit_buf);
2025 parseSegReadName(name, name_tags, strip_slash, end, seg_offset, seg_num, num_segs);
2026
2027 if (target_id < 0) {
2028 //assert(cigar.size() == 1 && cigar[0].opcode == MATCH);
2029 bh = create_hit(name,
2030 "*", //ref_name
2031 0, //left coord
2032 0, //read_len
2033 false, //antisense_aln
2034 0, //edit_dist
2035 end);
2036 return true;
2037 }
2038
2039 string text_name = _sam_header->target_name[target_id];
2040 for (int i = 0; i < hit_buf->core.n_cigar; ++i)
2041 {
2042 //char* t;
2043
2044 int length = bam1_cigar(hit_buf)[i] >> BAM_CIGAR_SHIFT;
2045 if (length <= 0)
2046 {
2047 fprintf (stderr, "BAM error: CIGAR op has zero length\n");
2048 return false;
2049 }
2050
2051 CigarOpCode opcode;
2052 switch(bam1_cigar(hit_buf)[i] & BAM_CIGAR_MASK)
2053 {
2054 case BAM_CMATCH: opcode = MATCH; break;
2055 case BAM_CINS: opcode = INS; break;
2056 case BAM_CDEL: opcode = DEL; break;
2057 case BAM_CSOFT_CLIP: opcode = SOFT_CLIP; break;
2058 case BAM_CHARD_CLIP: opcode = HARD_CLIP; break;
2059 case BAM_CPAD: opcode = PAD; break;
2060 case BAM_CREF_SKIP:
2061 opcode = REF_SKIP;
2062 spliced_alignment = true;
2063 if (length > (int)max_report_intron_length)
2064 {
2065 //fprintf(stderr, "Encounter REF_SKIP > max_gene_length, skipping\n");
2066 return false;
2067 }
2068 break;
2069 default:
2070 fprintf (stderr, "BAM read: invalid CIGAR operation\n");
2071 return false;
2072 }
2073 if (opcode != HARD_CLIP)
2074 samcigar.push_back(CigarOp(opcode, length));
2075 }
2076
2077 string mrnm;
2078 if (mate_target_id >= 0) {
2079 if (mate_target_id == target_id) {
2080 mrnm = _sam_header->target_name[mate_target_id];
2081 }
2082 else {
2083 return false;
2084 }
2085 }
2086 else {
2087 text_mate_pos = 0;
2088 }
2089
2090 bool antisense_splice = false;
2091 int sam_nm = 0;
2092 vector<bool> mismatches;
2093 mismatches.resize(strlen(seq), false);
2094 int num_mismatches=getBAMmismatches(hit_buf, samcigar, mismatches, sam_nm, antisense_splice);
2095 unsigned char num_splice_anchor_mismatches = 0;
2096
2097 //##############################################
2098
2099 // Add this alignment to the table of hits for this half of the
2100 // Bowtie map
2101
2102 // Parse the text_name field to recover the splice coords
2103 vector<string> toks;
2104 tokenize_strict(text_name.c_str(), "|", toks);
2105 int num_extra_toks = (int)toks.size() - 6;
2106 if (num_extra_toks >= 0)
2107 {
2108 static const uint8_t left_window_edge_field = 1;
2109 static const uint8_t splice_field = 2;
2110 //static const uint8_t right_window_edge_field = 3;
2111 static const uint8_t junction_type_field = 4;
2112 static const uint8_t strand_field = 5;
2113
2114 string contig = toks[0];
2115 for (int t = 1; t <= num_extra_toks; ++t)
2116 {
2117 contig += "|";
2118 contig += toks[t];
2119 }
2120
2121 vector<string> splice_toks;
2122 tokenize(toks[num_extra_toks + splice_field], "-", splice_toks);
2123 if (splice_toks.size() != 2)
2124 {
2125 fprintf(stderr, "Warning: found malformed splice record, skipping:\n");
2126 //fprintf(stderr, "\t%s (token: %s)\n", text_name,
2127 // toks[num_extra_toks + splice_field].c_str());
2128 return false;
2129 }
2130
2131 const string& junction_type = toks[num_extra_toks + junction_type_field];
2132 const string junction_strand = toks[num_extra_toks + strand_field];
2133
2134 //
2135 // check for an insertion hit
2136 //
2137 if(junction_type == "ins")
2138 {
2139 //int8_t spliced_read_len = strlen(seq_str);
2140 //TODO FIXME: use the CIGAR instead of seq length!
2141 // The 0-based position of the left edge of the alignment. Note that this
2142 // value may need to be further corrected to account for the presence of
2143 // of the insertion.
2144 int left = atoi(toks[num_extra_toks + left_window_edge_field].c_str()) + text_offset;
2145
2146 // The 0-based position of the last genomic sequence before the insertion
2147 int left_splice_pos = atoi(splice_toks[0].c_str());
2148
2149 string insertedSequence = splice_toks[1];
2150 // The 0-based position of the first genomic sequence after the insertion
2151
2152 vector<CigarOp> splcigar;
2153 //this also updates left to the adjusted genomic coordinates
2154 int spl_num_mismatches=0;
2155 bool overlapped=spliceCigar(splcigar, samcigar, mismatches,
2156 left, left_splice_pos+1, insertedSequence.length(), INS, spl_num_mismatches);
2157
2158 if (!overlapped)
2159 return false;
2160
2161 if (spl_num_mismatches<0) return false;
2162 num_mismatches-=spl_num_mismatches;
2163 bh = create_hit(name,
2164 contig,
2165 "",
2166 left,
2167 splcigar,
2168 sam_flag & 0x0010,
2169 junction_strand == "rev",
2170 num_mismatches,
2171 0,
2172 end);
2173
2174 // daehwan - remove this
2175 if (samcigar.size() > 1 && false)
2176 {
2177 cout << text_name << "\t" << left << endl
2178 << print_cigar(samcigar) << endl
2179 << print_cigar(splcigar) << endl;
2180 }
2181
2182 return true;
2183 } //"ins"
2184 else //"del", "intron", or "fusion"
2185 {
2186 char orientation = (sam_flag & 0x0010 ? '-' : '+');
2187 if (!(junction_strand == "ff" || junction_strand == "fr" || junction_strand == "rf" || junction_strand == "rr" || junction_strand == "rev" || junction_strand == "fwd")||
2188 !(orientation == '-' || orientation == '+'))
2189 {
2190 fprintf(stderr, "Warning: found malformed splice record, skipping\n");
2191 return false;
2192 }
2193
2194 // The 0-based position of the left edge of the alignment.
2195 int left = atoi(toks[num_extra_toks + left_window_edge_field].c_str());
2196 if (junction_type != "fus" || (junction_strand != "rf" && junction_strand != "rr"))
2197 left += text_offset;
2198 else
2199 left -= text_offset;
2200
2201 vector<CigarOp> splcigar;
2202 CigarOpCode opcode;
2203 if(junction_type == "del")
2204 opcode = DEL;
2205 else if(junction_type == "fus")
2206 {
2207 if (junction_strand == "ff")
2208 opcode = FUSION_FF;
2209 else if (junction_strand == "fr")
2210 opcode = FUSION_FR;
2211 else if (junction_strand == "rf")
2212 opcode = FUSION_RF;
2213 else
2214 opcode = FUSION_RR;
2215 }
2216 else
2217 opcode = REF_SKIP;
2218
2219 int left_splice_pos = atoi(splice_toks[0].c_str());
2220
2221 // The 0-based position of the last genomic sequence before the deletion
2222 int gap_len = 0;
2223 if (junction_type == "fus")
2224 gap_len = atoi(splice_toks[1].c_str());
2225 else
2226 gap_len = atoi(splice_toks[1].c_str()) - left_splice_pos - 1;
2227
2228 if (opcode == FUSION_RF || opcode == FUSION_RR)
2229 left_splice_pos -= 1;
2230 else
2231 left_splice_pos += 1;
2232
2233 int spl_num_mismatches=0;
2234 bool overlapped=spliceCigar(splcigar, samcigar, mismatches, left,
2235 left_splice_pos, gap_len, opcode, spl_num_mismatches);
2236
2237 if (!overlapped)
2238 return false;
2239
2240 if (spl_num_mismatches<0) // || spl_num_mismatches>max_anchor_mismatches)
2241 return false;
2242
2243 string contig2 = "";
2244 if (junction_type == "fus")
2245 {
2246 vector<string> contigs;
2247 tokenize(contig, "-", contigs);
2248 if (contigs.size() != 2)
2249 return false;
2250
2251 contig = contigs[0];
2252 contig2 = contigs[1];
2253
2254 if (junction_strand == "rf" || junction_strand == "rr")
2255 orientation = (orientation == '+' ? '-' : '+');
2256 }
2257
2258 bh = create_hit(name,
2259 contig,
2260 contig2,
2261 left,
2262 splcigar,
2263 orientation == '-',
2264 junction_strand == "rev",
2265 num_mismatches,
2266 spl_num_mismatches,
2267 end);
2268
2269 // daehwan - remove this
2270 if (samcigar.size() > 1 && false)
2271 {
2272 cout << text_name << "\t" << left << endl
2273 << print_cigar(samcigar) << endl
2274 << print_cigar(splcigar) << endl;
2275 }
2276
2277 return true;
2278 }
2279 } //parse splice data
2280 else
2281 {
2282 fprintf(stderr, "Warning: found malformed splice record, skipping\n");
2283 //fprintf(stderr, "%s\n", orig_bwt_buf);
2284 // continue;
2285 return false;
2286 }
2287
2288 return false;
2289 }
2290
2291 void get_mapped_reads(FILE* bwtf,
2292 HitTable& hits,
2293 HitFactory& hit_factory,
2294 bool strip_slash,
2295 bool verbose)
2296 {
2297
2298
2299 char bwt_buf[2048];
2300 uint32_t reads_extracted = 0;
2301
2302 while (fgets(bwt_buf, 2048, bwtf))
2303 {
2304 // Chomp the newline
2305 char* nl = strrchr(bwt_buf, '\n');
2306 if (nl) *nl = 0;
2307 if (*bwt_buf == 0)
2308 continue;
2309 // Get a new record from the tab-delimited Bowtie map
2310 BowtieHit bh;
2311 if (hit_factory.get_hit_from_buf(bwt_buf, bh, strip_slash))
2312 {
2313 // Only check uniqueness if these hits are spliced
2314 hits.add_hit(bh, true);
2315 }
2316 reads_extracted++;
2317 }
2318
2319 // This will sort the map by insert id.
2320 hits.finalize();
2321 fprintf(stderr, "Extracted %d alignments from Bowtie map\n", reads_extracted);
2322 }
2323
2324
2325 /*
2326 AlignStatus status(const BowtieHit* align)
2327 {
2328 if (!align)
2329 return UNALIGNED;
2330 if (align->contiguous())
2331 return CONTIGUOUS;
2332 return SPLICED;
2333 }
2334 */
2335
2336 void add_hits_to_coverage(const HitList& hits, vector<unsigned short>& DoC)
2337 {
2338 int max_hit_pos = -1;
2339 for (size_t i = 0; i < hits.size(); ++i)
2340 {
2341 max_hit_pos = max((int)hits[i].right(),max_hit_pos);
2342 }
2343
2344 if ((int)DoC.size() < max_hit_pos)
2345 DoC.resize(max_hit_pos);
2346
2347 for (size_t i = 0; i < hits.size(); ++i)
2348 {
2349 const BowtieHit& bh = hits[i];
2350
2351 // split up the coverage contibution for this reads
2352 size_t j = bh.left();
2353 const vector<CigarOp>& cigar = bh.cigar();
2354
2355 for (size_t c = 0 ; c < cigar.size(); ++c)
2356 {
2357 switch(cigar[c].opcode)
2358 {
2359 case MATCH:
2360 for (size_t m = 0; m < cigar[c].length; ++m)
2361 {
2362 if (DoC[j + m] < 0xFFFF)
2363 DoC[j + m]++;
2364 }
2365 //fall through this case to REF_SKIP is intentional
2366 case REF_SKIP:
2367 j += cigar[c].length;
2368 break;
2369 default:
2370 break;
2371 }
2372
2373 }
2374 }
2375 }
2376
2377 void add_hit_to_coverage(const BowtieHit& bh, vector<unsigned int>& DoC)
2378 {
2379 if ((int)DoC.size() < bh.right())
2380 DoC.resize(bh.right());
2381
2382 // split up the coverage contibution for this reads
2383 size_t j = bh.left();
2384 const vector<CigarOp>& cigar = bh.cigar();
2385
2386 for (size_t c = 0 ; c < cigar.size(); ++c)
2387 {
2388 switch(cigar[c].opcode)
2389 {
2390 case MATCH:
2391 for (size_t m = 0; m < cigar[c].length; ++m)
2392 {
2393 if (DoC[j + m] < VMAXINT32)
2394 DoC[j + m]++;
2395 }
2396 //fall through this case to REF_SKIP is intentional
2397 case REF_SKIP:
2398 j += cigar[c].length;
2399 break;
2400 default:
2401 break;
2402 }
2403
2404 }
2405 }
2406
2407 void print_bamhit(GBamWriter& wbam,
2408 const char* read_name,
2409 const BowtieHit& bh,
2410 const char* ref_name,
2411 const char* ref_name2,
2412 const char* sequence,
2413 const char* qualities,
2414 bool from_bowtie,
2415 const vector<string>* extra_fields)
2416 {
2417 string seq;
2418 string quals;
2419 if (sequence) {
2420 seq = sequence;
2421 quals = qualities;
2422 seq.resize(bh.read_len());
2423 quals.resize(bh.read_len());
2424 }
2425 else {
2426 seq = "*";
2427 }
2428 if (qualities) {
2429 quals = qualities;
2430 quals.resize(bh.read_len());
2431 }
2432 else {
2433 quals = "*";
2434 }
2435
2436 uint32_t sam_flag = 0;
2437 if (bh.antisense_align())
2438 {
2439 sam_flag |= 0x0010; // BAM_FREVERSE
2440 if (sequence && !from_bowtie) // if it is from bowtie hit, it's already reversed.
2441 {
2442 reverse_complement(seq);
2443 reverse(quals.begin(), quals.end());
2444 }
2445 }
2446
2447 uint32_t sam_pos = bh.left() + 1;
2448 uint32_t map_quality = 255;
2449 char cigar[256];
2450 cigar[0] = 0;
2451 string mate_ref_name = "*";
2452 uint32_t mate_pos = 0;
2453 uint32_t insert_size = 0;
2454 //string qualities = "*";
2455
2456 const vector<CigarOp>& bh_cigar = bh.cigar();
2457 /*
2458 * In addition to calculating the cigar string,
2459 * we need to figure out how many in/dels are in the
2460 * sequence, so that we can give the correct
2461 * value for the NM tag
2462 */
2463 int indel_distance = 0;
2464 CigarOpCode fusion_dir = FUSION_NOTHING;
2465 for (size_t c = 0; c < bh_cigar.size(); ++c)
2466 {
2467 const CigarOp& op = bh_cigar[c];
2468
2469 char ibuf[64];
2470 sprintf(ibuf, "%d", op.length);
2471 switch(op.opcode)
2472 {
2473 case MATCH:
2474 case mATCH:
2475 strcat(cigar, ibuf);
2476 if (bh_cigar[c].opcode == MATCH)
2477 strcat(cigar, "M");
2478 else
2479 strcat(cigar, "m");
2480 break;
2481 case INS:
2482 case iNS:
2483 strcat(cigar, ibuf);
2484 if (bh_cigar[c].opcode == INS)
2485 strcat(cigar, "I");
2486 else
2487 strcat(cigar, "i");
2488 indel_distance += bh_cigar[c].length;
2489 break;
2490 case DEL:
2491 case dEL:
2492 strcat(cigar, ibuf);
2493 if (bh_cigar[c].opcode == DEL)
2494 strcat(cigar, "D");
2495 else
2496 strcat(cigar, "d");
2497 indel_distance += bh_cigar[c].length;
2498 break;
2499 case REF_SKIP:
2500 case rEF_SKIP:
2501 strcat(cigar, ibuf);
2502 if (bh_cigar[c].opcode == REF_SKIP)
2503 strcat(cigar, "N");
2504 else
2505 strcat(cigar, "n");
2506 break;
2507 case FUSION_FF:
2508 case FUSION_FR:
2509 case FUSION_RF:
2510 case FUSION_RR:
2511 fusion_dir = op.opcode;
2512 sprintf(ibuf, "%d", bh_cigar[c].length + 1);
2513 strcat(cigar, ibuf);
2514 strcat(cigar, "F");
2515 break;
2516 default:
2517 break;
2518 }
2519 }
2520
2521 char cigar1[256] = {0}, cigar2[256] = {0};
2522 string left_seq, right_seq, left_qual, right_qual;
2523 int left1 = -1, left2 = -1;
2524 extract_partial_hits(bh, seq, quals,
2525 cigar1, cigar2, left_seq, right_seq,
2526 left_qual, right_qual, left1, left2);
2527
2528 bool containsSplice = false;
2529 for (vector<CigarOp>::const_iterator itr = bh.cigar().begin(); itr != bh.cigar().end(); ++itr)
2530 {
2531 if (itr->opcode == REF_SKIP || itr->opcode == rEF_SKIP)
2532 {
2533 containsSplice = true;
2534 break;
2535 }
2536 }
2537
2538 vector<string> auxdata;
2539 if (extra_fields)
2540 auxdata.insert(auxdata.end(), extra_fields->begin(), extra_fields->end());
2541
2542 if (!sam_readgroup_id.empty())
2543 {
2544 string nm("RG:Z:");
2545 nm += sam_readgroup_id;
2546 auxdata.push_back(nm);
2547 }
2548
2549 string nm("NM:i:");
2550 str_appendInt(nm, bh.edit_dist() + indel_distance);
2551 auxdata.push_back(nm);
2552
2553 if (containsSplice) {
2554 // do not add more than once
2555 bool XS_found = false;
2556 for (size_t i = 0; i < auxdata.size(); ++i)
2557 {
2558 if (auxdata[i].substr(0, 2) == "XS")
2559 {
2560 XS_found = true;
2561 break;
2562 }
2563 }
2564
2565 if (!XS_found)
2566 {
2567 nm="XS:A:";
2568 nm+=(char)(bh.antisense_splice() ? '-' : '+');
2569 auxdata.push_back(nm);
2570 }
2571 }
2572
2573 if (fusion_dir != FUSION_NOTHING)
2574 {
2575 char XF[2048] = {0};
2576 sprintf(XF,
2577 "XF:Z:1 %s-%s %u %s %s %s",
2578 ref_name,
2579 ref_name2,
2580 sam_pos,
2581 cigar,
2582 seq.c_str(),
2583 quals.c_str());
2584 auxdata.push_back(XF);
2585
2586 GBamRecord *brec = wbam.new_record(read_name, sam_flag, ref_name, left1 + 1, map_quality,
2587 cigar1, mate_ref_name.c_str(), mate_pos,
2588 insert_size, left_seq.c_str(), left_qual.c_str(), &auxdata);
2589
2590 wbam.write(brec);
2591 delete brec;
2592
2593 sprintf(XF,
2594 "XF:Z:2 %s-%s %u %s %s %s",
2595 ref_name,
2596 ref_name2,
2597 sam_pos,
2598 cigar,
2599 seq.c_str(),
2600 quals.c_str());
2601 auxdata.back() = XF;
2602
2603 brec = wbam.new_record(read_name, sam_flag, ref_name2, left2 + 1, map_quality,
2604 cigar2, mate_ref_name.c_str(), mate_pos,
2605 insert_size, right_seq.c_str(), right_qual.c_str(), &auxdata);
2606
2607 wbam.write(brec);
2608 delete brec;
2609 }
2610 else
2611 {
2612 GBamRecord *brec = wbam.new_record(read_name, sam_flag, ref_name, sam_pos, map_quality,
2613 cigar, mate_ref_name.c_str(), mate_pos,
2614 insert_size, seq.c_str(), quals.c_str(), &auxdata);
2615
2616 wbam.write(brec);
2617 delete brec;
2618 }
2619 }
2620
2621 /**
2622 * Print a vector of cigar operations to a file.
2623 * @param bh_cigar A vector of CigarOps
2624 * @return a string representation of the cigar string
2625 */
2626 std::string print_cigar(const vector<CigarOp>& bh_cigar){
2627 char cigar[256];
2628 cigar[0] = 0;
2629 for (size_t c = 0; c < bh_cigar.size(); ++c)
2630 {
2631 char ibuf[64];
2632 sprintf(ibuf, "%d", bh_cigar[c].length);
2633 strcat(cigar, ibuf);
2634 switch(bh_cigar[c].opcode)
2635 {
2636 case MATCH:
2637 strcat(cigar, "M");
2638 break;
2639 case mATCH:
2640 strcat(cigar, "m");
2641 break;
2642 case INS:
2643 strcat(cigar, "I");
2644 break;
2645 case iNS:
2646 strcat(cigar, "i");
2647 break;
2648 case DEL:
2649 strcat(cigar, "D");
2650 break;
2651 case dEL:
2652 strcat(cigar, "d");
2653 break;
2654 case REF_SKIP:
2655 strcat(cigar, "N");
2656 break;
2657 case rEF_SKIP:
2658 strcat(cigar, "n");
2659 break;
2660 case FUSION_FF:
2661 case FUSION_FR:
2662 case FUSION_RF:
2663 case FUSION_RR:
2664 strcat(cigar, "F");
2665 break;
2666 default:
2667 break;
2668 }
2669 }
2670 string result(cigar);
2671 return result;
2672 }
2673
2674 void extract_partial_hits(const BowtieHit& bh, const string& seq, const string& qual,
2675 char* cigar1, char* cigar2, string& seq1, string& seq2,
2676 string& qual1, string& qual2, int& left1, int& left2)
2677 {
2678 const int left = bh.left();
2679 int right = left;
2680 int fusion_left = -1, fusion_right = -1;
2681
2682 const vector<CigarOp>& bh_cigar = bh.cigar();
2683
2684 CigarOpCode fusion_dir = FUSION_NOTHING;
2685 size_t fusion_idx = 0;
2686 size_t left_part_len = 0;
2687 for (size_t c = 0; c < bh_cigar.size(); ++c)
2688 {
2689 const CigarOp& op = bh_cigar[c];
2690 switch(op.opcode)
2691 {
2692 case MATCH:
2693 case REF_SKIP:
2694 case DEL:
2695 right += op.length;
2696 break;
2697 case mATCH:
2698 case rEF_SKIP:
2699 case dEL:
2700 right -= op.length;
2701 break;
2702 case FUSION_FF:
2703 case FUSION_FR:
2704 case FUSION_RF:
2705 case FUSION_RR:
2706 {
2707 fusion_dir = op.opcode;
2708 fusion_idx = c;
2709 if (op.opcode == FUSION_FF || op.opcode == FUSION_FR)
2710 fusion_left = right - 1;
2711 else
2712 fusion_left = right + 1;
2713 fusion_right = right = op.length;
2714 }
2715 break;
2716 default:
2717 break;
2718 }
2719
2720 if (fusion_dir == FUSION_NOTHING)
2721 {
2722 if (op.opcode == MATCH || op.opcode == mATCH || op.opcode == INS || op.opcode == iNS)
2723 {
2724 left_part_len += op.length;
2725 }
2726 }
2727 }
2728
2729 if (fusion_dir == FUSION_FF || fusion_dir == FUSION_FR)
2730 {
2731 for (size_t c = 0; c < fusion_idx; ++c)
2732 {
2733 const CigarOp& op = bh_cigar[c];
2734 char ibuf[64];
2735 sprintf(ibuf, "%d", op.length);
2736 strcat(cigar1, ibuf);
2737
2738 switch (op.opcode)
2739 {
2740 case MATCH:
2741 strcat(cigar1, "M");
2742 break;
2743 case INS:
2744 strcat(cigar1, "I");
2745 break;
2746 case DEL:
2747 strcat(cigar1, "D");
2748 break;
2749 case REF_SKIP:
2750 strcat(cigar1, "N");
2751 break;
2752 default:
2753 assert (0);
2754 break;
2755 }
2756 }
2757 }
2758 else if (fusion_dir == FUSION_RF || fusion_dir == FUSION_RR)
2759 {
2760 assert (fusion_idx > 0);
2761 for (int c = fusion_idx - 1; c >=0; --c)
2762 {
2763 const CigarOp& op = bh_cigar[c];
2764 char ibuf[64];
2765 sprintf(ibuf, "%d", op.length);
2766 strcat(cigar1, ibuf);
2767
2768 switch (op.opcode)
2769 {
2770 case mATCH:
2771 strcat(cigar1, "M");
2772 break;
2773 case iNS:
2774 strcat(cigar1, "I");
2775 break;
2776 case dEL:
2777 strcat(cigar1, "D");
2778 break;
2779 case rEF_SKIP:
2780 strcat(cigar1, "N");
2781 break;
2782 default:
2783 assert (0);
2784 break;
2785 }
2786 }
2787 }
2788
2789 if (fusion_dir == FUSION_FF || fusion_dir == FUSION_RF)
2790 {
2791 for (size_t c = fusion_idx + 1; c < bh_cigar.size(); ++c)
2792 {
2793 const CigarOp& op = bh_cigar[c];
2794 char ibuf[64];
2795 sprintf(ibuf, "%d", op.length);
2796 strcat(cigar2, ibuf);
2797
2798 switch (op.opcode)
2799 {
2800 case MATCH:
2801 strcat(cigar2, "M");
2802 break;
2803 case INS:
2804 strcat(cigar2, "I");
2805 break;
2806 case DEL:
2807 strcat(cigar2, "D");
2808 break;
2809 case REF_SKIP:
2810 strcat(cigar2, "N");
2811 break;
2812 default:
2813 assert (0);
2814 break;
2815 }
2816 }
2817 }
2818 else if (fusion_dir == FUSION_FR || fusion_dir == FUSION_RR)
2819 {
2820 assert (bh_cigar.size() > 0);
2821 for (size_t c = bh_cigar.size() - 1; c > fusion_idx; --c)
2822 {
2823 const CigarOp& op = bh_cigar[c];
2824 char ibuf[64];
2825 sprintf(ibuf, "%d", op.length);
2826 strcat(cigar2, ibuf);
2827
2828 switch (op.opcode)
2829 {
2830 case mATCH:
2831 strcat(cigar2, "M");
2832 break;
2833 case iNS:
2834 strcat(cigar2, "I");
2835 break;
2836 case dEL:
2837 strcat(cigar2, "D");
2838 break;
2839 case rEF_SKIP:
2840 strcat(cigar2, "N");
2841 break;
2842 default:
2843 assert (0);
2844 break;
2845 }
2846 }
2847 }
2848
2849 if (fusion_dir != FUSION_NOTHING)
2850 {
2851 seq1 = seq.substr(0, left_part_len);
2852 qual1 = qual.substr(0, left_part_len);
2853
2854 if (fusion_dir == FUSION_RF || fusion_dir == FUSION_RR)
2855 {
2856 reverse_complement(seq1);
2857 reverse(qual1.begin(), qual1.end());
2858 }
2859
2860 seq2 = seq.substr(left_part_len);
2861 qual2 = qual.substr(left_part_len);
2862
2863 if (fusion_dir == FUSION_FR || fusion_dir == FUSION_RR)
2864 {
2865 reverse_complement(seq2);
2866 reverse(qual2.begin(), qual2.end());
2867 }
2868
2869 left1 = ((fusion_dir == FUSION_FF || fusion_dir == FUSION_FR) ? left : fusion_left);
2870 left2 = ((fusion_dir == FUSION_FF || fusion_dir == FUSION_RF) ? fusion_right : right + 1);
2871 }
2872 }
2873
2874
2875 bool BowtieHit::check_editdist_consistency(const RefSequenceTable& rt, bool bDebug)
2876 {
2877 RefSequenceTable::Sequence* ref_str1 = rt.get_seq(_ref_id);
2878 RefSequenceTable::Sequence* ref_str2 = rt.get_seq(_ref_id2);
2879
2880 if (!ref_str1 || !ref_str2)
2881 return false;
2882
2883 if (bDebug)
2884 {
2885 cout << "check_editdist_consistency" << endl
2886 << "insert id: " << _insert_id << endl;
2887 }
2888
2889 RefSequenceTable::Sequence* ref_str = ref_str1;
2890
2891 size_t pos_seq = 0;
2892 size_t pos_ref = _left;
2893 size_t mismatch = 0;
2894 bool bSawFusion = false;
2895 for (size_t i = 0; i < _cigar.size(); ++i)
2896 {
2897 CigarOp cigar = _cigar[i];
2898 switch(cigar.opcode)
2899 {
2900 case MATCH:
2901 {
2902 seqan::Dna5String ref_seq = seqan::infix(*ref_str, pos_ref, pos_ref + cigar.length);
2903 for (size_t j = 0; j < cigar.length; ++j)
2904 {
2905 seqan::Dna5 ref_nt = _seq[pos_seq];
2906 if (ref_nt != ref_seq[j])
2907 ++mismatch;
2908
2909 if (bDebug)
2910 cout << pos_seq << "\t" << ref_nt << " vs. " << ref_seq[j] << "\t" << mismatch << endl;
2911
2912 ++pos_seq;
2913 }
2914
2915 pos_ref += cigar.length;
2916 }
2917 break;
2918 case mATCH:
2919 {
2920 seqan::Dna5String ref_seq = seqan::infix(*ref_str, pos_ref - cigar.length + 1, pos_ref + 1);
2921 seqan::convertInPlace(ref_seq, seqan::FunctorComplement<seqan::Dna>());
2922 seqan::reverseInPlace(ref_seq);
2923
2924 for (size_t j = 0; j < cigar.length; ++j)
2925 {
2926 seqan::Dna5 ref_nt = _seq[pos_seq];
2927 if (ref_nt != ref_seq[j])
2928 ++mismatch;
2929
2930 if (bDebug)
2931 cout << pos_seq << "\t" << ref_nt << " vs. " << ref_seq[j] << "\t" << mismatch << endl;
2932
2933 ++pos_seq;
2934 }
2935
2936 pos_ref -= cigar.length;
2937 }
2938 break;
2939 case INS:
2940 case iNS:
2941 {
2942 pos_seq += cigar.length;
2943 }
2944 break;
2945
2946 case DEL:
2947 case REF_SKIP:
2948 {
2949 pos_ref += cigar.length;
2950 }
2951 break;
2952
2953 case dEL:
2954 case rEF_SKIP:
2955 {
2956 pos_ref -= cigar.length;
2957 }
2958 break;
2959
2960 case FUSION_FF:
2961 case FUSION_FR:
2962 case FUSION_RF:
2963 case FUSION_RR:
2964 {
2965 // We don't allow a read spans more than two chromosomes.
2966 if (bSawFusion)
2967 return false;
2968
2969 ref_str = ref_str2;
2970 pos_ref = cigar.length;
2971
2972 bSawFusion = true;
2973 }
2974 break;
2975
2976 default:
2977 break;
2978 }
2979 }
2980
2981 if (bDebug)
2982 cout << "mismatch (real) vs. (calculated):" << mismatch << " vs. " << (int)_edit_dist << endl;
2983
2984 return mismatch == _edit_dist;
2985 }
2986
2987 void bowtie_sam_extra(const BowtieHit& bh, const RefSequenceTable& rt, vector<string>& fields)
2988 {
2989 RefSequenceTable::Sequence* ref_str1 = rt.get_seq(bh.ref_id());
2990 RefSequenceTable::Sequence* ref_str2 = rt.get_seq(bh.ref_id2());
2991
2992 if (!ref_str1 || !ref_str2)
2993 return;
2994
2995 RefSequenceTable::Sequence* ref_str = ref_str1;
2996
2997 size_t pos_seq = 0;
2998 size_t pos_mismatch = 0;
2999 size_t pos_ref = bh.left();
3000 size_t mismatch = 0;
3001 size_t num_gap_opens = 0;
3002 size_t num_gap_conts = 0;
3003 bool bSawFusion = false;
3004
3005 int AS_score = 0;
3006
3007 const vector<CigarOp>& cigars = bh.cigar();
3008 const string& seq = bh.seq();
3009 const string& qual = bh.qual();
3010
3011 string AS = "AS:i:";
3012 string MD = "MD:Z:";
3013
3014 for (size_t i = 0; i < cigars.size(); ++i)
3015 {
3016 CigarOp cigar = cigars[i];
3017 switch(cigar.opcode)
3018 {
3019 case MATCH:
3020 case mATCH:
3021 {
3022 seqan::Dna5String ref_seq;
3023 if (cigar.opcode == MATCH)
3024 {
3025 ref_seq = seqan::infix(*ref_str, pos_ref, pos_ref + cigar.length);
3026 pos_ref += cigar.length;
3027 }
3028 else
3029 {
3030 ref_seq = seqan::infix(*ref_str, pos_ref - cigar.length + 1, pos_ref + 1);
3031 seqan::convertInPlace(ref_seq, seqan::FunctorComplement<seqan::Dna>());
3032 seqan::reverseInPlace(ref_seq);
3033 pos_ref -= cigar.length;
3034 }
3035
3036 for (size_t j = 0; j < cigar.length; ++j)
3037 {
3038 seqan::Dna5 ref_nt = ref_seq[j];
3039 if (seq[pos_seq] != ref_nt)
3040 {
3041 ++mismatch;
3042
3043 if (pos_seq < qual.length())
3044 {
3045 float penalty = (bowtie2_max_penalty - bowtie2_min_penalty) * min((int)(qual[pos_seq] - '!'), 40) / 40.0;
3046 AS_score -= (int)penalty;
3047 }
3048
3049 str_appendInt(MD, (int)pos_mismatch);
3050 MD.push_back((char)ref_nt);
3051 pos_mismatch = 0;
3052 }
3053 else
3054 ++pos_mismatch;
3055
3056 ++pos_seq;
3057 }
3058 }
3059 break;
3060
3061 case INS:
3062 case iNS:
3063 {
3064 pos_seq += cigar.length;
3065
3066 AS_score -= bowtie2_read_gap_open;
3067 AS_score -= (int)(bowtie2_read_gap_cont * cigar.length);
3068
3069 num_gap_opens += 1;
3070 num_gap_conts += cigar.length;
3071 }
3072 break;
3073
3074 case DEL:
3075 case dEL:
3076 {
3077 AS_score -= bowtie2_ref_gap_open;
3078 AS_score -= (int)(bowtie2_ref_gap_cont * cigar.length);
3079
3080 num_gap_opens += 1;
3081 num_gap_conts += cigar.length;
3082
3083 seqan::Dna5String ref_seq;
3084 if (cigar.opcode == DEL)
3085 {
3086 ref_seq = seqan::infix(*ref_str, pos_ref, pos_ref + cigar.length);
3087 pos_ref += cigar.length;
3088 }
3089 else
3090 {
3091 ref_seq = seqan::infix(*ref_str, pos_ref - cigar.length + 1, pos_ref + 1);
3092 seqan::convertInPlace(ref_seq, seqan::FunctorComplement<seqan::Dna>());
3093 seqan::reverseInPlace(ref_seq);
3094 pos_ref -= cigar.length;
3095 }
3096
3097 str_appendInt(MD, (int)pos_mismatch);
3098 MD.push_back('^');
3099 for (size_t k = 0; k < length(ref_seq); ++k)
3100 MD.push_back((char)ref_seq[k]);
3101
3102 pos_mismatch = 0;
3103 }
3104 break;
3105
3106 case REF_SKIP:
3107 case rEF_SKIP:
3108 {
3109 if (cigar.opcode == REF_SKIP)
3110 pos_ref += cigar.length;
3111 else
3112 pos_ref -= cigar.length;
3113 }
3114 break;
3115
3116 case FUSION_FF:
3117 case FUSION_FR:
3118 case FUSION_RF:
3119 case FUSION_RR:
3120 {
3121 // We don't allow a read spans more than two chromosomes.
3122 if (bSawFusion)
3123 return;
3124
3125 ref_str = ref_str2;
3126 pos_ref = cigar.length;
3127
3128 bSawFusion = true;
3129 }
3130 break;
3131
3132 default:
3133 break;
3134 }
3135 }
3136
3137 str_appendInt(AS, AS_score);
3138 fields.push_back(AS);
3139
3140 string XM = "XM:i:";
3141 str_appendInt(XM, (int)mismatch);
3142 fields.push_back(XM);
3143
3144 string XO = "XO:i:";
3145 str_appendInt(XO, (int)num_gap_opens);
3146 fields.push_back(XO);
3147
3148 string XG = "XG:i:";
3149 str_appendInt(XG, (int)num_gap_conts);
3150 fields.push_back(XG);
3151
3152 str_appendInt(MD, (int)pos_mismatch);
3153 fields.push_back(MD);
3154 }