ViewVC Help
View File | Revision Log | Show Annotations | View Changeset | Root Listing
root/gclib/tophat_cpp/bwt_map.cpp
Revision: 138
Committed: Wed Dec 14 22:58:39 2011 UTC (8 years, 5 months ago) by gpertea
File size: 56204 byte(s)
Log Message:
Line File contents
1 /*
2 * bwt_map.cpp
3 * TopHat
4 *
5 * Created by Cole Trapnell on 11/17/08.
6 * Copyright 2008 Cole Trapnell. All rights reserved.
7 *
8 */
9
10 #ifdef HAVE_CONFIG_H
11 #include <config.h>
12 #endif
13
14 #include <cassert>
15 #include <cstdlib>
16 #include <iostream>
17 #include <set>
18 #include <vector>
19 #include <cmath>
20
21 #include "common.h"
22 #include "bwt_map.h"
23 #include "tokenize.h"
24 #include "reads.h"
25
26 using namespace std;
27
28 void HitTable::add_hit(const BowtieHit& bh, bool check_uniqueness)
29 {
30 uint32_t reference_id = bh.ref_id();
31
32 pair<RefHits::iterator, bool> ret =
33 _hits_for_ref.insert(make_pair(reference_id, HitList()));
34 HitList& hl = ret.first->second;
35
36 if (check_uniqueness)
37 {
38 // Check uniqueness, in case we are adding spliced hits from
39 // several spliced alignment sources (e.g. de novo hashing + Bowtie
40 // against a user-supplied index). We don't want to count the same
41 // alignment twice if it happened to be found by more than one method
42 HitList::const_iterator lb = lower_bound(hl.begin(), hl.end(), bh, hit_insert_id_lt);
43 HitList::const_iterator ub = upper_bound(hl.begin(), hl.end(), bh, hit_insert_id_lt);
44 for (; lb != ub && lb != hl.end(); ++lb)
45 {
46 if (*lb == bh)
47 {
48 //fprintf(stderr, "Chucking duplicate read %d by identity\n", bh.insert_id());
49 return;
50 }
51
52 if (lb->insert_id() == bh.insert_id() &&
53 lb->ref_id() == bh.ref_id() &&
54 lb->antisense_align() == bh.antisense_align())
55 {
56 // If we get here, we may be looking at the same alignment
57 // However, spanning_reads may report a shorter, trimmed alignment
58 // so not all fields will be equal. If they just disagree on the
59 // ends, and don't indicate a different junction coord, the
60 // alignments are the same.
61
62 if ((lb->left() <= bh.left() && lb->right() >= bh.right()) ||
63 (bh.left() <= lb->left() && bh.right() >= lb->right()))
64 {
65 vector<pair<int,int> > lb_gaps, bh_gaps;
66 lb->gaps(lb_gaps);
67 bh.gaps(bh_gaps);
68 if (lb_gaps == bh_gaps)
69 {
70 // One alignment is contained in the other, they agree on
71 // where the gaps, if any, are, and they share an id
72 // => this is a redundant aligment, so toss it
73 //fprintf(stderr, "Chucking duplicate read %d by gap agreement\n", bh.insert_id());
74 return;
75 }
76 }
77 }
78 }
79 }
80 _total_hits++;
81 hl.push_back(bh);
82 }
83
84 bool hit_insert_id_lt(const BowtieHit& h1, const BowtieHit& h2)
85 {
86 return h1.insert_id() < h2.insert_id();
87 }
88
89 void LineHitFactory::openStream(HitStream& hs)
90 {
91 if (hs._hit_file==NULL && !hs._hit_file_name.empty()) {
92 //open the file for HitStream here
93 hs._hit_file=fopen(hs._hit_file_name.c_str(),"r");
94 if (hs._hit_file==NULL)
95 err_die("Error opening HitStream file %s\n",hs._hit_file_name.c_str());
96 return;
97 }
98 if (hs._fzpipe!=NULL) {
99 hs._hit_file=hs._fzpipe->file;
100 }
101 }
102
103 void LineHitFactory::rewind(HitStream& hs)
104 {
105 if (hs._fzpipe!=NULL) {
106 hs._fzpipe->rewind();
107 hs._hit_file=hs._fzpipe->file;
108 }
109 else if (hs._hit_file) ::rewind((FILE*)(hs._hit_file));
110 }
111
112 bool LineHitFactory::next_record(HitStream& hs, const char*& buf, size_t& buf_size) {
113 FILE* f=(FILE *)(hs._hit_file);
114 bool new_rec = (fgets(_hit_buf, _hit_buf_max_sz - 1, f)!=NULL);
115 if (!new_rec || feof(f)) {
116 hs._eof=true;
117 return false;
118 }
119 ++_line_num;
120 char* nl = strrchr(_hit_buf, '\n');
121 if (nl) *nl = 0;
122 buf = _hit_buf;
123 buf_size = _hit_buf_max_sz - 1;
124 return true;
125 }
126
127 void LineHitFactory::closeStream(HitStream& hs) {
128 if (hs._fzpipe!=NULL) {
129 hs._fzpipe->close();
130 return;
131 }
132 if (hs._hit_file!=NULL) {
133 fclose((FILE*)(hs._hit_file));
134 hs._hit_file=NULL;
135 }
136 }
137 void BAMHitFactory::openStream(HitStream& hs) {
138 if (hs._hit_file==NULL) {
139 if (hs._hit_file_name.empty())
140 //err_die("Error: invalid HitStream set for BAMHitFactory(file name missing)\n");
141 return; //invalid stream, could be just a place holder
142 //open the file here if not already open
143 string fext=getFext(hs._hit_file_name);
144 if (fext=="sam")
145 hs._hit_file = samopen(hs._hit_file_name.c_str(), "r", 0);
146 else
147 hs._hit_file = samopen(hs._hit_file_name.c_str(), "rb", 0);
148
149 samfile_t* sam_file=(samfile_t*)(hs._hit_file);
150
151 if (sam_file == NULL)
152 err_die("Error opening SAM file %s\n", hs._hit_file_name.c_str());
153 if (sam_file->header == NULL)
154 err_die("Error: no SAM header found for file %s\n", hs._hit_file_name.c_str());
155 memset(&_next_hit, 0, sizeof(_next_hit));
156 //_beginning = bgzf_tell(sam_file->x.bam);
157 _sam_header=sam_file->header;
158 if (inspect_header(hs) == false)
159 err_die("Error: invalid SAM header for file %s\n",
160 hs._hit_file_name.c_str());
161 }
162 }
163
164 void BAMHitFactory::closeStream(HitStream& hs) {
165 if (hs._hit_file) {
166 samclose((samfile_t*)(hs._hit_file));
167 }
168 hs._hit_file=NULL;
169 _sam_header=NULL;
170 }
171
172 void BAMHitFactory::rewind(HitStream& hs)
173 {
174 /*
175 if (_hit_file && ((samfile_t*)_hit_file)->x.bam)
176 {
177 bgzf_seek(((samfile_t*)_hit_file)->x.bam, _beginning, SEEK_SET);
178 _eof = false;
179 }
180 */
181 this->closeStream(hs);
182 this->openStream(hs);
183 }
184
185 string BAMHitFactory::hitfile_rec(HitStream& hs, const char* hit_buf) {
186 const bam1_t* bamrec=(const bam1_t*)hit_buf;
187 char* tamline=bam_format1(((samfile_t*)(hs._hit_file))->header, bamrec);
188 string sam_line(tamline);
189 free(tamline);
190 return sam_line;
191 }
192
193 bool BAMHitFactory::next_record(HitStream& hs, const char*& buf, size_t& buf_size) {
194 if (_next_hit.data) {
195 free(_next_hit.data);
196 _next_hit.data = NULL;
197 }
198 _sam_header=((samfile_t*)(hs._hit_file))->header; //needed by get_hit_from_buf later on
199 if (hs.eof() || !hs.ready()) return false;
200
201 //mark_curr_pos();
202
203 memset(&_next_hit, 0, sizeof(_next_hit));
204
205 int bytes_read = samread((samfile_t*)(hs._hit_file), &_next_hit);
206 if (bytes_read <= 0) {
207 hs._eof = true;
208 return false;
209 }
210 buf = (const char*)&_next_hit;
211 buf_size = bytes_read;
212 return true;
213 }
214
215
216 BowtieHit HitFactory::create_hit(const string& insert_name,
217 const string& ref_name,
218 int left,
219 const vector<CigarOp>& cigar,
220 bool antisense_aln,
221 bool antisense_splice,
222 unsigned char edit_dist,
223 unsigned char splice_mms,
224 bool end)
225 {
226 uint64_t insert_id = _insert_table.get_id(insert_name);
227 uint32_t reference_id = _ref_table.get_id(ref_name, NULL, 0);
228
229 return BowtieHit(reference_id,
230 insert_id,
231 left,
232 cigar,
233 antisense_aln,
234 antisense_splice,
235 edit_dist,
236 splice_mms,
237 end);
238 }
239
240 BowtieHit HitFactory::create_hit(const string& insert_name,
241 const string& ref_name,
242 uint32_t left,
243 uint32_t read_len,
244 bool antisense_aln,
245 unsigned char edit_dist,
246 bool end)
247 {
248 uint64_t insert_id = _insert_table.get_id(insert_name);
249 uint32_t reference_id = _ref_table.get_id(ref_name, NULL, 0);
250
251 return BowtieHit(reference_id,
252 insert_id,
253 left,
254 read_len,
255 antisense_aln,
256 edit_dist,
257 end);
258 }
259
260 bool BowtieHitFactory::get_hit_from_buf(const char* orig_bwt_buf,
261 BowtieHit& bh,
262 bool strip_slash,
263 char* name_out,
264 char* name_tags,
265 char* seq,
266 char* qual)
267 {
268 if (!orig_bwt_buf || !*orig_bwt_buf)
269 return false;
270
271 static const int buf_size = 2048;
272
273 char bwt_buf[buf_size];
274 strcpy(bwt_buf, orig_bwt_buf);
275 //const char* bwt_fmt_str = "%s %c %s %d %s %s %d %s";
276
277 char orientation;
278
279 //int bwtf_ret = 0;
280 //uint32_t seqid = 0;
281
282 char text_name[buf_size];
283 unsigned int text_offset;
284
285
286 //unsigned int other_occs;
287 char mismatches[buf_size];
288 //memset(mismatches, 0, sizeof(mismatches));
289
290 const char* buf = bwt_buf;
291 char* name = get_token((char**)&buf,"\t");
292 char* orientation_str = get_token((char**)&buf,"\t");
293 char* text_name_str = get_token((char**)&buf,"\t");
294
295 strcpy(text_name, text_name_str);
296
297 char* text_offset_str = get_token((char**)&buf,"\t");
298 char* seq_str = get_token((char**)&buf,"\t");
299 if (seq)
300 strcpy(seq, seq_str);
301
302 const char* qual_str = get_token((char**)&buf,"\t");
303 if (qual)
304 strcpy(qual, qual_str);
305
306 /*const char* other_occs_str =*/ get_token((char**)&buf, "\t");
307 mismatches[0] = 0;
308 char* mismatches_str = get_token((char**)&buf, "\t");
309 if (mismatches_str)
310 strcpy(mismatches, mismatches_str);
311
312 orientation = orientation_str[0];
313 text_offset = atoi(text_offset_str);
314
315 bool end = true;
316 unsigned int seg_offset = 0;
317 unsigned int seg_num = 0;
318 unsigned int num_segs = 0;
319
320 // Copy the tag out of the name field before we might wipe it out
321 char* pipe = strrchr(name, '|');
322 if (pipe)
323 {
324 if (name_tags)
325 strcpy(name_tags, pipe);
326
327 char* tag_buf = pipe + 1;
328 if (strchr(tag_buf, ':'))
329 {
330 sscanf(tag_buf, "%u:%u:%u", &seg_offset, &seg_num, &num_segs);
331 if (seg_num + 1 == num_segs)
332 end = true;
333 else
334 end = false;
335 }
336
337 *pipe = 0;
338 }
339 // Stripping the slash and number following it gives the insert name
340 char* slash = strrchr(name, '/');
341 if (strip_slash && slash)
342 *slash = 0;
343
344 size_t read_len = strlen(seq_str);
345
346 // Add this alignment to the table of hits for this half of the
347 // Bowtie map
348
349 //vector<string> mismatch_toks;
350 char* pch = strtok (mismatches,",");
351 unsigned char num_mismatches = 0;
352 while (pch != NULL)
353 {
354 char* colon = strchr(pch, ':');
355 if (colon)
356 {
357 num_mismatches++;
358 }
359 //mismatch_toks.push_back(pch);
360 pch = strtok (NULL, ",");
361 }
362
363 bh = create_hit(name,
364 text_name,
365 text_offset,
366 (int)read_len,
367 orientation == '-',
368 num_mismatches,
369 end);
370
371 return true;
372 }
373
374 int anchor_mismatch = 0;
375
376
377 void parseSegReadName(char* name, char*& name_tags, bool strip_slash,
378 bool &end, unsigned int &seg_offset, unsigned int& seg_num,
379 unsigned int & num_segs) {
380 char* pipe = strrchr(name, '|');
381 if (pipe)
382 {
383 if (name_tags)
384 strcpy(name_tags, pipe);
385
386 char* tag_buf = pipe + 1;
387 if (strchr(tag_buf, ':'))
388 {
389 sscanf(tag_buf, "%u:%u:%u", &seg_offset, &seg_num, &num_segs);
390 if (seg_num + 1 == num_segs)
391 end = true;
392 else
393 end = false;
394 }
395
396 *pipe = 0;
397 }
398
399 // Stripping the slash and number following it gives the insert name
400 char* slash = strrchr(name, '/');
401 if (strip_slash && slash)
402 *slash = 0;
403 }
404
405
406 bool SplicedBowtieHitFactory::get_hit_from_buf(const char* orig_bwt_buf,
407 BowtieHit& bh,
408 bool strip_slash,
409 char* name_out,
410 char* name_tags,
411 char* seq,
412 char* qual)
413 {
414 if (!orig_bwt_buf || !*orig_bwt_buf)
415 return false;
416
417 static const int buf_size = 2048;
418
419 char bwt_buf[buf_size];
420 strcpy(bwt_buf, orig_bwt_buf);
421 //const char* bwt_fmt_str = "%s %c %s %d %s %s %d %s";
422
423 char orientation;
424 char text_name[buf_size];
425 unsigned int text_offset;
426 char mismatches[buf_size];
427 //memset(mismatches, 0, sizeof(mismatches));
428
429 char* buf = bwt_buf;
430 char* name = get_token((char**)&buf,"\t");
431 char* orientation_str = get_token((char**)&buf,"\t");
432 char* text_name_str = get_token((char**)&buf,"\t");
433 strcpy(text_name, text_name_str);
434
435 char* text_offset_str = get_token((char**)&buf,"\t");
436 char* seq_str = get_token((char**)&buf,"\t");
437 if (seq)
438 strcpy(seq, seq_str);
439
440 const char* qual_str = get_token((char**)&buf,"\t");
441 if (qual)
442 strcpy(qual, qual_str);
443
444 /*const char* other_occs_str =*/ get_token((char**)&buf, "\t");
445 mismatches[0] = 0;
446 char* mismatches_str = get_token((char**)&buf, "\t");
447 if (mismatches_str)
448 strcpy(mismatches, mismatches_str);
449
450 orientation = orientation_str[0];
451 text_offset = atoi(text_offset_str);
452
453 bool end = true;
454 unsigned int seg_offset = 0;
455 unsigned int seg_num = 0;
456 unsigned int num_segs = 0;
457
458 // Copy the tag out of the name field before we might wipe it out
459 parseSegReadName(name, name_tags, strip_slash, end, seg_offset, seg_num, num_segs);
460
461 // Add this alignment to the table of hits for this half of the
462 // Bowtie map
463
464 // Parse the text_name field to recover the splice coords
465 vector<string> toks;
466
467 tokenize_strict(text_name, "|", toks);
468
469 int num_extra_toks = (int)toks.size() - 6;
470
471 if (num_extra_toks >= 0)
472 {
473 static const uint8_t left_window_edge_field = 1;
474 static const uint8_t splice_field = 2;
475 //static const uint8_t right_window_edge_field = 3;
476 static const uint8_t junction_type_field = 4;
477 static const uint8_t strand_field = 5;
478
479 string contig = toks[0];
480 for (int t = 1; t <= num_extra_toks; ++t)
481 {
482 contig += "|";
483 contig += toks[t];
484 }
485
486 vector<string> splice_toks;
487 tokenize(toks[num_extra_toks + splice_field], "-", splice_toks);
488 if (splice_toks.size() != 2)
489 {
490 fprintf(stderr, "Warning: found malformed splice record, skipping:\n");
491 //fprintf(stderr, "%s (token: %s)\n", text_name,
492 // toks[num_extra_toks + splice_field].c_str());
493 return false;
494 }
495
496 //
497 // check for an insertion hit
498 //
499 if(toks[num_extra_toks + junction_type_field] == "ins")
500 {
501 int8_t spliced_read_len = strlen(seq_str);
502 /*
503 * The 0-based position of the left edge of the alignment. Note that this
504 * value may need to be futher corrected to account for the presence of
505 * of the insertion.
506 */
507 uint32_t left = atoi(toks[num_extra_toks + left_window_edge_field].c_str()) + text_offset;
508 uint32_t right = left + spliced_read_len - 1;
509
510
511 /*
512 * The 0-based position of the last genomic sequence before the insertion
513 */
514 uint32_t left_splice_pos = atoi(splice_toks[0].c_str());
515
516 string insertedSequence = splice_toks[1];
517 /*
518 * The 0-based position of the first genomic sequence after teh insertion
519 */
520 uint32_t right_splice_pos = left_splice_pos + 1;
521 if(left > left_splice_pos){
522 /*
523 * The genomic position of the left edge of the alignment needs to be corrected
524 * If the alignment does not extend into the insertion, simply subtract the length
525 * of the inserted sequence, otherwise, just set it equal to the right edge
526 */
527 left = left - insertedSequence.length();
528 if(left < right_splice_pos){
529 left = right_splice_pos;
530 }
531 }
532 if(right > left_splice_pos){
533 right = right - insertedSequence.length();
534 if(right < left_splice_pos){
535 right = left_splice_pos;
536 }
537 }
538 /*
539 * Now, right and left should be properly transformed into genomic coordinates
540 * We should be able to deduce how much the alignment matches the insertion
541 * simply based on the length of the read
542 */
543 int left_match_length = 0;
544 if(left <= left_splice_pos){
545 left_match_length = left_splice_pos - left + 1;
546 }
547 int right_match_length = 0;
548 if(right >= right_splice_pos){
549 right_match_length = right - right_splice_pos + 1;
550 }
551 int insertion_match_length = spliced_read_len - left_match_length - right_match_length;
552
553 if(left_match_length <= 0 || right_match_length <= 0 || insertion_match_length <= 0)
554 return false;
555
556 string junction_strand = toks[num_extra_toks + strand_field];
557 if(junction_strand != "rev" && junction_strand != "fwd"){
558 fprintf(stderr, "Malformed insertion record\n");
559 return false;
560 }
561
562 char* pch = strtok( mismatches, ",");
563 unsigned char num_mismatches = 0;
564
565 /*
566 * remember that text_offset holds the left end of the
567 * alignment, relative to the start of the contig
568 */
569
570 /*
571 * The 0-based relative position of the left-most character
572 * before the insertion in the contig
573 */
574 int relative_splice_pos = left_splice_pos - atoi(toks[num_extra_toks + left_window_edge_field].c_str());
575 while (pch != NULL)
576 {
577 char* colon = strchr(pch, ':');
578 if (colon)
579 {
580 *colon = 0;
581 int mismatch_pos = atoi(pch);
582
583 /*
584 * for reversely mapped reads,
585 * find the correct mismatched position.
586 */
587 if(orientation == '-'){
588 mismatch_pos = spliced_read_len - mismatch_pos - 1;
589 }
590
591 /*
592 * Only count mismatches outside of the insertion region
593 * If there is a mismatch within the insertion,
594 * disallow this hit
595 */
596 if(mismatch_pos + text_offset <= relative_splice_pos || mismatch_pos + text_offset > relative_splice_pos + insertedSequence.length()){
597 num_mismatches++;
598 }else{
599 return false;
600 }
601 }
602 pch = strtok (NULL, ",");
603 }
604
605
606 vector<CigarOp> cigar;
607 cigar.push_back(CigarOp(MATCH, left_match_length));
608 cigar.push_back(CigarOp(INS, insertion_match_length));
609 cigar.push_back(CigarOp(MATCH, right_match_length));
610
611 /*
612 * For now, disallow hits that don't span
613 * the insertion. If we allow these types of hits,
614 * then long_spanning.cpp needs to be updated
615 * in order to intelligently merge these kinds
616 * of reads back together
617 *
618 * Following code has been changed to allow segment that end
619 * in an insertion
620 */
621 bh = create_hit(name,
622 contig,
623 left,
624 cigar,
625 orientation == '-',
626 junction_strand == "rev",
627 num_mismatches,
628 0,
629 end);
630 return true;
631 }
632
633 else
634 {
635 uint32_t left = atoi(toks[num_extra_toks + left_window_edge_field].c_str()) + text_offset;
636 int spliced_read_len = strlen(seq_str);
637 int8_t left_splice_pos = atoi(splice_toks[0].c_str()) - left + 1;
638 if(left_splice_pos > spliced_read_len) left_splice_pos = spliced_read_len;
639 int8_t right_splice_pos = spliced_read_len - left_splice_pos;
640
641 if (right_splice_pos <= 0 || left_splice_pos <= 0)
642 return false;
643
644 if (orientation == '+')
645 {
646 if (left_splice_pos + seg_offset < _anchor_length){
647 return false;
648 }
649 }
650 else
651 {
652 if (right_splice_pos + seg_offset < _anchor_length)
653 return false;
654 }
655 //uint32_t right = atoi(splice_toks[1].c_str()) + right_splice_pos;
656 //atoi(toks[right_window_edge_field].c_str());
657 int gap_len = atoi(splice_toks[1].c_str()) - atoi(splice_toks[0].c_str()) - 1;
658
659 string junction_strand = toks[num_extra_toks + strand_field];
660 if (!(junction_strand == "rev" || junction_strand == "fwd")||
661 !(orientation == '-' || orientation == '+'))
662 {
663 fprintf(stderr, "Warning: found malformed splice record, skipping\n");
664 //fprintf(stderr, "junction_strand=%s, orientation='%c'\n",
665 // junction_strand.c_str(), orientation);
666 return false;
667 }
668
669 //vector<string> mismatch_toks;
670 char* pch = strtok (mismatches,",");
671 int mismatches_in_anchor = 0;
672 unsigned char num_mismatches = 0;
673 while (pch != NULL)
674 {
675 char* colon = strchr(pch, ':');
676 if (colon)
677 {
678 *colon = 0;
679 num_mismatches++;
680 int mismatch_pos = atoi(pch);
681 if ((orientation == '+' && abs(mismatch_pos - left_splice_pos) < (int)min_anchor_len) ||
682 (orientation == '-' && abs(((int)spliced_read_len - left_splice_pos + 1) - mismatch_pos)) < (int)min_anchor_len)
683 mismatches_in_anchor++;
684 }
685 //mismatch_toks.push_back(pch);
686 pch = strtok (NULL, ",");
687 }
688
689 // FIXME: we probably should exclude these hits somewhere, but this
690 // isn't the right place
691 vector<CigarOp> cigar;
692 cigar.push_back(CigarOp(MATCH, left_splice_pos));
693 if(toks[num_extra_toks + junction_type_field] == "del"){
694 cigar.push_back(CigarOp(DEL, gap_len));
695 }else{
696 cigar.push_back(CigarOp(REF_SKIP, gap_len));
697 }
698 cigar.push_back(CigarOp(MATCH, right_splice_pos));
699
700 bh = create_hit(name,
701 contig,
702 left,
703 cigar,
704 orientation == '-',
705 junction_strand == "rev",
706 num_mismatches,
707 mismatches_in_anchor,
708 end);
709 return true;
710 }
711 }
712 else
713 {
714 fprintf(stderr, "Warning: found malformed splice record, skipping\n");
715 //fprintf(stderr, "%s\n", orig_bwt_buf);
716 // continue;
717 return false;
718 }
719
720 return false;
721 }
722
723
724 int parseCigar(vector<CigarOp>& cigar, const char* cigar_str,
725 bool &spliced_alignment) {
726 const char* p_cig = cigar_str;
727 int refspan=0; //alignment span on reference sequence
728 while (*p_cig)
729 {
730 char* t;
731 int op_len = (int)strtol(p_cig, &t, 10);
732 if (op_len <= 0)
733 {
734 fprintf (stderr, "Error: CIGAR op has zero length\n");
735 return 0;
736 }
737 char op_char = toupper(*t);
738 CigarOpCode opcode;
739 switch (op_char) {
740 case '=':
741 case 'X':
742 case 'M': opcode = MATCH;
743 refspan+=op_len;
744 break;
745 case 'I': opcode = INS;
746 break;
747 case 'D': opcode = DEL;
748 refspan+=op_len;
749 break;
750 case 'N': if (op_len > max_report_intron_length)
751 return 0;
752 opcode = REF_SKIP;
753 spliced_alignment = true;
754 refspan+=op_len;
755 break;
756 case 'S': opcode = SOFT_CLIP;
757 break;
758 case 'H': opcode = HARD_CLIP;
759 break;
760 case 'P': opcode = PAD;
761 break;
762 default: fprintf (stderr, "Error: invalid CIGAR operation\n");
763 return 0;
764 }
765 p_cig = t + 1;
766 cigar.push_back(CigarOp(opcode, op_len));
767 } //while cigar codes
768 if (*p_cig) {
769 fprintf (stderr, "Error: unmatched CIGAR operation (%s)\n",p_cig);
770 return 0;
771 }
772 return refspan;
773 }
774
775 int getSAMmismatches(char* &buf, vector<CigarOp>& cigar,
776 vector<int>& mismatches, int& sam_nm, bool& antisense_splice) {
777 int gspan=0;//genomic span of the alignment
778 const char* tag_buf = buf;
779 sam_nm=0;
780 //num_mismatches=0;
781 while((tag_buf = get_token((char**)&buf,"\t")))
782 {
783 vector<string> tuple_fields;
784 tokenize(tag_buf,":", tuple_fields);
785 if (tuple_fields.size() == 3)
786 {
787 if (tuple_fields[0] == "XS")
788 {
789 if (tuple_fields[2] == "-")
790 antisense_splice = true;
791 }
792 else if (tuple_fields[0] == "NM")
793 {
794 sam_nm = atoi(tuple_fields[2].c_str());
795 }
796 else if (tuple_fields[0] == "NS")
797 {
798 //ignored for now
799 }
800 else if (tuple_fields[0] == "MD")
801 {
802 const char* p=tuple_fields[2].c_str();
803 int bi=0; //base offset position in the read
804 while (*p != 0) {
805 if (isdigit(*p)) {
806 int v=atoi(p);
807 do { p++; } while (isdigit(*p));
808 bi+=v;
809 }
810 while (isalpha(*p)) {
811 p++;
812 mismatches.push_back(bi);
813 bi++;
814 }
815 if (*p=='^') { //reference deletion
816 p++;
817 while (isalpha(*p)) { //insert read bases
818 p++; bi++;
819 }
820 }
821 }
822 }
823 else
824 {
825 //fprintf(stderr, "%s attribute not supported\n", tuple_fields[0].c_str());
826 //return false;
827 }
828 }
829 }
830 /* By convention,the NM field of the SAM record
831 * counts an insertion or deletion. I dont' think
832 * we want the mismatch count in the BowtieHit
833 * record to reflect this. Therefore, subtract out
834 * the mismatches due to in/dels
835 */
836 for(vector<CigarOp>::const_iterator itr = cigar.begin(); itr != cigar.end(); ++itr){
837 switch (itr->opcode)
838 {
839 case MATCH:
840 case REF_SKIP:
841 case PAD:
842 gspan += itr->length;
843 break;
844 case DEL:
845 gspan += itr->length;
846 sam_nm -= itr->length;
847 break;
848 case INS:
849 sam_nm -= itr->length;
850 break;
851 }
852 }
853 return gspan;
854 }
855
856 bool SAMHitFactory::get_hit_from_buf(const char* orig_bwt_buf,
857 BowtieHit& bh,
858 bool strip_slash,
859 char* name_out,
860 char* name_tags,
861 char* seq,
862 char* qual)
863 {
864 if (!orig_bwt_buf || !*orig_bwt_buf)
865 return false;
866 char bwt_buf[2048];
867 strcpy(bwt_buf, orig_bwt_buf);
868 // Are we still in the header region?
869 if (bwt_buf[0] == '@')
870 return false;
871
872 char* buf = bwt_buf;
873 char* name = get_token((char**)&buf,"\t");
874 char* sam_flag_str = get_token((char**)&buf,"\t");
875 char* text_name = get_token((char**)&buf,"\t");
876 char* text_offset_str = get_token((char**)&buf,"\t");
877 const char* map_qual_str = get_token((char**)&buf,"\t");
878 char* cigar_str = get_token((char**)&buf,"\t");
879 const char* mate_ref_str = get_token((char**)&buf,"\t");
880 const char* mate_pos_str = get_token((char**)&buf,"\t");
881 const char* inferred_insert_sz_str = get_token((char**)&buf,"\t");
882
883 const char* seq_str = get_token((char**)&buf,"\t");
884 if (seq)
885 strcpy(seq, seq_str);
886
887 const char* qual_str = get_token((char**)&buf,"\t");
888 if (qual)
889 strcpy(qual, qual_str);
890
891 if (!name ||
892 !sam_flag_str ||
893 !text_name ||
894 !text_offset_str ||
895 !map_qual_str ||
896 !cigar_str ||
897 !mate_ref_str ||
898 !mate_pos_str ||
899 !inferred_insert_sz_str ||
900 !seq_str ||
901 !qual_str)
902 {
903 // truncated or malformed SAM record
904 return false;
905 }
906
907
908 int sam_flag = atoi(sam_flag_str);
909 int text_offset = atoi(text_offset_str);
910
911 bool end = true;
912 unsigned int seg_offset = 0;
913 unsigned int seg_num = 0;
914 unsigned int num_segs = 0;
915
916 // Copy the tag out of the name field before we might wipe it out
917 parseSegReadName(name, name_tags, strip_slash, end, seg_offset, seg_num, num_segs);
918
919 vector<CigarOp> cigar;
920 bool spliced_alignment = false;
921 int refspan=parseCigar(cigar, cigar_str, spliced_alignment);
922 if (refspan==0)
923 return false;
924 //vector<string> attributes;
925 //tokenize(tag_buf, " \t",attributes);
926
927 bool antisense_splice = false;
928 int sam_nm = 0; //the value of the NM tag (edit distance)
929 //int mismatches[1024];//array with mismatch positions on the read (0-based from the left aligned end of the read)
930 vector<int> mismatches;
931 getSAMmismatches(buf, cigar, mismatches, sam_nm, antisense_splice);
932
933 if (spliced_alignment)
934 {
935 bh = create_hit(name,
936 text_name,
937 text_offset - 1,
938 cigar,
939 sam_flag & 0x0010,
940 antisense_splice,
941 mismatches.size(),
942 0,
943 end);
944 }
945 else
946 {
947 //assert(cigar.size() == 1 && cigar[0].opcode == MATCH);
948 bh = create_hit(name,
949 text_name,
950 text_offset - 1, // SAM files are 1-indexed
951 cigar,
952 sam_flag & 0x0010,
953 false,
954 mismatches.size(),
955 0,
956 end);
957 }
958 return true;
959 }
960
961 void cigar_add(vector<CigarOp>& cigar, CigarOp& op) {
962 if (op.length<=0) return;
963 if (cigar.size()>0 && cigar.back().opcode==op.opcode) {
964 cigar.back().length+=op.length;
965 }
966 cigar.push_back(op);
967 }
968
969 int spliceCigar(vector<CigarOp>& splcigar, vector<CigarOp>& cigar, vector<int> mismatches,
970 int &left, int spl_start, int spl_len, CigarOpCode spl_code) {
971 //merge the original 'cigar' with the new insert/gap operation
972 //at position spl_start and place the result into splcigar;
973 int num_mismatches=mismatches.size();
974 //these offsets are relative to the beginning of alignment on reference
975 int spl_ofs=spl_start-left; //relative position of splice op
976 int spl_ofs_end=spl_ofs; //relative position of first ref base AFTER splice op
977 CigarOp gapop(spl_code, spl_len); //for DEL or REF_SKIP
978 if (spl_code==INS)
979 spl_ofs_end += spl_len;
980 int ref_ofs=0; //working offset on reference
981 int read_ofs=0; //working offset on the read, relative to the leftmost aligned base
982 bool xfound=false;
983 //if (left<=spl_start+spl_len) {
984 if (spl_ofs_end>0) {
985 int prev_opcode=0;
986 int prev_oplen=0;
987 for (size_t c = 0 ; c < cigar.size(); ++c)
988 {
989 int prev_read_ofs=read_ofs;
990 int cur_op_ofs=ref_ofs;
991 int cur_opcode=cigar[c].opcode;
992 int cur_oplen=cigar[c].length;
993 switch (cur_opcode) {
994 case MATCH:
995 ref_ofs+=cur_oplen;
996 read_ofs+=cur_oplen;
997 break;
998 case DEL:
999 case REF_SKIP:
1000 case PAD:
1001 ref_ofs+=cur_oplen;
1002 break;
1003 case SOFT_CLIP:
1004 case INS:
1005 read_ofs+=cur_oplen;
1006 break;
1007 //case HARD_CLIP:
1008 }
1009 if (cur_op_ofs>=spl_ofs_end || ref_ofs<=spl_ofs) {
1010 if (spl_code!=INS && cur_op_ofs==spl_ofs_end)
1011 {
1012 //we have to insert the gap here first
1013 cigar_add(splcigar, gapop);
1014 }
1015 cigar_add(splcigar, cigar[c]);
1016 }
1017 else //if (ref_ofs>spl_ofs) {
1018 { //op intersection
1019 xfound=true;
1020 if (spl_code==INS) {
1021 //we have to shorten cur_opcode
1022 // find the overlap between current range
1023 int ovl_start = (cur_op_ofs>spl_ofs) ? cur_op_ofs : spl_ofs;
1024 int ovl_end = (ref_ofs>spl_ofs_end) ? spl_ofs_end : ref_ofs;
1025 CigarOp op(cigar[c]);
1026 int len=op.length;
1027 len-=(ovl_end-ovl_start);
1028 if (len>0) {
1029 op.length=len;
1030 cigar_add(splcigar, op);
1031 }
1032 }
1033 else {//DEL or REF_SKIP
1034 //spl_ofs == spl_ofs_end
1035 //we have to split cur_opcode
1036 CigarOp op(cigar[c]);
1037 op.length=spl_ofs-cur_op_ofs;
1038 cigar_add(splcigar, op);
1039 cigar_add(splcigar, gapop);
1040 op.length=ref_ofs-spl_ofs;
1041 cigar_add(splcigar,op);
1042 }
1043 } //op intersection
1044 prev_opcode=cur_opcode;
1045 prev_oplen=cur_oplen;
1046 } //for each cigar opcode
1047 } //intersection possible
1048 if (!xfound) {//no intersection found between splice event and alignment
1049 if (spl_ofs_end<=0) {
1050 //alignment starts after the splice event
1051 if (spl_code==INS) left-=spl_len;
1052 else left+=spl_len;
1053 }
1054 //else {
1055 //alignment ends before the splice event
1056 //nothing to do
1057 // }
1058 }
1059
1060 //TODO: adjust mismatches if opcode==INS and they fall in the removed part
1061 //TODO: what about anchor mismatches for REF_SKIP?
1062 return num_mismatches;
1063 }
1064
1065 bool SplicedSAMHitFactory::get_hit_from_buf(const char* orig_bwt_buf,
1066 BowtieHit& bh,
1067 bool strip_slash,
1068 char* name_out,
1069 char* name_tags,
1070 char* seq,
1071 char* qual)
1072 {
1073 if (!orig_bwt_buf || !*orig_bwt_buf)
1074 return false;
1075
1076 char bwt_buf[2048];
1077 strcpy(bwt_buf, orig_bwt_buf);
1078 // Are we still in the header region?
1079 if (bwt_buf[0] == '@')
1080 return false;
1081
1082 char* buf = bwt_buf;
1083 char* name = get_token((char**)&buf,"\t");
1084 char* sam_flag_str = get_token((char**)&buf,"\t");
1085 char* text_name = get_token((char**)&buf,"\t");
1086 char* text_offset_str = get_token((char**)&buf,"\t");
1087 const char* map_qual_str = get_token((char**)&buf,"\t");
1088 char* cigar_str = get_token((char**)&buf,"\t");
1089 const char* mate_ref_str = get_token((char**)&buf,"\t");
1090 const char* mate_pos_str = get_token((char**)&buf,"\t");
1091 const char* inferred_insert_sz_str = get_token((char**)&buf,"\t");
1092 //int num_mismatches=0;
1093 //int mismatches[1024]; //list of 0-based mismatch positions in this read
1094 //parsed from SAM's MD:Z: tag
1095 const char* seq_str = get_token((char**)&buf,"\t");
1096 if (seq)
1097 strcpy(seq, seq_str);
1098
1099 const char* qual_str = get_token((char**)&buf,"\t");
1100 if (qual)
1101 strcpy(qual, qual_str);
1102
1103 if (!name ||
1104 !sam_flag_str ||
1105 !text_name ||
1106 !text_offset_str ||
1107 !map_qual_str ||
1108 !cigar_str ||
1109 !mate_ref_str ||
1110 !mate_pos_str ||
1111 !inferred_insert_sz_str ||
1112 !seq_str ||
1113 !qual_str)
1114 {
1115 // truncated or malformed SAM record
1116 return false;
1117 }
1118
1119
1120 int sam_flag = atoi(sam_flag_str);
1121 int text_offset = atoi(text_offset_str);
1122 text_offset--; //make it 0-based (SAM is 1-based, Bowtie is 0-based)
1123 bool end = true;
1124 unsigned int seg_offset = 0;
1125 unsigned int seg_num = 0;
1126 unsigned int num_segs = 0;
1127
1128 // Copy the tag out of the name field before we might wipe it out
1129 parseSegReadName(name, name_tags, strip_slash, end, seg_offset, seg_num, num_segs);
1130
1131 vector<CigarOp> samcigar;
1132 bool spliced_alignment = false;
1133 int refspan=parseCigar(samcigar, cigar_str, spliced_alignment);
1134 if (refspan==0)
1135 return false;
1136 bool antisense_splice = false;
1137 int sam_nm = 0;
1138 vector<int> mismatches;
1139 getSAMmismatches(buf, samcigar, mismatches, sam_nm, antisense_splice);
1140
1141 //##############################################
1142
1143 // Add this alignment to the table of hits for this half of the
1144 // Bowtie map
1145
1146 // Parse the text_name field to recover the splice coords
1147 vector<string> toks;
1148 tokenize_strict(text_name, "|", toks);
1149
1150 int num_extra_toks = (int)toks.size() - 6;
1151
1152 if (num_extra_toks >= 0)
1153 {
1154 static const uint8_t left_window_edge_field = 1;
1155 static const uint8_t splice_field = 2;
1156 //static const uint8_t right_window_edge_field = 3;
1157 static const uint8_t junction_type_field = 4;
1158 static const uint8_t strand_field = 5;
1159
1160 string contig = toks[0];
1161 for (int t = 1; t <= num_extra_toks; ++t)
1162 {
1163 contig += "|";
1164 contig += toks[t];
1165 }
1166
1167 vector<string> splice_toks;
1168 tokenize(toks[num_extra_toks + splice_field], "-", splice_toks);
1169 if (splice_toks.size() != 2)
1170 {
1171 fprintf(stderr, "Warning: found malformed splice record, skipping:\n");
1172 //fprintf(stderr, "\t%s (token: %s)\n", text_name,
1173 // toks[num_extra_toks + splice_field].c_str());
1174 return false;
1175 }
1176
1177 string junction_strand = toks[num_extra_toks + strand_field];
1178 if(junction_strand != "rev" && junction_strand != "fwd"){
1179 fprintf(stderr, "Malformed insertion record\n");
1180 return false;
1181 }
1182
1183 int spl_num_mismatches=0;
1184
1185 //
1186 // check for an insertion hit
1187 //
1188 if(toks[num_extra_toks + junction_type_field] == "ins")
1189 {
1190 //int8_t spliced_read_len = strlen(seq_str);
1191 //TODO FIXME: use the CIGAR instead of seq length!
1192 // The 0-based position of the left edge of the alignment. Note that this
1193 // value may need to be further corrected to account for the presence of
1194 // of the insertion.
1195 int left = atoi(toks[num_extra_toks + left_window_edge_field].c_str()) + text_offset;
1196
1197 // The 0-based position of the last genomic sequence before the insertion
1198 int left_splice_pos = atoi(splice_toks[0].c_str());
1199
1200 string insertedSequence = splice_toks[1];
1201 // The 0-based position of the first genomic sequence after the insertion
1202
1203 vector<CigarOp> splcigar;
1204
1205 //TODO: implement this
1206 //this also updates left to the adjusted genomic coordinates
1207 spl_num_mismatches=spliceCigar(splcigar, samcigar, mismatches,
1208 left, left_splice_pos, insertedSequence.length(), INS);
1209 /*
1210 uint32_t right_splice_pos = left_splice_pos + 1;
1211
1212 //uint32_t right = left + spliced_read_len - 1;
1213 int right = left + refspan - 1;
1214
1215 if(left > left_splice_pos){
1216 //The genomic position of the left edge of the alignment needs to be corrected
1217 //If the alignment does not extend into the insertion, simply subtract the length
1218 //of the inserted sequence, otherwise, just set it equal to the right edge
1219 left = left - insertedSequence.length();
1220 if(left < right_splice_pos){
1221 left = right_splice_pos;
1222 }
1223 }
1224 if(right > left_splice_pos){
1225 right = right - insertedSequence.length();
1226 if(right < left_splice_pos){
1227 right = left_splice_pos;
1228 }
1229 }
1230 // Now, right and left should be properly transformed into genomic coordinates
1231 // We should be able to deduce how much the alignment matches the insertion
1232 // simply based on the length of the read
1233 int left_match_length = 0;
1234 if(left <= left_splice_pos){
1235 left_match_length = left_splice_pos - left + 1;
1236 }
1237 int right_match_length = 0;
1238 if(right >= right_splice_pos){
1239 right_match_length = right - right_splice_pos + 1;
1240 }
1241 int insertion_match_length = spliced_read_len - left_match_length - right_match_length;
1242
1243 if(left_match_length <= 0 || right_match_length <= 0 || insertion_match_length <= 0)
1244 return false;
1245
1246 //char* pch = strtok( mismatches, ",");
1247 //unsigned char num_mismatches = 0;
1248 //text_offset holds the left end of the alignment,
1249 //RELATIVE TO the start of the contig
1250
1251 //The 0-based relative position of the left-most character
1252 //before the insertion in the contig
1253 int relative_splice_pos = left_splice_pos - atoi(toks[num_extra_toks + left_window_edge_field].c_str());
1254 for (size_t i=0;i<mismatches.size();++i) {
1255 int mismatch_pos = mismatches[i];
1256 // for reversely mapped reads,
1257 //find the correct mismatched position.
1258 if (sam_flag & 0x0010){
1259 mismatch_pos = spliced_read_len - mismatch_pos - 1;
1260 }
1261
1262 //Only count mismatches outside of the insertion region
1263 // If there is a mismatch within the insertion,
1264 // disallow this hit
1265 if(mismatch_pos + text_offset <= relative_splice_pos ||
1266 mismatch_pos + text_offset > relative_splice_pos + insertedSequence.length()){
1267 spl_num_mismatches++;
1268 }else{
1269 return false;
1270 }
1271 }
1272 */
1273 //vector<CigarOp> splcigar;
1274 //spliceCigar(splcigar, samcigar, left_match_length, insertion_match_length, right_match_length, INS);
1275 //splcigar.push_back(CigarOp(MATCH, left_match_length));
1276 //splcigar.push_back(CigarOp(INS, insertion_match_length));
1277 //splcigar.push_back(CigarOp(MATCH, right_match_length));
1278
1279 bh = create_hit(name,
1280 contig,
1281 left,
1282 //splcigar,
1283 splcigar,
1284 sam_flag & 0x0010,
1285 junction_strand == "rev",
1286 spl_num_mismatches,
1287 0,
1288 end);
1289 return true;
1290 } //"ins"
1291 else //"del" or intron
1292 {
1293 int left = atoi(toks[num_extra_toks + left_window_edge_field].c_str()) + text_offset;
1294 int spliced_read_len = strlen(seq_str); //FIXME TODO: use CIGAR instead
1295 int left_splice_pos = atoi(splice_toks[0].c_str()) - left + 1;
1296 if(left_splice_pos > spliced_read_len) left_splice_pos = spliced_read_len;
1297 int right_splice_pos = spliced_read_len - left_splice_pos;
1298
1299 if (right_splice_pos <= 0 || left_splice_pos <= 0)
1300 return false;
1301
1302 if ((sam_flag & 0x0010) == 0) //######
1303 {
1304 if (left_splice_pos + seg_offset < _anchor_length)
1305 return false;
1306 }
1307 else
1308 {
1309 if (right_splice_pos + seg_offset < _anchor_length)
1310 return false;
1311 }
1312 //uint32_t right = atoi(splice_toks[1].c_str()) + right_splice_pos;
1313 //atoi(toks[right_window_edge_field].c_str());
1314 int gap_len = atoi(splice_toks[1].c_str()) - atoi(splice_toks[0].c_str()) - 1;
1315
1316 int mismatches_in_anchor = 0;
1317 for (size_t i=0;i<mismatches.size();++i) {
1318 spl_num_mismatches++;
1319 int mismatch_pos = mismatches[i];
1320 if (((sam_flag & 0x0010) == 0 && abs(mismatch_pos - left_splice_pos) < (int)min_anchor_len) ||
1321 ((sam_flag & 0x0010) != 0 &&
1322 abs(((int)spliced_read_len - left_splice_pos + 1) - mismatch_pos)) < (int)min_anchor_len)
1323 mismatches_in_anchor++;
1324 }
1325
1326 // FIXME: we probably should exclude these hits somewhere, but this
1327 // isn't the right place
1328 vector<CigarOp> splcigar;
1329 CigarOpCode opcode=(toks[num_extra_toks + junction_type_field] == "del")? DEL : REF_SKIP ;
1330 //TODO: FIXME: use relative offsets for this call instead, like for the INS case
1331 spl_num_mismatches=spliceCigar(splcigar, samcigar, mismatches, left, left_splice_pos, gap_len, opcode);
1332 /*
1333 splcigar.push_back(CigarOp(MATCH, left_splice_pos));
1334 if(toks[num_extra_toks + junction_type_field] == "del"){
1335 splcigar.push_back(CigarOp(DEL, gap_len));
1336 }else{
1337 splcigar.push_back(CigarOp(REF_SKIP, gap_len));
1338 }
1339 splcigar.push_back(CigarOp(MATCH, right_splice_pos));
1340 */
1341 bh = create_hit(name,
1342 contig,
1343 left,
1344 splcigar,
1345 (sam_flag & 0x0010),
1346 junction_strand == "rev",
1347 spl_num_mismatches,
1348 mismatches_in_anchor,
1349 end);
1350 return true;
1351 }
1352 } //parse splice data
1353 else
1354 {
1355 fprintf(stderr, "Warning: found malformed splice record, skipping\n");
1356 //fprintf(stderr, "%s\n", orig_bwt_buf);
1357 // continue;
1358 return false;
1359 }
1360
1361 return false;
1362 }
1363
1364
1365
1366 bool BAMHitFactory::get_hit_from_buf(const char* orig_bwt_buf,
1367 BowtieHit& bh, bool strip_slash,
1368 char* name_out, char* name_tags,
1369 char* seq, char* qual) {
1370 if (_sam_header==NULL)
1371 err_die("Error: no SAM header when BAMHitFactory::get_hit_from_buf()!");
1372 const bam1_t* hit_buf = (const bam1_t*)orig_bwt_buf;
1373
1374
1375 uint32_t sam_flag = hit_buf->core.flag;
1376
1377 int text_offset = hit_buf->core.pos;
1378 int text_mate_pos = hit_buf->core.mpos;
1379 int target_id = hit_buf->core.tid;
1380 int mate_target_id = hit_buf->core.mtid;
1381
1382 vector<CigarOp> cigar;
1383 bool spliced_alignment = false;
1384 int num_hits = 1;
1385
1386 double mapQ = hit_buf->core.qual;
1387 long double error_prob;
1388 if (mapQ > 0)
1389 {
1390 long double p = (-1.0 * mapQ) / 10.0;
1391 error_prob = pow(10.0L, p);
1392 }
1393 else
1394 {
1395 error_prob = 1.0;
1396 }
1397
1398 //header->target_name[c->tid]
1399
1400 bool end = true;
1401 unsigned int seg_offset = 0;
1402 unsigned int seg_num = 0;
1403 unsigned int num_segs = 0;
1404 // Copy the tag out of the name field before we might wipe it out
1405 char* qname = bam1_qname(hit_buf);
1406 char* pipe = strrchr(qname, '|');
1407 if (pipe)
1408 {
1409 if (name_tags)
1410 strcpy(name_tags, pipe);
1411
1412 char* tag_buf = pipe + 1;
1413 if (strchr(tag_buf, ':'))
1414 {
1415 sscanf(tag_buf, "%u:%u:%u", &seg_offset, &seg_num, &num_segs);
1416 if (seg_num + 1 == num_segs)
1417 end = true;
1418 else
1419 end = false;
1420 }
1421
1422 *pipe = 0;
1423 }
1424
1425
1426 if (target_id < 0) {
1427 //assert(cigar.size() == 1 && cigar[0].opcode == MATCH);
1428 bh = create_hit(qname,
1429 "*", //ref_name
1430 0, //left coord
1431 0, //read_len
1432 false, //antisense_aln
1433 0, //edit_dist
1434 end);
1435 return true;
1436 }
1437
1438 //string text_name = ((samfile_t*)(hs._hit_file))->header->target_name[target_id];
1439 string text_name = _sam_header->target_name[target_id];
1440 for (int i = 0; i < hit_buf->core.n_cigar; ++i)
1441 {
1442 //char* t;
1443
1444 int length = bam1_cigar(hit_buf)[i] >> BAM_CIGAR_SHIFT;
1445 if (length <= 0)
1446 {
1447 fprintf (stderr, "BAM error: CIGAR op has zero length\n");
1448 return false;
1449 }
1450
1451 CigarOpCode opcode;
1452 switch(bam1_cigar(hit_buf)[i] & BAM_CIGAR_MASK)
1453 {
1454 case BAM_CMATCH: opcode = MATCH; break;
1455 case BAM_CINS: opcode = INS; break;
1456 case BAM_CDEL: opcode = DEL; break;
1457 case BAM_CSOFT_CLIP: opcode = SOFT_CLIP; break;
1458 case BAM_CHARD_CLIP: opcode = HARD_CLIP; break;
1459 case BAM_CPAD: opcode = PAD; break;
1460 case BAM_CREF_SKIP:
1461 opcode = REF_SKIP;
1462 spliced_alignment = true;
1463 if (length > (int)max_report_intron_length)
1464 {
1465 //fprintf(stderr, "Encounter REF_SKIP > max_gene_length, skipping\n");
1466 return false;
1467 }
1468 break;
1469 default:
1470 fprintf (stderr, "BAM read: invalid CIGAR operation\n");
1471 return false;
1472 }
1473 if (opcode != HARD_CLIP)
1474 cigar.push_back(CigarOp(opcode, length));
1475 }
1476
1477 string mrnm;
1478 if (mate_target_id >= 0) {
1479 if (mate_target_id == target_id) {
1480 //mrnm = ((samfile_t*)(hs._hit_file))->header->target_name[mate_target_id];
1481 mrnm = _sam_header->target_name[mate_target_id];
1482 }
1483 else {
1484 //fprintf(stderr, "Trans-spliced mates are not currently supported, skipping\n");
1485 return false;
1486 }
1487 }
1488 else {
1489 text_mate_pos = 0;
1490 }
1491 //CuffStrand source_strand = CUFF_STRAND_UNKNOWN;
1492 bool antisense_splice=false;
1493 unsigned char num_mismatches = 0;
1494 unsigned char num_splice_anchor_mismatches = 0;
1495
1496 uint8_t* ptr = bam_aux_get(hit_buf, "XS");
1497 if (ptr) {
1498 char src_strand_char = bam_aux2A(ptr);
1499 if (src_strand_char == '-')
1500 antisense_splice = true;
1501 // else if (src_strand_char == '+')
1502 // source_strand = CUFF_FWD;
1503 }
1504
1505 ptr = bam_aux_get(hit_buf, "NM");
1506 if (ptr) {
1507 num_mismatches = bam_aux2i(ptr);
1508 }
1509
1510 ptr = bam_aux_get(hit_buf, "NH");
1511 if (ptr) {
1512 num_hits = bam_aux2i(ptr);
1513 }
1514
1515 //bool antisense_aln = bam1_strand(hit_buf);
1516
1517 //if (_rg_props.strandedness() == STRANDED_PROTOCOL && source_strand == CUFF_STRAND_UNKNOWN)
1518 // source_strand = use_stranded_protocol(sam_flag, antisense_aln, _rg_props.mate_strand_mapping());
1519 if (spliced_alignment) {
1520 //if (source_strand == CUFF_STRAND_UNKNOWN) {
1521 // fprintf(stderr, "BAM record error: found spliced alignment without XS attribute\n");
1522 // }
1523 bh = create_hit(qname,
1524 text_name,
1525 text_offset, // BAM files are 0-indexed
1526 cigar,
1527 sam_flag & 0x0010,
1528 antisense_splice,
1529 num_mismatches,
1530 num_splice_anchor_mismatches,
1531 end);
1532
1533 }
1534 else {
1535 //assert(_rg_props.strandedness() == STRANDED_PROTOCOL || source_strand == CUFF_STRAND_UNKNOWN);
1536 //assert(cigar.size() == 1 && cigar[0].opcode == MATCH);
1537 bh = create_hit(qname,
1538 text_name,
1539 text_offset, // BAM files are 0-indexed
1540 cigar,
1541 sam_flag & 0x0010,
1542 false,
1543 num_mismatches,
1544 0,
1545 end);
1546 }
1547 if (seq!=NULL) {
1548 char *bseq = (char*)bam1_seq(hit_buf);
1549 for(int i=0;i<(hit_buf->core.l_qseq);i++) {
1550 char v = bam1_seqi(bseq,i);
1551 seq[i]=bam_nt16_rev_table[v];
1552 }
1553 seq[hit_buf->core.l_qseq]=0;
1554 }
1555 if (qual!=NULL) {
1556 char *bq = (char*)bam1_qual(hit_buf);
1557 for(int i=0;i<(hit_buf->core.l_qseq);i++) {
1558 qual[i]=bq[i]+33;
1559 }
1560 qual[hit_buf->core.l_qseq]=0;
1561 }
1562 return true;
1563 }
1564
1565 bool BAMHitFactory::inspect_header(HitStream& hs)
1566 {
1567 bam_header_t* header = ((samfile_t*)(hs._hit_file))->header;
1568
1569 if (header == NULL) {
1570 fprintf(stderr, "Warning: No BAM header\n");
1571 return false;
1572 }
1573 if (header->l_text == 0) {
1574 fprintf(stderr, "Warning: BAM header has 0 length or is corrupted. Try using 'samtools reheader'.\n");
1575 return false;
1576 }
1577 return true;
1578 }
1579
1580
1581 void get_mapped_reads(FILE* bwtf,
1582 HitTable& hits,
1583 HitFactory& hit_factory,
1584 bool strip_slash,
1585 bool verbose)
1586 {
1587
1588
1589 char bwt_buf[2048];
1590 uint32_t reads_extracted = 0;
1591
1592 while (fgets(bwt_buf, 2048, bwtf))
1593 {
1594 // Chomp the newline
1595 char* nl = strrchr(bwt_buf, '\n');
1596 if (nl) *nl = 0;
1597 if (*bwt_buf == 0)
1598 continue;
1599 // Get a new record from the tab-delimited Bowtie map
1600 BowtieHit bh;
1601 if (hit_factory.get_hit_from_buf(bwt_buf, bh, strip_slash))
1602 {
1603 // Only check uniqueness if these hits are spliced
1604 hits.add_hit(bh, true);
1605 }
1606 reads_extracted++;
1607 }
1608
1609 // This will sort the map by insert id.
1610 hits.finalize();
1611 fprintf(stderr, "Extracted %d alignments from Bowtie map\n", reads_extracted);
1612 }
1613
1614
1615 /*
1616 AlignStatus status(const BowtieHit* align)
1617 {
1618 if (!align)
1619 return UNALIGNED;
1620 if (align->contiguous())
1621 return CONTIGUOUS;
1622 return SPLICED;
1623 }
1624 */
1625
1626 void add_hits_to_coverage(const HitList& hits, vector<unsigned short>& DoC)
1627 {
1628 int max_hit_pos = -1;
1629 for (size_t i = 0; i < hits.size(); ++i)
1630 {
1631 max_hit_pos = max((int)hits[i].right(),max_hit_pos);
1632 }
1633
1634 if ((int)DoC.size() < max_hit_pos)
1635 DoC.resize(max_hit_pos);
1636
1637 for (size_t i = 0; i < hits.size(); ++i)
1638 {
1639 const BowtieHit& bh = hits[i];
1640
1641 // split up the coverage contibution for this reads
1642 size_t j = bh.left();
1643 const vector<CigarOp>& cigar = bh.cigar();
1644
1645 for (size_t c = 0 ; c < cigar.size(); ++c)
1646 {
1647 switch(cigar[c].opcode)
1648 {
1649 case MATCH:
1650 for (size_t m = 0; m < cigar[c].length; ++m)
1651 {
1652 if (DoC[j + m] < 0xFFFF)
1653 DoC[j + m]++;
1654 }
1655 //fall through this case to REF_SKIP is intentional
1656 case REF_SKIP:
1657 j += cigar[c].length;
1658 break;
1659 default:
1660 break;
1661 }
1662
1663 }
1664 }
1665 }
1666
1667 void add_hit_to_coverage(const BowtieHit& bh, vector<unsigned int>& DoC)
1668 {
1669 if ((int)DoC.size() < bh.right())
1670 DoC.resize(bh.right());
1671
1672 // split up the coverage contibution for this reads
1673 size_t j = bh.left();
1674 const vector<CigarOp>& cigar = bh.cigar();
1675
1676 for (size_t c = 0 ; c < cigar.size(); ++c)
1677 {
1678 switch(cigar[c].opcode)
1679 {
1680 case MATCH:
1681 for (size_t m = 0; m < cigar[c].length; ++m)
1682 {
1683 if (DoC[j + m] < VMAXINT32)
1684 DoC[j + m]++;
1685 }
1686 //fall through this case to REF_SKIP is intentional
1687 case REF_SKIP:
1688 j += cigar[c].length;
1689 break;
1690 default:
1691 break;
1692 }
1693
1694 }
1695 }
1696
1697 void print_hit(FILE* fout,
1698 const char* read_name,
1699 const BowtieHit& bh,
1700 const char* ref_name,
1701 const char* sequence,
1702 const char* qualities,
1703 bool from_bowtie)
1704 {
1705 string seq;
1706 string quals;
1707 if (sequence)
1708 {
1709 seq = sequence;
1710 quals = qualities;
1711 seq.resize(bh.read_len());
1712 quals.resize(bh.read_len());
1713 }
1714 else
1715 {
1716 seq = "*";
1717 }
1718
1719 if (qualities)
1720 {
1721 quals = qualities;
1722 quals.resize(bh.read_len());
1723 }
1724 else
1725 {
1726 quals = "*";
1727 }
1728
1729 uint32_t sam_flag = 0;
1730 if (bh.antisense_align())
1731 {
1732 sam_flag |= 0x0010; // BAM_FREVERSE
1733 if (sequence && !from_bowtie) // if it is from bowtie hit, it's already reversed.
1734 {
1735 reverse_complement(seq);
1736 reverse(quals.begin(), quals.end());
1737 }
1738 }
1739
1740 uint32_t sam_pos = bh.left() + 1;
1741 uint32_t map_quality = 255;
1742 char cigar[256];
1743 cigar[0] = 0;
1744 string mate_ref_name = "*";
1745 uint32_t mate_pos = 0;
1746 uint32_t insert_size = 0;
1747 //string qualities = "*";
1748
1749 const vector<CigarOp>& bh_cigar = bh.cigar();
1750
1751 /*
1752 * In addition to calculating the cigar string,
1753 * we need to figure out how many in/dels are in the
1754 * sequence, so that we can give the correct
1755 * value for the NM tag
1756 */
1757 int indel_distance = 0;
1758 for (size_t c = 0; c < bh_cigar.size(); ++c)
1759 {
1760 char ibuf[64];
1761 sprintf(ibuf, "%d", bh_cigar[c].length);
1762 switch(bh_cigar[c].opcode)
1763 {
1764 case MATCH:
1765 strcat(cigar, ibuf);
1766 strcat(cigar, "M");
1767 break;
1768 case INS:
1769 strcat(cigar, ibuf);
1770 strcat(cigar, "I");
1771 indel_distance += bh_cigar[c].length;
1772 break;
1773 case DEL:
1774 strcat(cigar, ibuf);
1775 strcat(cigar, "D");
1776 indel_distance += bh_cigar[c].length;
1777 break;
1778 case REF_SKIP:
1779 strcat(cigar, ibuf);
1780 strcat(cigar, "N");
1781 break;
1782 default:
1783 break;
1784 }
1785 }
1786
1787 //string q = string(bh.read_len(), '!');
1788 //string s = string(bh.read_len(), 'N');
1789
1790 fprintf(fout,
1791 "%s\t%d\t%s\t%d\t%d\t%s\t%s\t%d\t%d\t%s\t%s",
1792 read_name,
1793 sam_flag,
1794 ref_name,
1795 sam_pos,
1796 map_quality,
1797 cigar,
1798 mate_ref_name.c_str(),
1799 mate_pos,
1800 insert_size,
1801 seq.c_str(),
1802 quals.c_str());
1803
1804 if (!sam_readgroup_id.empty())
1805 fprintf(fout, "\tRG:Z:%s", sam_readgroup_id.c_str());
1806
1807 fprintf(fout, "\tNM:i:%d", bh.edit_dist() + indel_distance);
1808
1809 bool containsSplice = false;
1810 for(vector<CigarOp>::const_iterator itr = bh.cigar().begin(); itr != bh.cigar().end(); ++itr){
1811 if(itr->opcode == REF_SKIP){
1812 containsSplice = true;
1813 break;
1814 }
1815 }
1816
1817 if (containsSplice)
1818 fprintf(fout, "\tXS:A:%c", bh.antisense_splice() ? '-' : '+');
1819
1820 fprintf(fout, "\n");
1821 }
1822
1823 void print_bamhit(GBamWriter& wbam,
1824 const char* read_name,
1825 const BowtieHit& bh,
1826 const char* ref_name,
1827 const char* sequence,
1828 const char* qualities,
1829 bool from_bowtie)
1830 {
1831 string seq;
1832 string quals;
1833 if (sequence) {
1834 seq = sequence;
1835 quals = qualities;
1836 seq.resize(bh.read_len());
1837 quals.resize(bh.read_len());
1838 }
1839 else {
1840 seq = "*";
1841 }
1842 if (qualities) {
1843 quals = qualities;
1844 quals.resize(bh.read_len());
1845 }
1846 else
1847 {
1848 quals = "*";
1849 }
1850
1851 uint32_t sam_flag = 0;
1852 if (bh.antisense_align())
1853 {
1854 sam_flag |= 0x0010; // BAM_FREVERSE
1855 if (sequence && !from_bowtie) // if it is from bowtie hit, it's already reversed.
1856 {
1857 reverse_complement(seq);
1858 reverse(quals.begin(), quals.end());
1859 }
1860 }
1861
1862 uint32_t sam_pos = bh.left() + 1;
1863 uint32_t map_quality = 255;
1864 char cigar[256];
1865 cigar[0] = 0;
1866 string mate_ref_name = "*";
1867 uint32_t mate_pos = 0;
1868 uint32_t insert_size = 0;
1869 //string qualities = "*";
1870
1871 const vector<CigarOp>& bh_cigar = bh.cigar();
1872 /*
1873 * In addition to calculating the cigar string,
1874 * we need to figure out how many in/dels are in the
1875 * sequence, so that we can give the correct
1876 * value for the NM tag
1877 */
1878 int indel_distance = 0;
1879 for (size_t c = 0; c < bh_cigar.size(); ++c)
1880 {
1881 char ibuf[64];
1882 sprintf(ibuf, "%d", bh_cigar[c].length);
1883 switch(bh_cigar[c].opcode)
1884 {
1885 case MATCH:
1886 strcat(cigar, ibuf);
1887 strcat(cigar, "M");
1888 break;
1889 case INS:
1890 strcat(cigar, ibuf);
1891 strcat(cigar, "I");
1892 indel_distance += bh_cigar[c].length;
1893 break;
1894 case DEL:
1895 strcat(cigar, ibuf);
1896 strcat(cigar, "D");
1897 indel_distance += bh_cigar[c].length;
1898 break;
1899 case REF_SKIP:
1900 strcat(cigar, ibuf);
1901 strcat(cigar, "N");
1902 break;
1903 default:
1904 break;
1905 }
1906 }
1907
1908 //string q = string(bh.read_len(), '!');
1909 //string s = string(bh.read_len(), 'N');
1910 /*
1911 fprintf(fout,
1912 "%s\t%d\t%s\t%d\t%d\t%s\t%s\t%d\t%d\t%s\t%s",
1913 read_name,
1914 sam_flag,
1915 ref_name,
1916 sam_pos,
1917 map_quality,
1918 cigar,
1919 mate_ref_name.c_str(),
1920 mate_pos,
1921 insert_size,
1922 seq.c_str(),
1923 quals.c_str());
1924
1925 fprintf(fout, "\tNM:i:%d", bh.edit_dist() + indel_distance);
1926 */
1927 vector<string> auxdata;
1928
1929 if (!sam_readgroup_id.empty())
1930 {
1931 string nm("RG:Z:");
1932 nm += sam_readgroup_id;
1933 auxdata.push_back(nm);
1934 }
1935
1936 string nm("NM:i:");
1937 str_appendInt(nm, bh.edit_dist() + indel_distance);
1938 auxdata.push_back(nm);
1939 bool containsSplice = false;
1940 for(vector<CigarOp>::const_iterator itr = bh.cigar().begin(); itr != bh.cigar().end(); ++itr)
1941 if(itr->opcode == REF_SKIP){
1942 containsSplice = true;
1943 break;
1944 }
1945
1946 if (containsSplice) {
1947 //fprintf(fout, "\tXS:A:%c", bh.antisense_splice() ? '-' : '+');
1948 nm="XS:A:";
1949 nm+=(char)(bh.antisense_splice() ? '-' : '+');
1950 auxdata.push_back(nm);
1951 }
1952
1953 GBamRecord *brec = wbam.new_record(read_name, sam_flag, ref_name, sam_pos, map_quality,
1954 cigar, mate_ref_name.c_str(), mate_pos,
1955 insert_size, seq.c_str(), quals.c_str(), &auxdata);
1956
1957
1958
1959 wbam.write(brec);
1960 delete brec;
1961 }
1962
1963 /**
1964 * Print a vector of cigar operations to a file.
1965 * @param bh_cigar A vector of CigarOps
1966 * @return a string representation of the cigar string
1967 */
1968 std::string print_cigar(vector<CigarOp>& bh_cigar){
1969 char cigar[256];
1970 cigar[0] = 0;
1971 for (size_t c = 0; c < bh_cigar.size(); ++c)
1972 {
1973 char ibuf[64];
1974 sprintf(ibuf, "%d", bh_cigar[c].length);
1975 switch(bh_cigar[c].opcode)
1976 {
1977 case MATCH:
1978 strcat(cigar, ibuf);
1979 strcat(cigar, "M");
1980 break;
1981 case INS:
1982 strcat(cigar, ibuf);
1983 strcat(cigar, "I");
1984 break;
1985 case DEL:
1986 strcat(cigar, ibuf);
1987 strcat(cigar, "D");
1988 break;
1989 case REF_SKIP:
1990 strcat(cigar, ibuf);
1991 strcat(cigar, "N");
1992 break;
1993 default:
1994 break;
1995 }
1996 }
1997 string result(cigar);
1998 return result;
1999 }
2000
2001 bool BowtieHit::check_editdist_consistency(const RefSequenceTable& rt)
2002 {
2003 RefSequenceTable::Sequence* ref_str = rt.get_seq(_ref_id);
2004 if (!ref_str)
2005 return false;
2006
2007 const seqan::Dna5String ref_seq = seqan::infix(*ref_str, _left, right());
2008
2009 size_t pos_seq = 0;
2010 size_t pos_ref = 0;
2011 size_t mismatch = 0;
2012 for (size_t i = 0; i < _cigar.size(); ++i)
2013 {
2014 CigarOp cigar = _cigar[i];
2015 switch(cigar.opcode)
2016 {
2017 case MATCH:
2018 {
2019 for (size_t j = 0; j < cigar.length; ++j)
2020 {
2021 if (_seq[pos_seq++] != ref_seq[pos_ref++])
2022 ++mismatch;
2023 }
2024 }
2025 break;
2026 case INS:
2027 {
2028 pos_seq += cigar.length;
2029 }
2030 break;
2031
2032 case DEL:
2033 case REF_SKIP:
2034 {
2035 pos_ref += cigar.length;
2036 }
2037 break;
2038
2039 default:
2040 break;
2041 }
2042 }
2043
2044 return mismatch == _edit_dist;
2045 }