ViewVC Help
View File | Revision Log | Show Annotations | View Changeset | Root Listing
root/gclib/tophat_cpp/bwt_map.h
Revision: 29
Committed: Tue Aug 2 21:24:54 2011 UTC (9 years ago) by gpertea
File size: 22727 byte(s)
Log Message:
adding tophat source work

Line File contents
1 #ifndef BWT_MAP_H
2 #define BWT_MAP_H
3
4 #ifdef HAVE_CONFIG_H
5 #include <config.h>
6 #endif
7
8 #include <stdint.h>
9 #include <stdio.h>
10 #include <stdlib.h>
11 #include <string>
12 #include <map>
13 #include <vector>
14 #include <cassert>
15 #include <cstring>
16 #include <seqan/sequence.h>
17
18 #include <bam/sam.h>
19 using namespace std;
20
21 #include "common.h"
22 #define _FBUF_SIZE 10*1024
23 /*
24 * bwt_map.h
25 * TopHat
26 *
27 * Created by Cole Trapnell on 11/17/08.
28 * Copyright 2008 Cole Trapnell. All rights reserved.
29 *
30 */
31
32 enum CigarOpCode { MATCH, INS, DEL, REF_SKIP, SOFT_CLIP, HARD_CLIP, PAD };
33
34 struct CigarOp
35 {
36 CigarOp(CigarOpCode o, uint32_t l) : opcode(o), length(l) {}
37 CigarOpCode opcode : 3;
38 uint32_t length : 29;
39
40 bool operator==(const CigarOp& rhs) const
41 {
42 return opcode == rhs.opcode && length == rhs.length;
43 }
44
45 };
46
47 typedef uint32_t ReadID;
48
49 class RefSequenceTable;
50
51 /* Stores the information from a single record of the bowtie map. A given read
52 may have many of these. Reads up to 255bp are supported.
53 */
54 struct BowtieHit
55 {
56 BowtieHit() :
57 _ref_id(0),
58 _insert_id(0),
59 _left(0),
60 _antisense_splice(false),
61 _antisense_aln(false),
62 _edit_dist(0),
63 _splice_mms(0),
64 _end(false){}
65
66 BowtieHit(uint32_t ref_id,
67 ReadID insert_id,
68 int left,
69 int read_len,
70 bool antisense,
71 unsigned char edit_dist,
72 bool end) :
73 _ref_id(ref_id),
74 _insert_id(insert_id),
75 _left(left),
76 _cigar(vector<CigarOp>(1,CigarOp(MATCH,read_len))),
77 _antisense_splice(false),
78 _antisense_aln(antisense),
79 _edit_dist(edit_dist),
80 _splice_mms(0),
81 _end(end)
82 {
83 assert(_cigar.capacity() == _cigar.size());
84 }
85
86 BowtieHit(uint32_t ref_id,
87 ReadID insert_id,
88 int left,
89 const vector<CigarOp>& cigar,
90 bool antisense_aln,
91 bool antisense_splice,
92 unsigned char edit_dist,
93 unsigned char splice_mms,
94 bool end) :
95 _ref_id(ref_id),
96 _insert_id(insert_id),
97 _left(left),
98 _cigar(cigar),
99 _antisense_splice(antisense_splice),
100 _antisense_aln(antisense_aln),
101 _edit_dist(edit_dist),
102 _splice_mms(splice_mms),
103 _end(end)
104 {
105 assert(_cigar.capacity() == _cigar.size());
106 }
107
108 int read_len() const
109 {
110 uint32_t len = 0;
111 for (size_t i = 0; i < _cigar.size(); ++i)
112 {
113 const CigarOp& op = _cigar[i];
114 switch(op.opcode)
115 {
116 case MATCH:
117 case INS:
118 case SOFT_CLIP:
119 len += op.length;
120 break;
121 default:
122 break;
123 }
124 }
125
126 return len;
127 }
128
129 bool operator==(const BowtieHit& rhs) const
130 {
131 return (_insert_id == rhs._insert_id &&
132 _ref_id == rhs._ref_id &&
133 _antisense_aln == rhs._antisense_aln &&
134 _left == rhs._left &&
135 _antisense_splice == rhs._antisense_splice &&
136 _edit_dist == rhs._edit_dist &&
137 /* DO NOT USE ACCEPTED IN COMPARISON */
138 _cigar == rhs._cigar);
139 }
140
141 bool operator<(const BowtieHit& rhs) const
142 {
143 if (_insert_id != rhs._insert_id)
144 return _insert_id < rhs._insert_id;
145 if (_ref_id != rhs._ref_id)
146 return _ref_id < rhs._ref_id;
147 if (_left != rhs._left)
148 return _left < rhs._left;
149 if (_antisense_aln != rhs._antisense_aln)
150 return _antisense_aln < rhs._antisense_aln;
151 if (_edit_dist != rhs._edit_dist)
152 return _edit_dist < rhs._edit_dist;
153 if (_cigar != rhs._cigar)
154 {
155 if (_cigar.size() != rhs._cigar.size())
156 return _cigar.size() < rhs._cigar.size();
157 for (size_t i = 0; i < _cigar.size(); ++i)
158 {
159 if (!(_cigar[i] == rhs._cigar[i]))
160 return (_cigar[i].opcode < rhs._cigar[i].opcode || (_cigar[i].opcode == rhs._cigar[i].opcode && _cigar[i].length < rhs._cigar[i].length));
161 }
162 }
163 return false;
164 }
165
166 uint32_t ref_id() const { return _ref_id; }
167 ReadID insert_id() const { return _insert_id; }
168 int left() const { return _left; }
169 int right() const
170 {
171 int r = _left;
172 for (size_t i = 0; i < _cigar.size(); ++i)
173 {
174 const CigarOp& op = _cigar[i];
175
176 switch(op.opcode)
177 {
178 case MATCH:
179 case REF_SKIP:
180 case DEL:
181 r += op.length;
182 break;
183 default:
184 break;
185 }
186 }
187 return r;
188 }
189
190 bool is_spliced()
191 {
192 for (size_t i = 0; i < _cigar.size(); ++i)
193 {
194 const CigarOp& op = _cigar[i];
195
196 if (op.opcode == REF_SKIP)
197 return true;
198 }
199 return false;
200 }
201
202 bool antisense_splice() const { return _antisense_splice; }
203 bool antisense_align() const { return _antisense_aln; }
204
205 unsigned char edit_dist() const { return _edit_dist; }
206 unsigned char splice_mms() const { return _splice_mms; }
207
208 // For convenience, if you just want a copy of the gap intervals
209 // for this hit.
210 void gaps(vector<pair<int,int> >& gaps_out) const
211 {
212 gaps_out.clear();
213 int pos = _left;
214 for (size_t i = 0; i < _cigar.size(); ++i)
215 {
216 const CigarOp& op = _cigar[i];
217
218 switch(op.opcode)
219 {
220 case REF_SKIP:
221 gaps_out.push_back(make_pair(pos, pos + op.length - 1));
222 pos += op.length;
223 break;
224 case MATCH:
225 pos += op.length;
226 break;
227 default:
228 break;
229 }
230 }
231 }
232
233 const vector<CigarOp>& cigar() const { return _cigar; }
234
235
236 bool contiguous() const
237 {
238 return _cigar.size() == 1 && _cigar[0].opcode == MATCH;
239 }
240
241 const string& hitfile_rec() const { return _hitfile_rec; }
242 void hitfile_rec(const string& rec) { _hitfile_rec = rec; }
243
244 const string& seq() const { return _seq; }
245 void seq(const string& seq) { _seq = seq; }
246
247 const string& qual() const { return _qual; }
248 void qual(const string& qual) { _qual = qual; }
249
250 bool end() const { return _end; }
251 void end(bool end) { _end = end; }
252
253 // this is for debugging purpose
254 bool check_editdist_consistency(const RefSequenceTable& rt);
255
256 private:
257
258 uint32_t _ref_id;
259 ReadID _insert_id; // Id of the sequencing insert
260 int _left; // Position in the reference of the left side of the alignment
261
262 vector<CigarOp> _cigar;
263
264 bool _antisense_splice; // Whether the junction spanned is on the reverse strand
265 bool _antisense_aln; // Whether the alignment is to the reverse strand
266
267 unsigned char _edit_dist; // Total mismatches (note this is not including insertions or deletions as mismatches, ie, not equivalent to NM field of a SAM record)
268 unsigned char _splice_mms; // Mismatches within min_anchor_len of a splice junction
269 string _hitfile_rec; // Points to the buffer for the record from which this hit came
270 string _seq;
271 string _qual;
272 bool _end; // Whether this segment is the last one of the read it belongs to
273 };
274
275 class ReadTable
276 {
277 public:
278
279 ReadTable() :
280 _next_id(1) {}
281
282 // This function should NEVER return zero
283 ReadID get_id(const string& name)
284 {
285 uint32_t _id = atoi(name.c_str());
286 //assert(_id);
287 _next_id = max(_next_id, (size_t)_id);
288 return _id;
289 }
290
291
292 uint32_t observation_order(ReadID ID)
293 {
294 if (ID == 0)
295 return 0xFFFFFFFF;
296 return ID;
297 }
298
299
300 size_t size() const { return _next_id; }
301
302 private:
303 size_t _next_id;
304 };
305
306 class RefSequenceTable
307 {
308 public:
309
310 typedef seqan::String<seqan::Dna5, seqan::Packed<seqan::Alloc<> > > Sequence;
311
312 struct SequenceInfo
313 {
314 SequenceInfo(uint32_t _order,
315 char* _name,
316 Sequence* _seq, uint32_t _len) :
317 observation_order(_order),
318 name(_name),
319 seq(_seq),
320 len(_len) {}
321
322 uint32_t observation_order;
323 char* name;
324 Sequence* seq;
325 uint32_t len;
326 };
327
328 typedef map<string, uint64_t> IDTable;
329 typedef map<uint32_t, SequenceInfo> InvertedIDTable;
330 typedef InvertedIDTable::iterator iterator;
331 typedef InvertedIDTable::const_iterator const_iterator;
332
333 RefSequenceTable(bool keep_names, bool keep_seqs = false) :
334 _next_id(1),
335 _keep_names(keep_names) {}
336
337 RefSequenceTable(const string& sam_header_filename,
338 bool keep_names,
339 bool keep_seqs = false) :
340 _next_id(1),
341 _keep_names(keep_names)
342 {
343 if (sam_header_filename != "")
344 {
345 samfile_t* fh = samopen(sam_header_filename.c_str(), "r", 0);
346 if (fh == 0) {
347 fprintf(stderr, "Failed to open SAM header file %s\n", sam_header_filename.c_str());
348 exit(1);
349 }
350
351 for (size_t i = 0; i < (size_t)fh->header->n_targets; ++i)
352 {
353 const char* name = fh->header->target_name[i];
354 uint32_t len = fh->header->target_len[i];
355 get_id(name, NULL, len);
356 //fprintf(stderr, "SQ: %s - %d\n", name, len);
357 }
358 }
359 }
360
361 // This function should NEVER return zero
362 uint32_t get_id(const string& name,
363 Sequence* seq,
364 uint32_t len)
365 {
366 uint32_t _id = hash_string(name.c_str());
367 pair<InvertedIDTable::iterator, bool> ret =
368 _by_id.insert(make_pair(_id, SequenceInfo(_next_id, NULL, NULL, 0)));
369 if (ret.second == true)
370 {
371 char* _name = NULL;
372 if (_keep_names)
373 _name = strdup(name.c_str());
374 ret.first->second.name = _name;
375 ret.first->second.seq = seq;
376 ret.first->second.len = len;
377 ++_next_id;
378 }
379 assert (_id);
380 return _id;
381 }
382
383 const char* get_name(uint32_t ID) const
384 {
385 InvertedIDTable::const_iterator itr = _by_id.find(ID);
386 if (itr != _by_id.end())
387 return itr->second.name;
388 else
389 return NULL;
390 }
391
392 uint32_t get_len(uint32_t ID) const
393 {
394 InvertedIDTable::const_iterator itr = _by_id.find(ID);
395 if (itr != _by_id.end())
396 return itr->second.len;
397 else
398 return 0;
399 }
400
401 Sequence* get_seq(uint32_t ID) const
402 {
403 InvertedIDTable::const_iterator itr = _by_id.find(ID);
404 if (itr != _by_id.end())
405 return itr->second.seq;
406 else
407 return NULL;
408 }
409
410 const SequenceInfo* get_info(uint32_t ID) const
411 {
412
413 InvertedIDTable::const_iterator itr = _by_id.find(ID);
414 if (itr != _by_id.end())
415 {
416 return &(itr->second);
417 }
418 else
419 return NULL;
420 }
421
422 int observation_order(uint32_t ID) const
423 {
424 InvertedIDTable::const_iterator itr = _by_id.find(ID);
425 if (itr != _by_id.end())
426 {
427 return itr->second.observation_order;
428 }
429 else
430 return -1;
431 }
432
433 iterator begin() { return _by_id.begin(); }
434 iterator end() { return _by_id.end(); }
435
436 const_iterator begin() const { return _by_id.begin(); }
437 const_iterator end() const { return _by_id.end(); }
438
439 size_t size() const { return _by_id.size(); }
440
441 void clear()
442 {
443 //_by_name.clear();
444 _by_id.clear();
445 }
446
447 private:
448
449 // This is FNV-1, see http://en.wikipedia.org/wiki/Fowler_Noll_Vo_hash
450 inline uint32_t hash_string(const char* __s)
451 {
452 uint32_t hash = 0x811c9dc5;
453 for ( ; *__s; ++__s)
454 {
455 hash *= 16777619;
456 hash ^= *__s;
457 }
458 return hash;
459 }
460
461 //IDTable _by_name;
462 uint32_t _next_id;
463 bool _keep_names;
464 InvertedIDTable _by_id;
465 };
466
467
468 bool hit_insert_id_lt(const BowtieHit& h1, const BowtieHit& h2);
469
470 typedef vector<BowtieHit> HitList;
471
472 /* This class stores all the hits from a Bowtie map */
473
474 class HitTable
475 {
476 public:
477
478 typedef map<uint64_t, HitList> RefHits;
479 typedef RefHits::const_iterator const_iterator;
480 typedef RefHits::iterator iterator;
481
482 HitTable() : _total_hits(0) {}
483
484 const_iterator begin() const { return _hits_for_ref.begin(); }
485 const_iterator end() const { return _hits_for_ref.end(); }
486
487 iterator begin() { return _hits_for_ref.begin(); }
488 iterator end() { return _hits_for_ref.end(); }
489
490 void add_hit(const BowtieHit& bh, bool check_uniqueness);
491
492 void finalize()
493 {
494 for (RefHits::iterator i = _hits_for_ref.begin();
495 i != _hits_for_ref.end();
496 ++i)
497 {
498 sort(i->second.begin(), i->second.end(), hit_insert_id_lt);
499 }
500 }
501
502 HitList* get_hits(uint64_t ref_id)
503 {
504 RefHits::iterator i = _hits_for_ref.find(ref_id);
505 if (i == _hits_for_ref.end())
506 return NULL;
507 else
508 return &(i->second);
509 }
510
511 uint32_t total_hits() const { return _total_hits; }
512
513 private:
514 RefHits _hits_for_ref;
515 uint32_t _total_hits;
516 };
517
518 class HitStream;
519
520 class HitFactory {
521 friend class HitStream;
522 public:
523 HitFactory(ReadTable& insert_table,
524 RefSequenceTable& reference_table) :
525 _insert_table(insert_table), _ref_table(reference_table)
526 {}
527 virtual ~HitFactory() {}
528 virtual void openStream(HitStream& hs)=0;
529 virtual void rewind(HitStream& hs)=0;
530 virtual void closeStream(HitStream& hs)=0;
531 BowtieHit create_hit(const string& insert_name,
532 const string& ref_name,
533 int left,
534 const vector<CigarOp>& cigar,
535 bool antisense_aln,
536 bool antisense_splice,
537 unsigned char edit_dist,
538 unsigned char splice_mms,
539 bool end);
540
541 BowtieHit create_hit(const string& insert_name,
542 const string& ref_name,
543 uint32_t left,
544 uint32_t read_len,
545 bool antisense_aln,
546 unsigned char edit_dist,
547 bool end);
548
549 virtual string hitfile_rec(HitStream& hs, const char* hit_buf)=0;
550 virtual bool next_record(HitStream& hs, const char*& buf, size_t& buf_size) = 0;
551 virtual bool get_hit_from_buf(const char* bwt_buf,
552 BowtieHit& bh,
553 bool strip_slash,
554 char* name_out = NULL,
555 char* name_tags = NULL,
556 char* seq = NULL,
557 char* qual = NULL) = 0;
558
559 protected:
560 ReadTable& _insert_table;
561 RefSequenceTable& _ref_table;
562 HitStream* _hit_stream;
563 };//class HitFactory
564
565
566 class LineHitFactory : public HitFactory {
567 //for text line-based formats like Bowtie and SAM
568 public:
569 LineHitFactory(ReadTable& insert_table,
570 RefSequenceTable& reference_table) :
571 HitFactory(insert_table, reference_table) {}
572
573 string hitfile_rec(HitStream& hs, const char* hit_buf) {
574 string r(hit_buf);
575 return r;
576 }
577 void openStream(HitStream& hs);
578 void rewind(HitStream& hs);
579 void closeStream(HitStream& hs);
580 bool next_record(HitStream& hs, const char*& buf, size_t& buf_size);
581 protected:
582 static const size_t _hit_buf_max_sz = 10 * 1024;
583 char _hit_buf[_hit_buf_max_sz];
584 int _line_num;
585 };
586
587 class BowtieHitFactory : public LineHitFactory {
588 public:
589 BowtieHitFactory(ReadTable& insert_table,
590 RefSequenceTable& reference_table) :
591 LineHitFactory(insert_table, reference_table) {}
592
593 bool get_hit_from_buf(const char* bwt_buf,
594 BowtieHit& bh,
595 bool strip_slash,
596 char* name_out = NULL,
597 char* name_tags = NULL,
598 char* seq = NULL,
599 char* qual = NULL);
600 };
601
602 class SplicedBowtieHitFactory : public LineHitFactory {
603 public:
604 SplicedBowtieHitFactory(ReadTable& insert_table,
605 RefSequenceTable& reference_table,
606 int anchor_length) :
607 LineHitFactory(insert_table, reference_table),
608 _anchor_length(anchor_length){}
609
610 bool get_hit_from_buf(const char* bwt_buf,
611 BowtieHit& bh,
612 bool strip_slash,
613 char* name_out = NULL,
614 char* name_tags = NULL,
615 char* seq = NULL,
616 char* qual = NULL);
617 private:
618 int _anchor_length;
619 int _seg_offset;
620 int _size_buf;
621 };
622
623 class SAMHitFactory : public LineHitFactory {
624 public:
625 SAMHitFactory(ReadTable& insert_table,
626 RefSequenceTable& reference_table) :
627 LineHitFactory(insert_table, reference_table) {}
628
629 bool get_hit_from_buf(const char* bwt_buf,
630 BowtieHit& bh,
631 bool strip_slash,
632 char* name_out = NULL,
633 char* name_tags = NULL,
634 char* seq = NULL,
635 char* qual = NULL);
636 };
637
638
639 /******************************************************************************
640 BAMHitFactory turns SAM alignments into BowtieHits
641 *******************************************************************************/
642 class BAMHitFactory : public HitFactory {
643 public:
644
645 BAMHitFactory(ReadTable& insert_table,
646 RefSequenceTable& reference_table) :
647 HitFactory(insert_table, reference_table)
648 {
649 _sam_header=NULL;
650 }
651 void openStream(HitStream& hs);
652 void rewind(HitStream& hs);
653 void closeStream(HitStream& hs);
654
655 bool next_record(HitStream& hs, const char*& buf, size_t& buf_size);
656
657 /*void mark_curr_pos()
658 {
659 _curr_pos = bgzf_tell(((samfile_t*)_hit_file)->x.bam);
660 }
661
662
663 void undo_hit()
664 {
665 bgzf_seek(((samfile_t*)_hit_file)->x.bam, _curr_pos, SEEK_SET);
666 _eof = false;
667 }
668 */
669 string hitfile_rec(HitStream& hs, const char* hit_buf);
670
671 bool get_hit_from_buf(const char* bwt_buf,
672 BowtieHit& bh,
673 bool strip_slash,
674 char* name_out = NULL,
675 char* name_tags = NULL,
676 char* seq = NULL,
677 char* qual = NULL);
678 private:
679 //int64_t _curr_pos;
680 //int64_t _beginning;
681 bam1_t _next_hit;
682 bam_header_t* _sam_header;
683 bool inspect_header(HitStream& hs);
684 };
685
686
687 struct HitsForRead
688 {
689 HitsForRead() : insert_id(0) {}
690 uint64_t insert_id;
691 vector<BowtieHit> hits;
692 };
693
694 class HitStream
695 {
696 friend class HitFactory;
697 friend class LineHitFactory;
698 friend class BAMHitFactory;
699 //private:
700 HitFactory* _factory;
701 bool _spliced;
702 bool _strip_slash;
703 BowtieHit buffered_hit;
704 bool _keep_bufs;
705 bool _keep_seqs;
706 bool _keep_quals;
707 bool _from_bowtie;
708 void* _hit_file;
709 string _hit_file_name;
710 FZPipe* _fzpipe;
711 bool _eof;
712
713 public:
714 HitStream(void* hit_file, //could be FILE* or samfile_t*
715 HitFactory* hit_factory,
716 bool spliced,
717 bool strip_slash,
718 bool keep_bufs,
719 bool keep_seqs = false,
720 bool keep_quals = false,
721 bool from_bowtie = false) :
722 _factory(hit_factory),
723 _spliced(spliced),
724 _strip_slash(strip_slash),
725 buffered_hit(BowtieHit()),
726 _keep_bufs(keep_bufs),
727 _keep_seqs(keep_seqs),
728 _keep_quals(keep_quals),
729 _from_bowtie(from_bowtie),
730 _hit_file(hit_file),
731 _hit_file_name(),
732 _fzpipe(NULL),
733 _eof(false)
734 {
735 primeStream();
736 }
737
738 HitStream(string& hit_filename,
739 HitFactory* hit_factory,
740 bool spliced,
741 bool strip_slash,
742 bool keep_bufs,
743 bool keep_seqs = false,
744 bool keep_quals = false,
745 bool from_bowtie = false) :
746 _factory(hit_factory),
747 _spliced(spliced),
748 _strip_slash(strip_slash),
749 buffered_hit(BowtieHit()),
750 _keep_bufs(keep_bufs),
751 _keep_seqs(keep_seqs),
752 _keep_quals(keep_quals),
753 _from_bowtie(from_bowtie),
754 _hit_file(NULL),
755 _hit_file_name(hit_filename),
756 _fzpipe(NULL),
757 _eof(false)
758 {
759 _factory->openStream(*this);
760 primeStream();
761 }
762
763 HitStream(FZPipe& hit_filepipe,
764 HitFactory* hit_factory,
765 bool spliced,
766 bool strip_slash,
767 bool keep_bufs,
768 bool keep_seqs = false,
769 bool keep_quals = false,
770 bool from_bowtie = false) :
771 _factory(hit_factory),
772 _spliced(spliced),
773 _strip_slash(strip_slash),
774 buffered_hit(BowtieHit()),
775 _keep_bufs(keep_bufs),
776 _keep_seqs(keep_seqs),
777 _keep_quals(keep_quals),
778 _from_bowtie(from_bowtie),
779 _hit_file(NULL),
780 _hit_file_name(),
781 _fzpipe(&hit_filepipe),
782 _eof(false)
783 {
784 _hit_file=_fzpipe->file;
785 primeStream();
786 }
787
788 void primeStream() { //why?
789 // Prime the stream by reading a single hit into the buffered_hit
790 HitsForRead dummy = HitsForRead();
791 next_read_hits(dummy);
792 }
793 bool eof() { return _eof; }
794 bool ready() { return (_hit_file!=NULL); }
795 void reset() {
796 _factory->rewind(*this);
797 _eof=false;
798 // re-prime the stream;
799 buffered_hit = BowtieHit();
800 primeStream();
801 }
802
803 bool next_read_hits(HitsForRead& hits_for_read)
804 {
805 hits_for_read.hits.clear();
806 hits_for_read.insert_id = 0;
807
808 //if (!_hit_file || (feof(_hit_file) && buffered_hit.insert_id() == 0))
809 // return false;
810 if (!this->ready())
811 //err_die("Error: next_read_hits() called on HitFactory with no file handle\n");
812 return false;
813 if (this->eof() && buffered_hit.insert_id() == 0) {
814 return false;
815 }
816
817 //char bwt_buf[2048]; bwt_buf[0] = 0;
818 char bwt_seq[2048]; bwt_seq[0] = 0;
819 char bwt_qual[2048]; bwt_qual[0] = 0;
820
821 char* seq = _keep_seqs ? bwt_seq : NULL;
822 char* qual = _keep_quals ? bwt_qual : NULL;
823
824 hits_for_read.insert_id = buffered_hit.insert_id();
825 if (hits_for_read.insert_id)
826 hits_for_read.hits.push_back(buffered_hit);
827 const char* hit_buf;
828 size_t hit_buf_size = 0;
829 while (true) {
830 if (!_factory->next_record(*this, hit_buf, hit_buf_size)) {
831 buffered_hit = BowtieHit();
832 break; }
833 //string clean_buf = bwt_buf;
834 // Get a new record from the tab-delimited Bowtie map
835 BowtieHit bh;
836 if (_factory->get_hit_from_buf(hit_buf, bh, _strip_slash,
837 NULL, NULL, seq, qual)) {
838 if (_keep_bufs)
839 bh.hitfile_rec(_factory->hitfile_rec(*this, hit_buf));
840
841 if (_keep_seqs)
842 bh.seq(seq);
843
844 if (_keep_quals) {
845 // when it comes to convert from qual in color to qual in bp,
846 // we need to fill in the two extream qual values using the adjacent qual values.
847 size_t qual_len = strlen(qual);
848 if (color && !color_out && qual_len > 2) {
849 qual[0] = qual[1];
850 qual[qual_len-1] = qual[qual_len-2];
851 }
852 bh.qual(qual);
853 }
854
855 if (bh.insert_id() == hits_for_read.insert_id) {
856 hits_for_read.hits.push_back(bh);
857 }
858 else {
859 buffered_hit = bh;
860 break;
861 }
862 } //hit parsed
863 } //while reading hits
864
865 return (!hits_for_read.hits.empty() && hits_for_read.insert_id != 0);
866 }
867
868 uint64_t next_group_id() const
869 {
870 return buffered_hit.insert_id();
871 }
872 bool fromBowtie() { return _from_bowtie; }
873 };
874
875 typedef uint32_t MateStatusMask;
876
877 void get_mapped_reads(FILE* bwtf,
878 HitTable& hits,
879 HitFactory& hit_factory,
880 bool strip_slash,
881 bool verbose = false);
882
883 //bool left_status_better(MateStatusMask left, MateStatusMask right);
884 //bool status_equivalent(MateStatusMask left, MateStatusMask right);
885 typedef uint32_t MateStatusMask;
886
887 void add_hits_to_coverage(const HitList& hits, vector<unsigned short>& DoC);
888 void add_hit_to_coverage(const BowtieHit& bh, vector<unsigned int>& DoC);
889
890 void accept_all_hits(HitTable& hits);
891
892 //print BowtieHit as SAM line
893 void print_hit(FILE* fout,
894 const char* read_name,
895 const BowtieHit& bh,
896 const char* ref_name,
897 const char* sequence,
898 const char* qualities,
899 bool from_bowtie = false);
900
901 //print BowtieHit as BAM record
902 void print_bamhit(GBamWriter& wbam,
903 const char* read_name,
904 const BowtieHit& bh,
905 const char* ref_name,
906 const char* sequence,
907 const char* qualities,
908 bool from_bowtie = false);
909
910 /**
911 * Convert a vector of CigarOps to a string representation
912 */
913 std::string print_cigar(vector<CigarOp>& bh_cigar);
914
915 #endif