ViewVC Help
View File | Revision Log | Show Annotations | View Changeset | Root Listing
root/gclib/tophat_cpp/bwt_map.h
Revision: 29
Committed: Tue Aug 2 21:24:54 2011 UTC (9 years ago) by gpertea
File size: 22727 byte(s)
Log Message:
adding tophat source work

Line User Rev File contents
1 gpertea 29 #ifndef BWT_MAP_H
2     #define BWT_MAP_H
3    
4     #ifdef HAVE_CONFIG_H
5     #include <config.h>
6     #endif
7    
8     #include <stdint.h>
9     #include <stdio.h>
10     #include <stdlib.h>
11     #include <string>
12     #include <map>
13     #include <vector>
14     #include <cassert>
15     #include <cstring>
16     #include <seqan/sequence.h>
17    
18     #include <bam/sam.h>
19     using namespace std;
20    
21     #include "common.h"
22     #define _FBUF_SIZE 10*1024
23     /*
24     * bwt_map.h
25     * TopHat
26     *
27     * Created by Cole Trapnell on 11/17/08.
28     * Copyright 2008 Cole Trapnell. All rights reserved.
29     *
30     */
31    
32     enum CigarOpCode { MATCH, INS, DEL, REF_SKIP, SOFT_CLIP, HARD_CLIP, PAD };
33    
34     struct CigarOp
35     {
36     CigarOp(CigarOpCode o, uint32_t l) : opcode(o), length(l) {}
37     CigarOpCode opcode : 3;
38     uint32_t length : 29;
39    
40     bool operator==(const CigarOp& rhs) const
41     {
42     return opcode == rhs.opcode && length == rhs.length;
43     }
44    
45     };
46    
47     typedef uint32_t ReadID;
48    
49     class RefSequenceTable;
50    
51     /* Stores the information from a single record of the bowtie map. A given read
52     may have many of these. Reads up to 255bp are supported.
53     */
54     struct BowtieHit
55     {
56     BowtieHit() :
57     _ref_id(0),
58     _insert_id(0),
59     _left(0),
60     _antisense_splice(false),
61     _antisense_aln(false),
62     _edit_dist(0),
63     _splice_mms(0),
64     _end(false){}
65    
66     BowtieHit(uint32_t ref_id,
67     ReadID insert_id,
68     int left,
69     int read_len,
70     bool antisense,
71     unsigned char edit_dist,
72     bool end) :
73     _ref_id(ref_id),
74     _insert_id(insert_id),
75     _left(left),
76     _cigar(vector<CigarOp>(1,CigarOp(MATCH,read_len))),
77     _antisense_splice(false),
78     _antisense_aln(antisense),
79     _edit_dist(edit_dist),
80     _splice_mms(0),
81     _end(end)
82     {
83     assert(_cigar.capacity() == _cigar.size());
84     }
85    
86     BowtieHit(uint32_t ref_id,
87     ReadID insert_id,
88     int left,
89     const vector<CigarOp>& cigar,
90     bool antisense_aln,
91     bool antisense_splice,
92     unsigned char edit_dist,
93     unsigned char splice_mms,
94     bool end) :
95     _ref_id(ref_id),
96     _insert_id(insert_id),
97     _left(left),
98     _cigar(cigar),
99     _antisense_splice(antisense_splice),
100     _antisense_aln(antisense_aln),
101     _edit_dist(edit_dist),
102     _splice_mms(splice_mms),
103     _end(end)
104     {
105     assert(_cigar.capacity() == _cigar.size());
106     }
107    
108     int read_len() const
109     {
110     uint32_t len = 0;
111     for (size_t i = 0; i < _cigar.size(); ++i)
112     {
113     const CigarOp& op = _cigar[i];
114     switch(op.opcode)
115     {
116     case MATCH:
117     case INS:
118     case SOFT_CLIP:
119     len += op.length;
120     break;
121     default:
122     break;
123     }
124     }
125    
126     return len;
127     }
128    
129     bool operator==(const BowtieHit& rhs) const
130     {
131     return (_insert_id == rhs._insert_id &&
132     _ref_id == rhs._ref_id &&
133     _antisense_aln == rhs._antisense_aln &&
134     _left == rhs._left &&
135     _antisense_splice == rhs._antisense_splice &&
136     _edit_dist == rhs._edit_dist &&
137     /* DO NOT USE ACCEPTED IN COMPARISON */
138     _cigar == rhs._cigar);
139     }
140    
141     bool operator<(const BowtieHit& rhs) const
142     {
143     if (_insert_id != rhs._insert_id)
144     return _insert_id < rhs._insert_id;
145     if (_ref_id != rhs._ref_id)
146     return _ref_id < rhs._ref_id;
147     if (_left != rhs._left)
148     return _left < rhs._left;
149     if (_antisense_aln != rhs._antisense_aln)
150     return _antisense_aln < rhs._antisense_aln;
151     if (_edit_dist != rhs._edit_dist)
152     return _edit_dist < rhs._edit_dist;
153     if (_cigar != rhs._cigar)
154     {
155     if (_cigar.size() != rhs._cigar.size())
156     return _cigar.size() < rhs._cigar.size();
157     for (size_t i = 0; i < _cigar.size(); ++i)
158     {
159     if (!(_cigar[i] == rhs._cigar[i]))
160     return (_cigar[i].opcode < rhs._cigar[i].opcode || (_cigar[i].opcode == rhs._cigar[i].opcode && _cigar[i].length < rhs._cigar[i].length));
161     }
162     }
163     return false;
164     }
165    
166     uint32_t ref_id() const { return _ref_id; }
167     ReadID insert_id() const { return _insert_id; }
168     int left() const { return _left; }
169     int right() const
170     {
171     int r = _left;
172     for (size_t i = 0; i < _cigar.size(); ++i)
173     {
174     const CigarOp& op = _cigar[i];
175    
176     switch(op.opcode)
177     {
178     case MATCH:
179     case REF_SKIP:
180     case DEL:
181     r += op.length;
182     break;
183     default:
184     break;
185     }
186     }
187     return r;
188     }
189    
190     bool is_spliced()
191     {
192     for (size_t i = 0; i < _cigar.size(); ++i)
193     {
194     const CigarOp& op = _cigar[i];
195    
196     if (op.opcode == REF_SKIP)
197     return true;
198     }
199     return false;
200     }
201    
202     bool antisense_splice() const { return _antisense_splice; }
203     bool antisense_align() const { return _antisense_aln; }
204    
205     unsigned char edit_dist() const { return _edit_dist; }
206     unsigned char splice_mms() const { return _splice_mms; }
207    
208     // For convenience, if you just want a copy of the gap intervals
209     // for this hit.
210     void gaps(vector<pair<int,int> >& gaps_out) const
211     {
212     gaps_out.clear();
213     int pos = _left;
214     for (size_t i = 0; i < _cigar.size(); ++i)
215     {
216     const CigarOp& op = _cigar[i];
217    
218     switch(op.opcode)
219     {
220     case REF_SKIP:
221     gaps_out.push_back(make_pair(pos, pos + op.length - 1));
222     pos += op.length;
223     break;
224     case MATCH:
225     pos += op.length;
226     break;
227     default:
228     break;
229     }
230     }
231     }
232    
233     const vector<CigarOp>& cigar() const { return _cigar; }
234    
235    
236     bool contiguous() const
237     {
238     return _cigar.size() == 1 && _cigar[0].opcode == MATCH;
239     }
240    
241     const string& hitfile_rec() const { return _hitfile_rec; }
242     void hitfile_rec(const string& rec) { _hitfile_rec = rec; }
243    
244     const string& seq() const { return _seq; }
245     void seq(const string& seq) { _seq = seq; }
246    
247     const string& qual() const { return _qual; }
248     void qual(const string& qual) { _qual = qual; }
249    
250     bool end() const { return _end; }
251     void end(bool end) { _end = end; }
252    
253     // this is for debugging purpose
254     bool check_editdist_consistency(const RefSequenceTable& rt);
255    
256     private:
257    
258     uint32_t _ref_id;
259     ReadID _insert_id; // Id of the sequencing insert
260     int _left; // Position in the reference of the left side of the alignment
261    
262     vector<CigarOp> _cigar;
263    
264     bool _antisense_splice; // Whether the junction spanned is on the reverse strand
265     bool _antisense_aln; // Whether the alignment is to the reverse strand
266    
267     unsigned char _edit_dist; // Total mismatches (note this is not including insertions or deletions as mismatches, ie, not equivalent to NM field of a SAM record)
268     unsigned char _splice_mms; // Mismatches within min_anchor_len of a splice junction
269     string _hitfile_rec; // Points to the buffer for the record from which this hit came
270     string _seq;
271     string _qual;
272     bool _end; // Whether this segment is the last one of the read it belongs to
273     };
274    
275     class ReadTable
276     {
277     public:
278    
279     ReadTable() :
280     _next_id(1) {}
281    
282     // This function should NEVER return zero
283     ReadID get_id(const string& name)
284     {
285     uint32_t _id = atoi(name.c_str());
286     //assert(_id);
287     _next_id = max(_next_id, (size_t)_id);
288     return _id;
289     }
290    
291    
292     uint32_t observation_order(ReadID ID)
293     {
294     if (ID == 0)
295     return 0xFFFFFFFF;
296     return ID;
297     }
298    
299    
300     size_t size() const { return _next_id; }
301    
302     private:
303     size_t _next_id;
304     };
305    
306     class RefSequenceTable
307     {
308     public:
309    
310     typedef seqan::String<seqan::Dna5, seqan::Packed<seqan::Alloc<> > > Sequence;
311    
312     struct SequenceInfo
313     {
314     SequenceInfo(uint32_t _order,
315     char* _name,
316     Sequence* _seq, uint32_t _len) :
317     observation_order(_order),
318     name(_name),
319     seq(_seq),
320     len(_len) {}
321    
322     uint32_t observation_order;
323     char* name;
324     Sequence* seq;
325     uint32_t len;
326     };
327    
328     typedef map<string, uint64_t> IDTable;
329     typedef map<uint32_t, SequenceInfo> InvertedIDTable;
330     typedef InvertedIDTable::iterator iterator;
331     typedef InvertedIDTable::const_iterator const_iterator;
332    
333     RefSequenceTable(bool keep_names, bool keep_seqs = false) :
334     _next_id(1),
335     _keep_names(keep_names) {}
336    
337     RefSequenceTable(const string& sam_header_filename,
338     bool keep_names,
339     bool keep_seqs = false) :
340     _next_id(1),
341     _keep_names(keep_names)
342     {
343     if (sam_header_filename != "")
344     {
345     samfile_t* fh = samopen(sam_header_filename.c_str(), "r", 0);
346     if (fh == 0) {
347     fprintf(stderr, "Failed to open SAM header file %s\n", sam_header_filename.c_str());
348     exit(1);
349     }
350    
351     for (size_t i = 0; i < (size_t)fh->header->n_targets; ++i)
352     {
353     const char* name = fh->header->target_name[i];
354     uint32_t len = fh->header->target_len[i];
355     get_id(name, NULL, len);
356     //fprintf(stderr, "SQ: %s - %d\n", name, len);
357     }
358     }
359     }
360    
361     // This function should NEVER return zero
362     uint32_t get_id(const string& name,
363     Sequence* seq,
364     uint32_t len)
365     {
366     uint32_t _id = hash_string(name.c_str());
367     pair<InvertedIDTable::iterator, bool> ret =
368     _by_id.insert(make_pair(_id, SequenceInfo(_next_id, NULL, NULL, 0)));
369     if (ret.second == true)
370     {
371     char* _name = NULL;
372     if (_keep_names)
373     _name = strdup(name.c_str());
374     ret.first->second.name = _name;
375     ret.first->second.seq = seq;
376     ret.first->second.len = len;
377     ++_next_id;
378     }
379     assert (_id);
380     return _id;
381     }
382    
383     const char* get_name(uint32_t ID) const
384     {
385     InvertedIDTable::const_iterator itr = _by_id.find(ID);
386     if (itr != _by_id.end())
387     return itr->second.name;
388     else
389     return NULL;
390     }
391    
392     uint32_t get_len(uint32_t ID) const
393     {
394     InvertedIDTable::const_iterator itr = _by_id.find(ID);
395     if (itr != _by_id.end())
396     return itr->second.len;
397     else
398     return 0;
399     }
400    
401     Sequence* get_seq(uint32_t ID) const
402     {
403     InvertedIDTable::const_iterator itr = _by_id.find(ID);
404     if (itr != _by_id.end())
405     return itr->second.seq;
406     else
407     return NULL;
408     }
409    
410     const SequenceInfo* get_info(uint32_t ID) const
411     {
412    
413     InvertedIDTable::const_iterator itr = _by_id.find(ID);
414     if (itr != _by_id.end())
415     {
416     return &(itr->second);
417     }
418     else
419     return NULL;
420     }
421    
422     int observation_order(uint32_t ID) const
423     {
424     InvertedIDTable::const_iterator itr = _by_id.find(ID);
425     if (itr != _by_id.end())
426     {
427     return itr->second.observation_order;
428     }
429     else
430     return -1;
431     }
432    
433     iterator begin() { return _by_id.begin(); }
434     iterator end() { return _by_id.end(); }
435    
436     const_iterator begin() const { return _by_id.begin(); }
437     const_iterator end() const { return _by_id.end(); }
438    
439     size_t size() const { return _by_id.size(); }
440    
441     void clear()
442     {
443     //_by_name.clear();
444     _by_id.clear();
445     }
446    
447     private:
448    
449     // This is FNV-1, see http://en.wikipedia.org/wiki/Fowler_Noll_Vo_hash
450     inline uint32_t hash_string(const char* __s)
451     {
452     uint32_t hash = 0x811c9dc5;
453     for ( ; *__s; ++__s)
454     {
455     hash *= 16777619;
456     hash ^= *__s;
457     }
458     return hash;
459     }
460    
461     //IDTable _by_name;
462     uint32_t _next_id;
463     bool _keep_names;
464     InvertedIDTable _by_id;
465     };
466    
467    
468     bool hit_insert_id_lt(const BowtieHit& h1, const BowtieHit& h2);
469    
470     typedef vector<BowtieHit> HitList;
471    
472     /* This class stores all the hits from a Bowtie map */
473    
474     class HitTable
475     {
476     public:
477    
478     typedef map<uint64_t, HitList> RefHits;
479     typedef RefHits::const_iterator const_iterator;
480     typedef RefHits::iterator iterator;
481    
482     HitTable() : _total_hits(0) {}
483    
484     const_iterator begin() const { return _hits_for_ref.begin(); }
485     const_iterator end() const { return _hits_for_ref.end(); }
486    
487     iterator begin() { return _hits_for_ref.begin(); }
488     iterator end() { return _hits_for_ref.end(); }
489    
490     void add_hit(const BowtieHit& bh, bool check_uniqueness);
491    
492     void finalize()
493     {
494     for (RefHits::iterator i = _hits_for_ref.begin();
495     i != _hits_for_ref.end();
496     ++i)
497     {
498     sort(i->second.begin(), i->second.end(), hit_insert_id_lt);
499     }
500     }
501    
502     HitList* get_hits(uint64_t ref_id)
503     {
504     RefHits::iterator i = _hits_for_ref.find(ref_id);
505     if (i == _hits_for_ref.end())
506     return NULL;
507     else
508     return &(i->second);
509     }
510    
511     uint32_t total_hits() const { return _total_hits; }
512    
513     private:
514     RefHits _hits_for_ref;
515     uint32_t _total_hits;
516     };
517    
518     class HitStream;
519    
520     class HitFactory {
521     friend class HitStream;
522     public:
523     HitFactory(ReadTable& insert_table,
524     RefSequenceTable& reference_table) :
525     _insert_table(insert_table), _ref_table(reference_table)
526     {}
527     virtual ~HitFactory() {}
528     virtual void openStream(HitStream& hs)=0;
529     virtual void rewind(HitStream& hs)=0;
530     virtual void closeStream(HitStream& hs)=0;
531     BowtieHit create_hit(const string& insert_name,
532     const string& ref_name,
533     int left,
534     const vector<CigarOp>& cigar,
535     bool antisense_aln,
536     bool antisense_splice,
537     unsigned char edit_dist,
538     unsigned char splice_mms,
539     bool end);
540    
541     BowtieHit create_hit(const string& insert_name,
542     const string& ref_name,
543     uint32_t left,
544     uint32_t read_len,
545     bool antisense_aln,
546     unsigned char edit_dist,
547     bool end);
548    
549     virtual string hitfile_rec(HitStream& hs, const char* hit_buf)=0;
550     virtual bool next_record(HitStream& hs, const char*& buf, size_t& buf_size) = 0;
551     virtual bool get_hit_from_buf(const char* bwt_buf,
552     BowtieHit& bh,
553     bool strip_slash,
554     char* name_out = NULL,
555     char* name_tags = NULL,
556     char* seq = NULL,
557     char* qual = NULL) = 0;
558    
559     protected:
560     ReadTable& _insert_table;
561     RefSequenceTable& _ref_table;
562     HitStream* _hit_stream;
563     };//class HitFactory
564    
565    
566     class LineHitFactory : public HitFactory {
567     //for text line-based formats like Bowtie and SAM
568     public:
569     LineHitFactory(ReadTable& insert_table,
570     RefSequenceTable& reference_table) :
571     HitFactory(insert_table, reference_table) {}
572    
573     string hitfile_rec(HitStream& hs, const char* hit_buf) {
574     string r(hit_buf);
575     return r;
576     }
577     void openStream(HitStream& hs);
578     void rewind(HitStream& hs);
579     void closeStream(HitStream& hs);
580     bool next_record(HitStream& hs, const char*& buf, size_t& buf_size);
581     protected:
582     static const size_t _hit_buf_max_sz = 10 * 1024;
583     char _hit_buf[_hit_buf_max_sz];
584     int _line_num;
585     };
586    
587     class BowtieHitFactory : public LineHitFactory {
588     public:
589     BowtieHitFactory(ReadTable& insert_table,
590     RefSequenceTable& reference_table) :
591     LineHitFactory(insert_table, reference_table) {}
592    
593     bool get_hit_from_buf(const char* bwt_buf,
594     BowtieHit& bh,
595     bool strip_slash,
596     char* name_out = NULL,
597     char* name_tags = NULL,
598     char* seq = NULL,
599     char* qual = NULL);
600     };
601    
602     class SplicedBowtieHitFactory : public LineHitFactory {
603     public:
604     SplicedBowtieHitFactory(ReadTable& insert_table,
605     RefSequenceTable& reference_table,
606     int anchor_length) :
607     LineHitFactory(insert_table, reference_table),
608     _anchor_length(anchor_length){}
609    
610     bool get_hit_from_buf(const char* bwt_buf,
611     BowtieHit& bh,
612     bool strip_slash,
613     char* name_out = NULL,
614     char* name_tags = NULL,
615     char* seq = NULL,
616     char* qual = NULL);
617     private:
618     int _anchor_length;
619     int _seg_offset;
620     int _size_buf;
621     };
622    
623     class SAMHitFactory : public LineHitFactory {
624     public:
625     SAMHitFactory(ReadTable& insert_table,
626     RefSequenceTable& reference_table) :
627     LineHitFactory(insert_table, reference_table) {}
628    
629     bool get_hit_from_buf(const char* bwt_buf,
630     BowtieHit& bh,
631     bool strip_slash,
632     char* name_out = NULL,
633     char* name_tags = NULL,
634     char* seq = NULL,
635     char* qual = NULL);
636     };
637    
638    
639     /******************************************************************************
640     BAMHitFactory turns SAM alignments into BowtieHits
641     *******************************************************************************/
642     class BAMHitFactory : public HitFactory {
643     public:
644    
645     BAMHitFactory(ReadTable& insert_table,
646     RefSequenceTable& reference_table) :
647     HitFactory(insert_table, reference_table)
648     {
649     _sam_header=NULL;
650     }
651     void openStream(HitStream& hs);
652     void rewind(HitStream& hs);
653     void closeStream(HitStream& hs);
654    
655     bool next_record(HitStream& hs, const char*& buf, size_t& buf_size);
656    
657     /*void mark_curr_pos()
658     {
659     _curr_pos = bgzf_tell(((samfile_t*)_hit_file)->x.bam);
660     }
661    
662    
663     void undo_hit()
664     {
665     bgzf_seek(((samfile_t*)_hit_file)->x.bam, _curr_pos, SEEK_SET);
666     _eof = false;
667     }
668     */
669     string hitfile_rec(HitStream& hs, const char* hit_buf);
670    
671     bool get_hit_from_buf(const char* bwt_buf,
672     BowtieHit& bh,
673     bool strip_slash,
674     char* name_out = NULL,
675     char* name_tags = NULL,
676     char* seq = NULL,
677     char* qual = NULL);
678     private:
679     //int64_t _curr_pos;
680     //int64_t _beginning;
681     bam1_t _next_hit;
682     bam_header_t* _sam_header;
683     bool inspect_header(HitStream& hs);
684     };
685    
686    
687     struct HitsForRead
688     {
689     HitsForRead() : insert_id(0) {}
690     uint64_t insert_id;
691     vector<BowtieHit> hits;
692     };
693    
694     class HitStream
695     {
696     friend class HitFactory;
697     friend class LineHitFactory;
698     friend class BAMHitFactory;
699     //private:
700     HitFactory* _factory;
701     bool _spliced;
702     bool _strip_slash;
703     BowtieHit buffered_hit;
704     bool _keep_bufs;
705     bool _keep_seqs;
706     bool _keep_quals;
707     bool _from_bowtie;
708     void* _hit_file;
709     string _hit_file_name;
710     FZPipe* _fzpipe;
711     bool _eof;
712    
713     public:
714     HitStream(void* hit_file, //could be FILE* or samfile_t*
715     HitFactory* hit_factory,
716     bool spliced,
717     bool strip_slash,
718     bool keep_bufs,
719     bool keep_seqs = false,
720     bool keep_quals = false,
721     bool from_bowtie = false) :
722     _factory(hit_factory),
723     _spliced(spliced),
724     _strip_slash(strip_slash),
725     buffered_hit(BowtieHit()),
726     _keep_bufs(keep_bufs),
727     _keep_seqs(keep_seqs),
728     _keep_quals(keep_quals),
729     _from_bowtie(from_bowtie),
730     _hit_file(hit_file),
731     _hit_file_name(),
732     _fzpipe(NULL),
733     _eof(false)
734     {
735     primeStream();
736     }
737    
738     HitStream(string& hit_filename,
739     HitFactory* hit_factory,
740     bool spliced,
741     bool strip_slash,
742     bool keep_bufs,
743     bool keep_seqs = false,
744     bool keep_quals = false,
745     bool from_bowtie = false) :
746     _factory(hit_factory),
747     _spliced(spliced),
748     _strip_slash(strip_slash),
749     buffered_hit(BowtieHit()),
750     _keep_bufs(keep_bufs),
751     _keep_seqs(keep_seqs),
752     _keep_quals(keep_quals),
753     _from_bowtie(from_bowtie),
754     _hit_file(NULL),
755     _hit_file_name(hit_filename),
756     _fzpipe(NULL),
757     _eof(false)
758     {
759     _factory->openStream(*this);
760     primeStream();
761     }
762    
763     HitStream(FZPipe& hit_filepipe,
764     HitFactory* hit_factory,
765     bool spliced,
766     bool strip_slash,
767     bool keep_bufs,
768     bool keep_seqs = false,
769     bool keep_quals = false,
770     bool from_bowtie = false) :
771     _factory(hit_factory),
772     _spliced(spliced),
773     _strip_slash(strip_slash),
774     buffered_hit(BowtieHit()),
775     _keep_bufs(keep_bufs),
776     _keep_seqs(keep_seqs),
777     _keep_quals(keep_quals),
778     _from_bowtie(from_bowtie),
779     _hit_file(NULL),
780     _hit_file_name(),
781     _fzpipe(&hit_filepipe),
782     _eof(false)
783     {
784     _hit_file=_fzpipe->file;
785     primeStream();
786     }
787    
788     void primeStream() { //why?
789     // Prime the stream by reading a single hit into the buffered_hit
790     HitsForRead dummy = HitsForRead();
791     next_read_hits(dummy);
792     }
793     bool eof() { return _eof; }
794     bool ready() { return (_hit_file!=NULL); }
795     void reset() {
796     _factory->rewind(*this);
797     _eof=false;
798     // re-prime the stream;
799     buffered_hit = BowtieHit();
800     primeStream();
801     }
802    
803     bool next_read_hits(HitsForRead& hits_for_read)
804     {
805     hits_for_read.hits.clear();
806     hits_for_read.insert_id = 0;
807    
808     //if (!_hit_file || (feof(_hit_file) && buffered_hit.insert_id() == 0))
809     // return false;
810     if (!this->ready())
811     //err_die("Error: next_read_hits() called on HitFactory with no file handle\n");
812     return false;
813     if (this->eof() && buffered_hit.insert_id() == 0) {
814     return false;
815     }
816    
817     //char bwt_buf[2048]; bwt_buf[0] = 0;
818     char bwt_seq[2048]; bwt_seq[0] = 0;
819     char bwt_qual[2048]; bwt_qual[0] = 0;
820    
821     char* seq = _keep_seqs ? bwt_seq : NULL;
822     char* qual = _keep_quals ? bwt_qual : NULL;
823    
824     hits_for_read.insert_id = buffered_hit.insert_id();
825     if (hits_for_read.insert_id)
826     hits_for_read.hits.push_back(buffered_hit);
827     const char* hit_buf;
828     size_t hit_buf_size = 0;
829     while (true) {
830     if (!_factory->next_record(*this, hit_buf, hit_buf_size)) {
831     buffered_hit = BowtieHit();
832     break; }
833     //string clean_buf = bwt_buf;
834     // Get a new record from the tab-delimited Bowtie map
835     BowtieHit bh;
836     if (_factory->get_hit_from_buf(hit_buf, bh, _strip_slash,
837     NULL, NULL, seq, qual)) {
838     if (_keep_bufs)
839     bh.hitfile_rec(_factory->hitfile_rec(*this, hit_buf));
840    
841     if (_keep_seqs)
842     bh.seq(seq);
843    
844     if (_keep_quals) {
845     // when it comes to convert from qual in color to qual in bp,
846     // we need to fill in the two extream qual values using the adjacent qual values.
847     size_t qual_len = strlen(qual);
848     if (color && !color_out && qual_len > 2) {
849     qual[0] = qual[1];
850     qual[qual_len-1] = qual[qual_len-2];
851     }
852     bh.qual(qual);
853     }
854    
855     if (bh.insert_id() == hits_for_read.insert_id) {
856     hits_for_read.hits.push_back(bh);
857     }
858     else {
859     buffered_hit = bh;
860     break;
861     }
862     } //hit parsed
863     } //while reading hits
864    
865     return (!hits_for_read.hits.empty() && hits_for_read.insert_id != 0);
866     }
867    
868     uint64_t next_group_id() const
869     {
870     return buffered_hit.insert_id();
871     }
872     bool fromBowtie() { return _from_bowtie; }
873     };
874    
875     typedef uint32_t MateStatusMask;
876    
877     void get_mapped_reads(FILE* bwtf,
878     HitTable& hits,
879     HitFactory& hit_factory,
880     bool strip_slash,
881     bool verbose = false);
882    
883     //bool left_status_better(MateStatusMask left, MateStatusMask right);
884     //bool status_equivalent(MateStatusMask left, MateStatusMask right);
885     typedef uint32_t MateStatusMask;
886    
887     void add_hits_to_coverage(const HitList& hits, vector<unsigned short>& DoC);
888     void add_hit_to_coverage(const BowtieHit& bh, vector<unsigned int>& DoC);
889    
890     void accept_all_hits(HitTable& hits);
891    
892     //print BowtieHit as SAM line
893     void print_hit(FILE* fout,
894     const char* read_name,
895     const BowtieHit& bh,
896     const char* ref_name,
897     const char* sequence,
898     const char* qualities,
899     bool from_bowtie = false);
900    
901     //print BowtieHit as BAM record
902     void print_bamhit(GBamWriter& wbam,
903     const char* read_name,
904     const BowtieHit& bh,
905     const char* ref_name,
906     const char* sequence,
907     const char* qualities,
908     bool from_bowtie = false);
909    
910     /**
911     * Convert a vector of CigarOps to a string representation
912     */
913     std::string print_cigar(vector<CigarOp>& bh_cigar);
914    
915     #endif