ViewVC Help
View File | Revision Log | Show Annotations | View Changeset | Root Listing
root/gclib/tophat_cpp/bwt_map.h
Revision: 69
Committed: Mon Sep 19 21:00:41 2011 UTC (8 years, 10 months ago) by gpertea
File size: 23276 byte(s)
Log Message:
wip - implementing SplicedSAMHitFactory::get_hit_from_buf()

Line User Rev File contents
1 gpertea 29 #ifndef BWT_MAP_H
2     #define BWT_MAP_H
3    
4     #ifdef HAVE_CONFIG_H
5     #include <config.h>
6     #endif
7    
8     #include <stdint.h>
9     #include <stdio.h>
10     #include <stdlib.h>
11     #include <string>
12     #include <map>
13     #include <vector>
14     #include <cassert>
15     #include <cstring>
16     #include <seqan/sequence.h>
17    
18     #include <bam/sam.h>
19     using namespace std;
20    
21     #include "common.h"
22     #define _FBUF_SIZE 10*1024
23     /*
24     * bwt_map.h
25     * TopHat
26     *
27     * Created by Cole Trapnell on 11/17/08.
28     * Copyright 2008 Cole Trapnell. All rights reserved.
29     *
30     */
31    
32     enum CigarOpCode { MATCH, INS, DEL, REF_SKIP, SOFT_CLIP, HARD_CLIP, PAD };
33    
34     struct CigarOp
35     {
36     CigarOp(CigarOpCode o, uint32_t l) : opcode(o), length(l) {}
37     CigarOpCode opcode : 3;
38     uint32_t length : 29;
39    
40     bool operator==(const CigarOp& rhs) const
41     {
42     return opcode == rhs.opcode && length == rhs.length;
43     }
44    
45     };
46    
47     typedef uint32_t ReadID;
48    
49     class RefSequenceTable;
50    
51     /* Stores the information from a single record of the bowtie map. A given read
52     may have many of these. Reads up to 255bp are supported.
53     */
54     struct BowtieHit
55     {
56     BowtieHit() :
57     _ref_id(0),
58     _insert_id(0),
59     _left(0),
60     _antisense_splice(false),
61     _antisense_aln(false),
62     _edit_dist(0),
63     _splice_mms(0),
64     _end(false){}
65    
66     BowtieHit(uint32_t ref_id,
67     ReadID insert_id,
68     int left,
69     int read_len,
70     bool antisense,
71     unsigned char edit_dist,
72     bool end) :
73     _ref_id(ref_id),
74     _insert_id(insert_id),
75     _left(left),
76     _cigar(vector<CigarOp>(1,CigarOp(MATCH,read_len))),
77     _antisense_splice(false),
78     _antisense_aln(antisense),
79     _edit_dist(edit_dist),
80     _splice_mms(0),
81     _end(end)
82     {
83     assert(_cigar.capacity() == _cigar.size());
84     }
85    
86     BowtieHit(uint32_t ref_id,
87     ReadID insert_id,
88     int left,
89     const vector<CigarOp>& cigar,
90     bool antisense_aln,
91     bool antisense_splice,
92     unsigned char edit_dist,
93     unsigned char splice_mms,
94     bool end) :
95     _ref_id(ref_id),
96     _insert_id(insert_id),
97     _left(left),
98     _cigar(cigar),
99     _antisense_splice(antisense_splice),
100     _antisense_aln(antisense_aln),
101     _edit_dist(edit_dist),
102     _splice_mms(splice_mms),
103     _end(end)
104     {
105     assert(_cigar.capacity() == _cigar.size());
106     }
107    
108     int read_len() const
109     {
110     uint32_t len = 0;
111     for (size_t i = 0; i < _cigar.size(); ++i)
112     {
113     const CigarOp& op = _cigar[i];
114     switch(op.opcode)
115     {
116     case MATCH:
117     case INS:
118     case SOFT_CLIP:
119     len += op.length;
120     break;
121     default:
122     break;
123     }
124     }
125    
126     return len;
127     }
128    
129     bool operator==(const BowtieHit& rhs) const
130     {
131     return (_insert_id == rhs._insert_id &&
132     _ref_id == rhs._ref_id &&
133     _antisense_aln == rhs._antisense_aln &&
134     _left == rhs._left &&
135     _antisense_splice == rhs._antisense_splice &&
136     _edit_dist == rhs._edit_dist &&
137     /* DO NOT USE ACCEPTED IN COMPARISON */
138     _cigar == rhs._cigar);
139     }
140    
141     bool operator<(const BowtieHit& rhs) const
142     {
143     if (_insert_id != rhs._insert_id)
144     return _insert_id < rhs._insert_id;
145     if (_ref_id != rhs._ref_id)
146     return _ref_id < rhs._ref_id;
147     if (_left != rhs._left)
148     return _left < rhs._left;
149     if (_antisense_aln != rhs._antisense_aln)
150     return _antisense_aln < rhs._antisense_aln;
151     if (_edit_dist != rhs._edit_dist)
152     return _edit_dist < rhs._edit_dist;
153     if (_cigar != rhs._cigar)
154     {
155     if (_cigar.size() != rhs._cigar.size())
156     return _cigar.size() < rhs._cigar.size();
157     for (size_t i = 0; i < _cigar.size(); ++i)
158     {
159     if (!(_cigar[i] == rhs._cigar[i]))
160     return (_cigar[i].opcode < rhs._cigar[i].opcode || (_cigar[i].opcode == rhs._cigar[i].opcode && _cigar[i].length < rhs._cigar[i].length));
161     }
162     }
163     return false;
164     }
165    
166     uint32_t ref_id() const { return _ref_id; }
167     ReadID insert_id() const { return _insert_id; }
168     int left() const { return _left; }
169     int right() const
170     {
171     int r = _left;
172     for (size_t i = 0; i < _cigar.size(); ++i)
173     {
174     const CigarOp& op = _cigar[i];
175    
176     switch(op.opcode)
177     {
178     case MATCH:
179     case REF_SKIP:
180     case DEL:
181     r += op.length;
182     break;
183     default:
184     break;
185     }
186     }
187     return r;
188     }
189    
190     bool is_spliced()
191     {
192     for (size_t i = 0; i < _cigar.size(); ++i)
193     {
194     const CigarOp& op = _cigar[i];
195    
196     if (op.opcode == REF_SKIP)
197     return true;
198     }
199     return false;
200     }
201    
202     bool antisense_splice() const { return _antisense_splice; }
203     bool antisense_align() const { return _antisense_aln; }
204    
205     unsigned char edit_dist() const { return _edit_dist; }
206     unsigned char splice_mms() const { return _splice_mms; }
207    
208     // For convenience, if you just want a copy of the gap intervals
209     // for this hit.
210     void gaps(vector<pair<int,int> >& gaps_out) const
211     {
212     gaps_out.clear();
213     int pos = _left;
214     for (size_t i = 0; i < _cigar.size(); ++i)
215     {
216     const CigarOp& op = _cigar[i];
217    
218     switch(op.opcode)
219     {
220     case REF_SKIP:
221     gaps_out.push_back(make_pair(pos, pos + op.length - 1));
222     pos += op.length;
223     break;
224     case MATCH:
225     pos += op.length;
226     break;
227     default:
228     break;
229     }
230     }
231     }
232    
233     const vector<CigarOp>& cigar() const { return _cigar; }
234    
235    
236     bool contiguous() const
237     {
238     return _cigar.size() == 1 && _cigar[0].opcode == MATCH;
239     }
240    
241     const string& hitfile_rec() const { return _hitfile_rec; }
242     void hitfile_rec(const string& rec) { _hitfile_rec = rec; }
243    
244     const string& seq() const { return _seq; }
245     void seq(const string& seq) { _seq = seq; }
246    
247     const string& qual() const { return _qual; }
248     void qual(const string& qual) { _qual = qual; }
249    
250     bool end() const { return _end; }
251     void end(bool end) { _end = end; }
252    
253     // this is for debugging purpose
254     bool check_editdist_consistency(const RefSequenceTable& rt);
255    
256     private:
257    
258     uint32_t _ref_id;
259     ReadID _insert_id; // Id of the sequencing insert
260     int _left; // Position in the reference of the left side of the alignment
261    
262     vector<CigarOp> _cigar;
263    
264     bool _antisense_splice; // Whether the junction spanned is on the reverse strand
265     bool _antisense_aln; // Whether the alignment is to the reverse strand
266    
267     unsigned char _edit_dist; // Total mismatches (note this is not including insertions or deletions as mismatches, ie, not equivalent to NM field of a SAM record)
268     unsigned char _splice_mms; // Mismatches within min_anchor_len of a splice junction
269     string _hitfile_rec; // Points to the buffer for the record from which this hit came
270     string _seq;
271     string _qual;
272     bool _end; // Whether this segment is the last one of the read it belongs to
273     };
274    
275     class ReadTable
276     {
277     public:
278    
279     ReadTable() :
280     _next_id(1) {}
281    
282     // This function should NEVER return zero
283     ReadID get_id(const string& name)
284     {
285     uint32_t _id = atoi(name.c_str());
286     //assert(_id);
287     _next_id = max(_next_id, (size_t)_id);
288     return _id;
289     }
290    
291    
292     uint32_t observation_order(ReadID ID)
293     {
294     if (ID == 0)
295     return 0xFFFFFFFF;
296     return ID;
297     }
298    
299    
300     size_t size() const { return _next_id; }
301    
302     private:
303     size_t _next_id;
304     };
305    
306     class RefSequenceTable
307     {
308     public:
309    
310     typedef seqan::String<seqan::Dna5, seqan::Packed<seqan::Alloc<> > > Sequence;
311    
312     struct SequenceInfo
313     {
314     SequenceInfo(uint32_t _order,
315     char* _name,
316     Sequence* _seq, uint32_t _len) :
317     observation_order(_order),
318     name(_name),
319     seq(_seq),
320     len(_len) {}
321    
322     uint32_t observation_order;
323     char* name;
324     Sequence* seq;
325     uint32_t len;
326     };
327    
328     typedef map<string, uint64_t> IDTable;
329     typedef map<uint32_t, SequenceInfo> InvertedIDTable;
330     typedef InvertedIDTable::iterator iterator;
331     typedef InvertedIDTable::const_iterator const_iterator;
332    
333     RefSequenceTable(bool keep_names, bool keep_seqs = false) :
334     _next_id(1),
335     _keep_names(keep_names) {}
336    
337     RefSequenceTable(const string& sam_header_filename,
338     bool keep_names,
339     bool keep_seqs = false) :
340     _next_id(1),
341     _keep_names(keep_names)
342     {
343     if (sam_header_filename != "")
344     {
345     samfile_t* fh = samopen(sam_header_filename.c_str(), "r", 0);
346     if (fh == 0) {
347     fprintf(stderr, "Failed to open SAM header file %s\n", sam_header_filename.c_str());
348     exit(1);
349     }
350    
351     for (size_t i = 0; i < (size_t)fh->header->n_targets; ++i)
352     {
353     const char* name = fh->header->target_name[i];
354     uint32_t len = fh->header->target_len[i];
355     get_id(name, NULL, len);
356     //fprintf(stderr, "SQ: %s - %d\n", name, len);
357     }
358     }
359     }
360    
361     // This function should NEVER return zero
362     uint32_t get_id(const string& name,
363     Sequence* seq,
364     uint32_t len)
365     {
366     uint32_t _id = hash_string(name.c_str());
367     pair<InvertedIDTable::iterator, bool> ret =
368     _by_id.insert(make_pair(_id, SequenceInfo(_next_id, NULL, NULL, 0)));
369     if (ret.second == true)
370     {
371     char* _name = NULL;
372     if (_keep_names)
373     _name = strdup(name.c_str());
374     ret.first->second.name = _name;
375     ret.first->second.seq = seq;
376     ret.first->second.len = len;
377     ++_next_id;
378     }
379     assert (_id);
380     return _id;
381     }
382    
383     const char* get_name(uint32_t ID) const
384     {
385     InvertedIDTable::const_iterator itr = _by_id.find(ID);
386     if (itr != _by_id.end())
387     return itr->second.name;
388     else
389     return NULL;
390     }
391    
392     uint32_t get_len(uint32_t ID) const
393     {
394     InvertedIDTable::const_iterator itr = _by_id.find(ID);
395     if (itr != _by_id.end())
396     return itr->second.len;
397     else
398     return 0;
399     }
400    
401     Sequence* get_seq(uint32_t ID) const
402     {
403     InvertedIDTable::const_iterator itr = _by_id.find(ID);
404     if (itr != _by_id.end())
405     return itr->second.seq;
406     else
407     return NULL;
408     }
409    
410     const SequenceInfo* get_info(uint32_t ID) const
411     {
412    
413     InvertedIDTable::const_iterator itr = _by_id.find(ID);
414     if (itr != _by_id.end())
415     {
416     return &(itr->second);
417     }
418     else
419     return NULL;
420     }
421    
422     int observation_order(uint32_t ID) const
423     {
424     InvertedIDTable::const_iterator itr = _by_id.find(ID);
425     if (itr != _by_id.end())
426     {
427     return itr->second.observation_order;
428     }
429     else
430     return -1;
431     }
432    
433     iterator begin() { return _by_id.begin(); }
434     iterator end() { return _by_id.end(); }
435    
436     const_iterator begin() const { return _by_id.begin(); }
437     const_iterator end() const { return _by_id.end(); }
438    
439     size_t size() const { return _by_id.size(); }
440    
441     void clear()
442     {
443     //_by_name.clear();
444     _by_id.clear();
445     }
446    
447     private:
448    
449     // This is FNV-1, see http://en.wikipedia.org/wiki/Fowler_Noll_Vo_hash
450     inline uint32_t hash_string(const char* __s)
451     {
452     uint32_t hash = 0x811c9dc5;
453     for ( ; *__s; ++__s)
454     {
455     hash *= 16777619;
456     hash ^= *__s;
457     }
458     return hash;
459     }
460    
461     //IDTable _by_name;
462     uint32_t _next_id;
463     bool _keep_names;
464     InvertedIDTable _by_id;
465     };
466    
467    
468     bool hit_insert_id_lt(const BowtieHit& h1, const BowtieHit& h2);
469    
470     typedef vector<BowtieHit> HitList;
471    
472     /* This class stores all the hits from a Bowtie map */
473    
474     class HitTable
475     {
476     public:
477    
478     typedef map<uint64_t, HitList> RefHits;
479     typedef RefHits::const_iterator const_iterator;
480     typedef RefHits::iterator iterator;
481    
482     HitTable() : _total_hits(0) {}
483    
484     const_iterator begin() const { return _hits_for_ref.begin(); }
485     const_iterator end() const { return _hits_for_ref.end(); }
486    
487     iterator begin() { return _hits_for_ref.begin(); }
488     iterator end() { return _hits_for_ref.end(); }
489    
490     void add_hit(const BowtieHit& bh, bool check_uniqueness);
491    
492     void finalize()
493     {
494     for (RefHits::iterator i = _hits_for_ref.begin();
495     i != _hits_for_ref.end();
496     ++i)
497     {
498     sort(i->second.begin(), i->second.end(), hit_insert_id_lt);
499     }
500     }
501    
502     HitList* get_hits(uint64_t ref_id)
503     {
504     RefHits::iterator i = _hits_for_ref.find(ref_id);
505     if (i == _hits_for_ref.end())
506     return NULL;
507     else
508     return &(i->second);
509     }
510    
511     uint32_t total_hits() const { return _total_hits; }
512    
513     private:
514     RefHits _hits_for_ref;
515     uint32_t _total_hits;
516     };
517    
518     class HitStream;
519    
520     class HitFactory {
521     friend class HitStream;
522     public:
523     HitFactory(ReadTable& insert_table,
524     RefSequenceTable& reference_table) :
525     _insert_table(insert_table), _ref_table(reference_table)
526     {}
527     virtual ~HitFactory() {}
528     virtual void openStream(HitStream& hs)=0;
529     virtual void rewind(HitStream& hs)=0;
530     virtual void closeStream(HitStream& hs)=0;
531     BowtieHit create_hit(const string& insert_name,
532     const string& ref_name,
533     int left,
534     const vector<CigarOp>& cigar,
535     bool antisense_aln,
536     bool antisense_splice,
537     unsigned char edit_dist,
538     unsigned char splice_mms,
539     bool end);
540    
541     BowtieHit create_hit(const string& insert_name,
542     const string& ref_name,
543     uint32_t left,
544     uint32_t read_len,
545     bool antisense_aln,
546     unsigned char edit_dist,
547     bool end);
548    
549     virtual string hitfile_rec(HitStream& hs, const char* hit_buf)=0;
550     virtual bool next_record(HitStream& hs, const char*& buf, size_t& buf_size) = 0;
551     virtual bool get_hit_from_buf(const char* bwt_buf,
552     BowtieHit& bh,
553     bool strip_slash,
554     char* name_out = NULL,
555     char* name_tags = NULL,
556     char* seq = NULL,
557     char* qual = NULL) = 0;
558    
559     protected:
560     ReadTable& _insert_table;
561     RefSequenceTable& _ref_table;
562     HitStream* _hit_stream;
563     };//class HitFactory
564    
565    
566     class LineHitFactory : public HitFactory {
567     //for text line-based formats like Bowtie and SAM
568     public:
569     LineHitFactory(ReadTable& insert_table,
570     RefSequenceTable& reference_table) :
571     HitFactory(insert_table, reference_table) {}
572    
573     string hitfile_rec(HitStream& hs, const char* hit_buf) {
574     string r(hit_buf);
575     return r;
576     }
577     void openStream(HitStream& hs);
578     void rewind(HitStream& hs);
579     void closeStream(HitStream& hs);
580     bool next_record(HitStream& hs, const char*& buf, size_t& buf_size);
581     protected:
582     static const size_t _hit_buf_max_sz = 10 * 1024;
583     char _hit_buf[_hit_buf_max_sz];
584     int _line_num;
585     };
586    
587     class BowtieHitFactory : public LineHitFactory {
588     public:
589     BowtieHitFactory(ReadTable& insert_table,
590     RefSequenceTable& reference_table) :
591     LineHitFactory(insert_table, reference_table) {}
592    
593     bool get_hit_from_buf(const char* bwt_buf,
594     BowtieHit& bh,
595     bool strip_slash,
596     char* name_out = NULL,
597     char* name_tags = NULL,
598     char* seq = NULL,
599     char* qual = NULL);
600     };
601    
602     class SplicedBowtieHitFactory : public LineHitFactory {
603     public:
604     SplicedBowtieHitFactory(ReadTable& insert_table,
605     RefSequenceTable& reference_table,
606     int anchor_length) :
607     LineHitFactory(insert_table, reference_table),
608     _anchor_length(anchor_length){}
609    
610     bool get_hit_from_buf(const char* bwt_buf,
611     BowtieHit& bh,
612     bool strip_slash,
613     char* name_out = NULL,
614     char* name_tags = NULL,
615     char* seq = NULL,
616     char* qual = NULL);
617     private:
618     int _anchor_length;
619     int _seg_offset;
620     int _size_buf;
621     };
622    
623 gpertea 69 class SplicedSAMHitFactory : public LineHitFactory {
624     public:
625     SplicedSAMHitFactory(ReadTable& insert_table,
626     RefSequenceTable& reference_table,
627     int anchor_length) :
628     LineHitFactory(insert_table, reference_table),
629     _anchor_length(anchor_length){}
630    
631     bool get_hit_from_buf(const char* bwt_buf,
632     BowtieHit& bh,
633     bool strip_slash,
634     char* name_out = NULL,
635     char* name_tags = NULL,
636     char* seq = NULL,
637     char* qual = NULL);
638     private:
639     int _anchor_length;
640     int _seg_offset;
641     int _size_buf;
642     };
643    
644    
645 gpertea 29 class SAMHitFactory : public LineHitFactory {
646     public:
647     SAMHitFactory(ReadTable& insert_table,
648     RefSequenceTable& reference_table) :
649     LineHitFactory(insert_table, reference_table) {}
650    
651     bool get_hit_from_buf(const char* bwt_buf,
652     BowtieHit& bh,
653     bool strip_slash,
654     char* name_out = NULL,
655     char* name_tags = NULL,
656     char* seq = NULL,
657     char* qual = NULL);
658     };
659    
660    
661     /******************************************************************************
662     BAMHitFactory turns SAM alignments into BowtieHits
663     *******************************************************************************/
664     class BAMHitFactory : public HitFactory {
665     public:
666    
667     BAMHitFactory(ReadTable& insert_table,
668     RefSequenceTable& reference_table) :
669     HitFactory(insert_table, reference_table)
670     {
671     _sam_header=NULL;
672     }
673     void openStream(HitStream& hs);
674     void rewind(HitStream& hs);
675     void closeStream(HitStream& hs);
676    
677     bool next_record(HitStream& hs, const char*& buf, size_t& buf_size);
678    
679     /*void mark_curr_pos()
680     {
681     _curr_pos = bgzf_tell(((samfile_t*)_hit_file)->x.bam);
682     }
683    
684    
685     void undo_hit()
686     {
687     bgzf_seek(((samfile_t*)_hit_file)->x.bam, _curr_pos, SEEK_SET);
688     _eof = false;
689     }
690     */
691     string hitfile_rec(HitStream& hs, const char* hit_buf);
692    
693     bool get_hit_from_buf(const char* bwt_buf,
694     BowtieHit& bh,
695     bool strip_slash,
696     char* name_out = NULL,
697     char* name_tags = NULL,
698     char* seq = NULL,
699     char* qual = NULL);
700     private:
701     //int64_t _curr_pos;
702     //int64_t _beginning;
703     bam1_t _next_hit;
704     bam_header_t* _sam_header;
705     bool inspect_header(HitStream& hs);
706     };
707    
708    
709     struct HitsForRead
710     {
711     HitsForRead() : insert_id(0) {}
712     uint64_t insert_id;
713     vector<BowtieHit> hits;
714     };
715    
716     class HitStream
717     {
718     friend class HitFactory;
719     friend class LineHitFactory;
720     friend class BAMHitFactory;
721     //private:
722     HitFactory* _factory;
723     bool _spliced;
724     bool _strip_slash;
725     BowtieHit buffered_hit;
726     bool _keep_bufs;
727     bool _keep_seqs;
728     bool _keep_quals;
729     bool _from_bowtie;
730     void* _hit_file;
731     string _hit_file_name;
732     FZPipe* _fzpipe;
733     bool _eof;
734    
735     public:
736     HitStream(void* hit_file, //could be FILE* or samfile_t*
737     HitFactory* hit_factory,
738     bool spliced,
739     bool strip_slash,
740     bool keep_bufs,
741     bool keep_seqs = false,
742     bool keep_quals = false,
743     bool from_bowtie = false) :
744     _factory(hit_factory),
745     _spliced(spliced),
746     _strip_slash(strip_slash),
747     buffered_hit(BowtieHit()),
748     _keep_bufs(keep_bufs),
749     _keep_seqs(keep_seqs),
750     _keep_quals(keep_quals),
751     _from_bowtie(from_bowtie),
752     _hit_file(hit_file),
753     _hit_file_name(),
754     _fzpipe(NULL),
755     _eof(false)
756     {
757     primeStream();
758     }
759    
760     HitStream(string& hit_filename,
761     HitFactory* hit_factory,
762     bool spliced,
763     bool strip_slash,
764     bool keep_bufs,
765     bool keep_seqs = false,
766     bool keep_quals = false,
767     bool from_bowtie = false) :
768     _factory(hit_factory),
769     _spliced(spliced),
770     _strip_slash(strip_slash),
771     buffered_hit(BowtieHit()),
772     _keep_bufs(keep_bufs),
773     _keep_seqs(keep_seqs),
774     _keep_quals(keep_quals),
775     _from_bowtie(from_bowtie),
776     _hit_file(NULL),
777     _hit_file_name(hit_filename),
778     _fzpipe(NULL),
779     _eof(false)
780     {
781     _factory->openStream(*this);
782     primeStream();
783     }
784    
785     HitStream(FZPipe& hit_filepipe,
786     HitFactory* hit_factory,
787     bool spliced,
788     bool strip_slash,
789     bool keep_bufs,
790     bool keep_seqs = false,
791     bool keep_quals = false,
792     bool from_bowtie = false) :
793     _factory(hit_factory),
794     _spliced(spliced),
795     _strip_slash(strip_slash),
796     buffered_hit(BowtieHit()),
797     _keep_bufs(keep_bufs),
798     _keep_seqs(keep_seqs),
799     _keep_quals(keep_quals),
800     _from_bowtie(from_bowtie),
801     _hit_file(NULL),
802     _hit_file_name(),
803     _fzpipe(&hit_filepipe),
804     _eof(false)
805     {
806     _hit_file=_fzpipe->file;
807     primeStream();
808     }
809    
810     void primeStream() { //why?
811     // Prime the stream by reading a single hit into the buffered_hit
812     HitsForRead dummy = HitsForRead();
813     next_read_hits(dummy);
814     }
815     bool eof() { return _eof; }
816     bool ready() { return (_hit_file!=NULL); }
817     void reset() {
818     _factory->rewind(*this);
819     _eof=false;
820     // re-prime the stream;
821     buffered_hit = BowtieHit();
822     primeStream();
823     }
824    
825     bool next_read_hits(HitsForRead& hits_for_read)
826     {
827     hits_for_read.hits.clear();
828     hits_for_read.insert_id = 0;
829    
830     //if (!_hit_file || (feof(_hit_file) && buffered_hit.insert_id() == 0))
831     // return false;
832     if (!this->ready())
833     //err_die("Error: next_read_hits() called on HitFactory with no file handle\n");
834     return false;
835     if (this->eof() && buffered_hit.insert_id() == 0) {
836     return false;
837     }
838    
839     //char bwt_buf[2048]; bwt_buf[0] = 0;
840     char bwt_seq[2048]; bwt_seq[0] = 0;
841     char bwt_qual[2048]; bwt_qual[0] = 0;
842    
843     char* seq = _keep_seqs ? bwt_seq : NULL;
844     char* qual = _keep_quals ? bwt_qual : NULL;
845    
846     hits_for_read.insert_id = buffered_hit.insert_id();
847     if (hits_for_read.insert_id)
848     hits_for_read.hits.push_back(buffered_hit);
849     const char* hit_buf;
850     size_t hit_buf_size = 0;
851     while (true) {
852     if (!_factory->next_record(*this, hit_buf, hit_buf_size)) {
853     buffered_hit = BowtieHit();
854     break; }
855     //string clean_buf = bwt_buf;
856     // Get a new record from the tab-delimited Bowtie map
857     BowtieHit bh;
858     if (_factory->get_hit_from_buf(hit_buf, bh, _strip_slash,
859     NULL, NULL, seq, qual)) {
860     if (_keep_bufs)
861     bh.hitfile_rec(_factory->hitfile_rec(*this, hit_buf));
862    
863     if (_keep_seqs)
864     bh.seq(seq);
865    
866     if (_keep_quals) {
867     // when it comes to convert from qual in color to qual in bp,
868     // we need to fill in the two extream qual values using the adjacent qual values.
869     size_t qual_len = strlen(qual);
870     if (color && !color_out && qual_len > 2) {
871     qual[0] = qual[1];
872     qual[qual_len-1] = qual[qual_len-2];
873     }
874     bh.qual(qual);
875     }
876    
877     if (bh.insert_id() == hits_for_read.insert_id) {
878     hits_for_read.hits.push_back(bh);
879     }
880     else {
881     buffered_hit = bh;
882     break;
883     }
884     } //hit parsed
885     } //while reading hits
886    
887     return (!hits_for_read.hits.empty() && hits_for_read.insert_id != 0);
888     }
889    
890     uint64_t next_group_id() const
891     {
892     return buffered_hit.insert_id();
893     }
894     bool fromBowtie() { return _from_bowtie; }
895     };
896    
897     typedef uint32_t MateStatusMask;
898    
899     void get_mapped_reads(FILE* bwtf,
900     HitTable& hits,
901     HitFactory& hit_factory,
902     bool strip_slash,
903     bool verbose = false);
904    
905     //bool left_status_better(MateStatusMask left, MateStatusMask right);
906     //bool status_equivalent(MateStatusMask left, MateStatusMask right);
907     typedef uint32_t MateStatusMask;
908    
909     void add_hits_to_coverage(const HitList& hits, vector<unsigned short>& DoC);
910     void add_hit_to_coverage(const BowtieHit& bh, vector<unsigned int>& DoC);
911    
912     void accept_all_hits(HitTable& hits);
913    
914     //print BowtieHit as SAM line
915     void print_hit(FILE* fout,
916     const char* read_name,
917     const BowtieHit& bh,
918     const char* ref_name,
919     const char* sequence,
920     const char* qualities,
921     bool from_bowtie = false);
922    
923     //print BowtieHit as BAM record
924     void print_bamhit(GBamWriter& wbam,
925     const char* read_name,
926     const BowtieHit& bh,
927     const char* ref_name,
928     const char* sequence,
929     const char* qualities,
930     bool from_bowtie = false);
931    
932     /**
933     * Convert a vector of CigarOps to a string representation
934     */
935     std::string print_cigar(vector<CigarOp>& bh_cigar);
936    
937     #endif