ViewVC Help
View File | Revision Log | Show Annotations | View Changeset | Root Listing
root/gclib/tophat_cpp/bwt_map.h
Revision: 137
Committed: Wed Dec 14 22:58:22 2011 UTC (8 years, 7 months ago) by gpertea
File size: 23302 byte(s)
Log Message:
wip spliceCigar() in bwt_map.cpp needs testing

Line User Rev File contents
1 gpertea 29 #ifndef BWT_MAP_H
2     #define BWT_MAP_H
3    
4     #ifdef HAVE_CONFIG_H
5     #include <config.h>
6     #endif
7    
8     #include <stdint.h>
9     #include <stdio.h>
10     #include <stdlib.h>
11     #include <string>
12     #include <map>
13     #include <vector>
14     #include <cassert>
15     #include <cstring>
16     #include <seqan/sequence.h>
17    
18     #include <bam/sam.h>
19     using namespace std;
20    
21     #include "common.h"
22     #define _FBUF_SIZE 10*1024
23     /*
24     * bwt_map.h
25     * TopHat
26     *
27     * Created by Cole Trapnell on 11/17/08.
28     * Copyright 2008 Cole Trapnell. All rights reserved.
29     *
30     */
31    
32     enum CigarOpCode { MATCH, INS, DEL, REF_SKIP, SOFT_CLIP, HARD_CLIP, PAD };
33    
34     struct CigarOp
35     {
36     CigarOp(CigarOpCode o, uint32_t l) : opcode(o), length(l) {}
37 gpertea 137 CigarOpCode opcode:4;
38     uint32_t length:28;
39 gpertea 29
40     bool operator==(const CigarOp& rhs) const
41     {
42     return opcode == rhs.opcode && length == rhs.length;
43     }
44    
45     };
46    
47     typedef uint32_t ReadID;
48    
49     class RefSequenceTable;
50    
51     /* Stores the information from a single record of the bowtie map. A given read
52     may have many of these. Reads up to 255bp are supported.
53     */
54     struct BowtieHit
55     {
56     BowtieHit() :
57     _ref_id(0),
58     _insert_id(0),
59     _left(0),
60     _antisense_splice(false),
61     _antisense_aln(false),
62     _edit_dist(0),
63     _splice_mms(0),
64     _end(false){}
65    
66     BowtieHit(uint32_t ref_id,
67     ReadID insert_id,
68     int left,
69     int read_len,
70     bool antisense,
71     unsigned char edit_dist,
72     bool end) :
73     _ref_id(ref_id),
74     _insert_id(insert_id),
75     _left(left),
76     _cigar(vector<CigarOp>(1,CigarOp(MATCH,read_len))),
77     _antisense_splice(false),
78     _antisense_aln(antisense),
79     _edit_dist(edit_dist),
80     _splice_mms(0),
81     _end(end)
82     {
83     assert(_cigar.capacity() == _cigar.size());
84     }
85    
86     BowtieHit(uint32_t ref_id,
87     ReadID insert_id,
88     int left,
89     const vector<CigarOp>& cigar,
90     bool antisense_aln,
91     bool antisense_splice,
92     unsigned char edit_dist,
93     unsigned char splice_mms,
94     bool end) :
95     _ref_id(ref_id),
96     _insert_id(insert_id),
97     _left(left),
98     _cigar(cigar),
99     _antisense_splice(antisense_splice),
100     _antisense_aln(antisense_aln),
101     _edit_dist(edit_dist),
102     _splice_mms(splice_mms),
103     _end(end)
104     {
105     assert(_cigar.capacity() == _cigar.size());
106     }
107    
108     int read_len() const
109     {
110     uint32_t len = 0;
111     for (size_t i = 0; i < _cigar.size(); ++i)
112     {
113     const CigarOp& op = _cigar[i];
114     switch(op.opcode)
115     {
116     case MATCH:
117     case INS:
118     case SOFT_CLIP:
119     len += op.length;
120     break;
121     default:
122     break;
123     }
124     }
125    
126     return len;
127     }
128    
129     bool operator==(const BowtieHit& rhs) const
130     {
131     return (_insert_id == rhs._insert_id &&
132     _ref_id == rhs._ref_id &&
133     _antisense_aln == rhs._antisense_aln &&
134     _left == rhs._left &&
135     _antisense_splice == rhs._antisense_splice &&
136     _edit_dist == rhs._edit_dist &&
137     /* DO NOT USE ACCEPTED IN COMPARISON */
138     _cigar == rhs._cigar);
139     }
140    
141     bool operator<(const BowtieHit& rhs) const
142     {
143     if (_insert_id != rhs._insert_id)
144     return _insert_id < rhs._insert_id;
145     if (_ref_id != rhs._ref_id)
146     return _ref_id < rhs._ref_id;
147     if (_left != rhs._left)
148     return _left < rhs._left;
149     if (_antisense_aln != rhs._antisense_aln)
150     return _antisense_aln < rhs._antisense_aln;
151     if (_edit_dist != rhs._edit_dist)
152     return _edit_dist < rhs._edit_dist;
153     if (_cigar != rhs._cigar)
154     {
155     if (_cigar.size() != rhs._cigar.size())
156     return _cigar.size() < rhs._cigar.size();
157     for (size_t i = 0; i < _cigar.size(); ++i)
158     {
159     if (!(_cigar[i] == rhs._cigar[i]))
160     return (_cigar[i].opcode < rhs._cigar[i].opcode || (_cigar[i].opcode == rhs._cigar[i].opcode && _cigar[i].length < rhs._cigar[i].length));
161     }
162     }
163     return false;
164     }
165    
166     uint32_t ref_id() const { return _ref_id; }
167     ReadID insert_id() const { return _insert_id; }
168     int left() const { return _left; }
169     int right() const
170     {
171     int r = _left;
172     for (size_t i = 0; i < _cigar.size(); ++i)
173     {
174     const CigarOp& op = _cigar[i];
175    
176     switch(op.opcode)
177     {
178     case MATCH:
179     case REF_SKIP:
180     case DEL:
181     r += op.length;
182     break;
183     default:
184     break;
185     }
186     }
187     return r;
188     }
189    
190     bool is_spliced()
191     {
192     for (size_t i = 0; i < _cigar.size(); ++i)
193     {
194     const CigarOp& op = _cigar[i];
195    
196     if (op.opcode == REF_SKIP)
197     return true;
198     }
199     return false;
200     }
201    
202     bool antisense_splice() const { return _antisense_splice; }
203     bool antisense_align() const { return _antisense_aln; }
204    
205     unsigned char edit_dist() const { return _edit_dist; }
206     unsigned char splice_mms() const { return _splice_mms; }
207    
208     // For convenience, if you just want a copy of the gap intervals
209     // for this hit.
210     void gaps(vector<pair<int,int> >& gaps_out) const
211     {
212     gaps_out.clear();
213     int pos = _left;
214     for (size_t i = 0; i < _cigar.size(); ++i)
215     {
216     const CigarOp& op = _cigar[i];
217    
218     switch(op.opcode)
219     {
220     case REF_SKIP:
221     gaps_out.push_back(make_pair(pos, pos + op.length - 1));
222     pos += op.length;
223     break;
224     case MATCH:
225 gpertea 135 case DEL:
226 gpertea 29 pos += op.length;
227     break;
228     default:
229     break;
230     }
231     }
232     }
233    
234     const vector<CigarOp>& cigar() const { return _cigar; }
235    
236    
237     bool contiguous() const
238     {
239     return _cigar.size() == 1 && _cigar[0].opcode == MATCH;
240     }
241    
242     const string& hitfile_rec() const { return _hitfile_rec; }
243     void hitfile_rec(const string& rec) { _hitfile_rec = rec; }
244    
245     const string& seq() const { return _seq; }
246     void seq(const string& seq) { _seq = seq; }
247    
248     const string& qual() const { return _qual; }
249     void qual(const string& qual) { _qual = qual; }
250    
251     bool end() const { return _end; }
252     void end(bool end) { _end = end; }
253    
254     // this is for debugging purpose
255     bool check_editdist_consistency(const RefSequenceTable& rt);
256    
257     private:
258    
259     uint32_t _ref_id;
260     ReadID _insert_id; // Id of the sequencing insert
261     int _left; // Position in the reference of the left side of the alignment
262    
263     vector<CigarOp> _cigar;
264    
265     bool _antisense_splice; // Whether the junction spanned is on the reverse strand
266     bool _antisense_aln; // Whether the alignment is to the reverse strand
267    
268     unsigned char _edit_dist; // Total mismatches (note this is not including insertions or deletions as mismatches, ie, not equivalent to NM field of a SAM record)
269     unsigned char _splice_mms; // Mismatches within min_anchor_len of a splice junction
270     string _hitfile_rec; // Points to the buffer for the record from which this hit came
271     string _seq;
272     string _qual;
273     bool _end; // Whether this segment is the last one of the read it belongs to
274     };
275    
276     class ReadTable
277     {
278     public:
279    
280     ReadTable() :
281     _next_id(1) {}
282    
283     // This function should NEVER return zero
284     ReadID get_id(const string& name)
285     {
286     uint32_t _id = atoi(name.c_str());
287     //assert(_id);
288     _next_id = max(_next_id, (size_t)_id);
289     return _id;
290     }
291    
292    
293     uint32_t observation_order(ReadID ID)
294     {
295     if (ID == 0)
296 gpertea 135 return VMAXINT32;
297 gpertea 29 return ID;
298     }
299    
300    
301     size_t size() const { return _next_id; }
302    
303     private:
304     size_t _next_id;
305     };
306    
307     class RefSequenceTable
308     {
309     public:
310    
311     typedef seqan::String<seqan::Dna5, seqan::Packed<seqan::Alloc<> > > Sequence;
312    
313     struct SequenceInfo
314     {
315     SequenceInfo(uint32_t _order,
316     char* _name,
317     Sequence* _seq, uint32_t _len) :
318     observation_order(_order),
319     name(_name),
320     seq(_seq),
321     len(_len) {}
322    
323     uint32_t observation_order;
324     char* name;
325     Sequence* seq;
326     uint32_t len;
327     };
328    
329     typedef map<string, uint64_t> IDTable;
330     typedef map<uint32_t, SequenceInfo> InvertedIDTable;
331     typedef InvertedIDTable::iterator iterator;
332     typedef InvertedIDTable::const_iterator const_iterator;
333    
334     RefSequenceTable(bool keep_names, bool keep_seqs = false) :
335     _next_id(1),
336     _keep_names(keep_names) {}
337    
338     RefSequenceTable(const string& sam_header_filename,
339     bool keep_names,
340     bool keep_seqs = false) :
341     _next_id(1),
342     _keep_names(keep_names)
343     {
344     if (sam_header_filename != "")
345     {
346     samfile_t* fh = samopen(sam_header_filename.c_str(), "r", 0);
347     if (fh == 0) {
348     fprintf(stderr, "Failed to open SAM header file %s\n", sam_header_filename.c_str());
349     exit(1);
350     }
351    
352     for (size_t i = 0; i < (size_t)fh->header->n_targets; ++i)
353     {
354     const char* name = fh->header->target_name[i];
355     uint32_t len = fh->header->target_len[i];
356     get_id(name, NULL, len);
357     //fprintf(stderr, "SQ: %s - %d\n", name, len);
358     }
359     }
360     }
361    
362     // This function should NEVER return zero
363     uint32_t get_id(const string& name,
364     Sequence* seq,
365     uint32_t len)
366     {
367     uint32_t _id = hash_string(name.c_str());
368     pair<InvertedIDTable::iterator, bool> ret =
369     _by_id.insert(make_pair(_id, SequenceInfo(_next_id, NULL, NULL, 0)));
370     if (ret.second == true)
371     {
372     char* _name = NULL;
373     if (_keep_names)
374     _name = strdup(name.c_str());
375     ret.first->second.name = _name;
376     ret.first->second.seq = seq;
377     ret.first->second.len = len;
378     ++_next_id;
379     }
380     assert (_id);
381     return _id;
382     }
383    
384     const char* get_name(uint32_t ID) const
385     {
386     InvertedIDTable::const_iterator itr = _by_id.find(ID);
387     if (itr != _by_id.end())
388     return itr->second.name;
389     else
390     return NULL;
391     }
392    
393     uint32_t get_len(uint32_t ID) const
394     {
395     InvertedIDTable::const_iterator itr = _by_id.find(ID);
396     if (itr != _by_id.end())
397     return itr->second.len;
398     else
399     return 0;
400     }
401    
402     Sequence* get_seq(uint32_t ID) const
403     {
404     InvertedIDTable::const_iterator itr = _by_id.find(ID);
405     if (itr != _by_id.end())
406     return itr->second.seq;
407     else
408     return NULL;
409     }
410    
411     const SequenceInfo* get_info(uint32_t ID) const
412     {
413    
414     InvertedIDTable::const_iterator itr = _by_id.find(ID);
415     if (itr != _by_id.end())
416     {
417     return &(itr->second);
418     }
419     else
420     return NULL;
421     }
422    
423     int observation_order(uint32_t ID) const
424     {
425     InvertedIDTable::const_iterator itr = _by_id.find(ID);
426     if (itr != _by_id.end())
427     {
428     return itr->second.observation_order;
429     }
430     else
431     return -1;
432     }
433    
434     iterator begin() { return _by_id.begin(); }
435     iterator end() { return _by_id.end(); }
436    
437     const_iterator begin() const { return _by_id.begin(); }
438     const_iterator end() const { return _by_id.end(); }
439    
440     size_t size() const { return _by_id.size(); }
441    
442     void clear()
443     {
444     //_by_name.clear();
445     _by_id.clear();
446     }
447    
448 gpertea 135 // daehwan
449 gpertea 29 // This is FNV-1, see http://en.wikipedia.org/wiki/Fowler_Noll_Vo_hash
450 gpertea 135 static inline uint32_t hash_string(const char* __s)
451 gpertea 29 {
452     uint32_t hash = 0x811c9dc5;
453     for ( ; *__s; ++__s)
454     {
455     hash *= 16777619;
456     hash ^= *__s;
457     }
458     return hash;
459     }
460    
461 gpertea 135 private:
462    
463 gpertea 29 //IDTable _by_name;
464     uint32_t _next_id;
465     bool _keep_names;
466     InvertedIDTable _by_id;
467     };
468    
469    
470     bool hit_insert_id_lt(const BowtieHit& h1, const BowtieHit& h2);
471    
472     typedef vector<BowtieHit> HitList;
473    
474     /* This class stores all the hits from a Bowtie map */
475    
476     class HitTable
477     {
478     public:
479    
480     typedef map<uint64_t, HitList> RefHits;
481     typedef RefHits::const_iterator const_iterator;
482     typedef RefHits::iterator iterator;
483    
484     HitTable() : _total_hits(0) {}
485    
486     const_iterator begin() const { return _hits_for_ref.begin(); }
487     const_iterator end() const { return _hits_for_ref.end(); }
488    
489     iterator begin() { return _hits_for_ref.begin(); }
490     iterator end() { return _hits_for_ref.end(); }
491    
492     void add_hit(const BowtieHit& bh, bool check_uniqueness);
493    
494     void finalize()
495     {
496     for (RefHits::iterator i = _hits_for_ref.begin();
497     i != _hits_for_ref.end();
498     ++i)
499     {
500     sort(i->second.begin(), i->second.end(), hit_insert_id_lt);
501     }
502     }
503    
504     HitList* get_hits(uint64_t ref_id)
505     {
506     RefHits::iterator i = _hits_for_ref.find(ref_id);
507     if (i == _hits_for_ref.end())
508     return NULL;
509     else
510     return &(i->second);
511     }
512    
513     uint32_t total_hits() const { return _total_hits; }
514    
515     private:
516     RefHits _hits_for_ref;
517     uint32_t _total_hits;
518     };
519    
520     class HitStream;
521    
522     class HitFactory {
523     friend class HitStream;
524     public:
525     HitFactory(ReadTable& insert_table,
526     RefSequenceTable& reference_table) :
527     _insert_table(insert_table), _ref_table(reference_table)
528     {}
529     virtual ~HitFactory() {}
530     virtual void openStream(HitStream& hs)=0;
531     virtual void rewind(HitStream& hs)=0;
532     virtual void closeStream(HitStream& hs)=0;
533     BowtieHit create_hit(const string& insert_name,
534     const string& ref_name,
535     int left,
536     const vector<CigarOp>& cigar,
537     bool antisense_aln,
538     bool antisense_splice,
539     unsigned char edit_dist,
540     unsigned char splice_mms,
541     bool end);
542    
543     BowtieHit create_hit(const string& insert_name,
544     const string& ref_name,
545     uint32_t left,
546     uint32_t read_len,
547     bool antisense_aln,
548     unsigned char edit_dist,
549     bool end);
550    
551     virtual string hitfile_rec(HitStream& hs, const char* hit_buf)=0;
552     virtual bool next_record(HitStream& hs, const char*& buf, size_t& buf_size) = 0;
553     virtual bool get_hit_from_buf(const char* bwt_buf,
554     BowtieHit& bh,
555     bool strip_slash,
556     char* name_out = NULL,
557     char* name_tags = NULL,
558     char* seq = NULL,
559     char* qual = NULL) = 0;
560    
561     protected:
562     ReadTable& _insert_table;
563     RefSequenceTable& _ref_table;
564     HitStream* _hit_stream;
565     };//class HitFactory
566    
567    
568     class LineHitFactory : public HitFactory {
569     //for text line-based formats like Bowtie and SAM
570     public:
571     LineHitFactory(ReadTable& insert_table,
572     RefSequenceTable& reference_table) :
573     HitFactory(insert_table, reference_table) {}
574    
575     string hitfile_rec(HitStream& hs, const char* hit_buf) {
576     string r(hit_buf);
577     return r;
578     }
579     void openStream(HitStream& hs);
580     void rewind(HitStream& hs);
581     void closeStream(HitStream& hs);
582     bool next_record(HitStream& hs, const char*& buf, size_t& buf_size);
583     protected:
584     static const size_t _hit_buf_max_sz = 10 * 1024;
585     char _hit_buf[_hit_buf_max_sz];
586     int _line_num;
587     };
588    
589     class BowtieHitFactory : public LineHitFactory {
590     public:
591     BowtieHitFactory(ReadTable& insert_table,
592     RefSequenceTable& reference_table) :
593     LineHitFactory(insert_table, reference_table) {}
594    
595     bool get_hit_from_buf(const char* bwt_buf,
596     BowtieHit& bh,
597     bool strip_slash,
598     char* name_out = NULL,
599     char* name_tags = NULL,
600     char* seq = NULL,
601     char* qual = NULL);
602     };
603    
604     class SplicedBowtieHitFactory : public LineHitFactory {
605     public:
606     SplicedBowtieHitFactory(ReadTable& insert_table,
607     RefSequenceTable& reference_table,
608     int anchor_length) :
609     LineHitFactory(insert_table, reference_table),
610     _anchor_length(anchor_length){}
611    
612     bool get_hit_from_buf(const char* bwt_buf,
613     BowtieHit& bh,
614     bool strip_slash,
615     char* name_out = NULL,
616     char* name_tags = NULL,
617     char* seq = NULL,
618     char* qual = NULL);
619     private:
620     int _anchor_length;
621     int _seg_offset;
622     int _size_buf;
623     };
624    
625 gpertea 69 class SplicedSAMHitFactory : public LineHitFactory {
626     public:
627     SplicedSAMHitFactory(ReadTable& insert_table,
628     RefSequenceTable& reference_table,
629     int anchor_length) :
630     LineHitFactory(insert_table, reference_table),
631     _anchor_length(anchor_length){}
632    
633     bool get_hit_from_buf(const char* bwt_buf,
634     BowtieHit& bh,
635     bool strip_slash,
636     char* name_out = NULL,
637     char* name_tags = NULL,
638     char* seq = NULL,
639     char* qual = NULL);
640     private:
641     int _anchor_length;
642     int _seg_offset;
643     int _size_buf;
644     };
645    
646    
647 gpertea 29 class SAMHitFactory : public LineHitFactory {
648     public:
649     SAMHitFactory(ReadTable& insert_table,
650     RefSequenceTable& reference_table) :
651     LineHitFactory(insert_table, reference_table) {}
652    
653     bool get_hit_from_buf(const char* bwt_buf,
654     BowtieHit& bh,
655     bool strip_slash,
656     char* name_out = NULL,
657     char* name_tags = NULL,
658     char* seq = NULL,
659     char* qual = NULL);
660     };
661    
662    
663     /******************************************************************************
664     BAMHitFactory turns SAM alignments into BowtieHits
665     *******************************************************************************/
666     class BAMHitFactory : public HitFactory {
667     public:
668    
669     BAMHitFactory(ReadTable& insert_table,
670     RefSequenceTable& reference_table) :
671     HitFactory(insert_table, reference_table)
672     {
673     _sam_header=NULL;
674     }
675     void openStream(HitStream& hs);
676     void rewind(HitStream& hs);
677     void closeStream(HitStream& hs);
678    
679     bool next_record(HitStream& hs, const char*& buf, size_t& buf_size);
680    
681     /*void mark_curr_pos()
682     {
683     _curr_pos = bgzf_tell(((samfile_t*)_hit_file)->x.bam);
684     }
685    
686    
687     void undo_hit()
688     {
689     bgzf_seek(((samfile_t*)_hit_file)->x.bam, _curr_pos, SEEK_SET);
690     _eof = false;
691     }
692     */
693     string hitfile_rec(HitStream& hs, const char* hit_buf);
694    
695     bool get_hit_from_buf(const char* bwt_buf,
696     BowtieHit& bh,
697     bool strip_slash,
698     char* name_out = NULL,
699     char* name_tags = NULL,
700     char* seq = NULL,
701     char* qual = NULL);
702     private:
703     //int64_t _curr_pos;
704     //int64_t _beginning;
705     bam1_t _next_hit;
706     bam_header_t* _sam_header;
707     bool inspect_header(HitStream& hs);
708     };
709    
710    
711     struct HitsForRead
712     {
713     HitsForRead() : insert_id(0) {}
714     uint64_t insert_id;
715     vector<BowtieHit> hits;
716     };
717    
718     class HitStream
719     {
720     friend class HitFactory;
721     friend class LineHitFactory;
722     friend class BAMHitFactory;
723     //private:
724     HitFactory* _factory;
725     bool _spliced;
726     bool _strip_slash;
727     BowtieHit buffered_hit;
728     bool _keep_bufs;
729     bool _keep_seqs;
730     bool _keep_quals;
731     bool _from_bowtie;
732     void* _hit_file;
733     string _hit_file_name;
734     FZPipe* _fzpipe;
735     bool _eof;
736    
737     public:
738     HitStream(void* hit_file, //could be FILE* or samfile_t*
739     HitFactory* hit_factory,
740     bool spliced,
741     bool strip_slash,
742     bool keep_bufs,
743     bool keep_seqs = false,
744     bool keep_quals = false,
745     bool from_bowtie = false) :
746     _factory(hit_factory),
747     _spliced(spliced),
748     _strip_slash(strip_slash),
749     buffered_hit(BowtieHit()),
750     _keep_bufs(keep_bufs),
751     _keep_seqs(keep_seqs),
752     _keep_quals(keep_quals),
753     _from_bowtie(from_bowtie),
754     _hit_file(hit_file),
755     _hit_file_name(),
756     _fzpipe(NULL),
757     _eof(false)
758     {
759     primeStream();
760     }
761    
762     HitStream(string& hit_filename,
763     HitFactory* hit_factory,
764     bool spliced,
765     bool strip_slash,
766     bool keep_bufs,
767     bool keep_seqs = false,
768     bool keep_quals = false,
769     bool from_bowtie = false) :
770     _factory(hit_factory),
771     _spliced(spliced),
772     _strip_slash(strip_slash),
773     buffered_hit(BowtieHit()),
774     _keep_bufs(keep_bufs),
775     _keep_seqs(keep_seqs),
776     _keep_quals(keep_quals),
777     _from_bowtie(from_bowtie),
778     _hit_file(NULL),
779     _hit_file_name(hit_filename),
780     _fzpipe(NULL),
781     _eof(false)
782     {
783     _factory->openStream(*this);
784     primeStream();
785     }
786    
787     HitStream(FZPipe& hit_filepipe,
788     HitFactory* hit_factory,
789     bool spliced,
790     bool strip_slash,
791     bool keep_bufs,
792     bool keep_seqs = false,
793     bool keep_quals = false,
794     bool from_bowtie = false) :
795     _factory(hit_factory),
796     _spliced(spliced),
797     _strip_slash(strip_slash),
798     buffered_hit(BowtieHit()),
799     _keep_bufs(keep_bufs),
800     _keep_seqs(keep_seqs),
801     _keep_quals(keep_quals),
802     _from_bowtie(from_bowtie),
803     _hit_file(NULL),
804     _hit_file_name(),
805     _fzpipe(&hit_filepipe),
806     _eof(false)
807     {
808     _hit_file=_fzpipe->file;
809     primeStream();
810     }
811    
812     void primeStream() { //why?
813     // Prime the stream by reading a single hit into the buffered_hit
814     HitsForRead dummy = HitsForRead();
815     next_read_hits(dummy);
816     }
817     bool eof() { return _eof; }
818     bool ready() { return (_hit_file!=NULL); }
819     void reset() {
820     _factory->rewind(*this);
821     _eof=false;
822     // re-prime the stream;
823     buffered_hit = BowtieHit();
824     primeStream();
825     }
826    
827     bool next_read_hits(HitsForRead& hits_for_read)
828     {
829     hits_for_read.hits.clear();
830     hits_for_read.insert_id = 0;
831    
832     //if (!_hit_file || (feof(_hit_file) && buffered_hit.insert_id() == 0))
833     // return false;
834     if (!this->ready())
835     //err_die("Error: next_read_hits() called on HitFactory with no file handle\n");
836     return false;
837     if (this->eof() && buffered_hit.insert_id() == 0) {
838     return false;
839     }
840    
841     //char bwt_buf[2048]; bwt_buf[0] = 0;
842     char bwt_seq[2048]; bwt_seq[0] = 0;
843     char bwt_qual[2048]; bwt_qual[0] = 0;
844    
845     char* seq = _keep_seqs ? bwt_seq : NULL;
846     char* qual = _keep_quals ? bwt_qual : NULL;
847    
848     hits_for_read.insert_id = buffered_hit.insert_id();
849     if (hits_for_read.insert_id)
850     hits_for_read.hits.push_back(buffered_hit);
851     const char* hit_buf;
852     size_t hit_buf_size = 0;
853     while (true) {
854     if (!_factory->next_record(*this, hit_buf, hit_buf_size)) {
855     buffered_hit = BowtieHit();
856     break; }
857     //string clean_buf = bwt_buf;
858     // Get a new record from the tab-delimited Bowtie map
859     BowtieHit bh;
860     if (_factory->get_hit_from_buf(hit_buf, bh, _strip_slash,
861     NULL, NULL, seq, qual)) {
862     if (_keep_bufs)
863     bh.hitfile_rec(_factory->hitfile_rec(*this, hit_buf));
864    
865     if (_keep_seqs)
866     bh.seq(seq);
867    
868     if (_keep_quals) {
869     // when it comes to convert from qual in color to qual in bp,
870     // we need to fill in the two extream qual values using the adjacent qual values.
871     size_t qual_len = strlen(qual);
872     if (color && !color_out && qual_len > 2) {
873     qual[0] = qual[1];
874     qual[qual_len-1] = qual[qual_len-2];
875     }
876     bh.qual(qual);
877     }
878    
879     if (bh.insert_id() == hits_for_read.insert_id) {
880     hits_for_read.hits.push_back(bh);
881     }
882     else {
883     buffered_hit = bh;
884     break;
885     }
886     } //hit parsed
887 gpertea 137 } //while reading hits
888 gpertea 29
889     return (!hits_for_read.hits.empty() && hits_for_read.insert_id != 0);
890     }
891    
892     uint64_t next_group_id() const
893     {
894     return buffered_hit.insert_id();
895     }
896     bool fromBowtie() { return _from_bowtie; }
897     };
898    
899     typedef uint32_t MateStatusMask;
900    
901     void get_mapped_reads(FILE* bwtf,
902     HitTable& hits,
903     HitFactory& hit_factory,
904     bool strip_slash,
905     bool verbose = false);
906    
907     //bool left_status_better(MateStatusMask left, MateStatusMask right);
908     //bool status_equivalent(MateStatusMask left, MateStatusMask right);
909     typedef uint32_t MateStatusMask;
910    
911     void add_hits_to_coverage(const HitList& hits, vector<unsigned short>& DoC);
912     void add_hit_to_coverage(const BowtieHit& bh, vector<unsigned int>& DoC);
913    
914     void accept_all_hits(HitTable& hits);
915    
916     //print BowtieHit as SAM line
917     void print_hit(FILE* fout,
918     const char* read_name,
919     const BowtieHit& bh,
920     const char* ref_name,
921     const char* sequence,
922     const char* qualities,
923     bool from_bowtie = false);
924    
925     //print BowtieHit as BAM record
926     void print_bamhit(GBamWriter& wbam,
927     const char* read_name,
928     const BowtieHit& bh,
929     const char* ref_name,
930     const char* sequence,
931     const char* qualities,
932     bool from_bowtie = false);
933    
934     /**
935     * Convert a vector of CigarOps to a string representation
936     */
937     std::string print_cigar(vector<CigarOp>& bh_cigar);
938    
939     #endif