ViewVC Help
View File | Revision Log | Show Annotations | View Changeset | Root Listing
root/gclib/tophat_cpp/bwt_map.h
Revision: 69
Committed: Mon Sep 19 21:00:41 2011 UTC (8 years, 10 months ago) by gpertea
File size: 23276 byte(s)
Log Message:
wip - implementing SplicedSAMHitFactory::get_hit_from_buf()

Line File contents
1 #ifndef BWT_MAP_H
2 #define BWT_MAP_H
3
4 #ifdef HAVE_CONFIG_H
5 #include <config.h>
6 #endif
7
8 #include <stdint.h>
9 #include <stdio.h>
10 #include <stdlib.h>
11 #include <string>
12 #include <map>
13 #include <vector>
14 #include <cassert>
15 #include <cstring>
16 #include <seqan/sequence.h>
17
18 #include <bam/sam.h>
19 using namespace std;
20
21 #include "common.h"
22 #define _FBUF_SIZE 10*1024
23 /*
24 * bwt_map.h
25 * TopHat
26 *
27 * Created by Cole Trapnell on 11/17/08.
28 * Copyright 2008 Cole Trapnell. All rights reserved.
29 *
30 */
31
32 enum CigarOpCode { MATCH, INS, DEL, REF_SKIP, SOFT_CLIP, HARD_CLIP, PAD };
33
34 struct CigarOp
35 {
36 CigarOp(CigarOpCode o, uint32_t l) : opcode(o), length(l) {}
37 CigarOpCode opcode : 3;
38 uint32_t length : 29;
39
40 bool operator==(const CigarOp& rhs) const
41 {
42 return opcode == rhs.opcode && length == rhs.length;
43 }
44
45 };
46
47 typedef uint32_t ReadID;
48
49 class RefSequenceTable;
50
51 /* Stores the information from a single record of the bowtie map. A given read
52 may have many of these. Reads up to 255bp are supported.
53 */
54 struct BowtieHit
55 {
56 BowtieHit() :
57 _ref_id(0),
58 _insert_id(0),
59 _left(0),
60 _antisense_splice(false),
61 _antisense_aln(false),
62 _edit_dist(0),
63 _splice_mms(0),
64 _end(false){}
65
66 BowtieHit(uint32_t ref_id,
67 ReadID insert_id,
68 int left,
69 int read_len,
70 bool antisense,
71 unsigned char edit_dist,
72 bool end) :
73 _ref_id(ref_id),
74 _insert_id(insert_id),
75 _left(left),
76 _cigar(vector<CigarOp>(1,CigarOp(MATCH,read_len))),
77 _antisense_splice(false),
78 _antisense_aln(antisense),
79 _edit_dist(edit_dist),
80 _splice_mms(0),
81 _end(end)
82 {
83 assert(_cigar.capacity() == _cigar.size());
84 }
85
86 BowtieHit(uint32_t ref_id,
87 ReadID insert_id,
88 int left,
89 const vector<CigarOp>& cigar,
90 bool antisense_aln,
91 bool antisense_splice,
92 unsigned char edit_dist,
93 unsigned char splice_mms,
94 bool end) :
95 _ref_id(ref_id),
96 _insert_id(insert_id),
97 _left(left),
98 _cigar(cigar),
99 _antisense_splice(antisense_splice),
100 _antisense_aln(antisense_aln),
101 _edit_dist(edit_dist),
102 _splice_mms(splice_mms),
103 _end(end)
104 {
105 assert(_cigar.capacity() == _cigar.size());
106 }
107
108 int read_len() const
109 {
110 uint32_t len = 0;
111 for (size_t i = 0; i < _cigar.size(); ++i)
112 {
113 const CigarOp& op = _cigar[i];
114 switch(op.opcode)
115 {
116 case MATCH:
117 case INS:
118 case SOFT_CLIP:
119 len += op.length;
120 break;
121 default:
122 break;
123 }
124 }
125
126 return len;
127 }
128
129 bool operator==(const BowtieHit& rhs) const
130 {
131 return (_insert_id == rhs._insert_id &&
132 _ref_id == rhs._ref_id &&
133 _antisense_aln == rhs._antisense_aln &&
134 _left == rhs._left &&
135 _antisense_splice == rhs._antisense_splice &&
136 _edit_dist == rhs._edit_dist &&
137 /* DO NOT USE ACCEPTED IN COMPARISON */
138 _cigar == rhs._cigar);
139 }
140
141 bool operator<(const BowtieHit& rhs) const
142 {
143 if (_insert_id != rhs._insert_id)
144 return _insert_id < rhs._insert_id;
145 if (_ref_id != rhs._ref_id)
146 return _ref_id < rhs._ref_id;
147 if (_left != rhs._left)
148 return _left < rhs._left;
149 if (_antisense_aln != rhs._antisense_aln)
150 return _antisense_aln < rhs._antisense_aln;
151 if (_edit_dist != rhs._edit_dist)
152 return _edit_dist < rhs._edit_dist;
153 if (_cigar != rhs._cigar)
154 {
155 if (_cigar.size() != rhs._cigar.size())
156 return _cigar.size() < rhs._cigar.size();
157 for (size_t i = 0; i < _cigar.size(); ++i)
158 {
159 if (!(_cigar[i] == rhs._cigar[i]))
160 return (_cigar[i].opcode < rhs._cigar[i].opcode || (_cigar[i].opcode == rhs._cigar[i].opcode && _cigar[i].length < rhs._cigar[i].length));
161 }
162 }
163 return false;
164 }
165
166 uint32_t ref_id() const { return _ref_id; }
167 ReadID insert_id() const { return _insert_id; }
168 int left() const { return _left; }
169 int right() const
170 {
171 int r = _left;
172 for (size_t i = 0; i < _cigar.size(); ++i)
173 {
174 const CigarOp& op = _cigar[i];
175
176 switch(op.opcode)
177 {
178 case MATCH:
179 case REF_SKIP:
180 case DEL:
181 r += op.length;
182 break;
183 default:
184 break;
185 }
186 }
187 return r;
188 }
189
190 bool is_spliced()
191 {
192 for (size_t i = 0; i < _cigar.size(); ++i)
193 {
194 const CigarOp& op = _cigar[i];
195
196 if (op.opcode == REF_SKIP)
197 return true;
198 }
199 return false;
200 }
201
202 bool antisense_splice() const { return _antisense_splice; }
203 bool antisense_align() const { return _antisense_aln; }
204
205 unsigned char edit_dist() const { return _edit_dist; }
206 unsigned char splice_mms() const { return _splice_mms; }
207
208 // For convenience, if you just want a copy of the gap intervals
209 // for this hit.
210 void gaps(vector<pair<int,int> >& gaps_out) const
211 {
212 gaps_out.clear();
213 int pos = _left;
214 for (size_t i = 0; i < _cigar.size(); ++i)
215 {
216 const CigarOp& op = _cigar[i];
217
218 switch(op.opcode)
219 {
220 case REF_SKIP:
221 gaps_out.push_back(make_pair(pos, pos + op.length - 1));
222 pos += op.length;
223 break;
224 case MATCH:
225 pos += op.length;
226 break;
227 default:
228 break;
229 }
230 }
231 }
232
233 const vector<CigarOp>& cigar() const { return _cigar; }
234
235
236 bool contiguous() const
237 {
238 return _cigar.size() == 1 && _cigar[0].opcode == MATCH;
239 }
240
241 const string& hitfile_rec() const { return _hitfile_rec; }
242 void hitfile_rec(const string& rec) { _hitfile_rec = rec; }
243
244 const string& seq() const { return _seq; }
245 void seq(const string& seq) { _seq = seq; }
246
247 const string& qual() const { return _qual; }
248 void qual(const string& qual) { _qual = qual; }
249
250 bool end() const { return _end; }
251 void end(bool end) { _end = end; }
252
253 // this is for debugging purpose
254 bool check_editdist_consistency(const RefSequenceTable& rt);
255
256 private:
257
258 uint32_t _ref_id;
259 ReadID _insert_id; // Id of the sequencing insert
260 int _left; // Position in the reference of the left side of the alignment
261
262 vector<CigarOp> _cigar;
263
264 bool _antisense_splice; // Whether the junction spanned is on the reverse strand
265 bool _antisense_aln; // Whether the alignment is to the reverse strand
266
267 unsigned char _edit_dist; // Total mismatches (note this is not including insertions or deletions as mismatches, ie, not equivalent to NM field of a SAM record)
268 unsigned char _splice_mms; // Mismatches within min_anchor_len of a splice junction
269 string _hitfile_rec; // Points to the buffer for the record from which this hit came
270 string _seq;
271 string _qual;
272 bool _end; // Whether this segment is the last one of the read it belongs to
273 };
274
275 class ReadTable
276 {
277 public:
278
279 ReadTable() :
280 _next_id(1) {}
281
282 // This function should NEVER return zero
283 ReadID get_id(const string& name)
284 {
285 uint32_t _id = atoi(name.c_str());
286 //assert(_id);
287 _next_id = max(_next_id, (size_t)_id);
288 return _id;
289 }
290
291
292 uint32_t observation_order(ReadID ID)
293 {
294 if (ID == 0)
295 return 0xFFFFFFFF;
296 return ID;
297 }
298
299
300 size_t size() const { return _next_id; }
301
302 private:
303 size_t _next_id;
304 };
305
306 class RefSequenceTable
307 {
308 public:
309
310 typedef seqan::String<seqan::Dna5, seqan::Packed<seqan::Alloc<> > > Sequence;
311
312 struct SequenceInfo
313 {
314 SequenceInfo(uint32_t _order,
315 char* _name,
316 Sequence* _seq, uint32_t _len) :
317 observation_order(_order),
318 name(_name),
319 seq(_seq),
320 len(_len) {}
321
322 uint32_t observation_order;
323 char* name;
324 Sequence* seq;
325 uint32_t len;
326 };
327
328 typedef map<string, uint64_t> IDTable;
329 typedef map<uint32_t, SequenceInfo> InvertedIDTable;
330 typedef InvertedIDTable::iterator iterator;
331 typedef InvertedIDTable::const_iterator const_iterator;
332
333 RefSequenceTable(bool keep_names, bool keep_seqs = false) :
334 _next_id(1),
335 _keep_names(keep_names) {}
336
337 RefSequenceTable(const string& sam_header_filename,
338 bool keep_names,
339 bool keep_seqs = false) :
340 _next_id(1),
341 _keep_names(keep_names)
342 {
343 if (sam_header_filename != "")
344 {
345 samfile_t* fh = samopen(sam_header_filename.c_str(), "r", 0);
346 if (fh == 0) {
347 fprintf(stderr, "Failed to open SAM header file %s\n", sam_header_filename.c_str());
348 exit(1);
349 }
350
351 for (size_t i = 0; i < (size_t)fh->header->n_targets; ++i)
352 {
353 const char* name = fh->header->target_name[i];
354 uint32_t len = fh->header->target_len[i];
355 get_id(name, NULL, len);
356 //fprintf(stderr, "SQ: %s - %d\n", name, len);
357 }
358 }
359 }
360
361 // This function should NEVER return zero
362 uint32_t get_id(const string& name,
363 Sequence* seq,
364 uint32_t len)
365 {
366 uint32_t _id = hash_string(name.c_str());
367 pair<InvertedIDTable::iterator, bool> ret =
368 _by_id.insert(make_pair(_id, SequenceInfo(_next_id, NULL, NULL, 0)));
369 if (ret.second == true)
370 {
371 char* _name = NULL;
372 if (_keep_names)
373 _name = strdup(name.c_str());
374 ret.first->second.name = _name;
375 ret.first->second.seq = seq;
376 ret.first->second.len = len;
377 ++_next_id;
378 }
379 assert (_id);
380 return _id;
381 }
382
383 const char* get_name(uint32_t ID) const
384 {
385 InvertedIDTable::const_iterator itr = _by_id.find(ID);
386 if (itr != _by_id.end())
387 return itr->second.name;
388 else
389 return NULL;
390 }
391
392 uint32_t get_len(uint32_t ID) const
393 {
394 InvertedIDTable::const_iterator itr = _by_id.find(ID);
395 if (itr != _by_id.end())
396 return itr->second.len;
397 else
398 return 0;
399 }
400
401 Sequence* get_seq(uint32_t ID) const
402 {
403 InvertedIDTable::const_iterator itr = _by_id.find(ID);
404 if (itr != _by_id.end())
405 return itr->second.seq;
406 else
407 return NULL;
408 }
409
410 const SequenceInfo* get_info(uint32_t ID) const
411 {
412
413 InvertedIDTable::const_iterator itr = _by_id.find(ID);
414 if (itr != _by_id.end())
415 {
416 return &(itr->second);
417 }
418 else
419 return NULL;
420 }
421
422 int observation_order(uint32_t ID) const
423 {
424 InvertedIDTable::const_iterator itr = _by_id.find(ID);
425 if (itr != _by_id.end())
426 {
427 return itr->second.observation_order;
428 }
429 else
430 return -1;
431 }
432
433 iterator begin() { return _by_id.begin(); }
434 iterator end() { return _by_id.end(); }
435
436 const_iterator begin() const { return _by_id.begin(); }
437 const_iterator end() const { return _by_id.end(); }
438
439 size_t size() const { return _by_id.size(); }
440
441 void clear()
442 {
443 //_by_name.clear();
444 _by_id.clear();
445 }
446
447 private:
448
449 // This is FNV-1, see http://en.wikipedia.org/wiki/Fowler_Noll_Vo_hash
450 inline uint32_t hash_string(const char* __s)
451 {
452 uint32_t hash = 0x811c9dc5;
453 for ( ; *__s; ++__s)
454 {
455 hash *= 16777619;
456 hash ^= *__s;
457 }
458 return hash;
459 }
460
461 //IDTable _by_name;
462 uint32_t _next_id;
463 bool _keep_names;
464 InvertedIDTable _by_id;
465 };
466
467
468 bool hit_insert_id_lt(const BowtieHit& h1, const BowtieHit& h2);
469
470 typedef vector<BowtieHit> HitList;
471
472 /* This class stores all the hits from a Bowtie map */
473
474 class HitTable
475 {
476 public:
477
478 typedef map<uint64_t, HitList> RefHits;
479 typedef RefHits::const_iterator const_iterator;
480 typedef RefHits::iterator iterator;
481
482 HitTable() : _total_hits(0) {}
483
484 const_iterator begin() const { return _hits_for_ref.begin(); }
485 const_iterator end() const { return _hits_for_ref.end(); }
486
487 iterator begin() { return _hits_for_ref.begin(); }
488 iterator end() { return _hits_for_ref.end(); }
489
490 void add_hit(const BowtieHit& bh, bool check_uniqueness);
491
492 void finalize()
493 {
494 for (RefHits::iterator i = _hits_for_ref.begin();
495 i != _hits_for_ref.end();
496 ++i)
497 {
498 sort(i->second.begin(), i->second.end(), hit_insert_id_lt);
499 }
500 }
501
502 HitList* get_hits(uint64_t ref_id)
503 {
504 RefHits::iterator i = _hits_for_ref.find(ref_id);
505 if (i == _hits_for_ref.end())
506 return NULL;
507 else
508 return &(i->second);
509 }
510
511 uint32_t total_hits() const { return _total_hits; }
512
513 private:
514 RefHits _hits_for_ref;
515 uint32_t _total_hits;
516 };
517
518 class HitStream;
519
520 class HitFactory {
521 friend class HitStream;
522 public:
523 HitFactory(ReadTable& insert_table,
524 RefSequenceTable& reference_table) :
525 _insert_table(insert_table), _ref_table(reference_table)
526 {}
527 virtual ~HitFactory() {}
528 virtual void openStream(HitStream& hs)=0;
529 virtual void rewind(HitStream& hs)=0;
530 virtual void closeStream(HitStream& hs)=0;
531 BowtieHit create_hit(const string& insert_name,
532 const string& ref_name,
533 int left,
534 const vector<CigarOp>& cigar,
535 bool antisense_aln,
536 bool antisense_splice,
537 unsigned char edit_dist,
538 unsigned char splice_mms,
539 bool end);
540
541 BowtieHit create_hit(const string& insert_name,
542 const string& ref_name,
543 uint32_t left,
544 uint32_t read_len,
545 bool antisense_aln,
546 unsigned char edit_dist,
547 bool end);
548
549 virtual string hitfile_rec(HitStream& hs, const char* hit_buf)=0;
550 virtual bool next_record(HitStream& hs, const char*& buf, size_t& buf_size) = 0;
551 virtual bool get_hit_from_buf(const char* bwt_buf,
552 BowtieHit& bh,
553 bool strip_slash,
554 char* name_out = NULL,
555 char* name_tags = NULL,
556 char* seq = NULL,
557 char* qual = NULL) = 0;
558
559 protected:
560 ReadTable& _insert_table;
561 RefSequenceTable& _ref_table;
562 HitStream* _hit_stream;
563 };//class HitFactory
564
565
566 class LineHitFactory : public HitFactory {
567 //for text line-based formats like Bowtie and SAM
568 public:
569 LineHitFactory(ReadTable& insert_table,
570 RefSequenceTable& reference_table) :
571 HitFactory(insert_table, reference_table) {}
572
573 string hitfile_rec(HitStream& hs, const char* hit_buf) {
574 string r(hit_buf);
575 return r;
576 }
577 void openStream(HitStream& hs);
578 void rewind(HitStream& hs);
579 void closeStream(HitStream& hs);
580 bool next_record(HitStream& hs, const char*& buf, size_t& buf_size);
581 protected:
582 static const size_t _hit_buf_max_sz = 10 * 1024;
583 char _hit_buf[_hit_buf_max_sz];
584 int _line_num;
585 };
586
587 class BowtieHitFactory : public LineHitFactory {
588 public:
589 BowtieHitFactory(ReadTable& insert_table,
590 RefSequenceTable& reference_table) :
591 LineHitFactory(insert_table, reference_table) {}
592
593 bool get_hit_from_buf(const char* bwt_buf,
594 BowtieHit& bh,
595 bool strip_slash,
596 char* name_out = NULL,
597 char* name_tags = NULL,
598 char* seq = NULL,
599 char* qual = NULL);
600 };
601
602 class SplicedBowtieHitFactory : public LineHitFactory {
603 public:
604 SplicedBowtieHitFactory(ReadTable& insert_table,
605 RefSequenceTable& reference_table,
606 int anchor_length) :
607 LineHitFactory(insert_table, reference_table),
608 _anchor_length(anchor_length){}
609
610 bool get_hit_from_buf(const char* bwt_buf,
611 BowtieHit& bh,
612 bool strip_slash,
613 char* name_out = NULL,
614 char* name_tags = NULL,
615 char* seq = NULL,
616 char* qual = NULL);
617 private:
618 int _anchor_length;
619 int _seg_offset;
620 int _size_buf;
621 };
622
623 class SplicedSAMHitFactory : public LineHitFactory {
624 public:
625 SplicedSAMHitFactory(ReadTable& insert_table,
626 RefSequenceTable& reference_table,
627 int anchor_length) :
628 LineHitFactory(insert_table, reference_table),
629 _anchor_length(anchor_length){}
630
631 bool get_hit_from_buf(const char* bwt_buf,
632 BowtieHit& bh,
633 bool strip_slash,
634 char* name_out = NULL,
635 char* name_tags = NULL,
636 char* seq = NULL,
637 char* qual = NULL);
638 private:
639 int _anchor_length;
640 int _seg_offset;
641 int _size_buf;
642 };
643
644
645 class SAMHitFactory : public LineHitFactory {
646 public:
647 SAMHitFactory(ReadTable& insert_table,
648 RefSequenceTable& reference_table) :
649 LineHitFactory(insert_table, reference_table) {}
650
651 bool get_hit_from_buf(const char* bwt_buf,
652 BowtieHit& bh,
653 bool strip_slash,
654 char* name_out = NULL,
655 char* name_tags = NULL,
656 char* seq = NULL,
657 char* qual = NULL);
658 };
659
660
661 /******************************************************************************
662 BAMHitFactory turns SAM alignments into BowtieHits
663 *******************************************************************************/
664 class BAMHitFactory : public HitFactory {
665 public:
666
667 BAMHitFactory(ReadTable& insert_table,
668 RefSequenceTable& reference_table) :
669 HitFactory(insert_table, reference_table)
670 {
671 _sam_header=NULL;
672 }
673 void openStream(HitStream& hs);
674 void rewind(HitStream& hs);
675 void closeStream(HitStream& hs);
676
677 bool next_record(HitStream& hs, const char*& buf, size_t& buf_size);
678
679 /*void mark_curr_pos()
680 {
681 _curr_pos = bgzf_tell(((samfile_t*)_hit_file)->x.bam);
682 }
683
684
685 void undo_hit()
686 {
687 bgzf_seek(((samfile_t*)_hit_file)->x.bam, _curr_pos, SEEK_SET);
688 _eof = false;
689 }
690 */
691 string hitfile_rec(HitStream& hs, const char* hit_buf);
692
693 bool get_hit_from_buf(const char* bwt_buf,
694 BowtieHit& bh,
695 bool strip_slash,
696 char* name_out = NULL,
697 char* name_tags = NULL,
698 char* seq = NULL,
699 char* qual = NULL);
700 private:
701 //int64_t _curr_pos;
702 //int64_t _beginning;
703 bam1_t _next_hit;
704 bam_header_t* _sam_header;
705 bool inspect_header(HitStream& hs);
706 };
707
708
709 struct HitsForRead
710 {
711 HitsForRead() : insert_id(0) {}
712 uint64_t insert_id;
713 vector<BowtieHit> hits;
714 };
715
716 class HitStream
717 {
718 friend class HitFactory;
719 friend class LineHitFactory;
720 friend class BAMHitFactory;
721 //private:
722 HitFactory* _factory;
723 bool _spliced;
724 bool _strip_slash;
725 BowtieHit buffered_hit;
726 bool _keep_bufs;
727 bool _keep_seqs;
728 bool _keep_quals;
729 bool _from_bowtie;
730 void* _hit_file;
731 string _hit_file_name;
732 FZPipe* _fzpipe;
733 bool _eof;
734
735 public:
736 HitStream(void* hit_file, //could be FILE* or samfile_t*
737 HitFactory* hit_factory,
738 bool spliced,
739 bool strip_slash,
740 bool keep_bufs,
741 bool keep_seqs = false,
742 bool keep_quals = false,
743 bool from_bowtie = false) :
744 _factory(hit_factory),
745 _spliced(spliced),
746 _strip_slash(strip_slash),
747 buffered_hit(BowtieHit()),
748 _keep_bufs(keep_bufs),
749 _keep_seqs(keep_seqs),
750 _keep_quals(keep_quals),
751 _from_bowtie(from_bowtie),
752 _hit_file(hit_file),
753 _hit_file_name(),
754 _fzpipe(NULL),
755 _eof(false)
756 {
757 primeStream();
758 }
759
760 HitStream(string& hit_filename,
761 HitFactory* hit_factory,
762 bool spliced,
763 bool strip_slash,
764 bool keep_bufs,
765 bool keep_seqs = false,
766 bool keep_quals = false,
767 bool from_bowtie = false) :
768 _factory(hit_factory),
769 _spliced(spliced),
770 _strip_slash(strip_slash),
771 buffered_hit(BowtieHit()),
772 _keep_bufs(keep_bufs),
773 _keep_seqs(keep_seqs),
774 _keep_quals(keep_quals),
775 _from_bowtie(from_bowtie),
776 _hit_file(NULL),
777 _hit_file_name(hit_filename),
778 _fzpipe(NULL),
779 _eof(false)
780 {
781 _factory->openStream(*this);
782 primeStream();
783 }
784
785 HitStream(FZPipe& hit_filepipe,
786 HitFactory* hit_factory,
787 bool spliced,
788 bool strip_slash,
789 bool keep_bufs,
790 bool keep_seqs = false,
791 bool keep_quals = false,
792 bool from_bowtie = false) :
793 _factory(hit_factory),
794 _spliced(spliced),
795 _strip_slash(strip_slash),
796 buffered_hit(BowtieHit()),
797 _keep_bufs(keep_bufs),
798 _keep_seqs(keep_seqs),
799 _keep_quals(keep_quals),
800 _from_bowtie(from_bowtie),
801 _hit_file(NULL),
802 _hit_file_name(),
803 _fzpipe(&hit_filepipe),
804 _eof(false)
805 {
806 _hit_file=_fzpipe->file;
807 primeStream();
808 }
809
810 void primeStream() { //why?
811 // Prime the stream by reading a single hit into the buffered_hit
812 HitsForRead dummy = HitsForRead();
813 next_read_hits(dummy);
814 }
815 bool eof() { return _eof; }
816 bool ready() { return (_hit_file!=NULL); }
817 void reset() {
818 _factory->rewind(*this);
819 _eof=false;
820 // re-prime the stream;
821 buffered_hit = BowtieHit();
822 primeStream();
823 }
824
825 bool next_read_hits(HitsForRead& hits_for_read)
826 {
827 hits_for_read.hits.clear();
828 hits_for_read.insert_id = 0;
829
830 //if (!_hit_file || (feof(_hit_file) && buffered_hit.insert_id() == 0))
831 // return false;
832 if (!this->ready())
833 //err_die("Error: next_read_hits() called on HitFactory with no file handle\n");
834 return false;
835 if (this->eof() && buffered_hit.insert_id() == 0) {
836 return false;
837 }
838
839 //char bwt_buf[2048]; bwt_buf[0] = 0;
840 char bwt_seq[2048]; bwt_seq[0] = 0;
841 char bwt_qual[2048]; bwt_qual[0] = 0;
842
843 char* seq = _keep_seqs ? bwt_seq : NULL;
844 char* qual = _keep_quals ? bwt_qual : NULL;
845
846 hits_for_read.insert_id = buffered_hit.insert_id();
847 if (hits_for_read.insert_id)
848 hits_for_read.hits.push_back(buffered_hit);
849 const char* hit_buf;
850 size_t hit_buf_size = 0;
851 while (true) {
852 if (!_factory->next_record(*this, hit_buf, hit_buf_size)) {
853 buffered_hit = BowtieHit();
854 break; }
855 //string clean_buf = bwt_buf;
856 // Get a new record from the tab-delimited Bowtie map
857 BowtieHit bh;
858 if (_factory->get_hit_from_buf(hit_buf, bh, _strip_slash,
859 NULL, NULL, seq, qual)) {
860 if (_keep_bufs)
861 bh.hitfile_rec(_factory->hitfile_rec(*this, hit_buf));
862
863 if (_keep_seqs)
864 bh.seq(seq);
865
866 if (_keep_quals) {
867 // when it comes to convert from qual in color to qual in bp,
868 // we need to fill in the two extream qual values using the adjacent qual values.
869 size_t qual_len = strlen(qual);
870 if (color && !color_out && qual_len > 2) {
871 qual[0] = qual[1];
872 qual[qual_len-1] = qual[qual_len-2];
873 }
874 bh.qual(qual);
875 }
876
877 if (bh.insert_id() == hits_for_read.insert_id) {
878 hits_for_read.hits.push_back(bh);
879 }
880 else {
881 buffered_hit = bh;
882 break;
883 }
884 } //hit parsed
885 } //while reading hits
886
887 return (!hits_for_read.hits.empty() && hits_for_read.insert_id != 0);
888 }
889
890 uint64_t next_group_id() const
891 {
892 return buffered_hit.insert_id();
893 }
894 bool fromBowtie() { return _from_bowtie; }
895 };
896
897 typedef uint32_t MateStatusMask;
898
899 void get_mapped_reads(FILE* bwtf,
900 HitTable& hits,
901 HitFactory& hit_factory,
902 bool strip_slash,
903 bool verbose = false);
904
905 //bool left_status_better(MateStatusMask left, MateStatusMask right);
906 //bool status_equivalent(MateStatusMask left, MateStatusMask right);
907 typedef uint32_t MateStatusMask;
908
909 void add_hits_to_coverage(const HitList& hits, vector<unsigned short>& DoC);
910 void add_hit_to_coverage(const BowtieHit& bh, vector<unsigned int>& DoC);
911
912 void accept_all_hits(HitTable& hits);
913
914 //print BowtieHit as SAM line
915 void print_hit(FILE* fout,
916 const char* read_name,
917 const BowtieHit& bh,
918 const char* ref_name,
919 const char* sequence,
920 const char* qualities,
921 bool from_bowtie = false);
922
923 //print BowtieHit as BAM record
924 void print_bamhit(GBamWriter& wbam,
925 const char* read_name,
926 const BowtieHit& bh,
927 const char* ref_name,
928 const char* sequence,
929 const char* qualities,
930 bool from_bowtie = false);
931
932 /**
933 * Convert a vector of CigarOps to a string representation
934 */
935 std::string print_cigar(vector<CigarOp>& bh_cigar);
936
937 #endif