ViewVC Help
View File | Revision Log | Show Annotations | View Changeset | Root Listing
root/gclib/tophat_cpp/bwt_map.h
Revision: 135
Committed: Mon Dec 12 22:28:38 2011 UTC (8 years, 8 months ago) by gpertea
File size: 23308 byte(s)
Log Message:
wip - SplicedSAMHitFactory() still not implemented

Line File contents
1 #ifndef BWT_MAP_H
2 #define BWT_MAP_H
3
4 #ifdef HAVE_CONFIG_H
5 #include <config.h>
6 #endif
7
8 #include <stdint.h>
9 #include <stdio.h>
10 #include <stdlib.h>
11 #include <string>
12 #include <map>
13 #include <vector>
14 #include <cassert>
15 #include <cstring>
16 #include <seqan/sequence.h>
17
18 #include <bam/sam.h>
19 using namespace std;
20
21 #include "common.h"
22 #define _FBUF_SIZE 10*1024
23 /*
24 * bwt_map.h
25 * TopHat
26 *
27 * Created by Cole Trapnell on 11/17/08.
28 * Copyright 2008 Cole Trapnell. All rights reserved.
29 *
30 */
31
32 enum CigarOpCode { MATCH, INS, DEL, REF_SKIP, SOFT_CLIP, HARD_CLIP, PAD };
33
34 struct CigarOp
35 {
36 CigarOp(CigarOpCode o, uint32_t l) : opcode(o), length(l) {}
37 CigarOpCode opcode : 3;
38 uint32_t length : 29;
39
40 bool operator==(const CigarOp& rhs) const
41 {
42 return opcode == rhs.opcode && length == rhs.length;
43 }
44
45 };
46
47 typedef uint32_t ReadID;
48
49 class RefSequenceTable;
50
51 /* Stores the information from a single record of the bowtie map. A given read
52 may have many of these. Reads up to 255bp are supported.
53 */
54 struct BowtieHit
55 {
56 BowtieHit() :
57 _ref_id(0),
58 _insert_id(0),
59 _left(0),
60 _antisense_splice(false),
61 _antisense_aln(false),
62 _edit_dist(0),
63 _splice_mms(0),
64 _end(false){}
65
66 BowtieHit(uint32_t ref_id,
67 ReadID insert_id,
68 int left,
69 int read_len,
70 bool antisense,
71 unsigned char edit_dist,
72 bool end) :
73 _ref_id(ref_id),
74 _insert_id(insert_id),
75 _left(left),
76 _cigar(vector<CigarOp>(1,CigarOp(MATCH,read_len))),
77 _antisense_splice(false),
78 _antisense_aln(antisense),
79 _edit_dist(edit_dist),
80 _splice_mms(0),
81 _end(end)
82 {
83 assert(_cigar.capacity() == _cigar.size());
84 }
85
86 BowtieHit(uint32_t ref_id,
87 ReadID insert_id,
88 int left,
89 const vector<CigarOp>& cigar,
90 bool antisense_aln,
91 bool antisense_splice,
92 unsigned char edit_dist,
93 unsigned char splice_mms,
94 bool end) :
95 _ref_id(ref_id),
96 _insert_id(insert_id),
97 _left(left),
98 _cigar(cigar),
99 _antisense_splice(antisense_splice),
100 _antisense_aln(antisense_aln),
101 _edit_dist(edit_dist),
102 _splice_mms(splice_mms),
103 _end(end)
104 {
105 assert(_cigar.capacity() == _cigar.size());
106 }
107
108 int read_len() const
109 {
110 uint32_t len = 0;
111 for (size_t i = 0; i < _cigar.size(); ++i)
112 {
113 const CigarOp& op = _cigar[i];
114 switch(op.opcode)
115 {
116 case MATCH:
117 case INS:
118 case SOFT_CLIP:
119 len += op.length;
120 break;
121 default:
122 break;
123 }
124 }
125
126 return len;
127 }
128
129 bool operator==(const BowtieHit& rhs) const
130 {
131 return (_insert_id == rhs._insert_id &&
132 _ref_id == rhs._ref_id &&
133 _antisense_aln == rhs._antisense_aln &&
134 _left == rhs._left &&
135 _antisense_splice == rhs._antisense_splice &&
136 _edit_dist == rhs._edit_dist &&
137 /* DO NOT USE ACCEPTED IN COMPARISON */
138 _cigar == rhs._cigar);
139 }
140
141 bool operator<(const BowtieHit& rhs) const
142 {
143 if (_insert_id != rhs._insert_id)
144 return _insert_id < rhs._insert_id;
145 if (_ref_id != rhs._ref_id)
146 return _ref_id < rhs._ref_id;
147 if (_left != rhs._left)
148 return _left < rhs._left;
149 if (_antisense_aln != rhs._antisense_aln)
150 return _antisense_aln < rhs._antisense_aln;
151 if (_edit_dist != rhs._edit_dist)
152 return _edit_dist < rhs._edit_dist;
153 if (_cigar != rhs._cigar)
154 {
155 if (_cigar.size() != rhs._cigar.size())
156 return _cigar.size() < rhs._cigar.size();
157 for (size_t i = 0; i < _cigar.size(); ++i)
158 {
159 if (!(_cigar[i] == rhs._cigar[i]))
160 return (_cigar[i].opcode < rhs._cigar[i].opcode || (_cigar[i].opcode == rhs._cigar[i].opcode && _cigar[i].length < rhs._cigar[i].length));
161 }
162 }
163 return false;
164 }
165
166 uint32_t ref_id() const { return _ref_id; }
167 ReadID insert_id() const { return _insert_id; }
168 int left() const { return _left; }
169 int right() const
170 {
171 int r = _left;
172 for (size_t i = 0; i < _cigar.size(); ++i)
173 {
174 const CigarOp& op = _cigar[i];
175
176 switch(op.opcode)
177 {
178 case MATCH:
179 case REF_SKIP:
180 case DEL:
181 r += op.length;
182 break;
183 default:
184 break;
185 }
186 }
187 return r;
188 }
189
190 bool is_spliced()
191 {
192 for (size_t i = 0; i < _cigar.size(); ++i)
193 {
194 const CigarOp& op = _cigar[i];
195
196 if (op.opcode == REF_SKIP)
197 return true;
198 }
199 return false;
200 }
201
202 bool antisense_splice() const { return _antisense_splice; }
203 bool antisense_align() const { return _antisense_aln; }
204
205 unsigned char edit_dist() const { return _edit_dist; }
206 unsigned char splice_mms() const { return _splice_mms; }
207
208 // For convenience, if you just want a copy of the gap intervals
209 // for this hit.
210 void gaps(vector<pair<int,int> >& gaps_out) const
211 {
212 gaps_out.clear();
213 int pos = _left;
214 for (size_t i = 0; i < _cigar.size(); ++i)
215 {
216 const CigarOp& op = _cigar[i];
217
218 switch(op.opcode)
219 {
220 case REF_SKIP:
221 gaps_out.push_back(make_pair(pos, pos + op.length - 1));
222 pos += op.length;
223 break;
224 case MATCH:
225 case DEL:
226 pos += op.length;
227 break;
228 default:
229 break;
230 }
231 }
232 }
233
234 const vector<CigarOp>& cigar() const { return _cigar; }
235
236
237 bool contiguous() const
238 {
239 return _cigar.size() == 1 && _cigar[0].opcode == MATCH;
240 }
241
242 const string& hitfile_rec() const { return _hitfile_rec; }
243 void hitfile_rec(const string& rec) { _hitfile_rec = rec; }
244
245 const string& seq() const { return _seq; }
246 void seq(const string& seq) { _seq = seq; }
247
248 const string& qual() const { return _qual; }
249 void qual(const string& qual) { _qual = qual; }
250
251 bool end() const { return _end; }
252 void end(bool end) { _end = end; }
253
254 // this is for debugging purpose
255 bool check_editdist_consistency(const RefSequenceTable& rt);
256
257 private:
258
259 uint32_t _ref_id;
260 ReadID _insert_id; // Id of the sequencing insert
261 int _left; // Position in the reference of the left side of the alignment
262
263 vector<CigarOp> _cigar;
264
265 bool _antisense_splice; // Whether the junction spanned is on the reverse strand
266 bool _antisense_aln; // Whether the alignment is to the reverse strand
267
268 unsigned char _edit_dist; // Total mismatches (note this is not including insertions or deletions as mismatches, ie, not equivalent to NM field of a SAM record)
269 unsigned char _splice_mms; // Mismatches within min_anchor_len of a splice junction
270 string _hitfile_rec; // Points to the buffer for the record from which this hit came
271 string _seq;
272 string _qual;
273 bool _end; // Whether this segment is the last one of the read it belongs to
274 };
275
276 class ReadTable
277 {
278 public:
279
280 ReadTable() :
281 _next_id(1) {}
282
283 // This function should NEVER return zero
284 ReadID get_id(const string& name)
285 {
286 uint32_t _id = atoi(name.c_str());
287 //assert(_id);
288 _next_id = max(_next_id, (size_t)_id);
289 return _id;
290 }
291
292
293 uint32_t observation_order(ReadID ID)
294 {
295 if (ID == 0)
296 return VMAXINT32;
297 return ID;
298 }
299
300
301 size_t size() const { return _next_id; }
302
303 private:
304 size_t _next_id;
305 };
306
307 class RefSequenceTable
308 {
309 public:
310
311 typedef seqan::String<seqan::Dna5, seqan::Packed<seqan::Alloc<> > > Sequence;
312
313 struct SequenceInfo
314 {
315 SequenceInfo(uint32_t _order,
316 char* _name,
317 Sequence* _seq, uint32_t _len) :
318 observation_order(_order),
319 name(_name),
320 seq(_seq),
321 len(_len) {}
322
323 uint32_t observation_order;
324 char* name;
325 Sequence* seq;
326 uint32_t len;
327 };
328
329 typedef map<string, uint64_t> IDTable;
330 typedef map<uint32_t, SequenceInfo> InvertedIDTable;
331 typedef InvertedIDTable::iterator iterator;
332 typedef InvertedIDTable::const_iterator const_iterator;
333
334 RefSequenceTable(bool keep_names, bool keep_seqs = false) :
335 _next_id(1),
336 _keep_names(keep_names) {}
337
338 RefSequenceTable(const string& sam_header_filename,
339 bool keep_names,
340 bool keep_seqs = false) :
341 _next_id(1),
342 _keep_names(keep_names)
343 {
344 if (sam_header_filename != "")
345 {
346 samfile_t* fh = samopen(sam_header_filename.c_str(), "r", 0);
347 if (fh == 0) {
348 fprintf(stderr, "Failed to open SAM header file %s\n", sam_header_filename.c_str());
349 exit(1);
350 }
351
352 for (size_t i = 0; i < (size_t)fh->header->n_targets; ++i)
353 {
354 const char* name = fh->header->target_name[i];
355 uint32_t len = fh->header->target_len[i];
356 get_id(name, NULL, len);
357 //fprintf(stderr, "SQ: %s - %d\n", name, len);
358 }
359 }
360 }
361
362 // This function should NEVER return zero
363 uint32_t get_id(const string& name,
364 Sequence* seq,
365 uint32_t len)
366 {
367 uint32_t _id = hash_string(name.c_str());
368 pair<InvertedIDTable::iterator, bool> ret =
369 _by_id.insert(make_pair(_id, SequenceInfo(_next_id, NULL, NULL, 0)));
370 if (ret.second == true)
371 {
372 char* _name = NULL;
373 if (_keep_names)
374 _name = strdup(name.c_str());
375 ret.first->second.name = _name;
376 ret.first->second.seq = seq;
377 ret.first->second.len = len;
378 ++_next_id;
379 }
380 assert (_id);
381 return _id;
382 }
383
384 const char* get_name(uint32_t ID) const
385 {
386 InvertedIDTable::const_iterator itr = _by_id.find(ID);
387 if (itr != _by_id.end())
388 return itr->second.name;
389 else
390 return NULL;
391 }
392
393 uint32_t get_len(uint32_t ID) const
394 {
395 InvertedIDTable::const_iterator itr = _by_id.find(ID);
396 if (itr != _by_id.end())
397 return itr->second.len;
398 else
399 return 0;
400 }
401
402 Sequence* get_seq(uint32_t ID) const
403 {
404 InvertedIDTable::const_iterator itr = _by_id.find(ID);
405 if (itr != _by_id.end())
406 return itr->second.seq;
407 else
408 return NULL;
409 }
410
411 const SequenceInfo* get_info(uint32_t ID) const
412 {
413
414 InvertedIDTable::const_iterator itr = _by_id.find(ID);
415 if (itr != _by_id.end())
416 {
417 return &(itr->second);
418 }
419 else
420 return NULL;
421 }
422
423 int observation_order(uint32_t ID) const
424 {
425 InvertedIDTable::const_iterator itr = _by_id.find(ID);
426 if (itr != _by_id.end())
427 {
428 return itr->second.observation_order;
429 }
430 else
431 return -1;
432 }
433
434 iterator begin() { return _by_id.begin(); }
435 iterator end() { return _by_id.end(); }
436
437 const_iterator begin() const { return _by_id.begin(); }
438 const_iterator end() const { return _by_id.end(); }
439
440 size_t size() const { return _by_id.size(); }
441
442 void clear()
443 {
444 //_by_name.clear();
445 _by_id.clear();
446 }
447
448 // daehwan
449 // This is FNV-1, see http://en.wikipedia.org/wiki/Fowler_Noll_Vo_hash
450 static inline uint32_t hash_string(const char* __s)
451 {
452 uint32_t hash = 0x811c9dc5;
453 for ( ; *__s; ++__s)
454 {
455 hash *= 16777619;
456 hash ^= *__s;
457 }
458 return hash;
459 }
460
461 private:
462
463 //IDTable _by_name;
464 uint32_t _next_id;
465 bool _keep_names;
466 InvertedIDTable _by_id;
467 };
468
469
470 bool hit_insert_id_lt(const BowtieHit& h1, const BowtieHit& h2);
471
472 typedef vector<BowtieHit> HitList;
473
474 /* This class stores all the hits from a Bowtie map */
475
476 class HitTable
477 {
478 public:
479
480 typedef map<uint64_t, HitList> RefHits;
481 typedef RefHits::const_iterator const_iterator;
482 typedef RefHits::iterator iterator;
483
484 HitTable() : _total_hits(0) {}
485
486 const_iterator begin() const { return _hits_for_ref.begin(); }
487 const_iterator end() const { return _hits_for_ref.end(); }
488
489 iterator begin() { return _hits_for_ref.begin(); }
490 iterator end() { return _hits_for_ref.end(); }
491
492 void add_hit(const BowtieHit& bh, bool check_uniqueness);
493
494 void finalize()
495 {
496 for (RefHits::iterator i = _hits_for_ref.begin();
497 i != _hits_for_ref.end();
498 ++i)
499 {
500 sort(i->second.begin(), i->second.end(), hit_insert_id_lt);
501 }
502 }
503
504 HitList* get_hits(uint64_t ref_id)
505 {
506 RefHits::iterator i = _hits_for_ref.find(ref_id);
507 if (i == _hits_for_ref.end())
508 return NULL;
509 else
510 return &(i->second);
511 }
512
513 uint32_t total_hits() const { return _total_hits; }
514
515 private:
516 RefHits _hits_for_ref;
517 uint32_t _total_hits;
518 };
519
520 class HitStream;
521
522 class HitFactory {
523 friend class HitStream;
524 public:
525 HitFactory(ReadTable& insert_table,
526 RefSequenceTable& reference_table) :
527 _insert_table(insert_table), _ref_table(reference_table)
528 {}
529 virtual ~HitFactory() {}
530 virtual void openStream(HitStream& hs)=0;
531 virtual void rewind(HitStream& hs)=0;
532 virtual void closeStream(HitStream& hs)=0;
533 BowtieHit create_hit(const string& insert_name,
534 const string& ref_name,
535 int left,
536 const vector<CigarOp>& cigar,
537 bool antisense_aln,
538 bool antisense_splice,
539 unsigned char edit_dist,
540 unsigned char splice_mms,
541 bool end);
542
543 BowtieHit create_hit(const string& insert_name,
544 const string& ref_name,
545 uint32_t left,
546 uint32_t read_len,
547 bool antisense_aln,
548 unsigned char edit_dist,
549 bool end);
550
551 virtual string hitfile_rec(HitStream& hs, const char* hit_buf)=0;
552 virtual bool next_record(HitStream& hs, const char*& buf, size_t& buf_size) = 0;
553 virtual bool get_hit_from_buf(const char* bwt_buf,
554 BowtieHit& bh,
555 bool strip_slash,
556 char* name_out = NULL,
557 char* name_tags = NULL,
558 char* seq = NULL,
559 char* qual = NULL) = 0;
560
561 protected:
562 ReadTable& _insert_table;
563 RefSequenceTable& _ref_table;
564 HitStream* _hit_stream;
565 };//class HitFactory
566
567
568 class LineHitFactory : public HitFactory {
569 //for text line-based formats like Bowtie and SAM
570 public:
571 LineHitFactory(ReadTable& insert_table,
572 RefSequenceTable& reference_table) :
573 HitFactory(insert_table, reference_table) {}
574
575 string hitfile_rec(HitStream& hs, const char* hit_buf) {
576 string r(hit_buf);
577 return r;
578 }
579 void openStream(HitStream& hs);
580 void rewind(HitStream& hs);
581 void closeStream(HitStream& hs);
582 bool next_record(HitStream& hs, const char*& buf, size_t& buf_size);
583 protected:
584 static const size_t _hit_buf_max_sz = 10 * 1024;
585 char _hit_buf[_hit_buf_max_sz];
586 int _line_num;
587 };
588
589 class BowtieHitFactory : public LineHitFactory {
590 public:
591 BowtieHitFactory(ReadTable& insert_table,
592 RefSequenceTable& reference_table) :
593 LineHitFactory(insert_table, reference_table) {}
594
595 bool get_hit_from_buf(const char* bwt_buf,
596 BowtieHit& bh,
597 bool strip_slash,
598 char* name_out = NULL,
599 char* name_tags = NULL,
600 char* seq = NULL,
601 char* qual = NULL);
602 };
603
604 class SplicedBowtieHitFactory : public LineHitFactory {
605 public:
606 SplicedBowtieHitFactory(ReadTable& insert_table,
607 RefSequenceTable& reference_table,
608 int anchor_length) :
609 LineHitFactory(insert_table, reference_table),
610 _anchor_length(anchor_length){}
611
612 bool get_hit_from_buf(const char* bwt_buf,
613 BowtieHit& bh,
614 bool strip_slash,
615 char* name_out = NULL,
616 char* name_tags = NULL,
617 char* seq = NULL,
618 char* qual = NULL);
619 private:
620 int _anchor_length;
621 int _seg_offset;
622 int _size_buf;
623 };
624
625 class SplicedSAMHitFactory : public LineHitFactory {
626 public:
627 SplicedSAMHitFactory(ReadTable& insert_table,
628 RefSequenceTable& reference_table,
629 int anchor_length) :
630 LineHitFactory(insert_table, reference_table),
631 _anchor_length(anchor_length){}
632
633 bool get_hit_from_buf(const char* bwt_buf,
634 BowtieHit& bh,
635 bool strip_slash,
636 char* name_out = NULL,
637 char* name_tags = NULL,
638 char* seq = NULL,
639 char* qual = NULL);
640 private:
641 int _anchor_length;
642 int _seg_offset;
643 int _size_buf;
644 };
645
646
647 class SAMHitFactory : public LineHitFactory {
648 public:
649 SAMHitFactory(ReadTable& insert_table,
650 RefSequenceTable& reference_table) :
651 LineHitFactory(insert_table, reference_table) {}
652
653 bool get_hit_from_buf(const char* bwt_buf,
654 BowtieHit& bh,
655 bool strip_slash,
656 char* name_out = NULL,
657 char* name_tags = NULL,
658 char* seq = NULL,
659 char* qual = NULL);
660 };
661
662
663 /******************************************************************************
664 BAMHitFactory turns SAM alignments into BowtieHits
665 *******************************************************************************/
666 class BAMHitFactory : public HitFactory {
667 public:
668
669 BAMHitFactory(ReadTable& insert_table,
670 RefSequenceTable& reference_table) :
671 HitFactory(insert_table, reference_table)
672 {
673 _sam_header=NULL;
674 }
675 void openStream(HitStream& hs);
676 void rewind(HitStream& hs);
677 void closeStream(HitStream& hs);
678
679 bool next_record(HitStream& hs, const char*& buf, size_t& buf_size);
680
681 /*void mark_curr_pos()
682 {
683 _curr_pos = bgzf_tell(((samfile_t*)_hit_file)->x.bam);
684 }
685
686
687 void undo_hit()
688 {
689 bgzf_seek(((samfile_t*)_hit_file)->x.bam, _curr_pos, SEEK_SET);
690 _eof = false;
691 }
692 */
693 string hitfile_rec(HitStream& hs, const char* hit_buf);
694
695 bool get_hit_from_buf(const char* bwt_buf,
696 BowtieHit& bh,
697 bool strip_slash,
698 char* name_out = NULL,
699 char* name_tags = NULL,
700 char* seq = NULL,
701 char* qual = NULL);
702 private:
703 //int64_t _curr_pos;
704 //int64_t _beginning;
705 bam1_t _next_hit;
706 bam_header_t* _sam_header;
707 bool inspect_header(HitStream& hs);
708 };
709
710
711 struct HitsForRead
712 {
713 HitsForRead() : insert_id(0) {}
714 uint64_t insert_id;
715 vector<BowtieHit> hits;
716 };
717
718 class HitStream
719 {
720 friend class HitFactory;
721 friend class LineHitFactory;
722 friend class BAMHitFactory;
723 //private:
724 HitFactory* _factory;
725 bool _spliced;
726 bool _strip_slash;
727 BowtieHit buffered_hit;
728 bool _keep_bufs;
729 bool _keep_seqs;
730 bool _keep_quals;
731 bool _from_bowtie;
732 void* _hit_file;
733 string _hit_file_name;
734 FZPipe* _fzpipe;
735 bool _eof;
736
737 public:
738 HitStream(void* hit_file, //could be FILE* or samfile_t*
739 HitFactory* hit_factory,
740 bool spliced,
741 bool strip_slash,
742 bool keep_bufs,
743 bool keep_seqs = false,
744 bool keep_quals = false,
745 bool from_bowtie = false) :
746 _factory(hit_factory),
747 _spliced(spliced),
748 _strip_slash(strip_slash),
749 buffered_hit(BowtieHit()),
750 _keep_bufs(keep_bufs),
751 _keep_seqs(keep_seqs),
752 _keep_quals(keep_quals),
753 _from_bowtie(from_bowtie),
754 _hit_file(hit_file),
755 _hit_file_name(),
756 _fzpipe(NULL),
757 _eof(false)
758 {
759 primeStream();
760 }
761
762 HitStream(string& hit_filename,
763 HitFactory* hit_factory,
764 bool spliced,
765 bool strip_slash,
766 bool keep_bufs,
767 bool keep_seqs = false,
768 bool keep_quals = false,
769 bool from_bowtie = false) :
770 _factory(hit_factory),
771 _spliced(spliced),
772 _strip_slash(strip_slash),
773 buffered_hit(BowtieHit()),
774 _keep_bufs(keep_bufs),
775 _keep_seqs(keep_seqs),
776 _keep_quals(keep_quals),
777 _from_bowtie(from_bowtie),
778 _hit_file(NULL),
779 _hit_file_name(hit_filename),
780 _fzpipe(NULL),
781 _eof(false)
782 {
783 _factory->openStream(*this);
784 primeStream();
785 }
786
787 HitStream(FZPipe& hit_filepipe,
788 HitFactory* hit_factory,
789 bool spliced,
790 bool strip_slash,
791 bool keep_bufs,
792 bool keep_seqs = false,
793 bool keep_quals = false,
794 bool from_bowtie = false) :
795 _factory(hit_factory),
796 _spliced(spliced),
797 _strip_slash(strip_slash),
798 buffered_hit(BowtieHit()),
799 _keep_bufs(keep_bufs),
800 _keep_seqs(keep_seqs),
801 _keep_quals(keep_quals),
802 _from_bowtie(from_bowtie),
803 _hit_file(NULL),
804 _hit_file_name(),
805 _fzpipe(&hit_filepipe),
806 _eof(false)
807 {
808 _hit_file=_fzpipe->file;
809 primeStream();
810 }
811
812 void primeStream() { //why?
813 // Prime the stream by reading a single hit into the buffered_hit
814 HitsForRead dummy = HitsForRead();
815 next_read_hits(dummy);
816 }
817 bool eof() { return _eof; }
818 bool ready() { return (_hit_file!=NULL); }
819 void reset() {
820 _factory->rewind(*this);
821 _eof=false;
822 // re-prime the stream;
823 buffered_hit = BowtieHit();
824 primeStream();
825 }
826
827 bool next_read_hits(HitsForRead& hits_for_read)
828 {
829 hits_for_read.hits.clear();
830 hits_for_read.insert_id = 0;
831
832 //if (!_hit_file || (feof(_hit_file) && buffered_hit.insert_id() == 0))
833 // return false;
834 if (!this->ready())
835 //err_die("Error: next_read_hits() called on HitFactory with no file handle\n");
836 return false;
837 if (this->eof() && buffered_hit.insert_id() == 0) {
838 return false;
839 }
840
841 //char bwt_buf[2048]; bwt_buf[0] = 0;
842 char bwt_seq[2048]; bwt_seq[0] = 0;
843 char bwt_qual[2048]; bwt_qual[0] = 0;
844
845 char* seq = _keep_seqs ? bwt_seq : NULL;
846 char* qual = _keep_quals ? bwt_qual : NULL;
847
848 hits_for_read.insert_id = buffered_hit.insert_id();
849 if (hits_for_read.insert_id)
850 hits_for_read.hits.push_back(buffered_hit);
851 const char* hit_buf;
852 size_t hit_buf_size = 0;
853 while (true) {
854 if (!_factory->next_record(*this, hit_buf, hit_buf_size)) {
855 buffered_hit = BowtieHit();
856 break; }
857 //string clean_buf = bwt_buf;
858 // Get a new record from the tab-delimited Bowtie map
859 BowtieHit bh;
860 if (_factory->get_hit_from_buf(hit_buf, bh, _strip_slash,
861 NULL, NULL, seq, qual)) {
862 if (_keep_bufs)
863 bh.hitfile_rec(_factory->hitfile_rec(*this, hit_buf));
864
865 if (_keep_seqs)
866 bh.seq(seq);
867
868 if (_keep_quals) {
869 // when it comes to convert from qual in color to qual in bp,
870 // we need to fill in the two extream qual values using the adjacent qual values.
871 size_t qual_len = strlen(qual);
872 if (color && !color_out && qual_len > 2) {
873 qual[0] = qual[1];
874 qual[qual_len-1] = qual[qual_len-2];
875 }
876 bh.qual(qual);
877 }
878
879 if (bh.insert_id() == hits_for_read.insert_id) {
880 hits_for_read.hits.push_back(bh);
881 }
882 else {
883 buffered_hit = bh;
884 break;
885 }
886 } //hit parsed
887 } //while reading hits
888
889 return (!hits_for_read.hits.empty() && hits_for_read.insert_id != 0);
890 }
891
892 uint64_t next_group_id() const
893 {
894 return buffered_hit.insert_id();
895 }
896 bool fromBowtie() { return _from_bowtie; }
897 };
898
899 typedef uint32_t MateStatusMask;
900
901 void get_mapped_reads(FILE* bwtf,
902 HitTable& hits,
903 HitFactory& hit_factory,
904 bool strip_slash,
905 bool verbose = false);
906
907 //bool left_status_better(MateStatusMask left, MateStatusMask right);
908 //bool status_equivalent(MateStatusMask left, MateStatusMask right);
909 typedef uint32_t MateStatusMask;
910
911 void add_hits_to_coverage(const HitList& hits, vector<unsigned short>& DoC);
912 void add_hit_to_coverage(const BowtieHit& bh, vector<unsigned int>& DoC);
913
914 void accept_all_hits(HitTable& hits);
915
916 //print BowtieHit as SAM line
917 void print_hit(FILE* fout,
918 const char* read_name,
919 const BowtieHit& bh,
920 const char* ref_name,
921 const char* sequence,
922 const char* qualities,
923 bool from_bowtie = false);
924
925 //print BowtieHit as BAM record
926 void print_bamhit(GBamWriter& wbam,
927 const char* read_name,
928 const BowtieHit& bh,
929 const char* ref_name,
930 const char* sequence,
931 const char* qualities,
932 bool from_bowtie = false);
933
934 /**
935 * Convert a vector of CigarOps to a string representation
936 */
937 std::string print_cigar(vector<CigarOp>& bh_cigar);
938
939 #endif