ViewVC Help
View File | Revision Log | Show Annotations | View Changeset | Root Listing
root/gclib/tophat_cpp/bwt_map.h
Revision: 154
Committed: Tue Jan 24 02:29:21 2012 UTC (7 years, 8 months ago) by gpertea
File size: 30677 byte(s)
Log Message:
massive update with Daehwan's work

Line File contents
1 #ifndef BWT_MAP_H
2 #define BWT_MAP_H
3
4 #ifdef HAVE_CONFIG_H
5 #include <config.h>
6 #endif
7
8 #include <stdint.h>
9 #include <stdio.h>
10 #include <stdlib.h>
11 #include <string>
12 #include <map>
13 #include <vector>
14 #include <cassert>
15 #include <cstring>
16 #include <algorithm>
17 #include <seqan/sequence.h>
18
19 #include <bam/sam.h>
20 using namespace std;
21
22 #include "common.h"
23 #include "reads.h"
24 #define _FBUF_SIZE 10*1024
25
26 /*
27 * bwt_map.h
28 * TopHat
29 *
30 * Created by Cole Trapnell on 11/17/08.
31 * Copyright 2008 Cole Trapnell. All rights reserved.
32 *
33 */
34
35 enum CigarOpCode
36 {
37 CIGAR_NOTHING = 0,
38 FUSION_NOTHING = 0,
39 MATCH,
40 mATCH,
41 INS,
42 iNS,
43 DEL,
44 dEL,
45 FUSION_FF,
46 FUSION_FR,
47 FUSION_RF,
48 FUSION_RR,
49 REF_SKIP,
50 rEF_SKIP,
51 SOFT_CLIP,
52 HARD_CLIP,
53 PAD
54 };
55
56 struct CigarOp
57 {
58 CigarOp(CigarOpCode o, uint32_t l) : opcode(o), length(l) {}
59 CigarOpCode opcode;
60 uint32_t length;
61
62 bool operator==(const CigarOp& rhs) const
63 {
64 return opcode == rhs.opcode && length == rhs.length;
65 }
66
67 };
68
69 typedef uint32_t ReadID;
70
71 class RefSequenceTable;
72
73 /* Stores the information from a single record of the bowtie map. A given read
74 may have many of these. Reads up to 255bp are supported.
75 */
76 struct BowtieHit
77 {
78 BowtieHit() :
79 _ref_id(0),
80 _ref_id2(0),
81 _insert_id(0),
82 _left(0),
83 _antisense_splice(false),
84 _antisense_aln(false),
85 _edit_dist(0),
86 _splice_mms(0),
87 _end(false){}
88
89 BowtieHit(uint32_t ref_id,
90 uint32_t ref_id2,
91 ReadID insert_id,
92 int left,
93 int read_len,
94 bool antisense,
95 unsigned char edit_dist,
96 bool end) :
97 _ref_id(ref_id),
98 _ref_id2(ref_id2),
99 _insert_id(insert_id),
100 _left(left),
101 _cigar(vector<CigarOp>(1,CigarOp(MATCH,read_len))),
102 _antisense_splice(false),
103 _antisense_aln(antisense),
104 _edit_dist(edit_dist),
105 _splice_mms(0),
106 _end(end)
107 {
108 assert(_cigar.capacity() == _cigar.size());
109 }
110
111 BowtieHit(uint32_t ref_id,
112 uint32_t ref_id2,
113 ReadID insert_id,
114 int left,
115 const vector<CigarOp>& cigar,
116 bool antisense_aln,
117 bool antisense_splice,
118 unsigned char edit_dist,
119 unsigned char splice_mms,
120 bool end) :
121 _ref_id(ref_id),
122 _ref_id2(ref_id2),
123 _insert_id(insert_id),
124 _left(left),
125 _cigar(cigar),
126 _antisense_splice(antisense_splice),
127 _antisense_aln(antisense_aln),
128 _edit_dist(edit_dist),
129 _splice_mms(splice_mms),
130 _end(end)
131 {
132 assert(_cigar.capacity() == _cigar.size());
133 }
134
135 int read_len() const
136 {
137 uint32_t len = 0;
138 for (size_t i = 0; i < _cigar.size(); ++i)
139 {
140 const CigarOp& op = _cigar[i];
141 switch(op.opcode)
142 {
143 case MATCH:
144 case mATCH:
145 case INS:
146 case iNS:
147 case SOFT_CLIP:
148 len += op.length;
149 break;
150 default:
151 break;
152 }
153 }
154 return len;
155 }
156
157 bool operator==(const BowtieHit& rhs) const
158 {
159 return (_insert_id == rhs._insert_id &&
160 _ref_id == rhs._ref_id &&
161 _ref_id2 == rhs._ref_id2 &&
162 _antisense_aln == rhs._antisense_aln &&
163 _left == rhs._left &&
164 _antisense_splice == rhs._antisense_splice &&
165 _edit_dist == rhs._edit_dist &&
166 /* DO NOT USE ACCEPTED IN COMPARISON */
167 _cigar == rhs._cigar);
168 }
169
170 bool operator<(const BowtieHit& rhs) const
171 {
172 if (_insert_id != rhs._insert_id)
173 return _insert_id < rhs._insert_id;
174 if (_ref_id != rhs._ref_id)
175 return _ref_id < rhs._ref_id;
176 if (_ref_id2 != rhs._ref_id2)
177 return _ref_id2 < rhs._ref_id2;
178 if (_left != rhs._left)
179 return _left < rhs._left;
180 if (_antisense_aln != rhs._antisense_aln)
181 return _antisense_aln < rhs._antisense_aln;
182 if (_edit_dist != rhs._edit_dist)
183 return _edit_dist < rhs._edit_dist;
184 if (_cigar != rhs._cigar)
185 {
186 if (_cigar.size() != rhs._cigar.size())
187 return _cigar.size() < rhs._cigar.size();
188 for (size_t i = 0; i < _cigar.size(); ++i)
189 {
190 if (!(_cigar[i] == rhs._cigar[i]))
191 return (_cigar[i].opcode < rhs._cigar[i].opcode || (_cigar[i].opcode == rhs._cigar[i].opcode && _cigar[i].length < rhs._cigar[i].length));
192 }
193 }
194 return false;
195 }
196
197 uint32_t ref_id() const { return _ref_id; }
198 uint32_t ref_id2() const { return _ref_id2; }
199 ReadID insert_id() const { return _insert_id; }
200 int left() const { return _left; }
201 int right() const
202 {
203 int r = _left;
204 for (size_t i = 0; i < _cigar.size(); ++i)
205 {
206 const CigarOp& op = _cigar[i];
207
208 switch(op.opcode)
209 {
210 case MATCH:
211 case REF_SKIP:
212 case DEL:
213 r += op.length;
214 break;
215 case mATCH:
216 case rEF_SKIP:
217 case dEL:
218 r -= op.length;
219 break;
220 case FUSION_FF:
221 case FUSION_FR:
222 case FUSION_RF:
223 case FUSION_RR:
224 r = op.length;
225 break;
226 default:
227 break;
228 }
229 }
230 return r;
231 }
232
233 bool is_spliced() const
234 {
235 for (size_t i = 0; i < _cigar.size(); ++i)
236 {
237 const CigarOp& op = _cigar[i];
238
239 if (op.opcode == REF_SKIP || op.opcode == rEF_SKIP)
240 return true;
241 }
242 return false;
243 }
244
245 CigarOpCode fusion_opcode() const
246 {
247 for (size_t i = 0; i < _cigar.size(); ++i)
248 {
249 const CigarOp& op = _cigar[i];
250
251 if (op.opcode == FUSION_FF || op.opcode == FUSION_FR || op.opcode == FUSION_RF || op.opcode == FUSION_RR)
252 return op.opcode;
253 }
254 return FUSION_NOTHING;
255 }
256
257 /*
258 * checks whether its coordinate is increasing or decreasing
259 * before its fusion or until the end.
260 */
261 bool is_forwarding_left() const
262 {
263 for (size_t i = 0; i < _cigar.size(); ++i)
264 {
265 const CigarOp& op = _cigar[i];
266
267 if (op.opcode == MATCH || op.opcode == REF_SKIP || op.opcode == INS || op.opcode == DEL)
268 return true;
269
270 if (op.opcode == mATCH || op.opcode == rEF_SKIP || op.opcode == iNS || op.opcode == dEL)
271 return false;
272
273 if (op.opcode == FUSION_FF || op.opcode == FUSION_FR || op.opcode == FUSION_RF || op.opcode == FUSION_RR)
274 break;
275 }
276
277 return true;
278 }
279
280 /*
281 * checks whether its coordinate is increasing or decreasing
282 * before its fusion or until the end.
283 */
284 bool is_forwarding_right() const
285 {
286 for (int i = _cigar.size() - 1; i >= 0; --i)
287 {
288 const CigarOp& op = _cigar[i];
289
290 if (op.opcode == MATCH || op.opcode == REF_SKIP || op.opcode == INS || op.opcode == DEL)
291 return true;
292
293 if (op.opcode == mATCH || op.opcode == rEF_SKIP || op.opcode == iNS || op.opcode == dEL)
294 return false;
295
296 if (op.opcode == FUSION_FF || op.opcode == FUSION_FR || op.opcode == FUSION_RF || op.opcode == FUSION_RR)
297 break;
298 }
299
300 return true;
301 }
302
303 bool antisense_splice() const { return _antisense_splice; }
304 bool antisense_align() const { return _antisense_aln; }
305 void antisense_align(bool antisense_align) { _antisense_aln = antisense_align; }
306
307 bool antisense_align2() const
308 {
309 /*
310 * antisense_splice is also used to indicate whether fusion is ff, fr, or rf.
311 */
312 CigarOpCode fusionOpCode = fusion_opcode();
313 if (fusionOpCode == FUSION_NOTHING || fusionOpCode == FUSION_FF || fusionOpCode == FUSION_RR)
314 return antisense_align();
315
316 return !antisense_align();
317 }
318
319 BowtieHit reverse() const
320 {
321 BowtieHit result;
322 result._ref_id = _ref_id2;
323 result._ref_id2 = _ref_id;
324 result._insert_id = _insert_id;
325
326 uint32_t right, fusion_pos;
327 right = fusion_pos = _left;
328
329 for (size_t i = 0; i < _cigar.size(); ++i)
330 {
331 const CigarOp& op = _cigar[i];
332
333 switch(op.opcode)
334 {
335 case MATCH:
336 case REF_SKIP:
337 case DEL:
338 right += op.length;
339 break;
340 case mATCH:
341 case rEF_SKIP:
342 case dEL:
343 right -= op.length;
344 break;
345 case FUSION_FF:
346 case FUSION_FR:
347 case FUSION_RF:
348 case FUSION_RR:
349 fusion_pos = right;
350 right = op.length;
351 break;
352 default:
353 break;
354 }
355 }
356
357 if (is_forwarding_left())
358 fusion_pos -= 1;
359 else
360 fusion_pos += 1;
361
362 CigarOpCode fusionOpCode = fusion_opcode();
363 if (fusionOpCode == FUSION_NOTHING || fusionOpCode == FUSION_FF || fusionOpCode == FUSION_RR)
364 {
365 if (is_forwarding_left())
366 result._left = right - 1;
367 else
368 result._left = right + 1;
369 }
370 else
371 {
372 if (fusionOpCode == FUSION_FR)
373 result._left = right + 1;
374 else
375 result._left = right - 1;
376 }
377
378 result._cigar.clear();
379 for (int i = _cigar.size() - 1; i >= 0; --i)
380 {
381 CigarOp cigar = _cigar[i];
382
383 switch(cigar.opcode)
384 {
385 case MATCH:
386 cigar.opcode = mATCH; break;
387 case mATCH:
388 cigar.opcode = MATCH; break;
389 case INS:
390 cigar.opcode = iNS; break;
391 case iNS:
392 cigar.opcode = INS; break;
393 case DEL:
394 cigar.opcode = dEL; break;
395 case dEL:
396 cigar.opcode = DEL; break;
397 case REF_SKIP:
398 cigar.opcode = rEF_SKIP; break;
399 case rEF_SKIP:
400 cigar.opcode = REF_SKIP; break;
401 case FUSION_FF:
402 case FUSION_FR:
403 case FUSION_RF:
404 case FUSION_RR:
405 cigar.length = fusion_pos; break;
406 default:
407 break;
408 }
409
410 result._cigar.push_back(cigar);
411 }
412
413 if (fusionOpCode == FUSION_FR || fusionOpCode == FUSION_RF)
414 result._antisense_aln = !_antisense_aln;
415 else
416 result._antisense_aln = _antisense_aln;
417
418 result._antisense_splice = _antisense_splice;
419 result._edit_dist = _edit_dist;
420 result._splice_mms = _splice_mms;
421 result._end = _end;
422
423 result._seq = _seq;
424 reverse_complement(result._seq);
425 result._qual = _qual;
426 ::reverse(result._qual.begin(), result._qual.end());
427
428 return result;
429 }
430
431 unsigned char edit_dist() const { return _edit_dist; }
432 unsigned char splice_mms() const { return _splice_mms; }
433
434 // For convenience, if you just want a copy of the gap intervals
435 // for this hit.
436 void gaps(vector<pair<int,int> >& gaps_out) const
437 {
438 gaps_out.clear();
439 int pos = _left;
440 for (size_t i = 0; i < _cigar.size(); ++i)
441 {
442 const CigarOp& op = _cigar[i];
443
444 switch(op.opcode)
445 {
446 case REF_SKIP:
447 gaps_out.push_back(make_pair(pos, pos + op.length - 1));
448 pos += op.length;
449 break;
450 case rEF_SKIP:
451 gaps_out.push_back(make_pair(pos, pos - op.length + 1));
452 pos -= op.length;
453 break;
454 case MATCH:
455 case DEL:
456 pos += op.length;
457 break;
458 case mATCH:
459 case dEL:
460 pos -= op.length;
461 break;
462 case FUSION_FF:
463 case FUSION_FR:
464 case FUSION_RF:
465 case FUSION_RR:
466 pos = op.length;
467 break;
468 default:
469 break;
470 }
471 }
472 }
473
474 const vector<CigarOp>& cigar() const { return _cigar; }
475
476 bool contiguous() const
477 {
478 return _cigar.size() == 1 && _cigar[0].opcode == MATCH;
479 }
480
481 const string& hitfile_rec() const { return _hitfile_rec; }
482 void hitfile_rec(const string& rec) { _hitfile_rec = rec; }
483
484 const string& seq() const { return _seq; }
485 void seq(const string& seq) { _seq = seq; }
486
487 const string& qual() const { return _qual; }
488 void qual(const string& qual) { _qual = qual; }
489
490 bool end() const { return _end; }
491 void end(bool end) { _end = end; }
492
493 // this is for debugging purpose
494 bool check_editdist_consistency(const RefSequenceTable& rt);
495
496 private:
497
498 uint32_t _ref_id;
499 uint32_t _ref_id2;
500 ReadID _insert_id; // Id of the sequencing insert
501 int _left; // Position in the reference of the left side of the alignment
502
503 vector<CigarOp> _cigar;
504
505 bool _antisense_splice; // Whether the junction spanned is on the reverse strand
506 bool _antisense_aln; // Whether the alignment is to the reverse strand
507
508 unsigned char _edit_dist; // Total mismatches (note this is not including insertions or deletions as mismatches, ie, not equivalent to NM field of a SAM record)
509 unsigned char _splice_mms; // Mismatches within min_anchor_len of a splice junction
510 string _hitfile_rec; // Points to the buffer for the record from which this hit came
511 string _seq;
512 string _qual;
513 bool _end; // Whether this segment is the last one of the read it belongs to
514 };
515
516 class ReadTable
517 {
518 public:
519
520 ReadTable() :
521 _next_id(1) {}
522
523 // This function should NEVER return zero
524 ReadID get_id(const string& name)
525 {
526 uint32_t _id = atoi(name.c_str());
527 //assert(_id);
528 _next_id = max(_next_id, (size_t)_id);
529 return _id;
530 }
531
532
533 uint32_t observation_order(ReadID ID)
534 {
535 if (ID == 0)
536 return VMAXINT32;
537 return ID;
538 }
539
540
541 size_t size() const { return _next_id; }
542
543 private:
544 size_t _next_id;
545 };
546
547 inline bool REFID_Less(uint32_t ref_id1, uint32_t ref_id2)
548 {
549 return false;
550 }
551
552 inline bool REFID_Equal(uint32_t ref_id1, uint32_t ref_id2)
553 {
554 return ref_id1 == ref_id2;
555 }
556
557 class RefSequenceTable
558 {
559 public:
560 typedef seqan::String<seqan::Dna5, seqan::Packed<seqan::Alloc<> > > Sequence;
561
562 struct SequenceInfo
563 {
564 SequenceInfo(uint32_t _order,
565 char* _name,
566 Sequence* _seq, uint32_t _len) :
567 observation_order(_order),
568 name(_name),
569 seq(_seq),
570 len(_len) {}
571
572 uint32_t observation_order;
573 char* name;
574 Sequence* seq;
575 uint32_t len;
576 };
577
578 typedef map<string, SequenceInfo> IDTable;
579 typedef IDTable::iterator iterator;
580 typedef IDTable::const_iterator const_iterator;
581
582 RefSequenceTable(bool keep_names) :
583 _next_id(1),
584 _keep_names(keep_names) {}
585
586 RefSequenceTable(const string& sam_header_filename,
587 bool keep_names) :
588 _next_id(1),
589 _keep_names(keep_names)
590 {
591 if (sam_header_filename != "")
592 {
593 samfile_t* fh = samopen(sam_header_filename.c_str(), "r", 0);
594 if (fh == 0)
595 {
596 fprintf(stderr, "Failed to open SAM header file %s\n", sam_header_filename.c_str());
597 exit(1);
598 }
599
600 for (size_t i = 0; i < (size_t)fh->header->n_targets; ++i)
601 {
602 const char* name = fh->header->target_name[i];
603 uint32_t len = fh->header->target_len[i];
604 get_id(name, NULL, len);
605 // fprintf(stderr, "SQ: %s - %u\n", name, len);
606 }
607 // order_recs_lexicographically();
608 }
609 }
610
611 // This function should NEVER return zero
612 uint32_t get_id(const string& name,
613 Sequence* seq = NULL,
614 uint32_t len = 0)
615 {
616 pair<IDTable::iterator, bool> ret =
617 _by_name.insert(make_pair(name, SequenceInfo(_next_id, NULL, NULL, 0)));
618 if (ret.second == true)
619 {
620 char* _name = NULL;
621 if (_keep_names)
622 _name = strdup(name.c_str());
623
624 ret.first->second.name = _name;
625 ret.first->second.seq = seq;
626 ret.first->second.len = len;
627 ret.first->second.observation_order = _next_id;
628 assert (_refid_to_hash.size() + 1 == _next_id);
629 _refid_to_name.push_back (name);
630
631 ++_next_id;
632 }
633 else
634 {
635 if (seq)
636 {
637 ret.first->second.seq = seq;
638 ret.first->second.len = len;
639 }
640 }
641
642 return ret.first->second.observation_order;
643 }
644
645 const char* get_name(uint32_t ID) const
646 {
647 assert (ID > 0 && ID <= _refid_to_hash.size());
648 const string& name = _refid_to_name[ID-1];
649 IDTable::const_iterator itr = _by_name.find(name);
650 if (itr != _by_name.end())
651 return itr->second.name;
652 else
653 return NULL;
654 }
655
656 uint32_t get_len(uint32_t ID) const
657 {
658 assert (ID > 0 && ID <= _refid_to_name.size());
659 const string& name = _refid_to_name[ID-1];
660 IDTable::const_iterator itr = _by_name.find(name);
661 if (itr != _by_name.end())
662 return itr->second.len;
663 else
664 return 0;
665 }
666
667 Sequence* get_seq(uint32_t ID) const
668 {
669 assert (ID > 0 && ID <= _refid_to_name.size());
670 const string& name = _refid_to_name[ID-1];
671 IDTable::const_iterator itr = _by_name.find(name);
672 if (itr != _by_name.end())
673 return itr->second.seq;
674 else
675 return NULL;
676 }
677
678 const SequenceInfo* get_info(uint32_t ID) const
679 {
680 assert (ID > 0 && ID <= _refid_to_name.size());
681 const string& name = _refid_to_name[ID-1];
682 IDTable::const_iterator itr = _by_name.find(name);
683 if (itr != _by_name.end())
684 {
685 return &(itr->second);
686 }
687 else
688 return NULL;
689 }
690
691 uint32_t observation_order(uint32_t ID) const
692 {
693 return ID;
694 }
695
696 iterator begin() { return _by_name.begin(); }
697 iterator end() { return _by_name.end(); }
698
699 const_iterator begin() const { return _by_name.begin(); }
700 const_iterator end() const { return _by_name.end(); }
701
702 size_t size() const { return _by_name.size(); }
703
704 void clear()
705 {
706 _by_name.clear();
707 }
708
709 // strnum_cmp is taken from samtools.
710 static inline int strnum_cmp(const string &a, const string &b)
711 {
712 char *pa = (char*)a.c_str(), *pb = (char*)b.c_str();
713 while (*pa && *pb)
714 {
715 if (isdigit(*pa) && isdigit(*pb))
716 {
717 long ai, bi;
718 ai = strtol(pa, &pa, 10);
719 bi = strtol(pb, &pb, 10);
720 if (ai != bi) return ai<bi? true : false;
721 }
722 else
723 {
724 if (*pa != *pb) break;
725 ++pa; ++pb;
726 }
727 }
728
729 if (*pa == *pb)
730 return (pa-a.c_str()) < (pb-b.c_str())? true : false;
731
732 return *pa<*pb? true : false;
733 }
734
735 void order_recs_lexicographically()
736 {
737 vector<string> vStr;
738 for (IDTable::iterator i = _by_name.begin(); i != _by_name.end(); ++i)
739 {
740 vStr.push_back(i->first);
741 }
742
743 ::sort(vStr.begin(), vStr.end(), RefSequenceTable::strnum_cmp);
744
745 _refid_to_name.clear();
746 size_t new_order = 1;
747 for (vector<string>::iterator i = vStr.begin(); i != vStr.end(); ++i, ++new_order)
748 {
749 _by_name.find(*i)->second.observation_order = new_order;
750 _refid_to_name.push_back(*i);
751 }
752 }
753
754 private:
755 uint32_t _next_id;
756 bool _keep_names;
757 IDTable _by_name;
758 vector<string> _refid_to_name;
759 };
760
761
762 bool hit_insert_id_lt(const BowtieHit& h1, const BowtieHit& h2);
763
764 typedef vector<BowtieHit> HitList;
765
766 /* This class stores all the hits from a Bowtie map */
767
768 class HitTable
769 {
770 public:
771
772 typedef map<uint64_t, HitList> RefHits;
773 typedef RefHits::const_iterator const_iterator;
774 typedef RefHits::iterator iterator;
775
776 HitTable() : _total_hits(0) {}
777
778 const_iterator begin() const { return _hits_for_ref.begin(); }
779 const_iterator end() const { return _hits_for_ref.end(); }
780
781 iterator begin() { return _hits_for_ref.begin(); }
782 iterator end() { return _hits_for_ref.end(); }
783
784 void add_hit(const BowtieHit& bh, bool check_uniqueness);
785
786 void finalize()
787 {
788 for (RefHits::iterator i = _hits_for_ref.begin();
789 i != _hits_for_ref.end();
790 ++i)
791 {
792 sort(i->second.begin(), i->second.end(), hit_insert_id_lt);
793 }
794 }
795
796 HitList* get_hits(uint64_t ref_id)
797 {
798 RefHits::iterator i = _hits_for_ref.find(ref_id);
799 if (i == _hits_for_ref.end())
800 return NULL;
801 else
802 return &(i->second);
803 }
804
805 uint32_t total_hits() const { return _total_hits; }
806
807 private:
808 RefHits _hits_for_ref;
809 uint32_t _total_hits;
810 };
811
812 class HitStream;
813 class HitFactory {
814 friend class HitStream;
815 public:
816 HitFactory(ReadTable& insert_table,
817 RefSequenceTable& reference_table) :
818 _insert_table(insert_table), _ref_table(reference_table)
819 {}
820 virtual ~HitFactory() {}
821 virtual void openStream(HitStream& hs)=0;
822 virtual void rewind(HitStream& hs)=0;
823 virtual void seek(HitStream& hs, int64_t offset)=0;
824 virtual void closeStream(HitStream& hs)=0;
825 BowtieHit create_hit(const string& insert_name,
826 const string& ref_name,
827 const string& ref_name2,
828 int left,
829 const vector<CigarOp>& cigar,
830 bool antisense_aln,
831 bool antisense_splice,
832 unsigned char edit_dist,
833 unsigned char splice_mms,
834 bool end);
835
836 BowtieHit create_hit(const string& insert_name,
837 const string& ref_name,
838 uint32_t left,
839 uint32_t read_len,
840 bool antisense_aln,
841 unsigned char edit_dist,
842 bool end);
843
844 virtual string hitfile_rec(HitStream& hs, const char* hit_buf)=0;
845 virtual bool next_record(HitStream& hs, const char*& buf, size_t& buf_size) = 0;
846 virtual bool get_hit_from_buf(const char* bwt_buf,
847 BowtieHit& bh,
848 bool strip_slash,
849 char* name_out = NULL,
850 char* name_tags = NULL,
851 char* seq = NULL,
852 char* qual = NULL) = 0;
853
854 protected:
855 ReadTable& _insert_table;
856 RefSequenceTable& _ref_table;
857 HitStream* _hit_stream;
858 };//class HitFactory
859
860
861 class LineHitFactory : public HitFactory {
862 //for text line-based formats like Bowtie and SAM
863 public:
864 LineHitFactory(ReadTable& insert_table,
865 RefSequenceTable& reference_table) :
866 HitFactory(insert_table, reference_table) {}
867
868 string hitfile_rec(HitStream& hs, const char* hit_buf) {
869 string r(hit_buf);
870 return r;
871 }
872 void openStream(HitStream& hs);
873 void rewind(HitStream& hs);
874 void seek(HitStream&hs, int64_t offset);
875 void closeStream(HitStream& hs);
876 bool next_record(HitStream& hs, const char*& buf, size_t& buf_size);
877 protected:
878 static const size_t _hit_buf_max_sz = 10 * 1024;
879 char _hit_buf[_hit_buf_max_sz];
880 int _line_num;
881 };
882
883 class BowtieHitFactory : public LineHitFactory {
884 public:
885 BowtieHitFactory(ReadTable& insert_table,
886 RefSequenceTable& reference_table) :
887 LineHitFactory(insert_table, reference_table) {}
888
889 bool get_hit_from_buf(const char* bwt_buf,
890 BowtieHit& bh,
891 bool strip_slash,
892 char* name_out = NULL,
893 char* name_tags = NULL,
894 char* seq = NULL,
895 char* qual = NULL);
896 };
897
898 class SplicedBowtieHitFactory : public LineHitFactory {
899 public:
900 SplicedBowtieHitFactory(ReadTable& insert_table,
901 RefSequenceTable& reference_table,
902 int anchor_length) :
903 LineHitFactory(insert_table, reference_table),
904 _anchor_length(anchor_length){}
905
906 bool get_hit_from_buf(const char* bwt_buf,
907 BowtieHit& bh,
908 bool strip_slash,
909 char* name_out = NULL,
910 char* name_tags = NULL,
911 char* seq = NULL,
912 char* qual = NULL);
913 private:
914 int _anchor_length;
915 int _seg_offset;
916 int _size_buf;
917 };
918
919 class SplicedSAMHitFactory : public LineHitFactory {
920 public:
921 SplicedSAMHitFactory(ReadTable& insert_table,
922 RefSequenceTable& reference_table,
923 int anchor_length) :
924 LineHitFactory(insert_table, reference_table),
925 _anchor_length(anchor_length){}
926
927 bool get_hit_from_buf(const char* bwt_buf,
928 BowtieHit& bh,
929 bool strip_slash,
930 char* name_out = NULL,
931 char* name_tags = NULL,
932 char* seq = NULL,
933 char* qual = NULL);
934 private:
935 int _anchor_length;
936 int _seg_offset;
937 int _size_buf;
938 };
939
940
941 class SAMHitFactory : public LineHitFactory {
942 public:
943 SAMHitFactory(ReadTable& insert_table,
944 RefSequenceTable& reference_table) :
945 LineHitFactory(insert_table, reference_table) {}
946
947 bool get_hit_from_buf(const char* bwt_buf,
948 BowtieHit& bh,
949 bool strip_slash,
950 char* name_out = NULL,
951 char* name_tags = NULL,
952 char* seq = NULL,
953 char* qual = NULL);
954 };
955
956
957 /******************************************************************************
958 BAMHitFactory turns SAM alignments into BowtieHits
959 *******************************************************************************/
960 class BAMHitFactory : public HitFactory {
961 public:
962
963 BAMHitFactory(ReadTable& insert_table,
964 RefSequenceTable& reference_table) :
965 HitFactory(insert_table, reference_table)
966 {
967 _sam_header=NULL;
968 }
969 void openStream(HitStream& hs);
970 void rewind(HitStream& hs);
971 void seek(HitStream& hs, int64_t offset);
972 void closeStream(HitStream& hs);
973
974 bool next_record(HitStream& hs, const char*& buf, size_t& buf_size);
975
976 /*void mark_curr_pos()
977 {
978 _curr_pos = bgzf_tell(((samfile_t*)_hit_file)->x.bam);
979 }
980
981
982 void undo_hit()
983 {
984 bgzf_seek(((samfile_t*)_hit_file)->x.bam, _curr_pos, SEEK_SET);
985 _eof = false;
986 }
987 */
988 string hitfile_rec(HitStream& hs, const char* hit_buf);
989
990 bool get_hit_from_buf(const char* bwt_buf,
991 BowtieHit& bh,
992 bool strip_slash,
993 char* name_out = NULL,
994 char* name_tags = NULL,
995 char* seq = NULL,
996 char* qual = NULL);
997 protected:
998 //int64_t _curr_pos;
999 //int64_t _beginning;
1000 bam1_t _next_hit;
1001 bam_header_t* _sam_header;
1002 bool inspect_header(HitStream& hs);
1003 };
1004
1005 class SplicedBAMHitFactory : public BAMHitFactory {
1006 public:
1007 SplicedBAMHitFactory(ReadTable& insert_table,
1008 RefSequenceTable& reference_table,
1009 int anchor_length) :
1010 BAMHitFactory(insert_table, reference_table),
1011 _anchor_length(anchor_length)
1012 {
1013 }
1014
1015 bool get_hit_from_buf(const char* bwt_buf,
1016 BowtieHit& bh,
1017 bool strip_slash,
1018 char* name_out = NULL,
1019 char* name_tags = NULL,
1020 char* seq = NULL,
1021 char* qual = NULL);
1022
1023 private:
1024 int _anchor_length;
1025 int _seg_offset;
1026 int _size_buf;
1027 };
1028
1029
1030 struct HitsForRead
1031 {
1032 HitsForRead() : insert_id(0) {}
1033 uint64_t insert_id;
1034 vector<BowtieHit> hits;
1035 };
1036
1037 class HitStream
1038 {
1039 friend class HitFactory;
1040 friend class LineHitFactory;
1041 friend class BAMHitFactory;
1042 //private:
1043 HitFactory* _factory;
1044 bool _spliced;
1045 bool _strip_slash;
1046 BowtieHit buffered_hit;
1047 bool _keep_bufs;
1048 bool _keep_seqs;
1049 bool _keep_quals;
1050 bool _from_bowtie;
1051 void* _hit_file;
1052 string _hit_file_name;
1053 FZPipe* _fzpipe;
1054 bool _eof;
1055
1056 public:
1057 HitStream(void* hit_file, //could be FILE* or samfile_t*
1058 HitFactory* hit_factory,
1059 bool spliced,
1060 bool strip_slash,
1061 bool keep_bufs,
1062 bool keep_seqs = false,
1063 bool keep_quals = false,
1064 bool from_bowtie = false) :
1065 _factory(hit_factory),
1066 _spliced(spliced),
1067 _strip_slash(strip_slash),
1068 buffered_hit(BowtieHit()),
1069 _keep_bufs(keep_bufs),
1070 _keep_seqs(keep_seqs),
1071 _keep_quals(keep_quals),
1072 _from_bowtie(from_bowtie),
1073 _hit_file(hit_file),
1074 _hit_file_name(),
1075 _fzpipe(NULL),
1076 _eof(false)
1077 {
1078 primeStream();
1079 }
1080
1081 HitStream(const string& hit_filename,
1082 HitFactory* hit_factory,
1083 bool spliced,
1084 bool strip_slash,
1085 bool keep_bufs,
1086 bool keep_seqs = false,
1087 bool keep_quals = false,
1088 bool from_bowtie = false) :
1089 _factory(hit_factory),
1090 _spliced(spliced),
1091 _strip_slash(strip_slash),
1092 buffered_hit(BowtieHit()),
1093 _keep_bufs(keep_bufs),
1094 _keep_seqs(keep_seqs),
1095 _keep_quals(keep_quals),
1096 _from_bowtie(from_bowtie),
1097 _hit_file(NULL),
1098 _hit_file_name(hit_filename),
1099 _fzpipe(NULL),
1100 _eof(false)
1101 {
1102 _factory->openStream(*this);
1103 primeStream();
1104 }
1105
1106 HitStream(FZPipe& hit_filepipe,
1107 HitFactory* hit_factory,
1108 bool spliced,
1109 bool strip_slash,
1110 bool keep_bufs,
1111 bool keep_seqs = false,
1112 bool keep_quals = false,
1113 bool from_bowtie = false) :
1114 _factory(hit_factory),
1115 _spliced(spliced),
1116 _strip_slash(strip_slash),
1117 buffered_hit(BowtieHit()),
1118 _keep_bufs(keep_bufs),
1119 _keep_seqs(keep_seqs),
1120 _keep_quals(keep_quals),
1121 _from_bowtie(from_bowtie),
1122 _hit_file(NULL),
1123 _hit_file_name(),
1124 _fzpipe(&hit_filepipe),
1125 _eof(false)
1126 {
1127 _hit_file=_fzpipe->file;
1128 primeStream();
1129 }
1130
1131 void primeStream() { //why?
1132 // Prime the stream by reading a single hit into the buffered_hit
1133 HitsForRead dummy = HitsForRead();
1134 next_read_hits(dummy);
1135 }
1136 bool eof() { return _eof; }
1137 bool ready() { return (_hit_file!=NULL); }
1138 void reset() {
1139 _factory->rewind(*this);
1140 _eof=false;
1141 // re-prime the stream;
1142 buffered_hit = BowtieHit();
1143 primeStream();
1144 }
1145 void seek(int64_t offset) {
1146 _factory->seek(*this, offset);
1147 _eof = false;
1148 buffered_hit = BowtieHit();
1149 primeStream();
1150 }
1151
1152 bool next_read_hits(HitsForRead& hits_for_read)
1153 {
1154 hits_for_read.hits.clear();
1155 hits_for_read.insert_id = 0;
1156
1157 //if (!_hit_file || (feof(_hit_file) && buffered_hit.insert_id() == 0))
1158 // return false;
1159 if (!this->ready())
1160 //err_die("Error: next_read_hits() called on HitFactory with no file handle\n");
1161 return false;
1162 if (this->eof() && buffered_hit.insert_id() == 0) {
1163 return false;
1164 }
1165
1166 //char bwt_buf[2048]; bwt_buf[0] = 0;
1167 char bwt_seq[2048]; bwt_seq[0] = 0;
1168 char bwt_qual[2048]; bwt_qual[0] = 0;
1169
1170 char* seq = _keep_seqs ? bwt_seq : NULL;
1171 char* qual = _keep_quals ? bwt_qual : NULL;
1172
1173 hits_for_read.insert_id = buffered_hit.insert_id();
1174 if (hits_for_read.insert_id)
1175 hits_for_read.hits.push_back(buffered_hit);
1176 const char* hit_buf;
1177 size_t hit_buf_size = 0;
1178 while (true) {
1179
1180 if (!_factory->next_record(*this, hit_buf, hit_buf_size)) {
1181 buffered_hit = BowtieHit();
1182 break; }
1183
1184 //string clean_buf = bwt_buf;
1185 // Get a new record from the tab-delimited Bowtie map
1186 BowtieHit bh;
1187 if (_factory->get_hit_from_buf(hit_buf, bh, _strip_slash,
1188 NULL, NULL, seq, qual)) {
1189 if (_keep_bufs)
1190 bh.hitfile_rec(_factory->hitfile_rec(*this, hit_buf));
1191
1192 if (_keep_seqs)
1193 bh.seq(seq);
1194
1195 if (_keep_quals) {
1196 // when it comes to convert from qual in color to qual in bp,
1197 // we need to fill in the two extream qual values using the adjacent qual values.
1198 size_t qual_len = strlen(qual);
1199 if (color && !color_out && qual_len > 2) {
1200 qual[0] = qual[1];
1201 qual[qual_len-1] = qual[qual_len-2];
1202 }
1203 bh.qual(qual);
1204 }
1205
1206 if (bh.insert_id() == hits_for_read.insert_id) {
1207 hits_for_read.hits.push_back(bh);
1208 }
1209 else {
1210 buffered_hit = bh;
1211 break;
1212 }
1213 } //hit parsed
1214 } //while reading hits
1215
1216 return (!hits_for_read.hits.empty() && hits_for_read.insert_id != 0);
1217 }
1218
1219 uint64_t next_group_id() const
1220 {
1221 return buffered_hit.insert_id();
1222 }
1223 bool fromBowtie() { return _from_bowtie; }
1224 };
1225
1226 typedef uint32_t MateStatusMask;
1227
1228 void get_mapped_reads(FILE* bwtf,
1229 HitTable& hits,
1230 HitFactory& hit_factory,
1231 bool strip_slash,
1232 bool verbose = false);
1233
1234 //bool left_status_better(MateStatusMask left, MateStatusMask right);
1235 //bool status_equivalent(MateStatusMask left, MateStatusMask right);
1236 typedef uint32_t MateStatusMask;
1237
1238 void add_hits_to_coverage(const HitList& hits, vector<unsigned short>& DoC);
1239 void add_hit_to_coverage(const BowtieHit& bh, vector<unsigned int>& DoC);
1240
1241 void accept_all_hits(HitTable& hits);
1242
1243 //print BowtieHit as BAM record
1244 void print_bamhit(GBamWriter& wbam,
1245 const char* read_name,
1246 const BowtieHit& bh,
1247 const char* ref_name,
1248 const char* ref_name2,
1249 const char* sequence,
1250 const char* qualities,
1251 bool from_bowtie = false);
1252
1253
1254 void extract_partial_hits(const BowtieHit& bh, const string& seq, const string& qual,
1255 char* cigar1, char* cigar2, string& seq1, string& seq2,
1256 string& qual1, string& qual2, int& left1, int& left2);
1257
1258 /**
1259 * Convert a vector of CigarOps to a string representation
1260 */
1261 std::string print_cigar(const vector<CigarOp>& bh_cigar);
1262
1263 #endif