ViewVC Help
View File | Revision Log | Show Annotations | View Changeset | Root Listing
root/gclib/tophat_cpp/bwt_map.h
Revision: 165
Committed: Fri Feb 10 01:04:05 2012 UTC (8 years, 1 month ago) by gpertea
File size: 34762 byte(s)
Log Message:
sync back with Daehwan. Test bam_merge for memory leaks!

Line File contents
1 #ifndef BWT_MAP_H
2 #define BWT_MAP_H
3
4 #ifdef HAVE_CONFIG_H
5 #include <config.h>
6 #endif
7
8 #include <stdint.h>
9 #include <stdio.h>
10 #include <stdlib.h>
11 #include <string>
12 #include <map>
13 #include <vector>
14 #include <cassert>
15 #include <cstring>
16 #include <algorithm>
17 #include <seqan/sequence.h>
18
19 #include <bam/sam.h>
20 using namespace std;
21
22 #include "common.h"
23 #include "reads.h"
24 #define _FBUF_SIZE 10*1024
25
26 /*
27 * bwt_map.h
28 * TopHat
29 *
30 * Created by Cole Trapnell on 11/17/08.
31 * Copyright 2008 Cole Trapnell. All rights reserved.
32 *
33 */
34
35 enum CigarOpCode
36 {
37 CIGAR_NOTHING = 0,
38 FUSION_NOTHING = 0,
39 MATCH,
40 mATCH,
41 INS,
42 iNS,
43 DEL,
44 dEL,
45 FUSION_FF,
46 FUSION_FR,
47 FUSION_RF,
48 FUSION_RR,
49 REF_SKIP,
50 rEF_SKIP,
51 SOFT_CLIP,
52 HARD_CLIP,
53 PAD
54 };
55
56 struct CigarOp
57 {
58 CigarOp(CigarOpCode o, uint32_t l) : opcode(o), length(l) {}
59 CigarOpCode opcode;
60 uint32_t length;
61
62 bool operator==(const CigarOp& rhs) const
63 {
64 return opcode == rhs.opcode && length == rhs.length;
65 }
66
67 };
68
69 typedef uint32_t ReadID;
70
71 class RefSequenceTable;
72
73 /* Stores the information from a single record of the bowtie map. A given read
74 may have many of these. Reads up to 255bp are supported.
75 */
76 struct BowtieHit
77 {
78 BowtieHit() :
79 _ref_id(0),
80 _ref_id2(0),
81 _insert_id(0),
82 _left(0),
83 _antisense_splice(false),
84 _antisense_aln(false),
85 _edit_dist(0),
86 _splice_mms(0),
87 _end(false){}
88
89 BowtieHit(uint32_t ref_id,
90 uint32_t ref_id2,
91 ReadID insert_id,
92 int left,
93 int read_len,
94 bool antisense,
95 unsigned char edit_dist,
96 bool end) :
97 _ref_id(ref_id),
98 _ref_id2(ref_id2),
99 _insert_id(insert_id),
100 _left(left),
101 _cigar(vector<CigarOp>(1,CigarOp(MATCH,read_len))),
102 _antisense_splice(false),
103 _antisense_aln(antisense),
104 _edit_dist(edit_dist),
105 _splice_mms(0),
106 _end(end)
107 {
108 assert(_cigar.capacity() == _cigar.size());
109 }
110
111 BowtieHit(uint32_t ref_id,
112 uint32_t ref_id2,
113 ReadID insert_id,
114 int left,
115 const vector<CigarOp>& cigar,
116 bool antisense_aln,
117 bool antisense_splice,
118 unsigned char edit_dist,
119 unsigned char splice_mms,
120 bool end) :
121 _ref_id(ref_id),
122 _ref_id2(ref_id2),
123 _insert_id(insert_id),
124 _left(left),
125 _cigar(cigar),
126 _antisense_splice(antisense_splice),
127 _antisense_aln(antisense_aln),
128 _edit_dist(edit_dist),
129 _splice_mms(splice_mms),
130 _end(end)
131 {
132 assert(_cigar.capacity() == _cigar.size());
133 }
134
135 int read_len() const
136 {
137 uint32_t len = 0;
138 for (size_t i = 0; i < _cigar.size(); ++i)
139 {
140 const CigarOp& op = _cigar[i];
141 switch(op.opcode)
142 {
143 case MATCH:
144 case mATCH:
145 case INS:
146 case iNS:
147 case SOFT_CLIP:
148 len += op.length;
149 break;
150 default:
151 break;
152 }
153 }
154 return len;
155 }
156
157 bool operator==(const BowtieHit& rhs) const
158 {
159 return (_insert_id == rhs._insert_id &&
160 _ref_id == rhs._ref_id &&
161 _ref_id2 == rhs._ref_id2 &&
162 _antisense_aln == rhs._antisense_aln &&
163 _left == rhs._left &&
164 _antisense_splice == rhs._antisense_splice &&
165 _edit_dist == rhs._edit_dist &&
166 /* DO NOT USE ACCEPTED IN COMPARISON */
167 _cigar == rhs._cigar);
168 }
169
170 bool operator<(const BowtieHit& rhs) const
171 {
172 if (_insert_id != rhs._insert_id)
173 return _insert_id < rhs._insert_id;
174 if (_ref_id != rhs._ref_id)
175 return _ref_id < rhs._ref_id;
176 if (_ref_id2 != rhs._ref_id2)
177 return _ref_id2 < rhs._ref_id2;
178 if (_left != rhs._left)
179 return _left < rhs._left;
180 if (_antisense_aln != rhs._antisense_aln)
181 return _antisense_aln < rhs._antisense_aln;
182 if (_edit_dist != rhs._edit_dist)
183 return _edit_dist < rhs._edit_dist;
184 if (_cigar != rhs._cigar)
185 {
186 if (_cigar.size() != rhs._cigar.size())
187 return _cigar.size() < rhs._cigar.size();
188 for (size_t i = 0; i < _cigar.size(); ++i)
189 {
190 if (!(_cigar[i] == rhs._cigar[i]))
191 return (_cigar[i].opcode < rhs._cigar[i].opcode || (_cigar[i].opcode == rhs._cigar[i].opcode && _cigar[i].length < rhs._cigar[i].length));
192 }
193 }
194 return false;
195 }
196
197 uint32_t ref_id() const { return _ref_id; }
198 uint32_t ref_id2() const { return _ref_id2; }
199 ReadID insert_id() const { return _insert_id; }
200 int left() const { return _left; }
201 int right() const
202 {
203 int r = _left;
204 for (size_t i = 0; i < _cigar.size(); ++i)
205 {
206 const CigarOp& op = _cigar[i];
207
208 switch(op.opcode)
209 {
210 case MATCH:
211 case REF_SKIP:
212 case DEL:
213 r += op.length;
214 break;
215 case mATCH:
216 case rEF_SKIP:
217 case dEL:
218 r -= op.length;
219 break;
220 case FUSION_FF:
221 case FUSION_FR:
222 case FUSION_RF:
223 case FUSION_RR:
224 r = op.length;
225 break;
226 default:
227 break;
228 }
229 }
230 return r;
231 }
232
233 bool is_spliced() const
234 {
235 for (size_t i = 0; i < _cigar.size(); ++i)
236 {
237 const CigarOp& op = _cigar[i];
238
239 if (op.opcode == REF_SKIP || op.opcode == rEF_SKIP)
240 return true;
241 }
242 return false;
243 }
244
245 CigarOpCode fusion_opcode() const
246 {
247 for (size_t i = 0; i < _cigar.size(); ++i)
248 {
249 const CigarOp& op = _cigar[i];
250
251 if (op.opcode == FUSION_FF || op.opcode == FUSION_FR || op.opcode == FUSION_RF || op.opcode == FUSION_RR)
252 return op.opcode;
253 }
254 return FUSION_NOTHING;
255 }
256
257 /*
258 * checks whether its coordinate is increasing or decreasing
259 * before its fusion or until the end.
260 */
261 bool is_forwarding_left() const
262 {
263 for (size_t i = 0; i < _cigar.size(); ++i)
264 {
265 const CigarOp& op = _cigar[i];
266
267 if (op.opcode == MATCH || op.opcode == REF_SKIP || op.opcode == INS || op.opcode == DEL)
268 return true;
269
270 if (op.opcode == mATCH || op.opcode == rEF_SKIP || op.opcode == iNS || op.opcode == dEL)
271 return false;
272
273 if (op.opcode == FUSION_FF || op.opcode == FUSION_FR || op.opcode == FUSION_RF || op.opcode == FUSION_RR)
274 break;
275 }
276
277 return true;
278 }
279
280 /*
281 * checks whether its coordinate is increasing or decreasing
282 * before its fusion or until the end.
283 */
284 bool is_forwarding_right() const
285 {
286 for (int i = _cigar.size() - 1; i >= 0; --i)
287 {
288 const CigarOp& op = _cigar[i];
289
290 if (op.opcode == MATCH || op.opcode == REF_SKIP || op.opcode == INS || op.opcode == DEL)
291 return true;
292
293 if (op.opcode == mATCH || op.opcode == rEF_SKIP || op.opcode == iNS || op.opcode == dEL)
294 return false;
295
296 if (op.opcode == FUSION_FF || op.opcode == FUSION_FR || op.opcode == FUSION_RF || op.opcode == FUSION_RR)
297 break;
298 }
299
300 return true;
301 }
302
303 bool antisense_splice() const { return _antisense_splice; }
304 bool antisense_align() const { return _antisense_aln; }
305 void antisense_align(bool antisense_align) { _antisense_aln = antisense_align; }
306
307 bool antisense_align2() const
308 {
309 /*
310 * antisense_splice is also used to indicate whether fusion is ff, fr, or rf.
311 */
312 CigarOpCode fusionOpCode = fusion_opcode();
313 if (fusionOpCode == FUSION_NOTHING || fusionOpCode == FUSION_FF || fusionOpCode == FUSION_RR)
314 return antisense_align();
315
316 return !antisense_align();
317 }
318
319 BowtieHit reverse() const
320 {
321 BowtieHit result;
322 result._ref_id = _ref_id2;
323 result._ref_id2 = _ref_id;
324 result._insert_id = _insert_id;
325
326 uint32_t right, fusion_pos;
327 right = fusion_pos = _left;
328
329 for (size_t i = 0; i < _cigar.size(); ++i)
330 {
331 const CigarOp& op = _cigar[i];
332
333 switch(op.opcode)
334 {
335 case MATCH:
336 case REF_SKIP:
337 case DEL:
338 right += op.length;
339 break;
340 case mATCH:
341 case rEF_SKIP:
342 case dEL:
343 right -= op.length;
344 break;
345 case FUSION_FF:
346 case FUSION_FR:
347 case FUSION_RF:
348 case FUSION_RR:
349 fusion_pos = right;
350 right = op.length;
351 break;
352 default:
353 break;
354 }
355 }
356
357 if (is_forwarding_left())
358 fusion_pos -= 1;
359 else
360 fusion_pos += 1;
361
362 CigarOpCode fusionOpCode = fusion_opcode();
363 if (fusionOpCode == FUSION_NOTHING || fusionOpCode == FUSION_FF || fusionOpCode == FUSION_RR)
364 {
365 if (is_forwarding_left())
366 result._left = right - 1;
367 else
368 result._left = right + 1;
369 }
370 else
371 {
372 if (fusionOpCode == FUSION_FR)
373 result._left = right + 1;
374 else
375 result._left = right - 1;
376 }
377
378 result._cigar.clear();
379 for (int i = _cigar.size() - 1; i >= 0; --i)
380 {
381 CigarOp cigar = _cigar[i];
382
383 switch(cigar.opcode)
384 {
385 case MATCH:
386 cigar.opcode = mATCH; break;
387 case mATCH:
388 cigar.opcode = MATCH; break;
389 case INS:
390 cigar.opcode = iNS; break;
391 case iNS:
392 cigar.opcode = INS; break;
393 case DEL:
394 cigar.opcode = dEL; break;
395 case dEL:
396 cigar.opcode = DEL; break;
397 case REF_SKIP:
398 cigar.opcode = rEF_SKIP; break;
399 case rEF_SKIP:
400 cigar.opcode = REF_SKIP; break;
401 case FUSION_FF:
402 case FUSION_FR:
403 case FUSION_RF:
404 case FUSION_RR:
405 cigar.length = fusion_pos; break;
406 default:
407 break;
408 }
409
410 result._cigar.push_back(cigar);
411 }
412
413 if (fusionOpCode == FUSION_FR || fusionOpCode == FUSION_RF)
414 result._antisense_aln = !_antisense_aln;
415 else
416 result._antisense_aln = _antisense_aln;
417
418 result._antisense_splice = _antisense_splice;
419 result._edit_dist = _edit_dist;
420 result._splice_mms = _splice_mms;
421 result._end = _end;
422
423 result._seq = _seq;
424 reverse_complement(result._seq);
425 result._qual = _qual;
426 ::reverse(result._qual.begin(), result._qual.end());
427
428 return result;
429 }
430
431 unsigned char edit_dist() const { return _edit_dist; }
432 unsigned char splice_mms() const { return _splice_mms; }
433
434 // For convenience, if you just want a copy of the gap intervals
435 // for this hit.
436 void gaps(vector<pair<int,int> >& gaps_out) const
437 {
438 gaps_out.clear();
439 int pos = _left;
440 for (size_t i = 0; i < _cigar.size(); ++i)
441 {
442 const CigarOp& op = _cigar[i];
443
444 switch(op.opcode)
445 {
446 case REF_SKIP:
447 gaps_out.push_back(make_pair(pos, pos + op.length - 1));
448 pos += op.length;
449 break;
450 case rEF_SKIP:
451 gaps_out.push_back(make_pair(pos, pos - op.length + 1));
452 pos -= op.length;
453 break;
454 case MATCH:
455 case DEL:
456 pos += op.length;
457 break;
458 case mATCH:
459 case dEL:
460 pos -= op.length;
461 break;
462 case FUSION_FF:
463 case FUSION_FR:
464 case FUSION_RF:
465 case FUSION_RR:
466 pos = op.length;
467 break;
468 default:
469 break;
470 }
471 }
472 }
473
474 const vector<CigarOp>& cigar() const { return _cigar; }
475
476 bool contiguous() const
477 {
478 return _cigar.size() == 1 && _cigar[0].opcode == MATCH;
479 }
480
481 const string& hitfile_rec() const { return _hitfile_rec; }
482 void hitfile_rec(const string& rec) { _hitfile_rec = rec; }
483
484 const string& seq() const { return _seq; }
485 void seq(const string& seq) { _seq = seq; }
486
487 const string& qual() const { return _qual; }
488 void qual(const string& qual) { _qual = qual; }
489
490 bool end() const { return _end; }
491 void end(bool end) { _end = end; }
492
493 // this is for debugging purpose
494 bool check_editdist_consistency(const RefSequenceTable& rt, bool bDebug = false);
495
496 private:
497
498 uint32_t _ref_id;
499 uint32_t _ref_id2;
500 ReadID _insert_id; // Id of the sequencing insert
501 int _left; // Position in the reference of the left side of the alignment
502
503 vector<CigarOp> _cigar;
504
505 bool _antisense_splice; // Whether the junction spanned is on the reverse strand
506 bool _antisense_aln; // Whether the alignment is to the reverse strand
507
508 unsigned char _edit_dist; // Total mismatches (note this is not including insertions or deletions as mismatches, ie, not equivalent to NM field of a SAM record)
509 unsigned char _splice_mms; // Mismatches within min_anchor_len of a splice junction
510 string _hitfile_rec; // Points to the buffer for the record from which this hit came
511 string _seq;
512 string _qual;
513 bool _end; // Whether this segment is the last one of the read it belongs to
514 };
515
516 class ReadTable
517 {
518 public:
519
520 ReadTable() :
521 _next_id(1) {}
522
523 // This function should NEVER return zero
524 ReadID get_id(const string& name)
525 {
526 uint32_t _id = atoi(name.c_str());
527 //assert(_id);
528 _next_id = max(_next_id, (size_t)_id);
529 return _id;
530 }
531
532
533 uint32_t observation_order(ReadID ID)
534 {
535 if (ID == 0)
536 return VMAXINT32;
537 return ID;
538 }
539
540
541 size_t size() const { return _next_id; }
542
543 private:
544 size_t _next_id;
545 };
546
547 inline bool REFID_Less(uint32_t ref_id1, uint32_t ref_id2)
548 {
549 return false;
550 }
551
552 inline bool REFID_Equal(uint32_t ref_id1, uint32_t ref_id2)
553 {
554 return ref_id1 == ref_id2;
555 }
556
557 #if 0
558 class RefSequenceTable
559 {
560 public:
561
562 typedef seqan::String<seqan::Dna5, seqan::Packed<seqan::Alloc<> > > Sequence;
563
564 struct SequenceInfo
565 {
566 SequenceInfo(uint32_t _order,
567 char* _name,
568 Sequence* _seq, uint32_t _len) :
569 observation_order(_order),
570 name(_name),
571 seq(_seq),
572 len(_len) {}
573
574 uint32_t observation_order;
575 char* name;
576 Sequence* seq;
577 uint32_t len;
578 };
579
580 typedef map<string, uint64_t> IDTable;
581 typedef map<uint32_t, SequenceInfo> InvertedIDTable;
582 typedef InvertedIDTable::iterator iterator;
583 typedef InvertedIDTable::const_iterator const_iterator;
584
585 RefSequenceTable(bool keep_names, bool keep_seqs = false) :
586 _next_id(1),
587 _keep_names(keep_names) {}
588
589 RefSequenceTable(const string& sam_header_filename,
590 bool keep_names,
591 bool keep_seqs = false) :
592 _next_id(1),
593 _keep_names(keep_names)
594 {
595 if (sam_header_filename != "")
596 {
597 samfile_t* fh = samopen(sam_header_filename.c_str(), "r", 0);
598 if (fh == 0) {
599 fprintf(stderr, "Failed to open SAM header file %s\n", sam_header_filename.c_str());
600 exit(1);
601 }
602
603 for (size_t i = 0; i < (size_t)fh->header->n_targets; ++i)
604 {
605 const char* name = fh->header->target_name[i];
606 uint32_t len = fh->header->target_len[i];
607 get_id(name, NULL, len);
608 //fprintf(stderr, "SQ: %s - %d\n", name, len);
609 }
610 }
611 }
612
613 // This function should NEVER return zero
614 uint32_t get_id(const string& name,
615 Sequence* seq = NULL,
616 uint32_t len = 0)
617 {
618 uint32_t _id = hash_string(name.c_str());
619 pair<InvertedIDTable::iterator, bool> ret =
620 _by_id.insert(make_pair(_id, SequenceInfo(_next_id, NULL, NULL, 0)));
621 if (ret.second == true)
622 {
623 char* _name = NULL;
624 if (_keep_names)
625 _name = strdup(name.c_str());
626 ret.first->second.name = _name;
627 ret.first->second.seq = seq;
628 ret.first->second.len = len;
629 ++_next_id;
630 }
631 assert (_id);
632 return _id;
633 }
634
635 const char* get_name(uint32_t ID) const
636 {
637 InvertedIDTable::const_iterator itr = _by_id.find(ID);
638 if (itr != _by_id.end())
639 return itr->second.name;
640 else
641 return NULL;
642 }
643
644 uint32_t get_len(uint32_t ID) const
645 {
646 InvertedIDTable::const_iterator itr = _by_id.find(ID);
647 if (itr != _by_id.end())
648 return itr->second.len;
649 else
650 return 0;
651 }
652
653 Sequence* get_seq(uint32_t ID) const
654 {
655 InvertedIDTable::const_iterator itr = _by_id.find(ID);
656 if (itr != _by_id.end())
657 return itr->second.seq;
658 else
659 return NULL;
660 }
661
662 const SequenceInfo* get_info(uint32_t ID) const
663 {
664
665 InvertedIDTable::const_iterator itr = _by_id.find(ID);
666 if (itr != _by_id.end())
667 {
668 return &(itr->second);
669 }
670 else
671 return NULL;
672 }
673
674 int observation_order(uint32_t ID) const
675 {
676 InvertedIDTable::const_iterator itr = _by_id.find(ID);
677 if (itr != _by_id.end())
678 {
679 return itr->second.observation_order;
680 }
681 else
682 return -1;
683 }
684
685 iterator begin() { return _by_id.begin(); }
686 iterator end() { return _by_id.end(); }
687
688 const_iterator begin() const { return _by_id.begin(); }
689 const_iterator end() const { return _by_id.end(); }
690
691 size_t size() const { return _by_id.size(); }
692
693 void clear()
694 {
695 //_by_name.clear();
696 _by_id.clear();
697 }
698
699 // daehwan
700 // This is FNV-1, see http://en.wikipedia.org/wiki/Fowler_Noll_Vo_hash
701 static inline uint32_t hash_string(const char* __s)
702 {
703 uint32_t hash = 0x811c9dc5;
704 for ( ; *__s; ++__s)
705 {
706 hash *= 16777619;
707 hash ^= *__s;
708 }
709 return hash;
710 }
711
712 private:
713
714 //IDTable _by_name;
715 uint32_t _next_id;
716 bool _keep_names;
717 InvertedIDTable _by_id;
718 };
719
720 #else
721
722 class RefSequenceTable
723 {
724 public:
725 typedef seqan::String<seqan::Dna5, seqan::Packed<seqan::Alloc<> > > Sequence;
726
727 struct SequenceInfo
728 {
729 SequenceInfo(uint32_t _order,
730 char* _name,
731 Sequence* _seq, uint32_t _len) :
732 observation_order(_order),
733 name(_name),
734 seq(_seq),
735 len(_len) {}
736
737 uint32_t observation_order;
738 char* name;
739 Sequence* seq;
740 uint32_t len;
741 };
742
743 typedef map<string, SequenceInfo> IDTable;
744 typedef IDTable::iterator iterator;
745 typedef IDTable::const_iterator const_iterator;
746
747 RefSequenceTable(bool keep_names) :
748 _next_id(1),
749 _keep_names(keep_names) {}
750
751 RefSequenceTable(const string& sam_header_filename,
752 bool keep_names) :
753 _next_id(1),
754 _keep_names(keep_names)
755 {
756 if (sam_header_filename != "")
757 {
758 samfile_t* fh = samopen(sam_header_filename.c_str(), "r", 0);
759 if (fh == 0)
760 {
761 fprintf(stderr, "Failed to open SAM header file %s\n", sam_header_filename.c_str());
762 exit(1);
763 }
764
765 for (size_t i = 0; i < (size_t)fh->header->n_targets; ++i)
766 {
767 const char* name = fh->header->target_name[i];
768 uint32_t len = fh->header->target_len[i];
769 get_id(name, NULL, len);
770 // fprintf(stderr, "SQ: %s - %u\n", name, len);
771 }
772 // order_recs_lexicographically();
773 }
774 }
775
776 // This function should NEVER return zero
777 uint32_t get_id(const string& name,
778 Sequence* seq = NULL,
779 uint32_t len = 0)
780 {
781 pair<IDTable::iterator, bool> ret =
782 _by_name.insert(make_pair(name, SequenceInfo(_next_id, NULL, NULL, 0)));
783 if (ret.second == true)
784 {
785 char* _name = NULL;
786 if (_keep_names)
787 _name = strdup(name.c_str());
788
789 ret.first->second.name = _name;
790 ret.first->second.seq = seq;
791 ret.first->second.len = len;
792 ret.first->second.observation_order = _next_id;
793 //assert (_refid_to_hash.size() + 1 == _next_id);
794 _refid_to_name.push_back (name);
795
796 ++_next_id;
797 }
798 else
799 {
800 if (seq)
801 {
802 ret.first->second.seq = seq;
803 ret.first->second.len = len;
804 }
805 }
806
807 return ret.first->second.observation_order;
808 }
809
810 const char* get_name(uint32_t ID) const
811 {
812 //assert (ID > 0 && ID <= _refid_to_hash.size());
813 const string& name = _refid_to_name[ID-1];
814 IDTable::const_iterator itr = _by_name.find(name);
815 if (itr != _by_name.end())
816 return itr->second.name;
817 else
818 return NULL;
819 }
820
821 uint32_t get_len(uint32_t ID) const
822 {
823 assert (ID > 0 && ID <= _refid_to_name.size());
824 const string& name = _refid_to_name[ID-1];
825 IDTable::const_iterator itr = _by_name.find(name);
826 if (itr != _by_name.end())
827 return itr->second.len;
828 else
829 return 0;
830 }
831
832 Sequence* get_seq(uint32_t ID) const
833 {
834 assert (ID > 0 && ID <= _refid_to_name.size());
835 const string& name = _refid_to_name[ID-1];
836 IDTable::const_iterator itr = _by_name.find(name);
837 if (itr != _by_name.end())
838 return itr->second.seq;
839 else
840 return NULL;
841 }
842
843 const SequenceInfo* get_info(uint32_t ID) const
844 {
845 assert (ID > 0 && ID <= _refid_to_name.size());
846 const string& name = _refid_to_name[ID-1];
847 IDTable::const_iterator itr = _by_name.find(name);
848 if (itr != _by_name.end())
849 {
850 return &(itr->second);
851 }
852 else
853 return NULL;
854 }
855
856 uint32_t observation_order(uint32_t ID) const
857 {
858 return ID;
859 }
860
861 iterator begin() { return _by_name.begin(); }
862 iterator end() { return _by_name.end(); }
863
864 const_iterator begin() const { return _by_name.begin(); }
865 const_iterator end() const { return _by_name.end(); }
866
867 size_t size() const { return _by_name.size(); }
868
869 void clear()
870 {
871 _by_name.clear();
872 }
873
874 // strnum_cmp is taken from samtools.
875 static inline int strnum_cmp(const string &a, const string &b)
876 {
877 char *pa = (char*)a.c_str(), *pb = (char*)b.c_str();
878 while (*pa && *pb)
879 {
880 if (isdigit(*pa) && isdigit(*pb))
881 {
882 long ai, bi;
883 ai = strtol(pa, &pa, 10);
884 bi = strtol(pb, &pb, 10);
885 if (ai != bi) return ai<bi? true : false;
886 }
887 else
888 {
889 if (*pa != *pb) break;
890 ++pa; ++pb;
891 }
892 }
893
894 if (*pa == *pb)
895 return (pa-a.c_str()) < (pb-b.c_str())? true : false;
896
897 return *pa<*pb? true : false;
898 }
899
900 void order_recs_lexicographically()
901 {
902 vector<string> vStr;
903 for (IDTable::iterator i = _by_name.begin(); i != _by_name.end(); ++i)
904 {
905 vStr.push_back(i->first);
906 }
907
908 ::sort(vStr.begin(), vStr.end(), RefSequenceTable::strnum_cmp);
909
910 _refid_to_name.clear();
911 size_t new_order = 1;
912 for (vector<string>::iterator i = vStr.begin(); i != vStr.end(); ++i, ++new_order)
913 {
914 _by_name.find(*i)->second.observation_order = new_order;
915 _refid_to_name.push_back(*i);
916 }
917 }
918
919 private:
920 uint32_t _next_id;
921 bool _keep_names;
922 IDTable _by_name;
923 vector<string> _refid_to_name;
924 };
925 #endif
926
927 bool hit_insert_id_lt(const BowtieHit& h1, const BowtieHit& h2);
928
929 typedef vector<BowtieHit> HitList;
930
931 /* This class stores all the hits from a Bowtie map */
932
933 class HitTable
934 {
935 public:
936
937 typedef map<uint64_t, HitList> RefHits;
938 typedef RefHits::const_iterator const_iterator;
939 typedef RefHits::iterator iterator;
940
941 HitTable() : _total_hits(0) {}
942
943 const_iterator begin() const { return _hits_for_ref.begin(); }
944 const_iterator end() const { return _hits_for_ref.end(); }
945
946 iterator begin() { return _hits_for_ref.begin(); }
947 iterator end() { return _hits_for_ref.end(); }
948
949 void add_hit(const BowtieHit& bh, bool check_uniqueness);
950
951 void finalize()
952 {
953 for (RefHits::iterator i = _hits_for_ref.begin();
954 i != _hits_for_ref.end();
955 ++i)
956 {
957 sort(i->second.begin(), i->second.end(), hit_insert_id_lt);
958 }
959 }
960
961 HitList* get_hits(uint64_t ref_id)
962 {
963 RefHits::iterator i = _hits_for_ref.find(ref_id);
964 if (i == _hits_for_ref.end())
965 return NULL;
966 else
967 return &(i->second);
968 }
969
970 uint32_t total_hits() const { return _total_hits; }
971
972 private:
973 RefHits _hits_for_ref;
974 uint32_t _total_hits;
975 };
976
977 class HitStream;
978 class HitFactory {
979 friend class HitStream;
980 public:
981 HitFactory(ReadTable& insert_table,
982 RefSequenceTable& reference_table) :
983 _insert_table(insert_table), _ref_table(reference_table)
984 {}
985 virtual ~HitFactory() {}
986 virtual void openStream(HitStream& hs)=0;
987 virtual void rewind(HitStream& hs)=0;
988 virtual void seek(HitStream& hs, int64_t offset)=0;
989 virtual void closeStream(HitStream& hs)=0;
990 BowtieHit create_hit(const string& insert_name,
991 const string& ref_name,
992 const string& ref_name2,
993 int left,
994 const vector<CigarOp>& cigar,
995 bool antisense_aln,
996 bool antisense_splice,
997 unsigned char edit_dist,
998 unsigned char splice_mms,
999 bool end);
1000
1001 BowtieHit create_hit(const string& insert_name,
1002 const string& ref_name,
1003 uint32_t left,
1004 uint32_t read_len,
1005 bool antisense_aln,
1006 unsigned char edit_dist,
1007 bool end);
1008
1009 virtual string hitfile_rec(HitStream& hs, const char* hit_buf)=0;
1010 virtual bool next_record(HitStream& hs, const char*& buf, size_t& buf_size) = 0;
1011 virtual bool get_hit_from_buf(const char* bwt_buf,
1012 BowtieHit& bh,
1013 bool strip_slash,
1014 char* name_out = NULL,
1015 char* name_tags = NULL,
1016 char* seq = NULL,
1017 char* qual = NULL) = 0;
1018
1019 protected:
1020 ReadTable& _insert_table;
1021 RefSequenceTable& _ref_table;
1022 HitStream* _hit_stream;
1023 };//class HitFactory
1024
1025
1026 class LineHitFactory : public HitFactory {
1027 //for text line-based formats like Bowtie and SAM
1028 public:
1029 LineHitFactory(ReadTable& insert_table,
1030 RefSequenceTable& reference_table) :
1031 HitFactory(insert_table, reference_table) {}
1032
1033 string hitfile_rec(HitStream& hs, const char* hit_buf) {
1034 string r(hit_buf);
1035 return r;
1036 }
1037 void openStream(HitStream& hs);
1038 void rewind(HitStream& hs);
1039 void seek(HitStream&hs, int64_t offset);
1040 void closeStream(HitStream& hs);
1041 bool next_record(HitStream& hs, const char*& buf, size_t& buf_size);
1042 protected:
1043 static const size_t _hit_buf_max_sz = 10 * 1024;
1044 char _hit_buf[_hit_buf_max_sz];
1045 int _line_num;
1046 };
1047
1048 class BowtieHitFactory : public LineHitFactory {
1049 public:
1050 BowtieHitFactory(ReadTable& insert_table,
1051 RefSequenceTable& reference_table) :
1052 LineHitFactory(insert_table, reference_table) {}
1053
1054 bool get_hit_from_buf(const char* bwt_buf,
1055 BowtieHit& bh,
1056 bool strip_slash,
1057 char* name_out = NULL,
1058 char* name_tags = NULL,
1059 char* seq = NULL,
1060 char* qual = NULL);
1061 };
1062
1063 class SplicedBowtieHitFactory : public LineHitFactory {
1064 public:
1065 SplicedBowtieHitFactory(ReadTable& insert_table,
1066 RefSequenceTable& reference_table,
1067 int anchor_length) :
1068 LineHitFactory(insert_table, reference_table),
1069 _anchor_length(anchor_length){}
1070
1071 bool get_hit_from_buf(const char* bwt_buf,
1072 BowtieHit& bh,
1073 bool strip_slash,
1074 char* name_out = NULL,
1075 char* name_tags = NULL,
1076 char* seq = NULL,
1077 char* qual = NULL);
1078 private:
1079 int _anchor_length;
1080 int _seg_offset;
1081 int _size_buf;
1082 };
1083
1084 class SplicedSAMHitFactory : public LineHitFactory {
1085 public:
1086 SplicedSAMHitFactory(ReadTable& insert_table,
1087 RefSequenceTable& reference_table,
1088 int anchor_length) :
1089 LineHitFactory(insert_table, reference_table),
1090 _anchor_length(anchor_length){}
1091
1092 bool get_hit_from_buf(const char* bwt_buf,
1093 BowtieHit& bh,
1094 bool strip_slash,
1095 char* name_out = NULL,
1096 char* name_tags = NULL,
1097 char* seq = NULL,
1098 char* qual = NULL);
1099 private:
1100 int _anchor_length;
1101 int _seg_offset;
1102 int _size_buf;
1103 };
1104
1105
1106 class SAMHitFactory : public LineHitFactory {
1107 public:
1108 SAMHitFactory(ReadTable& insert_table,
1109 RefSequenceTable& reference_table) :
1110 LineHitFactory(insert_table, reference_table) {}
1111
1112 bool get_hit_from_buf(const char* bwt_buf,
1113 BowtieHit& bh,
1114 bool strip_slash,
1115 char* name_out = NULL,
1116 char* name_tags = NULL,
1117 char* seq = NULL,
1118 char* qual = NULL);
1119 };
1120
1121
1122 /******************************************************************************
1123 BAMHitFactory turns SAM alignments into BowtieHits
1124 *******************************************************************************/
1125 class BAMHitFactory : public HitFactory {
1126 public:
1127
1128 BAMHitFactory(ReadTable& insert_table,
1129 RefSequenceTable& reference_table) :
1130 HitFactory(insert_table, reference_table)
1131 {
1132 _sam_header=NULL;
1133 }
1134 void openStream(HitStream& hs);
1135 void rewind(HitStream& hs);
1136 void seek(HitStream& hs, int64_t offset);
1137 void closeStream(HitStream& hs);
1138
1139 bool next_record(HitStream& hs, const char*& buf, size_t& buf_size);
1140
1141 /*void mark_curr_pos()
1142 {
1143 _curr_pos = bgzf_tell(((samfile_t*)_hit_file)->x.bam);
1144 }
1145
1146
1147 void undo_hit()
1148 {
1149 bgzf_seek(((samfile_t*)_hit_file)->x.bam, _curr_pos, SEEK_SET);
1150 _eof = false;
1151 }
1152 */
1153 string hitfile_rec(HitStream& hs, const char* hit_buf);
1154
1155 bool get_hit_from_buf(const char* bwt_buf,
1156 BowtieHit& bh,
1157 bool strip_slash,
1158 char* name_out = NULL,
1159 char* name_tags = NULL,
1160 char* seq = NULL,
1161 char* qual = NULL);
1162 protected:
1163 //int64_t _curr_pos;
1164 //int64_t _beginning;
1165 bam1_t _next_hit;
1166 bam_header_t* _sam_header;
1167 bool inspect_header(HitStream& hs);
1168 };
1169
1170 class SplicedBAMHitFactory : public BAMHitFactory {
1171 public:
1172 SplicedBAMHitFactory(ReadTable& insert_table,
1173 RefSequenceTable& reference_table,
1174 int anchor_length) :
1175 BAMHitFactory(insert_table, reference_table),
1176 _anchor_length(anchor_length)
1177 {
1178 }
1179
1180 bool get_hit_from_buf(const char* bwt_buf,
1181 BowtieHit& bh,
1182 bool strip_slash,
1183 char* name_out = NULL,
1184 char* name_tags = NULL,
1185 char* seq = NULL,
1186 char* qual = NULL);
1187
1188 private:
1189 int _anchor_length;
1190 int _seg_offset;
1191 int _size_buf;
1192 };
1193
1194
1195 struct HitsForRead
1196 {
1197 HitsForRead() : insert_id(0) {}
1198 uint64_t insert_id;
1199 vector<BowtieHit> hits;
1200 };
1201
1202 class HitStream
1203 {
1204 friend class HitFactory;
1205 friend class LineHitFactory;
1206 friend class BAMHitFactory;
1207 //private:
1208 HitFactory* _factory;
1209 bool _spliced;
1210 bool _strip_slash;
1211 BowtieHit buffered_hit;
1212 bool _keep_bufs;
1213 bool _keep_seqs;
1214 bool _keep_quals;
1215 bool _from_bowtie;
1216 void* _hit_file;
1217 string _hit_file_name;
1218 FZPipe* _fzpipe;
1219 bool _eof;
1220
1221 public:
1222 HitStream(void* hit_file, //could be FILE* or samfile_t*
1223 HitFactory* hit_factory,
1224 bool spliced,
1225 bool strip_slash,
1226 bool keep_bufs,
1227 bool keep_seqs = false,
1228 bool keep_quals = false,
1229 bool from_bowtie = false) :
1230 _factory(hit_factory),
1231 _spliced(spliced),
1232 _strip_slash(strip_slash),
1233 buffered_hit(BowtieHit()),
1234 _keep_bufs(keep_bufs),
1235 _keep_seqs(keep_seqs),
1236 _keep_quals(keep_quals),
1237 _from_bowtie(from_bowtie),
1238 _hit_file(hit_file),
1239 _hit_file_name(),
1240 _fzpipe(NULL),
1241 _eof(false)
1242 {
1243 primeStream();
1244 }
1245
1246 HitStream(const string& hit_filename,
1247 HitFactory* hit_factory,
1248 bool spliced,
1249 bool strip_slash,
1250 bool keep_bufs,
1251 bool keep_seqs = false,
1252 bool keep_quals = false,
1253 bool from_bowtie = false) :
1254 _factory(hit_factory),
1255 _spliced(spliced),
1256 _strip_slash(strip_slash),
1257 buffered_hit(BowtieHit()),
1258 _keep_bufs(keep_bufs),
1259 _keep_seqs(keep_seqs),
1260 _keep_quals(keep_quals),
1261 _from_bowtie(from_bowtie),
1262 _hit_file(NULL),
1263 _hit_file_name(hit_filename),
1264 _fzpipe(NULL),
1265 _eof(false)
1266 {
1267 _factory->openStream(*this);
1268 primeStream();
1269 }
1270
1271 HitStream(FZPipe& hit_filepipe,
1272 HitFactory* hit_factory,
1273 bool spliced,
1274 bool strip_slash,
1275 bool keep_bufs,
1276 bool keep_seqs = false,
1277 bool keep_quals = false,
1278 bool from_bowtie = false) :
1279 _factory(hit_factory),
1280 _spliced(spliced),
1281 _strip_slash(strip_slash),
1282 buffered_hit(BowtieHit()),
1283 _keep_bufs(keep_bufs),
1284 _keep_seqs(keep_seqs),
1285 _keep_quals(keep_quals),
1286 _from_bowtie(from_bowtie),
1287 _hit_file(NULL),
1288 _hit_file_name(),
1289 _fzpipe(&hit_filepipe),
1290 _eof(false)
1291 {
1292 _hit_file=_fzpipe->file;
1293 primeStream();
1294 }
1295
1296 void primeStream() { //why?
1297 // Prime the stream by reading a single hit into the buffered_hit
1298 HitsForRead dummy = HitsForRead();
1299 next_read_hits(dummy);
1300 }
1301 bool eof() { return _eof; }
1302 bool ready() { return (_hit_file!=NULL); }
1303 void reset() {
1304 _factory->rewind(*this);
1305 _eof=false;
1306 // re-prime the stream;
1307 buffered_hit = BowtieHit();
1308 primeStream();
1309 }
1310 void seek(int64_t offset) {
1311 _factory->seek(*this, offset);
1312 _eof = false;
1313 buffered_hit = BowtieHit();
1314 primeStream();
1315 }
1316
1317 bool next_read_hits(HitsForRead& hits_for_read)
1318 {
1319 hits_for_read.hits.clear();
1320 hits_for_read.insert_id = 0;
1321
1322 //if (!_hit_file || (feof(_hit_file) && buffered_hit.insert_id() == 0))
1323 // return false;
1324 if (!this->ready())
1325 //err_die("Error: next_read_hits() called on HitFactory with no file handle\n");
1326 return false;
1327 if (this->eof() && buffered_hit.insert_id() == 0) {
1328 return false;
1329 }
1330
1331 //char bwt_buf[2048]; bwt_buf[0] = 0;
1332 char bwt_seq[2048]; bwt_seq[0] = 0;
1333 char bwt_qual[2048]; bwt_qual[0] = 0;
1334
1335 char* seq = _keep_seqs ? bwt_seq : NULL;
1336 char* qual = _keep_quals ? bwt_qual : NULL;
1337
1338 hits_for_read.insert_id = buffered_hit.insert_id();
1339 if (hits_for_read.insert_id)
1340 hits_for_read.hits.push_back(buffered_hit);
1341 const char* hit_buf;
1342 size_t hit_buf_size = 0;
1343 while (true) {
1344
1345 if (!_factory->next_record(*this, hit_buf, hit_buf_size)) {
1346 buffered_hit = BowtieHit();
1347 break; }
1348
1349 //string clean_buf = bwt_buf;
1350 // Get a new record from the tab-delimited Bowtie map
1351 BowtieHit bh;
1352 if (_factory->get_hit_from_buf(hit_buf, bh, _strip_slash,
1353 NULL, NULL, seq, qual)) {
1354 if (_keep_bufs)
1355 bh.hitfile_rec(_factory->hitfile_rec(*this, hit_buf));
1356
1357 if (_keep_seqs)
1358 bh.seq(seq);
1359
1360 if (_keep_quals) {
1361 // when it comes to convert from qual in color to qual in bp,
1362 // we need to fill in the two extream qual values using the adjacent qual values.
1363 size_t qual_len = strlen(qual);
1364 if (color && qual_len > 2) {
1365 qual[0] = qual[1];
1366 qual[qual_len-1] = qual[qual_len-2];
1367 }
1368 bh.qual(qual);
1369 }
1370
1371 if (bh.insert_id() == hits_for_read.insert_id) {
1372 hits_for_read.hits.push_back(bh);
1373 }
1374 else {
1375 buffered_hit = bh;
1376 break;
1377 }
1378 } //hit parsed
1379 } //while reading hits
1380
1381 return (!hits_for_read.hits.empty() && hits_for_read.insert_id != 0);
1382 }
1383
1384 uint64_t next_group_id() const
1385 {
1386 return buffered_hit.insert_id();
1387 }
1388 bool fromBowtie() { return _from_bowtie; }
1389 };
1390
1391 typedef uint32_t MateStatusMask;
1392
1393 void get_mapped_reads(FILE* bwtf,
1394 HitTable& hits,
1395 HitFactory& hit_factory,
1396 bool strip_slash,
1397 bool verbose = false);
1398
1399 //bool left_status_better(MateStatusMask left, MateStatusMask right);
1400 //bool status_equivalent(MateStatusMask left, MateStatusMask right);
1401 typedef uint32_t MateStatusMask;
1402
1403 void add_hits_to_coverage(const HitList& hits, vector<unsigned short>& DoC);
1404 void add_hit_to_coverage(const BowtieHit& bh, vector<unsigned int>& DoC);
1405
1406 void accept_all_hits(HitTable& hits);
1407
1408 //print BowtieHit as BAM record
1409 void print_bamhit(GBamWriter& wbam,
1410 const char* read_name,
1411 const BowtieHit& bh,
1412 const char* ref_name,
1413 const char* ref_name2,
1414 const char* sequence,
1415 const char* qualities,
1416 bool from_bowtie = false,
1417 const vector<string>* extra_fields = NULL);
1418
1419
1420 void extract_partial_hits(const BowtieHit& bh, const string& seq, const string& qual,
1421 char* cigar1, char* cigar2, string& seq1, string& seq2,
1422 string& qual1, string& qual2, int& left1, int& left2);
1423
1424 /**
1425 * Convert a vector of CigarOps to a string representation
1426 */
1427 std::string print_cigar(const vector<CigarOp>& bh_cigar);
1428
1429 /**
1430 * Calculate bowtie (1 or 2) related extra SAM fields such as
1431 * AS:i (alignment score)
1432 * MD:Z
1433 * NM:i
1434 * etc
1435 */
1436 void bowtie_sam_extra(const BowtieHit& bh, const RefSequenceTable& rt, vector<string>& fields);
1437
1438 #endif