1 |
#ifndef BWT_MAP_H |
2 |
#define BWT_MAP_H |
3 |
|
4 |
#ifdef HAVE_CONFIG_H |
5 |
#include <config.h> |
6 |
#endif |
7 |
|
8 |
#include <stdint.h> |
9 |
#include <stdio.h> |
10 |
#include <stdlib.h> |
11 |
#include <string> |
12 |
#include <map> |
13 |
#include <vector> |
14 |
#include <cassert> |
15 |
#include <cstring> |
16 |
#include <seqan/sequence.h> |
17 |
|
18 |
#include <bam/sam.h> |
19 |
using namespace std; |
20 |
|
21 |
#include "common.h" |
22 |
#define _FBUF_SIZE 10*1024 |
23 |
/* |
24 |
* bwt_map.h |
25 |
* TopHat |
26 |
* |
27 |
* Created by Cole Trapnell on 11/17/08. |
28 |
* Copyright 2008 Cole Trapnell. All rights reserved. |
29 |
* |
30 |
*/ |
31 |
|
32 |
enum CigarOpCode { MATCH, INS, DEL, REF_SKIP, SOFT_CLIP, HARD_CLIP, PAD }; |
33 |
|
34 |
struct CigarOp |
35 |
{ |
36 |
CigarOp(CigarOpCode o, uint32_t l) : opcode(o), length(l) {} |
37 |
CigarOpCode opcode : 3; |
38 |
uint32_t length : 29; |
39 |
|
40 |
bool operator==(const CigarOp& rhs) const |
41 |
{ |
42 |
return opcode == rhs.opcode && length == rhs.length; |
43 |
} |
44 |
|
45 |
}; |
46 |
|
47 |
typedef uint32_t ReadID; |
48 |
|
49 |
class RefSequenceTable; |
50 |
|
51 |
/* Stores the information from a single record of the bowtie map. A given read |
52 |
may have many of these. Reads up to 255bp are supported. |
53 |
*/ |
54 |
struct BowtieHit |
55 |
{ |
56 |
BowtieHit() : |
57 |
_ref_id(0), |
58 |
_insert_id(0), |
59 |
_left(0), |
60 |
_antisense_splice(false), |
61 |
_antisense_aln(false), |
62 |
_edit_dist(0), |
63 |
_splice_mms(0), |
64 |
_end(false){} |
65 |
|
66 |
BowtieHit(uint32_t ref_id, |
67 |
ReadID insert_id, |
68 |
int left, |
69 |
int read_len, |
70 |
bool antisense, |
71 |
unsigned char edit_dist, |
72 |
bool end) : |
73 |
_ref_id(ref_id), |
74 |
_insert_id(insert_id), |
75 |
_left(left), |
76 |
_cigar(vector<CigarOp>(1,CigarOp(MATCH,read_len))), |
77 |
_antisense_splice(false), |
78 |
_antisense_aln(antisense), |
79 |
_edit_dist(edit_dist), |
80 |
_splice_mms(0), |
81 |
_end(end) |
82 |
{ |
83 |
assert(_cigar.capacity() == _cigar.size()); |
84 |
} |
85 |
|
86 |
BowtieHit(uint32_t ref_id, |
87 |
ReadID insert_id, |
88 |
int left, |
89 |
const vector<CigarOp>& cigar, |
90 |
bool antisense_aln, |
91 |
bool antisense_splice, |
92 |
unsigned char edit_dist, |
93 |
unsigned char splice_mms, |
94 |
bool end) : |
95 |
_ref_id(ref_id), |
96 |
_insert_id(insert_id), |
97 |
_left(left), |
98 |
_cigar(cigar), |
99 |
_antisense_splice(antisense_splice), |
100 |
_antisense_aln(antisense_aln), |
101 |
_edit_dist(edit_dist), |
102 |
_splice_mms(splice_mms), |
103 |
_end(end) |
104 |
{ |
105 |
assert(_cigar.capacity() == _cigar.size()); |
106 |
} |
107 |
|
108 |
int read_len() const |
109 |
{ |
110 |
uint32_t len = 0; |
111 |
for (size_t i = 0; i < _cigar.size(); ++i) |
112 |
{ |
113 |
const CigarOp& op = _cigar[i]; |
114 |
switch(op.opcode) |
115 |
{ |
116 |
case MATCH: |
117 |
case INS: |
118 |
case SOFT_CLIP: |
119 |
len += op.length; |
120 |
break; |
121 |
default: |
122 |
break; |
123 |
} |
124 |
} |
125 |
|
126 |
return len; |
127 |
} |
128 |
|
129 |
bool operator==(const BowtieHit& rhs) const |
130 |
{ |
131 |
return (_insert_id == rhs._insert_id && |
132 |
_ref_id == rhs._ref_id && |
133 |
_antisense_aln == rhs._antisense_aln && |
134 |
_left == rhs._left && |
135 |
_antisense_splice == rhs._antisense_splice && |
136 |
_edit_dist == rhs._edit_dist && |
137 |
/* DO NOT USE ACCEPTED IN COMPARISON */ |
138 |
_cigar == rhs._cigar); |
139 |
} |
140 |
|
141 |
bool operator<(const BowtieHit& rhs) const |
142 |
{ |
143 |
if (_insert_id != rhs._insert_id) |
144 |
return _insert_id < rhs._insert_id; |
145 |
if (_ref_id != rhs._ref_id) |
146 |
return _ref_id < rhs._ref_id; |
147 |
if (_left != rhs._left) |
148 |
return _left < rhs._left; |
149 |
if (_antisense_aln != rhs._antisense_aln) |
150 |
return _antisense_aln < rhs._antisense_aln; |
151 |
if (_edit_dist != rhs._edit_dist) |
152 |
return _edit_dist < rhs._edit_dist; |
153 |
if (_cigar != rhs._cigar) |
154 |
{ |
155 |
if (_cigar.size() != rhs._cigar.size()) |
156 |
return _cigar.size() < rhs._cigar.size(); |
157 |
for (size_t i = 0; i < _cigar.size(); ++i) |
158 |
{ |
159 |
if (!(_cigar[i] == rhs._cigar[i])) |
160 |
return (_cigar[i].opcode < rhs._cigar[i].opcode || (_cigar[i].opcode == rhs._cigar[i].opcode && _cigar[i].length < rhs._cigar[i].length)); |
161 |
} |
162 |
} |
163 |
return false; |
164 |
} |
165 |
|
166 |
uint32_t ref_id() const { return _ref_id; } |
167 |
ReadID insert_id() const { return _insert_id; } |
168 |
int left() const { return _left; } |
169 |
int right() const |
170 |
{ |
171 |
int r = _left; |
172 |
for (size_t i = 0; i < _cigar.size(); ++i) |
173 |
{ |
174 |
const CigarOp& op = _cigar[i]; |
175 |
|
176 |
switch(op.opcode) |
177 |
{ |
178 |
case MATCH: |
179 |
case REF_SKIP: |
180 |
case DEL: |
181 |
r += op.length; |
182 |
break; |
183 |
default: |
184 |
break; |
185 |
} |
186 |
} |
187 |
return r; |
188 |
} |
189 |
|
190 |
bool is_spliced() |
191 |
{ |
192 |
for (size_t i = 0; i < _cigar.size(); ++i) |
193 |
{ |
194 |
const CigarOp& op = _cigar[i]; |
195 |
|
196 |
if (op.opcode == REF_SKIP) |
197 |
return true; |
198 |
} |
199 |
return false; |
200 |
} |
201 |
|
202 |
bool antisense_splice() const { return _antisense_splice; } |
203 |
bool antisense_align() const { return _antisense_aln; } |
204 |
|
205 |
unsigned char edit_dist() const { return _edit_dist; } |
206 |
unsigned char splice_mms() const { return _splice_mms; } |
207 |
|
208 |
// For convenience, if you just want a copy of the gap intervals |
209 |
// for this hit. |
210 |
void gaps(vector<pair<int,int> >& gaps_out) const |
211 |
{ |
212 |
gaps_out.clear(); |
213 |
int pos = _left; |
214 |
for (size_t i = 0; i < _cigar.size(); ++i) |
215 |
{ |
216 |
const CigarOp& op = _cigar[i]; |
217 |
|
218 |
switch(op.opcode) |
219 |
{ |
220 |
case REF_SKIP: |
221 |
gaps_out.push_back(make_pair(pos, pos + op.length - 1)); |
222 |
pos += op.length; |
223 |
break; |
224 |
case MATCH: |
225 |
pos += op.length; |
226 |
break; |
227 |
default: |
228 |
break; |
229 |
} |
230 |
} |
231 |
} |
232 |
|
233 |
const vector<CigarOp>& cigar() const { return _cigar; } |
234 |
|
235 |
|
236 |
bool contiguous() const |
237 |
{ |
238 |
return _cigar.size() == 1 && _cigar[0].opcode == MATCH; |
239 |
} |
240 |
|
241 |
const string& hitfile_rec() const { return _hitfile_rec; } |
242 |
void hitfile_rec(const string& rec) { _hitfile_rec = rec; } |
243 |
|
244 |
const string& seq() const { return _seq; } |
245 |
void seq(const string& seq) { _seq = seq; } |
246 |
|
247 |
const string& qual() const { return _qual; } |
248 |
void qual(const string& qual) { _qual = qual; } |
249 |
|
250 |
bool end() const { return _end; } |
251 |
void end(bool end) { _end = end; } |
252 |
|
253 |
// this is for debugging purpose |
254 |
bool check_editdist_consistency(const RefSequenceTable& rt); |
255 |
|
256 |
private: |
257 |
|
258 |
uint32_t _ref_id; |
259 |
ReadID _insert_id; // Id of the sequencing insert |
260 |
int _left; // Position in the reference of the left side of the alignment |
261 |
|
262 |
vector<CigarOp> _cigar; |
263 |
|
264 |
bool _antisense_splice; // Whether the junction spanned is on the reverse strand |
265 |
bool _antisense_aln; // Whether the alignment is to the reverse strand |
266 |
|
267 |
unsigned char _edit_dist; // Total mismatches (note this is not including insertions or deletions as mismatches, ie, not equivalent to NM field of a SAM record) |
268 |
unsigned char _splice_mms; // Mismatches within min_anchor_len of a splice junction |
269 |
string _hitfile_rec; // Points to the buffer for the record from which this hit came |
270 |
string _seq; |
271 |
string _qual; |
272 |
bool _end; // Whether this segment is the last one of the read it belongs to |
273 |
}; |
274 |
|
275 |
class ReadTable |
276 |
{ |
277 |
public: |
278 |
|
279 |
ReadTable() : |
280 |
_next_id(1) {} |
281 |
|
282 |
// This function should NEVER return zero |
283 |
ReadID get_id(const string& name) |
284 |
{ |
285 |
uint32_t _id = atoi(name.c_str()); |
286 |
//assert(_id); |
287 |
_next_id = max(_next_id, (size_t)_id); |
288 |
return _id; |
289 |
} |
290 |
|
291 |
|
292 |
uint32_t observation_order(ReadID ID) |
293 |
{ |
294 |
if (ID == 0) |
295 |
return 0xFFFFFFFF; |
296 |
return ID; |
297 |
} |
298 |
|
299 |
|
300 |
size_t size() const { return _next_id; } |
301 |
|
302 |
private: |
303 |
size_t _next_id; |
304 |
}; |
305 |
|
306 |
class RefSequenceTable |
307 |
{ |
308 |
public: |
309 |
|
310 |
typedef seqan::String<seqan::Dna5, seqan::Packed<seqan::Alloc<> > > Sequence; |
311 |
|
312 |
struct SequenceInfo |
313 |
{ |
314 |
SequenceInfo(uint32_t _order, |
315 |
char* _name, |
316 |
Sequence* _seq, uint32_t _len) : |
317 |
observation_order(_order), |
318 |
name(_name), |
319 |
seq(_seq), |
320 |
len(_len) {} |
321 |
|
322 |
uint32_t observation_order; |
323 |
char* name; |
324 |
Sequence* seq; |
325 |
uint32_t len; |
326 |
}; |
327 |
|
328 |
typedef map<string, uint64_t> IDTable; |
329 |
typedef map<uint32_t, SequenceInfo> InvertedIDTable; |
330 |
typedef InvertedIDTable::iterator iterator; |
331 |
typedef InvertedIDTable::const_iterator const_iterator; |
332 |
|
333 |
RefSequenceTable(bool keep_names, bool keep_seqs = false) : |
334 |
_next_id(1), |
335 |
_keep_names(keep_names) {} |
336 |
|
337 |
RefSequenceTable(const string& sam_header_filename, |
338 |
bool keep_names, |
339 |
bool keep_seqs = false) : |
340 |
_next_id(1), |
341 |
_keep_names(keep_names) |
342 |
{ |
343 |
if (sam_header_filename != "") |
344 |
{ |
345 |
samfile_t* fh = samopen(sam_header_filename.c_str(), "r", 0); |
346 |
if (fh == 0) { |
347 |
fprintf(stderr, "Failed to open SAM header file %s\n", sam_header_filename.c_str()); |
348 |
exit(1); |
349 |
} |
350 |
|
351 |
for (size_t i = 0; i < (size_t)fh->header->n_targets; ++i) |
352 |
{ |
353 |
const char* name = fh->header->target_name[i]; |
354 |
uint32_t len = fh->header->target_len[i]; |
355 |
get_id(name, NULL, len); |
356 |
//fprintf(stderr, "SQ: %s - %d\n", name, len); |
357 |
} |
358 |
} |
359 |
} |
360 |
|
361 |
// This function should NEVER return zero |
362 |
uint32_t get_id(const string& name, |
363 |
Sequence* seq, |
364 |
uint32_t len) |
365 |
{ |
366 |
uint32_t _id = hash_string(name.c_str()); |
367 |
pair<InvertedIDTable::iterator, bool> ret = |
368 |
_by_id.insert(make_pair(_id, SequenceInfo(_next_id, NULL, NULL, 0))); |
369 |
if (ret.second == true) |
370 |
{ |
371 |
char* _name = NULL; |
372 |
if (_keep_names) |
373 |
_name = strdup(name.c_str()); |
374 |
ret.first->second.name = _name; |
375 |
ret.first->second.seq = seq; |
376 |
ret.first->second.len = len; |
377 |
++_next_id; |
378 |
} |
379 |
assert (_id); |
380 |
return _id; |
381 |
} |
382 |
|
383 |
const char* get_name(uint32_t ID) const |
384 |
{ |
385 |
InvertedIDTable::const_iterator itr = _by_id.find(ID); |
386 |
if (itr != _by_id.end()) |
387 |
return itr->second.name; |
388 |
else |
389 |
return NULL; |
390 |
} |
391 |
|
392 |
uint32_t get_len(uint32_t ID) const |
393 |
{ |
394 |
InvertedIDTable::const_iterator itr = _by_id.find(ID); |
395 |
if (itr != _by_id.end()) |
396 |
return itr->second.len; |
397 |
else |
398 |
return 0; |
399 |
} |
400 |
|
401 |
Sequence* get_seq(uint32_t ID) const |
402 |
{ |
403 |
InvertedIDTable::const_iterator itr = _by_id.find(ID); |
404 |
if (itr != _by_id.end()) |
405 |
return itr->second.seq; |
406 |
else |
407 |
return NULL; |
408 |
} |
409 |
|
410 |
const SequenceInfo* get_info(uint32_t ID) const |
411 |
{ |
412 |
|
413 |
InvertedIDTable::const_iterator itr = _by_id.find(ID); |
414 |
if (itr != _by_id.end()) |
415 |
{ |
416 |
return &(itr->second); |
417 |
} |
418 |
else |
419 |
return NULL; |
420 |
} |
421 |
|
422 |
int observation_order(uint32_t ID) const |
423 |
{ |
424 |
InvertedIDTable::const_iterator itr = _by_id.find(ID); |
425 |
if (itr != _by_id.end()) |
426 |
{ |
427 |
return itr->second.observation_order; |
428 |
} |
429 |
else |
430 |
return -1; |
431 |
} |
432 |
|
433 |
iterator begin() { return _by_id.begin(); } |
434 |
iterator end() { return _by_id.end(); } |
435 |
|
436 |
const_iterator begin() const { return _by_id.begin(); } |
437 |
const_iterator end() const { return _by_id.end(); } |
438 |
|
439 |
size_t size() const { return _by_id.size(); } |
440 |
|
441 |
void clear() |
442 |
{ |
443 |
//_by_name.clear(); |
444 |
_by_id.clear(); |
445 |
} |
446 |
|
447 |
private: |
448 |
|
449 |
// This is FNV-1, see http://en.wikipedia.org/wiki/Fowler_Noll_Vo_hash |
450 |
inline uint32_t hash_string(const char* __s) |
451 |
{ |
452 |
uint32_t hash = 0x811c9dc5; |
453 |
for ( ; *__s; ++__s) |
454 |
{ |
455 |
hash *= 16777619; |
456 |
hash ^= *__s; |
457 |
} |
458 |
return hash; |
459 |
} |
460 |
|
461 |
//IDTable _by_name; |
462 |
uint32_t _next_id; |
463 |
bool _keep_names; |
464 |
InvertedIDTable _by_id; |
465 |
}; |
466 |
|
467 |
|
468 |
bool hit_insert_id_lt(const BowtieHit& h1, const BowtieHit& h2); |
469 |
|
470 |
typedef vector<BowtieHit> HitList; |
471 |
|
472 |
/* This class stores all the hits from a Bowtie map */ |
473 |
|
474 |
class HitTable |
475 |
{ |
476 |
public: |
477 |
|
478 |
typedef map<uint64_t, HitList> RefHits; |
479 |
typedef RefHits::const_iterator const_iterator; |
480 |
typedef RefHits::iterator iterator; |
481 |
|
482 |
HitTable() : _total_hits(0) {} |
483 |
|
484 |
const_iterator begin() const { return _hits_for_ref.begin(); } |
485 |
const_iterator end() const { return _hits_for_ref.end(); } |
486 |
|
487 |
iterator begin() { return _hits_for_ref.begin(); } |
488 |
iterator end() { return _hits_for_ref.end(); } |
489 |
|
490 |
void add_hit(const BowtieHit& bh, bool check_uniqueness); |
491 |
|
492 |
void finalize() |
493 |
{ |
494 |
for (RefHits::iterator i = _hits_for_ref.begin(); |
495 |
i != _hits_for_ref.end(); |
496 |
++i) |
497 |
{ |
498 |
sort(i->second.begin(), i->second.end(), hit_insert_id_lt); |
499 |
} |
500 |
} |
501 |
|
502 |
HitList* get_hits(uint64_t ref_id) |
503 |
{ |
504 |
RefHits::iterator i = _hits_for_ref.find(ref_id); |
505 |
if (i == _hits_for_ref.end()) |
506 |
return NULL; |
507 |
else |
508 |
return &(i->second); |
509 |
} |
510 |
|
511 |
uint32_t total_hits() const { return _total_hits; } |
512 |
|
513 |
private: |
514 |
RefHits _hits_for_ref; |
515 |
uint32_t _total_hits; |
516 |
}; |
517 |
|
518 |
class HitStream; |
519 |
|
520 |
class HitFactory { |
521 |
friend class HitStream; |
522 |
public: |
523 |
HitFactory(ReadTable& insert_table, |
524 |
RefSequenceTable& reference_table) : |
525 |
_insert_table(insert_table), _ref_table(reference_table) |
526 |
{} |
527 |
virtual ~HitFactory() {} |
528 |
virtual void openStream(HitStream& hs)=0; |
529 |
virtual void rewind(HitStream& hs)=0; |
530 |
virtual void closeStream(HitStream& hs)=0; |
531 |
BowtieHit create_hit(const string& insert_name, |
532 |
const string& ref_name, |
533 |
int left, |
534 |
const vector<CigarOp>& cigar, |
535 |
bool antisense_aln, |
536 |
bool antisense_splice, |
537 |
unsigned char edit_dist, |
538 |
unsigned char splice_mms, |
539 |
bool end); |
540 |
|
541 |
BowtieHit create_hit(const string& insert_name, |
542 |
const string& ref_name, |
543 |
uint32_t left, |
544 |
uint32_t read_len, |
545 |
bool antisense_aln, |
546 |
unsigned char edit_dist, |
547 |
bool end); |
548 |
|
549 |
virtual string hitfile_rec(HitStream& hs, const char* hit_buf)=0; |
550 |
virtual bool next_record(HitStream& hs, const char*& buf, size_t& buf_size) = 0; |
551 |
virtual bool get_hit_from_buf(const char* bwt_buf, |
552 |
BowtieHit& bh, |
553 |
bool strip_slash, |
554 |
char* name_out = NULL, |
555 |
char* name_tags = NULL, |
556 |
char* seq = NULL, |
557 |
char* qual = NULL) = 0; |
558 |
|
559 |
protected: |
560 |
ReadTable& _insert_table; |
561 |
RefSequenceTable& _ref_table; |
562 |
HitStream* _hit_stream; |
563 |
};//class HitFactory |
564 |
|
565 |
|
566 |
class LineHitFactory : public HitFactory { |
567 |
//for text line-based formats like Bowtie and SAM |
568 |
public: |
569 |
LineHitFactory(ReadTable& insert_table, |
570 |
RefSequenceTable& reference_table) : |
571 |
HitFactory(insert_table, reference_table) {} |
572 |
|
573 |
string hitfile_rec(HitStream& hs, const char* hit_buf) { |
574 |
string r(hit_buf); |
575 |
return r; |
576 |
} |
577 |
void openStream(HitStream& hs); |
578 |
void rewind(HitStream& hs); |
579 |
void closeStream(HitStream& hs); |
580 |
bool next_record(HitStream& hs, const char*& buf, size_t& buf_size); |
581 |
protected: |
582 |
static const size_t _hit_buf_max_sz = 10 * 1024; |
583 |
char _hit_buf[_hit_buf_max_sz]; |
584 |
int _line_num; |
585 |
}; |
586 |
|
587 |
class BowtieHitFactory : public LineHitFactory { |
588 |
public: |
589 |
BowtieHitFactory(ReadTable& insert_table, |
590 |
RefSequenceTable& reference_table) : |
591 |
LineHitFactory(insert_table, reference_table) {} |
592 |
|
593 |
bool get_hit_from_buf(const char* bwt_buf, |
594 |
BowtieHit& bh, |
595 |
bool strip_slash, |
596 |
char* name_out = NULL, |
597 |
char* name_tags = NULL, |
598 |
char* seq = NULL, |
599 |
char* qual = NULL); |
600 |
}; |
601 |
|
602 |
class SplicedBowtieHitFactory : public LineHitFactory { |
603 |
public: |
604 |
SplicedBowtieHitFactory(ReadTable& insert_table, |
605 |
RefSequenceTable& reference_table, |
606 |
int anchor_length) : |
607 |
LineHitFactory(insert_table, reference_table), |
608 |
_anchor_length(anchor_length){} |
609 |
|
610 |
bool get_hit_from_buf(const char* bwt_buf, |
611 |
BowtieHit& bh, |
612 |
bool strip_slash, |
613 |
char* name_out = NULL, |
614 |
char* name_tags = NULL, |
615 |
char* seq = NULL, |
616 |
char* qual = NULL); |
617 |
private: |
618 |
int _anchor_length; |
619 |
int _seg_offset; |
620 |
int _size_buf; |
621 |
}; |
622 |
|
623 |
class SplicedSAMHitFactory : public LineHitFactory { |
624 |
public: |
625 |
SplicedSAMHitFactory(ReadTable& insert_table, |
626 |
RefSequenceTable& reference_table, |
627 |
int anchor_length) : |
628 |
LineHitFactory(insert_table, reference_table), |
629 |
_anchor_length(anchor_length){} |
630 |
|
631 |
bool get_hit_from_buf(const char* bwt_buf, |
632 |
BowtieHit& bh, |
633 |
bool strip_slash, |
634 |
char* name_out = NULL, |
635 |
char* name_tags = NULL, |
636 |
char* seq = NULL, |
637 |
char* qual = NULL); |
638 |
private: |
639 |
int _anchor_length; |
640 |
int _seg_offset; |
641 |
int _size_buf; |
642 |
}; |
643 |
|
644 |
|
645 |
class SAMHitFactory : public LineHitFactory { |
646 |
public: |
647 |
SAMHitFactory(ReadTable& insert_table, |
648 |
RefSequenceTable& reference_table) : |
649 |
LineHitFactory(insert_table, reference_table) {} |
650 |
|
651 |
bool get_hit_from_buf(const char* bwt_buf, |
652 |
BowtieHit& bh, |
653 |
bool strip_slash, |
654 |
char* name_out = NULL, |
655 |
char* name_tags = NULL, |
656 |
char* seq = NULL, |
657 |
char* qual = NULL); |
658 |
}; |
659 |
|
660 |
|
661 |
/****************************************************************************** |
662 |
BAMHitFactory turns SAM alignments into BowtieHits |
663 |
*******************************************************************************/ |
664 |
class BAMHitFactory : public HitFactory { |
665 |
public: |
666 |
|
667 |
BAMHitFactory(ReadTable& insert_table, |
668 |
RefSequenceTable& reference_table) : |
669 |
HitFactory(insert_table, reference_table) |
670 |
{ |
671 |
_sam_header=NULL; |
672 |
} |
673 |
void openStream(HitStream& hs); |
674 |
void rewind(HitStream& hs); |
675 |
void closeStream(HitStream& hs); |
676 |
|
677 |
bool next_record(HitStream& hs, const char*& buf, size_t& buf_size); |
678 |
|
679 |
/*void mark_curr_pos() |
680 |
{ |
681 |
_curr_pos = bgzf_tell(((samfile_t*)_hit_file)->x.bam); |
682 |
} |
683 |
|
684 |
|
685 |
void undo_hit() |
686 |
{ |
687 |
bgzf_seek(((samfile_t*)_hit_file)->x.bam, _curr_pos, SEEK_SET); |
688 |
_eof = false; |
689 |
} |
690 |
*/ |
691 |
string hitfile_rec(HitStream& hs, const char* hit_buf); |
692 |
|
693 |
bool get_hit_from_buf(const char* bwt_buf, |
694 |
BowtieHit& bh, |
695 |
bool strip_slash, |
696 |
char* name_out = NULL, |
697 |
char* name_tags = NULL, |
698 |
char* seq = NULL, |
699 |
char* qual = NULL); |
700 |
private: |
701 |
//int64_t _curr_pos; |
702 |
//int64_t _beginning; |
703 |
bam1_t _next_hit; |
704 |
bam_header_t* _sam_header; |
705 |
bool inspect_header(HitStream& hs); |
706 |
}; |
707 |
|
708 |
|
709 |
struct HitsForRead |
710 |
{ |
711 |
HitsForRead() : insert_id(0) {} |
712 |
uint64_t insert_id; |
713 |
vector<BowtieHit> hits; |
714 |
}; |
715 |
|
716 |
class HitStream |
717 |
{ |
718 |
friend class HitFactory; |
719 |
friend class LineHitFactory; |
720 |
friend class BAMHitFactory; |
721 |
//private: |
722 |
HitFactory* _factory; |
723 |
bool _spliced; |
724 |
bool _strip_slash; |
725 |
BowtieHit buffered_hit; |
726 |
bool _keep_bufs; |
727 |
bool _keep_seqs; |
728 |
bool _keep_quals; |
729 |
bool _from_bowtie; |
730 |
void* _hit_file; |
731 |
string _hit_file_name; |
732 |
FZPipe* _fzpipe; |
733 |
bool _eof; |
734 |
|
735 |
public: |
736 |
HitStream(void* hit_file, //could be FILE* or samfile_t* |
737 |
HitFactory* hit_factory, |
738 |
bool spliced, |
739 |
bool strip_slash, |
740 |
bool keep_bufs, |
741 |
bool keep_seqs = false, |
742 |
bool keep_quals = false, |
743 |
bool from_bowtie = false) : |
744 |
_factory(hit_factory), |
745 |
_spliced(spliced), |
746 |
_strip_slash(strip_slash), |
747 |
buffered_hit(BowtieHit()), |
748 |
_keep_bufs(keep_bufs), |
749 |
_keep_seqs(keep_seqs), |
750 |
_keep_quals(keep_quals), |
751 |
_from_bowtie(from_bowtie), |
752 |
_hit_file(hit_file), |
753 |
_hit_file_name(), |
754 |
_fzpipe(NULL), |
755 |
_eof(false) |
756 |
{ |
757 |
primeStream(); |
758 |
} |
759 |
|
760 |
HitStream(string& hit_filename, |
761 |
HitFactory* hit_factory, |
762 |
bool spliced, |
763 |
bool strip_slash, |
764 |
bool keep_bufs, |
765 |
bool keep_seqs = false, |
766 |
bool keep_quals = false, |
767 |
bool from_bowtie = false) : |
768 |
_factory(hit_factory), |
769 |
_spliced(spliced), |
770 |
_strip_slash(strip_slash), |
771 |
buffered_hit(BowtieHit()), |
772 |
_keep_bufs(keep_bufs), |
773 |
_keep_seqs(keep_seqs), |
774 |
_keep_quals(keep_quals), |
775 |
_from_bowtie(from_bowtie), |
776 |
_hit_file(NULL), |
777 |
_hit_file_name(hit_filename), |
778 |
_fzpipe(NULL), |
779 |
_eof(false) |
780 |
{ |
781 |
_factory->openStream(*this); |
782 |
primeStream(); |
783 |
} |
784 |
|
785 |
HitStream(FZPipe& hit_filepipe, |
786 |
HitFactory* hit_factory, |
787 |
bool spliced, |
788 |
bool strip_slash, |
789 |
bool keep_bufs, |
790 |
bool keep_seqs = false, |
791 |
bool keep_quals = false, |
792 |
bool from_bowtie = false) : |
793 |
_factory(hit_factory), |
794 |
_spliced(spliced), |
795 |
_strip_slash(strip_slash), |
796 |
buffered_hit(BowtieHit()), |
797 |
_keep_bufs(keep_bufs), |
798 |
_keep_seqs(keep_seqs), |
799 |
_keep_quals(keep_quals), |
800 |
_from_bowtie(from_bowtie), |
801 |
_hit_file(NULL), |
802 |
_hit_file_name(), |
803 |
_fzpipe(&hit_filepipe), |
804 |
_eof(false) |
805 |
{ |
806 |
_hit_file=_fzpipe->file; |
807 |
primeStream(); |
808 |
} |
809 |
|
810 |
void primeStream() { //why? |
811 |
// Prime the stream by reading a single hit into the buffered_hit |
812 |
HitsForRead dummy = HitsForRead(); |
813 |
next_read_hits(dummy); |
814 |
} |
815 |
bool eof() { return _eof; } |
816 |
bool ready() { return (_hit_file!=NULL); } |
817 |
void reset() { |
818 |
_factory->rewind(*this); |
819 |
_eof=false; |
820 |
// re-prime the stream; |
821 |
buffered_hit = BowtieHit(); |
822 |
primeStream(); |
823 |
} |
824 |
|
825 |
bool next_read_hits(HitsForRead& hits_for_read) |
826 |
{ |
827 |
hits_for_read.hits.clear(); |
828 |
hits_for_read.insert_id = 0; |
829 |
|
830 |
//if (!_hit_file || (feof(_hit_file) && buffered_hit.insert_id() == 0)) |
831 |
// return false; |
832 |
if (!this->ready()) |
833 |
//err_die("Error: next_read_hits() called on HitFactory with no file handle\n"); |
834 |
return false; |
835 |
if (this->eof() && buffered_hit.insert_id() == 0) { |
836 |
return false; |
837 |
} |
838 |
|
839 |
//char bwt_buf[2048]; bwt_buf[0] = 0; |
840 |
char bwt_seq[2048]; bwt_seq[0] = 0; |
841 |
char bwt_qual[2048]; bwt_qual[0] = 0; |
842 |
|
843 |
char* seq = _keep_seqs ? bwt_seq : NULL; |
844 |
char* qual = _keep_quals ? bwt_qual : NULL; |
845 |
|
846 |
hits_for_read.insert_id = buffered_hit.insert_id(); |
847 |
if (hits_for_read.insert_id) |
848 |
hits_for_read.hits.push_back(buffered_hit); |
849 |
const char* hit_buf; |
850 |
size_t hit_buf_size = 0; |
851 |
while (true) { |
852 |
if (!_factory->next_record(*this, hit_buf, hit_buf_size)) { |
853 |
buffered_hit = BowtieHit(); |
854 |
break; } |
855 |
//string clean_buf = bwt_buf; |
856 |
// Get a new record from the tab-delimited Bowtie map |
857 |
BowtieHit bh; |
858 |
if (_factory->get_hit_from_buf(hit_buf, bh, _strip_slash, |
859 |
NULL, NULL, seq, qual)) { |
860 |
if (_keep_bufs) |
861 |
bh.hitfile_rec(_factory->hitfile_rec(*this, hit_buf)); |
862 |
|
863 |
if (_keep_seqs) |
864 |
bh.seq(seq); |
865 |
|
866 |
if (_keep_quals) { |
867 |
// when it comes to convert from qual in color to qual in bp, |
868 |
// we need to fill in the two extream qual values using the adjacent qual values. |
869 |
size_t qual_len = strlen(qual); |
870 |
if (color && !color_out && qual_len > 2) { |
871 |
qual[0] = qual[1]; |
872 |
qual[qual_len-1] = qual[qual_len-2]; |
873 |
} |
874 |
bh.qual(qual); |
875 |
} |
876 |
|
877 |
if (bh.insert_id() == hits_for_read.insert_id) { |
878 |
hits_for_read.hits.push_back(bh); |
879 |
} |
880 |
else { |
881 |
buffered_hit = bh; |
882 |
break; |
883 |
} |
884 |
} //hit parsed |
885 |
} //while reading hits |
886 |
|
887 |
return (!hits_for_read.hits.empty() && hits_for_read.insert_id != 0); |
888 |
} |
889 |
|
890 |
uint64_t next_group_id() const |
891 |
{ |
892 |
return buffered_hit.insert_id(); |
893 |
} |
894 |
bool fromBowtie() { return _from_bowtie; } |
895 |
}; |
896 |
|
897 |
typedef uint32_t MateStatusMask; |
898 |
|
899 |
void get_mapped_reads(FILE* bwtf, |
900 |
HitTable& hits, |
901 |
HitFactory& hit_factory, |
902 |
bool strip_slash, |
903 |
bool verbose = false); |
904 |
|
905 |
//bool left_status_better(MateStatusMask left, MateStatusMask right); |
906 |
//bool status_equivalent(MateStatusMask left, MateStatusMask right); |
907 |
typedef uint32_t MateStatusMask; |
908 |
|
909 |
void add_hits_to_coverage(const HitList& hits, vector<unsigned short>& DoC); |
910 |
void add_hit_to_coverage(const BowtieHit& bh, vector<unsigned int>& DoC); |
911 |
|
912 |
void accept_all_hits(HitTable& hits); |
913 |
|
914 |
//print BowtieHit as SAM line |
915 |
void print_hit(FILE* fout, |
916 |
const char* read_name, |
917 |
const BowtieHit& bh, |
918 |
const char* ref_name, |
919 |
const char* sequence, |
920 |
const char* qualities, |
921 |
bool from_bowtie = false); |
922 |
|
923 |
//print BowtieHit as BAM record |
924 |
void print_bamhit(GBamWriter& wbam, |
925 |
const char* read_name, |
926 |
const BowtieHit& bh, |
927 |
const char* ref_name, |
928 |
const char* sequence, |
929 |
const char* qualities, |
930 |
bool from_bowtie = false); |
931 |
|
932 |
/** |
933 |
* Convert a vector of CigarOps to a string representation |
934 |
*/ |
935 |
std::string print_cigar(vector<CigarOp>& bh_cigar); |
936 |
|
937 |
#endif |