1 |
#include "GBam.h" |
2 |
#include <ctype.h> |
3 |
#include "kstring.h" |
4 |
|
5 |
//auxiliary functions for low level BAM record creation |
6 |
uint8_t* realloc_bdata(bam1_t *b, int size) { |
7 |
if (b->m_data < size) { |
8 |
b->m_data = size; |
9 |
kroundup32(b->m_data); |
10 |
b->data = (uint8_t*)realloc(b->data, b->m_data); |
11 |
} |
12 |
if (b->data_len<size) b->data_len=size; |
13 |
return b->data; |
14 |
} |
15 |
|
16 |
uint8_t* dupalloc_bdata(bam1_t *b, int size) { |
17 |
//same as realloc_bdata, but does not free previous data |
18 |
//but returns it instead |
19 |
//it ALWAYS duplicates data |
20 |
b->m_data = size; |
21 |
kroundup32(b->m_data); |
22 |
uint8_t* odata=b->data; |
23 |
b->data = (uint8_t*)malloc(b->m_data); |
24 |
memcpy((void*)b->data, (void*)odata, b->data_len); |
25 |
b->data_len=size; |
26 |
return odata; //user must FREE this after |
27 |
} |
28 |
|
29 |
//from bam_import.c |
30 |
extern unsigned short bam_char2flag_table[]; |
31 |
|
32 |
GBamRecord::GBamRecord(const char* qname, int32_t gseq_tid, |
33 |
int pos, bool reverse, const char* qseq, |
34 |
const char* cigar, const char* quals):exons(1) { |
35 |
novel=true; |
36 |
bam_header=NULL; |
37 |
b=bam_init1(); |
38 |
if (pos<=0 || gseq_tid<0) { |
39 |
b->core.pos=-1; //unmapped |
40 |
b->core.flag |= BAM_FUNMAP; |
41 |
gseq_tid=-1; |
42 |
} |
43 |
else b->core.pos=pos-1; //BAM is 0-based |
44 |
b->core.tid=gseq_tid; |
45 |
b->core.qual=255; |
46 |
b->core.mtid=-1; |
47 |
b->core.mpos=-1; |
48 |
int l_qseq=strlen(qseq); |
49 |
//this may not be accurate, setting CIGAR is the correct way |
50 |
//b->core.bin = bam_reg2bin(b->core.pos, b->core.pos+l_qseq-1); |
51 |
b->core.l_qname=strlen(qname)+1; //includes the \0 at the end |
52 |
memcpy(realloc_bdata(b, b->core.l_qname), qname, b->core.l_qname); |
53 |
set_cigar(cigar); //this will also set core.bin |
54 |
add_sequence(qseq, l_qseq); |
55 |
add_quals(quals); //quals must be given as Phred33 |
56 |
if (reverse) { b->core.flag |= BAM_FREVERSE ; } |
57 |
} |
58 |
|
59 |
GBamRecord::GBamRecord(const char* qname, int32_t flags, int32_t g_tid, |
60 |
int pos, int map_qual, const char* cigar, int32_t mg_tid, int mate_pos, |
61 |
int insert_size, const char* qseq, const char* quals, |
62 |
GVec<char*>* aux_strings):exons(1) { |
63 |
novel=true; |
64 |
bam_header=NULL; |
65 |
b=bam_init1(); |
66 |
b->core.tid=g_tid; |
67 |
b->core.pos = (pos<=0) ? -1 : pos-1; //BAM is 0-based |
68 |
b->core.qual=map_qual; |
69 |
int l_qseq=strlen(qseq); |
70 |
b->core.l_qname=strlen(qname)+1; //includes the \0 at the end |
71 |
memcpy(realloc_bdata(b, b->core.l_qname), qname, b->core.l_qname); |
72 |
set_cigar(cigar); //this will also set core.bin |
73 |
add_sequence(qseq, l_qseq); |
74 |
add_quals(quals); //quals must be given as Phred33 |
75 |
set_flags(flags); |
76 |
set_mdata(mg_tid, (int32_t)(mate_pos-1), (int32_t)insert_size); |
77 |
if (aux_strings!=NULL) { |
78 |
for (int i=0;i<aux_strings->Count();i++) { |
79 |
add_aux(aux_strings->Get(i)); |
80 |
} |
81 |
} |
82 |
} |
83 |
|
84 |
void GBamRecord::set_cigar(const char* cigar) { |
85 |
//requires b->core.pos and b->core.flag to have been set properly PRIOR to this call |
86 |
int doff=b->core.l_qname; |
87 |
uint8_t* after_cigar=NULL; |
88 |
int after_cigar_len=0; |
89 |
uint8_t* prev_bdata=NULL; |
90 |
if (b->data_len>doff) { |
91 |
//cigar string already allocated, replace it |
92 |
int d=b->core.l_qname + b->core.n_cigar * 4;//offset of after-cigar data |
93 |
after_cigar=b->data+d; |
94 |
after_cigar_len=b->data_len-d; |
95 |
} |
96 |
const char *s; |
97 |
char *t; |
98 |
int i, op; |
99 |
long x; |
100 |
b->core.n_cigar = 0; |
101 |
if (cigar != NULL && strcmp(cigar, "*") != 0) { |
102 |
for (s = cigar; *s; ++s) { |
103 |
if (isalpha(*s)) b->core.n_cigar++; |
104 |
else if (!isdigit(*s)) { |
105 |
GError("Error: invalid CIGAR character (%s)\n",cigar); |
106 |
} |
107 |
} |
108 |
if (after_cigar_len>0) { //replace/insert into existing full data |
109 |
prev_bdata=dupalloc_bdata(b, doff + b->core.n_cigar * 4 + after_cigar_len); |
110 |
memcpy((void*)(b->data+doff+b->core.n_cigar*4),(void*)after_cigar, after_cigar_len); |
111 |
free(prev_bdata); |
112 |
} |
113 |
else { |
114 |
realloc_bdata(b, doff + b->core.n_cigar * 4); |
115 |
} |
116 |
for (i = 0, s = cigar; i != b->core.n_cigar; ++i) { |
117 |
x = strtol(s, &t, 10); |
118 |
op = toupper(*t); |
119 |
if (op == 'M' || op == '=' || op == 'X') op = BAM_CMATCH; |
120 |
else if (op == 'I') op = BAM_CINS; |
121 |
else if (op == 'D') op = BAM_CDEL; |
122 |
else if (op == 'N') op = BAM_CREF_SKIP; |
123 |
else if (op == 'S') op = BAM_CSOFT_CLIP; |
124 |
else if (op == 'H') op = BAM_CHARD_CLIP; |
125 |
else if (op == 'P') op = BAM_CPAD; |
126 |
else GError("Error: invalid CIGAR operation (%s)\n",cigar); |
127 |
s = t + 1; |
128 |
bam1_cigar(b)[i] = x << BAM_CIGAR_SHIFT | op; |
129 |
} |
130 |
if (*s) GError("Error: unmatched CIGAR operation (%s)\n",cigar); |
131 |
b->core.bin = bam_reg2bin(b->core.pos, bam_calend(&b->core, bam1_cigar(b))); |
132 |
} else {//no CIGAR string given |
133 |
if (!(b->core.flag&BAM_FUNMAP)) { |
134 |
GMessage("Warning: mapped sequence without CIGAR (%s)\n", (char*)b->data); |
135 |
b->core.flag |= BAM_FUNMAP; |
136 |
} |
137 |
b->core.bin = bam_reg2bin(b->core.pos, b->core.pos + 1); |
138 |
} |
139 |
setupCoordinates(); |
140 |
} //set_cigar() |
141 |
|
142 |
void GBamRecord::add_sequence(const char* qseq, int slen) { |
143 |
//must be called AFTER set_cigar (cannot replace existing sequence for now) |
144 |
if (qseq==NULL) return; //should we ever care about this? |
145 |
if (slen<0) slen=strlen(qseq); |
146 |
int doff = b->core.l_qname + b->core.n_cigar * 4; |
147 |
if (strcmp(qseq, "*")!=0) { |
148 |
b->core.l_qseq=slen; |
149 |
if (b->core.n_cigar && b->core.l_qseq != (int32_t)bam_cigar2qlen(&b->core, bam1_cigar(b))) |
150 |
GError("Error: CIGAR and sequence length are inconsistent!(%s)\n", |
151 |
qseq); |
152 |
uint8_t* p = (uint8_t*)realloc_bdata(b, doff + (b->core.l_qseq+1)/2 + b->core.l_qseq) + doff; |
153 |
//also allocated quals memory |
154 |
memset(p, 0, (b->core.l_qseq+1)/2); |
155 |
for (int i = 0; i < b->core.l_qseq; ++i) |
156 |
p[i/2] |= bam_nt16_table[(int)qseq[i]] << 4*(1-i%2); |
157 |
} else b->core.l_qseq = 0; |
158 |
} |
159 |
|
160 |
void GBamRecord::add_quals(const char* quals) { |
161 |
//requires core.l_qseq already set |
162 |
//and must be called AFTER add_sequence(), which also allocates the memory for quals |
163 |
uint8_t* p = b->data+(b->core.l_qname + b->core.n_cigar * 4 + (b->core.l_qseq+1)/2); |
164 |
if (quals==NULL || strcmp(quals, "*") == 0) { |
165 |
for (int i=0;i < b->core.l_qseq; i++) p[i] = 0xff; |
166 |
return; |
167 |
} |
168 |
for (int i=0;i < b->core.l_qseq; i++) p[i] = quals[i]-33; |
169 |
} |
170 |
|
171 |
void GBamRecord::add_aux(const char* str) { |
172 |
//requires: being called AFTER add_quals() |
173 |
int strl=strlen(str); |
174 |
//int doff = b->core.l_qname + b->core.n_cigar*4 + (b->core.l_qseq+1)/2 + b->core.l_qseq + b->l_aux; |
175 |
//int doff0=doff; |
176 |
if (strl < 6 || str[2] != ':' || str[4] != ':') |
177 |
parse_error("missing colon in auxiliary data"); |
178 |
tag[0] = str[0]; tag[1] = str[1]; |
179 |
uint8_t atype = str[3]; |
180 |
uint8_t* adata=abuf; |
181 |
int alen=0; |
182 |
if (atype == 'A' || atype == 'a' || atype == 'c' || atype == 'C') { // c and C for backward compatibility |
183 |
atype='A'; |
184 |
alen=1; |
185 |
adata=(uint8_t*)&str[5]; |
186 |
} |
187 |
else if (atype == 'I' || atype == 'i') { |
188 |
long long x=(long long)atoll(str + 5); |
189 |
if (x < 0) { |
190 |
if (x >= -127) { |
191 |
atype='c'; |
192 |
abuf[0] = (int8_t)x; |
193 |
alen=1; |
194 |
} |
195 |
else if (x >= -32767) { |
196 |
atype = 's'; |
197 |
*(int16_t*)abuf = (int16_t)x; |
198 |
alen=2; |
199 |
} |
200 |
else { |
201 |
atype='i'; |
202 |
*(int32_t*)abuf = (int32_t)x; |
203 |
alen=4; |
204 |
if (x < -2147483648ll) |
205 |
GMessage("Parse warning: integer %lld is out of range.", x); |
206 |
} |
207 |
} else { //x >=0 |
208 |
if (x <= 255) { |
209 |
atype = 'C'; |
210 |
abuf[0] = (uint8_t)x; |
211 |
alen=1; |
212 |
} |
213 |
else if (x <= 65535) { |
214 |
atype='S'; |
215 |
*(uint16_t*)abuf = (uint16_t)x; |
216 |
alen=2; |
217 |
} |
218 |
else { |
219 |
atype='I'; |
220 |
*(uint32_t*)abuf = (uint32_t)x; |
221 |
alen=4; |
222 |
if (x > 4294967295ll) |
223 |
GMessage("Parse warning: integer %lld is out of range.", x); |
224 |
} |
225 |
} |
226 |
} //integer type |
227 |
else if (atype == 'f') { |
228 |
*(float*)abuf = (float)atof(str + 5); |
229 |
alen = sizeof(float); |
230 |
} |
231 |
else if (atype == 'd') { //? |
232 |
*(float*)abuf = (float)atof(str + 9); |
233 |
alen=8; |
234 |
} |
235 |
else if (atype == 'Z' || atype == 'H') { |
236 |
if (atype == 'H') { // check whether the hex string is valid |
237 |
if ((strl - 5) % 2 == 1) parse_error("length of the hex string not even"); |
238 |
for (int i = 0; i < strl - 5; ++i) { |
239 |
int c = toupper(str[5 + i]); |
240 |
if (!((c >= '0' && c <= '9') || (c >= 'A' && c <= 'F'))) |
241 |
parse_error("invalid hex character"); |
242 |
} |
243 |
} |
244 |
memcpy(abuf, str + 5, strl - 5); |
245 |
abuf[strl-5] = 0; |
246 |
alen=strl-4; |
247 |
} else parse_error("unrecognized aux type"); |
248 |
this->add_aux(tag, atype, alen, adata); |
249 |
}//add_aux() |
250 |
|
251 |
void interpret_CIGAR(char cop, int cl, int aln_start) { |
252 |
// NB: Closed ranges |
253 |
// gpos = current genomic position (will end up as right coordinate on the genome) |
254 |
// rpos = read position (will end up as the length of the read) |
255 |
// cop = CIGAR operation, cl = operation length |
256 |
int rpos = 0; |
257 |
int gpos = aln_start; |
258 |
int num_mismatches=0; //NM tag value = edit distance |
259 |
switch (cop) { |
260 |
case BAM_CDIFF: |
261 |
num_mismatches+=cl; |
262 |
case BAM_CMATCH: |
263 |
//have to actually check for mismatches: num_mismatches+=count_mismatches; |
264 |
case BAM_CEQUAL: |
265 |
printf("[%d-%d]", gpos, gpos + cl - 1); |
266 |
gpos+=cl; |
267 |
rpos+=cl; |
268 |
break; |
269 |
case BAM_CPAD: |
270 |
// printf("[%d-%d]", pos, pos + cl - 1); // Spans positions, No Coverage |
271 |
gpos+=cl; |
272 |
break; |
273 |
|
274 |
case BAM_CHARD_CLIP: |
275 |
// printf("[%d]", pos); // No coverage |
276 |
// gpos is not advanced by this operation |
277 |
break; |
278 |
|
279 |
case BAM_CSOFT_CLIP: |
280 |
//soft clipped bases, present in SEQ |
281 |
rpos+=cl; |
282 |
break; |
283 |
case BAM_CINS: |
284 |
// No Coverage |
285 |
// adds cl bases "throughput" but not genomic position "coverage" (gap in genomic seq) |
286 |
// should also add cl to the number of "mismatches" (unaligned read bases) |
287 |
num_mismatches+=cl; |
288 |
// How you handle this is application dependent |
289 |
// gpos is not advanced by this operation |
290 |
rpos+=cl; |
291 |
break; |
292 |
|
293 |
case BAM_CDEL: |
294 |
printf("D"); //deletion in reference sequence relative to the read (gap in read sequence) |
295 |
// printf("[%d-%d]", pos, pos + cl - 1); |
296 |
// Spans positions |
297 |
num_mismatches+=cl; |
298 |
gpos += cl; |
299 |
break; |
300 |
|
301 |
case BAM_CREF_SKIP: |
302 |
// intron |
303 |
printf("S"); |
304 |
// printf("[%d-%d]", pos, pos + cl - 1); // Spans positions, No Coverage |
305 |
//special skip operation, not contributing to "edit distance", |
306 |
// so num_mismatches is not updated |
307 |
gpos+=cl; |
308 |
break; |
309 |
|
310 |
default: |
311 |
fprintf(stderr, "Unhandled cigar_op %d:%d\n", cop, cl); |
312 |
//printf("?"); |
313 |
} |
314 |
} // interpret_CIGAR(), just a reference of CIGAR operations interpretation |
315 |
|
316 |
void GBamRecord::setupCoordinates() { |
317 |
uint32_t *cigar = bam1_cigar(b); |
318 |
const bam1_core_t *c = &b->core; |
319 |
if (c->flag & BAM_FUNMAP) return; /* skip unmapped reads */ |
320 |
int l=0; |
321 |
start=c->pos+1; //genomic start coordinate |
322 |
int exstart=c->pos; |
323 |
for (int i = 0; i < c->n_cigar; ++i) { |
324 |
int op = cigar[i]&0xf; |
325 |
if (op == BAM_CMATCH || op==BAM_CEQUAL || |
326 |
op == BAM_CDIFF || op == BAM_CDEL) { |
327 |
l += cigar[i]>>4; |
328 |
} |
329 |
else if (op == BAM_CREF_SKIP) { //N |
330 |
//intron starts |
331 |
//exon ends here |
332 |
GSeg exon(exstart+1,c->pos+l); |
333 |
exons.Add(exon); |
334 |
l += cigar[i]>>4; |
335 |
exstart=c->pos+l; |
336 |
} |
337 |
} |
338 |
GSeg exon(exstart+1,c->pos+l); |
339 |
exons.Add(exon); |
340 |
end=c->pos+l; //genomic start coordinate |
341 |
} |
342 |
/* |
343 |
int GBamRecord::find_tag(const char tag[2], uint8_t* & s, char& tag_type) { |
344 |
//position s at the beginning of tag "data" (after the type char) |
345 |
//returns the length of tag data, and tag type in tag_type |
346 |
uint8_t* aux_start=bam1_aux(b); |
347 |
s=aux_start; |
348 |
while (s < aux_start + b->l_aux - 1) { |
349 |
char key[2]; |
350 |
key[0] = (char)s[0]; key[1] = (char)s[1]; |
351 |
s += 2; tag_type = (char)*s; ++s; |
352 |
int inc=0; |
353 |
int m_inc=0; //for 'B' case |
354 |
uint8_t sub_type=0; // --,,-- |
355 |
switch (tag_type) { |
356 |
case 'A': |
357 |
case 'C': |
358 |
case 'c': |
359 |
inc=1; |
360 |
break; |
361 |
case 'S': |
362 |
case 's': |
363 |
inc=2; |
364 |
break; |
365 |
case 'I': |
366 |
case 'i': |
367 |
case 'f': |
368 |
inc=4; |
369 |
break; |
370 |
case 'd': |
371 |
inc=8; |
372 |
break; |
373 |
case 'B': |
374 |
sub_type = *(s+1); |
375 |
int32_t n; |
376 |
memcpy(&n, s+1, 4); |
377 |
inc += 5; |
378 |
//kputc(type, &str); kputc(':', &str); kputc(sub_type, &str); |
379 |
m_inc=0; |
380 |
if (sub_type == 'c' || sub_type == 'C') |
381 |
{ m_inc=1; } |
382 |
else if (sub_type == 's' || sub_type == 'S') |
383 |
{ m_inc = 2; } |
384 |
else if ('i' == sub_type || 'I' == sub_type || 'f' == sub_type) |
385 |
{ m_inc = 4; } |
386 |
if (m_inc==0) |
387 |
GError("Error: invalid 'B' array subtype (%c)!\n",sub_type); |
388 |
inc += m_inc*n; |
389 |
break; |
390 |
case 'H': |
391 |
case 'Z': |
392 |
while (*(s+inc)) ++inc; |
393 |
++inc; // to skip the terminating \0 |
394 |
break; |
395 |
} //switch (tag_type) |
396 |
if (tag[0]==key[0] && tag[1]==key[1]) { |
397 |
return inc; |
398 |
} |
399 |
s+=inc; |
400 |
}//while aux data |
401 |
return 0; |
402 |
} |
403 |
*/ |
404 |
uint8_t* GBamRecord::find_tag(const char tag[2]) { |
405 |
return bam_aux_get(this->b, tag); |
406 |
} |
407 |
|
408 |
char GBamRecord::tag_char(const char tag[2]) { //retrieve tag data as single char |
409 |
uint8_t* s=find_tag(tag); |
410 |
if (s) return ( bam_aux2A(s) ); |
411 |
return 0; |
412 |
} |
413 |
|
414 |
int GBamRecord::tag_int(const char tag[2]) { //get the numeric value of tag |
415 |
uint8_t *s=find_tag(tag); |
416 |
if (s) return ( bam_aux2i(s) ); |
417 |
return 0; |
418 |
} |
419 |
|
420 |
char* GBamRecord::tag_str(const char tag[2]) { //return string value for a tag |
421 |
uint8_t *s=find_tag(tag); |
422 |
if (s) return ( bam_aux2Z(s) ); |
423 |
return NULL; |
424 |
} |
425 |
|
426 |
char GBamRecord::spliceStrand() { // '+', '-' from the XS tag, or 0 if no XS tag |
427 |
char c=tag_char("XS"); |
428 |
if (c) return c; |
429 |
else return '.'; |
430 |
} |
431 |
|
432 |
char* GBamRecord::sequence() { //user must free this after use |
433 |
char *s = (char*)bam1_seq(b); |
434 |
char* qseq=NULL; |
435 |
GMALLOC(qseq, (b->core.l_qseq+1)); |
436 |
int i; |
437 |
for (i=0;i<(b->core.l_qseq);i++) { |
438 |
int8_t v = bam1_seqi(s,i); |
439 |
qseq[i] = bam_nt16_rev_table[v]; |
440 |
} |
441 |
qseq[i] = 0; |
442 |
return qseq; |
443 |
} |
444 |
|
445 |
char* GBamRecord::qualities() {//user must free this after use |
446 |
char *qual = (char*)bam1_qual(b); |
447 |
char* qv=NULL; |
448 |
GMALLOC(qv, (b->core.l_qseq+1) ); |
449 |
int i; |
450 |
for(i=0;i<(b->core.l_qseq);i++) { |
451 |
qv[i]=qual[i]+33; |
452 |
} |
453 |
qv[i]=0; |
454 |
return qv; |
455 |
} |
456 |
|
457 |
char* GBamRecord::cigar() { //returns text version of the CIGAR string; must be freed by user |
458 |
kstring_t str; |
459 |
str.l = str.m = 0; str.s = 0; |
460 |
if (b->core.n_cigar == 0) kputc('*', &str); |
461 |
else { |
462 |
for (int i = 0; i < b->core.n_cigar; ++i) { |
463 |
kputw(bam1_cigar(b)[i]>>BAM_CIGAR_SHIFT, &str); |
464 |
kputc("MIDNSHP=X"[bam1_cigar(b)[i]&BAM_CIGAR_MASK], &str); |
465 |
} |
466 |
} |
467 |
return str.s; |
468 |
} |