ViewVC Help
View File | Revision Log | Show Annotations | View Changeset | Root Listing
root/gclib/gclib/GBam.cpp
Revision: 310
Committed: Fri Mar 22 20:06:27 2013 UTC (6 years, 1 month ago) by gpertea
File size: 15953 byte(s)
Log Message:
sync with igm repo

Line File contents
1 #include "GBam.h"
2 #include <ctype.h>
3 #include "kstring.h"
4
5 //auxiliary functions for low level BAM record creation
6 uint8_t* realloc_bdata(bam1_t *b, int size) {
7 if (b->m_data < size) {
8 b->m_data = size;
9 kroundup32(b->m_data);
10 b->data = (uint8_t*)realloc(b->data, b->m_data);
11 }
12 if (b->data_len<size) b->data_len=size;
13 return b->data;
14 }
15
16 uint8_t* dupalloc_bdata(bam1_t *b, int size) {
17 //same as realloc_bdata, but does not free previous data
18 //but returns it instead
19 //it ALWAYS duplicates data
20 b->m_data = size;
21 kroundup32(b->m_data);
22 uint8_t* odata=b->data;
23 b->data = (uint8_t*)malloc(b->m_data);
24 memcpy((void*)b->data, (void*)odata, b->data_len);
25 b->data_len=size;
26 return odata; //user must FREE this after
27 }
28
29 //from bam_import.c
30 extern unsigned short bam_char2flag_table[];
31
32 GBamRecord::GBamRecord(const char* qname, int32_t gseq_tid,
33 int pos, bool reverse, const char* qseq,
34 const char* cigar, const char* quals):exons(1) {
35 novel=true;
36 bam_header=NULL;
37 b=bam_init1();
38 if (pos<=0 || gseq_tid<0) {
39 b->core.pos=-1; //unmapped
40 b->core.flag |= BAM_FUNMAP;
41 gseq_tid=-1;
42 }
43 else b->core.pos=pos-1; //BAM is 0-based
44 b->core.tid=gseq_tid;
45 b->core.qual=255;
46 b->core.mtid=-1;
47 b->core.mpos=-1;
48 int l_qseq=strlen(qseq);
49 //this may not be accurate, setting CIGAR is the correct way
50 //b->core.bin = bam_reg2bin(b->core.pos, b->core.pos+l_qseq-1);
51 b->core.l_qname=strlen(qname)+1; //includes the \0 at the end
52 memcpy(realloc_bdata(b, b->core.l_qname), qname, b->core.l_qname);
53 set_cigar(cigar); //this will also set core.bin
54 add_sequence(qseq, l_qseq);
55 add_quals(quals); //quals must be given as Phred33
56 if (reverse) { b->core.flag |= BAM_FREVERSE ; }
57 }
58
59 GBamRecord::GBamRecord(const char* qname, int32_t flags, int32_t g_tid,
60 int pos, int map_qual, const char* cigar, int32_t mg_tid, int mate_pos,
61 int insert_size, const char* qseq, const char* quals,
62 GVec<char*>* aux_strings):exons(1) {
63 novel=true;
64 bam_header=NULL;
65 b=bam_init1();
66 b->core.tid=g_tid;
67 b->core.pos = (pos<=0) ? -1 : pos-1; //BAM is 0-based
68 b->core.qual=map_qual;
69 int l_qseq=strlen(qseq);
70 b->core.l_qname=strlen(qname)+1; //includes the \0 at the end
71 memcpy(realloc_bdata(b, b->core.l_qname), qname, b->core.l_qname);
72 set_cigar(cigar); //this will also set core.bin
73 add_sequence(qseq, l_qseq);
74 add_quals(quals); //quals must be given as Phred33
75 set_flags(flags);
76 set_mdata(mg_tid, (int32_t)(mate_pos-1), (int32_t)insert_size);
77 if (aux_strings!=NULL) {
78 for (int i=0;i<aux_strings->Count();i++) {
79 add_aux(aux_strings->Get(i));
80 }
81 }
82 }
83
84 void GBamRecord::set_cigar(const char* cigar) {
85 //requires b->core.pos and b->core.flag to have been set properly PRIOR to this call
86 int doff=b->core.l_qname;
87 uint8_t* after_cigar=NULL;
88 int after_cigar_len=0;
89 uint8_t* prev_bdata=NULL;
90 if (b->data_len>doff) {
91 //cigar string already allocated, replace it
92 int d=b->core.l_qname + b->core.n_cigar * 4;//offset of after-cigar data
93 after_cigar=b->data+d;
94 after_cigar_len=b->data_len-d;
95 }
96 const char *s;
97 char *t;
98 int i, op;
99 long x;
100 b->core.n_cigar = 0;
101 if (cigar != NULL && strcmp(cigar, "*") != 0) {
102 for (s = cigar; *s; ++s) {
103 if (isalpha(*s)) b->core.n_cigar++;
104 else if (!isdigit(*s)) {
105 GError("Error: invalid CIGAR character (%s)\n",cigar);
106 }
107 }
108 if (after_cigar_len>0) { //replace/insert into existing full data
109 prev_bdata=dupalloc_bdata(b, doff + b->core.n_cigar * 4 + after_cigar_len);
110 memcpy((void*)(b->data+doff+b->core.n_cigar*4),(void*)after_cigar, after_cigar_len);
111 free(prev_bdata);
112 }
113 else {
114 realloc_bdata(b, doff + b->core.n_cigar * 4);
115 }
116 for (i = 0, s = cigar; i != b->core.n_cigar; ++i) {
117 x = strtol(s, &t, 10);
118 op = toupper(*t);
119 if (op == 'M' || op == '=' || op == 'X') op = BAM_CMATCH;
120 else if (op == 'I') op = BAM_CINS;
121 else if (op == 'D') op = BAM_CDEL;
122 else if (op == 'N') op = BAM_CREF_SKIP;
123 else if (op == 'S') op = BAM_CSOFT_CLIP;
124 else if (op == 'H') op = BAM_CHARD_CLIP;
125 else if (op == 'P') op = BAM_CPAD;
126 else GError("Error: invalid CIGAR operation (%s)\n",cigar);
127 s = t + 1;
128 bam1_cigar(b)[i] = x << BAM_CIGAR_SHIFT | op;
129 }
130 if (*s) GError("Error: unmatched CIGAR operation (%s)\n",cigar);
131 b->core.bin = bam_reg2bin(b->core.pos, bam_calend(&b->core, bam1_cigar(b)));
132 } else {//no CIGAR string given
133 if (!(b->core.flag&BAM_FUNMAP)) {
134 GMessage("Warning: mapped sequence without CIGAR (%s)\n", (char*)b->data);
135 b->core.flag |= BAM_FUNMAP;
136 }
137 b->core.bin = bam_reg2bin(b->core.pos, b->core.pos + 1);
138 }
139 setupCoordinates();
140 } //set_cigar()
141
142 void GBamRecord::add_sequence(const char* qseq, int slen) {
143 //must be called AFTER set_cigar (cannot replace existing sequence for now)
144 if (qseq==NULL) return; //should we ever care about this?
145 if (slen<0) slen=strlen(qseq);
146 int doff = b->core.l_qname + b->core.n_cigar * 4;
147 if (strcmp(qseq, "*")!=0) {
148 b->core.l_qseq=slen;
149 if (b->core.n_cigar && b->core.l_qseq != (int32_t)bam_cigar2qlen(&b->core, bam1_cigar(b)))
150 GError("Error: CIGAR and sequence length are inconsistent!(%s)\n",
151 qseq);
152 uint8_t* p = (uint8_t*)realloc_bdata(b, doff + (b->core.l_qseq+1)/2 + b->core.l_qseq) + doff;
153 //also allocated quals memory
154 memset(p, 0, (b->core.l_qseq+1)/2);
155 for (int i = 0; i < b->core.l_qseq; ++i)
156 p[i/2] |= bam_nt16_table[(int)qseq[i]] << 4*(1-i%2);
157 } else b->core.l_qseq = 0;
158 }
159
160 void GBamRecord::add_quals(const char* quals) {
161 //requires core.l_qseq already set
162 //and must be called AFTER add_sequence(), which also allocates the memory for quals
163 uint8_t* p = b->data+(b->core.l_qname + b->core.n_cigar * 4 + (b->core.l_qseq+1)/2);
164 if (quals==NULL || strcmp(quals, "*") == 0) {
165 for (int i=0;i < b->core.l_qseq; i++) p[i] = 0xff;
166 return;
167 }
168 for (int i=0;i < b->core.l_qseq; i++) p[i] = quals[i]-33;
169 }
170
171 void GBamRecord::add_aux(const char* str) {
172 //requires: being called AFTER add_quals()
173 int strl=strlen(str);
174 //int doff = b->core.l_qname + b->core.n_cigar*4 + (b->core.l_qseq+1)/2 + b->core.l_qseq + b->l_aux;
175 //int doff0=doff;
176 if (strl < 6 || str[2] != ':' || str[4] != ':')
177 parse_error("missing colon in auxiliary data");
178 tag[0] = str[0]; tag[1] = str[1];
179 uint8_t atype = str[3];
180 uint8_t* adata=abuf;
181 int alen=0;
182 if (atype == 'A' || atype == 'a' || atype == 'c' || atype == 'C') { // c and C for backward compatibility
183 atype='A';
184 alen=1;
185 adata=(uint8_t*)&str[5];
186 }
187 else if (atype == 'I' || atype == 'i') {
188
189 long long x=strtoll(str+5, NULL, 10); //(long long)atoll(str + 5);
190 //long x=(long)atol(str + 5);
191 if (x < 0) {
192 if (x >= -127) {
193 atype='c';
194 abuf[0] = (int8_t)x;
195 alen=1;
196 }
197 else if (x >= -32767) {
198 atype = 's';
199 *(int16_t*)abuf = (int16_t)x;
200 alen=2;
201 }
202 else {
203 atype='i';
204 *(int32_t*)abuf = (int32_t)x;
205 alen=4;
206 if (x < -2147483648ll)
207 GMessage("Parse warning: integer %lld is out of range.", x);
208 }
209 } else { //x >=0
210 if (x <= 255) {
211 atype = 'C';
212 abuf[0] = (uint8_t)x;
213 alen=1;
214 }
215 else if (x <= 65535) {
216 atype='S';
217 *(uint16_t*)abuf = (uint16_t)x;
218 alen=2;
219 }
220 else {
221 atype='I';
222 *(uint32_t*)abuf = (uint32_t)x;
223 alen=4;
224 if (x > 4294967295ll)
225 GMessage("Parse warning: integer %lld is out of range.", x);
226 }
227 }
228 } //integer type
229 else if (atype == 'f') {
230 *(float*)abuf = (float)atof(str + 5);
231 alen = sizeof(float);
232 }
233 else if (atype == 'd') { //?
234 *(float*)abuf = (float)atof(str + 9);
235 alen=8;
236 }
237 else if (atype == 'Z' || atype == 'H') {
238 if (atype == 'H') { // check whether the hex string is valid
239 if ((strl - 5) % 2 == 1) parse_error("length of the hex string not even");
240 for (int i = 0; i < strl - 5; ++i) {
241 int c = toupper(str[5 + i]);
242 if (!((c >= '0' && c <= '9') || (c >= 'A' && c <= 'F')))
243 parse_error("invalid hex character");
244 }
245 }
246 memcpy(abuf, str + 5, strl - 5);
247 abuf[strl-5] = 0;
248 alen=strl-4;
249 } else parse_error("unrecognized aux type");
250 this->add_aux(tag, atype, alen, adata);
251 }//add_aux()
252
253 void interpret_CIGAR(char cop, int cl, int aln_start) {
254 // NB: Closed ranges
255 // gpos = current genomic position (will end up as right coordinate on the genome)
256 // rpos = read position (will end up as the length of the read)
257 // cop = CIGAR operation, cl = operation length
258 int rpos = 0;
259 int gpos = aln_start;
260 int num_mismatches=0; //NM tag value = edit distance
261 switch (cop) {
262 case BAM_CDIFF:
263 num_mismatches+=cl;
264 case BAM_CMATCH:
265 //have to actually check for mismatches: num_mismatches+=count_mismatches;
266 case BAM_CEQUAL:
267 printf("[%d-%d]", gpos, gpos + cl - 1);
268 gpos+=cl;
269 rpos+=cl;
270 break;
271 case BAM_CPAD:
272 // printf("[%d-%d]", pos, pos + cl - 1); // Spans positions, No Coverage
273 gpos+=cl;
274 break;
275
276 case BAM_CHARD_CLIP:
277 // printf("[%d]", pos); // No coverage
278 // gpos is not advanced by this operation
279 break;
280
281 case BAM_CSOFT_CLIP:
282 //soft clipped bases, present in SEQ
283 rpos+=cl;
284 break;
285 case BAM_CINS:
286 // No Coverage
287 // adds cl bases "throughput" but not genomic position "coverage" (gap in genomic seq)
288 // should also add cl to the number of "mismatches" (unaligned read bases)
289 num_mismatches+=cl;
290 // How you handle this is application dependent
291 // gpos is not advanced by this operation
292 rpos+=cl;
293 break;
294
295 case BAM_CDEL:
296 printf("D"); //deletion in reference sequence relative to the read (gap in read sequence)
297 // printf("[%d-%d]", pos, pos + cl - 1);
298 // Spans positions
299 num_mismatches+=cl;
300 gpos += cl;
301 break;
302
303 case BAM_CREF_SKIP:
304 // intron
305 printf("S");
306 // printf("[%d-%d]", pos, pos + cl - 1); // Spans positions, No Coverage
307 //special skip operation, not contributing to "edit distance",
308 // so num_mismatches is not updated
309 gpos+=cl;
310 break;
311
312 default:
313 fprintf(stderr, "Unhandled cigar_op %d:%d\n", cop, cl);
314 //printf("?");
315 }
316 } // interpret_CIGAR(), just a reference of CIGAR operations interpretation
317
318 void GBamRecord::setupCoordinates() {
319 uint32_t *cigar = bam1_cigar(b);
320 const bam1_core_t *c = &b->core;
321 if (c->flag & BAM_FUNMAP) return; /* skip unmapped reads */
322 int l=0;
323 start=c->pos+1; //genomic start coordinate, 1-based (BAM core.pos is 0-based)
324 int exstart=c->pos;
325 for (int i = 0; i < c->n_cigar; ++i) {
326 int op = cigar[i]&0xf;
327 if (op == BAM_CMATCH || op==BAM_CEQUAL ||
328 op == BAM_CDIFF || op == BAM_CDEL) {
329 l += cigar[i]>>4;
330 }
331 else if (op == BAM_CREF_SKIP) { //N
332 //intron starts
333 //exon ends here
334 GSeg exon(exstart+1,c->pos+l);
335 exons.Add(exon);
336 l += cigar[i]>>4;
337 exstart=c->pos+l;
338 }
339 }
340 GSeg exon(exstart+1,c->pos+l);
341 exons.Add(exon);
342 end=c->pos+l; //genomic start coordinate
343 }
344 /*
345 int GBamRecord::find_tag(const char tag[2], uint8_t* & s, char& tag_type) {
346 //position s at the beginning of tag "data" (after the type char)
347 //returns the length of tag data, and tag type in tag_type
348 uint8_t* aux_start=bam1_aux(b);
349 s=aux_start;
350 while (s < aux_start + b->l_aux - 1) {
351 char key[2];
352 key[0] = (char)s[0]; key[1] = (char)s[1];
353 s += 2; tag_type = (char)*s; ++s;
354 int inc=0;
355 int m_inc=0; //for 'B' case
356 uint8_t sub_type=0; // --,,--
357 switch (tag_type) {
358 case 'A':
359 case 'C':
360 case 'c':
361 inc=1;
362 break;
363 case 'S':
364 case 's':
365 inc=2;
366 break;
367 case 'I':
368 case 'i':
369 case 'f':
370 inc=4;
371 break;
372 case 'd':
373 inc=8;
374 break;
375 case 'B':
376 sub_type = *(s+1);
377 int32_t n;
378 memcpy(&n, s+1, 4);
379 inc += 5;
380 //kputc(type, &str); kputc(':', &str); kputc(sub_type, &str);
381 m_inc=0;
382 if (sub_type == 'c' || sub_type == 'C')
383 { m_inc=1; }
384 else if (sub_type == 's' || sub_type == 'S')
385 { m_inc = 2; }
386 else if ('i' == sub_type || 'I' == sub_type || 'f' == sub_type)
387 { m_inc = 4; }
388 if (m_inc==0)
389 GError("Error: invalid 'B' array subtype (%c)!\n",sub_type);
390 inc += m_inc*n;
391 break;
392 case 'H':
393 case 'Z':
394 while (*(s+inc)) ++inc;
395 ++inc; // to skip the terminating \0
396 break;
397 } //switch (tag_type)
398 if (tag[0]==key[0] && tag[1]==key[1]) {
399 return inc;
400 }
401 s+=inc;
402 }//while aux data
403 return 0;
404 }
405 */
406 uint8_t* GBamRecord::find_tag(const char tag[2]) {
407 return bam_aux_get(this->b, tag);
408 }
409
410 char GBamRecord::tag_char(const char tag[2]) { //retrieve tag data as single char
411 uint8_t* s=find_tag(tag);
412 if (s) return ( bam_aux2A(s) );
413 return 0;
414 }
415
416 int GBamRecord::tag_int(const char tag[2]) { //get the numeric value of tag
417 uint8_t *s=find_tag(tag);
418 if (s) return ( bam_aux2i(s) );
419 return 0;
420 }
421
422 char* GBamRecord::tag_str(const char tag[2]) { //return string value for a tag
423 uint8_t *s=find_tag(tag);
424 if (s) return ( bam_aux2Z(s) );
425 return NULL;
426 }
427
428 char GBamRecord::spliceStrand() { // '+', '-' from the XS tag, or 0 if no XS tag
429 char c=tag_char("XS");
430 if (c) return c;
431 else return '.';
432 }
433
434 char* GBamRecord::sequence() { //user must free this after use
435 char *s = (char*)bam1_seq(b);
436 char* qseq=NULL;
437 GMALLOC(qseq, (b->core.l_qseq+1));
438 int i;
439 for (i=0;i<(b->core.l_qseq);i++) {
440 int8_t v = bam1_seqi(s,i);
441 qseq[i] = bam_nt16_rev_table[v];
442 }
443 qseq[i] = 0;
444 return qseq;
445 }
446
447 char* GBamRecord::qualities() {//user must free this after use
448 char *qual = (char*)bam1_qual(b);
449 char* qv=NULL;
450 GMALLOC(qv, (b->core.l_qseq+1) );
451 int i;
452 for(i=0;i<(b->core.l_qseq);i++) {
453 qv[i]=qual[i]+33;
454 }
455 qv[i]=0;
456 return qv;
457 }
458
459 char* GBamRecord::cigar() { //returns text version of the CIGAR string; must be freed by user
460 kstring_t str;
461 str.l = str.m = 0; str.s = 0;
462 if (b->core.n_cigar == 0) kputc('*', &str);
463 else {
464 for (int i = 0; i < b->core.n_cigar; ++i) {
465 kputw(bam1_cigar(b)[i]>>BAM_CIGAR_SHIFT, &str);
466 kputc("MIDNSHP=X"[bam1_cigar(b)[i]&BAM_CIGAR_MASK], &str);
467 }
468 }
469 return str.s;
470 }