ViewVC Help
View File | Revision Log | Show Annotations | View Changeset | Root Listing
root/gclib/gclib/GBam.cpp
Revision: 235
Committed: Mon Apr 23 21:31:55 2012 UTC (7 years, 2 months ago) by gpertea
File size: 15868 byte(s)
Log Message:
some clarifications in CIGAR operation parsing

Line User Rev File contents
1 gpertea 121 #include "GBam.h"
2     #include <ctype.h>
3 gpertea 124 #include "kstring.h"
4 gpertea 121
5 gpertea 133 //auxiliary functions for low level BAM record creation
6 gpertea 121 uint8_t* realloc_bdata(bam1_t *b, int size) {
7     if (b->m_data < size) {
8     b->m_data = size;
9     kroundup32(b->m_data);
10     b->data = (uint8_t*)realloc(b->data, b->m_data);
11     }
12     if (b->data_len<size) b->data_len=size;
13     return b->data;
14     }
15    
16     uint8_t* dupalloc_bdata(bam1_t *b, int size) {
17     //same as realloc_bdata, but does not free previous data
18     //but returns it instead
19     //it ALWAYS duplicates data
20     b->m_data = size;
21     kroundup32(b->m_data);
22     uint8_t* odata=b->data;
23     b->data = (uint8_t*)malloc(b->m_data);
24     memcpy((void*)b->data, (void*)odata, b->data_len);
25     b->data_len=size;
26     return odata; //user must FREE this after
27     }
28    
29     //from bam_import.c
30     extern unsigned short bam_char2flag_table[];
31    
32     GBamRecord::GBamRecord(const char* qname, int32_t gseq_tid,
33 gpertea 124 int pos, bool reverse, const char* qseq,
34     const char* cigar, const char* quals):exons(1) {
35 gpertea 225 novel=true;
36 gpertea 121 bam_header=NULL;
37     b=bam_init1();
38 gpertea 225 if (pos<=0 || gseq_tid<0) {
39 gpertea 121 b->core.pos=-1; //unmapped
40     b->core.flag |= BAM_FUNMAP;
41 gpertea 225 gseq_tid=-1;
42 gpertea 121 }
43     else b->core.pos=pos-1; //BAM is 0-based
44 gpertea 225 b->core.tid=gseq_tid;
45 gpertea 121 b->core.qual=255;
46 gpertea 225 b->core.mtid=-1;
47     b->core.mpos=-1;
48 gpertea 121 int l_qseq=strlen(qseq);
49     //this may not be accurate, setting CIGAR is the correct way
50     //b->core.bin = bam_reg2bin(b->core.pos, b->core.pos+l_qseq-1);
51     b->core.l_qname=strlen(qname)+1; //includes the \0 at the end
52     memcpy(realloc_bdata(b, b->core.l_qname), qname, b->core.l_qname);
53     set_cigar(cigar); //this will also set core.bin
54     add_sequence(qseq, l_qseq);
55     add_quals(quals); //quals must be given as Phred33
56     if (reverse) { b->core.flag |= BAM_FREVERSE ; }
57     }
58    
59     GBamRecord::GBamRecord(const char* qname, int32_t flags, int32_t g_tid,
60     int pos, int map_qual, const char* cigar, int32_t mg_tid, int mate_pos,
61     int insert_size, const char* qseq, const char* quals,
62 gpertea 124 GVec<char*>* aux_strings):exons(1) {
63 gpertea 225 novel=true;
64 gpertea 121 bam_header=NULL;
65     b=bam_init1();
66     b->core.tid=g_tid;
67     b->core.pos = (pos<=0) ? -1 : pos-1; //BAM is 0-based
68     b->core.qual=map_qual;
69     int l_qseq=strlen(qseq);
70     b->core.l_qname=strlen(qname)+1; //includes the \0 at the end
71     memcpy(realloc_bdata(b, b->core.l_qname), qname, b->core.l_qname);
72     set_cigar(cigar); //this will also set core.bin
73     add_sequence(qseq, l_qseq);
74     add_quals(quals); //quals must be given as Phred33
75     set_flags(flags);
76     set_mdata(mg_tid, (int32_t)(mate_pos-1), (int32_t)insert_size);
77     if (aux_strings!=NULL) {
78     for (int i=0;i<aux_strings->Count();i++) {
79     add_aux(aux_strings->Get(i));
80     }
81     }
82     }
83 gpertea 124
84 gpertea 121 void GBamRecord::set_cigar(const char* cigar) {
85     //requires b->core.pos and b->core.flag to have been set properly PRIOR to this call
86     int doff=b->core.l_qname;
87     uint8_t* after_cigar=NULL;
88     int after_cigar_len=0;
89     uint8_t* prev_bdata=NULL;
90     if (b->data_len>doff) {
91     //cigar string already allocated, replace it
92     int d=b->core.l_qname + b->core.n_cigar * 4;//offset of after-cigar data
93     after_cigar=b->data+d;
94     after_cigar_len=b->data_len-d;
95     }
96     const char *s;
97     char *t;
98     int i, op;
99     long x;
100     b->core.n_cigar = 0;
101     if (cigar != NULL && strcmp(cigar, "*") != 0) {
102     for (s = cigar; *s; ++s) {
103     if (isalpha(*s)) b->core.n_cigar++;
104     else if (!isdigit(*s)) {
105     GError("Error: invalid CIGAR character (%s)\n",cigar);
106     }
107     }
108     if (after_cigar_len>0) { //replace/insert into existing full data
109     prev_bdata=dupalloc_bdata(b, doff + b->core.n_cigar * 4 + after_cigar_len);
110     memcpy((void*)(b->data+doff+b->core.n_cigar*4),(void*)after_cigar, after_cigar_len);
111     free(prev_bdata);
112     }
113     else {
114     realloc_bdata(b, doff + b->core.n_cigar * 4);
115     }
116     for (i = 0, s = cigar; i != b->core.n_cigar; ++i) {
117     x = strtol(s, &t, 10);
118     op = toupper(*t);
119     if (op == 'M' || op == '=' || op == 'X') op = BAM_CMATCH;
120     else if (op == 'I') op = BAM_CINS;
121     else if (op == 'D') op = BAM_CDEL;
122     else if (op == 'N') op = BAM_CREF_SKIP;
123     else if (op == 'S') op = BAM_CSOFT_CLIP;
124     else if (op == 'H') op = BAM_CHARD_CLIP;
125     else if (op == 'P') op = BAM_CPAD;
126     else GError("Error: invalid CIGAR operation (%s)\n",cigar);
127     s = t + 1;
128     bam1_cigar(b)[i] = x << BAM_CIGAR_SHIFT | op;
129     }
130     if (*s) GError("Error: unmatched CIGAR operation (%s)\n",cigar);
131     b->core.bin = bam_reg2bin(b->core.pos, bam_calend(&b->core, bam1_cigar(b)));
132     } else {//no CIGAR string given
133     if (!(b->core.flag&BAM_FUNMAP)) {
134     GMessage("Warning: mapped sequence without CIGAR (%s)\n", (char*)b->data);
135     b->core.flag |= BAM_FUNMAP;
136     }
137     b->core.bin = bam_reg2bin(b->core.pos, b->core.pos + 1);
138     }
139 gpertea 124 setupCoordinates();
140 gpertea 121 } //set_cigar()
141    
142     void GBamRecord::add_sequence(const char* qseq, int slen) {
143     //must be called AFTER set_cigar (cannot replace existing sequence for now)
144     if (qseq==NULL) return; //should we ever care about this?
145     if (slen<0) slen=strlen(qseq);
146     int doff = b->core.l_qname + b->core.n_cigar * 4;
147     if (strcmp(qseq, "*")!=0) {
148     b->core.l_qseq=slen;
149     if (b->core.n_cigar && b->core.l_qseq != (int32_t)bam_cigar2qlen(&b->core, bam1_cigar(b)))
150     GError("Error: CIGAR and sequence length are inconsistent!(%s)\n",
151     qseq);
152     uint8_t* p = (uint8_t*)realloc_bdata(b, doff + (b->core.l_qseq+1)/2 + b->core.l_qseq) + doff;
153     //also allocated quals memory
154     memset(p, 0, (b->core.l_qseq+1)/2);
155     for (int i = 0; i < b->core.l_qseq; ++i)
156     p[i/2] |= bam_nt16_table[(int)qseq[i]] << 4*(1-i%2);
157     } else b->core.l_qseq = 0;
158     }
159    
160     void GBamRecord::add_quals(const char* quals) {
161     //requires core.l_qseq already set
162     //and must be called AFTER add_sequence(), which also allocates the memory for quals
163     uint8_t* p = b->data+(b->core.l_qname + b->core.n_cigar * 4 + (b->core.l_qseq+1)/2);
164     if (quals==NULL || strcmp(quals, "*") == 0) {
165     for (int i=0;i < b->core.l_qseq; i++) p[i] = 0xff;
166     return;
167     }
168     for (int i=0;i < b->core.l_qseq; i++) p[i] = quals[i]-33;
169     }
170    
171     void GBamRecord::add_aux(const char* str) {
172     //requires: being called AFTER add_quals()
173     int strl=strlen(str);
174     //int doff = b->core.l_qname + b->core.n_cigar*4 + (b->core.l_qseq+1)/2 + b->core.l_qseq + b->l_aux;
175     //int doff0=doff;
176     if (strl < 6 || str[2] != ':' || str[4] != ':')
177     parse_error("missing colon in auxiliary data");
178     tag[0] = str[0]; tag[1] = str[1];
179     uint8_t atype = str[3];
180     uint8_t* adata=abuf;
181     int alen=0;
182     if (atype == 'A' || atype == 'a' || atype == 'c' || atype == 'C') { // c and C for backward compatibility
183     atype='A';
184     alen=1;
185     adata=(uint8_t*)&str[5];
186     }
187     else if (atype == 'I' || atype == 'i') {
188     long long x=(long long)atoll(str + 5);
189     if (x < 0) {
190     if (x >= -127) {
191     atype='c';
192     abuf[0] = (int8_t)x;
193     alen=1;
194     }
195     else if (x >= -32767) {
196     atype = 's';
197     *(int16_t*)abuf = (int16_t)x;
198     alen=2;
199     }
200     else {
201     atype='i';
202     *(int32_t*)abuf = (int32_t)x;
203     alen=4;
204     if (x < -2147483648ll)
205     GMessage("Parse warning: integer %lld is out of range.", x);
206     }
207     } else { //x >=0
208     if (x <= 255) {
209     atype = 'C';
210     abuf[0] = (uint8_t)x;
211     alen=1;
212     }
213     else if (x <= 65535) {
214     atype='S';
215     *(uint16_t*)abuf = (uint16_t)x;
216     alen=2;
217     }
218     else {
219     atype='I';
220     *(uint32_t*)abuf = (uint32_t)x;
221     alen=4;
222     if (x > 4294967295ll)
223     GMessage("Parse warning: integer %lld is out of range.", x);
224     }
225     }
226     } //integer type
227     else if (atype == 'f') {
228     *(float*)abuf = (float)atof(str + 5);
229     alen = sizeof(float);
230     }
231     else if (atype == 'd') { //?
232     *(float*)abuf = (float)atof(str + 9);
233     alen=8;
234     }
235     else if (atype == 'Z' || atype == 'H') {
236     if (atype == 'H') { // check whether the hex string is valid
237     if ((strl - 5) % 2 == 1) parse_error("length of the hex string not even");
238     for (int i = 0; i < strl - 5; ++i) {
239     int c = toupper(str[5 + i]);
240     if (!((c >= '0' && c <= '9') || (c >= 'A' && c <= 'F')))
241     parse_error("invalid hex character");
242     }
243     }
244     memcpy(abuf, str + 5, strl - 5);
245     abuf[strl-5] = 0;
246     alen=strl-4;
247     } else parse_error("unrecognized aux type");
248     this->add_aux(tag, atype, alen, adata);
249     }//add_aux()
250    
251 gpertea 235 void interpret_CIGAR(char cop, int cl, int aln_start) {
252     // NB: Closed ranges
253     // gpos = current genomic position (will end up as right coordinate on the genome)
254     // rpos = read position (will end up as the length of the read)
255     // cop = CIGAR operation, cl = operation length
256     int rpos = 0;
257     int gpos = aln_start;
258     int num_mismatches=0; //NM tag value = edit distance
259     switch (cop) {
260     case BAM_CDIFF:
261     num_mismatches+=cl;
262     case BAM_CMATCH:
263     //have to actually check for mismatches: num_mismatches+=count_mismatches;
264     case BAM_CEQUAL:
265     printf("[%d-%d]", gpos, gpos + cl - 1);
266     gpos+=cl;
267     rpos+=cl;
268     break;
269     case BAM_CPAD:
270     // printf("[%d-%d]", pos, pos + cl - 1); // Spans positions, No Coverage
271     gpos+=cl;
272     break;
273 gpertea 121
274 gpertea 235 case BAM_CHARD_CLIP:
275     // printf("[%d]", pos); // No coverage
276     // gpos is not advanced by this operation
277     break;
278    
279     case BAM_CSOFT_CLIP:
280     //soft clipped bases, present in SEQ
281     rpos+=cl;
282     break;
283     case BAM_CINS:
284     // No Coverage
285     // adds cl bases "throughput" but not genomic position "coverage" (gap in genomic seq)
286     // should also add cl to the number of "mismatches" (unaligned read bases)
287     num_mismatches+=cl;
288     // How you handle this is application dependent
289     // gpos is not advanced by this operation
290     rpos+=cl;
291     break;
292    
293     case BAM_CDEL:
294     printf("D"); //deletion in reference sequence relative to the read (gap in read sequence)
295     // printf("[%d-%d]", pos, pos + cl - 1);
296     // Spans positions
297     num_mismatches+=cl;
298     gpos += cl;
299     break;
300    
301     case BAM_CREF_SKIP:
302     // intron
303     printf("S");
304     // printf("[%d-%d]", pos, pos + cl - 1); // Spans positions, No Coverage
305     //special skip operation, not contributing to "edit distance",
306     // so num_mismatches is not updated
307     gpos+=cl;
308     break;
309    
310     default:
311     fprintf(stderr, "Unhandled cigar_op %d:%d\n", cop, cl);
312     //printf("?");
313     }
314     } // interpret_CIGAR(), just a reference of CIGAR operations interpretation
315    
316 gpertea 121 void GBamRecord::setupCoordinates() {
317     uint32_t *cigar = bam1_cigar(b);
318     const bam1_core_t *c = &b->core;
319 gpertea 124 if (c->flag & BAM_FUNMAP) return; /* skip unmapped reads */
320 gpertea 121 int l=0;
321 gpertea 235 start=c->pos+1; //genomic start coordinate
322 gpertea 124 int exstart=c->pos;
323     for (int i = 0; i < c->n_cigar; ++i) {
324     int op = cigar[i]&0xf;
325     if (op == BAM_CMATCH || op==BAM_CEQUAL ||
326     op == BAM_CDIFF || op == BAM_CDEL) {
327 gpertea 121 l += cigar[i]>>4;
328 gpertea 124 }
329     else if (op == BAM_CREF_SKIP) { //N
330     //intron starts
331     //exon ends here
332 gpertea 130 GSeg exon(exstart+1,c->pos+l);
333 gpertea 126 exons.Add(exon);
334 gpertea 124 l += cigar[i]>>4;
335     exstart=c->pos+l;
336     }
337     }
338 gpertea 130 GSeg exon(exstart+1,c->pos+l);
339 gpertea 126 exons.Add(exon);
340 gpertea 235 end=c->pos+l; //genomic start coordinate
341 gpertea 121 }
342 gpertea 234 /*
343 gpertea 124 int GBamRecord::find_tag(const char tag[2], uint8_t* & s, char& tag_type) {
344     //position s at the beginning of tag "data" (after the type char)
345     //returns the length of tag data, and tag type in tag_type
346 gpertea 214 uint8_t* aux_start=bam1_aux(b);
347     s=aux_start;
348     while (s < aux_start + b->l_aux - 1) {
349 gpertea 124 char key[2];
350     key[0] = (char)s[0]; key[1] = (char)s[1];
351     s += 2; tag_type = (char)*s; ++s;
352     int inc=0;
353     int m_inc=0; //for 'B' case
354     uint8_t sub_type=0; // --,,--
355     switch (tag_type) {
356     case 'A':
357     case 'C':
358     case 'c':
359     inc=1;
360     break;
361     case 'S':
362     case 's':
363     inc=2;
364     break;
365     case 'I':
366     case 'i':
367     case 'f':
368     inc=4;
369     break;
370     case 'd':
371     inc=8;
372     break;
373     case 'B':
374     sub_type = *(s+1);
375     int32_t n;
376     memcpy(&n, s+1, 4);
377     inc += 5;
378     //kputc(type, &str); kputc(':', &str); kputc(sub_type, &str);
379     m_inc=0;
380     if (sub_type == 'c' || sub_type == 'C')
381     { m_inc=1; }
382     else if (sub_type == 's' || sub_type == 'S')
383     { m_inc = 2; }
384     else if ('i' == sub_type || 'I' == sub_type || 'f' == sub_type)
385     { m_inc = 4; }
386 gpertea 214 if (m_inc==0)
387 gpertea 232 GError("Error: invalid 'B' array subtype (%c)!\n",sub_type);
388 gpertea 124 inc += m_inc*n;
389     break;
390     case 'H':
391     case 'Z':
392     while (*(s+inc)) ++inc;
393 gpertea 232 ++inc; // to skip the terminating \0
394 gpertea 124 break;
395     } //switch (tag_type)
396 gpertea 232 if (tag[0]==key[0] && tag[1]==key[1]) {
397 gpertea 124 return inc;
398 gpertea 232 }
399 gpertea 127 s+=inc;
400 gpertea 124 }//while aux data
401     return 0;
402     }
403 gpertea 234 */
404     uint8_t* GBamRecord::find_tag(const char tag[2]) {
405     return bam_aux_get(this->b, tag);
406     }
407 gpertea 124
408     char GBamRecord::tag_char(const char tag[2]) { //retrieve tag data as single char
409 gpertea 234 uint8_t* s=find_tag(tag);
410     if (s) return ( bam_aux2A(s) );
411     return 0;
412 gpertea 124 }
413    
414     int GBamRecord::tag_int(const char tag[2]) { //get the numeric value of tag
415 gpertea 234 uint8_t *s=find_tag(tag);
416     if (s) return ( bam_aux2i(s) );
417 gpertea 124 return 0;
418     }
419    
420     char* GBamRecord::tag_str(const char tag[2]) { //return string value for a tag
421 gpertea 234 uint8_t *s=find_tag(tag);
422     if (s) return ( bam_aux2Z(s) );
423     return NULL;
424 gpertea 124 }
425    
426     char GBamRecord::spliceStrand() { // '+', '-' from the XS tag, or 0 if no XS tag
427 gpertea 234 char c=tag_char("XS");
428     if (c) return c;
429     else return '.';
430 gpertea 124 }
431    
432     char* GBamRecord::sequence() { //user must free this after use
433     char *s = (char*)bam1_seq(b);
434     char* qseq=NULL;
435     GMALLOC(qseq, (b->core.l_qseq+1));
436     int i;
437     for (i=0;i<(b->core.l_qseq);i++) {
438     int8_t v = bam1_seqi(s,i);
439     qseq[i] = bam_nt16_rev_table[v];
440     }
441     qseq[i] = 0;
442     return qseq;
443     }
444    
445     char* GBamRecord::qualities() {//user must free this after use
446     char *qual = (char*)bam1_qual(b);
447     char* qv=NULL;
448     GMALLOC(qv, (b->core.l_qseq+1) );
449     int i;
450     for(i=0;i<(b->core.l_qseq);i++) {
451     qv[i]=qual[i]+33;
452     }
453     qv[i]=0;
454     return qv;
455     }
456    
457     char* GBamRecord::cigar() { //returns text version of the CIGAR string; must be freed by user
458     kstring_t str;
459     str.l = str.m = 0; str.s = 0;
460     if (b->core.n_cigar == 0) kputc('*', &str);
461     else {
462     for (int i = 0; i < b->core.n_cigar; ++i) {
463     kputw(bam1_cigar(b)[i]>>BAM_CIGAR_SHIFT, &str);
464     kputc("MIDNSHP=X"[bam1_cigar(b)[i]&BAM_CIGAR_MASK], &str);
465     }
466     }
467     return str.s;
468     }