ViewVC Help
View File | Revision Log | Show Annotations | View Changeset | Root Listing
root/gclib/gclib/GBam.cpp
Revision: 121
Committed: Mon Dec 5 22:18:48 2011 UTC (7 years, 7 months ago) by gpertea
File size: 9818 byte(s)
Log Message:
added GBam BAM reader/writer classes (wip)

Line User Rev File contents
1 gpertea 121 #include "GBam.h"
2     #include <ctype.h>
3    
4    
5     //auxiliary functions for BAM record handling
6     uint8_t* realloc_bdata(bam1_t *b, int size) {
7     if (b->m_data < size) {
8     b->m_data = size;
9     kroundup32(b->m_data);
10     b->data = (uint8_t*)realloc(b->data, b->m_data);
11     }
12     if (b->data_len<size) b->data_len=size;
13     return b->data;
14     }
15    
16     uint8_t* dupalloc_bdata(bam1_t *b, int size) {
17     //same as realloc_bdata, but does not free previous data
18     //but returns it instead
19     //it ALWAYS duplicates data
20     b->m_data = size;
21     kroundup32(b->m_data);
22     uint8_t* odata=b->data;
23     b->data = (uint8_t*)malloc(b->m_data);
24     memcpy((void*)b->data, (void*)odata, b->data_len);
25     b->data_len=size;
26     return odata; //user must FREE this after
27     }
28    
29     //from bam_import.c
30     extern unsigned short bam_char2flag_table[];
31    
32     GBamRecord::GBamRecord(const char* qname, int32_t gseq_tid,
33     int pos, bool reverse, const char* qseq, const char* cigar, const char* quals) {
34     novel=true;
35     bam_header=NULL;
36     b=bam_init1();
37     b->core.tid=gseq_tid;
38     if (pos<=0) {
39     b->core.pos=-1; //unmapped
40     //if (gseq_tid<0)
41     b->core.flag |= BAM_FUNMAP;
42     }
43     else b->core.pos=pos-1; //BAM is 0-based
44     b->core.qual=255;
45     int l_qseq=strlen(qseq);
46     //this may not be accurate, setting CIGAR is the correct way
47     //b->core.bin = bam_reg2bin(b->core.pos, b->core.pos+l_qseq-1);
48     b->core.l_qname=strlen(qname)+1; //includes the \0 at the end
49     memcpy(realloc_bdata(b, b->core.l_qname), qname, b->core.l_qname);
50     set_cigar(cigar); //this will also set core.bin
51     add_sequence(qseq, l_qseq);
52     add_quals(quals); //quals must be given as Phred33
53     if (reverse) { b->core.flag |= BAM_FREVERSE ; }
54     }
55    
56     GBamRecord::GBamRecord(const char* qname, int32_t flags, int32_t g_tid,
57     int pos, int map_qual, const char* cigar, int32_t mg_tid, int mate_pos,
58     int insert_size, const char* qseq, const char* quals,
59     GVec<char*>* aux_strings) {
60     novel=true;
61     bam_header=NULL;
62     b=bam_init1();
63     b->core.tid=g_tid;
64     b->core.pos = (pos<=0) ? -1 : pos-1; //BAM is 0-based
65     b->core.qual=map_qual;
66     int l_qseq=strlen(qseq);
67     b->core.l_qname=strlen(qname)+1; //includes the \0 at the end
68     memcpy(realloc_bdata(b, b->core.l_qname), qname, b->core.l_qname);
69     set_cigar(cigar); //this will also set core.bin
70     add_sequence(qseq, l_qseq);
71     add_quals(quals); //quals must be given as Phred33
72     set_flags(flags);
73     set_mdata(mg_tid, (int32_t)(mate_pos-1), (int32_t)insert_size);
74     if (aux_strings!=NULL) {
75     for (int i=0;i<aux_strings->Count();i++) {
76     add_aux(aux_strings->Get(i));
77     }
78     }
79     }
80     void GBamRecord::set_cigar(const char* cigar) {
81     //requires b->core.pos and b->core.flag to have been set properly PRIOR to this call
82     int doff=b->core.l_qname;
83     uint8_t* after_cigar=NULL;
84     int after_cigar_len=0;
85     uint8_t* prev_bdata=NULL;
86     if (b->data_len>doff) {
87     //cigar string already allocated, replace it
88     int d=b->core.l_qname + b->core.n_cigar * 4;//offset of after-cigar data
89     after_cigar=b->data+d;
90     after_cigar_len=b->data_len-d;
91     }
92     const char *s;
93     char *t;
94     int i, op;
95     long x;
96     b->core.n_cigar = 0;
97     if (cigar != NULL && strcmp(cigar, "*") != 0) {
98     for (s = cigar; *s; ++s) {
99     if (isalpha(*s)) b->core.n_cigar++;
100     else if (!isdigit(*s)) {
101     GError("Error: invalid CIGAR character (%s)\n",cigar);
102     }
103     }
104     if (after_cigar_len>0) { //replace/insert into existing full data
105     prev_bdata=dupalloc_bdata(b, doff + b->core.n_cigar * 4 + after_cigar_len);
106     memcpy((void*)(b->data+doff+b->core.n_cigar*4),(void*)after_cigar, after_cigar_len);
107     free(prev_bdata);
108     }
109     else {
110     realloc_bdata(b, doff + b->core.n_cigar * 4);
111     }
112     for (i = 0, s = cigar; i != b->core.n_cigar; ++i) {
113     x = strtol(s, &t, 10);
114     op = toupper(*t);
115     if (op == 'M' || op == '=' || op == 'X') op = BAM_CMATCH;
116     else if (op == 'I') op = BAM_CINS;
117     else if (op == 'D') op = BAM_CDEL;
118     else if (op == 'N') op = BAM_CREF_SKIP;
119     else if (op == 'S') op = BAM_CSOFT_CLIP;
120     else if (op == 'H') op = BAM_CHARD_CLIP;
121     else if (op == 'P') op = BAM_CPAD;
122     else GError("Error: invalid CIGAR operation (%s)\n",cigar);
123     s = t + 1;
124     bam1_cigar(b)[i] = x << BAM_CIGAR_SHIFT | op;
125     }
126     if (*s) GError("Error: unmatched CIGAR operation (%s)\n",cigar);
127     b->core.bin = bam_reg2bin(b->core.pos, bam_calend(&b->core, bam1_cigar(b)));
128     } else {//no CIGAR string given
129     if (!(b->core.flag&BAM_FUNMAP)) {
130     GMessage("Warning: mapped sequence without CIGAR (%s)\n", (char*)b->data);
131     b->core.flag |= BAM_FUNMAP;
132     }
133     b->core.bin = bam_reg2bin(b->core.pos, b->core.pos + 1);
134     }
135     } //set_cigar()
136    
137     void GBamRecord::add_sequence(const char* qseq, int slen) {
138     //must be called AFTER set_cigar (cannot replace existing sequence for now)
139     if (qseq==NULL) return; //should we ever care about this?
140     if (slen<0) slen=strlen(qseq);
141     int doff = b->core.l_qname + b->core.n_cigar * 4;
142     if (strcmp(qseq, "*")!=0) {
143     b->core.l_qseq=slen;
144     if (b->core.n_cigar && b->core.l_qseq != (int32_t)bam_cigar2qlen(&b->core, bam1_cigar(b)))
145     GError("Error: CIGAR and sequence length are inconsistent!(%s)\n",
146     qseq);
147     uint8_t* p = (uint8_t*)realloc_bdata(b, doff + (b->core.l_qseq+1)/2 + b->core.l_qseq) + doff;
148     //also allocated quals memory
149     memset(p, 0, (b->core.l_qseq+1)/2);
150     for (int i = 0; i < b->core.l_qseq; ++i)
151     p[i/2] |= bam_nt16_table[(int)qseq[i]] << 4*(1-i%2);
152     } else b->core.l_qseq = 0;
153     }
154    
155     void GBamRecord::add_quals(const char* quals) {
156     //requires core.l_qseq already set
157     //and must be called AFTER add_sequence(), which also allocates the memory for quals
158     uint8_t* p = b->data+(b->core.l_qname + b->core.n_cigar * 4 + (b->core.l_qseq+1)/2);
159     if (quals==NULL || strcmp(quals, "*") == 0) {
160     for (int i=0;i < b->core.l_qseq; i++) p[i] = 0xff;
161     return;
162     }
163     for (int i=0;i < b->core.l_qseq; i++) p[i] = quals[i]-33;
164     }
165    
166     void GBamRecord::add_aux(const char* str) {
167     //requires: being called AFTER add_quals()
168     static char tag[2];
169     static uint8_t abuf[512];
170     //requires: being called AFTER add_quals()
171     int strl=strlen(str);
172     //int doff = b->core.l_qname + b->core.n_cigar*4 + (b->core.l_qseq+1)/2 + b->core.l_qseq + b->l_aux;
173     //int doff0=doff;
174     if (strl < 6 || str[2] != ':' || str[4] != ':')
175     parse_error("missing colon in auxiliary data");
176     tag[0] = str[0]; tag[1] = str[1];
177     uint8_t atype = str[3];
178     uint8_t* adata=abuf;
179     int alen=0;
180     if (atype == 'A' || atype == 'a' || atype == 'c' || atype == 'C') { // c and C for backward compatibility
181     atype='A';
182     alen=1;
183     adata=(uint8_t*)&str[5];
184     }
185     else if (atype == 'I' || atype == 'i') {
186     long long x=(long long)atoll(str + 5);
187     if (x < 0) {
188     if (x >= -127) {
189     atype='c';
190     abuf[0] = (int8_t)x;
191     alen=1;
192     }
193     else if (x >= -32767) {
194     atype = 's';
195     *(int16_t*)abuf = (int16_t)x;
196     alen=2;
197     }
198     else {
199     atype='i';
200     *(int32_t*)abuf = (int32_t)x;
201     alen=4;
202     if (x < -2147483648ll)
203     GMessage("Parse warning: integer %lld is out of range.", x);
204     }
205     } else { //x >=0
206     if (x <= 255) {
207     atype = 'C';
208     abuf[0] = (uint8_t)x;
209     alen=1;
210     }
211     else if (x <= 65535) {
212     atype='S';
213     *(uint16_t*)abuf = (uint16_t)x;
214     alen=2;
215     }
216     else {
217     atype='I';
218     *(uint32_t*)abuf = (uint32_t)x;
219     alen=4;
220     if (x > 4294967295ll)
221     GMessage("Parse warning: integer %lld is out of range.", x);
222     }
223     }
224     } //integer type
225     else if (atype == 'f') {
226     *(float*)abuf = (float)atof(str + 5);
227     alen = sizeof(float);
228     }
229     else if (atype == 'd') { //?
230     *(float*)abuf = (float)atof(str + 9);
231     alen=8;
232     }
233     else if (atype == 'Z' || atype == 'H') {
234     if (atype == 'H') { // check whether the hex string is valid
235     if ((strl - 5) % 2 == 1) parse_error("length of the hex string not even");
236     for (int i = 0; i < strl - 5; ++i) {
237     int c = toupper(str[5 + i]);
238     if (!((c >= '0' && c <= '9') || (c >= 'A' && c <= 'F')))
239     parse_error("invalid hex character");
240     }
241     }
242     memcpy(abuf, str + 5, strl - 5);
243     abuf[strl-5] = 0;
244     alen=strl-4;
245     } else parse_error("unrecognized aux type");
246     this->add_aux(tag, atype, alen, adata);
247     }//add_aux()
248    
249    
250     void GBamRecord::setupCoordinates() {
251     uint32_t *cigar = bam1_cigar(b);
252     const bam1_core_t *c = &b->core;
253     if (c->flag & BAM_FUNMAP) return 0; /* skip unmapped reads */
254     int l=0;
255     for (int i = l = 0; i < c->n_cigar; ++i) {
256     int op = cigar[i]&0xf;
257     if (op == BAM_CMATCH || op == BAM_CDEL || op == BAM_CREF_SKIP)
258     l += cigar[i]>>4;
259     }
260     }