ViewVC Help
View File | Revision Log | Show Annotations | View Changeset | Root Listing
root/gclib/tophat_cpp/sam2bam.cpp
Revision: 29
Committed: Tue Aug 2 21:24:54 2011 UTC (8 years, 2 months ago) by gpertea
File size: 15739 byte(s)
Log Message:
adding tophat source work

Line File contents
1 #include <cstdlib>
2 #include <cstdio>
3 #include <string>
4
5 #include "bam/bam.h"
6 #include "bam/sam.h"
7
8 using namespace std;
9
10 extern unsigned short bam_char2flag_table[];
11
12 uint8_t* realloc_bdata(bam1_t *b, int size) {
13 if (b->m_data < size) {
14 b->m_data = size;
15 kroundup32(b->m_data);
16 b->data = (uint8_t*)realloc(b->data, b->m_data);
17 b->data_len=size;
18 }
19 return b->data;
20 }
21
22 uint8_t* dupalloc_bdata(bam1_t *b, int size) {
23 //same as realloc_bdata, but does not free previous data
24 //but returns it instead
25 //it ALWAYS duplicates data
26 b->m_data = size;
27 kroundup32(b->m_data);
28 uint8_t* odata=b->data;
29 b->data = (uint8_t*)malloc(b->m_data);
30 memcpy((void*)b->data, (void*)odata, b->data_len);
31 b->data_len=size;
32 return odata; //user must FREE this after
33 }
34
35
36 class GBamRecord {
37 bam1_t* b;
38 // b->data has the following strings concatenated:
39 // qname (including the terminal \0)
40 // +cigar (each event encoded on 32 bits)
41 // +seq (4bit-encoded)
42 // +qual
43 // +aux
44 bool novel;
45 public:
46 GBamRecord(bam1_t* from_b=NULL) {
47 if (from_b==NULL) {
48 b=bam_init1();
49 novel=true;
50 }
51 else {
52 b=from_b;
53 novel=false;
54 }
55 }
56 //creates a new record from 1-based alignment coordinate
57 //quals should be given as Phred33
58 GBamRecord(const char* qname, int32_t gseq_tid,
59 int pos, bool reverse, const char* qseq, const char* cigar=NULL, const char* quals=NULL) {
60 novel=true;
61 b=bam_init1();
62 b->core.tid=gseq_tid;
63 if (pos<=0) b->core.pos=-1; //unmapped?
64 else b->core.pos=pos-1; //BAM is 0-based
65 b->core.qual=255;
66 int l_qseq=strlen(qseq);
67 //this may not be accurate, setting CIGAR is the correct way
68 //b->core.bin = bam_reg2bin(b->core.pos, b->core.pos+l_qseq-1);
69 b->core.l_qname=strlen(qname)+1; //includes the \0 at the end
70 memcpy(realloc_bdata(b, b->core.l_qname), qname, b->core.l_qname);
71 set_cigar(cigar); //this will also set core.bin
72 add_sequence(qseq, l_qseq);
73 add_quals(quals); //quals must be given as Phred33
74 if (reverse) { b->core.flag |= BAM_FREVERSE ; }
75 }
76
77 void set_mdata(int32_t mtid, int32_t mpos,
78 int32_t isize=0) { //mate info for current record
79 b->core.mtid=mtid;
80 b->core.mpos=mpos; // should be -1 if '*'
81 b->core.isize=isize; //should be 0 if not available
82 }
83
84 void set_flags(uint16_t flags) {
85 b->core.flag=flags;
86 }
87
88 void set_cigar(const char* cigar) {
89 //requires b->core.pos to have been set properly PRIOR to this call
90 int doff=b->core.l_qname;
91 uint8_t* after_cigar=NULL;
92 int after_cigar_len=0;
93 uint8_t* prev_bdata=NULL;
94 if (b->data_len>doff) {
95 //cigar string already allocated, replace it
96 int d=b->core.l_qname + b->core.n_cigar * 4;//offset of after-cigar data
97 after_cigar=b->data+d;
98 after_cigar_len=b->data_len-d;
99 }
100 const char *s;
101 char *t;
102 int i, op;
103 long x;
104 b->core.n_cigar = 0;
105 if (cigar!=NULL && cigar[0] != '*') {
106 for (s = cigar; *s; ++s) {
107 if (isalpha(*s)) b->core.n_cigar++;
108 else if (!isdigit(*s)) {
109 fprintf(stderr, "Error: invalid CIGAR character (%s)\n",cigar);
110 exit(1);
111 }
112 }
113 if (after_cigar_len>0) { //replace/insert into existing full data
114 prev_bdata=dupalloc_bdata(b, doff + b->core.n_cigar * 4 + after_cigar_len);
115 memcpy((void*)(b->data+doff+b->core.n_cigar*4),(void*)after_cigar, after_cigar_len);
116 free(prev_bdata);
117 }
118 else {
119 b->data = realloc_bdata(b, doff + b->core.n_cigar * 4);
120 }
121 for (i = 0, s = cigar; i != b->core.n_cigar; ++i) {
122 x = strtol(s, &t, 10);
123 op = toupper(*t);
124 if (op == 'M' || op == '=' || op == 'X') op = BAM_CMATCH;
125 else if (op == 'I') op = BAM_CINS;
126 else if (op == 'D') op = BAM_CDEL;
127 else if (op == 'N') op = BAM_CREF_SKIP;
128 else if (op == 'S') op = BAM_CSOFT_CLIP;
129 else if (op == 'H') op = BAM_CHARD_CLIP;
130 else if (op == 'P') op = BAM_CPAD;
131 else { fprintf(stderr, "Error: invalid CIGAR operation (%s)\n",cigar);
132 exit(1);
133 }
134 s = t + 1;
135 bam1_cigar(b)[i] = x << BAM_CIGAR_SHIFT | op;
136 }
137 if (*s) {fprintf(stderr, "Error: unmatched CIGAR operation (%s)\n",cigar); exit(1); }
138 b->core.bin = bam_reg2bin(b->core.pos, bam_calend(&b->core, bam1_cigar(b)));
139 } else {//no CIGAR string
140 if (!(b->core.flag&BAM_FUNMAP)) {
141 fprintf(stderr, "Warning: mapped sequence without CIGAR (%s)\n", (char*)b->data);
142 b->core.flag |= BAM_FUNMAP;
143 }
144 b->core.bin = bam_reg2bin(b->core.pos, b->core.pos + 1);
145 }
146 } //set_cigar()
147
148 void add_sequence(const char* qseq, int slen=-1) {
149 //must be called AFTER set_cigar (cannot replace existing sequence for now)
150 if (qseq==NULL) return; //should we ever care about this?
151 if (slen<0) slen=strlen(qseq);
152 int doff = b->core.l_qname + b->core.n_cigar * 4;
153 if (strcmp(qseq, "*")!=0) {
154 b->core.l_qseq=slen;
155 if (b->core.n_cigar && b->core.l_qseq != (int32_t)bam_cigar2qlen(&b->core, bam1_cigar(b)))
156 {
157 fprintf(stderr, "Error: CIGAR and sequence length are inconsistent!(%s)\n", qseq);
158 exit(1);
159 }
160 uint8_t* p = (uint8_t*)realloc_bdata(b, doff + (b->core.l_qseq+1)/2 + b->core.l_qseq) + doff;
161 //also allocated quals memory
162 memset(p, 0, (b->core.l_qseq+1)/2);
163 for (int i = 0; i < b->core.l_qseq; ++i)
164 p[i/2] |= bam_nt16_table[(int)qseq[i]] << 4*(1-i%2);
165 } else b->core.l_qseq = 0;
166 }
167
168 void add_quals(const char* quals) {
169 //requires core.l_qseq already set
170 //and must be called AFTER add_sequence(), which also allocates the memory for quals
171 uint8_t* p = b->data+(b->core.l_qname + b->core.n_cigar * 4 + (b->core.l_qseq+1)/2);
172 if (quals==NULL || quals[0]=='*') {
173 for (int i=0;i < b->core.l_qseq; i++) p[i] = 0xff;
174 return;
175 }
176 for (int i=0;i < b->core.l_qseq; i++) p[i] = quals[i]-33;
177 }
178 void clear() {
179 if (novel) {
180 bam_destroy1(b);
181 }
182 novel=true;
183 b=bam_init1();
184 }
185 ~GBamRecord() {
186 if (novel) { bam_destroy1(b); }
187 }
188 void parse_error(const char* s) {
189 fprintf(stderr, "Parsing error: %s\n", s);
190 exit(1);
191 }
192
193 void add_aux(const char tag[2], char atype, int len, uint8_t *data) {
194 int ori_len = b->data_len;
195 b->data_len += 3 + len;
196 b->l_aux += 3 + len;
197 if (b->m_data < b->data_len) {
198 b->m_data = b->data_len;
199 kroundup32(b->m_data);
200 b->data = (uint8_t*)realloc(b->data, b->m_data);
201 }
202 b->data[ori_len] = tag[0]; b->data[ori_len + 1] = tag[1];
203 b->data[ori_len + 2] = atype;
204 memcpy(b->data + ori_len + 3, data, len);
205 }
206
207 void add_aux(char* str) {
208 static char tag[2];
209 static uint8_t abuf[512];
210 //requires: being called AFTER add_quals()
211 int strl=strlen(str);
212 //int doff = b->core.l_qname + b->core.n_cigar*4 + (b->core.l_qseq+1)/2 + b->core.l_qseq + b->l_aux;
213 //int doff0=doff;
214 if (strl < 6 || str[2] != ':' || str[4] != ':')
215 parse_error("missing colon in auxiliary data");
216 tag[0] = str[0]; tag[1] = str[1];
217 uint8_t atype = str[3];
218 uint8_t* adata=abuf;
219 int alen=0;
220 if (atype == 'A' || atype == 'a' || atype == 'c' || atype == 'C') { // c and C for backward compatibility
221 atype='A';
222 alen=1;
223 adata=(uint8_t*)&str[5];
224 }
225 else if (atype == 'I' || atype == 'i') {
226 long long x=(long long)atoll(str + 5);
227 if (x < 0) {
228 if (x >= -127) {
229 atype='c';
230 abuf[0] = (int8_t)x;
231 alen=1;
232 }
233 else if (x >= -32767) {
234 atype = 's';
235 *(int16_t*)abuf = (int16_t)x;
236 alen=2;
237 }
238 else {
239 atype='i';
240 *(int32_t*)abuf = (int32_t)x;
241 alen=4;
242 if (x < -2147483648ll)
243 fprintf(stderr, "Parse warning: integer %lld is out of range.",
244 x);
245 }
246 } else { //x >=0
247 if (x <= 255) {
248 atype = 'C';
249 abuf[0] = (uint8_t)x;
250 alen=1;
251 }
252 else if (x <= 65535) {
253 atype='S';
254 *(uint16_t*)abuf = (uint16_t)x;
255 alen=2;
256 }
257 else {
258 atype='I';
259 *(uint32_t*)abuf = (uint32_t)x;
260 alen=4;
261 if (x > 4294967295ll)
262 fprintf(stderr, "Parse warning: integer %lld is out of range.",
263 x);
264 }
265 }
266 } //integer type
267 else if (atype == 'f') {
268 *(float*)abuf = (float)atof(str + 5);
269 alen = sizeof(float);
270 }
271 else if (atype == 'd') { //?
272 *(float*)abuf = (float)atof(str + 9);
273 alen=8;
274 }
275 else if (atype == 'Z' || atype == 'H') {
276 if (atype == 'H') { // check whether the hex string is valid
277 if ((strl - 5) % 2 == 1) parse_error("length of the hex string not even");
278 for (int i = 0; i < strl - 5; ++i) {
279 int c = toupper(str[5 + i]);
280 if (!((c >= '0' && c <= '9') || (c >= 'A' && c <= 'F')))
281 parse_error("invalid hex character");
282 }
283 }
284 memcpy(abuf, str + 5, strl - 5);
285 abuf[strl-5] = 0;
286 alen=strl-4;
287 } else parse_error("unrecognized aux type");
288 add_aux(tag, atype, alen, adata);
289 }//add_aux()
290
291 bam1_t* get_b() { return b; }
292
293
294 };
295
296 class GBamWriter {
297 samfile_t* bam_file;
298 bam_header_t* bam_header;
299 public:
300 void create(const char* fname, bool uncompressed=false) {
301 if (bam_header==NULL) {
302 fprintf(stderr, "Error: no bam_header for GBamWriter::create()!\n");
303 exit(1);
304 }
305 if (uncompressed) {
306 bam_file=samopen(fname, "wbu", bam_header);
307 }
308 else {
309 bam_file=samopen(fname, "wb", bam_header);
310 }
311 if (bam_file==NULL) {
312 fprintf(stderr, "Error: could not create BAM file %s!\n",fname);
313 exit(1);
314 }
315 //do we need to call bam_header_write() ?
316 }
317
318 GBamWriter(const char* fname, bam_header_t* bh, bool uncompressed=false) {
319 bam_header=bh;
320 create(fname, uncompressed);
321 }
322
323 GBamWriter(const char* fname, const char* samfname, bool uncompressed=false) {
324 tamFile samf_in=sam_open(samfname);
325 if (samf_in==NULL) {
326 fprintf(stderr, "Error: could not open SAM file %s\n", samfname);
327 exit(1);
328 }
329 bam_header=sam_header_read(samf_in);
330 if (bam_header==NULL) {
331 fprintf(stderr, "Error: could not read SAM header from %s!\n",samfname);
332 exit(2);
333 }
334 sam_close(samf_in);
335 create(fname, uncompressed);
336 }
337
338 ~GBamWriter() {
339 samclose(bam_file);
340 bam_header_destroy(bam_header);
341 }
342 bam_header_t* get_header() { return bam_header; }
343 int32_t get_tid(const char *seq_name) {
344 if (bam_header==NULL) {
345 fprintf(stderr, "Error: missing SAM header (get_tid())\n");
346 exit(1);
347 }
348 return bam_get_tid(bam_header, seq_name);
349 }
350
351 //just a convenience function for creating a new record, but it's NOT written
352 //given pos must be 1-based (so it'll be stored as pos-1 because BAM is 0-based)
353 GBamRecord* new_record(const char* qname, const char* gseqname,
354 int pos, bool reverse, const char* qseq, const char* cigar=NULL, const char* qual=NULL) {
355 int32_t gseq_tid=get_tid(gseqname);
356 if (gseq_tid < 0 && strcmp(gseqname, "*")) {
357 if (bam_header->n_targets == 0) {
358 fprintf(stderr, "Error: missing/invalid SAM header\n");
359 exit(1);
360 } else
361 fprintf(stderr, "Warning: reference '%s' not found in header, will consider it '*'.\n",
362 gseqname);
363 }
364
365 GBamRecord *newbamrec=new GBamRecord(qname, gseq_tid, pos, reverse, qseq, cigar, qual);
366 return newbamrec;
367 }
368
369 void write(GBamRecord* brec) {
370 samwrite(this->bam_file,brec->get_b());
371 }
372 };
373
374 int str_split(char* str, char**& t, char delim='\t') {
375 //split text line str into an array of strings t[] by delimiter delim (tab)
376 //Notes: * destructive operation for s (replaces every \t with \0)
377 // * user must free t when no longer needed
378 int tcap=14;
379 t=(char**)malloc(tcap*sizeof(char*));
380 int tcount=0;
381 char prevch=0;
382 for (char* p = str; *p!=0 ;p++) {
383 if (*p==delim) *p=0; //break the string here
384 if (prevch==0) { //field start
385 if (tcount==tcap) {
386 tcap+=4;
387 t = (char**)realloc(t,tcap*sizeof(char*));
388 }
389 t[tcount]=p;
390 tcount++;
391 } //field start
392 prevch=*p;
393 if (*p=='\n' || *p=='\r') {
394 *p=0;
395 break;
396 }
397 }//for each character on the line
398 return tcount;
399 }
400
401 int main(int argc, char *argv[]) {
402
403 if (argv[1]==NULL) {
404 fprintf(stderr, "Usage:\nsam2bam <samfile>\n");
405 exit(1);
406 }
407 string fname(argv[1]);
408 fname+=".bam";
409 GBamWriter wbam(fname.c_str(), argv[1]);
410 //now parse the SAM line and convert it to BAM
411 char samline[1024];
412 samline[1023]=0;
413 FILE* f_in=fopen(argv[1], "r");
414 if (f_in==NULL) {
415 fprintf(stderr, "Error: could not open SAM file %s for reading!\n",argv[1]);
416 exit(2);
417 }
418 while (fgets(samline, 1024, f_in)!=NULL) {
419 char** t=NULL;
420 int tcount=str_split(samline, t);
421 if (samline[0]=='@') {
422 free(t);
423 continue; //skip header
424 }
425 if (tcount<10) {
426 fprintf(stderr, "Warning: skipping malformed SAM line %s\n",samline);
427 free(t);
428 continue;
429 }
430 int gpos=isdigit(t[3][0]) ? atoi(t[3]) : 0;
431 GBamRecord *brec= wbam.new_record(t[0],t[2],gpos,false, t[9], t[5], t[10]);
432 //parse flags value:
433 char* s=NULL;
434 long flag = strtol(t[1], &s, 0);
435 if (*s) { // not the end of the string
436 flag = 0;
437 for (s = t[1]; *s; ++s)
438 flag |= bam_char2flag_table[(int)*s]; //text-style SAM flags
439 }
440 brec->set_flags(flag);
441 int32_t mtid = strcmp(t[6], "=")? wbam.get_tid(t[6]) : brec->get_b()->core.tid;
442 int32_t mpos = isdigit(t[7][0])? atoi(t[7]) - 1 : -1;
443 int32_t isize = (t[8][0] == '-' || isdigit(t[8][0]))? atoi(t[8]) : 0;
444 brec->set_mdata(mtid, mpos, isize);
445 //now parse and add aux data:
446 if (tcount>11) {
447 for (int i=11;i<tcount;i++) {
448 //parse aux entry:
449 brec->add_aux(t[i]);
450 }//for each aux field
451 }
452 wbam.write(brec);
453 delete brec;
454 free(t);
455 }
456 fclose(f_in);
457 return 0;
458 }