ViewVC Help
View File | Revision Log | Show Annotations | View Changeset | Root Listing
root/gclib/tophat_cpp/sam2bam.cpp
Revision: 29
Committed: Tue Aug 2 21:24:54 2011 UTC (8 years, 10 months ago) by gpertea
File size: 15739 byte(s)
Log Message:
adding tophat source work

Line User Rev File contents
1 gpertea 29 #include <cstdlib>
2     #include <cstdio>
3     #include <string>
4    
5     #include "bam/bam.h"
6     #include "bam/sam.h"
7    
8     using namespace std;
9    
10     extern unsigned short bam_char2flag_table[];
11    
12     uint8_t* realloc_bdata(bam1_t *b, int size) {
13     if (b->m_data < size) {
14     b->m_data = size;
15     kroundup32(b->m_data);
16     b->data = (uint8_t*)realloc(b->data, b->m_data);
17     b->data_len=size;
18     }
19     return b->data;
20     }
21    
22     uint8_t* dupalloc_bdata(bam1_t *b, int size) {
23     //same as realloc_bdata, but does not free previous data
24     //but returns it instead
25     //it ALWAYS duplicates data
26     b->m_data = size;
27     kroundup32(b->m_data);
28     uint8_t* odata=b->data;
29     b->data = (uint8_t*)malloc(b->m_data);
30     memcpy((void*)b->data, (void*)odata, b->data_len);
31     b->data_len=size;
32     return odata; //user must FREE this after
33     }
34    
35    
36     class GBamRecord {
37     bam1_t* b;
38     // b->data has the following strings concatenated:
39     // qname (including the terminal \0)
40     // +cigar (each event encoded on 32 bits)
41     // +seq (4bit-encoded)
42     // +qual
43     // +aux
44     bool novel;
45     public:
46     GBamRecord(bam1_t* from_b=NULL) {
47     if (from_b==NULL) {
48     b=bam_init1();
49     novel=true;
50     }
51     else {
52     b=from_b;
53     novel=false;
54     }
55     }
56     //creates a new record from 1-based alignment coordinate
57     //quals should be given as Phred33
58     GBamRecord(const char* qname, int32_t gseq_tid,
59     int pos, bool reverse, const char* qseq, const char* cigar=NULL, const char* quals=NULL) {
60     novel=true;
61     b=bam_init1();
62     b->core.tid=gseq_tid;
63     if (pos<=0) b->core.pos=-1; //unmapped?
64     else b->core.pos=pos-1; //BAM is 0-based
65     b->core.qual=255;
66     int l_qseq=strlen(qseq);
67     //this may not be accurate, setting CIGAR is the correct way
68     //b->core.bin = bam_reg2bin(b->core.pos, b->core.pos+l_qseq-1);
69     b->core.l_qname=strlen(qname)+1; //includes the \0 at the end
70     memcpy(realloc_bdata(b, b->core.l_qname), qname, b->core.l_qname);
71     set_cigar(cigar); //this will also set core.bin
72     add_sequence(qseq, l_qseq);
73     add_quals(quals); //quals must be given as Phred33
74     if (reverse) { b->core.flag |= BAM_FREVERSE ; }
75     }
76    
77     void set_mdata(int32_t mtid, int32_t mpos,
78     int32_t isize=0) { //mate info for current record
79     b->core.mtid=mtid;
80     b->core.mpos=mpos; // should be -1 if '*'
81     b->core.isize=isize; //should be 0 if not available
82     }
83    
84     void set_flags(uint16_t flags) {
85     b->core.flag=flags;
86     }
87    
88     void set_cigar(const char* cigar) {
89     //requires b->core.pos to have been set properly PRIOR to this call
90     int doff=b->core.l_qname;
91     uint8_t* after_cigar=NULL;
92     int after_cigar_len=0;
93     uint8_t* prev_bdata=NULL;
94     if (b->data_len>doff) {
95     //cigar string already allocated, replace it
96     int d=b->core.l_qname + b->core.n_cigar * 4;//offset of after-cigar data
97     after_cigar=b->data+d;
98     after_cigar_len=b->data_len-d;
99     }
100     const char *s;
101     char *t;
102     int i, op;
103     long x;
104     b->core.n_cigar = 0;
105     if (cigar!=NULL && cigar[0] != '*') {
106     for (s = cigar; *s; ++s) {
107     if (isalpha(*s)) b->core.n_cigar++;
108     else if (!isdigit(*s)) {
109     fprintf(stderr, "Error: invalid CIGAR character (%s)\n",cigar);
110     exit(1);
111     }
112     }
113     if (after_cigar_len>0) { //replace/insert into existing full data
114     prev_bdata=dupalloc_bdata(b, doff + b->core.n_cigar * 4 + after_cigar_len);
115     memcpy((void*)(b->data+doff+b->core.n_cigar*4),(void*)after_cigar, after_cigar_len);
116     free(prev_bdata);
117     }
118     else {
119     b->data = realloc_bdata(b, doff + b->core.n_cigar * 4);
120     }
121     for (i = 0, s = cigar; i != b->core.n_cigar; ++i) {
122     x = strtol(s, &t, 10);
123     op = toupper(*t);
124     if (op == 'M' || op == '=' || op == 'X') op = BAM_CMATCH;
125     else if (op == 'I') op = BAM_CINS;
126     else if (op == 'D') op = BAM_CDEL;
127     else if (op == 'N') op = BAM_CREF_SKIP;
128     else if (op == 'S') op = BAM_CSOFT_CLIP;
129     else if (op == 'H') op = BAM_CHARD_CLIP;
130     else if (op == 'P') op = BAM_CPAD;
131     else { fprintf(stderr, "Error: invalid CIGAR operation (%s)\n",cigar);
132     exit(1);
133     }
134     s = t + 1;
135     bam1_cigar(b)[i] = x << BAM_CIGAR_SHIFT | op;
136     }
137     if (*s) {fprintf(stderr, "Error: unmatched CIGAR operation (%s)\n",cigar); exit(1); }
138     b->core.bin = bam_reg2bin(b->core.pos, bam_calend(&b->core, bam1_cigar(b)));
139     } else {//no CIGAR string
140     if (!(b->core.flag&BAM_FUNMAP)) {
141     fprintf(stderr, "Warning: mapped sequence without CIGAR (%s)\n", (char*)b->data);
142     b->core.flag |= BAM_FUNMAP;
143     }
144     b->core.bin = bam_reg2bin(b->core.pos, b->core.pos + 1);
145     }
146     } //set_cigar()
147    
148     void add_sequence(const char* qseq, int slen=-1) {
149     //must be called AFTER set_cigar (cannot replace existing sequence for now)
150     if (qseq==NULL) return; //should we ever care about this?
151     if (slen<0) slen=strlen(qseq);
152     int doff = b->core.l_qname + b->core.n_cigar * 4;
153     if (strcmp(qseq, "*")!=0) {
154     b->core.l_qseq=slen;
155     if (b->core.n_cigar && b->core.l_qseq != (int32_t)bam_cigar2qlen(&b->core, bam1_cigar(b)))
156     {
157     fprintf(stderr, "Error: CIGAR and sequence length are inconsistent!(%s)\n", qseq);
158     exit(1);
159     }
160     uint8_t* p = (uint8_t*)realloc_bdata(b, doff + (b->core.l_qseq+1)/2 + b->core.l_qseq) + doff;
161     //also allocated quals memory
162     memset(p, 0, (b->core.l_qseq+1)/2);
163     for (int i = 0; i < b->core.l_qseq; ++i)
164     p[i/2] |= bam_nt16_table[(int)qseq[i]] << 4*(1-i%2);
165     } else b->core.l_qseq = 0;
166     }
167    
168     void add_quals(const char* quals) {
169     //requires core.l_qseq already set
170     //and must be called AFTER add_sequence(), which also allocates the memory for quals
171     uint8_t* p = b->data+(b->core.l_qname + b->core.n_cigar * 4 + (b->core.l_qseq+1)/2);
172     if (quals==NULL || quals[0]=='*') {
173     for (int i=0;i < b->core.l_qseq; i++) p[i] = 0xff;
174     return;
175     }
176     for (int i=0;i < b->core.l_qseq; i++) p[i] = quals[i]-33;
177     }
178     void clear() {
179     if (novel) {
180     bam_destroy1(b);
181     }
182     novel=true;
183     b=bam_init1();
184     }
185     ~GBamRecord() {
186     if (novel) { bam_destroy1(b); }
187     }
188     void parse_error(const char* s) {
189     fprintf(stderr, "Parsing error: %s\n", s);
190     exit(1);
191     }
192    
193     void add_aux(const char tag[2], char atype, int len, uint8_t *data) {
194     int ori_len = b->data_len;
195     b->data_len += 3 + len;
196     b->l_aux += 3 + len;
197     if (b->m_data < b->data_len) {
198     b->m_data = b->data_len;
199     kroundup32(b->m_data);
200     b->data = (uint8_t*)realloc(b->data, b->m_data);
201     }
202     b->data[ori_len] = tag[0]; b->data[ori_len + 1] = tag[1];
203     b->data[ori_len + 2] = atype;
204     memcpy(b->data + ori_len + 3, data, len);
205     }
206    
207     void add_aux(char* str) {
208     static char tag[2];
209     static uint8_t abuf[512];
210     //requires: being called AFTER add_quals()
211     int strl=strlen(str);
212     //int doff = b->core.l_qname + b->core.n_cigar*4 + (b->core.l_qseq+1)/2 + b->core.l_qseq + b->l_aux;
213     //int doff0=doff;
214     if (strl < 6 || str[2] != ':' || str[4] != ':')
215     parse_error("missing colon in auxiliary data");
216     tag[0] = str[0]; tag[1] = str[1];
217     uint8_t atype = str[3];
218     uint8_t* adata=abuf;
219     int alen=0;
220     if (atype == 'A' || atype == 'a' || atype == 'c' || atype == 'C') { // c and C for backward compatibility
221     atype='A';
222     alen=1;
223     adata=(uint8_t*)&str[5];
224     }
225     else if (atype == 'I' || atype == 'i') {
226     long long x=(long long)atoll(str + 5);
227     if (x < 0) {
228     if (x >= -127) {
229     atype='c';
230     abuf[0] = (int8_t)x;
231     alen=1;
232     }
233     else if (x >= -32767) {
234     atype = 's';
235     *(int16_t*)abuf = (int16_t)x;
236     alen=2;
237     }
238     else {
239     atype='i';
240     *(int32_t*)abuf = (int32_t)x;
241     alen=4;
242     if (x < -2147483648ll)
243     fprintf(stderr, "Parse warning: integer %lld is out of range.",
244     x);
245     }
246     } else { //x >=0
247     if (x <= 255) {
248     atype = 'C';
249     abuf[0] = (uint8_t)x;
250     alen=1;
251     }
252     else if (x <= 65535) {
253     atype='S';
254     *(uint16_t*)abuf = (uint16_t)x;
255     alen=2;
256     }
257     else {
258     atype='I';
259     *(uint32_t*)abuf = (uint32_t)x;
260     alen=4;
261     if (x > 4294967295ll)
262     fprintf(stderr, "Parse warning: integer %lld is out of range.",
263     x);
264     }
265     }
266     } //integer type
267     else if (atype == 'f') {
268     *(float*)abuf = (float)atof(str + 5);
269     alen = sizeof(float);
270     }
271     else if (atype == 'd') { //?
272     *(float*)abuf = (float)atof(str + 9);
273     alen=8;
274     }
275     else if (atype == 'Z' || atype == 'H') {
276     if (atype == 'H') { // check whether the hex string is valid
277     if ((strl - 5) % 2 == 1) parse_error("length of the hex string not even");
278     for (int i = 0; i < strl - 5; ++i) {
279     int c = toupper(str[5 + i]);
280     if (!((c >= '0' && c <= '9') || (c >= 'A' && c <= 'F')))
281     parse_error("invalid hex character");
282     }
283     }
284     memcpy(abuf, str + 5, strl - 5);
285     abuf[strl-5] = 0;
286     alen=strl-4;
287     } else parse_error("unrecognized aux type");
288     add_aux(tag, atype, alen, adata);
289     }//add_aux()
290    
291     bam1_t* get_b() { return b; }
292    
293    
294     };
295    
296     class GBamWriter {
297     samfile_t* bam_file;
298     bam_header_t* bam_header;
299     public:
300     void create(const char* fname, bool uncompressed=false) {
301     if (bam_header==NULL) {
302     fprintf(stderr, "Error: no bam_header for GBamWriter::create()!\n");
303     exit(1);
304     }
305     if (uncompressed) {
306     bam_file=samopen(fname, "wbu", bam_header);
307     }
308     else {
309     bam_file=samopen(fname, "wb", bam_header);
310     }
311     if (bam_file==NULL) {
312     fprintf(stderr, "Error: could not create BAM file %s!\n",fname);
313     exit(1);
314     }
315     //do we need to call bam_header_write() ?
316     }
317    
318     GBamWriter(const char* fname, bam_header_t* bh, bool uncompressed=false) {
319     bam_header=bh;
320     create(fname, uncompressed);
321     }
322    
323     GBamWriter(const char* fname, const char* samfname, bool uncompressed=false) {
324     tamFile samf_in=sam_open(samfname);
325     if (samf_in==NULL) {
326     fprintf(stderr, "Error: could not open SAM file %s\n", samfname);
327     exit(1);
328     }
329     bam_header=sam_header_read(samf_in);
330     if (bam_header==NULL) {
331     fprintf(stderr, "Error: could not read SAM header from %s!\n",samfname);
332     exit(2);
333     }
334     sam_close(samf_in);
335     create(fname, uncompressed);
336     }
337    
338     ~GBamWriter() {
339     samclose(bam_file);
340     bam_header_destroy(bam_header);
341     }
342     bam_header_t* get_header() { return bam_header; }
343     int32_t get_tid(const char *seq_name) {
344     if (bam_header==NULL) {
345     fprintf(stderr, "Error: missing SAM header (get_tid())\n");
346     exit(1);
347     }
348     return bam_get_tid(bam_header, seq_name);
349     }
350    
351     //just a convenience function for creating a new record, but it's NOT written
352     //given pos must be 1-based (so it'll be stored as pos-1 because BAM is 0-based)
353     GBamRecord* new_record(const char* qname, const char* gseqname,
354     int pos, bool reverse, const char* qseq, const char* cigar=NULL, const char* qual=NULL) {
355     int32_t gseq_tid=get_tid(gseqname);
356     if (gseq_tid < 0 && strcmp(gseqname, "*")) {
357     if (bam_header->n_targets == 0) {
358     fprintf(stderr, "Error: missing/invalid SAM header\n");
359     exit(1);
360     } else
361     fprintf(stderr, "Warning: reference '%s' not found in header, will consider it '*'.\n",
362     gseqname);
363     }
364    
365     GBamRecord *newbamrec=new GBamRecord(qname, gseq_tid, pos, reverse, qseq, cigar, qual);
366     return newbamrec;
367     }
368    
369     void write(GBamRecord* brec) {
370     samwrite(this->bam_file,brec->get_b());
371     }
372     };
373    
374     int str_split(char* str, char**& t, char delim='\t') {
375     //split text line str into an array of strings t[] by delimiter delim (tab)
376     //Notes: * destructive operation for s (replaces every \t with \0)
377     // * user must free t when no longer needed
378     int tcap=14;
379     t=(char**)malloc(tcap*sizeof(char*));
380     int tcount=0;
381     char prevch=0;
382     for (char* p = str; *p!=0 ;p++) {
383     if (*p==delim) *p=0; //break the string here
384     if (prevch==0) { //field start
385     if (tcount==tcap) {
386     tcap+=4;
387     t = (char**)realloc(t,tcap*sizeof(char*));
388     }
389     t[tcount]=p;
390     tcount++;
391     } //field start
392     prevch=*p;
393     if (*p=='\n' || *p=='\r') {
394     *p=0;
395     break;
396     }
397     }//for each character on the line
398     return tcount;
399     }
400    
401     int main(int argc, char *argv[]) {
402    
403     if (argv[1]==NULL) {
404     fprintf(stderr, "Usage:\nsam2bam <samfile>\n");
405     exit(1);
406     }
407     string fname(argv[1]);
408     fname+=".bam";
409     GBamWriter wbam(fname.c_str(), argv[1]);
410     //now parse the SAM line and convert it to BAM
411     char samline[1024];
412     samline[1023]=0;
413     FILE* f_in=fopen(argv[1], "r");
414     if (f_in==NULL) {
415     fprintf(stderr, "Error: could not open SAM file %s for reading!\n",argv[1]);
416     exit(2);
417     }
418     while (fgets(samline, 1024, f_in)!=NULL) {
419     char** t=NULL;
420     int tcount=str_split(samline, t);
421     if (samline[0]=='@') {
422     free(t);
423     continue; //skip header
424     }
425     if (tcount<10) {
426     fprintf(stderr, "Warning: skipping malformed SAM line %s\n",samline);
427     free(t);
428     continue;
429     }
430     int gpos=isdigit(t[3][0]) ? atoi(t[3]) : 0;
431     GBamRecord *brec= wbam.new_record(t[0],t[2],gpos,false, t[9], t[5], t[10]);
432     //parse flags value:
433     char* s=NULL;
434     long flag = strtol(t[1], &s, 0);
435     if (*s) { // not the end of the string
436     flag = 0;
437     for (s = t[1]; *s; ++s)
438     flag |= bam_char2flag_table[(int)*s]; //text-style SAM flags
439     }
440     brec->set_flags(flag);
441     int32_t mtid = strcmp(t[6], "=")? wbam.get_tid(t[6]) : brec->get_b()->core.tid;
442     int32_t mpos = isdigit(t[7][0])? atoi(t[7]) - 1 : -1;
443     int32_t isize = (t[8][0] == '-' || isdigit(t[8][0]))? atoi(t[8]) : 0;
444     brec->set_mdata(mtid, mpos, isize);
445     //now parse and add aux data:
446     if (tcount>11) {
447     for (int i=11;i<tcount;i++) {
448     //parse aux entry:
449     brec->add_aux(t[i]);
450     }//for each aux field
451     }
452     wbam.write(brec);
453     delete brec;
454     free(t);
455     }
456     fclose(f_in);
457     return 0;
458     }