ViewVC Help
View File | Revision Log | Show Annotations | View Changeset | Root Listing
root/gclib/gclib/GBam.h
Revision: 225
Committed: Mon Mar 26 12:07:13 2012 UTC (7 years, 5 months ago) by gpertea
File size: 11742 byte(s)
Log Message:
Line File contents
1 #ifndef _G_BAM_H
2 #define _G_BAM_H
3 #include "GBase.h"
4 #include "GList.hh"
5 #include "bam.h"
6 #include "sam.h"
7
8
9 class GBamReader;
10 class GBamWriter;
11
12 class GBamRecord: public GSeg {
13 friend class GBamReader;
14 friend class GBamWriter;
15 bam1_t* b;
16 // b->data has the following strings concatenated:
17 // qname (including the terminal \0)
18 // +cigar (each event encoded on 32 bits)
19 // +seq (4bit-encoded)
20 // +qual
21 // +aux
22 bool novel; //created from scratch, not from a given bam1_t* record
23 bam_header_t* bam_header;
24 char tag[2];
25 uint8_t abuf[512];
26 public:
27 GVec<GSeg> exons;
28 //created from a reader:
29 GBamRecord(bam1_t* from_b=NULL, bam_header_t* b_header=NULL):exons(1) {
30 bam_header=NULL;
31 if (from_b==NULL) {
32 b=bam_init1();
33 novel=true;
34 }
35 else {
36 b=from_b; //it'll take over from_b but not free it when destroyed
37 novel=false;
38 }
39 bam_header=b_header;
40 setupCoordinates();//sets start, end coordinate and populates exons array
41 }
42
43 const GBamRecord& operator=(GBamRecord& r) {
44 //make a new copy of the bam1_t record etc.
45 clear();
46 b=bam_dup1(r.b);
47 novel=true; //will also free b when destroyed
48 start=r.start;
49 end=r.end;
50 exons = r.exons;
51 return *this;
52 }
53
54 void setupCoordinates();
55 void clear() {
56 if (novel) {
57 bam_destroy1(b);
58 novel=false;
59 }
60 b=NULL;
61 exons.Clear();
62 bam_header=NULL;
63 }
64
65 ~GBamRecord() {
66 clear();
67 }
68
69 void parse_error(const char* s) {
70 GError("BAM parsing error: %s\n", s);
71 }
72
73 bam1_t* get_b() { return b; }
74
75 void set_mdata(int32_t mtid, int32_t m0pos, //0-based coordinate, -1 if not available
76 int32_t isize=0) { //mate info for current record
77 b->core.mtid=mtid;
78 b->core.mpos=m0pos; // should be -1 if '*'
79 b->core.isize=isize; //should be 0 if not available
80 }
81
82 void set_flags(uint16_t flags) {
83 b->core.flag=flags;
84 }
85
86 //creates a new record from 1-based alignment coordinate
87 //quals should be given as Phred33
88 //Warning: pos and mate_pos must be given 1-based!
89 GBamRecord(const char* qname, int32_t gseq_tid,
90 int pos, bool reverse, const char* qseq, const char* cigar=NULL, const char* quals=NULL);
91 GBamRecord(const char* qname, int32_t flags, int32_t g_tid,
92 int pos, int map_qual, const char* cigar, int32_t mg_tid, int mate_pos,
93 int insert_size, const char* qseq, const char* quals=NULL,
94 GVec<char*>* aux_strings=NULL);
95 //const std::vector<std::string>* aux_strings=NULL);
96 void set_cigar(const char* cigar); //converts and adds CIGAR string given in plain SAM text format
97 void add_sequence(const char* qseq, int slen=-1); //adds the DNA sequence given in plain text format
98 void add_quals(const char* quals); //quality values string in Phred33 format
99 void add_aux(const char* str); //adds one aux field in plain SAM text format (e.g. "NM:i:1")
100 void add_aux(const char tag[2], char atype, int len, uint8_t *data) {
101 //IMPORTANT: strings (Z,H) should include the terminal \0
102 int addz=0;
103 if ((atype=='Z' || atype=='H') && data[len-1]!=0) {
104 addz=1;
105 }
106 int ori_len = b->data_len;
107 b->data_len += 3 + len + addz;
108 b->l_aux += 3 + len + addz;
109 if (b->m_data < b->data_len) {
110 b->m_data = b->data_len;
111 kroundup32(b->m_data);
112 b->data = (uint8_t*)realloc(b->data, b->m_data);
113 }
114 b->data[ori_len] = tag[0]; b->data[ori_len + 1] = tag[1];
115 b->data[ori_len + 2] = atype;
116 if (addz) {
117 b->data[ori_len+len+4]=0;
118 }
119 memcpy(b->data + ori_len + 3, data, len);
120 }
121
122 void add_tag(const char tag[2], char atype, int len, uint8_t *data) {
123 //same with add_aux()
124 add_aux(tag,atype,len,data);
125 }
126 //--query methods:
127 uint32_t flags() { return b->core.flag; }
128 bool isUnmapped() { return ((b->core.flag & BAM_FUNMAP) != 0); }
129 bool isMapped() { return ((b->core.flag & BAM_FUNMAP) == 0); }
130 bool isPaired() { return ((b->core.flag & BAM_FPAIRED) != 0); }
131 const char* name() { return bam1_qname(b); }
132 int pairOrder() {
133 //which read in the pair: 0 = unpaired, 1=first read, 2=second read
134 int r=0;
135 if ((b->core.flag & BAM_FREAD1) != 0) r=1;
136 else if ((b->core.flag & BAM_FREAD2) != 0) r=2;
137 return r;
138 }
139 bool revStrand() {
140 //this is the raw alignment strand, NOT the transcription (XS) strand
141 return ((b->core.flag & BAM_FREVERSE) != 0);
142 }
143 const char* refName() {
144 return (bam_header!=NULL) ?
145 ((b->core.tid<0) ? "*" : bam_header->target_name[b->core.tid]) : NULL;
146 }
147 int32_t refId() { return b->core.tid; }
148 int32_t mate_refId() { return b->core.mtid; }
149 const char* mate_refName() {
150 return (bam_header!=NULL) ?
151 ((b->core.mtid<0) ? "*" : bam_header->target_name[b->core.mtid]) : NULL;
152 }
153 int32_t insertSize() { return b->core.isize; }
154 int32_t mate_start() { return b->core.mpos<0? 0 : b->core.mpos+1; }
155
156 int find_tag(const char tag[2], uint8_t* & s, char& tag_type);
157 //position s at the beginning of tag data, tag_type is set to the found tag type
158 //returns length of tag data, or 0 if tag not found
159
160 char* tag_str(const char tag[2]); //return tag value for tag type 'Z'
161 int tag_int(const char tag[2]); //return numeric value of tag (for numeric types)
162 char tag_char(const char tag[2]); //return char value of tag (for type 'A')
163 char spliceStrand(); // '+', '-' from the XS tag, or '.' if no XS tag
164
165 char* sequence(); //user should free after use
166 char* qualities();//user should free after use
167 char* cigar(); //returns text version of the CIGAR string; user must free
168 };
169
170
171 class GBamReader {
172 samfile_t* bam_file;
173 char* fname;
174 public:
175 void bopen(const char* filename) {
176 if (strcmp(filename, "-")==0) {
177 //if stdin was given, we HAVE to assume it's text SAM format, sorry
178 bam_file=samopen(filename, "r", 0);
179 }
180 else {
181 //BAM files have the zlib signature bytes at the beginning: 1F 8B 08
182 //if that's not present, we assume sam file
183 FILE* f=fopen(filename, "rb");
184 if (f==NULL) GError("Error opening file %s!\n", filename);
185 byte fsig[3];
186 size_t rd=fread(fsig, 1, 3, f);
187 if (rd<3) GError("Error reading from file %s!\n",filename);
188 if ((fsig[0]==0x1F && fsig[1]==0x8B && fsig[2]==0x08) ||
189 (fsig[0]=='B' && fsig[1]=='A' && fsig[2]=='M')) {
190 bam_file=samopen(filename, "rb", 0); //BAM or uncompressed BAM
191 }
192 else { //assume text SAM file
193 bam_file=samopen(filename, "r", 0);
194 }
195 }
196 if (bam_file==NULL)
197 GError("Error: could not open SAM file %s!\n",filename);
198 fname=Gstrdup(filename);
199 }
200 GBamReader(const char* fn) {
201 bam_file=NULL;
202 fname=NULL;
203 bopen(fn);
204 }
205
206 bam_header_t* header() {
207 return bam_file? bam_file->header : NULL;
208 }
209 void bclose() {
210 if (bam_file) {
211 samclose(bam_file);
212 bam_file=NULL;
213 }
214 }
215 ~GBamReader() {
216 bclose();
217 GFREE(fname);
218 }
219 void rewind() {
220 if (fname==NULL) {
221 GMessage("Warning: GBamReader::rewind() called without a file name.\n");
222 return;
223 }
224 bclose();
225 char* ifname=fname;
226 bopen(ifname);
227 GFREE(ifname);
228 }
229
230 GBamRecord* next() {
231 if (bam_file==NULL)
232 GError("Warning: GBamReader::next() called with no open file.\n");
233 bam1_t* b = bam_init1();
234 if (samread(bam_file, b) >= 0) {
235 GBamRecord* bamrec=new GBamRecord(b, bam_file->header);
236 return bamrec;
237 }
238 else {
239 bam_destroy1(b);
240 return NULL;
241 }
242 }
243 };
244
245
246 class GBamWriter {
247 samfile_t* bam_file;
248 bam_header_t* bam_header;
249 public:
250 void create(const char* fname, bool uncompressed=false) {
251 if (bam_header==NULL)
252 GError("Error: no bam_header for GBamWriter::create()!\n");
253 if (uncompressed) {
254 bam_file=samopen(fname, "wbu", bam_header);
255 }
256 else {
257 bam_file=samopen(fname, "wb", bam_header);
258 }
259 if (bam_file==NULL)
260 GError("Error: could not create BAM file %s!\n",fname);
261 }
262 void create(const char* fname, bam_header_t* bh, bool uncompressed=false) {
263 bam_header=bh;
264 create(fname,uncompressed);
265 }
266
267 GBamWriter(const char* fname, bam_header_t* bh, bool uncompressed=false) {
268 create(fname, bh, uncompressed);
269 }
270
271 GBamWriter(const char* fname, const char* samfname, bool uncompressed=false) {
272 tamFile samf_in=sam_open(samfname);
273 if (samf_in==NULL)
274 GError("Error: could not open SAM file %s\n", samfname);
275 bam_header=sam_header_read(samf_in);
276 if (bam_header==NULL)
277 GError("Error: could not read SAM header from %s!\n",samfname);
278 sam_close(samf_in);
279 create(fname, uncompressed);
280 }
281
282 ~GBamWriter() {
283 samclose(bam_file);
284 bam_header_destroy(bam_header);
285 }
286 bam_header_t* get_header() { return bam_header; }
287 int32_t get_tid(const char *seq_name) {
288 if (bam_header==NULL)
289 GError("Error: missing SAM header (get_tid())\n");
290 return bam_get_tid(bam_header, seq_name);
291 }
292
293 //just a convenience function for creating a new record, but it's NOT written
294 //given pos must be 1-based (so it'll be stored as pos-1 because BAM is 0-based)
295 GBamRecord* new_record(const char* qname, const char* gseqname,
296 int pos, bool reverse, const char* qseq, const char* cigar=NULL, const char* qual=NULL) {
297 int32_t gseq_tid=get_tid(gseqname);
298 if (gseq_tid < 0 && strcmp(gseqname, "*")) {
299 if (bam_header->n_targets == 0) {
300 GError("Error: missing/invalid SAM header\n");
301 } else
302 GMessage("Warning: reference '%s' not found in header, will consider it '*'.\n",
303 gseqname);
304 }
305
306 return (new GBamRecord(qname, gseq_tid, pos, reverse, qseq, cigar, qual));
307 }
308
309 GBamRecord* new_record(const char* qname, int32_t flags, const char* gseqname,
310 int pos, int map_qual, const char* cigar, const char* mgseqname, int mate_pos,
311 int insert_size, const char* qseq, const char* quals=NULL,
312 GVec<char*>* aux_strings=NULL) {
313 int32_t gseq_tid=get_tid(gseqname);
314 if (gseq_tid < 0 && strcmp(gseqname, "*")) {
315 if (bam_header->n_targets == 0) {
316 GError("Error: missing/invalid SAM header\n");
317 } else
318 GMessage("Warning: reference '%s' not found in header, will consider it '*'.\n",
319 gseqname);
320 }
321 int32_t mgseq_tid=-1;
322 if (mgseqname!=NULL) {
323 if (strcmp(mgseqname, "=")==0) {
324 mgseq_tid=gseq_tid;
325 }
326 else {
327 mgseq_tid=get_tid(mgseqname);
328 if (mgseq_tid < 0 && strcmp(mgseqname, "*")) {
329 GMessage("Warning: reference '%s' not found in header, will consider it '*'.\n",
330 mgseqname);
331 }
332 }
333 }
334 return (new GBamRecord(qname, flags, gseq_tid, pos, map_qual, cigar,
335 mgseq_tid, mate_pos, insert_size, qseq, quals, aux_strings));
336 }
337
338 void write(GBamRecord* brec) {
339 if (brec!=NULL)
340 samwrite(this->bam_file,brec->get_b());
341 }
342 void write(bam1_t* b) {
343 samwrite(this->bam_file, b);
344 }
345 };
346
347 #endif