ViewVC Help
View File | Revision Log | Show Annotations | View Changeset | Root Listing
root/gclib/gclib/GBam.h
Revision: 129
Committed: Wed Dec 7 20:59:43 2011 UTC (7 years, 11 months ago) by gpertea
File size: 10113 byte(s)
Log Message:
Line File contents
1 #ifndef _G_BAM_H
2 #define _G_BAM_H
3 #include "GBase.h"
4 #include "GList.hh"
5 #include "bam.h"
6 #include "sam.h"
7
8
9 class GBamReader;
10 class GBamWriter;
11
12 class GBamRecord: public GSeg {
13 friend class GBamReader;
14 friend class GBamWriter;
15 bam1_t* b;
16 // b->data has the following strings concatenated:
17 // qname (including the terminal \0)
18 // +cigar (each event encoded on 32 bits)
19 // +seq (4bit-encoded)
20 // +qual
21 // +aux
22 //bool novel; //created from scratch, not from a given bam1_t* record
23 bam_header_t* bam_header;
24
25 public:
26 GVec<GSeg> exons;
27 //created from a reader:
28 GBamRecord(bam1_t* from_b=NULL, bam_header_t* b_header=NULL):exons(1) {
29 bam_header=NULL;
30 if (from_b==NULL) {
31 b=bam_init1();
32 //novel=true;
33 }
34 else {
35 b=from_b; //it'll take over from_b and FREE it when destroyed
36 //novel=false;
37 }
38 bam_header=b_header;
39 setupCoordinates();//sets start, end coordinate and populates exons array
40 }
41
42 const GBamRecord& operator=(GBamRecord& r) {
43 //make a new copy of the bam1_t record etc.
44 clear();
45 b=bam_dup1(r.b);
46 //novel=true; //will also free b when destroyed
47 start=r.start;
48 end=r.end;
49 exons = r.exons;
50 return *this;
51 }
52
53 void setupCoordinates();
54 void clear() {
55 //if (novel) {
56 bam_destroy1(b);
57 // novel=false;
58 // }
59 b=NULL;
60 exons.Clear();
61 bam_header=NULL;
62 }
63
64 ~GBamRecord() {
65 clear();
66 }
67
68 void parse_error(const char* s) {
69 GError("BAM parsing error: %s\n", s);
70 }
71
72 bam1_t* get_b() { return b; }
73
74 void set_mdata(int32_t mtid, int32_t m0pos, //0-based coordinate, -1 if not available
75 int32_t isize=0) { //mate info for current record
76 b->core.mtid=mtid;
77 b->core.mpos=m0pos; // should be -1 if '*'
78 b->core.isize=isize; //should be 0 if not available
79 }
80
81 void set_flags(uint16_t flags) {
82 b->core.flag=flags;
83 }
84
85
86
87 //creates a new record from 1-based alignment coordinate
88 //quals should be given as Phred33
89 //Warning: pos and mate_pos must be given 1-based!
90 GBamRecord(const char* qname, int32_t gseq_tid,
91 int pos, bool reverse, const char* qseq, const char* cigar=NULL, const char* quals=NULL);
92 GBamRecord(const char* qname, int32_t flags, int32_t g_tid,
93 int pos, int map_qual, const char* cigar, int32_t mg_tid, int mate_pos,
94 int insert_size, const char* qseq, const char* quals=NULL,
95 GVec<char*>* aux_strings=NULL);
96 //const std::vector<std::string>* aux_strings=NULL);
97 void set_cigar(const char* cigar); //converts and adds CIGAR string given in plain SAM text format
98 void add_sequence(const char* qseq, int slen=-1); //adds the DNA sequence given in plain text format
99 void add_quals(const char* quals); //quality values string in Phred33 format
100 void add_aux(const char* str); //adds one aux field in plain SAM text format (e.g. "NM:i:1")
101 void add_aux(const char tag[2], char atype, int len, uint8_t *data) {
102 int ori_len = b->data_len;
103 b->data_len += 3 + len;
104 b->l_aux += 3 + len;
105 if (b->m_data < b->data_len) {
106 b->m_data = b->data_len;
107 kroundup32(b->m_data);
108 b->data = (uint8_t*)realloc(b->data, b->m_data);
109 }
110 b->data[ori_len] = tag[0]; b->data[ori_len + 1] = tag[1];
111 b->data[ori_len + 2] = atype;
112 memcpy(b->data + ori_len + 3, data, len);
113 }
114 void add_tag(const char tag[2], char atype, int len, uint8_t *data) {
115 //same with add_aux()
116 add_aux(tag,atype,len,data);
117 }
118 //--query methods:
119 uint32_t flags() { return b->core.flag; }
120 bool isUnmapped() { return ((b->core.flag & BAM_FUNMAP) != 0); }
121 bool isMapped() { return ((b->core.flag & BAM_FUNMAP) == 0); }
122 bool isPaired() { return ((b->core.flag & BAM_FPAIRED) != 0); }
123 const char* name() { return bam1_qname(b); }
124 int mateNum() {
125 int r=0;
126 if ((b->core.flag & BAM_FREAD1) != 0) r=1;
127 else if ((b->core.flag & BAM_FREAD2) != 0) r=2;
128 return r;
129 }
130 bool revStrand() { return ((b->core.flag & BAM_FREVERSE) != 0); }
131 const char* refName() {
132 return (bam_header!=NULL) ? bam_header->target_name[b->core.tid] : NULL;
133 }
134 int32_t refId() { return b->core.tid; }
135
136 int find_tag(const char tag[2], uint8_t* & s, char& tag_type);
137 char* tag_str(const char tag[2]); //return tag value for tag type 'Z'
138 int tag_int(const char tag[2]); //return numeric value of tag (for numeric types)
139 char tag_char(const char tag[2]); //return char value of tag (for type 'A')
140 char spliceStrand(); // '+', '-' from the XS tag, or '.' if no XS tag
141 char* sequence(); //user must free this after use
142 char* qualities();//user must free this after use
143 char* cigar(); //returns text version of the CIGAR string; must be freed by user
144 };
145
146
147 class GBamReader {
148 samfile_t* bam_file;
149 char* fname;
150 public:
151 void bopen(const char* filename) {
152 bam_file=samopen(filename, "rb", 0);
153 //for .sam files:
154 //bam_file=samopen(fname, "r", 0);
155 if (bam_file==NULL)
156 GError("Error: could not open SAM file %s!\n",filename);
157 fname=Gstrdup(filename);
158 }
159 GBamReader(const char* fn) {
160 bam_file=NULL;
161 fname=NULL;
162 bopen(fn);
163 }
164
165 bam_header_t* header() {
166 return bam_file? bam_file->header : NULL;
167 }
168 void bclose() {
169 if (bam_file) {
170 samclose(bam_file);
171 bam_file=NULL;
172 }
173 }
174 ~GBamReader() {
175 bclose();
176 GFREE(fname);
177 }
178 void rewind() {
179 if (fname==NULL) {
180 GMessage("Warning: GBamReader::rewind() called without a file name.\n");
181 return;
182 }
183 bclose();
184 char* ifname=fname;
185 bopen(ifname);
186 GFREE(ifname);
187 }
188
189 GBamRecord* next() {
190 if (bam_file==NULL)
191 GError("Warning: GBamReader::next() called with no open file.\n");
192 bam1_t* b = bam_init1();
193 if (samread(bam_file, b) >= 0) {
194 GBamRecord* bamrec=new GBamRecord(b, bam_file->header);
195 return bamrec;
196 }
197 else {
198 bam_destroy1(b);
199 return NULL;
200 }
201 }
202 };
203
204
205 class GBamWriter {
206 samfile_t* bam_file;
207 bam_header_t* bam_header;
208 public:
209 void create(const char* fname, bool uncompressed=false) {
210 if (bam_header==NULL)
211 GError("Error: no bam_header for GBamWriter::create()!\n");
212 if (uncompressed) {
213 bam_file=samopen(fname, "wbu", bam_header);
214 }
215 else {
216 bam_file=samopen(fname, "wb", bam_header);
217 }
218 if (bam_file==NULL)
219 GError("Error: could not create BAM file %s!\n",fname);
220 }
221 void create(const char* fname, bam_header_t* bh, bool uncompressed=false) {
222 bam_header=bh;
223 create(fname,uncompressed);
224 }
225
226 GBamWriter(const char* fname, bam_header_t* bh, bool uncompressed=false) {
227 create(fname, bh, uncompressed);
228 }
229
230 GBamWriter(const char* fname, const char* samfname, bool uncompressed=false) {
231 tamFile samf_in=sam_open(samfname);
232 if (samf_in==NULL)
233 GError("Error: could not open SAM file %s\n", samfname);
234 bam_header=sam_header_read(samf_in);
235 if (bam_header==NULL)
236 GError("Error: could not read SAM header from %s!\n",samfname);
237 sam_close(samf_in);
238 create(fname, uncompressed);
239 }
240
241 ~GBamWriter() {
242 samclose(bam_file);
243 bam_header_destroy(bam_header);
244 }
245 bam_header_t* get_header() { return bam_header; }
246 int32_t get_tid(const char *seq_name) {
247 if (bam_header==NULL)
248 GError("Error: missing SAM header (get_tid())\n");
249 return bam_get_tid(bam_header, seq_name);
250 }
251
252 //just a convenience function for creating a new record, but it's NOT written
253 //given pos must be 1-based (so it'll be stored as pos-1 because BAM is 0-based)
254 GBamRecord* new_record(const char* qname, const char* gseqname,
255 int pos, bool reverse, const char* qseq, const char* cigar=NULL, const char* qual=NULL) {
256 int32_t gseq_tid=get_tid(gseqname);
257 if (gseq_tid < 0 && strcmp(gseqname, "*")) {
258 if (bam_header->n_targets == 0) {
259 GError("Error: missing/invalid SAM header\n");
260 } else
261 GMessage("Warning: reference '%s' not found in header, will consider it '*'.\n",
262 gseqname);
263 }
264
265 return (new GBamRecord(qname, gseq_tid, pos, reverse, qseq, cigar, qual));
266 }
267
268 GBamRecord* new_record(const char* qname, int32_t flags, const char* gseqname,
269 int pos, int map_qual, const char* cigar, const char* mgseqname, int mate_pos,
270 int insert_size, const char* qseq, const char* quals=NULL,
271 GVec<char*>* aux_strings=NULL) {
272 int32_t gseq_tid=get_tid(gseqname);
273 if (gseq_tid < 0 && strcmp(gseqname, "*")) {
274 if (bam_header->n_targets == 0) {
275 GError("Error: missing/invalid SAM header\n");
276 } else
277 GMessage("Warning: reference '%s' not found in header, will consider it '*'.\n",
278 gseqname);
279 }
280 int32_t mgseq_tid=-1;
281 if (mgseqname!=NULL) {
282 if (strcmp(mgseqname, "=")==0) {
283 mgseq_tid=gseq_tid;
284 }
285 else {
286 mgseq_tid=get_tid(mgseqname);
287 if (mgseq_tid < 0 && strcmp(mgseqname, "*")) {
288 GMessage("Warning: reference '%s' not found in header, will consider it '*'.\n",
289 mgseqname);
290 }
291 }
292 }
293 return (new GBamRecord(qname, flags, gseq_tid, pos, map_qual, cigar,
294 mgseq_tid, mate_pos, insert_size, qseq, quals, aux_strings));
295 }
296
297 void write(GBamRecord* brec) {
298 if (brec!=NULL)
299 samwrite(this->bam_file,brec->get_b());
300 }
301 void write(bam1_t* b) {
302 samwrite(this->bam_file, b);
303 }
304 };
305
306 #endif