ViewVC Help
View File | Revision Log | Show Annotations | View Changeset | Root Listing
root/gclib/gclib/GBam.h
Revision: 310
Committed: Fri Mar 22 20:06:27 2013 UTC (5 years, 11 months ago) by gpertea
File size: 12494 byte(s)
Log Message:
sync with igm repo

Line File contents
1 #ifndef _G_BAM_H
2 #define _G_BAM_H
3 #include "GBase.h"
4 #include "GList.hh"
5 #include "bam.h"
6 #include "sam.h"
7
8 class GBamReader;
9 class GBamWriter;
10
11 class GBamRecord: public GSeg {
12 friend class GBamReader;
13 friend class GBamWriter;
14 bam1_t* b;
15 // b->data has the following strings concatenated:
16 // qname (including the terminal \0)
17 // +cigar (each event encoded on 32 bits)
18 // +seq (4bit-encoded)
19 // +qual
20 // +aux
21 bool novel; //if true, will also free b on delete
22 bam_header_t* bam_header;
23 char tag[2];
24 uint8_t abuf[512];
25 public:
26 GVec<GSeg> exons; //coordinates will be 1-based
27 //created from a reader:
28 void bfree_on_delete(bool b_free=true) { novel=b_free; }
29 GBamRecord(bam1_t* from_b=NULL, bam_header_t* b_header=NULL, bool b_free=true):exons(1) {
30 bam_header=NULL;
31 if (from_b==NULL) {
32 b=bam_init1();
33 novel=true;
34 }
35 else {
36 b=from_b; //it'll take over from_b
37 novel=b_free;
38 }
39
40 bam_header=b_header;
41 setupCoordinates();//set 1-based coordinates (start, end and exons array)
42 }
43
44 const GBamRecord& operator=(GBamRecord& r) {
45 //make a new copy of the bam1_t record etc.
46 clear();
47 b=bam_dup1(r.b);
48 novel=true; //will also free b when destroyed
49 start=r.start;
50 end=r.end;
51 exons = r.exons;
52 return *this;
53 }
54
55 void setupCoordinates();
56 void clear() {
57 if (novel) {
58 bam_destroy1(b);
59 //novel=false;
60 }
61 b=NULL;
62 exons.Clear();
63 bam_header=NULL;
64 }
65
66 ~GBamRecord() {
67 clear();
68 }
69
70 void parse_error(const char* s) {
71 GError("BAM parsing error: %s\n", s);
72 }
73
74 bam1_t* get_b() { return b; }
75
76 void set_mdata(int32_t mtid, int32_t m0pos, //0-based coordinate, -1 if not available
77 int32_t isize=0) { //mate info for current record
78 b->core.mtid=mtid;
79 b->core.mpos=m0pos; // should be -1 if '*'
80 b->core.isize=isize; //should be 0 if not available
81 }
82
83 void set_flags(uint16_t flags) {
84 b->core.flag=flags;
85 }
86
87 //creates a new record from 1-based alignment coordinate
88 //quals should be given as Phred33
89 //Warning: pos and mate_pos must be given 1-based!
90 GBamRecord(const char* qname, int32_t gseq_tid,
91 int pos, bool reverse, const char* qseq, const char* cigar=NULL, const char* quals=NULL);
92 GBamRecord(const char* qname, int32_t flags, int32_t g_tid,
93 int pos, int map_qual, const char* cigar, int32_t mg_tid, int mate_pos,
94 int insert_size, const char* qseq, const char* quals=NULL,
95 GVec<char*>* aux_strings=NULL);
96 //const std::vector<std::string>* aux_strings=NULL);
97 void set_cigar(const char* cigar); //converts and adds CIGAR string given in plain SAM text format
98 void add_sequence(const char* qseq, int slen=-1); //adds the DNA sequence given in plain text format
99 void add_quals(const char* quals); //quality values string in Phred33 format
100 void add_aux(const char* str); //adds one aux field in plain SAM text format (e.g. "NM:i:1")
101 void add_aux(const char tag[2], char atype, int len, uint8_t *data) {
102 //IMPORTANT: strings (Z,H) should include the terminal \0
103 int addz=0;
104 if ((atype=='Z' || atype=='H') && data[len-1]!=0) {
105 addz=1;
106 }
107 int ori_len = b->data_len;
108 b->data_len += 3 + len + addz;
109 b->l_aux += 3 + len + addz;
110 if (b->m_data < b->data_len) {
111 b->m_data = b->data_len;
112 kroundup32(b->m_data);
113 b->data = (uint8_t*)realloc(b->data, b->m_data);
114 }
115 b->data[ori_len] = tag[0]; b->data[ori_len + 1] = tag[1];
116 b->data[ori_len + 2] = atype;
117 if (addz) {
118 b->data[ori_len+len+4]=0;
119 }
120 memcpy(b->data + ori_len + 3, data, len);
121 }
122
123 void add_tag(const char tag[2], char atype, int len, uint8_t *data) {
124 //same with add_aux()
125 add_aux(tag,atype,len,data);
126 }
127 //--query methods:
128 uint32_t flags() { return b->core.flag; }
129 bool isUnmapped() { return ((b->core.flag & BAM_FUNMAP) != 0); }
130 bool isMapped() { return ((b->core.flag & BAM_FUNMAP) == 0); }
131 bool isPaired() { return ((b->core.flag & BAM_FPAIRED) != 0); }
132 const char* name() { return bam1_qname(b); }
133 int pairOrder() {
134 //which read in the pair: 0 = unpaired, 1=first read, 2=second read
135 int r=0;
136 if ((b->core.flag & BAM_FREAD1) != 0) r=1;
137 else if ((b->core.flag & BAM_FREAD2) != 0) r=2;
138 return r;
139 }
140 bool revStrand() {
141 //this is the raw alignment strand, NOT the transcription (XS) strand
142 return ((b->core.flag & BAM_FREVERSE) != 0);
143 }
144 const char* refName() {
145 return (bam_header!=NULL) ?
146 ((b->core.tid<0) ? "*" : bam_header->target_name[b->core.tid]) : NULL;
147 }
148 int32_t refId() { return b->core.tid; }
149 int32_t mate_refId() { return b->core.mtid; }
150 const char* mate_refName() {
151 return (bam_header!=NULL) ?
152 ((b->core.mtid<0) ? "*" : bam_header->target_name[b->core.mtid]) : NULL;
153 }
154 int32_t insertSize() { return b->core.isize; }
155 int32_t mate_start() { return b->core.mpos<0? 0 : b->core.mpos+1; }
156
157 //int find_tag(const char tag[2], uint8_t* & s, char& tag_type);
158 uint8_t* find_tag(const char tag[2]);
159 //position s at the beginning of tag data, tag_type is set to the found tag type
160 //returns length of tag data, or 0 if tag not found
161
162 char* tag_str(const char tag[2]); //return tag value for tag type 'Z'
163 int tag_int(const char tag[2]); //return numeric value of tag (for numeric types)
164 char tag_char(const char tag[2]); //return char value of tag (for type 'A')
165 char spliceStrand(); // '+', '-' from the XS tag, or '.' if no XS tag
166
167 char* sequence(); //user should free after use
168 char* qualities();//user should free after use
169 char* cigar(); //returns text version of the CIGAR string; user must free
170 };
171
172 // from sam.c:
173 #define FTYPE_BAM 1
174 #define FTYPE_READ 2
175
176 class GBamReader {
177 samfile_t* bam_file;
178 char* fname;
179 // from bam_import.c:
180 struct samtools_tamFile_t {
181 gzFile fp;
182 void *ks;
183 void *str;
184 uint64_t n_lines;
185 int is_first;
186 };
187
188 public:
189 void bopen(const char* filename) {
190 if (strcmp(filename, "-")==0) {
191 //if stdin was given, we HAVE to assume it's text SAM format, sorry
192 bam_file=samopen(filename, "r", 0);
193 }
194 else {
195 //BAM files have the zlib signature bytes at the beginning: 1F 8B 08
196 //if that's not present, we assume sam file
197 FILE* f=fopen(filename, "rb");
198 if (f==NULL) GError("Error opening file %s!\n", filename);
199 byte fsig[3];
200 size_t rd=fread(fsig, 1, 3, f);
201 fclose(f);
202 if (rd<3) GError("Error reading from file %s!\n",filename);
203 if ((fsig[0]==0x1F && fsig[1]==0x8B && fsig[2]==0x08) ||
204 (fsig[0]=='B' && fsig[1]=='A' && fsig[2]=='M')) {
205 bam_file=samopen(filename, "rb", 0); //BAM or uncompressed BAM
206 }
207 else { //assume text SAM file
208 bam_file=samopen(filename, "r", 0);
209 }
210 }
211 if (bam_file==NULL)
212 GError("Error: could not open SAM file %s!\n",filename);
213 fname=Gstrdup(filename);
214 }
215 GBamReader(const char* fn) {
216 bam_file=NULL;
217 fname=NULL;
218 bopen(fn);
219 }
220
221 bam_header_t* header() {
222 return bam_file? bam_file->header : NULL;
223 }
224 void bclose() {
225 if (bam_file) {
226 samclose(bam_file);
227 bam_file=NULL;
228 }
229 }
230 ~GBamReader() {
231 bclose();
232 GFREE(fname);
233 }
234 int64_t fpos() {
235 if ( bam_file->type & FTYPE_BAM )
236 return bgzf_tell(bam_file->x.bam);
237 else
238 return (int64_t)gztell(((samtools_tamFile_t*)(bam_file->x.tamr))->fp);
239 }
240 int64_t fseek(int64_t offs) {
241 if ( bam_file->type & FTYPE_BAM )
242 return bgzf_seek(bam_file->x.bam, offs, SEEK_SET);
243 else
244 return (int64_t)gzseek(((samtools_tamFile_t*)(bam_file->x.tamr))->fp, offs, SEEK_SET);
245 }
246 void rewind() {
247 if (fname==NULL) {
248 GMessage("Warning: GBamReader::rewind() called without a file name.\n");
249 return;
250 }
251 bclose();
252 char* ifname=fname;
253 bopen(ifname);
254 GFREE(ifname);
255 }
256
257 GBamRecord* next() {
258 if (bam_file==NULL)
259 GError("Warning: GBamReader::next() called with no open file.\n");
260 bam1_t* b = bam_init1();
261 if (samread(bam_file, b) >= 0) {
262 GBamRecord* bamrec=new GBamRecord(b, bam_file->header, true);
263 return bamrec;
264 }
265 else {
266 bam_destroy1(b);
267 return NULL;
268 }
269 }
270 };
271
272
273 class GBamWriter {
274 samfile_t* bam_file;
275 bam_header_t* bam_header;
276 public:
277 void create(const char* fname, bool uncompressed=false) {
278 if (bam_header==NULL)
279 GError("Error: no bam_header for GBamWriter::create()!\n");
280 if (uncompressed) {
281 bam_file=samopen(fname, "wbu", bam_header);
282 }
283 else {
284 bam_file=samopen(fname, "wb", bam_header);
285 }
286 if (bam_file==NULL)
287 GError("Error: could not create BAM file %s!\n",fname);
288 }
289 void create(const char* fname, bam_header_t* bh, bool uncompressed=false) {
290 bam_header=bh;
291 create(fname,uncompressed);
292 }
293
294 GBamWriter(const char* fname, bam_header_t* bh, bool uncompressed=false) {
295 create(fname, bh, uncompressed);
296 }
297
298 GBamWriter(const char* fname, const char* samfname, bool uncompressed=false) {
299 tamFile samf_in=sam_open(samfname);
300 if (samf_in==NULL)
301 GError("Error: could not open SAM file %s\n", samfname);
302 bam_header=sam_header_read(samf_in);
303 if (bam_header==NULL)
304 GError("Error: could not read SAM header from %s!\n",samfname);
305 sam_close(samf_in);
306 create(fname, uncompressed);
307 }
308
309 ~GBamWriter() {
310 samclose(bam_file);
311 bam_header_destroy(bam_header);
312 }
313 bam_header_t* get_header() { return bam_header; }
314 int32_t get_tid(const char *seq_name) {
315 if (bam_header==NULL)
316 GError("Error: missing SAM header (get_tid())\n");
317 return bam_get_tid(bam_header, seq_name);
318 }
319
320 //just a convenience function for creating a new record, but it's NOT written
321 //given pos must be 1-based (so it'll be stored as pos-1 because BAM is 0-based)
322 GBamRecord* new_record(const char* qname, const char* gseqname,
323 int pos, bool reverse, const char* qseq, const char* cigar=NULL, const char* qual=NULL) {
324 int32_t gseq_tid=get_tid(gseqname);
325 if (gseq_tid < 0 && strcmp(gseqname, "*")) {
326 if (bam_header->n_targets == 0) {
327 GError("Error: missing/invalid SAM header\n");
328 } else
329 GMessage("Warning: reference '%s' not found in header, will consider it '*'.\n",
330 gseqname);
331 }
332
333 return (new GBamRecord(qname, gseq_tid, pos, reverse, qseq, cigar, qual));
334 }
335
336 GBamRecord* new_record(const char* qname, int32_t flags, const char* gseqname,
337 int pos, int map_qual, const char* cigar, const char* mgseqname, int mate_pos,
338 int insert_size, const char* qseq, const char* quals=NULL,
339 GVec<char*>* aux_strings=NULL) {
340 int32_t gseq_tid=get_tid(gseqname);
341 if (gseq_tid < 0 && strcmp(gseqname, "*")) {
342 if (bam_header->n_targets == 0) {
343 GError("Error: missing/invalid SAM header\n");
344 } else
345 GMessage("Warning: reference '%s' not found in header, will consider it '*'.\n",
346 gseqname);
347 }
348 int32_t mgseq_tid=-1;
349 if (mgseqname!=NULL) {
350 if (strcmp(mgseqname, "=")==0) {
351 mgseq_tid=gseq_tid;
352 }
353 else {
354 mgseq_tid=get_tid(mgseqname);
355 if (mgseq_tid < 0 && strcmp(mgseqname, "*")) {
356 GMessage("Warning: reference '%s' not found in header, will consider it '*'.\n",
357 mgseqname);
358 }
359 }
360 }
361 return (new GBamRecord(qname, flags, gseq_tid, pos, map_qual, cigar,
362 mgseq_tid, mate_pos, insert_size, qseq, quals, aux_strings));
363 }
364
365 void write(GBamRecord* brec) {
366 if (brec!=NULL)
367 samwrite(this->bam_file,brec->get_b());
368 }
369 void write(bam1_t* b) {
370 samwrite(this->bam_file, b);
371 }
372 };
373
374 #endif