ViewVC Help
View File | Revision Log | Show Annotations | View Changeset | Root Listing
root/gclib/gclib/GBam.h
Revision: 121
Committed: Mon Dec 5 22:18:48 2011 UTC (7 years, 7 months ago) by gpertea
File size: 8661 byte(s)
Log Message:
added GBam BAM reader/writer classes (wip)

Line File contents
1 #ifndef _G_BAM_H
2 #define _G_BAM_H
3 #include "GBase.h"
4 #include "GList.hh"
5 #include "bam.h"
6 #include "sam.h"
7
8
9 class GBamReader;
10 class GBamWriter;
11
12 class GBamRecord: public GSeg {
13 friend class GBamReader;
14 friend class GBamWriter;
15 bam1_t* b;
16 // b->data has the following strings concatenated:
17 // qname (including the terminal \0)
18 // +cigar (each event encoded on 32 bits)
19 // +seq (4bit-encoded)
20 // +qual
21 // +aux
22 bool novel;
23 bam_header_t* bam_header;
24
25 public:
26 GPVec<GSeg> exons;
27 GBamRecord(bam1_t* from_b=NULL, bam_header_t* b_header=NULL):exons() {
28 bam_header=NULL;
29 if (from_b==NULL) {
30 b=bam_init1();
31 novel=true;
32 }
33 else {
34 b=from_b;
35 novel=false;
36 }
37 bam_header=b_header;
38 setupCoordinates();//sets start, end coordinate and populates exons array
39 }
40 void setupCoordinates();
41 void clear() {
42 if (novel) {
43 bam_destroy1(b);
44 }
45 novel=true;
46 b=bam_init1();
47 }
48
49 ~GBamRecord() {
50 if (novel) { bam_destroy1(b); }
51 }
52
53 void parse_error(const char* s) {
54 GError("BAM parsing error: %s\n", s);
55 }
56
57 bam1_t* get_b() { return b; }
58
59 void set_mdata(int32_t mtid, int32_t m0pos, //0-based coordinate, -1 if not available
60 int32_t isize=0) { //mate info for current record
61 b->core.mtid=mtid;
62 b->core.mpos=m0pos; // should be -1 if '*'
63 b->core.isize=isize; //should be 0 if not available
64 }
65
66 void set_flags(uint16_t flags) {
67 b->core.flag=flags;
68 }
69
70
71
72 //creates a new record from 1-based alignment coordinate
73 //quals should be given as Phred33
74 //Warning: pos and mate_pos must be given 1-based!
75 GBamRecord(const char* qname, int32_t gseq_tid,
76 int pos, bool reverse, const char* qseq, const char* cigar=NULL, const char* quals=NULL);
77 GBamRecord(const char* qname, int32_t flags, int32_t g_tid,
78 int pos, int map_qual, const char* cigar, int32_t mg_tid, int mate_pos,
79 int insert_size, const char* qseq, const char* quals=NULL,
80 GVec<char*>* aux_strings=NULL);
81 //const std::vector<std::string>* aux_strings=NULL);
82 void set_cigar(const char* cigar); //converts and adds CIGAR string given in plain SAM text format
83 void add_sequence(const char* qseq, int slen=-1); //adds the DNA sequence given in plain text format
84 void add_quals(const char* quals); //quality values string in Phred33 format
85 void add_aux(const char* str); //adds one aux field in plain SAM text format (e.g. "NM:i:1")
86 void add_aux(const char tag[2], char atype, int len, uint8_t *data) {
87 int ori_len = b->data_len;
88 b->data_len += 3 + len;
89 b->l_aux += 3 + len;
90 if (b->m_data < b->data_len) {
91 b->m_data = b->data_len;
92 kroundup32(b->m_data);
93 b->data = (uint8_t*)realloc(b->data, b->m_data);
94 }
95 b->data[ori_len] = tag[0]; b->data[ori_len + 1] = tag[1];
96 b->data[ori_len + 2] = atype;
97 memcpy(b->data + ori_len + 3, data, len);
98 }
99 void add_tag(const char tag[2], char atype, int len, uint8_t *data) {
100 //same with add_aux()
101 add_aux(tag,atype,len,data);
102 }
103 //--query methods:
104 uint32_t flags() { return b->core.flag; }
105 bool isUnmapped() { return ((b->core.flag & BAM_FUNMAP) != 0); }
106 bool isMapped() { return ((b->core.flag & BAM_FUNMAP) == 0); }
107 bool isPaired() { return ((b->core.flag & BAM_FPAIRED) != 0); }
108 int mateNum() {
109 int r=0;
110 if ((b->core.flag & BAM_FREAD1) != 0) r=1;
111 else if ((b->core.flag & BAM_FREAD2) != 0) r=2;
112 return r;
113 }
114 bool onRevStrand() { return ((b->core.flag & BAM_FREVERSE) != 0); }
115 const char* refName() {
116 return (bam_header!=NULL) ? bam_header->target_name[b->core.tid] : NULL;
117 }
118 int32_t refId() { return b->core.tid; }
119
120 uint8_t* tag(const char tag[2]); //retrieves tag data
121 int tag_int(const char tag[2]); //get the numeric value of tag (if it's numeric)
122 char spliceStrand(); // '+', '-' from the XS tag, or 0 if no XS tag
123 char* sequence(); //user must free this after use
124 char* qualities();//user must free this after use
125 char* cigar(); //returns text version of the CIGAR string; must be freed by user
126 };
127
128
129 class GBamReader {
130 samfile_t* bam_file;
131 char* fname;
132 public:
133 void open(const char* filename) {
134 bam_file=samopen(filename, "rb", 0);
135 //for .sam files:
136 //bam_file=samopen(fname, "r", 0);
137 if (bam_file==NULL)
138 GError("Error: could not open SAM file %s!\n",filename);
139 fname=Gstrdup(filename);
140 }
141
142 GBamReader(const char* fname) {
143 fname=NULL;
144 open(fname);
145 }
146
147 ~GBamReader() {
148 samclose(bam_file);
149 GFREE(fname);
150 }
151
152 };
153
154
155 class GBamWriter {
156 samfile_t* bam_file;
157 bam_header_t* bam_header;
158 public:
159 void create(const char* fname, bool uncompressed=false) {
160 if (bam_header==NULL)
161 GError("Error: no bam_header for GBamWriter::create()!\n");
162 if (uncompressed) {
163 bam_file=samopen(fname, "wbu", bam_header);
164 }
165 else {
166 bam_file=samopen(fname, "wb", bam_header);
167 }
168 if (bam_file==NULL)
169 GError("Error: could not create BAM file %s!\n",fname);
170 }
171 void create(const char* fname, bam_header_t* bh, bool uncompressed=false) {
172 bam_header=bh;
173 create(fname,uncompressed);
174 }
175
176 GBamWriter(const char* fname, bam_header_t* bh, bool uncompressed=false) {
177 create(fname, bh, uncompressed);
178 }
179
180 GBamWriter(const char* fname, const char* samfname, bool uncompressed=false) {
181 tamFile samf_in=sam_open(samfname);
182 if (samf_in==NULL)
183 GError("Error: could not open SAM file %s\n", samfname);
184 bam_header=sam_header_read(samf_in);
185 if (bam_header==NULL)
186 GError("Error: could not read SAM header from %s!\n",samfname);
187 sam_close(samf_in);
188 create(fname, uncompressed);
189 }
190
191 ~GBamWriter() {
192 samclose(bam_file);
193 bam_header_destroy(bam_header);
194 }
195 bam_header_t* get_header() { return bam_header; }
196 int32_t get_tid(const char *seq_name) {
197 if (bam_header==NULL)
198 GError("Error: missing SAM header (get_tid())\n");
199 return bam_get_tid(bam_header, seq_name);
200 }
201
202 //just a convenience function for creating a new record, but it's NOT written
203 //given pos must be 1-based (so it'll be stored as pos-1 because BAM is 0-based)
204 GBamRecord* new_record(const char* qname, const char* gseqname,
205 int pos, bool reverse, const char* qseq, const char* cigar=NULL, const char* qual=NULL) {
206 int32_t gseq_tid=get_tid(gseqname);
207 if (gseq_tid < 0 && strcmp(gseqname, "*")) {
208 if (bam_header->n_targets == 0) {
209 GError("Error: missing/invalid SAM header\n");
210 } else
211 GMessage("Warning: reference '%s' not found in header, will consider it '*'.\n",
212 gseqname);
213 }
214
215 return (new GBamRecord(qname, gseq_tid, pos, reverse, qseq, cigar, qual));
216 }
217
218 GBamRecord* new_record(const char* qname, int32_t flags, const char* gseqname,
219 int pos, int map_qual, const char* cigar, const char* mgseqname, int mate_pos,
220 int insert_size, const char* qseq, const char* quals=NULL,
221 GVec<char*>* aux_strings=NULL) {
222 int32_t gseq_tid=get_tid(gseqname);
223 if (gseq_tid < 0 && strcmp(gseqname, "*")) {
224 if (bam_header->n_targets == 0) {
225 GError("Error: missing/invalid SAM header\n");
226 } else
227 GMessage("Warning: reference '%s' not found in header, will consider it '*'.\n",
228 gseqname);
229 }
230 int32_t mgseq_tid=-1;
231 if (mgseqname!=NULL) {
232 if (strcmp(mgseqname, "=")==0) {
233 mgseq_tid=gseq_tid;
234 }
235 else {
236 mgseq_tid=get_tid(mgseqname);
237 if (mgseq_tid < 0 && strcmp(mgseqname, "*")) {
238 GMessage("Warning: reference '%s' not found in header, will consider it '*'.\n",
239 mgseqname);
240 }
241 }
242 }
243 return (new GBamRecord(qname, flags, gseq_tid, pos, map_qual, cigar,
244 mgseq_tid, mate_pos, insert_size, qseq, quals, aux_strings));
245 }
246
247 void write(GBamRecord* brec) {
248 if (brec!=NULL)
249 samwrite(this->bam_file,brec->get_b());
250 }
251 void write(bam1_t* b) {
252 samwrite(this->bam_file, b);
253 }
254 };
255
256 #endif