ViewVC Help
View File | Revision Log | Show Annotations | View Changeset | Root Listing
root/gclib/gclib/GBam.h
Revision: 129
Committed: Wed Dec 7 20:59:43 2011 UTC (7 years, 10 months ago) by gpertea
File size: 10113 byte(s)
Log Message:
Line User Rev File contents
1 gpertea 121 #ifndef _G_BAM_H
2     #define _G_BAM_H
3     #include "GBase.h"
4     #include "GList.hh"
5     #include "bam.h"
6     #include "sam.h"
7    
8    
9     class GBamReader;
10     class GBamWriter;
11    
12     class GBamRecord: public GSeg {
13     friend class GBamReader;
14     friend class GBamWriter;
15     bam1_t* b;
16     // b->data has the following strings concatenated:
17     // qname (including the terminal \0)
18     // +cigar (each event encoded on 32 bits)
19     // +seq (4bit-encoded)
20     // +qual
21     // +aux
22 gpertea 126 //bool novel; //created from scratch, not from a given bam1_t* record
23 gpertea 121 bam_header_t* bam_header;
24    
25     public:
26 gpertea 126 GVec<GSeg> exons;
27     //created from a reader:
28 gpertea 124 GBamRecord(bam1_t* from_b=NULL, bam_header_t* b_header=NULL):exons(1) {
29 gpertea 121 bam_header=NULL;
30     if (from_b==NULL) {
31     b=bam_init1();
32 gpertea 126 //novel=true;
33 gpertea 121 }
34     else {
35 gpertea 126 b=from_b; //it'll take over from_b and FREE it when destroyed
36     //novel=false;
37 gpertea 121 }
38     bam_header=b_header;
39     setupCoordinates();//sets start, end coordinate and populates exons array
40     }
41 gpertea 126
42     const GBamRecord& operator=(GBamRecord& r) {
43     //make a new copy of the bam1_t record etc.
44     clear();
45     b=bam_dup1(r.b);
46     //novel=true; //will also free b when destroyed
47     start=r.start;
48     end=r.end;
49     exons = r.exons;
50     return *this;
51     }
52    
53 gpertea 121 void setupCoordinates();
54     void clear() {
55 gpertea 126 //if (novel) {
56     bam_destroy1(b);
57     // novel=false;
58     // }
59     b=NULL;
60     exons.Clear();
61     bam_header=NULL;
62 gpertea 121 }
63    
64     ~GBamRecord() {
65 gpertea 126 clear();
66 gpertea 121 }
67    
68     void parse_error(const char* s) {
69     GError("BAM parsing error: %s\n", s);
70     }
71    
72     bam1_t* get_b() { return b; }
73    
74     void set_mdata(int32_t mtid, int32_t m0pos, //0-based coordinate, -1 if not available
75     int32_t isize=0) { //mate info for current record
76     b->core.mtid=mtid;
77     b->core.mpos=m0pos; // should be -1 if '*'
78     b->core.isize=isize; //should be 0 if not available
79     }
80    
81     void set_flags(uint16_t flags) {
82     b->core.flag=flags;
83     }
84    
85    
86    
87     //creates a new record from 1-based alignment coordinate
88     //quals should be given as Phred33
89     //Warning: pos and mate_pos must be given 1-based!
90     GBamRecord(const char* qname, int32_t gseq_tid,
91     int pos, bool reverse, const char* qseq, const char* cigar=NULL, const char* quals=NULL);
92     GBamRecord(const char* qname, int32_t flags, int32_t g_tid,
93     int pos, int map_qual, const char* cigar, int32_t mg_tid, int mate_pos,
94     int insert_size, const char* qseq, const char* quals=NULL,
95     GVec<char*>* aux_strings=NULL);
96     //const std::vector<std::string>* aux_strings=NULL);
97     void set_cigar(const char* cigar); //converts and adds CIGAR string given in plain SAM text format
98     void add_sequence(const char* qseq, int slen=-1); //adds the DNA sequence given in plain text format
99     void add_quals(const char* quals); //quality values string in Phred33 format
100     void add_aux(const char* str); //adds one aux field in plain SAM text format (e.g. "NM:i:1")
101     void add_aux(const char tag[2], char atype, int len, uint8_t *data) {
102     int ori_len = b->data_len;
103     b->data_len += 3 + len;
104     b->l_aux += 3 + len;
105     if (b->m_data < b->data_len) {
106     b->m_data = b->data_len;
107     kroundup32(b->m_data);
108     b->data = (uint8_t*)realloc(b->data, b->m_data);
109     }
110     b->data[ori_len] = tag[0]; b->data[ori_len + 1] = tag[1];
111     b->data[ori_len + 2] = atype;
112     memcpy(b->data + ori_len + 3, data, len);
113     }
114     void add_tag(const char tag[2], char atype, int len, uint8_t *data) {
115     //same with add_aux()
116     add_aux(tag,atype,len,data);
117     }
118     //--query methods:
119     uint32_t flags() { return b->core.flag; }
120     bool isUnmapped() { return ((b->core.flag & BAM_FUNMAP) != 0); }
121     bool isMapped() { return ((b->core.flag & BAM_FUNMAP) == 0); }
122     bool isPaired() { return ((b->core.flag & BAM_FPAIRED) != 0); }
123 gpertea 124 const char* name() { return bam1_qname(b); }
124 gpertea 121 int mateNum() {
125     int r=0;
126     if ((b->core.flag & BAM_FREAD1) != 0) r=1;
127     else if ((b->core.flag & BAM_FREAD2) != 0) r=2;
128     return r;
129     }
130 gpertea 124 bool revStrand() { return ((b->core.flag & BAM_FREVERSE) != 0); }
131 gpertea 121 const char* refName() {
132     return (bam_header!=NULL) ? bam_header->target_name[b->core.tid] : NULL;
133     }
134     int32_t refId() { return b->core.tid; }
135    
136 gpertea 124 int find_tag(const char tag[2], uint8_t* & s, char& tag_type);
137     char* tag_str(const char tag[2]); //return tag value for tag type 'Z'
138     int tag_int(const char tag[2]); //return numeric value of tag (for numeric types)
139     char tag_char(const char tag[2]); //return char value of tag (for type 'A')
140     char spliceStrand(); // '+', '-' from the XS tag, or '.' if no XS tag
141 gpertea 121 char* sequence(); //user must free this after use
142     char* qualities();//user must free this after use
143     char* cigar(); //returns text version of the CIGAR string; must be freed by user
144     };
145    
146    
147     class GBamReader {
148     samfile_t* bam_file;
149     char* fname;
150     public:
151 gpertea 126 void bopen(const char* filename) {
152 gpertea 121 bam_file=samopen(filename, "rb", 0);
153     //for .sam files:
154     //bam_file=samopen(fname, "r", 0);
155     if (bam_file==NULL)
156     GError("Error: could not open SAM file %s!\n",filename);
157     fname=Gstrdup(filename);
158     }
159 gpertea 129 GBamReader(const char* fn) {
160 gpertea 126 bam_file=NULL;
161 gpertea 121 fname=NULL;
162 gpertea 129 bopen(fn);
163 gpertea 121 }
164    
165 gpertea 126 bam_header_t* header() {
166     return bam_file? bam_file->header : NULL;
167     }
168     void bclose() {
169     if (bam_file) {
170     samclose(bam_file);
171     bam_file=NULL;
172     }
173     }
174 gpertea 121 ~GBamReader() {
175 gpertea 126 bclose();
176 gpertea 121 GFREE(fname);
177     }
178 gpertea 126 void rewind() {
179     if (fname==NULL) {
180     GMessage("Warning: GBamReader::rewind() called without a file name.\n");
181     return;
182     }
183     bclose();
184     char* ifname=fname;
185     bopen(ifname);
186     GFREE(ifname);
187     }
188 gpertea 121
189 gpertea 126 GBamRecord* next() {
190     if (bam_file==NULL)
191     GError("Warning: GBamReader::next() called with no open file.\n");
192     bam1_t* b = bam_init1();
193     if (samread(bam_file, b) >= 0) {
194     GBamRecord* bamrec=new GBamRecord(b, bam_file->header);
195     return bamrec;
196     }
197     else {
198     bam_destroy1(b);
199     return NULL;
200     }
201     }
202 gpertea 121 };
203    
204    
205     class GBamWriter {
206     samfile_t* bam_file;
207     bam_header_t* bam_header;
208     public:
209     void create(const char* fname, bool uncompressed=false) {
210     if (bam_header==NULL)
211     GError("Error: no bam_header for GBamWriter::create()!\n");
212     if (uncompressed) {
213     bam_file=samopen(fname, "wbu", bam_header);
214     }
215     else {
216     bam_file=samopen(fname, "wb", bam_header);
217     }
218     if (bam_file==NULL)
219     GError("Error: could not create BAM file %s!\n",fname);
220     }
221     void create(const char* fname, bam_header_t* bh, bool uncompressed=false) {
222     bam_header=bh;
223     create(fname,uncompressed);
224     }
225    
226     GBamWriter(const char* fname, bam_header_t* bh, bool uncompressed=false) {
227     create(fname, bh, uncompressed);
228     }
229    
230     GBamWriter(const char* fname, const char* samfname, bool uncompressed=false) {
231     tamFile samf_in=sam_open(samfname);
232     if (samf_in==NULL)
233     GError("Error: could not open SAM file %s\n", samfname);
234     bam_header=sam_header_read(samf_in);
235     if (bam_header==NULL)
236     GError("Error: could not read SAM header from %s!\n",samfname);
237     sam_close(samf_in);
238     create(fname, uncompressed);
239     }
240    
241     ~GBamWriter() {
242     samclose(bam_file);
243     bam_header_destroy(bam_header);
244     }
245     bam_header_t* get_header() { return bam_header; }
246     int32_t get_tid(const char *seq_name) {
247     if (bam_header==NULL)
248     GError("Error: missing SAM header (get_tid())\n");
249     return bam_get_tid(bam_header, seq_name);
250     }
251    
252     //just a convenience function for creating a new record, but it's NOT written
253     //given pos must be 1-based (so it'll be stored as pos-1 because BAM is 0-based)
254     GBamRecord* new_record(const char* qname, const char* gseqname,
255     int pos, bool reverse, const char* qseq, const char* cigar=NULL, const char* qual=NULL) {
256     int32_t gseq_tid=get_tid(gseqname);
257     if (gseq_tid < 0 && strcmp(gseqname, "*")) {
258     if (bam_header->n_targets == 0) {
259     GError("Error: missing/invalid SAM header\n");
260     } else
261     GMessage("Warning: reference '%s' not found in header, will consider it '*'.\n",
262     gseqname);
263     }
264    
265     return (new GBamRecord(qname, gseq_tid, pos, reverse, qseq, cigar, qual));
266     }
267    
268     GBamRecord* new_record(const char* qname, int32_t flags, const char* gseqname,
269     int pos, int map_qual, const char* cigar, const char* mgseqname, int mate_pos,
270     int insert_size, const char* qseq, const char* quals=NULL,
271     GVec<char*>* aux_strings=NULL) {
272     int32_t gseq_tid=get_tid(gseqname);
273     if (gseq_tid < 0 && strcmp(gseqname, "*")) {
274     if (bam_header->n_targets == 0) {
275     GError("Error: missing/invalid SAM header\n");
276     } else
277     GMessage("Warning: reference '%s' not found in header, will consider it '*'.\n",
278     gseqname);
279     }
280     int32_t mgseq_tid=-1;
281     if (mgseqname!=NULL) {
282     if (strcmp(mgseqname, "=")==0) {
283     mgseq_tid=gseq_tid;
284     }
285     else {
286     mgseq_tid=get_tid(mgseqname);
287     if (mgseq_tid < 0 && strcmp(mgseqname, "*")) {
288     GMessage("Warning: reference '%s' not found in header, will consider it '*'.\n",
289     mgseqname);
290     }
291     }
292     }
293     return (new GBamRecord(qname, flags, gseq_tid, pos, map_qual, cigar,
294     mgseq_tid, mate_pos, insert_size, qseq, quals, aux_strings));
295     }
296    
297     void write(GBamRecord* brec) {
298     if (brec!=NULL)
299     samwrite(this->bam_file,brec->get_b());
300     }
301     void write(bam1_t* b) {
302     samwrite(this->bam_file, b);
303     }
304     };
305    
306     #endif