ViewVC Help
View File | Revision Log | Show Annotations | View Changeset | Root Listing
root/gclib/gclib/GBam.h
Revision: 234
Committed: Wed Apr 11 21:05:36 2012 UTC (7 years, 7 months ago) by gpertea
File size: 11783 byte(s)
Log Message:
fixed find_tag() to use the bam_aux_get()

Line User Rev File contents
1 gpertea 121 #ifndef _G_BAM_H
2     #define _G_BAM_H
3     #include "GBase.h"
4     #include "GList.hh"
5     #include "bam.h"
6     #include "sam.h"
7    
8    
9     class GBamReader;
10     class GBamWriter;
11    
12     class GBamRecord: public GSeg {
13     friend class GBamReader;
14     friend class GBamWriter;
15     bam1_t* b;
16     // b->data has the following strings concatenated:
17     // qname (including the terminal \0)
18     // +cigar (each event encoded on 32 bits)
19     // +seq (4bit-encoded)
20     // +qual
21     // +aux
22 gpertea 225 bool novel; //created from scratch, not from a given bam1_t* record
23 gpertea 121 bam_header_t* bam_header;
24 gpertea 225 char tag[2];
25     uint8_t abuf[512];
26 gpertea 121 public:
27 gpertea 126 GVec<GSeg> exons;
28     //created from a reader:
29 gpertea 124 GBamRecord(bam1_t* from_b=NULL, bam_header_t* b_header=NULL):exons(1) {
30 gpertea 121 bam_header=NULL;
31     if (from_b==NULL) {
32     b=bam_init1();
33 gpertea 225 novel=true;
34 gpertea 121 }
35     else {
36 gpertea 225 b=from_b; //it'll take over from_b but not free it when destroyed
37     novel=false;
38 gpertea 121 }
39     bam_header=b_header;
40     setupCoordinates();//sets start, end coordinate and populates exons array
41     }
42 gpertea 126
43     const GBamRecord& operator=(GBamRecord& r) {
44     //make a new copy of the bam1_t record etc.
45     clear();
46     b=bam_dup1(r.b);
47 gpertea 225 novel=true; //will also free b when destroyed
48 gpertea 126 start=r.start;
49     end=r.end;
50     exons = r.exons;
51     return *this;
52     }
53    
54 gpertea 121 void setupCoordinates();
55     void clear() {
56 gpertea 225 if (novel) {
57     bam_destroy1(b);
58     novel=false;
59     }
60 gpertea 126 b=NULL;
61     exons.Clear();
62     bam_header=NULL;
63 gpertea 121 }
64    
65     ~GBamRecord() {
66 gpertea 126 clear();
67 gpertea 121 }
68    
69     void parse_error(const char* s) {
70     GError("BAM parsing error: %s\n", s);
71     }
72    
73     bam1_t* get_b() { return b; }
74    
75     void set_mdata(int32_t mtid, int32_t m0pos, //0-based coordinate, -1 if not available
76     int32_t isize=0) { //mate info for current record
77     b->core.mtid=mtid;
78     b->core.mpos=m0pos; // should be -1 if '*'
79     b->core.isize=isize; //should be 0 if not available
80     }
81    
82     void set_flags(uint16_t flags) {
83     b->core.flag=flags;
84     }
85    
86     //creates a new record from 1-based alignment coordinate
87     //quals should be given as Phred33
88     //Warning: pos and mate_pos must be given 1-based!
89     GBamRecord(const char* qname, int32_t gseq_tid,
90     int pos, bool reverse, const char* qseq, const char* cigar=NULL, const char* quals=NULL);
91     GBamRecord(const char* qname, int32_t flags, int32_t g_tid,
92     int pos, int map_qual, const char* cigar, int32_t mg_tid, int mate_pos,
93     int insert_size, const char* qseq, const char* quals=NULL,
94     GVec<char*>* aux_strings=NULL);
95     //const std::vector<std::string>* aux_strings=NULL);
96     void set_cigar(const char* cigar); //converts and adds CIGAR string given in plain SAM text format
97     void add_sequence(const char* qseq, int slen=-1); //adds the DNA sequence given in plain text format
98     void add_quals(const char* quals); //quality values string in Phred33 format
99     void add_aux(const char* str); //adds one aux field in plain SAM text format (e.g. "NM:i:1")
100     void add_aux(const char tag[2], char atype, int len, uint8_t *data) {
101 gpertea 225 //IMPORTANT: strings (Z,H) should include the terminal \0
102     int addz=0;
103     if ((atype=='Z' || atype=='H') && data[len-1]!=0) {
104     addz=1;
105     }
106 gpertea 121 int ori_len = b->data_len;
107 gpertea 225 b->data_len += 3 + len + addz;
108     b->l_aux += 3 + len + addz;
109 gpertea 121 if (b->m_data < b->data_len) {
110     b->m_data = b->data_len;
111     kroundup32(b->m_data);
112     b->data = (uint8_t*)realloc(b->data, b->m_data);
113     }
114     b->data[ori_len] = tag[0]; b->data[ori_len + 1] = tag[1];
115     b->data[ori_len + 2] = atype;
116 gpertea 225 if (addz) {
117     b->data[ori_len+len+4]=0;
118     }
119 gpertea 121 memcpy(b->data + ori_len + 3, data, len);
120     }
121 gpertea 225
122 gpertea 121 void add_tag(const char tag[2], char atype, int len, uint8_t *data) {
123     //same with add_aux()
124     add_aux(tag,atype,len,data);
125     }
126     //--query methods:
127     uint32_t flags() { return b->core.flag; }
128     bool isUnmapped() { return ((b->core.flag & BAM_FUNMAP) != 0); }
129     bool isMapped() { return ((b->core.flag & BAM_FUNMAP) == 0); }
130     bool isPaired() { return ((b->core.flag & BAM_FPAIRED) != 0); }
131 gpertea 124 const char* name() { return bam1_qname(b); }
132 gpertea 132 int pairOrder() {
133     //which read in the pair: 0 = unpaired, 1=first read, 2=second read
134 gpertea 121 int r=0;
135     if ((b->core.flag & BAM_FREAD1) != 0) r=1;
136     else if ((b->core.flag & BAM_FREAD2) != 0) r=2;
137     return r;
138     }
139 gpertea 132 bool revStrand() {
140     //this is the raw alignment strand, NOT the transcription (XS) strand
141     return ((b->core.flag & BAM_FREVERSE) != 0);
142     }
143     const char* refName() {
144     return (bam_header!=NULL) ?
145     ((b->core.tid<0) ? "*" : bam_header->target_name[b->core.tid]) : NULL;
146     }
147 gpertea 121 int32_t refId() { return b->core.tid; }
148 gpertea 132 int32_t mate_refId() { return b->core.mtid; }
149     const char* mate_refName() {
150     return (bam_header!=NULL) ?
151     ((b->core.mtid<0) ? "*" : bam_header->target_name[b->core.mtid]) : NULL;
152     }
153     int32_t insertSize() { return b->core.isize; }
154     int32_t mate_start() { return b->core.mpos<0? 0 : b->core.mpos+1; }
155 gpertea 121
156 gpertea 234 //int find_tag(const char tag[2], uint8_t* & s, char& tag_type);
157     uint8_t* find_tag(const char tag[2]);
158 gpertea 132 //position s at the beginning of tag data, tag_type is set to the found tag type
159     //returns length of tag data, or 0 if tag not found
160    
161 gpertea 124 char* tag_str(const char tag[2]); //return tag value for tag type 'Z'
162     int tag_int(const char tag[2]); //return numeric value of tag (for numeric types)
163     char tag_char(const char tag[2]); //return char value of tag (for type 'A')
164     char spliceStrand(); // '+', '-' from the XS tag, or '.' if no XS tag
165 gpertea 132
166     char* sequence(); //user should free after use
167     char* qualities();//user should free after use
168     char* cigar(); //returns text version of the CIGAR string; user must free
169 gpertea 121 };
170    
171    
172     class GBamReader {
173     samfile_t* bam_file;
174     char* fname;
175     public:
176 gpertea 126 void bopen(const char* filename) {
177 gpertea 144 if (strcmp(filename, "-")==0) {
178     //if stdin was given, we HAVE to assume it's text SAM format, sorry
179     bam_file=samopen(filename, "r", 0);
180     }
181     else {
182     //BAM files have the zlib signature bytes at the beginning: 1F 8B 08
183     //if that's not present, we assume sam file
184     FILE* f=fopen(filename, "rb");
185     if (f==NULL) GError("Error opening file %s!\n", filename);
186     byte fsig[3];
187     size_t rd=fread(fsig, 1, 3, f);
188     if (rd<3) GError("Error reading from file %s!\n",filename);
189     if ((fsig[0]==0x1F && fsig[1]==0x8B && fsig[2]==0x08) ||
190     (fsig[0]=='B' && fsig[1]=='A' && fsig[2]=='M')) {
191     bam_file=samopen(filename, "rb", 0); //BAM or uncompressed BAM
192     }
193     else { //assume text SAM file
194     bam_file=samopen(filename, "r", 0);
195     }
196     }
197 gpertea 121 if (bam_file==NULL)
198     GError("Error: could not open SAM file %s!\n",filename);
199     fname=Gstrdup(filename);
200     }
201 gpertea 129 GBamReader(const char* fn) {
202 gpertea 126 bam_file=NULL;
203 gpertea 121 fname=NULL;
204 gpertea 129 bopen(fn);
205 gpertea 121 }
206    
207 gpertea 126 bam_header_t* header() {
208     return bam_file? bam_file->header : NULL;
209     }
210     void bclose() {
211     if (bam_file) {
212     samclose(bam_file);
213     bam_file=NULL;
214     }
215     }
216 gpertea 121 ~GBamReader() {
217 gpertea 126 bclose();
218 gpertea 121 GFREE(fname);
219     }
220 gpertea 126 void rewind() {
221     if (fname==NULL) {
222     GMessage("Warning: GBamReader::rewind() called without a file name.\n");
223     return;
224     }
225     bclose();
226     char* ifname=fname;
227     bopen(ifname);
228     GFREE(ifname);
229     }
230 gpertea 121
231 gpertea 126 GBamRecord* next() {
232     if (bam_file==NULL)
233     GError("Warning: GBamReader::next() called with no open file.\n");
234     bam1_t* b = bam_init1();
235     if (samread(bam_file, b) >= 0) {
236     GBamRecord* bamrec=new GBamRecord(b, bam_file->header);
237     return bamrec;
238     }
239     else {
240     bam_destroy1(b);
241     return NULL;
242     }
243     }
244 gpertea 121 };
245    
246    
247     class GBamWriter {
248     samfile_t* bam_file;
249     bam_header_t* bam_header;
250     public:
251     void create(const char* fname, bool uncompressed=false) {
252     if (bam_header==NULL)
253     GError("Error: no bam_header for GBamWriter::create()!\n");
254     if (uncompressed) {
255     bam_file=samopen(fname, "wbu", bam_header);
256     }
257     else {
258     bam_file=samopen(fname, "wb", bam_header);
259     }
260     if (bam_file==NULL)
261     GError("Error: could not create BAM file %s!\n",fname);
262     }
263     void create(const char* fname, bam_header_t* bh, bool uncompressed=false) {
264     bam_header=bh;
265     create(fname,uncompressed);
266     }
267    
268     GBamWriter(const char* fname, bam_header_t* bh, bool uncompressed=false) {
269     create(fname, bh, uncompressed);
270     }
271    
272     GBamWriter(const char* fname, const char* samfname, bool uncompressed=false) {
273     tamFile samf_in=sam_open(samfname);
274     if (samf_in==NULL)
275     GError("Error: could not open SAM file %s\n", samfname);
276     bam_header=sam_header_read(samf_in);
277     if (bam_header==NULL)
278     GError("Error: could not read SAM header from %s!\n",samfname);
279     sam_close(samf_in);
280     create(fname, uncompressed);
281     }
282    
283     ~GBamWriter() {
284     samclose(bam_file);
285     bam_header_destroy(bam_header);
286     }
287     bam_header_t* get_header() { return bam_header; }
288     int32_t get_tid(const char *seq_name) {
289     if (bam_header==NULL)
290     GError("Error: missing SAM header (get_tid())\n");
291     return bam_get_tid(bam_header, seq_name);
292     }
293    
294     //just a convenience function for creating a new record, but it's NOT written
295     //given pos must be 1-based (so it'll be stored as pos-1 because BAM is 0-based)
296     GBamRecord* new_record(const char* qname, const char* gseqname,
297     int pos, bool reverse, const char* qseq, const char* cigar=NULL, const char* qual=NULL) {
298     int32_t gseq_tid=get_tid(gseqname);
299     if (gseq_tid < 0 && strcmp(gseqname, "*")) {
300     if (bam_header->n_targets == 0) {
301     GError("Error: missing/invalid SAM header\n");
302     } else
303     GMessage("Warning: reference '%s' not found in header, will consider it '*'.\n",
304     gseqname);
305     }
306    
307     return (new GBamRecord(qname, gseq_tid, pos, reverse, qseq, cigar, qual));
308     }
309    
310     GBamRecord* new_record(const char* qname, int32_t flags, const char* gseqname,
311     int pos, int map_qual, const char* cigar, const char* mgseqname, int mate_pos,
312     int insert_size, const char* qseq, const char* quals=NULL,
313     GVec<char*>* aux_strings=NULL) {
314     int32_t gseq_tid=get_tid(gseqname);
315     if (gseq_tid < 0 && strcmp(gseqname, "*")) {
316     if (bam_header->n_targets == 0) {
317     GError("Error: missing/invalid SAM header\n");
318     } else
319     GMessage("Warning: reference '%s' not found in header, will consider it '*'.\n",
320     gseqname);
321     }
322     int32_t mgseq_tid=-1;
323     if (mgseqname!=NULL) {
324     if (strcmp(mgseqname, "=")==0) {
325     mgseq_tid=gseq_tid;
326     }
327     else {
328     mgseq_tid=get_tid(mgseqname);
329     if (mgseq_tid < 0 && strcmp(mgseqname, "*")) {
330     GMessage("Warning: reference '%s' not found in header, will consider it '*'.\n",
331     mgseqname);
332     }
333     }
334     }
335     return (new GBamRecord(qname, flags, gseq_tid, pos, map_qual, cigar,
336     mgseq_tid, mate_pos, insert_size, qseq, quals, aux_strings));
337     }
338    
339     void write(GBamRecord* brec) {
340     if (brec!=NULL)
341     samwrite(this->bam_file,brec->get_b());
342     }
343     void write(bam1_t* b) {
344     samwrite(this->bam_file, b);
345     }
346     };
347    
348     #endif