ViewVC Help
View File | Revision Log | Show Annotations | View Changeset | Root Listing
root/gclib/gclib/GBam.h
Revision: 124
Committed: Tue Dec 6 04:54:35 2011 UTC (7 years, 7 months ago) by gpertea
File size: 8866 byte(s)
Log Message:
GBam wip

Line User Rev File contents
1 gpertea 121 #ifndef _G_BAM_H
2     #define _G_BAM_H
3     #include "GBase.h"
4     #include "GList.hh"
5     #include "bam.h"
6     #include "sam.h"
7    
8    
9     class GBamReader;
10     class GBamWriter;
11    
12     class GBamRecord: public GSeg {
13     friend class GBamReader;
14     friend class GBamWriter;
15     bam1_t* b;
16     // b->data has the following strings concatenated:
17     // qname (including the terminal \0)
18     // +cigar (each event encoded on 32 bits)
19     // +seq (4bit-encoded)
20     // +qual
21     // +aux
22     bool novel;
23     bam_header_t* bam_header;
24    
25     public:
26     GPVec<GSeg> exons;
27 gpertea 124 GBamRecord(bam1_t* from_b=NULL, bam_header_t* b_header=NULL):exons(1) {
28 gpertea 121 bam_header=NULL;
29     if (from_b==NULL) {
30     b=bam_init1();
31     novel=true;
32     }
33     else {
34     b=from_b;
35     novel=false;
36     }
37     bam_header=b_header;
38     setupCoordinates();//sets start, end coordinate and populates exons array
39     }
40     void setupCoordinates();
41     void clear() {
42     if (novel) {
43     bam_destroy1(b);
44     }
45     novel=true;
46     b=bam_init1();
47     }
48    
49     ~GBamRecord() {
50     if (novel) { bam_destroy1(b); }
51     }
52    
53     void parse_error(const char* s) {
54     GError("BAM parsing error: %s\n", s);
55     }
56    
57     bam1_t* get_b() { return b; }
58    
59     void set_mdata(int32_t mtid, int32_t m0pos, //0-based coordinate, -1 if not available
60     int32_t isize=0) { //mate info for current record
61     b->core.mtid=mtid;
62     b->core.mpos=m0pos; // should be -1 if '*'
63     b->core.isize=isize; //should be 0 if not available
64     }
65    
66     void set_flags(uint16_t flags) {
67     b->core.flag=flags;
68     }
69    
70    
71    
72     //creates a new record from 1-based alignment coordinate
73     //quals should be given as Phred33
74     //Warning: pos and mate_pos must be given 1-based!
75     GBamRecord(const char* qname, int32_t gseq_tid,
76     int pos, bool reverse, const char* qseq, const char* cigar=NULL, const char* quals=NULL);
77     GBamRecord(const char* qname, int32_t flags, int32_t g_tid,
78     int pos, int map_qual, const char* cigar, int32_t mg_tid, int mate_pos,
79     int insert_size, const char* qseq, const char* quals=NULL,
80     GVec<char*>* aux_strings=NULL);
81     //const std::vector<std::string>* aux_strings=NULL);
82     void set_cigar(const char* cigar); //converts and adds CIGAR string given in plain SAM text format
83     void add_sequence(const char* qseq, int slen=-1); //adds the DNA sequence given in plain text format
84     void add_quals(const char* quals); //quality values string in Phred33 format
85     void add_aux(const char* str); //adds one aux field in plain SAM text format (e.g. "NM:i:1")
86     void add_aux(const char tag[2], char atype, int len, uint8_t *data) {
87     int ori_len = b->data_len;
88     b->data_len += 3 + len;
89     b->l_aux += 3 + len;
90     if (b->m_data < b->data_len) {
91     b->m_data = b->data_len;
92     kroundup32(b->m_data);
93     b->data = (uint8_t*)realloc(b->data, b->m_data);
94     }
95     b->data[ori_len] = tag[0]; b->data[ori_len + 1] = tag[1];
96     b->data[ori_len + 2] = atype;
97     memcpy(b->data + ori_len + 3, data, len);
98     }
99     void add_tag(const char tag[2], char atype, int len, uint8_t *data) {
100     //same with add_aux()
101     add_aux(tag,atype,len,data);
102     }
103     //--query methods:
104     uint32_t flags() { return b->core.flag; }
105     bool isUnmapped() { return ((b->core.flag & BAM_FUNMAP) != 0); }
106     bool isMapped() { return ((b->core.flag & BAM_FUNMAP) == 0); }
107     bool isPaired() { return ((b->core.flag & BAM_FPAIRED) != 0); }
108 gpertea 124 const char* name() { return bam1_qname(b); }
109 gpertea 121 int mateNum() {
110     int r=0;
111     if ((b->core.flag & BAM_FREAD1) != 0) r=1;
112     else if ((b->core.flag & BAM_FREAD2) != 0) r=2;
113     return r;
114     }
115 gpertea 124 bool revStrand() { return ((b->core.flag & BAM_FREVERSE) != 0); }
116 gpertea 121 const char* refName() {
117     return (bam_header!=NULL) ? bam_header->target_name[b->core.tid] : NULL;
118     }
119     int32_t refId() { return b->core.tid; }
120    
121 gpertea 124 int find_tag(const char tag[2], uint8_t* & s, char& tag_type);
122     char* tag_str(const char tag[2]); //return tag value for tag type 'Z'
123     int tag_int(const char tag[2]); //return numeric value of tag (for numeric types)
124     char tag_char(const char tag[2]); //return char value of tag (for type 'A')
125     char spliceStrand(); // '+', '-' from the XS tag, or '.' if no XS tag
126 gpertea 121 char* sequence(); //user must free this after use
127     char* qualities();//user must free this after use
128     char* cigar(); //returns text version of the CIGAR string; must be freed by user
129     };
130    
131    
132     class GBamReader {
133     samfile_t* bam_file;
134     char* fname;
135     public:
136     void open(const char* filename) {
137     bam_file=samopen(filename, "rb", 0);
138     //for .sam files:
139     //bam_file=samopen(fname, "r", 0);
140     if (bam_file==NULL)
141     GError("Error: could not open SAM file %s!\n",filename);
142     fname=Gstrdup(filename);
143     }
144    
145     GBamReader(const char* fname) {
146     fname=NULL;
147     open(fname);
148     }
149    
150     ~GBamReader() {
151     samclose(bam_file);
152     GFREE(fname);
153     }
154    
155     };
156    
157    
158     class GBamWriter {
159     samfile_t* bam_file;
160     bam_header_t* bam_header;
161     public:
162     void create(const char* fname, bool uncompressed=false) {
163     if (bam_header==NULL)
164     GError("Error: no bam_header for GBamWriter::create()!\n");
165     if (uncompressed) {
166     bam_file=samopen(fname, "wbu", bam_header);
167     }
168     else {
169     bam_file=samopen(fname, "wb", bam_header);
170     }
171     if (bam_file==NULL)
172     GError("Error: could not create BAM file %s!\n",fname);
173     }
174     void create(const char* fname, bam_header_t* bh, bool uncompressed=false) {
175     bam_header=bh;
176     create(fname,uncompressed);
177     }
178    
179     GBamWriter(const char* fname, bam_header_t* bh, bool uncompressed=false) {
180     create(fname, bh, uncompressed);
181     }
182    
183     GBamWriter(const char* fname, const char* samfname, bool uncompressed=false) {
184     tamFile samf_in=sam_open(samfname);
185     if (samf_in==NULL)
186     GError("Error: could not open SAM file %s\n", samfname);
187     bam_header=sam_header_read(samf_in);
188     if (bam_header==NULL)
189     GError("Error: could not read SAM header from %s!\n",samfname);
190     sam_close(samf_in);
191     create(fname, uncompressed);
192     }
193    
194     ~GBamWriter() {
195     samclose(bam_file);
196     bam_header_destroy(bam_header);
197     }
198     bam_header_t* get_header() { return bam_header; }
199     int32_t get_tid(const char *seq_name) {
200     if (bam_header==NULL)
201     GError("Error: missing SAM header (get_tid())\n");
202     return bam_get_tid(bam_header, seq_name);
203     }
204    
205     //just a convenience function for creating a new record, but it's NOT written
206     //given pos must be 1-based (so it'll be stored as pos-1 because BAM is 0-based)
207     GBamRecord* new_record(const char* qname, const char* gseqname,
208     int pos, bool reverse, const char* qseq, const char* cigar=NULL, const char* qual=NULL) {
209     int32_t gseq_tid=get_tid(gseqname);
210     if (gseq_tid < 0 && strcmp(gseqname, "*")) {
211     if (bam_header->n_targets == 0) {
212     GError("Error: missing/invalid SAM header\n");
213     } else
214     GMessage("Warning: reference '%s' not found in header, will consider it '*'.\n",
215     gseqname);
216     }
217    
218     return (new GBamRecord(qname, gseq_tid, pos, reverse, qseq, cigar, qual));
219     }
220    
221     GBamRecord* new_record(const char* qname, int32_t flags, const char* gseqname,
222     int pos, int map_qual, const char* cigar, const char* mgseqname, int mate_pos,
223     int insert_size, const char* qseq, const char* quals=NULL,
224     GVec<char*>* aux_strings=NULL) {
225     int32_t gseq_tid=get_tid(gseqname);
226     if (gseq_tid < 0 && strcmp(gseqname, "*")) {
227     if (bam_header->n_targets == 0) {
228     GError("Error: missing/invalid SAM header\n");
229     } else
230     GMessage("Warning: reference '%s' not found in header, will consider it '*'.\n",
231     gseqname);
232     }
233     int32_t mgseq_tid=-1;
234     if (mgseqname!=NULL) {
235     if (strcmp(mgseqname, "=")==0) {
236     mgseq_tid=gseq_tid;
237     }
238     else {
239     mgseq_tid=get_tid(mgseqname);
240     if (mgseq_tid < 0 && strcmp(mgseqname, "*")) {
241     GMessage("Warning: reference '%s' not found in header, will consider it '*'.\n",
242     mgseqname);
243     }
244     }
245     }
246     return (new GBamRecord(qname, flags, gseq_tid, pos, map_qual, cigar,
247     mgseq_tid, mate_pos, insert_size, qseq, quals, aux_strings));
248     }
249    
250     void write(GBamRecord* brec) {
251     if (brec!=NULL)
252     samwrite(this->bam_file,brec->get_b());
253     }
254     void write(bam1_t* b) {
255     samwrite(this->bam_file, b);
256     }
257     };
258    
259     #endif