ViewVC Help
View File | Revision Log | Show Annotations | View Changeset | Root Listing
root/gclib/gclib/GBam.h
Revision: 310
Committed: Fri Mar 22 20:06:27 2013 UTC (6 years, 6 months ago) by gpertea
File size: 12494 byte(s)
Log Message:
sync with igm repo

Line User Rev File contents
1 gpertea 121 #ifndef _G_BAM_H
2     #define _G_BAM_H
3     #include "GBase.h"
4     #include "GList.hh"
5     #include "bam.h"
6     #include "sam.h"
7    
8     class GBamReader;
9     class GBamWriter;
10    
11     class GBamRecord: public GSeg {
12     friend class GBamReader;
13     friend class GBamWriter;
14     bam1_t* b;
15     // b->data has the following strings concatenated:
16     // qname (including the terminal \0)
17     // +cigar (each event encoded on 32 bits)
18     // +seq (4bit-encoded)
19     // +qual
20     // +aux
21 gpertea 310 bool novel; //if true, will also free b on delete
22 gpertea 121 bam_header_t* bam_header;
23 gpertea 225 char tag[2];
24     uint8_t abuf[512];
25 gpertea 121 public:
26 gpertea 299 GVec<GSeg> exons; //coordinates will be 1-based
27 gpertea 126 //created from a reader:
28 gpertea 310 void bfree_on_delete(bool b_free=true) { novel=b_free; }
29     GBamRecord(bam1_t* from_b=NULL, bam_header_t* b_header=NULL, bool b_free=true):exons(1) {
30 gpertea 121 bam_header=NULL;
31     if (from_b==NULL) {
32     b=bam_init1();
33 gpertea 225 novel=true;
34 gpertea 121 }
35     else {
36 gpertea 310 b=from_b; //it'll take over from_b
37     novel=b_free;
38 gpertea 121 }
39 gpertea 310
40 gpertea 121 bam_header=b_header;
41 gpertea 299 setupCoordinates();//set 1-based coordinates (start, end and exons array)
42 gpertea 121 }
43 gpertea 126
44     const GBamRecord& operator=(GBamRecord& r) {
45     //make a new copy of the bam1_t record etc.
46     clear();
47     b=bam_dup1(r.b);
48 gpertea 225 novel=true; //will also free b when destroyed
49 gpertea 126 start=r.start;
50     end=r.end;
51     exons = r.exons;
52     return *this;
53     }
54    
55 gpertea 121 void setupCoordinates();
56     void clear() {
57 gpertea 225 if (novel) {
58     bam_destroy1(b);
59 gpertea 310 //novel=false;
60 gpertea 225 }
61 gpertea 126 b=NULL;
62     exons.Clear();
63     bam_header=NULL;
64 gpertea 121 }
65    
66     ~GBamRecord() {
67 gpertea 126 clear();
68 gpertea 121 }
69    
70     void parse_error(const char* s) {
71     GError("BAM parsing error: %s\n", s);
72     }
73    
74     bam1_t* get_b() { return b; }
75    
76     void set_mdata(int32_t mtid, int32_t m0pos, //0-based coordinate, -1 if not available
77     int32_t isize=0) { //mate info for current record
78     b->core.mtid=mtid;
79     b->core.mpos=m0pos; // should be -1 if '*'
80     b->core.isize=isize; //should be 0 if not available
81     }
82    
83     void set_flags(uint16_t flags) {
84     b->core.flag=flags;
85     }
86    
87     //creates a new record from 1-based alignment coordinate
88     //quals should be given as Phred33
89     //Warning: pos and mate_pos must be given 1-based!
90     GBamRecord(const char* qname, int32_t gseq_tid,
91     int pos, bool reverse, const char* qseq, const char* cigar=NULL, const char* quals=NULL);
92     GBamRecord(const char* qname, int32_t flags, int32_t g_tid,
93     int pos, int map_qual, const char* cigar, int32_t mg_tid, int mate_pos,
94     int insert_size, const char* qseq, const char* quals=NULL,
95     GVec<char*>* aux_strings=NULL);
96     //const std::vector<std::string>* aux_strings=NULL);
97     void set_cigar(const char* cigar); //converts and adds CIGAR string given in plain SAM text format
98     void add_sequence(const char* qseq, int slen=-1); //adds the DNA sequence given in plain text format
99     void add_quals(const char* quals); //quality values string in Phred33 format
100     void add_aux(const char* str); //adds one aux field in plain SAM text format (e.g. "NM:i:1")
101     void add_aux(const char tag[2], char atype, int len, uint8_t *data) {
102 gpertea 225 //IMPORTANT: strings (Z,H) should include the terminal \0
103     int addz=0;
104     if ((atype=='Z' || atype=='H') && data[len-1]!=0) {
105     addz=1;
106     }
107 gpertea 121 int ori_len = b->data_len;
108 gpertea 225 b->data_len += 3 + len + addz;
109     b->l_aux += 3 + len + addz;
110 gpertea 121 if (b->m_data < b->data_len) {
111     b->m_data = b->data_len;
112     kroundup32(b->m_data);
113     b->data = (uint8_t*)realloc(b->data, b->m_data);
114     }
115     b->data[ori_len] = tag[0]; b->data[ori_len + 1] = tag[1];
116     b->data[ori_len + 2] = atype;
117 gpertea 225 if (addz) {
118     b->data[ori_len+len+4]=0;
119     }
120 gpertea 121 memcpy(b->data + ori_len + 3, data, len);
121     }
122 gpertea 225
123 gpertea 121 void add_tag(const char tag[2], char atype, int len, uint8_t *data) {
124     //same with add_aux()
125     add_aux(tag,atype,len,data);
126     }
127     //--query methods:
128     uint32_t flags() { return b->core.flag; }
129     bool isUnmapped() { return ((b->core.flag & BAM_FUNMAP) != 0); }
130     bool isMapped() { return ((b->core.flag & BAM_FUNMAP) == 0); }
131     bool isPaired() { return ((b->core.flag & BAM_FPAIRED) != 0); }
132 gpertea 124 const char* name() { return bam1_qname(b); }
133 gpertea 132 int pairOrder() {
134     //which read in the pair: 0 = unpaired, 1=first read, 2=second read
135 gpertea 121 int r=0;
136     if ((b->core.flag & BAM_FREAD1) != 0) r=1;
137     else if ((b->core.flag & BAM_FREAD2) != 0) r=2;
138     return r;
139     }
140 gpertea 132 bool revStrand() {
141     //this is the raw alignment strand, NOT the transcription (XS) strand
142     return ((b->core.flag & BAM_FREVERSE) != 0);
143     }
144     const char* refName() {
145     return (bam_header!=NULL) ?
146     ((b->core.tid<0) ? "*" : bam_header->target_name[b->core.tid]) : NULL;
147     }
148 gpertea 121 int32_t refId() { return b->core.tid; }
149 gpertea 132 int32_t mate_refId() { return b->core.mtid; }
150     const char* mate_refName() {
151     return (bam_header!=NULL) ?
152     ((b->core.mtid<0) ? "*" : bam_header->target_name[b->core.mtid]) : NULL;
153     }
154     int32_t insertSize() { return b->core.isize; }
155     int32_t mate_start() { return b->core.mpos<0? 0 : b->core.mpos+1; }
156 gpertea 121
157 gpertea 234 //int find_tag(const char tag[2], uint8_t* & s, char& tag_type);
158     uint8_t* find_tag(const char tag[2]);
159 gpertea 132 //position s at the beginning of tag data, tag_type is set to the found tag type
160     //returns length of tag data, or 0 if tag not found
161    
162 gpertea 124 char* tag_str(const char tag[2]); //return tag value for tag type 'Z'
163     int tag_int(const char tag[2]); //return numeric value of tag (for numeric types)
164     char tag_char(const char tag[2]); //return char value of tag (for type 'A')
165     char spliceStrand(); // '+', '-' from the XS tag, or '.' if no XS tag
166 gpertea 132
167     char* sequence(); //user should free after use
168     char* qualities();//user should free after use
169     char* cigar(); //returns text version of the CIGAR string; user must free
170 gpertea 121 };
171    
172 gpertea 310 // from sam.c:
173     #define FTYPE_BAM 1
174     #define FTYPE_READ 2
175 gpertea 121
176     class GBamReader {
177     samfile_t* bam_file;
178     char* fname;
179 gpertea 310 // from bam_import.c:
180     struct samtools_tamFile_t {
181     gzFile fp;
182     void *ks;
183     void *str;
184     uint64_t n_lines;
185     int is_first;
186     };
187    
188 gpertea 121 public:
189 gpertea 126 void bopen(const char* filename) {
190 gpertea 144 if (strcmp(filename, "-")==0) {
191     //if stdin was given, we HAVE to assume it's text SAM format, sorry
192     bam_file=samopen(filename, "r", 0);
193     }
194     else {
195     //BAM files have the zlib signature bytes at the beginning: 1F 8B 08
196     //if that's not present, we assume sam file
197     FILE* f=fopen(filename, "rb");
198     if (f==NULL) GError("Error opening file %s!\n", filename);
199     byte fsig[3];
200     size_t rd=fread(fsig, 1, 3, f);
201 gpertea 310 fclose(f);
202 gpertea 144 if (rd<3) GError("Error reading from file %s!\n",filename);
203     if ((fsig[0]==0x1F && fsig[1]==0x8B && fsig[2]==0x08) ||
204     (fsig[0]=='B' && fsig[1]=='A' && fsig[2]=='M')) {
205     bam_file=samopen(filename, "rb", 0); //BAM or uncompressed BAM
206     }
207     else { //assume text SAM file
208     bam_file=samopen(filename, "r", 0);
209     }
210     }
211 gpertea 121 if (bam_file==NULL)
212     GError("Error: could not open SAM file %s!\n",filename);
213     fname=Gstrdup(filename);
214     }
215 gpertea 129 GBamReader(const char* fn) {
216 gpertea 126 bam_file=NULL;
217 gpertea 121 fname=NULL;
218 gpertea 129 bopen(fn);
219 gpertea 121 }
220    
221 gpertea 126 bam_header_t* header() {
222     return bam_file? bam_file->header : NULL;
223     }
224     void bclose() {
225     if (bam_file) {
226     samclose(bam_file);
227     bam_file=NULL;
228     }
229     }
230 gpertea 121 ~GBamReader() {
231 gpertea 126 bclose();
232 gpertea 121 GFREE(fname);
233     }
234 gpertea 310 int64_t fpos() {
235     if ( bam_file->type & FTYPE_BAM )
236     return bgzf_tell(bam_file->x.bam);
237     else
238     return (int64_t)gztell(((samtools_tamFile_t*)(bam_file->x.tamr))->fp);
239     }
240     int64_t fseek(int64_t offs) {
241     if ( bam_file->type & FTYPE_BAM )
242     return bgzf_seek(bam_file->x.bam, offs, SEEK_SET);
243     else
244     return (int64_t)gzseek(((samtools_tamFile_t*)(bam_file->x.tamr))->fp, offs, SEEK_SET);
245     }
246 gpertea 126 void rewind() {
247     if (fname==NULL) {
248     GMessage("Warning: GBamReader::rewind() called without a file name.\n");
249     return;
250     }
251     bclose();
252     char* ifname=fname;
253     bopen(ifname);
254     GFREE(ifname);
255     }
256 gpertea 121
257 gpertea 126 GBamRecord* next() {
258     if (bam_file==NULL)
259     GError("Warning: GBamReader::next() called with no open file.\n");
260     bam1_t* b = bam_init1();
261     if (samread(bam_file, b) >= 0) {
262 gpertea 310 GBamRecord* bamrec=new GBamRecord(b, bam_file->header, true);
263 gpertea 126 return bamrec;
264     }
265     else {
266     bam_destroy1(b);
267     return NULL;
268     }
269     }
270 gpertea 121 };
271    
272    
273     class GBamWriter {
274     samfile_t* bam_file;
275     bam_header_t* bam_header;
276     public:
277     void create(const char* fname, bool uncompressed=false) {
278     if (bam_header==NULL)
279     GError("Error: no bam_header for GBamWriter::create()!\n");
280     if (uncompressed) {
281     bam_file=samopen(fname, "wbu", bam_header);
282     }
283     else {
284     bam_file=samopen(fname, "wb", bam_header);
285     }
286     if (bam_file==NULL)
287     GError("Error: could not create BAM file %s!\n",fname);
288     }
289     void create(const char* fname, bam_header_t* bh, bool uncompressed=false) {
290     bam_header=bh;
291     create(fname,uncompressed);
292     }
293    
294     GBamWriter(const char* fname, bam_header_t* bh, bool uncompressed=false) {
295     create(fname, bh, uncompressed);
296     }
297    
298     GBamWriter(const char* fname, const char* samfname, bool uncompressed=false) {
299     tamFile samf_in=sam_open(samfname);
300     if (samf_in==NULL)
301     GError("Error: could not open SAM file %s\n", samfname);
302     bam_header=sam_header_read(samf_in);
303     if (bam_header==NULL)
304     GError("Error: could not read SAM header from %s!\n",samfname);
305     sam_close(samf_in);
306     create(fname, uncompressed);
307     }
308    
309     ~GBamWriter() {
310     samclose(bam_file);
311     bam_header_destroy(bam_header);
312     }
313     bam_header_t* get_header() { return bam_header; }
314     int32_t get_tid(const char *seq_name) {
315     if (bam_header==NULL)
316     GError("Error: missing SAM header (get_tid())\n");
317     return bam_get_tid(bam_header, seq_name);
318     }
319    
320     //just a convenience function for creating a new record, but it's NOT written
321     //given pos must be 1-based (so it'll be stored as pos-1 because BAM is 0-based)
322     GBamRecord* new_record(const char* qname, const char* gseqname,
323     int pos, bool reverse, const char* qseq, const char* cigar=NULL, const char* qual=NULL) {
324     int32_t gseq_tid=get_tid(gseqname);
325     if (gseq_tid < 0 && strcmp(gseqname, "*")) {
326     if (bam_header->n_targets == 0) {
327     GError("Error: missing/invalid SAM header\n");
328     } else
329     GMessage("Warning: reference '%s' not found in header, will consider it '*'.\n",
330     gseqname);
331     }
332    
333     return (new GBamRecord(qname, gseq_tid, pos, reverse, qseq, cigar, qual));
334     }
335    
336     GBamRecord* new_record(const char* qname, int32_t flags, const char* gseqname,
337     int pos, int map_qual, const char* cigar, const char* mgseqname, int mate_pos,
338     int insert_size, const char* qseq, const char* quals=NULL,
339     GVec<char*>* aux_strings=NULL) {
340     int32_t gseq_tid=get_tid(gseqname);
341     if (gseq_tid < 0 && strcmp(gseqname, "*")) {
342     if (bam_header->n_targets == 0) {
343     GError("Error: missing/invalid SAM header\n");
344     } else
345     GMessage("Warning: reference '%s' not found in header, will consider it '*'.\n",
346     gseqname);
347     }
348     int32_t mgseq_tid=-1;
349     if (mgseqname!=NULL) {
350     if (strcmp(mgseqname, "=")==0) {
351     mgseq_tid=gseq_tid;
352     }
353     else {
354     mgseq_tid=get_tid(mgseqname);
355     if (mgseq_tid < 0 && strcmp(mgseqname, "*")) {
356     GMessage("Warning: reference '%s' not found in header, will consider it '*'.\n",
357     mgseqname);
358     }
359     }
360     }
361     return (new GBamRecord(qname, flags, gseq_tid, pos, map_qual, cigar,
362     mgseq_tid, mate_pos, insert_size, qseq, quals, aux_strings));
363     }
364    
365     void write(GBamRecord* brec) {
366     if (brec!=NULL)
367     samwrite(this->bam_file,brec->get_b());
368     }
369     void write(bam1_t* b) {
370     samwrite(this->bam_file, b);
371     }
372     };
373    
374     #endif