ViewVC Help
View File | Revision Log | Show Annotations | View Changeset | Root Listing
root/gclib/gclib/GBam.cpp
(Generate patch)
# Line 2 | Line 2
2   #include <ctype.h>
3   #include "kstring.h"
4  
5 < //auxiliary functions for BAM record handling
5 > //auxiliary functions for low level BAM record creation
6   uint8_t* realloc_bdata(bam1_t *b, int size) {
7    if (b->m_data < size) {
8          b->m_data = size;
# Line 35 | Line 35
35     novel=true;
36     bam_header=NULL;
37     b=bam_init1();
38 <   b->core.tid=gseq_tid;
39 <   if (pos<=0) {
38 >   if (pos<=0 || gseq_tid<0) {
39                 b->core.pos=-1; //unmapped
41               //if (gseq_tid<0)
40                 b->core.flag |= BAM_FUNMAP;
41 +               gseq_tid=-1;
42                 }
43            else b->core.pos=pos-1; //BAM is 0-based
44 +   b->core.tid=gseq_tid;
45     b->core.qual=255;
46 +   b->core.mtid=-1;
47 +   b->core.mpos=-1;
48     int l_qseq=strlen(qseq);
49     //this may not be accurate, setting CIGAR is the correct way
50     //b->core.bin = bam_reg2bin(b->core.pos, b->core.pos+l_qseq-1);
# Line 168 | Line 170
170  
171   void GBamRecord::add_aux(const char* str) {
172       //requires: being called AFTER add_quals()
171     static char tag[2];
172     static uint8_t abuf[512];
173     //requires: being called AFTER add_quals()
173       int strl=strlen(str);
174       //int doff = b->core.l_qname + b->core.n_cigar*4 + (b->core.l_qseq+1)/2 + b->core.l_qseq + b->l_aux;
175       //int doff0=doff;
# Line 249 | Line 248
248    this->add_aux(tag, atype, alen, adata);
249    }//add_aux()
250  
251 + void interpret_CIGAR(char cop, int cl, int aln_start) {
252 + // NB: Closed ranges
253 + // gpos = current genomic position (will end up as right coordinate on the genome)
254 + // rpos = read position (will end up as the length of the read)
255 + // cop = CIGAR operation, cl = operation length
256 + int rpos = 0;
257 + int gpos = aln_start;
258 + int num_mismatches=0; //NM tag value = edit distance
259 + switch (cop) {
260 + case BAM_CDIFF:
261 +      num_mismatches+=cl;
262 + case BAM_CMATCH:
263 +      //have to actually check for mismatches: num_mismatches+=count_mismatches;
264 + case BAM_CEQUAL:
265 +      printf("[%d-%d]", gpos, gpos + cl - 1);
266 +      gpos+=cl;
267 +      rpos+=cl;
268 +      break;
269 + case BAM_CPAD:
270 +      // printf("[%d-%d]", pos, pos + cl - 1); // Spans positions, No Coverage
271 +      gpos+=cl;
272 +      break;
273 +
274 + case BAM_CHARD_CLIP:
275 +      // printf("[%d]", pos);  // No coverage
276 +      // gpos is not advanced by this operation
277 +      break;
278 +
279 + case BAM_CSOFT_CLIP:
280 +      //soft clipped bases, present in SEQ
281 +      rpos+=cl;
282 +      break;
283 + case BAM_CINS:
284 +      // No Coverage
285 +      // adds cl bases "throughput" but not genomic position "coverage" (gap in genomic seq)
286 +      // should also add cl to the number of "mismatches" (unaligned read bases)
287 +      num_mismatches+=cl;
288 +      // How you handle this is application dependent
289 +      // gpos is not advanced by this operation
290 +      rpos+=cl;
291 +      break;
292 +
293 + case BAM_CDEL:
294 +      printf("D"); //deletion in reference sequence relative to the read (gap in read sequence)
295 +      // printf("[%d-%d]", pos, pos + cl - 1);  
296 +      // Spans positions
297 +      num_mismatches+=cl;
298 +      gpos += cl;
299 +      break;
300 +
301 + case BAM_CREF_SKIP:
302 +      // intron
303 +      printf("S");
304 +      // printf("[%d-%d]", pos, pos + cl - 1); // Spans positions, No Coverage
305 +      //special skip operation, not contributing to "edit distance",
306 +      //   so num_mismatches is not updated
307 +      gpos+=cl;
308 +      break;
309 +
310 + default:
311 +      fprintf(stderr, "Unhandled cigar_op %d:%d\n", cop, cl);
312 +      //printf("?");
313 +  }
314 + } // interpret_CIGAR(), just a reference of CIGAR operations interpretation
315  
316   void GBamRecord::setupCoordinates() {
317     uint32_t *cigar = bam1_cigar(b);
318     const bam1_core_t *c = &b->core;
319     if (c->flag & BAM_FUNMAP) return; /* skip unmapped reads */
320     int l=0;
321 <   start=c->pos+1;
321 >   start=c->pos+1; //genomic start coordinate
322     int exstart=c->pos;
323     for (int i = 0; i < c->n_cigar; ++i) {
324          int op = cigar[i]&0xf;
# Line 266 | Line 329
329             else if (op == BAM_CREF_SKIP) { //N
330                 //intron starts
331                 //exon ends here
332 <               exons.Add(new GSeg(exstart+1,l));
332 >               GSeg exon(exstart+1,c->pos+l);
333 >               exons.Add(exon);
334                 l += cigar[i]>>4;
335                 exstart=c->pos+l;
336                 }
337          }
338 <   exons.Add(new GSeg(exstart+1, l));
339 <   end=c->pos+l;
338 >   GSeg exon(exstart+1,c->pos+l);
339 >   exons.Add(exon);
340 >   end=c->pos+l; //genomic start coordinate
341   }
342 <
342 > /*
343   int GBamRecord::find_tag(const char tag[2], uint8_t* & s, char& tag_type) {
344     //position s at the beginning of tag "data" (after the type char)
345     //returns the length of tag data, and tag type in tag_type
346 <   s=bam1_aux(b);
347 <   while (s < b->data + b->data_len) {
346 >   uint8_t* aux_start=bam1_aux(b);
347 >   s=aux_start;
348 >   while (s < aux_start + b->l_aux - 1) {
349       char key[2];
350       key[0] = (char)s[0]; key[1] = (char)s[1];
351       s += 2; tag_type = (char)*s; ++s;
# Line 317 | Line 383
383                           { m_inc = 2; }
384             else if ('i' == sub_type || 'I' == sub_type || 'f' == sub_type)
385                           { m_inc = 4; }
386 <           if (m_inc==0) GError("Error: invalid 'B' array subtype (%c)!\n",sub_type);
386 >           if (m_inc==0)
387 >              GError("Error: invalid 'B' array subtype (%c)!\n",sub_type);
388             inc += m_inc*n;
389             break;
390         case 'H':
391         case 'Z':
392           while (*(s+inc)) ++inc;
393 +          ++inc; // to skip the terminating \0
394           break;
395         } //switch (tag_type)
396 <     if (tag[0]==key[0] && tag[1]==key[1])
396 >     if (tag[0]==key[0] && tag[1]==key[1]) {
397          return inc;
398 +        }
399 +     s+=inc;
400       }//while aux data
401     return 0;
402     }
403 + */
404 + uint8_t* GBamRecord::find_tag(const char tag[2]) {
405 +   return bam_aux_get(this->b, tag);
406 + }
407  
408   char GBamRecord::tag_char(const char tag[2]) { //retrieve tag data as single char
409 <   uint8_t *s;
410 <   char tag_type=0;
411 <   int vlen=find_tag(tag, s, tag_type);
338 <   if (vlen==0) return 0;
339 <   //if (vlen>1) GWarning("Warning: tag %c%c value has length %d, but char was expected.\n",
340 <   //        tag[0],tag[1],vlen);
341 <   return (char)s[0];
409 >   uint8_t* s=find_tag(tag);
410 >   if (s) return ( bam_aux2A(s) );
411 >   return 0;
412    }
413  
414   int GBamRecord::tag_int(const char tag[2]) { //get the numeric value of tag
415 <   uint8_t *s;
416 <   char tag_type=0;
347 <   int vlen=find_tag(tag, s, tag_type);
348 <   if (vlen==0) return 0;
349 <   if (vlen==1) return (int)(*(int8_t*)s);
350 <    else if (vlen==2) return (int)(*(int16_t*)s);
351 <     else if (vlen==4) return (int)(*(int32_t*)s);
352 <     else GMessage("Warning: tag %c%c value has length %d, but int type was expected.\n",
353 <           tag[0],tag[1], vlen);
415 >   uint8_t *s=find_tag(tag);
416 >   if (s) return ( bam_aux2i(s) );
417     return 0;
418     }
419  
420   char* GBamRecord::tag_str(const char tag[2]) { //return string value for a tag
421 <   uint8_t *s;
422 <   char tag_type=0;
423 <   int vlen=find_tag(tag, s, tag_type);
361 <   if (vlen==0) return NULL; //not found
362 <   //if (vlen>1) GWarning("Warning: tag %c%c value has length %d, but char was expected.\n",
363 <   //        tag[0],tag[1],vlen);
364 <   return (char *)s;
421 >   uint8_t *s=find_tag(tag);
422 >   if (s) return ( bam_aux2Z(s) );
423 >   return NULL;
424     }
425  
426   char GBamRecord::spliceStrand() { // '+', '-' from the XS tag, or 0 if no XS tag
427 <   uint8_t *s;
428 <   char tag_type=0;
429 <   int vlen=find_tag("XS", s, tag_type);
371 <   if (vlen==0) return '.';
372 <   return (char)s[0];
427 >   char c=tag_char("XS");
428 >   if (c) return c;
429 >     else return '.';
430     }
431  
432   char* GBamRecord::sequence() { //user must free this after use

Diff Legend

Removed lines
+ Added lines
< Changed lines
> Changed lines