ViewVC Help
View File | Revision Log | Show Annotations | View Changeset | Root Listing
root/gclib/tophat_cpp/bwt_map.cpp
Revision: 139
Committed: Thu Dec 15 22:36:13 2011 UTC (8 years, 5 months ago) by gpertea
File size: 57033 byte(s)
Log Message:
spliceCigar() implemented, needs testing

Line User Rev File contents
1 gpertea 29 /*
2     * bwt_map.cpp
3     * TopHat
4     *
5     * Created by Cole Trapnell on 11/17/08.
6     * Copyright 2008 Cole Trapnell. All rights reserved.
7     *
8     */
9    
10     #ifdef HAVE_CONFIG_H
11     #include <config.h>
12     #endif
13    
14     #include <cassert>
15     #include <cstdlib>
16     #include <iostream>
17     #include <set>
18     #include <vector>
19     #include <cmath>
20    
21     #include "common.h"
22     #include "bwt_map.h"
23     #include "tokenize.h"
24     #include "reads.h"
25    
26     using namespace std;
27    
28     void HitTable::add_hit(const BowtieHit& bh, bool check_uniqueness)
29     {
30     uint32_t reference_id = bh.ref_id();
31    
32     pair<RefHits::iterator, bool> ret =
33     _hits_for_ref.insert(make_pair(reference_id, HitList()));
34     HitList& hl = ret.first->second;
35    
36     if (check_uniqueness)
37     {
38     // Check uniqueness, in case we are adding spliced hits from
39     // several spliced alignment sources (e.g. de novo hashing + Bowtie
40     // against a user-supplied index). We don't want to count the same
41     // alignment twice if it happened to be found by more than one method
42     HitList::const_iterator lb = lower_bound(hl.begin(), hl.end(), bh, hit_insert_id_lt);
43     HitList::const_iterator ub = upper_bound(hl.begin(), hl.end(), bh, hit_insert_id_lt);
44     for (; lb != ub && lb != hl.end(); ++lb)
45     {
46     if (*lb == bh)
47     {
48     //fprintf(stderr, "Chucking duplicate read %d by identity\n", bh.insert_id());
49     return;
50     }
51    
52     if (lb->insert_id() == bh.insert_id() &&
53     lb->ref_id() == bh.ref_id() &&
54     lb->antisense_align() == bh.antisense_align())
55     {
56     // If we get here, we may be looking at the same alignment
57     // However, spanning_reads may report a shorter, trimmed alignment
58     // so not all fields will be equal. If they just disagree on the
59     // ends, and don't indicate a different junction coord, the
60     // alignments are the same.
61    
62     if ((lb->left() <= bh.left() && lb->right() >= bh.right()) ||
63     (bh.left() <= lb->left() && bh.right() >= lb->right()))
64     {
65     vector<pair<int,int> > lb_gaps, bh_gaps;
66     lb->gaps(lb_gaps);
67     bh.gaps(bh_gaps);
68     if (lb_gaps == bh_gaps)
69     {
70     // One alignment is contained in the other, they agree on
71     // where the gaps, if any, are, and they share an id
72     // => this is a redundant aligment, so toss it
73     //fprintf(stderr, "Chucking duplicate read %d by gap agreement\n", bh.insert_id());
74     return;
75     }
76     }
77     }
78     }
79     }
80     _total_hits++;
81     hl.push_back(bh);
82     }
83    
84     bool hit_insert_id_lt(const BowtieHit& h1, const BowtieHit& h2)
85     {
86     return h1.insert_id() < h2.insert_id();
87     }
88    
89     void LineHitFactory::openStream(HitStream& hs)
90     {
91     if (hs._hit_file==NULL && !hs._hit_file_name.empty()) {
92     //open the file for HitStream here
93     hs._hit_file=fopen(hs._hit_file_name.c_str(),"r");
94     if (hs._hit_file==NULL)
95     err_die("Error opening HitStream file %s\n",hs._hit_file_name.c_str());
96     return;
97     }
98     if (hs._fzpipe!=NULL) {
99     hs._hit_file=hs._fzpipe->file;
100     }
101     }
102    
103     void LineHitFactory::rewind(HitStream& hs)
104     {
105     if (hs._fzpipe!=NULL) {
106     hs._fzpipe->rewind();
107     hs._hit_file=hs._fzpipe->file;
108     }
109     else if (hs._hit_file) ::rewind((FILE*)(hs._hit_file));
110     }
111    
112     bool LineHitFactory::next_record(HitStream& hs, const char*& buf, size_t& buf_size) {
113     FILE* f=(FILE *)(hs._hit_file);
114     bool new_rec = (fgets(_hit_buf, _hit_buf_max_sz - 1, f)!=NULL);
115     if (!new_rec || feof(f)) {
116     hs._eof=true;
117     return false;
118     }
119     ++_line_num;
120     char* nl = strrchr(_hit_buf, '\n');
121     if (nl) *nl = 0;
122     buf = _hit_buf;
123     buf_size = _hit_buf_max_sz - 1;
124     return true;
125     }
126    
127     void LineHitFactory::closeStream(HitStream& hs) {
128     if (hs._fzpipe!=NULL) {
129     hs._fzpipe->close();
130     return;
131     }
132     if (hs._hit_file!=NULL) {
133     fclose((FILE*)(hs._hit_file));
134     hs._hit_file=NULL;
135     }
136     }
137     void BAMHitFactory::openStream(HitStream& hs) {
138     if (hs._hit_file==NULL) {
139     if (hs._hit_file_name.empty())
140     //err_die("Error: invalid HitStream set for BAMHitFactory(file name missing)\n");
141     return; //invalid stream, could be just a place holder
142     //open the file here if not already open
143     string fext=getFext(hs._hit_file_name);
144     if (fext=="sam")
145     hs._hit_file = samopen(hs._hit_file_name.c_str(), "r", 0);
146     else
147     hs._hit_file = samopen(hs._hit_file_name.c_str(), "rb", 0);
148    
149     samfile_t* sam_file=(samfile_t*)(hs._hit_file);
150    
151     if (sam_file == NULL)
152     err_die("Error opening SAM file %s\n", hs._hit_file_name.c_str());
153     if (sam_file->header == NULL)
154     err_die("Error: no SAM header found for file %s\n", hs._hit_file_name.c_str());
155     memset(&_next_hit, 0, sizeof(_next_hit));
156     //_beginning = bgzf_tell(sam_file->x.bam);
157     _sam_header=sam_file->header;
158     if (inspect_header(hs) == false)
159     err_die("Error: invalid SAM header for file %s\n",
160     hs._hit_file_name.c_str());
161     }
162     }
163    
164     void BAMHitFactory::closeStream(HitStream& hs) {
165     if (hs._hit_file) {
166     samclose((samfile_t*)(hs._hit_file));
167     }
168     hs._hit_file=NULL;
169     _sam_header=NULL;
170     }
171    
172     void BAMHitFactory::rewind(HitStream& hs)
173     {
174     /*
175     if (_hit_file && ((samfile_t*)_hit_file)->x.bam)
176     {
177     bgzf_seek(((samfile_t*)_hit_file)->x.bam, _beginning, SEEK_SET);
178     _eof = false;
179     }
180     */
181     this->closeStream(hs);
182     this->openStream(hs);
183     }
184    
185     string BAMHitFactory::hitfile_rec(HitStream& hs, const char* hit_buf) {
186     const bam1_t* bamrec=(const bam1_t*)hit_buf;
187     char* tamline=bam_format1(((samfile_t*)(hs._hit_file))->header, bamrec);
188     string sam_line(tamline);
189     free(tamline);
190     return sam_line;
191     }
192    
193     bool BAMHitFactory::next_record(HitStream& hs, const char*& buf, size_t& buf_size) {
194     if (_next_hit.data) {
195     free(_next_hit.data);
196     _next_hit.data = NULL;
197     }
198     _sam_header=((samfile_t*)(hs._hit_file))->header; //needed by get_hit_from_buf later on
199     if (hs.eof() || !hs.ready()) return false;
200    
201     //mark_curr_pos();
202    
203     memset(&_next_hit, 0, sizeof(_next_hit));
204    
205     int bytes_read = samread((samfile_t*)(hs._hit_file), &_next_hit);
206     if (bytes_read <= 0) {
207     hs._eof = true;
208     return false;
209     }
210     buf = (const char*)&_next_hit;
211     buf_size = bytes_read;
212     return true;
213     }
214    
215    
216     BowtieHit HitFactory::create_hit(const string& insert_name,
217     const string& ref_name,
218     int left,
219     const vector<CigarOp>& cigar,
220     bool antisense_aln,
221     bool antisense_splice,
222     unsigned char edit_dist,
223     unsigned char splice_mms,
224     bool end)
225     {
226     uint64_t insert_id = _insert_table.get_id(insert_name);
227     uint32_t reference_id = _ref_table.get_id(ref_name, NULL, 0);
228    
229     return BowtieHit(reference_id,
230     insert_id,
231     left,
232     cigar,
233     antisense_aln,
234     antisense_splice,
235     edit_dist,
236     splice_mms,
237     end);
238     }
239    
240     BowtieHit HitFactory::create_hit(const string& insert_name,
241     const string& ref_name,
242     uint32_t left,
243     uint32_t read_len,
244     bool antisense_aln,
245     unsigned char edit_dist,
246     bool end)
247     {
248     uint64_t insert_id = _insert_table.get_id(insert_name);
249     uint32_t reference_id = _ref_table.get_id(ref_name, NULL, 0);
250    
251     return BowtieHit(reference_id,
252     insert_id,
253     left,
254     read_len,
255     antisense_aln,
256     edit_dist,
257     end);
258     }
259    
260     bool BowtieHitFactory::get_hit_from_buf(const char* orig_bwt_buf,
261     BowtieHit& bh,
262     bool strip_slash,
263     char* name_out,
264     char* name_tags,
265     char* seq,
266     char* qual)
267     {
268     if (!orig_bwt_buf || !*orig_bwt_buf)
269     return false;
270    
271     static const int buf_size = 2048;
272    
273     char bwt_buf[buf_size];
274     strcpy(bwt_buf, orig_bwt_buf);
275     //const char* bwt_fmt_str = "%s %c %s %d %s %s %d %s";
276    
277     char orientation;
278    
279     //int bwtf_ret = 0;
280     //uint32_t seqid = 0;
281    
282     char text_name[buf_size];
283     unsigned int text_offset;
284    
285    
286     //unsigned int other_occs;
287     char mismatches[buf_size];
288     //memset(mismatches, 0, sizeof(mismatches));
289    
290     const char* buf = bwt_buf;
291     char* name = get_token((char**)&buf,"\t");
292     char* orientation_str = get_token((char**)&buf,"\t");
293     char* text_name_str = get_token((char**)&buf,"\t");
294    
295     strcpy(text_name, text_name_str);
296    
297     char* text_offset_str = get_token((char**)&buf,"\t");
298     char* seq_str = get_token((char**)&buf,"\t");
299     if (seq)
300     strcpy(seq, seq_str);
301    
302     const char* qual_str = get_token((char**)&buf,"\t");
303     if (qual)
304     strcpy(qual, qual_str);
305    
306     /*const char* other_occs_str =*/ get_token((char**)&buf, "\t");
307     mismatches[0] = 0;
308     char* mismatches_str = get_token((char**)&buf, "\t");
309     if (mismatches_str)
310     strcpy(mismatches, mismatches_str);
311    
312     orientation = orientation_str[0];
313     text_offset = atoi(text_offset_str);
314    
315     bool end = true;
316     unsigned int seg_offset = 0;
317     unsigned int seg_num = 0;
318     unsigned int num_segs = 0;
319    
320     // Copy the tag out of the name field before we might wipe it out
321     char* pipe = strrchr(name, '|');
322     if (pipe)
323     {
324     if (name_tags)
325     strcpy(name_tags, pipe);
326    
327     char* tag_buf = pipe + 1;
328     if (strchr(tag_buf, ':'))
329     {
330     sscanf(tag_buf, "%u:%u:%u", &seg_offset, &seg_num, &num_segs);
331     if (seg_num + 1 == num_segs)
332     end = true;
333     else
334     end = false;
335     }
336    
337     *pipe = 0;
338     }
339     // Stripping the slash and number following it gives the insert name
340     char* slash = strrchr(name, '/');
341     if (strip_slash && slash)
342     *slash = 0;
343    
344     size_t read_len = strlen(seq_str);
345    
346     // Add this alignment to the table of hits for this half of the
347     // Bowtie map
348    
349     //vector<string> mismatch_toks;
350     char* pch = strtok (mismatches,",");
351     unsigned char num_mismatches = 0;
352     while (pch != NULL)
353     {
354     char* colon = strchr(pch, ':');
355     if (colon)
356     {
357     num_mismatches++;
358     }
359     //mismatch_toks.push_back(pch);
360     pch = strtok (NULL, ",");
361     }
362    
363     bh = create_hit(name,
364     text_name,
365     text_offset,
366     (int)read_len,
367     orientation == '-',
368     num_mismatches,
369     end);
370    
371     return true;
372     }
373    
374     int anchor_mismatch = 0;
375    
376 gpertea 70
377     void parseSegReadName(char* name, char*& name_tags, bool strip_slash,
378     bool &end, unsigned int &seg_offset, unsigned int& seg_num,
379     unsigned int & num_segs) {
380     char* pipe = strrchr(name, '|');
381     if (pipe)
382     {
383     if (name_tags)
384     strcpy(name_tags, pipe);
385    
386     char* tag_buf = pipe + 1;
387     if (strchr(tag_buf, ':'))
388     {
389     sscanf(tag_buf, "%u:%u:%u", &seg_offset, &seg_num, &num_segs);
390     if (seg_num + 1 == num_segs)
391     end = true;
392     else
393     end = false;
394     }
395    
396     *pipe = 0;
397     }
398    
399     // Stripping the slash and number following it gives the insert name
400     char* slash = strrchr(name, '/');
401     if (strip_slash && slash)
402     *slash = 0;
403     }
404    
405    
406 gpertea 29 bool SplicedBowtieHitFactory::get_hit_from_buf(const char* orig_bwt_buf,
407     BowtieHit& bh,
408     bool strip_slash,
409     char* name_out,
410     char* name_tags,
411     char* seq,
412     char* qual)
413     {
414     if (!orig_bwt_buf || !*orig_bwt_buf)
415     return false;
416    
417     static const int buf_size = 2048;
418    
419     char bwt_buf[buf_size];
420     strcpy(bwt_buf, orig_bwt_buf);
421     //const char* bwt_fmt_str = "%s %c %s %d %s %s %d %s";
422    
423     char orientation;
424     char text_name[buf_size];
425     unsigned int text_offset;
426     char mismatches[buf_size];
427     //memset(mismatches, 0, sizeof(mismatches));
428    
429     char* buf = bwt_buf;
430     char* name = get_token((char**)&buf,"\t");
431     char* orientation_str = get_token((char**)&buf,"\t");
432     char* text_name_str = get_token((char**)&buf,"\t");
433     strcpy(text_name, text_name_str);
434    
435     char* text_offset_str = get_token((char**)&buf,"\t");
436     char* seq_str = get_token((char**)&buf,"\t");
437     if (seq)
438     strcpy(seq, seq_str);
439    
440     const char* qual_str = get_token((char**)&buf,"\t");
441     if (qual)
442     strcpy(qual, qual_str);
443    
444     /*const char* other_occs_str =*/ get_token((char**)&buf, "\t");
445     mismatches[0] = 0;
446     char* mismatches_str = get_token((char**)&buf, "\t");
447     if (mismatches_str)
448     strcpy(mismatches, mismatches_str);
449    
450     orientation = orientation_str[0];
451     text_offset = atoi(text_offset_str);
452    
453     bool end = true;
454     unsigned int seg_offset = 0;
455     unsigned int seg_num = 0;
456     unsigned int num_segs = 0;
457    
458     // Copy the tag out of the name field before we might wipe it out
459 gpertea 70 parseSegReadName(name, name_tags, strip_slash, end, seg_offset, seg_num, num_segs);
460 gpertea 29
461     // Add this alignment to the table of hits for this half of the
462     // Bowtie map
463    
464     // Parse the text_name field to recover the splice coords
465     vector<string> toks;
466    
467     tokenize_strict(text_name, "|", toks);
468    
469     int num_extra_toks = (int)toks.size() - 6;
470    
471     if (num_extra_toks >= 0)
472     {
473     static const uint8_t left_window_edge_field = 1;
474     static const uint8_t splice_field = 2;
475     //static const uint8_t right_window_edge_field = 3;
476     static const uint8_t junction_type_field = 4;
477     static const uint8_t strand_field = 5;
478    
479     string contig = toks[0];
480     for (int t = 1; t <= num_extra_toks; ++t)
481     {
482     contig += "|";
483     contig += toks[t];
484     }
485    
486     vector<string> splice_toks;
487     tokenize(toks[num_extra_toks + splice_field], "-", splice_toks);
488     if (splice_toks.size() != 2)
489     {
490     fprintf(stderr, "Warning: found malformed splice record, skipping:\n");
491     //fprintf(stderr, "%s (token: %s)\n", text_name,
492     // toks[num_extra_toks + splice_field].c_str());
493     return false;
494     }
495    
496     //
497     // check for an insertion hit
498     //
499     if(toks[num_extra_toks + junction_type_field] == "ins")
500     {
501     int8_t spliced_read_len = strlen(seq_str);
502     /*
503     * The 0-based position of the left edge of the alignment. Note that this
504     * value may need to be futher corrected to account for the presence of
505     * of the insertion.
506     */
507     uint32_t left = atoi(toks[num_extra_toks + left_window_edge_field].c_str()) + text_offset;
508     uint32_t right = left + spliced_read_len - 1;
509    
510 gpertea 137
511 gpertea 29 /*
512     * The 0-based position of the last genomic sequence before the insertion
513     */
514     uint32_t left_splice_pos = atoi(splice_toks[0].c_str());
515    
516     string insertedSequence = splice_toks[1];
517     /*
518     * The 0-based position of the first genomic sequence after teh insertion
519     */
520     uint32_t right_splice_pos = left_splice_pos + 1;
521     if(left > left_splice_pos){
522     /*
523     * The genomic position of the left edge of the alignment needs to be corrected
524     * If the alignment does not extend into the insertion, simply subtract the length
525     * of the inserted sequence, otherwise, just set it equal to the right edge
526     */
527     left = left - insertedSequence.length();
528     if(left < right_splice_pos){
529     left = right_splice_pos;
530     }
531     }
532     if(right > left_splice_pos){
533     right = right - insertedSequence.length();
534     if(right < left_splice_pos){
535     right = left_splice_pos;
536     }
537     }
538     /*
539     * Now, right and left should be properly transformed into genomic coordinates
540     * We should be able to deduce how much the alignment matches the insertion
541     * simply based on the length of the read
542     */
543     int left_match_length = 0;
544     if(left <= left_splice_pos){
545     left_match_length = left_splice_pos - left + 1;
546     }
547     int right_match_length = 0;
548     if(right >= right_splice_pos){
549     right_match_length = right - right_splice_pos + 1;
550     }
551     int insertion_match_length = spliced_read_len - left_match_length - right_match_length;
552    
553     if(left_match_length <= 0 || right_match_length <= 0 || insertion_match_length <= 0)
554     return false;
555    
556     string junction_strand = toks[num_extra_toks + strand_field];
557     if(junction_strand != "rev" && junction_strand != "fwd"){
558     fprintf(stderr, "Malformed insertion record\n");
559     return false;
560     }
561    
562     char* pch = strtok( mismatches, ",");
563     unsigned char num_mismatches = 0;
564    
565     /*
566     * remember that text_offset holds the left end of the
567     * alignment, relative to the start of the contig
568     */
569    
570     /*
571     * The 0-based relative position of the left-most character
572     * before the insertion in the contig
573     */
574     int relative_splice_pos = left_splice_pos - atoi(toks[num_extra_toks + left_window_edge_field].c_str());
575     while (pch != NULL)
576     {
577     char* colon = strchr(pch, ':');
578     if (colon)
579     {
580     *colon = 0;
581     int mismatch_pos = atoi(pch);
582    
583     /*
584     * for reversely mapped reads,
585     * find the correct mismatched position.
586     */
587     if(orientation == '-'){
588     mismatch_pos = spliced_read_len - mismatch_pos - 1;
589     }
590    
591     /*
592     * Only count mismatches outside of the insertion region
593     * If there is a mismatch within the insertion,
594     * disallow this hit
595     */
596     if(mismatch_pos + text_offset <= relative_splice_pos || mismatch_pos + text_offset > relative_splice_pos + insertedSequence.length()){
597     num_mismatches++;
598     }else{
599     return false;
600     }
601     }
602     pch = strtok (NULL, ",");
603     }
604    
605    
606     vector<CigarOp> cigar;
607     cigar.push_back(CigarOp(MATCH, left_match_length));
608     cigar.push_back(CigarOp(INS, insertion_match_length));
609     cigar.push_back(CigarOp(MATCH, right_match_length));
610    
611     /*
612     * For now, disallow hits that don't span
613     * the insertion. If we allow these types of hits,
614     * then long_spanning.cpp needs to be updated
615     * in order to intelligently merge these kinds
616     * of reads back together
617     *
618     * Following code has been changed to allow segment that end
619     * in an insertion
620     */
621     bh = create_hit(name,
622     contig,
623     left,
624     cigar,
625     orientation == '-',
626     junction_strand == "rev",
627     num_mismatches,
628     0,
629     end);
630     return true;
631     }
632    
633     else
634     {
635     uint32_t left = atoi(toks[num_extra_toks + left_window_edge_field].c_str()) + text_offset;
636     int spliced_read_len = strlen(seq_str);
637     int8_t left_splice_pos = atoi(splice_toks[0].c_str()) - left + 1;
638     if(left_splice_pos > spliced_read_len) left_splice_pos = spliced_read_len;
639     int8_t right_splice_pos = spliced_read_len - left_splice_pos;
640    
641     if (right_splice_pos <= 0 || left_splice_pos <= 0)
642     return false;
643    
644     if (orientation == '+')
645     {
646     if (left_splice_pos + seg_offset < _anchor_length){
647     return false;
648     }
649     }
650     else
651     {
652     if (right_splice_pos + seg_offset < _anchor_length)
653     return false;
654     }
655     //uint32_t right = atoi(splice_toks[1].c_str()) + right_splice_pos;
656     //atoi(toks[right_window_edge_field].c_str());
657     int gap_len = atoi(splice_toks[1].c_str()) - atoi(splice_toks[0].c_str()) - 1;
658    
659     string junction_strand = toks[num_extra_toks + strand_field];
660     if (!(junction_strand == "rev" || junction_strand == "fwd")||
661     !(orientation == '-' || orientation == '+'))
662     {
663     fprintf(stderr, "Warning: found malformed splice record, skipping\n");
664     //fprintf(stderr, "junction_strand=%s, orientation='%c'\n",
665     // junction_strand.c_str(), orientation);
666     return false;
667     }
668    
669     //vector<string> mismatch_toks;
670     char* pch = strtok (mismatches,",");
671     int mismatches_in_anchor = 0;
672     unsigned char num_mismatches = 0;
673     while (pch != NULL)
674     {
675     char* colon = strchr(pch, ':');
676     if (colon)
677     {
678     *colon = 0;
679     num_mismatches++;
680     int mismatch_pos = atoi(pch);
681     if ((orientation == '+' && abs(mismatch_pos - left_splice_pos) < (int)min_anchor_len) ||
682     (orientation == '-' && abs(((int)spliced_read_len - left_splice_pos + 1) - mismatch_pos)) < (int)min_anchor_len)
683     mismatches_in_anchor++;
684     }
685     //mismatch_toks.push_back(pch);
686     pch = strtok (NULL, ",");
687     }
688    
689     // FIXME: we probably should exclude these hits somewhere, but this
690     // isn't the right place
691     vector<CigarOp> cigar;
692     cigar.push_back(CigarOp(MATCH, left_splice_pos));
693     if(toks[num_extra_toks + junction_type_field] == "del"){
694     cigar.push_back(CigarOp(DEL, gap_len));
695     }else{
696     cigar.push_back(CigarOp(REF_SKIP, gap_len));
697     }
698     cigar.push_back(CigarOp(MATCH, right_splice_pos));
699    
700     bh = create_hit(name,
701     contig,
702     left,
703     cigar,
704     orientation == '-',
705     junction_strand == "rev",
706     num_mismatches,
707     mismatches_in_anchor,
708     end);
709     return true;
710     }
711     }
712     else
713     {
714     fprintf(stderr, "Warning: found malformed splice record, skipping\n");
715     //fprintf(stderr, "%s\n", orig_bwt_buf);
716     // continue;
717     return false;
718     }
719    
720     return false;
721     }
722    
723 gpertea 70
724 gpertea 137 int parseCigar(vector<CigarOp>& cigar, const char* cigar_str,
725 gpertea 70 bool &spliced_alignment) {
726     const char* p_cig = cigar_str;
727 gpertea 137 int refspan=0; //alignment span on reference sequence
728 gpertea 70 while (*p_cig)
729     {
730     char* t;
731 gpertea 137 int op_len = (int)strtol(p_cig, &t, 10);
732     if (op_len <= 0)
733 gpertea 70 {
734 gpertea 137 fprintf (stderr, "Error: CIGAR op has zero length\n");
735     return 0;
736 gpertea 70 }
737     char op_char = toupper(*t);
738     CigarOpCode opcode;
739 gpertea 137 switch (op_char) {
740     case '=':
741     case 'X':
742     case 'M': opcode = MATCH;
743     refspan+=op_len;
744     break;
745     case 'I': opcode = INS;
746     break;
747     case 'D': opcode = DEL;
748     refspan+=op_len;
749     break;
750     case 'N': if (op_len > max_report_intron_length)
751     return 0;
752     opcode = REF_SKIP;
753     spliced_alignment = true;
754     refspan+=op_len;
755     break;
756     case 'S': opcode = SOFT_CLIP;
757     break;
758     case 'H': opcode = HARD_CLIP;
759     break;
760     case 'P': opcode = PAD;
761     break;
762     default: fprintf (stderr, "Error: invalid CIGAR operation\n");
763     return 0;
764     }
765 gpertea 70 p_cig = t + 1;
766 gpertea 137 cigar.push_back(CigarOp(opcode, op_len));
767     } //while cigar codes
768 gpertea 70 if (*p_cig) {
769 gpertea 137 fprintf (stderr, "Error: unmatched CIGAR operation (%s)\n",p_cig);
770     return 0;
771 gpertea 70 }
772 gpertea 137 return refspan;
773 gpertea 70 }
774    
775 gpertea 137 int getSAMmismatches(char* &buf, vector<CigarOp>& cigar,
776 gpertea 139 vector<bool>& mismatches, int& sam_nm, bool& antisense_splice) {
777 gpertea 137 int gspan=0;//genomic span of the alignment
778     const char* tag_buf = buf;
779 gpertea 70 sam_nm=0;
780 gpertea 139 int num_mismatches=0;
781 gpertea 70 while((tag_buf = get_token((char**)&buf,"\t")))
782     {
783     vector<string> tuple_fields;
784     tokenize(tag_buf,":", tuple_fields);
785     if (tuple_fields.size() == 3)
786     {
787     if (tuple_fields[0] == "XS")
788     {
789     if (tuple_fields[2] == "-")
790     antisense_splice = true;
791     }
792     else if (tuple_fields[0] == "NM")
793     {
794     sam_nm = atoi(tuple_fields[2].c_str());
795     }
796     else if (tuple_fields[0] == "NS")
797     {
798     //ignored for now
799     }
800     else if (tuple_fields[0] == "MD")
801     {
802     const char* p=tuple_fields[2].c_str();
803     int bi=0; //base offset position in the read
804     while (*p != 0) {
805     if (isdigit(*p)) {
806     int v=atoi(p);
807     do { p++; } while (isdigit(*p));
808     bi+=v;
809     }
810 gpertea 137 while (isalpha(*p)) {
811 gpertea 70 p++;
812 gpertea 139 num_mismatches++;
813     //mismatches.push_back(bi);
814     mismatches[bi]=true;
815 gpertea 70 bi++;
816     }
817     if (*p=='^') { //reference deletion
818     p++;
819 gpertea 137 while (isalpha(*p)) { //insert read bases
820 gpertea 70 p++; bi++;
821 gpertea 137 }
822 gpertea 70 }
823     }
824     }
825 gpertea 139 //else
826     //{
827 gpertea 70 //fprintf(stderr, "%s attribute not supported\n", tuple_fields[0].c_str());
828     //return false;
829 gpertea 139 //}
830 gpertea 70 }
831     }
832     /* By convention,the NM field of the SAM record
833     * counts an insertion or deletion. I dont' think
834     * we want the mismatch count in the BowtieHit
835     * record to reflect this. Therefore, subtract out
836     * the mismatches due to in/dels
837     */
838     for(vector<CigarOp>::const_iterator itr = cigar.begin(); itr != cigar.end(); ++itr){
839 gpertea 137 switch (itr->opcode)
840     {
841     case MATCH:
842     case REF_SKIP:
843     case PAD:
844     gspan += itr->length;
845     break;
846     case DEL:
847     gspan += itr->length;
848     sam_nm -= itr->length;
849     break;
850     case INS:
851     sam_nm -= itr->length;
852     break;
853 gpertea 70 }
854 gpertea 137 }
855 gpertea 139 return num_mismatches;
856 gpertea 70 }
857    
858 gpertea 29 bool SAMHitFactory::get_hit_from_buf(const char* orig_bwt_buf,
859     BowtieHit& bh,
860     bool strip_slash,
861     char* name_out,
862     char* name_tags,
863     char* seq,
864     char* qual)
865     {
866     if (!orig_bwt_buf || !*orig_bwt_buf)
867     return false;
868     char bwt_buf[2048];
869     strcpy(bwt_buf, orig_bwt_buf);
870     // Are we still in the header region?
871     if (bwt_buf[0] == '@')
872     return false;
873    
874     char* buf = bwt_buf;
875     char* name = get_token((char**)&buf,"\t");
876     char* sam_flag_str = get_token((char**)&buf,"\t");
877     char* text_name = get_token((char**)&buf,"\t");
878     char* text_offset_str = get_token((char**)&buf,"\t");
879     const char* map_qual_str = get_token((char**)&buf,"\t");
880     char* cigar_str = get_token((char**)&buf,"\t");
881     const char* mate_ref_str = get_token((char**)&buf,"\t");
882     const char* mate_pos_str = get_token((char**)&buf,"\t");
883     const char* inferred_insert_sz_str = get_token((char**)&buf,"\t");
884    
885     const char* seq_str = get_token((char**)&buf,"\t");
886     if (seq)
887     strcpy(seq, seq_str);
888    
889     const char* qual_str = get_token((char**)&buf,"\t");
890     if (qual)
891     strcpy(qual, qual_str);
892    
893     if (!name ||
894     !sam_flag_str ||
895     !text_name ||
896     !text_offset_str ||
897     !map_qual_str ||
898     !cigar_str ||
899     !mate_ref_str ||
900     !mate_pos_str ||
901     !inferred_insert_sz_str ||
902     !seq_str ||
903     !qual_str)
904     {
905     // truncated or malformed SAM record
906     return false;
907     }
908    
909    
910     int sam_flag = atoi(sam_flag_str);
911     int text_offset = atoi(text_offset_str);
912    
913     bool end = true;
914     unsigned int seg_offset = 0;
915     unsigned int seg_num = 0;
916     unsigned int num_segs = 0;
917    
918     // Copy the tag out of the name field before we might wipe it out
919 gpertea 70 parseSegReadName(name, name_tags, strip_slash, end, seg_offset, seg_num, num_segs);
920 gpertea 29
921     vector<CigarOp> cigar;
922     bool spliced_alignment = false;
923 gpertea 137 int refspan=parseCigar(cigar, cigar_str, spliced_alignment);
924     if (refspan==0)
925 gpertea 70 return false;
926 gpertea 29 //vector<string> attributes;
927     //tokenize(tag_buf, " \t",attributes);
928 gpertea 70
929 gpertea 29 bool antisense_splice = false;
930 gpertea 70 int sam_nm = 0; //the value of the NM tag (edit distance)
931 gpertea 137 //int mismatches[1024];//array with mismatch positions on the read (0-based from the left aligned end of the read)
932 gpertea 139 vector<bool> mismatches;
933     mismatches.resize(strlen(seq_str), false);
934     int num_mismatches=getSAMmismatches(buf, cigar, mismatches, sam_nm, antisense_splice);
935 gpertea 29
936     if (spliced_alignment)
937     {
938     bh = create_hit(name,
939     text_name,
940     text_offset - 1,
941     cigar,
942     sam_flag & 0x0010,
943     antisense_splice,
944 gpertea 139 num_mismatches,
945 gpertea 70 0,
946 gpertea 29 end);
947     }
948     else
949     {
950     //assert(cigar.size() == 1 && cigar[0].opcode == MATCH);
951     bh = create_hit(name,
952     text_name,
953     text_offset - 1, // SAM files are 1-indexed
954     cigar,
955     sam_flag & 0x0010,
956     false,
957 gpertea 139 num_mismatches,
958 gpertea 29 0,
959     end);
960     }
961 gpertea 70 return true;
962 gpertea 29 }
963    
964 gpertea 137 void cigar_add(vector<CigarOp>& cigar, CigarOp& op) {
965     if (op.length<=0) return;
966     if (cigar.size()>0 && cigar.back().opcode==op.opcode) {
967     cigar.back().length+=op.length;
968     }
969     cigar.push_back(op);
970     }
971 gpertea 70
972 gpertea 139 int spliceCigar(vector<CigarOp>& splcigar, vector<CigarOp>& cigar, vector<bool> mismatches,
973 gpertea 137 int &left, int spl_start, int spl_len, CigarOpCode spl_code) {
974 gpertea 70 //merge the original 'cigar' with the new insert/gap operation
975 gpertea 137 //at position spl_start and place the result into splcigar;
976 gpertea 139 //TODO: ideally this should also get and rebuild the MD string (alignment mismatches)
977     int spl_mismatches=0;
978     //return value: mismatches in the insert region for INS case,
979     //or number of mismatches in the anchor region
980     //return -1 if somehow the hit seems bad
981    
982 gpertea 137 //these offsets are relative to the beginning of alignment on reference
983     int spl_ofs=spl_start-left; //relative position of splice op
984     int spl_ofs_end=spl_ofs; //relative position of first ref base AFTER splice op
985     CigarOp gapop(spl_code, spl_len); //for DEL or REF_SKIP
986     if (spl_code==INS)
987     spl_ofs_end += spl_len;
988     int ref_ofs=0; //working offset on reference
989     int read_ofs=0; //working offset on the read, relative to the leftmost aligned base
990     bool xfound=false;
991     //if (left<=spl_start+spl_len) {
992     if (spl_ofs_end>0) {
993     int prev_opcode=0;
994     int prev_oplen=0;
995     for (size_t c = 0 ; c < cigar.size(); ++c)
996     {
997     int prev_read_ofs=read_ofs;
998     int cur_op_ofs=ref_ofs;
999     int cur_opcode=cigar[c].opcode;
1000     int cur_oplen=cigar[c].length;
1001     switch (cur_opcode) {
1002     case MATCH:
1003     ref_ofs+=cur_oplen;
1004     read_ofs+=cur_oplen;
1005 gpertea 139 if (spl_code==REF_SKIP)
1006     for (int o=cur_op_ofs;o<ref_ofs;o++) {
1007     int rofs=prev_read_ofs+(cur_op_ofs-o);
1008     if (abs(spl_ofs-o)<min_anchor_len && mismatches[rofs])
1009     spl_mismatches++;
1010     }
1011 gpertea 137 break;
1012     case DEL:
1013     case REF_SKIP:
1014     case PAD:
1015     ref_ofs+=cur_oplen;
1016     break;
1017     case SOFT_CLIP:
1018     case INS:
1019     read_ofs+=cur_oplen;
1020     break;
1021     //case HARD_CLIP:
1022     }
1023 gpertea 139
1024 gpertea 137 if (cur_op_ofs>=spl_ofs_end || ref_ofs<=spl_ofs) {
1025 gpertea 139 if (cur_op_ofs==spl_ofs_end && spl_code!=INS)
1026 gpertea 137 {
1027 gpertea 139 xfound=true;
1028 gpertea 137 //we have to insert the gap here first
1029 gpertea 139 cigar_add(splcigar, gapop);
1030     //also, check
1031 gpertea 137 }
1032     cigar_add(splcigar, cigar[c]);
1033     }
1034     else //if (ref_ofs>spl_ofs) {
1035     { //op intersection
1036     xfound=true;
1037     if (spl_code==INS) {
1038     //we have to shorten cur_opcode
1039     // find the overlap between current range
1040     int ovl_start = (cur_op_ofs>spl_ofs) ? cur_op_ofs : spl_ofs;
1041     int ovl_end = (ref_ofs>spl_ofs_end) ? spl_ofs_end : ref_ofs;
1042     CigarOp op(cigar[c]);
1043     int len=op.length;
1044     len-=(ovl_end-ovl_start);
1045     if (len>0) {
1046     op.length=len;
1047     cigar_add(splcigar, op);
1048     }
1049     }
1050     else {//DEL or REF_SKIP
1051     //spl_ofs == spl_ofs_end
1052     //we have to split cur_opcode
1053 gpertea 139 //look for mismatches within min_anchor_len distance from splice point
1054 gpertea 137 CigarOp op(cigar[c]);
1055     op.length=spl_ofs-cur_op_ofs;
1056     cigar_add(splcigar, op);
1057     cigar_add(splcigar, gapop);
1058     op.length=ref_ofs-spl_ofs;
1059     cigar_add(splcigar,op);
1060     }
1061     } //op intersection
1062     prev_opcode=cur_opcode;
1063     prev_oplen=cur_oplen;
1064     } //for each cigar opcode
1065     } //intersection possible
1066 gpertea 139
1067     //if (!xfound) {//no intersection found between splice event and alignment
1068 gpertea 137 if (spl_ofs_end<=0) {
1069     //alignment starts after the splice event
1070     if (spl_code==INS) left-=spl_len;
1071     else left+=spl_len;
1072     }
1073     //else {
1074     //alignment ends before the splice event
1075     //nothing to do
1076     // }
1077 gpertea 139 //return spl_mismatches;
1078     // }
1079 gpertea 97
1080 gpertea 139 return spl_mismatches;
1081 gpertea 70 }
1082    
1083 gpertea 69 bool SplicedSAMHitFactory::get_hit_from_buf(const char* orig_bwt_buf,
1084     BowtieHit& bh,
1085     bool strip_slash,
1086     char* name_out,
1087     char* name_tags,
1088     char* seq,
1089     char* qual)
1090     {
1091     if (!orig_bwt_buf || !*orig_bwt_buf)
1092     return false;
1093 gpertea 29
1094 gpertea 69 char bwt_buf[2048];
1095     strcpy(bwt_buf, orig_bwt_buf);
1096     // Are we still in the header region?
1097     if (bwt_buf[0] == '@')
1098     return false;
1099 gpertea 29
1100 gpertea 69 char* buf = bwt_buf;
1101     char* name = get_token((char**)&buf,"\t");
1102     char* sam_flag_str = get_token((char**)&buf,"\t");
1103     char* text_name = get_token((char**)&buf,"\t");
1104     char* text_offset_str = get_token((char**)&buf,"\t");
1105     const char* map_qual_str = get_token((char**)&buf,"\t");
1106     char* cigar_str = get_token((char**)&buf,"\t");
1107     const char* mate_ref_str = get_token((char**)&buf,"\t");
1108     const char* mate_pos_str = get_token((char**)&buf,"\t");
1109     const char* inferred_insert_sz_str = get_token((char**)&buf,"\t");
1110 gpertea 137 //int num_mismatches=0;
1111     //int mismatches[1024]; //list of 0-based mismatch positions in this read
1112 gpertea 70 //parsed from SAM's MD:Z: tag
1113 gpertea 69 const char* seq_str = get_token((char**)&buf,"\t");
1114     if (seq)
1115     strcpy(seq, seq_str);
1116    
1117     const char* qual_str = get_token((char**)&buf,"\t");
1118     if (qual)
1119     strcpy(qual, qual_str);
1120    
1121     if (!name ||
1122     !sam_flag_str ||
1123     !text_name ||
1124     !text_offset_str ||
1125     !map_qual_str ||
1126     !cigar_str ||
1127     !mate_ref_str ||
1128     !mate_pos_str ||
1129     !inferred_insert_sz_str ||
1130     !seq_str ||
1131     !qual_str)
1132     {
1133     // truncated or malformed SAM record
1134     return false;
1135     }
1136    
1137    
1138     int sam_flag = atoi(sam_flag_str);
1139     int text_offset = atoi(text_offset_str);
1140 gpertea 137 text_offset--; //make it 0-based (SAM is 1-based, Bowtie is 0-based)
1141 gpertea 69 bool end = true;
1142     unsigned int seg_offset = 0;
1143     unsigned int seg_num = 0;
1144     unsigned int num_segs = 0;
1145    
1146     // Copy the tag out of the name field before we might wipe it out
1147 gpertea 70 parseSegReadName(name, name_tags, strip_slash, end, seg_offset, seg_num, num_segs);
1148 gpertea 69
1149 gpertea 70 vector<CigarOp> samcigar;
1150     bool spliced_alignment = false;
1151 gpertea 137 int refspan=parseCigar(samcigar, cigar_str, spliced_alignment);
1152     if (refspan==0)
1153     return false;
1154 gpertea 70 bool antisense_splice = false;
1155     int sam_nm = 0;
1156 gpertea 139 vector<bool> mismatches;
1157     mismatches.resize(strlen(seq_str), false);
1158 gpertea 69
1159 gpertea 139 int num_mismatches=getSAMmismatches(buf, samcigar, mismatches, sam_nm, antisense_splice);
1160    
1161 gpertea 70 //##############################################
1162 gpertea 69
1163     // Add this alignment to the table of hits for this half of the
1164     // Bowtie map
1165    
1166     // Parse the text_name field to recover the splice coords
1167     vector<string> toks;
1168     tokenize_strict(text_name, "|", toks);
1169    
1170     int num_extra_toks = (int)toks.size() - 6;
1171    
1172     if (num_extra_toks >= 0)
1173     {
1174     static const uint8_t left_window_edge_field = 1;
1175     static const uint8_t splice_field = 2;
1176     //static const uint8_t right_window_edge_field = 3;
1177     static const uint8_t junction_type_field = 4;
1178     static const uint8_t strand_field = 5;
1179    
1180     string contig = toks[0];
1181     for (int t = 1; t <= num_extra_toks; ++t)
1182     {
1183     contig += "|";
1184     contig += toks[t];
1185     }
1186    
1187     vector<string> splice_toks;
1188     tokenize(toks[num_extra_toks + splice_field], "-", splice_toks);
1189     if (splice_toks.size() != 2)
1190     {
1191     fprintf(stderr, "Warning: found malformed splice record, skipping:\n");
1192 gpertea 70 //fprintf(stderr, "\t%s (token: %s)\n", text_name,
1193 gpertea 69 // toks[num_extra_toks + splice_field].c_str());
1194     return false;
1195     }
1196 gpertea 137
1197     string junction_strand = toks[num_extra_toks + strand_field];
1198     if(junction_strand != "rev" && junction_strand != "fwd"){
1199     fprintf(stderr, "Malformed insertion record\n");
1200     return false;
1201     }
1202    
1203 gpertea 69 //
1204     // check for an insertion hit
1205     //
1206     if(toks[num_extra_toks + junction_type_field] == "ins")
1207     {
1208 gpertea 137 //int8_t spliced_read_len = strlen(seq_str);
1209     //TODO FIXME: use the CIGAR instead of seq length!
1210 gpertea 70 // The 0-based position of the left edge of the alignment. Note that this
1211 gpertea 137 // value may need to be further corrected to account for the presence of
1212 gpertea 70 // of the insertion.
1213 gpertea 137 int left = atoi(toks[num_extra_toks + left_window_edge_field].c_str()) + text_offset;
1214 gpertea 69
1215 gpertea 70 // The 0-based position of the last genomic sequence before the insertion
1216 gpertea 137 int left_splice_pos = atoi(splice_toks[0].c_str());
1217 gpertea 69
1218     string insertedSequence = splice_toks[1];
1219 gpertea 70 // The 0-based position of the first genomic sequence after the insertion
1220    
1221 gpertea 137 vector<CigarOp> splcigar;
1222    
1223     //TODO: implement this
1224     //this also updates left to the adjusted genomic coordinates
1225 gpertea 139 int spl_num_mismatches=spliceCigar(splcigar, samcigar, mismatches,
1226     left, left_splice_pos+1, insertedSequence.length(), INS);
1227     if (spl_num_mismatches<0) return false;
1228     num_mismatches-=spl_num_mismatches;
1229 gpertea 137 /*
1230 gpertea 69 uint32_t right_splice_pos = left_splice_pos + 1;
1231 gpertea 137
1232     //uint32_t right = left + spliced_read_len - 1;
1233     int right = left + refspan - 1;
1234    
1235 gpertea 69 if(left > left_splice_pos){
1236 gpertea 137 //The genomic position of the left edge of the alignment needs to be corrected
1237     //If the alignment does not extend into the insertion, simply subtract the length
1238     //of the inserted sequence, otherwise, just set it equal to the right edge
1239 gpertea 69 left = left - insertedSequence.length();
1240     if(left < right_splice_pos){
1241     left = right_splice_pos;
1242     }
1243     }
1244     if(right > left_splice_pos){
1245     right = right - insertedSequence.length();
1246     if(right < left_splice_pos){
1247     right = left_splice_pos;
1248     }
1249     }
1250 gpertea 137 // Now, right and left should be properly transformed into genomic coordinates
1251     // We should be able to deduce how much the alignment matches the insertion
1252     // simply based on the length of the read
1253 gpertea 69 int left_match_length = 0;
1254     if(left <= left_splice_pos){
1255     left_match_length = left_splice_pos - left + 1;
1256     }
1257     int right_match_length = 0;
1258     if(right >= right_splice_pos){
1259     right_match_length = right - right_splice_pos + 1;
1260     }
1261     int insertion_match_length = spliced_read_len - left_match_length - right_match_length;
1262    
1263     if(left_match_length <= 0 || right_match_length <= 0 || insertion_match_length <= 0)
1264     return false;
1265    
1266 gpertea 70 //char* pch = strtok( mismatches, ",");
1267     //unsigned char num_mismatches = 0;
1268     //text_offset holds the left end of the alignment,
1269     //RELATIVE TO the start of the contig
1270 gpertea 69
1271 gpertea 70 //The 0-based relative position of the left-most character
1272     //before the insertion in the contig
1273 gpertea 69 int relative_splice_pos = left_splice_pos - atoi(toks[num_extra_toks + left_window_edge_field].c_str());
1274 gpertea 137 for (size_t i=0;i<mismatches.size();++i) {
1275 gpertea 70 int mismatch_pos = mismatches[i];
1276     // for reversely mapped reads,
1277     //find the correct mismatched position.
1278     if (sam_flag & 0x0010){
1279     mismatch_pos = spliced_read_len - mismatch_pos - 1;
1280     }
1281 gpertea 69
1282 gpertea 70 //Only count mismatches outside of the insertion region
1283     // If there is a mismatch within the insertion,
1284     // disallow this hit
1285 gpertea 137 if(mismatch_pos + text_offset <= relative_splice_pos ||
1286     mismatch_pos + text_offset > relative_splice_pos + insertedSequence.length()){
1287 gpertea 70 spl_num_mismatches++;
1288     }else{
1289     return false;
1290 gpertea 69 }
1291     }
1292 gpertea 137 */
1293     //vector<CigarOp> splcigar;
1294     //spliceCigar(splcigar, samcigar, left_match_length, insertion_match_length, right_match_length, INS);
1295     //splcigar.push_back(CigarOp(MATCH, left_match_length));
1296     //splcigar.push_back(CigarOp(INS, insertion_match_length));
1297     //splcigar.push_back(CigarOp(MATCH, right_match_length));
1298 gpertea 69
1299     bh = create_hit(name,
1300     contig,
1301     left,
1302 gpertea 70 //splcigar,
1303     splcigar,
1304     sam_flag & 0x0010,
1305 gpertea 69 junction_strand == "rev",
1306 gpertea 139 num_mismatches,
1307 gpertea 69 0,
1308     end);
1309     return true;
1310 gpertea 70 } //"ins"
1311 gpertea 137 else //"del" or intron
1312 gpertea 69 {
1313 gpertea 139 // The 0-based position of the left edge of the alignment.
1314 gpertea 137 int left = atoi(toks[num_extra_toks + left_window_edge_field].c_str()) + text_offset;
1315 gpertea 69
1316 gpertea 139 // The 0-based position of the last genomic sequence before the deletion
1317     int left_splice_pos = atoi(splice_toks[0].c_str());
1318 gpertea 69
1319 gpertea 139 int gap_len = atoi(splice_toks[1].c_str()) - left_splice_pos - 1;
1320     /*
1321 gpertea 69 if ((sam_flag & 0x0010) == 0) //######
1322     {
1323 gpertea 139 if (left_splice_ofs + seg_offset < _anchor_length)
1324 gpertea 135 return false;
1325     }
1326     else
1327     {
1328 gpertea 139 if (right_splice_ofs + seg_offset < _anchor_length)
1329 gpertea 135 return false;
1330     }
1331 gpertea 139 */
1332 gpertea 69 //uint32_t right = atoi(splice_toks[1].c_str()) + right_splice_pos;
1333     //atoi(toks[right_window_edge_field].c_str());
1334    
1335 gpertea 139 /*
1336     //offset of deletion point, relative to the beginning of the alignment
1337     int left_splice_ofs = left_splice_pos-left+1;
1338    
1339 gpertea 69 int mismatches_in_anchor = 0;
1340 gpertea 137 for (size_t i=0;i<mismatches.size();++i) {
1341     spl_num_mismatches++;
1342     int mismatch_pos = mismatches[i];
1343 gpertea 139 if (((sam_flag & 0x0010) == 0 && abs(mismatch_pos - left_splice_ofs) < (int)min_anchor_len) ||
1344 gpertea 137 ((sam_flag & 0x0010) != 0 &&
1345 gpertea 139 abs(((int)refspan - left_splice_ofs + 1) - mismatch_pos)) < (int)min_anchor_len)
1346 gpertea 70 mismatches_in_anchor++;
1347 gpertea 137 }
1348 gpertea 139 */
1349 gpertea 70 vector<CigarOp> splcigar;
1350     CigarOpCode opcode=(toks[num_extra_toks + junction_type_field] == "del")? DEL : REF_SKIP ;
1351 gpertea 137 //TODO: FIXME: use relative offsets for this call instead, like for the INS case
1352 gpertea 139 int spl_num_mismatches=spliceCigar(splcigar, samcigar, mismatches, left,
1353     left_splice_pos+1, gap_len, opcode);
1354     if (spl_num_mismatches<0) // || spl_num_mismatches>max_anchor_mismatches)
1355     return false;
1356     /*
1357 gpertea 70 splcigar.push_back(CigarOp(MATCH, left_splice_pos));
1358 gpertea 69 if(toks[num_extra_toks + junction_type_field] == "del"){
1359 gpertea 70 splcigar.push_back(CigarOp(DEL, gap_len));
1360 gpertea 69 }else{
1361 gpertea 70 splcigar.push_back(CigarOp(REF_SKIP, gap_len));
1362 gpertea 69 }
1363 gpertea 70 splcigar.push_back(CigarOp(MATCH, right_splice_pos));
1364 gpertea 139 */
1365 gpertea 69 bh = create_hit(name,
1366     contig,
1367     left,
1368 gpertea 70 splcigar,
1369     (sam_flag & 0x0010),
1370 gpertea 69 junction_strand == "rev",
1371 gpertea 139 num_mismatches,
1372 gpertea 70 spl_num_mismatches,
1373 gpertea 69 end);
1374     return true;
1375     }
1376 gpertea 70 } //parse splice data
1377 gpertea 69 else
1378     {
1379     fprintf(stderr, "Warning: found malformed splice record, skipping\n");
1380     //fprintf(stderr, "%s\n", orig_bwt_buf);
1381     // continue;
1382     return false;
1383     }
1384    
1385     return false;
1386     }
1387    
1388    
1389 gpertea 135
1390 gpertea 29 bool BAMHitFactory::get_hit_from_buf(const char* orig_bwt_buf,
1391     BowtieHit& bh, bool strip_slash,
1392     char* name_out, char* name_tags,
1393     char* seq, char* qual) {
1394     if (_sam_header==NULL)
1395     err_die("Error: no SAM header when BAMHitFactory::get_hit_from_buf()!");
1396     const bam1_t* hit_buf = (const bam1_t*)orig_bwt_buf;
1397    
1398    
1399     uint32_t sam_flag = hit_buf->core.flag;
1400    
1401     int text_offset = hit_buf->core.pos;
1402     int text_mate_pos = hit_buf->core.mpos;
1403     int target_id = hit_buf->core.tid;
1404     int mate_target_id = hit_buf->core.mtid;
1405    
1406     vector<CigarOp> cigar;
1407     bool spliced_alignment = false;
1408     int num_hits = 1;
1409    
1410     double mapQ = hit_buf->core.qual;
1411     long double error_prob;
1412     if (mapQ > 0)
1413     {
1414     long double p = (-1.0 * mapQ) / 10.0;
1415     error_prob = pow(10.0L, p);
1416     }
1417     else
1418     {
1419     error_prob = 1.0;
1420     }
1421    
1422     //header->target_name[c->tid]
1423    
1424     bool end = true;
1425     unsigned int seg_offset = 0;
1426     unsigned int seg_num = 0;
1427     unsigned int num_segs = 0;
1428     // Copy the tag out of the name field before we might wipe it out
1429     char* qname = bam1_qname(hit_buf);
1430     char* pipe = strrchr(qname, '|');
1431     if (pipe)
1432     {
1433     if (name_tags)
1434     strcpy(name_tags, pipe);
1435    
1436     char* tag_buf = pipe + 1;
1437     if (strchr(tag_buf, ':'))
1438     {
1439     sscanf(tag_buf, "%u:%u:%u", &seg_offset, &seg_num, &num_segs);
1440     if (seg_num + 1 == num_segs)
1441     end = true;
1442     else
1443     end = false;
1444     }
1445    
1446     *pipe = 0;
1447     }
1448    
1449    
1450     if (target_id < 0) {
1451     //assert(cigar.size() == 1 && cigar[0].opcode == MATCH);
1452     bh = create_hit(qname,
1453     "*", //ref_name
1454     0, //left coord
1455     0, //read_len
1456     false, //antisense_aln
1457     0, //edit_dist
1458     end);
1459     return true;
1460     }
1461    
1462     //string text_name = ((samfile_t*)(hs._hit_file))->header->target_name[target_id];
1463     string text_name = _sam_header->target_name[target_id];
1464     for (int i = 0; i < hit_buf->core.n_cigar; ++i)
1465     {
1466     //char* t;
1467    
1468     int length = bam1_cigar(hit_buf)[i] >> BAM_CIGAR_SHIFT;
1469     if (length <= 0)
1470     {
1471     fprintf (stderr, "BAM error: CIGAR op has zero length\n");
1472     return false;
1473     }
1474    
1475     CigarOpCode opcode;
1476     switch(bam1_cigar(hit_buf)[i] & BAM_CIGAR_MASK)
1477     {
1478     case BAM_CMATCH: opcode = MATCH; break;
1479     case BAM_CINS: opcode = INS; break;
1480     case BAM_CDEL: opcode = DEL; break;
1481     case BAM_CSOFT_CLIP: opcode = SOFT_CLIP; break;
1482     case BAM_CHARD_CLIP: opcode = HARD_CLIP; break;
1483     case BAM_CPAD: opcode = PAD; break;
1484     case BAM_CREF_SKIP:
1485     opcode = REF_SKIP;
1486     spliced_alignment = true;
1487     if (length > (int)max_report_intron_length)
1488     {
1489     //fprintf(stderr, "Encounter REF_SKIP > max_gene_length, skipping\n");
1490     return false;
1491     }
1492     break;
1493     default:
1494     fprintf (stderr, "BAM read: invalid CIGAR operation\n");
1495     return false;
1496     }
1497     if (opcode != HARD_CLIP)
1498     cigar.push_back(CigarOp(opcode, length));
1499     }
1500    
1501     string mrnm;
1502     if (mate_target_id >= 0) {
1503     if (mate_target_id == target_id) {
1504     //mrnm = ((samfile_t*)(hs._hit_file))->header->target_name[mate_target_id];
1505     mrnm = _sam_header->target_name[mate_target_id];
1506     }
1507     else {
1508     //fprintf(stderr, "Trans-spliced mates are not currently supported, skipping\n");
1509     return false;
1510     }
1511     }
1512     else {
1513     text_mate_pos = 0;
1514     }
1515     //CuffStrand source_strand = CUFF_STRAND_UNKNOWN;
1516     bool antisense_splice=false;
1517     unsigned char num_mismatches = 0;
1518     unsigned char num_splice_anchor_mismatches = 0;
1519    
1520     uint8_t* ptr = bam_aux_get(hit_buf, "XS");
1521     if (ptr) {
1522     char src_strand_char = bam_aux2A(ptr);
1523     if (src_strand_char == '-')
1524     antisense_splice = true;
1525     // else if (src_strand_char == '+')
1526     // source_strand = CUFF_FWD;
1527     }
1528    
1529     ptr = bam_aux_get(hit_buf, "NM");
1530     if (ptr) {
1531     num_mismatches = bam_aux2i(ptr);
1532     }
1533    
1534     ptr = bam_aux_get(hit_buf, "NH");
1535     if (ptr) {
1536     num_hits = bam_aux2i(ptr);
1537     }
1538    
1539     //bool antisense_aln = bam1_strand(hit_buf);
1540    
1541     //if (_rg_props.strandedness() == STRANDED_PROTOCOL && source_strand == CUFF_STRAND_UNKNOWN)
1542     // source_strand = use_stranded_protocol(sam_flag, antisense_aln, _rg_props.mate_strand_mapping());
1543     if (spliced_alignment) {
1544     //if (source_strand == CUFF_STRAND_UNKNOWN) {
1545     // fprintf(stderr, "BAM record error: found spliced alignment without XS attribute\n");
1546     // }
1547     bh = create_hit(qname,
1548     text_name,
1549     text_offset, // BAM files are 0-indexed
1550     cigar,
1551     sam_flag & 0x0010,
1552     antisense_splice,
1553     num_mismatches,
1554     num_splice_anchor_mismatches,
1555     end);
1556    
1557     }
1558     else {
1559     //assert(_rg_props.strandedness() == STRANDED_PROTOCOL || source_strand == CUFF_STRAND_UNKNOWN);
1560     //assert(cigar.size() == 1 && cigar[0].opcode == MATCH);
1561     bh = create_hit(qname,
1562     text_name,
1563     text_offset, // BAM files are 0-indexed
1564     cigar,
1565     sam_flag & 0x0010,
1566     false,
1567     num_mismatches,
1568     0,
1569     end);
1570     }
1571     if (seq!=NULL) {
1572     char *bseq = (char*)bam1_seq(hit_buf);
1573     for(int i=0;i<(hit_buf->core.l_qseq);i++) {
1574     char v = bam1_seqi(bseq,i);
1575     seq[i]=bam_nt16_rev_table[v];
1576     }
1577     seq[hit_buf->core.l_qseq]=0;
1578     }
1579     if (qual!=NULL) {
1580     char *bq = (char*)bam1_qual(hit_buf);
1581     for(int i=0;i<(hit_buf->core.l_qseq);i++) {
1582     qual[i]=bq[i]+33;
1583     }
1584     qual[hit_buf->core.l_qseq]=0;
1585     }
1586     return true;
1587     }
1588    
1589     bool BAMHitFactory::inspect_header(HitStream& hs)
1590     {
1591     bam_header_t* header = ((samfile_t*)(hs._hit_file))->header;
1592    
1593     if (header == NULL) {
1594     fprintf(stderr, "Warning: No BAM header\n");
1595     return false;
1596     }
1597     if (header->l_text == 0) {
1598     fprintf(stderr, "Warning: BAM header has 0 length or is corrupted. Try using 'samtools reheader'.\n");
1599     return false;
1600     }
1601     return true;
1602     }
1603    
1604    
1605     void get_mapped_reads(FILE* bwtf,
1606     HitTable& hits,
1607     HitFactory& hit_factory,
1608     bool strip_slash,
1609     bool verbose)
1610     {
1611    
1612    
1613     char bwt_buf[2048];
1614     uint32_t reads_extracted = 0;
1615    
1616     while (fgets(bwt_buf, 2048, bwtf))
1617     {
1618     // Chomp the newline
1619     char* nl = strrchr(bwt_buf, '\n');
1620     if (nl) *nl = 0;
1621     if (*bwt_buf == 0)
1622     continue;
1623     // Get a new record from the tab-delimited Bowtie map
1624     BowtieHit bh;
1625     if (hit_factory.get_hit_from_buf(bwt_buf, bh, strip_slash))
1626     {
1627     // Only check uniqueness if these hits are spliced
1628     hits.add_hit(bh, true);
1629     }
1630     reads_extracted++;
1631     }
1632    
1633     // This will sort the map by insert id.
1634     hits.finalize();
1635     fprintf(stderr, "Extracted %d alignments from Bowtie map\n", reads_extracted);
1636     }
1637    
1638    
1639     /*
1640     AlignStatus status(const BowtieHit* align)
1641     {
1642     if (!align)
1643     return UNALIGNED;
1644     if (align->contiguous())
1645     return CONTIGUOUS;
1646     return SPLICED;
1647     }
1648     */
1649    
1650     void add_hits_to_coverage(const HitList& hits, vector<unsigned short>& DoC)
1651     {
1652     int max_hit_pos = -1;
1653     for (size_t i = 0; i < hits.size(); ++i)
1654     {
1655     max_hit_pos = max((int)hits[i].right(),max_hit_pos);
1656     }
1657    
1658     if ((int)DoC.size() < max_hit_pos)
1659     DoC.resize(max_hit_pos);
1660    
1661     for (size_t i = 0; i < hits.size(); ++i)
1662     {
1663     const BowtieHit& bh = hits[i];
1664    
1665     // split up the coverage contibution for this reads
1666     size_t j = bh.left();
1667     const vector<CigarOp>& cigar = bh.cigar();
1668    
1669     for (size_t c = 0 ; c < cigar.size(); ++c)
1670     {
1671     switch(cigar[c].opcode)
1672     {
1673     case MATCH:
1674     for (size_t m = 0; m < cigar[c].length; ++m)
1675     {
1676     if (DoC[j + m] < 0xFFFF)
1677     DoC[j + m]++;
1678     }
1679     //fall through this case to REF_SKIP is intentional
1680     case REF_SKIP:
1681     j += cigar[c].length;
1682     break;
1683     default:
1684     break;
1685     }
1686    
1687     }
1688     }
1689     }
1690    
1691     void add_hit_to_coverage(const BowtieHit& bh, vector<unsigned int>& DoC)
1692     {
1693     if ((int)DoC.size() < bh.right())
1694     DoC.resize(bh.right());
1695    
1696     // split up the coverage contibution for this reads
1697     size_t j = bh.left();
1698     const vector<CigarOp>& cigar = bh.cigar();
1699    
1700     for (size_t c = 0 ; c < cigar.size(); ++c)
1701     {
1702     switch(cigar[c].opcode)
1703     {
1704     case MATCH:
1705     for (size_t m = 0; m < cigar[c].length; ++m)
1706     {
1707 gpertea 135 if (DoC[j + m] < VMAXINT32)
1708 gpertea 29 DoC[j + m]++;
1709     }
1710     //fall through this case to REF_SKIP is intentional
1711     case REF_SKIP:
1712     j += cigar[c].length;
1713     break;
1714     default:
1715     break;
1716     }
1717    
1718     }
1719     }
1720    
1721     void print_hit(FILE* fout,
1722     const char* read_name,
1723     const BowtieHit& bh,
1724     const char* ref_name,
1725     const char* sequence,
1726     const char* qualities,
1727     bool from_bowtie)
1728     {
1729     string seq;
1730     string quals;
1731     if (sequence)
1732     {
1733     seq = sequence;
1734     quals = qualities;
1735     seq.resize(bh.read_len());
1736     quals.resize(bh.read_len());
1737     }
1738     else
1739     {
1740     seq = "*";
1741     }
1742    
1743     if (qualities)
1744     {
1745     quals = qualities;
1746     quals.resize(bh.read_len());
1747     }
1748     else
1749     {
1750     quals = "*";
1751     }
1752    
1753     uint32_t sam_flag = 0;
1754     if (bh.antisense_align())
1755     {
1756     sam_flag |= 0x0010; // BAM_FREVERSE
1757     if (sequence && !from_bowtie) // if it is from bowtie hit, it's already reversed.
1758     {
1759     reverse_complement(seq);
1760     reverse(quals.begin(), quals.end());
1761     }
1762     }
1763    
1764     uint32_t sam_pos = bh.left() + 1;
1765     uint32_t map_quality = 255;
1766     char cigar[256];
1767     cigar[0] = 0;
1768     string mate_ref_name = "*";
1769     uint32_t mate_pos = 0;
1770     uint32_t insert_size = 0;
1771     //string qualities = "*";
1772    
1773     const vector<CigarOp>& bh_cigar = bh.cigar();
1774    
1775     /*
1776     * In addition to calculating the cigar string,
1777     * we need to figure out how many in/dels are in the
1778     * sequence, so that we can give the correct
1779     * value for the NM tag
1780     */
1781     int indel_distance = 0;
1782     for (size_t c = 0; c < bh_cigar.size(); ++c)
1783     {
1784     char ibuf[64];
1785     sprintf(ibuf, "%d", bh_cigar[c].length);
1786     switch(bh_cigar[c].opcode)
1787     {
1788     case MATCH:
1789     strcat(cigar, ibuf);
1790     strcat(cigar, "M");
1791     break;
1792     case INS:
1793     strcat(cigar, ibuf);
1794     strcat(cigar, "I");
1795     indel_distance += bh_cigar[c].length;
1796     break;
1797     case DEL:
1798     strcat(cigar, ibuf);
1799     strcat(cigar, "D");
1800     indel_distance += bh_cigar[c].length;
1801     break;
1802     case REF_SKIP:
1803     strcat(cigar, ibuf);
1804     strcat(cigar, "N");
1805     break;
1806     default:
1807     break;
1808     }
1809     }
1810    
1811     //string q = string(bh.read_len(), '!');
1812     //string s = string(bh.read_len(), 'N');
1813    
1814     fprintf(fout,
1815     "%s\t%d\t%s\t%d\t%d\t%s\t%s\t%d\t%d\t%s\t%s",
1816     read_name,
1817     sam_flag,
1818     ref_name,
1819     sam_pos,
1820     map_quality,
1821     cigar,
1822     mate_ref_name.c_str(),
1823     mate_pos,
1824     insert_size,
1825     seq.c_str(),
1826     quals.c_str());
1827    
1828     if (!sam_readgroup_id.empty())
1829     fprintf(fout, "\tRG:Z:%s", sam_readgroup_id.c_str());
1830    
1831     fprintf(fout, "\tNM:i:%d", bh.edit_dist() + indel_distance);
1832    
1833     bool containsSplice = false;
1834     for(vector<CigarOp>::const_iterator itr = bh.cigar().begin(); itr != bh.cigar().end(); ++itr){
1835     if(itr->opcode == REF_SKIP){
1836     containsSplice = true;
1837     break;
1838     }
1839     }
1840    
1841     if (containsSplice)
1842     fprintf(fout, "\tXS:A:%c", bh.antisense_splice() ? '-' : '+');
1843    
1844     fprintf(fout, "\n");
1845     }
1846    
1847     void print_bamhit(GBamWriter& wbam,
1848     const char* read_name,
1849     const BowtieHit& bh,
1850     const char* ref_name,
1851     const char* sequence,
1852     const char* qualities,
1853     bool from_bowtie)
1854     {
1855     string seq;
1856     string quals;
1857     if (sequence) {
1858     seq = sequence;
1859     quals = qualities;
1860     seq.resize(bh.read_len());
1861     quals.resize(bh.read_len());
1862     }
1863     else {
1864     seq = "*";
1865     }
1866     if (qualities) {
1867     quals = qualities;
1868     quals.resize(bh.read_len());
1869     }
1870     else
1871     {
1872     quals = "*";
1873     }
1874    
1875     uint32_t sam_flag = 0;
1876     if (bh.antisense_align())
1877     {
1878     sam_flag |= 0x0010; // BAM_FREVERSE
1879     if (sequence && !from_bowtie) // if it is from bowtie hit, it's already reversed.
1880     {
1881     reverse_complement(seq);
1882     reverse(quals.begin(), quals.end());
1883     }
1884     }
1885    
1886     uint32_t sam_pos = bh.left() + 1;
1887     uint32_t map_quality = 255;
1888     char cigar[256];
1889     cigar[0] = 0;
1890     string mate_ref_name = "*";
1891     uint32_t mate_pos = 0;
1892     uint32_t insert_size = 0;
1893     //string qualities = "*";
1894    
1895     const vector<CigarOp>& bh_cigar = bh.cigar();
1896     /*
1897     * In addition to calculating the cigar string,
1898     * we need to figure out how many in/dels are in the
1899     * sequence, so that we can give the correct
1900     * value for the NM tag
1901     */
1902     int indel_distance = 0;
1903     for (size_t c = 0; c < bh_cigar.size(); ++c)
1904     {
1905     char ibuf[64];
1906     sprintf(ibuf, "%d", bh_cigar[c].length);
1907     switch(bh_cigar[c].opcode)
1908     {
1909     case MATCH:
1910     strcat(cigar, ibuf);
1911     strcat(cigar, "M");
1912     break;
1913     case INS:
1914     strcat(cigar, ibuf);
1915     strcat(cigar, "I");
1916     indel_distance += bh_cigar[c].length;
1917     break;
1918     case DEL:
1919     strcat(cigar, ibuf);
1920     strcat(cigar, "D");
1921     indel_distance += bh_cigar[c].length;
1922     break;
1923     case REF_SKIP:
1924     strcat(cigar, ibuf);
1925     strcat(cigar, "N");
1926     break;
1927     default:
1928     break;
1929     }
1930     }
1931    
1932     //string q = string(bh.read_len(), '!');
1933     //string s = string(bh.read_len(), 'N');
1934     /*
1935     fprintf(fout,
1936     "%s\t%d\t%s\t%d\t%d\t%s\t%s\t%d\t%d\t%s\t%s",
1937     read_name,
1938     sam_flag,
1939     ref_name,
1940     sam_pos,
1941     map_quality,
1942     cigar,
1943     mate_ref_name.c_str(),
1944     mate_pos,
1945     insert_size,
1946     seq.c_str(),
1947     quals.c_str());
1948    
1949     fprintf(fout, "\tNM:i:%d", bh.edit_dist() + indel_distance);
1950     */
1951     vector<string> auxdata;
1952    
1953     if (!sam_readgroup_id.empty())
1954     {
1955     string nm("RG:Z:");
1956     nm += sam_readgroup_id;
1957     auxdata.push_back(nm);
1958     }
1959    
1960     string nm("NM:i:");
1961     str_appendInt(nm, bh.edit_dist() + indel_distance);
1962     auxdata.push_back(nm);
1963     bool containsSplice = false;
1964     for(vector<CigarOp>::const_iterator itr = bh.cigar().begin(); itr != bh.cigar().end(); ++itr)
1965     if(itr->opcode == REF_SKIP){
1966     containsSplice = true;
1967     break;
1968     }
1969    
1970     if (containsSplice) {
1971     //fprintf(fout, "\tXS:A:%c", bh.antisense_splice() ? '-' : '+');
1972     nm="XS:A:";
1973     nm+=(char)(bh.antisense_splice() ? '-' : '+');
1974     auxdata.push_back(nm);
1975     }
1976    
1977     GBamRecord *brec = wbam.new_record(read_name, sam_flag, ref_name, sam_pos, map_quality,
1978     cigar, mate_ref_name.c_str(), mate_pos,
1979     insert_size, seq.c_str(), quals.c_str(), &auxdata);
1980    
1981    
1982    
1983     wbam.write(brec);
1984     delete brec;
1985     }
1986    
1987     /**
1988     * Print a vector of cigar operations to a file.
1989     * @param bh_cigar A vector of CigarOps
1990     * @return a string representation of the cigar string
1991     */
1992     std::string print_cigar(vector<CigarOp>& bh_cigar){
1993     char cigar[256];
1994     cigar[0] = 0;
1995     for (size_t c = 0; c < bh_cigar.size(); ++c)
1996     {
1997     char ibuf[64];
1998     sprintf(ibuf, "%d", bh_cigar[c].length);
1999     switch(bh_cigar[c].opcode)
2000     {
2001     case MATCH:
2002     strcat(cigar, ibuf);
2003     strcat(cigar, "M");
2004     break;
2005     case INS:
2006     strcat(cigar, ibuf);
2007     strcat(cigar, "I");
2008     break;
2009     case DEL:
2010     strcat(cigar, ibuf);
2011     strcat(cigar, "D");
2012     break;
2013     case REF_SKIP:
2014     strcat(cigar, ibuf);
2015     strcat(cigar, "N");
2016     break;
2017     default:
2018     break;
2019     }
2020     }
2021     string result(cigar);
2022     return result;
2023     }
2024    
2025     bool BowtieHit::check_editdist_consistency(const RefSequenceTable& rt)
2026     {
2027     RefSequenceTable::Sequence* ref_str = rt.get_seq(_ref_id);
2028     if (!ref_str)
2029     return false;
2030    
2031     const seqan::Dna5String ref_seq = seqan::infix(*ref_str, _left, right());
2032    
2033     size_t pos_seq = 0;
2034     size_t pos_ref = 0;
2035     size_t mismatch = 0;
2036     for (size_t i = 0; i < _cigar.size(); ++i)
2037     {
2038     CigarOp cigar = _cigar[i];
2039     switch(cigar.opcode)
2040     {
2041     case MATCH:
2042     {
2043     for (size_t j = 0; j < cigar.length; ++j)
2044     {
2045     if (_seq[pos_seq++] != ref_seq[pos_ref++])
2046     ++mismatch;
2047     }
2048     }
2049     break;
2050     case INS:
2051     {
2052     pos_seq += cigar.length;
2053     }
2054     break;
2055    
2056     case DEL:
2057     case REF_SKIP:
2058     {
2059     pos_ref += cigar.length;
2060     }
2061     break;
2062    
2063     default:
2064     break;
2065     }
2066     }
2067    
2068     return mismatch == _edit_dist;
2069     }