ViewVC Help
View File | Revision Log | Show Annotations | View Changeset | Root Listing
root/gclib/tophat_cpp/bwt_map.cpp
Revision: 140
Committed: Thu Dec 15 22:40:48 2011 UTC (8 years, 5 months ago) by gpertea
File size: 56917 byte(s)
Log Message:
Line User Rev File contents
1 gpertea 29 /*
2     * bwt_map.cpp
3     * TopHat
4     *
5     * Created by Cole Trapnell on 11/17/08.
6     * Copyright 2008 Cole Trapnell. All rights reserved.
7     *
8     */
9    
10     #ifdef HAVE_CONFIG_H
11     #include <config.h>
12     #endif
13    
14     #include <cassert>
15     #include <cstdlib>
16     #include <iostream>
17     #include <set>
18     #include <vector>
19     #include <cmath>
20    
21     #include "common.h"
22     #include "bwt_map.h"
23     #include "tokenize.h"
24     #include "reads.h"
25    
26     using namespace std;
27    
28     void HitTable::add_hit(const BowtieHit& bh, bool check_uniqueness)
29     {
30     uint32_t reference_id = bh.ref_id();
31    
32     pair<RefHits::iterator, bool> ret =
33     _hits_for_ref.insert(make_pair(reference_id, HitList()));
34     HitList& hl = ret.first->second;
35    
36     if (check_uniqueness)
37     {
38     // Check uniqueness, in case we are adding spliced hits from
39     // several spliced alignment sources (e.g. de novo hashing + Bowtie
40     // against a user-supplied index). We don't want to count the same
41     // alignment twice if it happened to be found by more than one method
42     HitList::const_iterator lb = lower_bound(hl.begin(), hl.end(), bh, hit_insert_id_lt);
43     HitList::const_iterator ub = upper_bound(hl.begin(), hl.end(), bh, hit_insert_id_lt);
44     for (; lb != ub && lb != hl.end(); ++lb)
45     {
46     if (*lb == bh)
47     {
48     //fprintf(stderr, "Chucking duplicate read %d by identity\n", bh.insert_id());
49     return;
50     }
51    
52     if (lb->insert_id() == bh.insert_id() &&
53     lb->ref_id() == bh.ref_id() &&
54     lb->antisense_align() == bh.antisense_align())
55     {
56     // If we get here, we may be looking at the same alignment
57     // However, spanning_reads may report a shorter, trimmed alignment
58     // so not all fields will be equal. If they just disagree on the
59     // ends, and don't indicate a different junction coord, the
60     // alignments are the same.
61    
62     if ((lb->left() <= bh.left() && lb->right() >= bh.right()) ||
63     (bh.left() <= lb->left() && bh.right() >= lb->right()))
64     {
65     vector<pair<int,int> > lb_gaps, bh_gaps;
66     lb->gaps(lb_gaps);
67     bh.gaps(bh_gaps);
68     if (lb_gaps == bh_gaps)
69     {
70     // One alignment is contained in the other, they agree on
71     // where the gaps, if any, are, and they share an id
72     // => this is a redundant aligment, so toss it
73     //fprintf(stderr, "Chucking duplicate read %d by gap agreement\n", bh.insert_id());
74     return;
75     }
76     }
77     }
78     }
79     }
80     _total_hits++;
81     hl.push_back(bh);
82     }
83    
84     bool hit_insert_id_lt(const BowtieHit& h1, const BowtieHit& h2)
85     {
86     return h1.insert_id() < h2.insert_id();
87     }
88    
89     void LineHitFactory::openStream(HitStream& hs)
90     {
91     if (hs._hit_file==NULL && !hs._hit_file_name.empty()) {
92     //open the file for HitStream here
93     hs._hit_file=fopen(hs._hit_file_name.c_str(),"r");
94     if (hs._hit_file==NULL)
95     err_die("Error opening HitStream file %s\n",hs._hit_file_name.c_str());
96     return;
97     }
98     if (hs._fzpipe!=NULL) {
99     hs._hit_file=hs._fzpipe->file;
100     }
101     }
102    
103     void LineHitFactory::rewind(HitStream& hs)
104     {
105     if (hs._fzpipe!=NULL) {
106     hs._fzpipe->rewind();
107     hs._hit_file=hs._fzpipe->file;
108     }
109     else if (hs._hit_file) ::rewind((FILE*)(hs._hit_file));
110     }
111    
112     bool LineHitFactory::next_record(HitStream& hs, const char*& buf, size_t& buf_size) {
113     FILE* f=(FILE *)(hs._hit_file);
114     bool new_rec = (fgets(_hit_buf, _hit_buf_max_sz - 1, f)!=NULL);
115     if (!new_rec || feof(f)) {
116     hs._eof=true;
117     return false;
118     }
119     ++_line_num;
120     char* nl = strrchr(_hit_buf, '\n');
121     if (nl) *nl = 0;
122     buf = _hit_buf;
123     buf_size = _hit_buf_max_sz - 1;
124     return true;
125     }
126    
127     void LineHitFactory::closeStream(HitStream& hs) {
128     if (hs._fzpipe!=NULL) {
129     hs._fzpipe->close();
130     return;
131     }
132     if (hs._hit_file!=NULL) {
133     fclose((FILE*)(hs._hit_file));
134     hs._hit_file=NULL;
135     }
136     }
137     void BAMHitFactory::openStream(HitStream& hs) {
138     if (hs._hit_file==NULL) {
139     if (hs._hit_file_name.empty())
140     //err_die("Error: invalid HitStream set for BAMHitFactory(file name missing)\n");
141     return; //invalid stream, could be just a place holder
142     //open the file here if not already open
143     string fext=getFext(hs._hit_file_name);
144     if (fext=="sam")
145     hs._hit_file = samopen(hs._hit_file_name.c_str(), "r", 0);
146     else
147     hs._hit_file = samopen(hs._hit_file_name.c_str(), "rb", 0);
148    
149     samfile_t* sam_file=(samfile_t*)(hs._hit_file);
150    
151     if (sam_file == NULL)
152     err_die("Error opening SAM file %s\n", hs._hit_file_name.c_str());
153     if (sam_file->header == NULL)
154     err_die("Error: no SAM header found for file %s\n", hs._hit_file_name.c_str());
155     memset(&_next_hit, 0, sizeof(_next_hit));
156     //_beginning = bgzf_tell(sam_file->x.bam);
157     _sam_header=sam_file->header;
158     if (inspect_header(hs) == false)
159     err_die("Error: invalid SAM header for file %s\n",
160     hs._hit_file_name.c_str());
161     }
162     }
163    
164     void BAMHitFactory::closeStream(HitStream& hs) {
165     if (hs._hit_file) {
166     samclose((samfile_t*)(hs._hit_file));
167     }
168     hs._hit_file=NULL;
169     _sam_header=NULL;
170     }
171    
172     void BAMHitFactory::rewind(HitStream& hs)
173     {
174     /*
175     if (_hit_file && ((samfile_t*)_hit_file)->x.bam)
176     {
177     bgzf_seek(((samfile_t*)_hit_file)->x.bam, _beginning, SEEK_SET);
178     _eof = false;
179     }
180     */
181     this->closeStream(hs);
182     this->openStream(hs);
183     }
184    
185     string BAMHitFactory::hitfile_rec(HitStream& hs, const char* hit_buf) {
186     const bam1_t* bamrec=(const bam1_t*)hit_buf;
187     char* tamline=bam_format1(((samfile_t*)(hs._hit_file))->header, bamrec);
188     string sam_line(tamline);
189     free(tamline);
190     return sam_line;
191     }
192    
193     bool BAMHitFactory::next_record(HitStream& hs, const char*& buf, size_t& buf_size) {
194     if (_next_hit.data) {
195     free(_next_hit.data);
196     _next_hit.data = NULL;
197     }
198     _sam_header=((samfile_t*)(hs._hit_file))->header; //needed by get_hit_from_buf later on
199     if (hs.eof() || !hs.ready()) return false;
200    
201     //mark_curr_pos();
202    
203     memset(&_next_hit, 0, sizeof(_next_hit));
204    
205     int bytes_read = samread((samfile_t*)(hs._hit_file), &_next_hit);
206     if (bytes_read <= 0) {
207     hs._eof = true;
208     return false;
209     }
210     buf = (const char*)&_next_hit;
211     buf_size = bytes_read;
212     return true;
213     }
214    
215    
216     BowtieHit HitFactory::create_hit(const string& insert_name,
217     const string& ref_name,
218     int left,
219     const vector<CigarOp>& cigar,
220     bool antisense_aln,
221     bool antisense_splice,
222     unsigned char edit_dist,
223     unsigned char splice_mms,
224     bool end)
225     {
226     uint64_t insert_id = _insert_table.get_id(insert_name);
227     uint32_t reference_id = _ref_table.get_id(ref_name, NULL, 0);
228    
229     return BowtieHit(reference_id,
230     insert_id,
231     left,
232     cigar,
233     antisense_aln,
234     antisense_splice,
235     edit_dist,
236     splice_mms,
237     end);
238     }
239    
240     BowtieHit HitFactory::create_hit(const string& insert_name,
241     const string& ref_name,
242     uint32_t left,
243     uint32_t read_len,
244     bool antisense_aln,
245     unsigned char edit_dist,
246     bool end)
247     {
248     uint64_t insert_id = _insert_table.get_id(insert_name);
249     uint32_t reference_id = _ref_table.get_id(ref_name, NULL, 0);
250    
251     return BowtieHit(reference_id,
252     insert_id,
253     left,
254     read_len,
255     antisense_aln,
256     edit_dist,
257     end);
258     }
259    
260     bool BowtieHitFactory::get_hit_from_buf(const char* orig_bwt_buf,
261     BowtieHit& bh,
262     bool strip_slash,
263     char* name_out,
264     char* name_tags,
265     char* seq,
266     char* qual)
267     {
268     if (!orig_bwt_buf || !*orig_bwt_buf)
269     return false;
270    
271     static const int buf_size = 2048;
272    
273     char bwt_buf[buf_size];
274     strcpy(bwt_buf, orig_bwt_buf);
275     //const char* bwt_fmt_str = "%s %c %s %d %s %s %d %s";
276    
277     char orientation;
278    
279     //int bwtf_ret = 0;
280     //uint32_t seqid = 0;
281    
282     char text_name[buf_size];
283     unsigned int text_offset;
284    
285    
286     //unsigned int other_occs;
287     char mismatches[buf_size];
288     //memset(mismatches, 0, sizeof(mismatches));
289    
290     const char* buf = bwt_buf;
291     char* name = get_token((char**)&buf,"\t");
292     char* orientation_str = get_token((char**)&buf,"\t");
293     char* text_name_str = get_token((char**)&buf,"\t");
294    
295     strcpy(text_name, text_name_str);
296    
297     char* text_offset_str = get_token((char**)&buf,"\t");
298     char* seq_str = get_token((char**)&buf,"\t");
299     if (seq)
300     strcpy(seq, seq_str);
301    
302     const char* qual_str = get_token((char**)&buf,"\t");
303     if (qual)
304     strcpy(qual, qual_str);
305    
306     /*const char* other_occs_str =*/ get_token((char**)&buf, "\t");
307     mismatches[0] = 0;
308     char* mismatches_str = get_token((char**)&buf, "\t");
309     if (mismatches_str)
310     strcpy(mismatches, mismatches_str);
311    
312     orientation = orientation_str[0];
313     text_offset = atoi(text_offset_str);
314    
315     bool end = true;
316     unsigned int seg_offset = 0;
317     unsigned int seg_num = 0;
318     unsigned int num_segs = 0;
319    
320     // Copy the tag out of the name field before we might wipe it out
321     char* pipe = strrchr(name, '|');
322     if (pipe)
323     {
324     if (name_tags)
325     strcpy(name_tags, pipe);
326    
327     char* tag_buf = pipe + 1;
328     if (strchr(tag_buf, ':'))
329     {
330     sscanf(tag_buf, "%u:%u:%u", &seg_offset, &seg_num, &num_segs);
331     if (seg_num + 1 == num_segs)
332     end = true;
333     else
334     end = false;
335     }
336    
337     *pipe = 0;
338     }
339     // Stripping the slash and number following it gives the insert name
340     char* slash = strrchr(name, '/');
341     if (strip_slash && slash)
342     *slash = 0;
343    
344     size_t read_len = strlen(seq_str);
345    
346     // Add this alignment to the table of hits for this half of the
347     // Bowtie map
348    
349     //vector<string> mismatch_toks;
350     char* pch = strtok (mismatches,",");
351     unsigned char num_mismatches = 0;
352     while (pch != NULL)
353     {
354     char* colon = strchr(pch, ':');
355     if (colon)
356     {
357     num_mismatches++;
358     }
359     //mismatch_toks.push_back(pch);
360     pch = strtok (NULL, ",");
361     }
362    
363     bh = create_hit(name,
364     text_name,
365     text_offset,
366     (int)read_len,
367     orientation == '-',
368     num_mismatches,
369     end);
370    
371     return true;
372     }
373    
374     int anchor_mismatch = 0;
375    
376 gpertea 70
377     void parseSegReadName(char* name, char*& name_tags, bool strip_slash,
378     bool &end, unsigned int &seg_offset, unsigned int& seg_num,
379     unsigned int & num_segs) {
380     char* pipe = strrchr(name, '|');
381     if (pipe)
382     {
383     if (name_tags)
384     strcpy(name_tags, pipe);
385    
386     char* tag_buf = pipe + 1;
387     if (strchr(tag_buf, ':'))
388     {
389     sscanf(tag_buf, "%u:%u:%u", &seg_offset, &seg_num, &num_segs);
390     if (seg_num + 1 == num_segs)
391     end = true;
392     else
393     end = false;
394     }
395    
396     *pipe = 0;
397     }
398    
399     // Stripping the slash and number following it gives the insert name
400     char* slash = strrchr(name, '/');
401     if (strip_slash && slash)
402     *slash = 0;
403     }
404    
405    
406 gpertea 29 bool SplicedBowtieHitFactory::get_hit_from_buf(const char* orig_bwt_buf,
407     BowtieHit& bh,
408     bool strip_slash,
409     char* name_out,
410     char* name_tags,
411     char* seq,
412     char* qual)
413     {
414     if (!orig_bwt_buf || !*orig_bwt_buf)
415     return false;
416    
417     static const int buf_size = 2048;
418    
419     char bwt_buf[buf_size];
420     strcpy(bwt_buf, orig_bwt_buf);
421     //const char* bwt_fmt_str = "%s %c %s %d %s %s %d %s";
422    
423     char orientation;
424     char text_name[buf_size];
425     unsigned int text_offset;
426     char mismatches[buf_size];
427     //memset(mismatches, 0, sizeof(mismatches));
428    
429     char* buf = bwt_buf;
430     char* name = get_token((char**)&buf,"\t");
431     char* orientation_str = get_token((char**)&buf,"\t");
432     char* text_name_str = get_token((char**)&buf,"\t");
433     strcpy(text_name, text_name_str);
434    
435     char* text_offset_str = get_token((char**)&buf,"\t");
436     char* seq_str = get_token((char**)&buf,"\t");
437     if (seq)
438     strcpy(seq, seq_str);
439    
440     const char* qual_str = get_token((char**)&buf,"\t");
441     if (qual)
442     strcpy(qual, qual_str);
443    
444     /*const char* other_occs_str =*/ get_token((char**)&buf, "\t");
445     mismatches[0] = 0;
446     char* mismatches_str = get_token((char**)&buf, "\t");
447     if (mismatches_str)
448     strcpy(mismatches, mismatches_str);
449    
450     orientation = orientation_str[0];
451     text_offset = atoi(text_offset_str);
452    
453     bool end = true;
454     unsigned int seg_offset = 0;
455     unsigned int seg_num = 0;
456     unsigned int num_segs = 0;
457    
458     // Copy the tag out of the name field before we might wipe it out
459 gpertea 70 parseSegReadName(name, name_tags, strip_slash, end, seg_offset, seg_num, num_segs);
460 gpertea 29
461     // Add this alignment to the table of hits for this half of the
462     // Bowtie map
463    
464     // Parse the text_name field to recover the splice coords
465     vector<string> toks;
466    
467     tokenize_strict(text_name, "|", toks);
468    
469     int num_extra_toks = (int)toks.size() - 6;
470    
471     if (num_extra_toks >= 0)
472     {
473     static const uint8_t left_window_edge_field = 1;
474     static const uint8_t splice_field = 2;
475     //static const uint8_t right_window_edge_field = 3;
476     static const uint8_t junction_type_field = 4;
477     static const uint8_t strand_field = 5;
478    
479     string contig = toks[0];
480     for (int t = 1; t <= num_extra_toks; ++t)
481     {
482     contig += "|";
483     contig += toks[t];
484     }
485    
486     vector<string> splice_toks;
487     tokenize(toks[num_extra_toks + splice_field], "-", splice_toks);
488     if (splice_toks.size() != 2)
489     {
490     fprintf(stderr, "Warning: found malformed splice record, skipping:\n");
491     //fprintf(stderr, "%s (token: %s)\n", text_name,
492     // toks[num_extra_toks + splice_field].c_str());
493     return false;
494     }
495    
496     //
497     // check for an insertion hit
498     //
499     if(toks[num_extra_toks + junction_type_field] == "ins")
500     {
501     int8_t spliced_read_len = strlen(seq_str);
502     /*
503     * The 0-based position of the left edge of the alignment. Note that this
504     * value may need to be futher corrected to account for the presence of
505     * of the insertion.
506     */
507     uint32_t left = atoi(toks[num_extra_toks + left_window_edge_field].c_str()) + text_offset;
508     uint32_t right = left + spliced_read_len - 1;
509    
510 gpertea 137
511 gpertea 29 /*
512     * The 0-based position of the last genomic sequence before the insertion
513     */
514     uint32_t left_splice_pos = atoi(splice_toks[0].c_str());
515    
516     string insertedSequence = splice_toks[1];
517     /*
518     * The 0-based position of the first genomic sequence after teh insertion
519     */
520     uint32_t right_splice_pos = left_splice_pos + 1;
521     if(left > left_splice_pos){
522     /*
523     * The genomic position of the left edge of the alignment needs to be corrected
524     * If the alignment does not extend into the insertion, simply subtract the length
525     * of the inserted sequence, otherwise, just set it equal to the right edge
526     */
527     left = left - insertedSequence.length();
528     if(left < right_splice_pos){
529     left = right_splice_pos;
530     }
531     }
532     if(right > left_splice_pos){
533     right = right - insertedSequence.length();
534     if(right < left_splice_pos){
535     right = left_splice_pos;
536     }
537     }
538     /*
539     * Now, right and left should be properly transformed into genomic coordinates
540     * We should be able to deduce how much the alignment matches the insertion
541     * simply based on the length of the read
542     */
543     int left_match_length = 0;
544     if(left <= left_splice_pos){
545     left_match_length = left_splice_pos - left + 1;
546     }
547     int right_match_length = 0;
548     if(right >= right_splice_pos){
549     right_match_length = right - right_splice_pos + 1;
550     }
551     int insertion_match_length = spliced_read_len - left_match_length - right_match_length;
552    
553     if(left_match_length <= 0 || right_match_length <= 0 || insertion_match_length <= 0)
554     return false;
555    
556     string junction_strand = toks[num_extra_toks + strand_field];
557     if(junction_strand != "rev" && junction_strand != "fwd"){
558     fprintf(stderr, "Malformed insertion record\n");
559     return false;
560     }
561    
562     char* pch = strtok( mismatches, ",");
563     unsigned char num_mismatches = 0;
564    
565     /*
566     * remember that text_offset holds the left end of the
567     * alignment, relative to the start of the contig
568     */
569    
570     /*
571     * The 0-based relative position of the left-most character
572     * before the insertion in the contig
573     */
574     int relative_splice_pos = left_splice_pos - atoi(toks[num_extra_toks + left_window_edge_field].c_str());
575     while (pch != NULL)
576     {
577     char* colon = strchr(pch, ':');
578     if (colon)
579     {
580     *colon = 0;
581     int mismatch_pos = atoi(pch);
582    
583     /*
584     * for reversely mapped reads,
585     * find the correct mismatched position.
586     */
587     if(orientation == '-'){
588     mismatch_pos = spliced_read_len - mismatch_pos - 1;
589     }
590    
591     /*
592     * Only count mismatches outside of the insertion region
593     * If there is a mismatch within the insertion,
594     * disallow this hit
595     */
596     if(mismatch_pos + text_offset <= relative_splice_pos || mismatch_pos + text_offset > relative_splice_pos + insertedSequence.length()){
597     num_mismatches++;
598     }else{
599     return false;
600     }
601     }
602     pch = strtok (NULL, ",");
603     }
604    
605    
606     vector<CigarOp> cigar;
607     cigar.push_back(CigarOp(MATCH, left_match_length));
608     cigar.push_back(CigarOp(INS, insertion_match_length));
609     cigar.push_back(CigarOp(MATCH, right_match_length));
610    
611     /*
612     * For now, disallow hits that don't span
613     * the insertion. If we allow these types of hits,
614     * then long_spanning.cpp needs to be updated
615     * in order to intelligently merge these kinds
616     * of reads back together
617     *
618     * Following code has been changed to allow segment that end
619     * in an insertion
620     */
621     bh = create_hit(name,
622     contig,
623     left,
624     cigar,
625     orientation == '-',
626     junction_strand == "rev",
627     num_mismatches,
628     0,
629     end);
630     return true;
631     }
632    
633     else
634     {
635     uint32_t left = atoi(toks[num_extra_toks + left_window_edge_field].c_str()) + text_offset;
636     int spliced_read_len = strlen(seq_str);
637     int8_t left_splice_pos = atoi(splice_toks[0].c_str()) - left + 1;
638     if(left_splice_pos > spliced_read_len) left_splice_pos = spliced_read_len;
639     int8_t right_splice_pos = spliced_read_len - left_splice_pos;
640    
641     if (right_splice_pos <= 0 || left_splice_pos <= 0)
642     return false;
643    
644     if (orientation == '+')
645     {
646     if (left_splice_pos + seg_offset < _anchor_length){
647     return false;
648     }
649     }
650     else
651     {
652     if (right_splice_pos + seg_offset < _anchor_length)
653     return false;
654     }
655     //uint32_t right = atoi(splice_toks[1].c_str()) + right_splice_pos;
656     //atoi(toks[right_window_edge_field].c_str());
657     int gap_len = atoi(splice_toks[1].c_str()) - atoi(splice_toks[0].c_str()) - 1;
658    
659     string junction_strand = toks[num_extra_toks + strand_field];
660     if (!(junction_strand == "rev" || junction_strand == "fwd")||
661     !(orientation == '-' || orientation == '+'))
662     {
663     fprintf(stderr, "Warning: found malformed splice record, skipping\n");
664     //fprintf(stderr, "junction_strand=%s, orientation='%c'\n",
665     // junction_strand.c_str(), orientation);
666     return false;
667     }
668    
669     //vector<string> mismatch_toks;
670     char* pch = strtok (mismatches,",");
671     int mismatches_in_anchor = 0;
672     unsigned char num_mismatches = 0;
673     while (pch != NULL)
674     {
675     char* colon = strchr(pch, ':');
676     if (colon)
677     {
678     *colon = 0;
679     num_mismatches++;
680     int mismatch_pos = atoi(pch);
681     if ((orientation == '+' && abs(mismatch_pos - left_splice_pos) < (int)min_anchor_len) ||
682     (orientation == '-' && abs(((int)spliced_read_len - left_splice_pos + 1) - mismatch_pos)) < (int)min_anchor_len)
683     mismatches_in_anchor++;
684     }
685     //mismatch_toks.push_back(pch);
686     pch = strtok (NULL, ",");
687     }
688    
689     // FIXME: we probably should exclude these hits somewhere, but this
690     // isn't the right place
691     vector<CigarOp> cigar;
692     cigar.push_back(CigarOp(MATCH, left_splice_pos));
693     if(toks[num_extra_toks + junction_type_field] == "del"){
694     cigar.push_back(CigarOp(DEL, gap_len));
695     }else{
696     cigar.push_back(CigarOp(REF_SKIP, gap_len));
697     }
698     cigar.push_back(CigarOp(MATCH, right_splice_pos));
699    
700     bh = create_hit(name,
701     contig,
702     left,
703     cigar,
704     orientation == '-',
705     junction_strand == "rev",
706     num_mismatches,
707     mismatches_in_anchor,
708     end);
709     return true;
710     }
711     }
712     else
713     {
714     fprintf(stderr, "Warning: found malformed splice record, skipping\n");
715     //fprintf(stderr, "%s\n", orig_bwt_buf);
716     // continue;
717     return false;
718     }
719    
720     return false;
721     }
722    
723 gpertea 70
724 gpertea 137 int parseCigar(vector<CigarOp>& cigar, const char* cigar_str,
725 gpertea 70 bool &spliced_alignment) {
726     const char* p_cig = cigar_str;
727 gpertea 137 int refspan=0; //alignment span on reference sequence
728 gpertea 70 while (*p_cig)
729     {
730     char* t;
731 gpertea 137 int op_len = (int)strtol(p_cig, &t, 10);
732     if (op_len <= 0)
733 gpertea 70 {
734 gpertea 137 fprintf (stderr, "Error: CIGAR op has zero length\n");
735     return 0;
736 gpertea 70 }
737     char op_char = toupper(*t);
738     CigarOpCode opcode;
739 gpertea 137 switch (op_char) {
740     case '=':
741     case 'X':
742     case 'M': opcode = MATCH;
743     refspan+=op_len;
744     break;
745     case 'I': opcode = INS;
746     break;
747     case 'D': opcode = DEL;
748     refspan+=op_len;
749     break;
750     case 'N': if (op_len > max_report_intron_length)
751     return 0;
752     opcode = REF_SKIP;
753     spliced_alignment = true;
754     refspan+=op_len;
755     break;
756     case 'S': opcode = SOFT_CLIP;
757     break;
758     case 'H': opcode = HARD_CLIP;
759     break;
760     case 'P': opcode = PAD;
761     break;
762     default: fprintf (stderr, "Error: invalid CIGAR operation\n");
763     return 0;
764     }
765 gpertea 70 p_cig = t + 1;
766 gpertea 137 cigar.push_back(CigarOp(opcode, op_len));
767     } //while cigar codes
768 gpertea 70 if (*p_cig) {
769 gpertea 137 fprintf (stderr, "Error: unmatched CIGAR operation (%s)\n",p_cig);
770     return 0;
771 gpertea 70 }
772 gpertea 137 return refspan;
773 gpertea 70 }
774    
775 gpertea 137 int getSAMmismatches(char* &buf, vector<CigarOp>& cigar,
776 gpertea 139 vector<bool>& mismatches, int& sam_nm, bool& antisense_splice) {
777 gpertea 137 int gspan=0;//genomic span of the alignment
778     const char* tag_buf = buf;
779 gpertea 70 sam_nm=0;
780 gpertea 139 int num_mismatches=0;
781 gpertea 70 while((tag_buf = get_token((char**)&buf,"\t")))
782     {
783     vector<string> tuple_fields;
784     tokenize(tag_buf,":", tuple_fields);
785     if (tuple_fields.size() == 3)
786     {
787     if (tuple_fields[0] == "XS")
788     {
789     if (tuple_fields[2] == "-")
790     antisense_splice = true;
791     }
792     else if (tuple_fields[0] == "NM")
793     {
794     sam_nm = atoi(tuple_fields[2].c_str());
795     }
796     else if (tuple_fields[0] == "NS")
797     {
798     //ignored for now
799     }
800     else if (tuple_fields[0] == "MD")
801     {
802     const char* p=tuple_fields[2].c_str();
803     int bi=0; //base offset position in the read
804     while (*p != 0) {
805     if (isdigit(*p)) {
806     int v=atoi(p);
807     do { p++; } while (isdigit(*p));
808     bi+=v;
809     }
810 gpertea 137 while (isalpha(*p)) {
811 gpertea 70 p++;
812 gpertea 139 num_mismatches++;
813     //mismatches.push_back(bi);
814     mismatches[bi]=true;
815 gpertea 70 bi++;
816     }
817     if (*p=='^') { //reference deletion
818     p++;
819 gpertea 137 while (isalpha(*p)) { //insert read bases
820 gpertea 70 p++; bi++;
821 gpertea 137 }
822 gpertea 70 }
823     }
824     }
825 gpertea 139 //else
826     //{
827 gpertea 70 //fprintf(stderr, "%s attribute not supported\n", tuple_fields[0].c_str());
828     //return false;
829 gpertea 139 //}
830 gpertea 70 }
831     }
832     /* By convention,the NM field of the SAM record
833     * counts an insertion or deletion. I dont' think
834     * we want the mismatch count in the BowtieHit
835     * record to reflect this. Therefore, subtract out
836     * the mismatches due to in/dels
837     */
838     for(vector<CigarOp>::const_iterator itr = cigar.begin(); itr != cigar.end(); ++itr){
839 gpertea 137 switch (itr->opcode)
840     {
841     case MATCH:
842     case REF_SKIP:
843     case PAD:
844     gspan += itr->length;
845     break;
846     case DEL:
847     gspan += itr->length;
848     sam_nm -= itr->length;
849     break;
850     case INS:
851     sam_nm -= itr->length;
852     break;
853 gpertea 70 }
854 gpertea 137 }
855 gpertea 139 return num_mismatches;
856 gpertea 70 }
857    
858 gpertea 29 bool SAMHitFactory::get_hit_from_buf(const char* orig_bwt_buf,
859     BowtieHit& bh,
860     bool strip_slash,
861     char* name_out,
862     char* name_tags,
863     char* seq,
864     char* qual)
865     {
866     if (!orig_bwt_buf || !*orig_bwt_buf)
867     return false;
868     char bwt_buf[2048];
869     strcpy(bwt_buf, orig_bwt_buf);
870     // Are we still in the header region?
871     if (bwt_buf[0] == '@')
872     return false;
873    
874     char* buf = bwt_buf;
875     char* name = get_token((char**)&buf,"\t");
876     char* sam_flag_str = get_token((char**)&buf,"\t");
877     char* text_name = get_token((char**)&buf,"\t");
878     char* text_offset_str = get_token((char**)&buf,"\t");
879     const char* map_qual_str = get_token((char**)&buf,"\t");
880     char* cigar_str = get_token((char**)&buf,"\t");
881     const char* mate_ref_str = get_token((char**)&buf,"\t");
882     const char* mate_pos_str = get_token((char**)&buf,"\t");
883     const char* inferred_insert_sz_str = get_token((char**)&buf,"\t");
884    
885     const char* seq_str = get_token((char**)&buf,"\t");
886     if (seq)
887     strcpy(seq, seq_str);
888    
889     const char* qual_str = get_token((char**)&buf,"\t");
890     if (qual)
891     strcpy(qual, qual_str);
892    
893     if (!name ||
894     !sam_flag_str ||
895     !text_name ||
896     !text_offset_str ||
897     !map_qual_str ||
898     !cigar_str ||
899     !mate_ref_str ||
900     !mate_pos_str ||
901     !inferred_insert_sz_str ||
902     !seq_str ||
903     !qual_str)
904     {
905     // truncated or malformed SAM record
906     return false;
907     }
908    
909    
910     int sam_flag = atoi(sam_flag_str);
911     int text_offset = atoi(text_offset_str);
912    
913     bool end = true;
914     unsigned int seg_offset = 0;
915     unsigned int seg_num = 0;
916     unsigned int num_segs = 0;
917    
918     // Copy the tag out of the name field before we might wipe it out
919 gpertea 70 parseSegReadName(name, name_tags, strip_slash, end, seg_offset, seg_num, num_segs);
920 gpertea 29
921     vector<CigarOp> cigar;
922     bool spliced_alignment = false;
923 gpertea 137 int refspan=parseCigar(cigar, cigar_str, spliced_alignment);
924     if (refspan==0)
925 gpertea 70 return false;
926 gpertea 29 //vector<string> attributes;
927     //tokenize(tag_buf, " \t",attributes);
928 gpertea 70
929 gpertea 29 bool antisense_splice = false;
930 gpertea 70 int sam_nm = 0; //the value of the NM tag (edit distance)
931 gpertea 137 //int mismatches[1024];//array with mismatch positions on the read (0-based from the left aligned end of the read)
932 gpertea 139 vector<bool> mismatches;
933     mismatches.resize(strlen(seq_str), false);
934     int num_mismatches=getSAMmismatches(buf, cigar, mismatches, sam_nm, antisense_splice);
935 gpertea 29
936     if (spliced_alignment)
937     {
938     bh = create_hit(name,
939     text_name,
940     text_offset - 1,
941     cigar,
942     sam_flag & 0x0010,
943     antisense_splice,
944 gpertea 139 num_mismatches,
945 gpertea 70 0,
946 gpertea 29 end);
947     }
948     else
949     {
950     //assert(cigar.size() == 1 && cigar[0].opcode == MATCH);
951     bh = create_hit(name,
952     text_name,
953     text_offset - 1, // SAM files are 1-indexed
954     cigar,
955     sam_flag & 0x0010,
956     false,
957 gpertea 139 num_mismatches,
958 gpertea 29 0,
959     end);
960     }
961 gpertea 70 return true;
962 gpertea 29 }
963    
964 gpertea 137 void cigar_add(vector<CigarOp>& cigar, CigarOp& op) {
965     if (op.length<=0) return;
966     if (cigar.size()>0 && cigar.back().opcode==op.opcode) {
967     cigar.back().length+=op.length;
968     }
969     cigar.push_back(op);
970     }
971 gpertea 70
972 gpertea 139 int spliceCigar(vector<CigarOp>& splcigar, vector<CigarOp>& cigar, vector<bool> mismatches,
973 gpertea 137 int &left, int spl_start, int spl_len, CigarOpCode spl_code) {
974 gpertea 70 //merge the original 'cigar' with the new insert/gap operation
975 gpertea 137 //at position spl_start and place the result into splcigar;
976 gpertea 139 //TODO: ideally this should also get and rebuild the MD string (alignment mismatches)
977     int spl_mismatches=0;
978     //return value: mismatches in the insert region for INS case,
979     //or number of mismatches in the anchor region
980     //return -1 if somehow the hit seems bad
981    
982 gpertea 137 //these offsets are relative to the beginning of alignment on reference
983     int spl_ofs=spl_start-left; //relative position of splice op
984     int spl_ofs_end=spl_ofs; //relative position of first ref base AFTER splice op
985     CigarOp gapop(spl_code, spl_len); //for DEL or REF_SKIP
986     if (spl_code==INS)
987     spl_ofs_end += spl_len;
988     int ref_ofs=0; //working offset on reference
989     int read_ofs=0; //working offset on the read, relative to the leftmost aligned base
990     bool xfound=false;
991     //if (left<=spl_start+spl_len) {
992     if (spl_ofs_end>0) {
993     int prev_opcode=0;
994     int prev_oplen=0;
995     for (size_t c = 0 ; c < cigar.size(); ++c)
996     {
997     int prev_read_ofs=read_ofs;
998     int cur_op_ofs=ref_ofs;
999     int cur_opcode=cigar[c].opcode;
1000     int cur_oplen=cigar[c].length;
1001     switch (cur_opcode) {
1002     case MATCH:
1003     ref_ofs+=cur_oplen;
1004     read_ofs+=cur_oplen;
1005 gpertea 139 if (spl_code==REF_SKIP)
1006     for (int o=cur_op_ofs;o<ref_ofs;o++) {
1007     int rofs=prev_read_ofs+(cur_op_ofs-o);
1008     if (abs(spl_ofs-o)<min_anchor_len && mismatches[rofs])
1009     spl_mismatches++;
1010     }
1011 gpertea 137 break;
1012     case DEL:
1013     case REF_SKIP:
1014     case PAD:
1015     ref_ofs+=cur_oplen;
1016     break;
1017     case SOFT_CLIP:
1018     case INS:
1019     read_ofs+=cur_oplen;
1020     break;
1021     //case HARD_CLIP:
1022     }
1023 gpertea 139
1024 gpertea 137 if (cur_op_ofs>=spl_ofs_end || ref_ofs<=spl_ofs) {
1025 gpertea 139 if (cur_op_ofs==spl_ofs_end && spl_code!=INS)
1026 gpertea 137 {
1027 gpertea 139 xfound=true;
1028 gpertea 137 //we have to insert the gap here first
1029 gpertea 139 cigar_add(splcigar, gapop);
1030     //also, check
1031 gpertea 137 }
1032     cigar_add(splcigar, cigar[c]);
1033     }
1034     else //if (ref_ofs>spl_ofs) {
1035     { //op intersection
1036     xfound=true;
1037     if (spl_code==INS) {
1038     //we have to shorten cur_opcode
1039     // find the overlap between current range
1040     int ovl_start = (cur_op_ofs>spl_ofs) ? cur_op_ofs : spl_ofs;
1041     int ovl_end = (ref_ofs>spl_ofs_end) ? spl_ofs_end : ref_ofs;
1042     CigarOp op(cigar[c]);
1043     int len=op.length;
1044     len-=(ovl_end-ovl_start);
1045     if (len>0) {
1046     op.length=len;
1047     cigar_add(splcigar, op);
1048     }
1049     }
1050     else {//DEL or REF_SKIP
1051     //spl_ofs == spl_ofs_end
1052     //we have to split cur_opcode
1053 gpertea 139 //look for mismatches within min_anchor_len distance from splice point
1054 gpertea 137 CigarOp op(cigar[c]);
1055     op.length=spl_ofs-cur_op_ofs;
1056     cigar_add(splcigar, op);
1057     cigar_add(splcigar, gapop);
1058     op.length=ref_ofs-spl_ofs;
1059     cigar_add(splcigar,op);
1060     }
1061     } //op intersection
1062     prev_opcode=cur_opcode;
1063     prev_oplen=cur_oplen;
1064     } //for each cigar opcode
1065     } //intersection possible
1066 gpertea 139
1067     //if (!xfound) {//no intersection found between splice event and alignment
1068 gpertea 137 if (spl_ofs_end<=0) {
1069     //alignment starts after the splice event
1070     if (spl_code==INS) left-=spl_len;
1071     else left+=spl_len;
1072     }
1073     //else {
1074     //alignment ends before the splice event
1075     //nothing to do
1076     // }
1077 gpertea 139 //return spl_mismatches;
1078     // }
1079 gpertea 97
1080 gpertea 139 return spl_mismatches;
1081 gpertea 70 }
1082    
1083 gpertea 69 bool SplicedSAMHitFactory::get_hit_from_buf(const char* orig_bwt_buf,
1084     BowtieHit& bh,
1085     bool strip_slash,
1086     char* name_out,
1087     char* name_tags,
1088     char* seq,
1089     char* qual)
1090     {
1091     if (!orig_bwt_buf || !*orig_bwt_buf)
1092     return false;
1093 gpertea 29
1094 gpertea 69 char bwt_buf[2048];
1095     strcpy(bwt_buf, orig_bwt_buf);
1096     // Are we still in the header region?
1097     if (bwt_buf[0] == '@')
1098     return false;
1099 gpertea 29
1100 gpertea 69 char* buf = bwt_buf;
1101     char* name = get_token((char**)&buf,"\t");
1102     char* sam_flag_str = get_token((char**)&buf,"\t");
1103     char* text_name = get_token((char**)&buf,"\t");
1104     char* text_offset_str = get_token((char**)&buf,"\t");
1105     const char* map_qual_str = get_token((char**)&buf,"\t");
1106     char* cigar_str = get_token((char**)&buf,"\t");
1107     const char* mate_ref_str = get_token((char**)&buf,"\t");
1108     const char* mate_pos_str = get_token((char**)&buf,"\t");
1109     const char* inferred_insert_sz_str = get_token((char**)&buf,"\t");
1110 gpertea 137 //int num_mismatches=0;
1111     //int mismatches[1024]; //list of 0-based mismatch positions in this read
1112 gpertea 70 //parsed from SAM's MD:Z: tag
1113 gpertea 69 const char* seq_str = get_token((char**)&buf,"\t");
1114     if (seq)
1115     strcpy(seq, seq_str);
1116    
1117     const char* qual_str = get_token((char**)&buf,"\t");
1118     if (qual)
1119     strcpy(qual, qual_str);
1120    
1121     if (!name ||
1122     !sam_flag_str ||
1123     !text_name ||
1124     !text_offset_str ||
1125     !map_qual_str ||
1126     !cigar_str ||
1127     !mate_ref_str ||
1128     !mate_pos_str ||
1129     !inferred_insert_sz_str ||
1130     !seq_str ||
1131     !qual_str)
1132     {
1133     // truncated or malformed SAM record
1134     return false;
1135     }
1136    
1137    
1138     int sam_flag = atoi(sam_flag_str);
1139     int text_offset = atoi(text_offset_str);
1140 gpertea 137 text_offset--; //make it 0-based (SAM is 1-based, Bowtie is 0-based)
1141 gpertea 69 bool end = true;
1142     unsigned int seg_offset = 0;
1143     unsigned int seg_num = 0;
1144     unsigned int num_segs = 0;
1145    
1146     // Copy the tag out of the name field before we might wipe it out
1147 gpertea 70 parseSegReadName(name, name_tags, strip_slash, end, seg_offset, seg_num, num_segs);
1148 gpertea 69
1149 gpertea 70 vector<CigarOp> samcigar;
1150     bool spliced_alignment = false;
1151 gpertea 137 int refspan=parseCigar(samcigar, cigar_str, spliced_alignment);
1152     if (refspan==0)
1153     return false;
1154 gpertea 70 bool antisense_splice = false;
1155     int sam_nm = 0;
1156 gpertea 139 vector<bool> mismatches;
1157     mismatches.resize(strlen(seq_str), false);
1158 gpertea 69
1159 gpertea 139 int num_mismatches=getSAMmismatches(buf, samcigar, mismatches, sam_nm, antisense_splice);
1160    
1161 gpertea 70 //##############################################
1162 gpertea 69
1163     // Add this alignment to the table of hits for this half of the
1164     // Bowtie map
1165    
1166     // Parse the text_name field to recover the splice coords
1167     vector<string> toks;
1168     tokenize_strict(text_name, "|", toks);
1169    
1170     int num_extra_toks = (int)toks.size() - 6;
1171    
1172     if (num_extra_toks >= 0)
1173     {
1174     static const uint8_t left_window_edge_field = 1;
1175     static const uint8_t splice_field = 2;
1176     //static const uint8_t right_window_edge_field = 3;
1177     static const uint8_t junction_type_field = 4;
1178     static const uint8_t strand_field = 5;
1179    
1180     string contig = toks[0];
1181     for (int t = 1; t <= num_extra_toks; ++t)
1182     {
1183     contig += "|";
1184     contig += toks[t];
1185     }
1186    
1187     vector<string> splice_toks;
1188     tokenize(toks[num_extra_toks + splice_field], "-", splice_toks);
1189     if (splice_toks.size() != 2)
1190     {
1191     fprintf(stderr, "Warning: found malformed splice record, skipping:\n");
1192 gpertea 70 //fprintf(stderr, "\t%s (token: %s)\n", text_name,
1193 gpertea 69 // toks[num_extra_toks + splice_field].c_str());
1194     return false;
1195     }
1196 gpertea 137
1197     string junction_strand = toks[num_extra_toks + strand_field];
1198     if(junction_strand != "rev" && junction_strand != "fwd"){
1199     fprintf(stderr, "Malformed insertion record\n");
1200     return false;
1201     }
1202    
1203 gpertea 69 //
1204     // check for an insertion hit
1205     //
1206     if(toks[num_extra_toks + junction_type_field] == "ins")
1207     {
1208 gpertea 137 //int8_t spliced_read_len = strlen(seq_str);
1209     //TODO FIXME: use the CIGAR instead of seq length!
1210 gpertea 70 // The 0-based position of the left edge of the alignment. Note that this
1211 gpertea 137 // value may need to be further corrected to account for the presence of
1212 gpertea 70 // of the insertion.
1213 gpertea 137 int left = atoi(toks[num_extra_toks + left_window_edge_field].c_str()) + text_offset;
1214 gpertea 69
1215 gpertea 70 // The 0-based position of the last genomic sequence before the insertion
1216 gpertea 137 int left_splice_pos = atoi(splice_toks[0].c_str());
1217 gpertea 69
1218     string insertedSequence = splice_toks[1];
1219 gpertea 70 // The 0-based position of the first genomic sequence after the insertion
1220    
1221 gpertea 137 vector<CigarOp> splcigar;
1222     //this also updates left to the adjusted genomic coordinates
1223 gpertea 139 int spl_num_mismatches=spliceCigar(splcigar, samcigar, mismatches,
1224     left, left_splice_pos+1, insertedSequence.length(), INS);
1225     if (spl_num_mismatches<0) return false;
1226     num_mismatches-=spl_num_mismatches;
1227 gpertea 137 /*
1228 gpertea 69 uint32_t right_splice_pos = left_splice_pos + 1;
1229 gpertea 137
1230     //uint32_t right = left + spliced_read_len - 1;
1231     int right = left + refspan - 1;
1232    
1233 gpertea 69 if(left > left_splice_pos){
1234 gpertea 137 //The genomic position of the left edge of the alignment needs to be corrected
1235     //If the alignment does not extend into the insertion, simply subtract the length
1236     //of the inserted sequence, otherwise, just set it equal to the right edge
1237 gpertea 69 left = left - insertedSequence.length();
1238     if(left < right_splice_pos){
1239     left = right_splice_pos;
1240     }
1241     }
1242     if(right > left_splice_pos){
1243     right = right - insertedSequence.length();
1244     if(right < left_splice_pos){
1245     right = left_splice_pos;
1246     }
1247     }
1248 gpertea 137 // Now, right and left should be properly transformed into genomic coordinates
1249     // We should be able to deduce how much the alignment matches the insertion
1250     // simply based on the length of the read
1251 gpertea 69 int left_match_length = 0;
1252     if(left <= left_splice_pos){
1253     left_match_length = left_splice_pos - left + 1;
1254     }
1255     int right_match_length = 0;
1256     if(right >= right_splice_pos){
1257     right_match_length = right - right_splice_pos + 1;
1258     }
1259     int insertion_match_length = spliced_read_len - left_match_length - right_match_length;
1260    
1261     if(left_match_length <= 0 || right_match_length <= 0 || insertion_match_length <= 0)
1262     return false;
1263    
1264 gpertea 70 //char* pch = strtok( mismatches, ",");
1265     //unsigned char num_mismatches = 0;
1266     //text_offset holds the left end of the alignment,
1267     //RELATIVE TO the start of the contig
1268 gpertea 69
1269 gpertea 70 //The 0-based relative position of the left-most character
1270     //before the insertion in the contig
1271 gpertea 69 int relative_splice_pos = left_splice_pos - atoi(toks[num_extra_toks + left_window_edge_field].c_str());
1272 gpertea 137 for (size_t i=0;i<mismatches.size();++i) {
1273 gpertea 70 int mismatch_pos = mismatches[i];
1274     // for reversely mapped reads,
1275     //find the correct mismatched position.
1276     if (sam_flag & 0x0010){
1277     mismatch_pos = spliced_read_len - mismatch_pos - 1;
1278     }
1279 gpertea 69
1280 gpertea 70 //Only count mismatches outside of the insertion region
1281     // If there is a mismatch within the insertion,
1282     // disallow this hit
1283 gpertea 137 if(mismatch_pos + text_offset <= relative_splice_pos ||
1284     mismatch_pos + text_offset > relative_splice_pos + insertedSequence.length()){
1285 gpertea 70 spl_num_mismatches++;
1286     }else{
1287     return false;
1288 gpertea 69 }
1289     }
1290 gpertea 137 */
1291     //vector<CigarOp> splcigar;
1292     //spliceCigar(splcigar, samcigar, left_match_length, insertion_match_length, right_match_length, INS);
1293     //splcigar.push_back(CigarOp(MATCH, left_match_length));
1294     //splcigar.push_back(CigarOp(INS, insertion_match_length));
1295     //splcigar.push_back(CigarOp(MATCH, right_match_length));
1296 gpertea 69
1297     bh = create_hit(name,
1298     contig,
1299     left,
1300 gpertea 70 //splcigar,
1301     splcigar,
1302     sam_flag & 0x0010,
1303 gpertea 69 junction_strand == "rev",
1304 gpertea 139 num_mismatches,
1305 gpertea 69 0,
1306     end);
1307     return true;
1308 gpertea 70 } //"ins"
1309 gpertea 137 else //"del" or intron
1310 gpertea 69 {
1311 gpertea 139 // The 0-based position of the left edge of the alignment.
1312 gpertea 137 int left = atoi(toks[num_extra_toks + left_window_edge_field].c_str()) + text_offset;
1313 gpertea 69
1314 gpertea 139 // The 0-based position of the last genomic sequence before the deletion
1315     int left_splice_pos = atoi(splice_toks[0].c_str());
1316 gpertea 69
1317 gpertea 139 int gap_len = atoi(splice_toks[1].c_str()) - left_splice_pos - 1;
1318     /*
1319 gpertea 69 if ((sam_flag & 0x0010) == 0) //######
1320     {
1321 gpertea 139 if (left_splice_ofs + seg_offset < _anchor_length)
1322 gpertea 135 return false;
1323     }
1324     else
1325     {
1326 gpertea 139 if (right_splice_ofs + seg_offset < _anchor_length)
1327 gpertea 135 return false;
1328     }
1329 gpertea 139 */
1330 gpertea 69 //uint32_t right = atoi(splice_toks[1].c_str()) + right_splice_pos;
1331     //atoi(toks[right_window_edge_field].c_str());
1332    
1333 gpertea 139 /*
1334     //offset of deletion point, relative to the beginning of the alignment
1335     int left_splice_ofs = left_splice_pos-left+1;
1336    
1337 gpertea 69 int mismatches_in_anchor = 0;
1338 gpertea 137 for (size_t i=0;i<mismatches.size();++i) {
1339     spl_num_mismatches++;
1340     int mismatch_pos = mismatches[i];
1341 gpertea 139 if (((sam_flag & 0x0010) == 0 && abs(mismatch_pos - left_splice_ofs) < (int)min_anchor_len) ||
1342 gpertea 137 ((sam_flag & 0x0010) != 0 &&
1343 gpertea 139 abs(((int)refspan - left_splice_ofs + 1) - mismatch_pos)) < (int)min_anchor_len)
1344 gpertea 70 mismatches_in_anchor++;
1345 gpertea 137 }
1346 gpertea 139 */
1347 gpertea 70 vector<CigarOp> splcigar;
1348     CigarOpCode opcode=(toks[num_extra_toks + junction_type_field] == "del")? DEL : REF_SKIP ;
1349 gpertea 140
1350 gpertea 139 int spl_num_mismatches=spliceCigar(splcigar, samcigar, mismatches, left,
1351     left_splice_pos+1, gap_len, opcode);
1352     if (spl_num_mismatches<0) // || spl_num_mismatches>max_anchor_mismatches)
1353     return false;
1354     /*
1355 gpertea 70 splcigar.push_back(CigarOp(MATCH, left_splice_pos));
1356 gpertea 69 if(toks[num_extra_toks + junction_type_field] == "del"){
1357 gpertea 70 splcigar.push_back(CigarOp(DEL, gap_len));
1358 gpertea 69 }else{
1359 gpertea 70 splcigar.push_back(CigarOp(REF_SKIP, gap_len));
1360 gpertea 69 }
1361 gpertea 70 splcigar.push_back(CigarOp(MATCH, right_splice_pos));
1362 gpertea 139 */
1363 gpertea 69 bh = create_hit(name,
1364     contig,
1365     left,
1366 gpertea 70 splcigar,
1367     (sam_flag & 0x0010),
1368 gpertea 69 junction_strand == "rev",
1369 gpertea 139 num_mismatches,
1370 gpertea 70 spl_num_mismatches,
1371 gpertea 69 end);
1372     return true;
1373     }
1374 gpertea 70 } //parse splice data
1375 gpertea 69 else
1376     {
1377     fprintf(stderr, "Warning: found malformed splice record, skipping\n");
1378     //fprintf(stderr, "%s\n", orig_bwt_buf);
1379     // continue;
1380     return false;
1381     }
1382    
1383     return false;
1384     }
1385    
1386    
1387 gpertea 135
1388 gpertea 29 bool BAMHitFactory::get_hit_from_buf(const char* orig_bwt_buf,
1389     BowtieHit& bh, bool strip_slash,
1390     char* name_out, char* name_tags,
1391     char* seq, char* qual) {
1392     if (_sam_header==NULL)
1393     err_die("Error: no SAM header when BAMHitFactory::get_hit_from_buf()!");
1394     const bam1_t* hit_buf = (const bam1_t*)orig_bwt_buf;
1395    
1396    
1397     uint32_t sam_flag = hit_buf->core.flag;
1398    
1399     int text_offset = hit_buf->core.pos;
1400     int text_mate_pos = hit_buf->core.mpos;
1401     int target_id = hit_buf->core.tid;
1402     int mate_target_id = hit_buf->core.mtid;
1403    
1404     vector<CigarOp> cigar;
1405     bool spliced_alignment = false;
1406     int num_hits = 1;
1407    
1408     double mapQ = hit_buf->core.qual;
1409     long double error_prob;
1410     if (mapQ > 0)
1411     {
1412     long double p = (-1.0 * mapQ) / 10.0;
1413     error_prob = pow(10.0L, p);
1414     }
1415     else
1416     {
1417     error_prob = 1.0;
1418     }
1419    
1420     //header->target_name[c->tid]
1421    
1422     bool end = true;
1423     unsigned int seg_offset = 0;
1424     unsigned int seg_num = 0;
1425     unsigned int num_segs = 0;
1426     // Copy the tag out of the name field before we might wipe it out
1427     char* qname = bam1_qname(hit_buf);
1428     char* pipe = strrchr(qname, '|');
1429     if (pipe)
1430     {
1431     if (name_tags)
1432     strcpy(name_tags, pipe);
1433    
1434     char* tag_buf = pipe + 1;
1435     if (strchr(tag_buf, ':'))
1436     {
1437     sscanf(tag_buf, "%u:%u:%u", &seg_offset, &seg_num, &num_segs);
1438     if (seg_num + 1 == num_segs)
1439     end = true;
1440     else
1441     end = false;
1442     }
1443    
1444     *pipe = 0;
1445     }
1446    
1447    
1448     if (target_id < 0) {
1449     //assert(cigar.size() == 1 && cigar[0].opcode == MATCH);
1450     bh = create_hit(qname,
1451     "*", //ref_name
1452     0, //left coord
1453     0, //read_len
1454     false, //antisense_aln
1455     0, //edit_dist
1456     end);
1457     return true;
1458     }
1459    
1460     //string text_name = ((samfile_t*)(hs._hit_file))->header->target_name[target_id];
1461     string text_name = _sam_header->target_name[target_id];
1462     for (int i = 0; i < hit_buf->core.n_cigar; ++i)
1463     {
1464     //char* t;
1465    
1466     int length = bam1_cigar(hit_buf)[i] >> BAM_CIGAR_SHIFT;
1467     if (length <= 0)
1468     {
1469     fprintf (stderr, "BAM error: CIGAR op has zero length\n");
1470     return false;
1471     }
1472    
1473     CigarOpCode opcode;
1474     switch(bam1_cigar(hit_buf)[i] & BAM_CIGAR_MASK)
1475     {
1476     case BAM_CMATCH: opcode = MATCH; break;
1477     case BAM_CINS: opcode = INS; break;
1478     case BAM_CDEL: opcode = DEL; break;
1479     case BAM_CSOFT_CLIP: opcode = SOFT_CLIP; break;
1480     case BAM_CHARD_CLIP: opcode = HARD_CLIP; break;
1481     case BAM_CPAD: opcode = PAD; break;
1482     case BAM_CREF_SKIP:
1483     opcode = REF_SKIP;
1484     spliced_alignment = true;
1485     if (length > (int)max_report_intron_length)
1486     {
1487     //fprintf(stderr, "Encounter REF_SKIP > max_gene_length, skipping\n");
1488     return false;
1489     }
1490     break;
1491     default:
1492     fprintf (stderr, "BAM read: invalid CIGAR operation\n");
1493     return false;
1494     }
1495     if (opcode != HARD_CLIP)
1496     cigar.push_back(CigarOp(opcode, length));
1497     }
1498    
1499     string mrnm;
1500     if (mate_target_id >= 0) {
1501     if (mate_target_id == target_id) {
1502     //mrnm = ((samfile_t*)(hs._hit_file))->header->target_name[mate_target_id];
1503     mrnm = _sam_header->target_name[mate_target_id];
1504     }
1505     else {
1506     //fprintf(stderr, "Trans-spliced mates are not currently supported, skipping\n");
1507     return false;
1508     }
1509     }
1510     else {
1511     text_mate_pos = 0;
1512     }
1513     //CuffStrand source_strand = CUFF_STRAND_UNKNOWN;
1514     bool antisense_splice=false;
1515     unsigned char num_mismatches = 0;
1516     unsigned char num_splice_anchor_mismatches = 0;
1517    
1518     uint8_t* ptr = bam_aux_get(hit_buf, "XS");
1519     if (ptr) {
1520     char src_strand_char = bam_aux2A(ptr);
1521     if (src_strand_char == '-')
1522     antisense_splice = true;
1523     // else if (src_strand_char == '+')
1524     // source_strand = CUFF_FWD;
1525     }
1526    
1527     ptr = bam_aux_get(hit_buf, "NM");
1528     if (ptr) {
1529     num_mismatches = bam_aux2i(ptr);
1530     }
1531    
1532     ptr = bam_aux_get(hit_buf, "NH");
1533     if (ptr) {
1534     num_hits = bam_aux2i(ptr);
1535     }
1536    
1537     //bool antisense_aln = bam1_strand(hit_buf);
1538    
1539     //if (_rg_props.strandedness() == STRANDED_PROTOCOL && source_strand == CUFF_STRAND_UNKNOWN)
1540     // source_strand = use_stranded_protocol(sam_flag, antisense_aln, _rg_props.mate_strand_mapping());
1541     if (spliced_alignment) {
1542     //if (source_strand == CUFF_STRAND_UNKNOWN) {
1543     // fprintf(stderr, "BAM record error: found spliced alignment without XS attribute\n");
1544     // }
1545     bh = create_hit(qname,
1546     text_name,
1547     text_offset, // BAM files are 0-indexed
1548     cigar,
1549     sam_flag & 0x0010,
1550     antisense_splice,
1551     num_mismatches,
1552     num_splice_anchor_mismatches,
1553     end);
1554    
1555     }
1556     else {
1557     //assert(_rg_props.strandedness() == STRANDED_PROTOCOL || source_strand == CUFF_STRAND_UNKNOWN);
1558     //assert(cigar.size() == 1 && cigar[0].opcode == MATCH);
1559     bh = create_hit(qname,
1560     text_name,
1561     text_offset, // BAM files are 0-indexed
1562     cigar,
1563     sam_flag & 0x0010,
1564     false,
1565     num_mismatches,
1566     0,
1567     end);
1568     }
1569     if (seq!=NULL) {
1570     char *bseq = (char*)bam1_seq(hit_buf);
1571     for(int i=0;i<(hit_buf->core.l_qseq);i++) {
1572     char v = bam1_seqi(bseq,i);
1573     seq[i]=bam_nt16_rev_table[v];
1574     }
1575     seq[hit_buf->core.l_qseq]=0;
1576     }
1577     if (qual!=NULL) {
1578     char *bq = (char*)bam1_qual(hit_buf);
1579     for(int i=0;i<(hit_buf->core.l_qseq);i++) {
1580     qual[i]=bq[i]+33;
1581     }
1582     qual[hit_buf->core.l_qseq]=0;
1583     }
1584     return true;
1585     }
1586    
1587     bool BAMHitFactory::inspect_header(HitStream& hs)
1588     {
1589     bam_header_t* header = ((samfile_t*)(hs._hit_file))->header;
1590    
1591     if (header == NULL) {
1592     fprintf(stderr, "Warning: No BAM header\n");
1593     return false;
1594     }
1595     if (header->l_text == 0) {
1596     fprintf(stderr, "Warning: BAM header has 0 length or is corrupted. Try using 'samtools reheader'.\n");
1597     return false;
1598     }
1599     return true;
1600     }
1601    
1602    
1603     void get_mapped_reads(FILE* bwtf,
1604     HitTable& hits,
1605     HitFactory& hit_factory,
1606     bool strip_slash,
1607     bool verbose)
1608     {
1609    
1610    
1611     char bwt_buf[2048];
1612     uint32_t reads_extracted = 0;
1613    
1614     while (fgets(bwt_buf, 2048, bwtf))
1615     {
1616     // Chomp the newline
1617     char* nl = strrchr(bwt_buf, '\n');
1618     if (nl) *nl = 0;
1619     if (*bwt_buf == 0)
1620     continue;
1621     // Get a new record from the tab-delimited Bowtie map
1622     BowtieHit bh;
1623     if (hit_factory.get_hit_from_buf(bwt_buf, bh, strip_slash))
1624     {
1625     // Only check uniqueness if these hits are spliced
1626     hits.add_hit(bh, true);
1627     }
1628     reads_extracted++;
1629     }
1630    
1631     // This will sort the map by insert id.
1632     hits.finalize();
1633     fprintf(stderr, "Extracted %d alignments from Bowtie map\n", reads_extracted);
1634     }
1635    
1636    
1637     /*
1638     AlignStatus status(const BowtieHit* align)
1639     {
1640     if (!align)
1641     return UNALIGNED;
1642     if (align->contiguous())
1643     return CONTIGUOUS;
1644     return SPLICED;
1645     }
1646     */
1647    
1648     void add_hits_to_coverage(const HitList& hits, vector<unsigned short>& DoC)
1649     {
1650     int max_hit_pos = -1;
1651     for (size_t i = 0; i < hits.size(); ++i)
1652     {
1653     max_hit_pos = max((int)hits[i].right(),max_hit_pos);
1654     }
1655    
1656     if ((int)DoC.size() < max_hit_pos)
1657     DoC.resize(max_hit_pos);
1658    
1659     for (size_t i = 0; i < hits.size(); ++i)
1660     {
1661     const BowtieHit& bh = hits[i];
1662    
1663     // split up the coverage contibution for this reads
1664     size_t j = bh.left();
1665     const vector<CigarOp>& cigar = bh.cigar();
1666    
1667     for (size_t c = 0 ; c < cigar.size(); ++c)
1668     {
1669     switch(cigar[c].opcode)
1670     {
1671     case MATCH:
1672     for (size_t m = 0; m < cigar[c].length; ++m)
1673     {
1674     if (DoC[j + m] < 0xFFFF)
1675     DoC[j + m]++;
1676     }
1677     //fall through this case to REF_SKIP is intentional
1678     case REF_SKIP:
1679     j += cigar[c].length;
1680     break;
1681     default:
1682     break;
1683     }
1684    
1685     }
1686     }
1687     }
1688    
1689     void add_hit_to_coverage(const BowtieHit& bh, vector<unsigned int>& DoC)
1690     {
1691     if ((int)DoC.size() < bh.right())
1692     DoC.resize(bh.right());
1693    
1694     // split up the coverage contibution for this reads
1695     size_t j = bh.left();
1696     const vector<CigarOp>& cigar = bh.cigar();
1697    
1698     for (size_t c = 0 ; c < cigar.size(); ++c)
1699     {
1700     switch(cigar[c].opcode)
1701     {
1702     case MATCH:
1703     for (size_t m = 0; m < cigar[c].length; ++m)
1704     {
1705 gpertea 135 if (DoC[j + m] < VMAXINT32)
1706 gpertea 29 DoC[j + m]++;
1707     }
1708     //fall through this case to REF_SKIP is intentional
1709     case REF_SKIP:
1710     j += cigar[c].length;
1711     break;
1712     default:
1713     break;
1714     }
1715    
1716     }
1717     }
1718    
1719     void print_hit(FILE* fout,
1720     const char* read_name,
1721     const BowtieHit& bh,
1722     const char* ref_name,
1723     const char* sequence,
1724     const char* qualities,
1725     bool from_bowtie)
1726     {
1727     string seq;
1728     string quals;
1729     if (sequence)
1730     {
1731     seq = sequence;
1732     quals = qualities;
1733     seq.resize(bh.read_len());
1734     quals.resize(bh.read_len());
1735     }
1736     else
1737     {
1738     seq = "*";
1739     }
1740    
1741     if (qualities)
1742     {
1743     quals = qualities;
1744     quals.resize(bh.read_len());
1745     }
1746     else
1747     {
1748     quals = "*";
1749     }
1750    
1751     uint32_t sam_flag = 0;
1752     if (bh.antisense_align())
1753     {
1754     sam_flag |= 0x0010; // BAM_FREVERSE
1755     if (sequence && !from_bowtie) // if it is from bowtie hit, it's already reversed.
1756     {
1757     reverse_complement(seq);
1758     reverse(quals.begin(), quals.end());
1759     }
1760     }
1761    
1762     uint32_t sam_pos = bh.left() + 1;
1763     uint32_t map_quality = 255;
1764     char cigar[256];
1765     cigar[0] = 0;
1766     string mate_ref_name = "*";
1767     uint32_t mate_pos = 0;
1768     uint32_t insert_size = 0;
1769     //string qualities = "*";
1770    
1771     const vector<CigarOp>& bh_cigar = bh.cigar();
1772    
1773     /*
1774     * In addition to calculating the cigar string,
1775     * we need to figure out how many in/dels are in the
1776     * sequence, so that we can give the correct
1777     * value for the NM tag
1778     */
1779     int indel_distance = 0;
1780     for (size_t c = 0; c < bh_cigar.size(); ++c)
1781     {
1782     char ibuf[64];
1783     sprintf(ibuf, "%d", bh_cigar[c].length);
1784     switch(bh_cigar[c].opcode)
1785     {
1786     case MATCH:
1787     strcat(cigar, ibuf);
1788     strcat(cigar, "M");
1789     break;
1790     case INS:
1791     strcat(cigar, ibuf);
1792     strcat(cigar, "I");
1793     indel_distance += bh_cigar[c].length;
1794     break;
1795     case DEL:
1796     strcat(cigar, ibuf);
1797     strcat(cigar, "D");
1798     indel_distance += bh_cigar[c].length;
1799     break;
1800     case REF_SKIP:
1801     strcat(cigar, ibuf);
1802     strcat(cigar, "N");
1803     break;
1804     default:
1805     break;
1806     }
1807     }
1808    
1809     //string q = string(bh.read_len(), '!');
1810     //string s = string(bh.read_len(), 'N');
1811    
1812     fprintf(fout,
1813     "%s\t%d\t%s\t%d\t%d\t%s\t%s\t%d\t%d\t%s\t%s",
1814     read_name,
1815     sam_flag,
1816     ref_name,
1817     sam_pos,
1818     map_quality,
1819     cigar,
1820     mate_ref_name.c_str(),
1821     mate_pos,
1822     insert_size,
1823     seq.c_str(),
1824     quals.c_str());
1825    
1826     if (!sam_readgroup_id.empty())
1827     fprintf(fout, "\tRG:Z:%s", sam_readgroup_id.c_str());
1828    
1829     fprintf(fout, "\tNM:i:%d", bh.edit_dist() + indel_distance);
1830    
1831     bool containsSplice = false;
1832     for(vector<CigarOp>::const_iterator itr = bh.cigar().begin(); itr != bh.cigar().end(); ++itr){
1833     if(itr->opcode == REF_SKIP){
1834     containsSplice = true;
1835     break;
1836     }
1837     }
1838    
1839     if (containsSplice)
1840     fprintf(fout, "\tXS:A:%c", bh.antisense_splice() ? '-' : '+');
1841    
1842     fprintf(fout, "\n");
1843     }
1844    
1845     void print_bamhit(GBamWriter& wbam,
1846     const char* read_name,
1847     const BowtieHit& bh,
1848     const char* ref_name,
1849     const char* sequence,
1850     const char* qualities,
1851     bool from_bowtie)
1852     {
1853     string seq;
1854     string quals;
1855     if (sequence) {
1856     seq = sequence;
1857     quals = qualities;
1858     seq.resize(bh.read_len());
1859     quals.resize(bh.read_len());
1860     }
1861     else {
1862     seq = "*";
1863     }
1864     if (qualities) {
1865     quals = qualities;
1866     quals.resize(bh.read_len());
1867     }
1868     else
1869     {
1870     quals = "*";
1871     }
1872    
1873     uint32_t sam_flag = 0;
1874     if (bh.antisense_align())
1875     {
1876     sam_flag |= 0x0010; // BAM_FREVERSE
1877     if (sequence && !from_bowtie) // if it is from bowtie hit, it's already reversed.
1878     {
1879     reverse_complement(seq);
1880     reverse(quals.begin(), quals.end());
1881     }
1882     }
1883    
1884     uint32_t sam_pos = bh.left() + 1;
1885     uint32_t map_quality = 255;
1886     char cigar[256];
1887     cigar[0] = 0;
1888     string mate_ref_name = "*";
1889     uint32_t mate_pos = 0;
1890     uint32_t insert_size = 0;
1891     //string qualities = "*";
1892    
1893     const vector<CigarOp>& bh_cigar = bh.cigar();
1894     /*
1895     * In addition to calculating the cigar string,
1896     * we need to figure out how many in/dels are in the
1897     * sequence, so that we can give the correct
1898     * value for the NM tag
1899     */
1900     int indel_distance = 0;
1901     for (size_t c = 0; c < bh_cigar.size(); ++c)
1902     {
1903     char ibuf[64];
1904     sprintf(ibuf, "%d", bh_cigar[c].length);
1905     switch(bh_cigar[c].opcode)
1906     {
1907     case MATCH:
1908     strcat(cigar, ibuf);
1909     strcat(cigar, "M");
1910     break;
1911     case INS:
1912     strcat(cigar, ibuf);
1913     strcat(cigar, "I");
1914     indel_distance += bh_cigar[c].length;
1915     break;
1916     case DEL:
1917     strcat(cigar, ibuf);
1918     strcat(cigar, "D");
1919     indel_distance += bh_cigar[c].length;
1920     break;
1921     case REF_SKIP:
1922     strcat(cigar, ibuf);
1923     strcat(cigar, "N");
1924     break;
1925     default:
1926     break;
1927     }
1928     }
1929    
1930     //string q = string(bh.read_len(), '!');
1931     //string s = string(bh.read_len(), 'N');
1932     /*
1933     fprintf(fout,
1934     "%s\t%d\t%s\t%d\t%d\t%s\t%s\t%d\t%d\t%s\t%s",
1935     read_name,
1936     sam_flag,
1937     ref_name,
1938     sam_pos,
1939     map_quality,
1940     cigar,
1941     mate_ref_name.c_str(),
1942     mate_pos,
1943     insert_size,
1944     seq.c_str(),
1945     quals.c_str());
1946    
1947     fprintf(fout, "\tNM:i:%d", bh.edit_dist() + indel_distance);
1948     */
1949     vector<string> auxdata;
1950    
1951     if (!sam_readgroup_id.empty())
1952     {
1953     string nm("RG:Z:");
1954     nm += sam_readgroup_id;
1955     auxdata.push_back(nm);
1956     }
1957    
1958     string nm("NM:i:");
1959     str_appendInt(nm, bh.edit_dist() + indel_distance);
1960     auxdata.push_back(nm);
1961     bool containsSplice = false;
1962     for(vector<CigarOp>::const_iterator itr = bh.cigar().begin(); itr != bh.cigar().end(); ++itr)
1963     if(itr->opcode == REF_SKIP){
1964     containsSplice = true;
1965     break;
1966     }
1967    
1968     if (containsSplice) {
1969     //fprintf(fout, "\tXS:A:%c", bh.antisense_splice() ? '-' : '+');
1970     nm="XS:A:";
1971     nm+=(char)(bh.antisense_splice() ? '-' : '+');
1972     auxdata.push_back(nm);
1973     }
1974    
1975     GBamRecord *brec = wbam.new_record(read_name, sam_flag, ref_name, sam_pos, map_quality,
1976     cigar, mate_ref_name.c_str(), mate_pos,
1977     insert_size, seq.c_str(), quals.c_str(), &auxdata);
1978    
1979    
1980    
1981     wbam.write(brec);
1982     delete brec;
1983     }
1984    
1985     /**
1986     * Print a vector of cigar operations to a file.
1987     * @param bh_cigar A vector of CigarOps
1988     * @return a string representation of the cigar string
1989     */
1990     std::string print_cigar(vector<CigarOp>& bh_cigar){
1991     char cigar[256];
1992     cigar[0] = 0;
1993     for (size_t c = 0; c < bh_cigar.size(); ++c)
1994     {
1995     char ibuf[64];
1996     sprintf(ibuf, "%d", bh_cigar[c].length);
1997     switch(bh_cigar[c].opcode)
1998     {
1999     case MATCH:
2000     strcat(cigar, ibuf);
2001     strcat(cigar, "M");
2002     break;
2003     case INS:
2004     strcat(cigar, ibuf);
2005     strcat(cigar, "I");
2006     break;
2007     case DEL:
2008     strcat(cigar, ibuf);
2009     strcat(cigar, "D");
2010     break;
2011     case REF_SKIP:
2012     strcat(cigar, ibuf);
2013     strcat(cigar, "N");
2014     break;
2015     default:
2016     break;
2017     }
2018     }
2019     string result(cigar);
2020     return result;
2021     }
2022    
2023     bool BowtieHit::check_editdist_consistency(const RefSequenceTable& rt)
2024     {
2025     RefSequenceTable::Sequence* ref_str = rt.get_seq(_ref_id);
2026     if (!ref_str)
2027     return false;
2028    
2029     const seqan::Dna5String ref_seq = seqan::infix(*ref_str, _left, right());
2030    
2031     size_t pos_seq = 0;
2032     size_t pos_ref = 0;
2033     size_t mismatch = 0;
2034     for (size_t i = 0; i < _cigar.size(); ++i)
2035     {
2036     CigarOp cigar = _cigar[i];
2037     switch(cigar.opcode)
2038     {
2039     case MATCH:
2040     {
2041     for (size_t j = 0; j < cigar.length; ++j)
2042     {
2043     if (_seq[pos_seq++] != ref_seq[pos_ref++])
2044     ++mismatch;
2045     }
2046     }
2047     break;
2048     case INS:
2049     {
2050     pos_seq += cigar.length;
2051     }
2052     break;
2053    
2054     case DEL:
2055     case REF_SKIP:
2056     {
2057     pos_ref += cigar.length;
2058     }
2059     break;
2060    
2061     default:
2062     break;
2063     }
2064     }
2065    
2066     return mismatch == _edit_dist;
2067     }