ViewVC Help
View File | Revision Log | Show Annotations | View Changeset | Root Listing
root/gclib/tophat_cpp/bwt_map.cpp
Revision: 29
Committed: Tue Aug 2 21:24:54 2011 UTC (8 years, 10 months ago) by gpertea
File size: 41661 byte(s)
Log Message:
adding tophat source work

Line User Rev File contents
1 gpertea 29 /*
2     * bwt_map.cpp
3     * TopHat
4     *
5     * Created by Cole Trapnell on 11/17/08.
6     * Copyright 2008 Cole Trapnell. All rights reserved.
7     *
8     */
9    
10     #ifdef HAVE_CONFIG_H
11     #include <config.h>
12     #endif
13    
14     #include <cassert>
15     #include <cstdlib>
16     #include <iostream>
17     #include <set>
18     #include <vector>
19     #include <cmath>
20    
21     #include "common.h"
22     #include "bwt_map.h"
23     #include "tokenize.h"
24     #include "reads.h"
25    
26     using namespace std;
27    
28     void HitTable::add_hit(const BowtieHit& bh, bool check_uniqueness)
29     {
30     uint32_t reference_id = bh.ref_id();
31    
32     pair<RefHits::iterator, bool> ret =
33     _hits_for_ref.insert(make_pair(reference_id, HitList()));
34     HitList& hl = ret.first->second;
35    
36     if (check_uniqueness)
37     {
38     // Check uniqueness, in case we are adding spliced hits from
39     // several spliced alignment sources (e.g. de novo hashing + Bowtie
40     // against a user-supplied index). We don't want to count the same
41     // alignment twice if it happened to be found by more than one method
42     HitList::const_iterator lb = lower_bound(hl.begin(), hl.end(), bh, hit_insert_id_lt);
43     HitList::const_iterator ub = upper_bound(hl.begin(), hl.end(), bh, hit_insert_id_lt);
44     for (; lb != ub && lb != hl.end(); ++lb)
45     {
46     if (*lb == bh)
47     {
48     //fprintf(stderr, "Chucking duplicate read %d by identity\n", bh.insert_id());
49     return;
50     }
51    
52     if (lb->insert_id() == bh.insert_id() &&
53     lb->ref_id() == bh.ref_id() &&
54     lb->antisense_align() == bh.antisense_align())
55     {
56     // If we get here, we may be looking at the same alignment
57     // However, spanning_reads may report a shorter, trimmed alignment
58     // so not all fields will be equal. If they just disagree on the
59     // ends, and don't indicate a different junction coord, the
60     // alignments are the same.
61    
62     if ((lb->left() <= bh.left() && lb->right() >= bh.right()) ||
63     (bh.left() <= lb->left() && bh.right() >= lb->right()))
64     {
65     vector<pair<int,int> > lb_gaps, bh_gaps;
66     lb->gaps(lb_gaps);
67     bh.gaps(bh_gaps);
68     if (lb_gaps == bh_gaps)
69     {
70     // One alignment is contained in the other, they agree on
71     // where the gaps, if any, are, and they share an id
72     // => this is a redundant aligment, so toss it
73     //fprintf(stderr, "Chucking duplicate read %d by gap agreement\n", bh.insert_id());
74     return;
75     }
76     }
77     }
78     }
79     }
80     _total_hits++;
81     hl.push_back(bh);
82     }
83    
84     bool hit_insert_id_lt(const BowtieHit& h1, const BowtieHit& h2)
85     {
86     return h1.insert_id() < h2.insert_id();
87     }
88    
89     void LineHitFactory::openStream(HitStream& hs)
90     {
91     if (hs._hit_file==NULL && !hs._hit_file_name.empty()) {
92     //open the file for HitStream here
93     hs._hit_file=fopen(hs._hit_file_name.c_str(),"r");
94     if (hs._hit_file==NULL)
95     err_die("Error opening HitStream file %s\n",hs._hit_file_name.c_str());
96     return;
97     }
98     if (hs._fzpipe!=NULL) {
99     hs._hit_file=hs._fzpipe->file;
100     }
101     }
102    
103     void LineHitFactory::rewind(HitStream& hs)
104     {
105     if (hs._fzpipe!=NULL) {
106     hs._fzpipe->rewind();
107     hs._hit_file=hs._fzpipe->file;
108     }
109     else if (hs._hit_file) ::rewind((FILE*)(hs._hit_file));
110     }
111    
112     bool LineHitFactory::next_record(HitStream& hs, const char*& buf, size_t& buf_size) {
113     FILE* f=(FILE *)(hs._hit_file);
114     bool new_rec = (fgets(_hit_buf, _hit_buf_max_sz - 1, f)!=NULL);
115     if (!new_rec || feof(f)) {
116     hs._eof=true;
117     return false;
118     }
119     ++_line_num;
120     char* nl = strrchr(_hit_buf, '\n');
121     if (nl) *nl = 0;
122     buf = _hit_buf;
123     buf_size = _hit_buf_max_sz - 1;
124     return true;
125     }
126    
127     void LineHitFactory::closeStream(HitStream& hs) {
128     if (hs._fzpipe!=NULL) {
129     hs._fzpipe->close();
130     return;
131     }
132     if (hs._hit_file!=NULL) {
133     fclose((FILE*)(hs._hit_file));
134     hs._hit_file=NULL;
135     }
136     }
137     void BAMHitFactory::openStream(HitStream& hs) {
138     if (hs._hit_file==NULL) {
139     if (hs._hit_file_name.empty())
140     //err_die("Error: invalid HitStream set for BAMHitFactory(file name missing)\n");
141     return; //invalid stream, could be just a place holder
142     //open the file here if not already open
143     string fext=getFext(hs._hit_file_name);
144     if (fext=="sam")
145     hs._hit_file = samopen(hs._hit_file_name.c_str(), "r", 0);
146     else
147     hs._hit_file = samopen(hs._hit_file_name.c_str(), "rb", 0);
148    
149     samfile_t* sam_file=(samfile_t*)(hs._hit_file);
150    
151     if (sam_file == NULL)
152     err_die("Error opening SAM file %s\n", hs._hit_file_name.c_str());
153     if (sam_file->header == NULL)
154     err_die("Error: no SAM header found for file %s\n", hs._hit_file_name.c_str());
155     memset(&_next_hit, 0, sizeof(_next_hit));
156     //_beginning = bgzf_tell(sam_file->x.bam);
157     _sam_header=sam_file->header;
158     if (inspect_header(hs) == false)
159     err_die("Error: invalid SAM header for file %s\n",
160     hs._hit_file_name.c_str());
161     }
162     }
163    
164     void BAMHitFactory::closeStream(HitStream& hs) {
165     if (hs._hit_file) {
166     samclose((samfile_t*)(hs._hit_file));
167     }
168     hs._hit_file=NULL;
169     _sam_header=NULL;
170     }
171    
172     void BAMHitFactory::rewind(HitStream& hs)
173     {
174     /*
175     if (_hit_file && ((samfile_t*)_hit_file)->x.bam)
176     {
177     bgzf_seek(((samfile_t*)_hit_file)->x.bam, _beginning, SEEK_SET);
178     _eof = false;
179     }
180     */
181     this->closeStream(hs);
182     this->openStream(hs);
183     }
184    
185     string BAMHitFactory::hitfile_rec(HitStream& hs, const char* hit_buf) {
186     const bam1_t* bamrec=(const bam1_t*)hit_buf;
187     char* tamline=bam_format1(((samfile_t*)(hs._hit_file))->header, bamrec);
188     string sam_line(tamline);
189     free(tamline);
190     return sam_line;
191     }
192    
193     bool BAMHitFactory::next_record(HitStream& hs, const char*& buf, size_t& buf_size) {
194     if (_next_hit.data) {
195     free(_next_hit.data);
196     _next_hit.data = NULL;
197     }
198     _sam_header=((samfile_t*)(hs._hit_file))->header; //needed by get_hit_from_buf later on
199     if (hs.eof() || !hs.ready()) return false;
200    
201     //mark_curr_pos();
202    
203     memset(&_next_hit, 0, sizeof(_next_hit));
204    
205     int bytes_read = samread((samfile_t*)(hs._hit_file), &_next_hit);
206     if (bytes_read <= 0) {
207     hs._eof = true;
208     return false;
209     }
210     buf = (const char*)&_next_hit;
211     buf_size = bytes_read;
212     return true;
213     }
214    
215    
216     BowtieHit HitFactory::create_hit(const string& insert_name,
217     const string& ref_name,
218     int left,
219     const vector<CigarOp>& cigar,
220     bool antisense_aln,
221     bool antisense_splice,
222     unsigned char edit_dist,
223     unsigned char splice_mms,
224     bool end)
225     {
226     uint64_t insert_id = _insert_table.get_id(insert_name);
227     uint32_t reference_id = _ref_table.get_id(ref_name, NULL, 0);
228    
229     return BowtieHit(reference_id,
230     insert_id,
231     left,
232     cigar,
233     antisense_aln,
234     antisense_splice,
235     edit_dist,
236     splice_mms,
237     end);
238     }
239    
240     BowtieHit HitFactory::create_hit(const string& insert_name,
241     const string& ref_name,
242     uint32_t left,
243     uint32_t read_len,
244     bool antisense_aln,
245     unsigned char edit_dist,
246     bool end)
247     {
248     uint64_t insert_id = _insert_table.get_id(insert_name);
249     uint32_t reference_id = _ref_table.get_id(ref_name, NULL, 0);
250    
251     return BowtieHit(reference_id,
252     insert_id,
253     left,
254     read_len,
255     antisense_aln,
256     edit_dist,
257     end);
258     }
259    
260     bool BowtieHitFactory::get_hit_from_buf(const char* orig_bwt_buf,
261     BowtieHit& bh,
262     bool strip_slash,
263     char* name_out,
264     char* name_tags,
265     char* seq,
266     char* qual)
267     {
268     if (!orig_bwt_buf || !*orig_bwt_buf)
269     return false;
270    
271     static const int buf_size = 2048;
272    
273     char bwt_buf[buf_size];
274     strcpy(bwt_buf, orig_bwt_buf);
275     //const char* bwt_fmt_str = "%s %c %s %d %s %s %d %s";
276    
277     char orientation;
278    
279     //int bwtf_ret = 0;
280     //uint32_t seqid = 0;
281    
282     char text_name[buf_size];
283     unsigned int text_offset;
284    
285    
286     //unsigned int other_occs;
287     char mismatches[buf_size];
288     //memset(mismatches, 0, sizeof(mismatches));
289    
290     const char* buf = bwt_buf;
291     char* name = get_token((char**)&buf,"\t");
292     char* orientation_str = get_token((char**)&buf,"\t");
293     char* text_name_str = get_token((char**)&buf,"\t");
294    
295     strcpy(text_name, text_name_str);
296    
297     char* text_offset_str = get_token((char**)&buf,"\t");
298     char* seq_str = get_token((char**)&buf,"\t");
299     if (seq)
300     strcpy(seq, seq_str);
301    
302     const char* qual_str = get_token((char**)&buf,"\t");
303     if (qual)
304     strcpy(qual, qual_str);
305    
306     /*const char* other_occs_str =*/ get_token((char**)&buf, "\t");
307     mismatches[0] = 0;
308     char* mismatches_str = get_token((char**)&buf, "\t");
309     if (mismatches_str)
310     strcpy(mismatches, mismatches_str);
311    
312     orientation = orientation_str[0];
313     text_offset = atoi(text_offset_str);
314    
315     bool end = true;
316     unsigned int seg_offset = 0;
317     unsigned int seg_num = 0;
318     unsigned int num_segs = 0;
319    
320     // Copy the tag out of the name field before we might wipe it out
321     char* pipe = strrchr(name, '|');
322     if (pipe)
323     {
324     if (name_tags)
325     strcpy(name_tags, pipe);
326    
327     char* tag_buf = pipe + 1;
328     if (strchr(tag_buf, ':'))
329     {
330     sscanf(tag_buf, "%u:%u:%u", &seg_offset, &seg_num, &num_segs);
331     if (seg_num + 1 == num_segs)
332     end = true;
333     else
334     end = false;
335     }
336    
337     *pipe = 0;
338     }
339     // Stripping the slash and number following it gives the insert name
340     char* slash = strrchr(name, '/');
341     if (strip_slash && slash)
342     *slash = 0;
343    
344     size_t read_len = strlen(seq_str);
345    
346     // Add this alignment to the table of hits for this half of the
347     // Bowtie map
348    
349     //vector<string> mismatch_toks;
350     char* pch = strtok (mismatches,",");
351     unsigned char num_mismatches = 0;
352     while (pch != NULL)
353     {
354     char* colon = strchr(pch, ':');
355     if (colon)
356     {
357     num_mismatches++;
358     }
359     //mismatch_toks.push_back(pch);
360     pch = strtok (NULL, ",");
361     }
362    
363     bh = create_hit(name,
364     text_name,
365     text_offset,
366     (int)read_len,
367     orientation == '-',
368     num_mismatches,
369     end);
370    
371     return true;
372     }
373    
374     int anchor_mismatch = 0;
375    
376     bool SplicedBowtieHitFactory::get_hit_from_buf(const char* orig_bwt_buf,
377     BowtieHit& bh,
378     bool strip_slash,
379     char* name_out,
380     char* name_tags,
381     char* seq,
382     char* qual)
383     {
384     if (!orig_bwt_buf || !*orig_bwt_buf)
385     return false;
386    
387     static const int buf_size = 2048;
388    
389     char bwt_buf[buf_size];
390     strcpy(bwt_buf, orig_bwt_buf);
391     //const char* bwt_fmt_str = "%s %c %s %d %s %s %d %s";
392    
393     char orientation;
394     char text_name[buf_size];
395     unsigned int text_offset;
396     char mismatches[buf_size];
397     //memset(mismatches, 0, sizeof(mismatches));
398    
399     char* buf = bwt_buf;
400     char* name = get_token((char**)&buf,"\t");
401     char* orientation_str = get_token((char**)&buf,"\t");
402     char* text_name_str = get_token((char**)&buf,"\t");
403     strcpy(text_name, text_name_str);
404    
405     char* text_offset_str = get_token((char**)&buf,"\t");
406     char* seq_str = get_token((char**)&buf,"\t");
407     if (seq)
408     strcpy(seq, seq_str);
409    
410     const char* qual_str = get_token((char**)&buf,"\t");
411     if (qual)
412     strcpy(qual, qual_str);
413    
414     /*const char* other_occs_str =*/ get_token((char**)&buf, "\t");
415     mismatches[0] = 0;
416     char* mismatches_str = get_token((char**)&buf, "\t");
417     if (mismatches_str)
418     strcpy(mismatches, mismatches_str);
419    
420     orientation = orientation_str[0];
421     text_offset = atoi(text_offset_str);
422    
423     bool end = true;
424     unsigned int seg_offset = 0;
425     unsigned int seg_num = 0;
426     unsigned int num_segs = 0;
427    
428     // Copy the tag out of the name field before we might wipe it out
429     char* pipe = strrchr(name, '|');
430     if (pipe)
431     {
432     if (name_tags)
433     strcpy(name_tags, pipe);
434    
435     char* tag_buf = pipe + 1;
436     if (strchr(tag_buf, ':'))
437     {
438     sscanf(tag_buf, "%u:%u:%u", &seg_offset, &seg_num, &num_segs);
439     if (seg_num + 1 == num_segs)
440     end = true;
441     else
442     end = false;
443     }
444    
445     *pipe = 0;
446     }
447     // Stripping the slash and number following it gives the insert name
448     char* slash = strrchr(name, '/');
449     if (strip_slash && slash)
450     *slash = 0;
451    
452     //int read_len = strlen(sequence);
453    
454     // Add this alignment to the table of hits for this half of the
455     // Bowtie map
456    
457     // Parse the text_name field to recover the splice coords
458     vector<string> toks;
459    
460     tokenize_strict(text_name, "|", toks);
461    
462     int num_extra_toks = (int)toks.size() - 6;
463    
464     if (num_extra_toks >= 0)
465     {
466     static const uint8_t left_window_edge_field = 1;
467     static const uint8_t splice_field = 2;
468     //static const uint8_t right_window_edge_field = 3;
469     static const uint8_t junction_type_field = 4;
470     static const uint8_t strand_field = 5;
471    
472     string contig = toks[0];
473     for (int t = 1; t <= num_extra_toks; ++t)
474     {
475     contig += "|";
476     contig += toks[t];
477     }
478    
479     vector<string> splice_toks;
480     tokenize(toks[num_extra_toks + splice_field], "-", splice_toks);
481     if (splice_toks.size() != 2)
482     {
483     fprintf(stderr, "Warning: found malformed splice record, skipping:\n");
484     //fprintf(stderr, "%s (token: %s)\n", text_name,
485     // toks[num_extra_toks + splice_field].c_str());
486     return false;
487     }
488    
489     //
490     // check for an insertion hit
491     //
492     if(toks[num_extra_toks + junction_type_field] == "ins")
493     {
494     int8_t spliced_read_len = strlen(seq_str);
495     /*
496     * The 0-based position of the left edge of the alignment. Note that this
497     * value may need to be futher corrected to account for the presence of
498     * of the insertion.
499     */
500     uint32_t left = atoi(toks[num_extra_toks + left_window_edge_field].c_str()) + text_offset;
501     uint32_t right = left + spliced_read_len - 1;
502    
503     /*
504     * The 0-based position of the last genomic sequence before the insertion
505     */
506     uint32_t left_splice_pos = atoi(splice_toks[0].c_str());
507    
508     string insertedSequence = splice_toks[1];
509     /*
510     * The 0-based position of the first genomic sequence after teh insertion
511     */
512     uint32_t right_splice_pos = left_splice_pos + 1;
513     if(left > left_splice_pos){
514     /*
515     * The genomic position of the left edge of the alignment needs to be corrected
516     * If the alignment does not extend into the insertion, simply subtract the length
517     * of the inserted sequence, otherwise, just set it equal to the right edge
518     */
519     left = left - insertedSequence.length();
520     if(left < right_splice_pos){
521     left = right_splice_pos;
522     }
523     }
524     if(right > left_splice_pos){
525     right = right - insertedSequence.length();
526     if(right < left_splice_pos){
527     right = left_splice_pos;
528     }
529     }
530     /*
531     * Now, right and left should be properly transformed into genomic coordinates
532     * We should be able to deduce how much the alignment matches the insertion
533     * simply based on the length of the read
534     */
535     int left_match_length = 0;
536     if(left <= left_splice_pos){
537     left_match_length = left_splice_pos - left + 1;
538     }
539     int right_match_length = 0;
540     if(right >= right_splice_pos){
541     right_match_length = right - right_splice_pos + 1;
542     }
543     int insertion_match_length = spliced_read_len - left_match_length - right_match_length;
544    
545     if(left_match_length <= 0 || right_match_length <= 0 || insertion_match_length <= 0)
546     return false;
547    
548     string junction_strand = toks[num_extra_toks + strand_field];
549     if(junction_strand != "rev" && junction_strand != "fwd"){
550     fprintf(stderr, "Malformed insertion record\n");
551     return false;
552     }
553    
554     char* pch = strtok( mismatches, ",");
555     unsigned char num_mismatches = 0;
556    
557     /*
558     * remember that text_offset holds the left end of the
559     * alignment, relative to the start of the contig
560     */
561    
562     /*
563     * The 0-based relative position of the left-most character
564     * before the insertion in the contig
565     */
566     int relative_splice_pos = left_splice_pos - atoi(toks[num_extra_toks + left_window_edge_field].c_str());
567     while (pch != NULL)
568     {
569     char* colon = strchr(pch, ':');
570     if (colon)
571     {
572     *colon = 0;
573     int mismatch_pos = atoi(pch);
574    
575     /*
576     * for reversely mapped reads,
577     * find the correct mismatched position.
578     */
579     if(orientation == '-'){
580     mismatch_pos = spliced_read_len - mismatch_pos - 1;
581     }
582    
583     /*
584     * Only count mismatches outside of the insertion region
585     * If there is a mismatch within the insertion,
586     * disallow this hit
587     */
588     if(mismatch_pos + text_offset <= relative_splice_pos || mismatch_pos + text_offset > relative_splice_pos + insertedSequence.length()){
589     num_mismatches++;
590     }else{
591     return false;
592     }
593     }
594     pch = strtok (NULL, ",");
595     }
596    
597    
598     vector<CigarOp> cigar;
599     cigar.push_back(CigarOp(MATCH, left_match_length));
600     cigar.push_back(CigarOp(INS, insertion_match_length));
601     cigar.push_back(CigarOp(MATCH, right_match_length));
602    
603     /*
604     * For now, disallow hits that don't span
605     * the insertion. If we allow these types of hits,
606     * then long_spanning.cpp needs to be updated
607     * in order to intelligently merge these kinds
608     * of reads back together
609     *
610     * Following code has been changed to allow segment that end
611     * in an insertion
612     */
613     bh = create_hit(name,
614     contig,
615     left,
616     cigar,
617     orientation == '-',
618     junction_strand == "rev",
619     num_mismatches,
620     0,
621     end);
622     return true;
623     }
624    
625     else
626     {
627     uint32_t left = atoi(toks[num_extra_toks + left_window_edge_field].c_str()) + text_offset;
628     int spliced_read_len = strlen(seq_str);
629     int8_t left_splice_pos = atoi(splice_toks[0].c_str()) - left + 1;
630     if(left_splice_pos > spliced_read_len) left_splice_pos = spliced_read_len;
631     int8_t right_splice_pos = spliced_read_len - left_splice_pos;
632    
633     if (right_splice_pos <= 0 || left_splice_pos <= 0)
634     return false;
635    
636     if (orientation == '+')
637     {
638     if (left_splice_pos + seg_offset < _anchor_length){
639     return false;
640     }
641     }
642     else
643     {
644     if (right_splice_pos + seg_offset < _anchor_length)
645     return false;
646     }
647     //uint32_t right = atoi(splice_toks[1].c_str()) + right_splice_pos;
648     //atoi(toks[right_window_edge_field].c_str());
649     int gap_len = atoi(splice_toks[1].c_str()) - atoi(splice_toks[0].c_str()) - 1;
650    
651     string junction_strand = toks[num_extra_toks + strand_field];
652     if (!(junction_strand == "rev" || junction_strand == "fwd")||
653     !(orientation == '-' || orientation == '+'))
654     {
655     fprintf(stderr, "Warning: found malformed splice record, skipping\n");
656     //fprintf(stderr, "junction_strand=%s, orientation='%c'\n",
657     // junction_strand.c_str(), orientation);
658     return false;
659     }
660    
661     //vector<string> mismatch_toks;
662     char* pch = strtok (mismatches,",");
663     int mismatches_in_anchor = 0;
664     unsigned char num_mismatches = 0;
665     while (pch != NULL)
666     {
667     char* colon = strchr(pch, ':');
668     if (colon)
669     {
670     *colon = 0;
671     num_mismatches++;
672     int mismatch_pos = atoi(pch);
673     if ((orientation == '+' && abs(mismatch_pos - left_splice_pos) < (int)min_anchor_len) ||
674     (orientation == '-' && abs(((int)spliced_read_len - left_splice_pos + 1) - mismatch_pos)) < (int)min_anchor_len)
675     mismatches_in_anchor++;
676     }
677     //mismatch_toks.push_back(pch);
678     pch = strtok (NULL, ",");
679     }
680    
681     // FIXME: we probably should exclude these hits somewhere, but this
682     // isn't the right place
683     vector<CigarOp> cigar;
684     cigar.push_back(CigarOp(MATCH, left_splice_pos));
685     if(toks[num_extra_toks + junction_type_field] == "del"){
686     cigar.push_back(CigarOp(DEL, gap_len));
687     }else{
688     cigar.push_back(CigarOp(REF_SKIP, gap_len));
689     }
690     cigar.push_back(CigarOp(MATCH, right_splice_pos));
691    
692     bh = create_hit(name,
693     contig,
694     left,
695     cigar,
696     orientation == '-',
697     junction_strand == "rev",
698     num_mismatches,
699     mismatches_in_anchor,
700     end);
701     return true;
702     }
703     }
704     else
705     {
706     fprintf(stderr, "Warning: found malformed splice record, skipping\n");
707     //fprintf(stderr, "%s\n", orig_bwt_buf);
708     // continue;
709     return false;
710     }
711    
712     return false;
713     }
714    
715     bool SAMHitFactory::get_hit_from_buf(const char* orig_bwt_buf,
716     BowtieHit& bh,
717     bool strip_slash,
718     char* name_out,
719     char* name_tags,
720     char* seq,
721     char* qual)
722     {
723     if (!orig_bwt_buf || !*orig_bwt_buf)
724     return false;
725     char bwt_buf[2048];
726     strcpy(bwt_buf, orig_bwt_buf);
727     // Are we still in the header region?
728     if (bwt_buf[0] == '@')
729     return false;
730    
731     char* buf = bwt_buf;
732     char* name = get_token((char**)&buf,"\t");
733     char* sam_flag_str = get_token((char**)&buf,"\t");
734     char* text_name = get_token((char**)&buf,"\t");
735     char* text_offset_str = get_token((char**)&buf,"\t");
736     const char* map_qual_str = get_token((char**)&buf,"\t");
737     char* cigar_str = get_token((char**)&buf,"\t");
738     const char* mate_ref_str = get_token((char**)&buf,"\t");
739     const char* mate_pos_str = get_token((char**)&buf,"\t");
740     const char* inferred_insert_sz_str = get_token((char**)&buf,"\t");
741    
742     const char* seq_str = get_token((char**)&buf,"\t");
743     if (seq)
744     strcpy(seq, seq_str);
745    
746     const char* qual_str = get_token((char**)&buf,"\t");
747     if (qual)
748     strcpy(qual, qual_str);
749    
750     if (!name ||
751     !sam_flag_str ||
752     !text_name ||
753     !text_offset_str ||
754     !map_qual_str ||
755     !cigar_str ||
756     !mate_ref_str ||
757     !mate_pos_str ||
758     !inferred_insert_sz_str ||
759     !seq_str ||
760     !qual_str)
761     {
762     // truncated or malformed SAM record
763     return false;
764     }
765    
766    
767     int sam_flag = atoi(sam_flag_str);
768     int text_offset = atoi(text_offset_str);
769    
770     bool end = true;
771     unsigned int seg_offset = 0;
772     unsigned int seg_num = 0;
773     unsigned int num_segs = 0;
774    
775     // Copy the tag out of the name field before we might wipe it out
776     char* pipe = strrchr(name, '|');
777     if (pipe)
778     {
779     if (name_tags)
780     strcpy(name_tags, pipe);
781    
782     char* tag_buf = pipe + 1;
783     if (strchr(tag_buf, ':'))
784     {
785     sscanf(tag_buf, "%u:%u:%u", &seg_offset, &seg_num, &num_segs);
786     if (seg_num + 1 == num_segs)
787     end = true;
788     else
789     end = false;
790     }
791    
792     *pipe = 0;
793     }
794    
795     // Stripping the slash and number following it gives the insert name
796     char* slash = strrchr(name, '/');
797     if (strip_slash && slash)
798     *slash = 0;
799    
800     char* p_cig = cigar_str;
801     //int len = strlen(sequence);
802     vector<CigarOp> cigar;
803     bool spliced_alignment = false;
804     // Mostly pilfered direct from the SAM tools:
805     while (*p_cig)
806     {
807     char* t;
808     int length = (int)strtol(p_cig, &t, 10);
809     if (length <= 0)
810     {
811     //fprintf (stderr, "CIGAR op has zero length\n");
812     return false;
813     }
814     char op_char = toupper(*t);
815     CigarOpCode opcode;
816     if (op_char == 'M') opcode = MATCH;
817     else if (op_char == 'I') opcode = INS;
818     else if (op_char == 'D') opcode = DEL;
819     else if (op_char == 'N')
820     {
821     if (length > max_report_intron_length)
822     return false;
823     opcode = REF_SKIP;
824     spliced_alignment = true;
825     }
826     else if (op_char == 'S') opcode = SOFT_CLIP;
827     else if (op_char == 'H') opcode = HARD_CLIP;
828     else if (op_char == 'P') opcode = PAD;
829     else
830     {
831     fprintf (stderr, "invalid CIGAR operation\n");
832     return false;
833     }
834     p_cig = t + 1;
835     //i += length;
836     cigar.push_back(CigarOp(opcode, length));
837     }
838     if (*p_cig)
839     {
840     fprintf (stderr, "unmatched CIGAR operation\n");
841     return false;
842     }
843    
844     //vector<string> attributes;
845     //tokenize(tag_buf, " \t",attributes);
846    
847     bool antisense_splice = false;
848     unsigned char num_mismatches = 0;
849     unsigned char num_splice_anchor_mismatches = 0;
850     const char* tag_buf = buf;
851    
852     // while((tag_buf = get_token((char**)&buf,"\t")))
853     // {
854     // vector<string> tuple_fields;
855     // tokenize(tag_buf,":", tuple_fields);
856     // if (tuple_fields.size() == 3)
857     // {
858     // if (tuple_fields[0] == "XS")
859     // {
860     // if (tuple_fields[2] == "-")
861     // antisense_splice = true;
862     // }
863     // else if (tuple_fields[0] == "NM")
864     // {
865     // num_mismatches = atoi(tuple_fields[2].c_str());
866     // }
867     // else if (tuple_fields[0] == "NS")
868     // {
869     // num_splice_anchor_mismatches = atoi(tuple_fields[2].c_str());
870     // }
871     // else
872     // {
873     // fprintf(stderr, "%s attribute not supported\n", tuple_fields[0].c_str());
874     // return false;
875     // }
876     // }
877     // }
878    
879     while((tag_buf = get_token((char**)&buf,"\t")))
880     {
881     vector<string> tuple_fields;
882     tokenize(tag_buf,":", tuple_fields);
883     if (tuple_fields.size() == 3)
884     {
885     if (tuple_fields[0] == "XS")
886     {
887     if (tuple_fields[2] == "-")
888     antisense_splice = true;
889     }
890     else if (tuple_fields[0] == "NM")
891     {
892     num_mismatches = atoi(tuple_fields[2].c_str());
893     }
894     else if (tuple_fields[0] == "NS")
895     {
896     //ignored for now
897     }
898     else
899     {
900     //fprintf(stderr, "%s attribute not supported\n", tuple_fields[0].c_str());
901     //return false;
902     }
903     }
904     }
905    
906    
907     /*
908     * By convention,the NM field of the SAM record
909     * counts an insertion or deletion. I dont' think
910     * we want the mismatch count in the BowtieHit
911     * record to reflect this. Therefore, subtract out
912     * the mismatches due to in/dels
913     */
914     for(vector<CigarOp>::const_iterator itr = cigar.begin(); itr != cigar.end(); ++itr){
915     if(itr->opcode == INS){
916     num_mismatches -= itr->length;
917     }
918     if(itr->opcode == DEL){
919     num_mismatches -= itr->length;
920     }
921     }
922     // vector<string> toks;
923     // tokenize(tag_buf, ":", toks);
924     if (spliced_alignment)
925     {
926     bh = create_hit(name,
927     text_name,
928     text_offset - 1,
929     cigar,
930     sam_flag & 0x0010,
931     antisense_splice,
932     num_mismatches,
933     num_splice_anchor_mismatches,
934     end);
935     return true;
936    
937     }
938     else
939     {
940     //assert(cigar.size() == 1 && cigar[0].opcode == MATCH);
941    
942     bh = create_hit(name,
943     text_name,
944     text_offset - 1, // SAM files are 1-indexed
945     cigar,
946     sam_flag & 0x0010,
947     false,
948     num_mismatches,
949     0,
950     end);
951     return true;
952     }
953    
954     return false;
955     }
956    
957    
958    
959     bool BAMHitFactory::get_hit_from_buf(const char* orig_bwt_buf,
960     BowtieHit& bh, bool strip_slash,
961     char* name_out, char* name_tags,
962     char* seq, char* qual) {
963     if (_sam_header==NULL)
964     err_die("Error: no SAM header when BAMHitFactory::get_hit_from_buf()!");
965     const bam1_t* hit_buf = (const bam1_t*)orig_bwt_buf;
966    
967    
968     uint32_t sam_flag = hit_buf->core.flag;
969    
970     int text_offset = hit_buf->core.pos;
971     int text_mate_pos = hit_buf->core.mpos;
972     int target_id = hit_buf->core.tid;
973     int mate_target_id = hit_buf->core.mtid;
974    
975     vector<CigarOp> cigar;
976     bool spliced_alignment = false;
977     int num_hits = 1;
978    
979     double mapQ = hit_buf->core.qual;
980     long double error_prob;
981     if (mapQ > 0)
982     {
983     long double p = (-1.0 * mapQ) / 10.0;
984     error_prob = pow(10.0L, p);
985     }
986     else
987     {
988     error_prob = 1.0;
989     }
990    
991     //header->target_name[c->tid]
992    
993     bool end = true;
994     unsigned int seg_offset = 0;
995     unsigned int seg_num = 0;
996     unsigned int num_segs = 0;
997     // Copy the tag out of the name field before we might wipe it out
998     char* qname = bam1_qname(hit_buf);
999     char* pipe = strrchr(qname, '|');
1000     if (pipe)
1001     {
1002     if (name_tags)
1003     strcpy(name_tags, pipe);
1004    
1005     char* tag_buf = pipe + 1;
1006     if (strchr(tag_buf, ':'))
1007     {
1008     sscanf(tag_buf, "%u:%u:%u", &seg_offset, &seg_num, &num_segs);
1009     if (seg_num + 1 == num_segs)
1010     end = true;
1011     else
1012     end = false;
1013     }
1014    
1015     *pipe = 0;
1016     }
1017    
1018    
1019     if (target_id < 0) {
1020     //assert(cigar.size() == 1 && cigar[0].opcode == MATCH);
1021     bh = create_hit(qname,
1022     "*", //ref_name
1023     0, //left coord
1024     0, //read_len
1025     false, //antisense_aln
1026     0, //edit_dist
1027     end);
1028     return true;
1029     }
1030    
1031     //string text_name = ((samfile_t*)(hs._hit_file))->header->target_name[target_id];
1032     string text_name = _sam_header->target_name[target_id];
1033     for (int i = 0; i < hit_buf->core.n_cigar; ++i)
1034     {
1035     //char* t;
1036    
1037     int length = bam1_cigar(hit_buf)[i] >> BAM_CIGAR_SHIFT;
1038     if (length <= 0)
1039     {
1040     fprintf (stderr, "BAM error: CIGAR op has zero length\n");
1041     return false;
1042     }
1043    
1044     CigarOpCode opcode;
1045     switch(bam1_cigar(hit_buf)[i] & BAM_CIGAR_MASK)
1046     {
1047     case BAM_CMATCH: opcode = MATCH; break;
1048     case BAM_CINS: opcode = INS; break;
1049     case BAM_CDEL: opcode = DEL; break;
1050     case BAM_CSOFT_CLIP: opcode = SOFT_CLIP; break;
1051     case BAM_CHARD_CLIP: opcode = HARD_CLIP; break;
1052     case BAM_CPAD: opcode = PAD; break;
1053     case BAM_CREF_SKIP:
1054     opcode = REF_SKIP;
1055     spliced_alignment = true;
1056     if (length > (int)max_report_intron_length)
1057     {
1058     //fprintf(stderr, "Encounter REF_SKIP > max_gene_length, skipping\n");
1059     return false;
1060     }
1061     break;
1062     default:
1063     fprintf (stderr, "BAM read: invalid CIGAR operation\n");
1064     return false;
1065     }
1066     if (opcode != HARD_CLIP)
1067     cigar.push_back(CigarOp(opcode, length));
1068     }
1069    
1070     string mrnm;
1071     if (mate_target_id >= 0) {
1072     if (mate_target_id == target_id) {
1073     //mrnm = ((samfile_t*)(hs._hit_file))->header->target_name[mate_target_id];
1074     mrnm = _sam_header->target_name[mate_target_id];
1075     }
1076     else {
1077     //fprintf(stderr, "Trans-spliced mates are not currently supported, skipping\n");
1078     return false;
1079     }
1080     }
1081     else {
1082     text_mate_pos = 0;
1083     }
1084     //CuffStrand source_strand = CUFF_STRAND_UNKNOWN;
1085     bool antisense_splice=false;
1086     unsigned char num_mismatches = 0;
1087     unsigned char num_splice_anchor_mismatches = 0;
1088    
1089     uint8_t* ptr = bam_aux_get(hit_buf, "XS");
1090     if (ptr) {
1091     char src_strand_char = bam_aux2A(ptr);
1092     if (src_strand_char == '-')
1093     antisense_splice = true;
1094     // else if (src_strand_char == '+')
1095     // source_strand = CUFF_FWD;
1096     }
1097    
1098     ptr = bam_aux_get(hit_buf, "NM");
1099     if (ptr) {
1100     num_mismatches = bam_aux2i(ptr);
1101     }
1102    
1103     ptr = bam_aux_get(hit_buf, "NH");
1104     if (ptr) {
1105     num_hits = bam_aux2i(ptr);
1106     }
1107    
1108     //bool antisense_aln = bam1_strand(hit_buf);
1109    
1110     //if (_rg_props.strandedness() == STRANDED_PROTOCOL && source_strand == CUFF_STRAND_UNKNOWN)
1111     // source_strand = use_stranded_protocol(sam_flag, antisense_aln, _rg_props.mate_strand_mapping());
1112     if (spliced_alignment) {
1113     //if (source_strand == CUFF_STRAND_UNKNOWN) {
1114     // fprintf(stderr, "BAM record error: found spliced alignment without XS attribute\n");
1115     // }
1116     bh = create_hit(qname,
1117     text_name,
1118     text_offset, // BAM files are 0-indexed
1119     cigar,
1120     sam_flag & 0x0010,
1121     antisense_splice,
1122     num_mismatches,
1123     num_splice_anchor_mismatches,
1124     end);
1125    
1126     }
1127     else {
1128     //assert(_rg_props.strandedness() == STRANDED_PROTOCOL || source_strand == CUFF_STRAND_UNKNOWN);
1129     //assert(cigar.size() == 1 && cigar[0].opcode == MATCH);
1130     bh = create_hit(qname,
1131     text_name,
1132     text_offset, // BAM files are 0-indexed
1133     cigar,
1134     sam_flag & 0x0010,
1135     false,
1136     num_mismatches,
1137     0,
1138     end);
1139     }
1140     if (seq!=NULL) {
1141     char *bseq = (char*)bam1_seq(hit_buf);
1142     for(int i=0;i<(hit_buf->core.l_qseq);i++) {
1143     char v = bam1_seqi(bseq,i);
1144     seq[i]=bam_nt16_rev_table[v];
1145     }
1146     seq[hit_buf->core.l_qseq]=0;
1147     }
1148     if (qual!=NULL) {
1149     char *bq = (char*)bam1_qual(hit_buf);
1150     for(int i=0;i<(hit_buf->core.l_qseq);i++) {
1151     qual[i]=bq[i]+33;
1152     }
1153     qual[hit_buf->core.l_qseq]=0;
1154     }
1155     return true;
1156     }
1157    
1158     bool BAMHitFactory::inspect_header(HitStream& hs)
1159     {
1160     bam_header_t* header = ((samfile_t*)(hs._hit_file))->header;
1161    
1162     if (header == NULL) {
1163     fprintf(stderr, "Warning: No BAM header\n");
1164     return false;
1165     }
1166     if (header->l_text == 0) {
1167     fprintf(stderr, "Warning: BAM header has 0 length or is corrupted. Try using 'samtools reheader'.\n");
1168     return false;
1169     }
1170     return true;
1171     }
1172    
1173    
1174     void get_mapped_reads(FILE* bwtf,
1175     HitTable& hits,
1176     HitFactory& hit_factory,
1177     bool strip_slash,
1178     bool verbose)
1179     {
1180    
1181    
1182     char bwt_buf[2048];
1183     uint32_t reads_extracted = 0;
1184    
1185     while (fgets(bwt_buf, 2048, bwtf))
1186     {
1187     // Chomp the newline
1188     char* nl = strrchr(bwt_buf, '\n');
1189     if (nl) *nl = 0;
1190     if (*bwt_buf == 0)
1191     continue;
1192     // Get a new record from the tab-delimited Bowtie map
1193     BowtieHit bh;
1194     if (hit_factory.get_hit_from_buf(bwt_buf, bh, strip_slash))
1195     {
1196     // Only check uniqueness if these hits are spliced
1197     hits.add_hit(bh, true);
1198     }
1199     reads_extracted++;
1200     }
1201    
1202     // This will sort the map by insert id.
1203     hits.finalize();
1204     fprintf(stderr, "Extracted %d alignments from Bowtie map\n", reads_extracted);
1205     }
1206    
1207    
1208     /*
1209     AlignStatus status(const BowtieHit* align)
1210     {
1211     if (!align)
1212     return UNALIGNED;
1213     if (align->contiguous())
1214     return CONTIGUOUS;
1215     return SPLICED;
1216     }
1217     */
1218    
1219     void add_hits_to_coverage(const HitList& hits, vector<unsigned short>& DoC)
1220     {
1221     int max_hit_pos = -1;
1222     for (size_t i = 0; i < hits.size(); ++i)
1223     {
1224     max_hit_pos = max((int)hits[i].right(),max_hit_pos);
1225     }
1226    
1227     if ((int)DoC.size() < max_hit_pos)
1228     DoC.resize(max_hit_pos);
1229    
1230     for (size_t i = 0; i < hits.size(); ++i)
1231     {
1232     const BowtieHit& bh = hits[i];
1233    
1234     // split up the coverage contibution for this reads
1235     size_t j = bh.left();
1236     const vector<CigarOp>& cigar = bh.cigar();
1237    
1238     for (size_t c = 0 ; c < cigar.size(); ++c)
1239     {
1240     switch(cigar[c].opcode)
1241     {
1242     case MATCH:
1243     for (size_t m = 0; m < cigar[c].length; ++m)
1244     {
1245     if (DoC[j + m] < 0xFFFF)
1246     DoC[j + m]++;
1247     }
1248     //fall through this case to REF_SKIP is intentional
1249     case REF_SKIP:
1250     j += cigar[c].length;
1251     break;
1252     default:
1253     break;
1254     }
1255    
1256     }
1257     }
1258     }
1259    
1260     void add_hit_to_coverage(const BowtieHit& bh, vector<unsigned int>& DoC)
1261     {
1262     if ((int)DoC.size() < bh.right())
1263     DoC.resize(bh.right());
1264    
1265     // split up the coverage contibution for this reads
1266     size_t j = bh.left();
1267     const vector<CigarOp>& cigar = bh.cigar();
1268    
1269     for (size_t c = 0 ; c < cigar.size(); ++c)
1270     {
1271     switch(cigar[c].opcode)
1272     {
1273     case MATCH:
1274     for (size_t m = 0; m < cigar[c].length; ++m)
1275     {
1276     if (DoC[j + m] < 0xFFFFFFFF)
1277     DoC[j + m]++;
1278     }
1279     //fall through this case to REF_SKIP is intentional
1280     case REF_SKIP:
1281     j += cigar[c].length;
1282     break;
1283     default:
1284     break;
1285     }
1286    
1287     }
1288     }
1289    
1290     void print_hit(FILE* fout,
1291     const char* read_name,
1292     const BowtieHit& bh,
1293     const char* ref_name,
1294     const char* sequence,
1295     const char* qualities,
1296     bool from_bowtie)
1297     {
1298     string seq;
1299     string quals;
1300     if (sequence)
1301     {
1302     seq = sequence;
1303     quals = qualities;
1304     seq.resize(bh.read_len());
1305     quals.resize(bh.read_len());
1306     }
1307     else
1308     {
1309     seq = "*";
1310     }
1311    
1312     if (qualities)
1313     {
1314     quals = qualities;
1315     quals.resize(bh.read_len());
1316     }
1317     else
1318     {
1319     quals = "*";
1320     }
1321    
1322     uint32_t sam_flag = 0;
1323     if (bh.antisense_align())
1324     {
1325     sam_flag |= 0x0010; // BAM_FREVERSE
1326     if (sequence && !from_bowtie) // if it is from bowtie hit, it's already reversed.
1327     {
1328     reverse_complement(seq);
1329     reverse(quals.begin(), quals.end());
1330     }
1331     }
1332    
1333     uint32_t sam_pos = bh.left() + 1;
1334     uint32_t map_quality = 255;
1335     char cigar[256];
1336     cigar[0] = 0;
1337     string mate_ref_name = "*";
1338     uint32_t mate_pos = 0;
1339     uint32_t insert_size = 0;
1340     //string qualities = "*";
1341    
1342     const vector<CigarOp>& bh_cigar = bh.cigar();
1343    
1344     /*
1345     * In addition to calculating the cigar string,
1346     * we need to figure out how many in/dels are in the
1347     * sequence, so that we can give the correct
1348     * value for the NM tag
1349     */
1350     int indel_distance = 0;
1351     for (size_t c = 0; c < bh_cigar.size(); ++c)
1352     {
1353     char ibuf[64];
1354     sprintf(ibuf, "%d", bh_cigar[c].length);
1355     switch(bh_cigar[c].opcode)
1356     {
1357     case MATCH:
1358     strcat(cigar, ibuf);
1359     strcat(cigar, "M");
1360     break;
1361     case INS:
1362     strcat(cigar, ibuf);
1363     strcat(cigar, "I");
1364     indel_distance += bh_cigar[c].length;
1365     break;
1366     case DEL:
1367     strcat(cigar, ibuf);
1368     strcat(cigar, "D");
1369     indel_distance += bh_cigar[c].length;
1370     break;
1371     case REF_SKIP:
1372     strcat(cigar, ibuf);
1373     strcat(cigar, "N");
1374     break;
1375     default:
1376     break;
1377     }
1378     }
1379    
1380     //string q = string(bh.read_len(), '!');
1381     //string s = string(bh.read_len(), 'N');
1382    
1383     fprintf(fout,
1384     "%s\t%d\t%s\t%d\t%d\t%s\t%s\t%d\t%d\t%s\t%s",
1385     read_name,
1386     sam_flag,
1387     ref_name,
1388     sam_pos,
1389     map_quality,
1390     cigar,
1391     mate_ref_name.c_str(),
1392     mate_pos,
1393     insert_size,
1394     seq.c_str(),
1395     quals.c_str());
1396    
1397     if (!sam_readgroup_id.empty())
1398     fprintf(fout, "\tRG:Z:%s", sam_readgroup_id.c_str());
1399    
1400     fprintf(fout, "\tNM:i:%d", bh.edit_dist() + indel_distance);
1401    
1402     bool containsSplice = false;
1403     for(vector<CigarOp>::const_iterator itr = bh.cigar().begin(); itr != bh.cigar().end(); ++itr){
1404     if(itr->opcode == REF_SKIP){
1405     containsSplice = true;
1406     break;
1407     }
1408     }
1409    
1410     if (containsSplice)
1411     fprintf(fout, "\tXS:A:%c", bh.antisense_splice() ? '-' : '+');
1412    
1413     fprintf(fout, "\n");
1414     }
1415    
1416     void print_bamhit(GBamWriter& wbam,
1417     const char* read_name,
1418     const BowtieHit& bh,
1419     const char* ref_name,
1420     const char* sequence,
1421     const char* qualities,
1422     bool from_bowtie)
1423     {
1424     string seq;
1425     string quals;
1426     if (sequence) {
1427     seq = sequence;
1428     quals = qualities;
1429     seq.resize(bh.read_len());
1430     quals.resize(bh.read_len());
1431     }
1432     else {
1433     seq = "*";
1434     }
1435     if (qualities) {
1436     quals = qualities;
1437     quals.resize(bh.read_len());
1438     }
1439     else
1440     {
1441     quals = "*";
1442     }
1443    
1444     uint32_t sam_flag = 0;
1445     if (bh.antisense_align())
1446     {
1447     sam_flag |= 0x0010; // BAM_FREVERSE
1448     if (sequence && !from_bowtie) // if it is from bowtie hit, it's already reversed.
1449     {
1450     reverse_complement(seq);
1451     reverse(quals.begin(), quals.end());
1452     }
1453     }
1454    
1455     uint32_t sam_pos = bh.left() + 1;
1456     uint32_t map_quality = 255;
1457     char cigar[256];
1458     cigar[0] = 0;
1459     string mate_ref_name = "*";
1460     uint32_t mate_pos = 0;
1461     uint32_t insert_size = 0;
1462     //string qualities = "*";
1463    
1464     const vector<CigarOp>& bh_cigar = bh.cigar();
1465     /*
1466     * In addition to calculating the cigar string,
1467     * we need to figure out how many in/dels are in the
1468     * sequence, so that we can give the correct
1469     * value for the NM tag
1470     */
1471     int indel_distance = 0;
1472     for (size_t c = 0; c < bh_cigar.size(); ++c)
1473     {
1474     char ibuf[64];
1475     sprintf(ibuf, "%d", bh_cigar[c].length);
1476     switch(bh_cigar[c].opcode)
1477     {
1478     case MATCH:
1479     strcat(cigar, ibuf);
1480     strcat(cigar, "M");
1481     break;
1482     case INS:
1483     strcat(cigar, ibuf);
1484     strcat(cigar, "I");
1485     indel_distance += bh_cigar[c].length;
1486     break;
1487     case DEL:
1488     strcat(cigar, ibuf);
1489     strcat(cigar, "D");
1490     indel_distance += bh_cigar[c].length;
1491     break;
1492     case REF_SKIP:
1493     strcat(cigar, ibuf);
1494     strcat(cigar, "N");
1495     break;
1496     default:
1497     break;
1498     }
1499     }
1500    
1501     //string q = string(bh.read_len(), '!');
1502     //string s = string(bh.read_len(), 'N');
1503     /*
1504     fprintf(fout,
1505     "%s\t%d\t%s\t%d\t%d\t%s\t%s\t%d\t%d\t%s\t%s",
1506     read_name,
1507     sam_flag,
1508     ref_name,
1509     sam_pos,
1510     map_quality,
1511     cigar,
1512     mate_ref_name.c_str(),
1513     mate_pos,
1514     insert_size,
1515     seq.c_str(),
1516     quals.c_str());
1517    
1518     fprintf(fout, "\tNM:i:%d", bh.edit_dist() + indel_distance);
1519     */
1520     vector<string> auxdata;
1521    
1522     if (!sam_readgroup_id.empty())
1523     {
1524     string nm("RG:Z:");
1525     nm += sam_readgroup_id;
1526     auxdata.push_back(nm);
1527     }
1528    
1529     string nm("NM:i:");
1530     str_appendInt(nm, bh.edit_dist() + indel_distance);
1531     auxdata.push_back(nm);
1532     bool containsSplice = false;
1533     for(vector<CigarOp>::const_iterator itr = bh.cigar().begin(); itr != bh.cigar().end(); ++itr)
1534     if(itr->opcode == REF_SKIP){
1535     containsSplice = true;
1536     break;
1537     }
1538    
1539     if (containsSplice) {
1540     //fprintf(fout, "\tXS:A:%c", bh.antisense_splice() ? '-' : '+');
1541     nm="XS:A:";
1542     nm+=(char)(bh.antisense_splice() ? '-' : '+');
1543     auxdata.push_back(nm);
1544     }
1545    
1546     GBamRecord *brec = wbam.new_record(read_name, sam_flag, ref_name, sam_pos, map_quality,
1547     cigar, mate_ref_name.c_str(), mate_pos,
1548     insert_size, seq.c_str(), quals.c_str(), &auxdata);
1549    
1550    
1551    
1552     wbam.write(brec);
1553     delete brec;
1554     }
1555    
1556     /**
1557     * Print a vector of cigar operations to a file.
1558     * @param bh_cigar A vector of CigarOps
1559     * @return a string representation of the cigar string
1560     */
1561     std::string print_cigar(vector<CigarOp>& bh_cigar){
1562     char cigar[256];
1563     cigar[0] = 0;
1564     for (size_t c = 0; c < bh_cigar.size(); ++c)
1565     {
1566     char ibuf[64];
1567     sprintf(ibuf, "%d", bh_cigar[c].length);
1568     switch(bh_cigar[c].opcode)
1569     {
1570     case MATCH:
1571     strcat(cigar, ibuf);
1572     strcat(cigar, "M");
1573     break;
1574     case INS:
1575     strcat(cigar, ibuf);
1576     strcat(cigar, "I");
1577     break;
1578     case DEL:
1579     strcat(cigar, ibuf);
1580     strcat(cigar, "D");
1581     break;
1582     case REF_SKIP:
1583     strcat(cigar, ibuf);
1584     strcat(cigar, "N");
1585     break;
1586     default:
1587     break;
1588     }
1589     }
1590     string result(cigar);
1591     return result;
1592     }
1593    
1594     bool BowtieHit::check_editdist_consistency(const RefSequenceTable& rt)
1595     {
1596     RefSequenceTable::Sequence* ref_str = rt.get_seq(_ref_id);
1597     if (!ref_str)
1598     return false;
1599    
1600     const seqan::Dna5String ref_seq = seqan::infix(*ref_str, _left, right());
1601    
1602     size_t pos_seq = 0;
1603     size_t pos_ref = 0;
1604     size_t mismatch = 0;
1605     for (size_t i = 0; i < _cigar.size(); ++i)
1606     {
1607     CigarOp cigar = _cigar[i];
1608     switch(cigar.opcode)
1609     {
1610     case MATCH:
1611     {
1612     for (size_t j = 0; j < cigar.length; ++j)
1613     {
1614     if (_seq[pos_seq++] != ref_seq[pos_ref++])
1615     ++mismatch;
1616     }
1617     }
1618     break;
1619     case INS:
1620     {
1621     pos_seq += cigar.length;
1622     }
1623     break;
1624    
1625     case DEL:
1626     case REF_SKIP:
1627     {
1628     pos_ref += cigar.length;
1629     }
1630     break;
1631    
1632     default:
1633     break;
1634     }
1635     }
1636    
1637     return mismatch == _edit_dist;
1638     }