ViewVC Help
View File | Revision Log | Show Annotations | View Changeset | Root Listing
root/gclib/tophat_cpp/bwt_map.cpp
Revision: 69
Committed: Mon Sep 19 21:00:41 2011 UTC (8 years, 10 months ago) by gpertea
File size: 52149 byte(s)
Log Message:
wip - implementing SplicedSAMHitFactory::get_hit_from_buf()

Line User Rev File contents
1 gpertea 29 /*
2     * bwt_map.cpp
3     * TopHat
4     *
5     * Created by Cole Trapnell on 11/17/08.
6     * Copyright 2008 Cole Trapnell. All rights reserved.
7     *
8     */
9    
10     #ifdef HAVE_CONFIG_H
11     #include <config.h>
12     #endif
13    
14     #include <cassert>
15     #include <cstdlib>
16     #include <iostream>
17     #include <set>
18     #include <vector>
19     #include <cmath>
20    
21     #include "common.h"
22     #include "bwt_map.h"
23     #include "tokenize.h"
24     #include "reads.h"
25    
26     using namespace std;
27    
28     void HitTable::add_hit(const BowtieHit& bh, bool check_uniqueness)
29     {
30     uint32_t reference_id = bh.ref_id();
31    
32     pair<RefHits::iterator, bool> ret =
33     _hits_for_ref.insert(make_pair(reference_id, HitList()));
34     HitList& hl = ret.first->second;
35    
36     if (check_uniqueness)
37     {
38     // Check uniqueness, in case we are adding spliced hits from
39     // several spliced alignment sources (e.g. de novo hashing + Bowtie
40     // against a user-supplied index). We don't want to count the same
41     // alignment twice if it happened to be found by more than one method
42     HitList::const_iterator lb = lower_bound(hl.begin(), hl.end(), bh, hit_insert_id_lt);
43     HitList::const_iterator ub = upper_bound(hl.begin(), hl.end(), bh, hit_insert_id_lt);
44     for (; lb != ub && lb != hl.end(); ++lb)
45     {
46     if (*lb == bh)
47     {
48     //fprintf(stderr, "Chucking duplicate read %d by identity\n", bh.insert_id());
49     return;
50     }
51    
52     if (lb->insert_id() == bh.insert_id() &&
53     lb->ref_id() == bh.ref_id() &&
54     lb->antisense_align() == bh.antisense_align())
55     {
56     // If we get here, we may be looking at the same alignment
57     // However, spanning_reads may report a shorter, trimmed alignment
58     // so not all fields will be equal. If they just disagree on the
59     // ends, and don't indicate a different junction coord, the
60     // alignments are the same.
61    
62     if ((lb->left() <= bh.left() && lb->right() >= bh.right()) ||
63     (bh.left() <= lb->left() && bh.right() >= lb->right()))
64     {
65     vector<pair<int,int> > lb_gaps, bh_gaps;
66     lb->gaps(lb_gaps);
67     bh.gaps(bh_gaps);
68     if (lb_gaps == bh_gaps)
69     {
70     // One alignment is contained in the other, they agree on
71     // where the gaps, if any, are, and they share an id
72     // => this is a redundant aligment, so toss it
73     //fprintf(stderr, "Chucking duplicate read %d by gap agreement\n", bh.insert_id());
74     return;
75     }
76     }
77     }
78     }
79     }
80     _total_hits++;
81     hl.push_back(bh);
82     }
83    
84     bool hit_insert_id_lt(const BowtieHit& h1, const BowtieHit& h2)
85     {
86     return h1.insert_id() < h2.insert_id();
87     }
88    
89     void LineHitFactory::openStream(HitStream& hs)
90     {
91     if (hs._hit_file==NULL && !hs._hit_file_name.empty()) {
92     //open the file for HitStream here
93     hs._hit_file=fopen(hs._hit_file_name.c_str(),"r");
94     if (hs._hit_file==NULL)
95     err_die("Error opening HitStream file %s\n",hs._hit_file_name.c_str());
96     return;
97     }
98     if (hs._fzpipe!=NULL) {
99     hs._hit_file=hs._fzpipe->file;
100     }
101     }
102    
103     void LineHitFactory::rewind(HitStream& hs)
104     {
105     if (hs._fzpipe!=NULL) {
106     hs._fzpipe->rewind();
107     hs._hit_file=hs._fzpipe->file;
108     }
109     else if (hs._hit_file) ::rewind((FILE*)(hs._hit_file));
110     }
111    
112     bool LineHitFactory::next_record(HitStream& hs, const char*& buf, size_t& buf_size) {
113     FILE* f=(FILE *)(hs._hit_file);
114     bool new_rec = (fgets(_hit_buf, _hit_buf_max_sz - 1, f)!=NULL);
115     if (!new_rec || feof(f)) {
116     hs._eof=true;
117     return false;
118     }
119     ++_line_num;
120     char* nl = strrchr(_hit_buf, '\n');
121     if (nl) *nl = 0;
122     buf = _hit_buf;
123     buf_size = _hit_buf_max_sz - 1;
124     return true;
125     }
126    
127     void LineHitFactory::closeStream(HitStream& hs) {
128     if (hs._fzpipe!=NULL) {
129     hs._fzpipe->close();
130     return;
131     }
132     if (hs._hit_file!=NULL) {
133     fclose((FILE*)(hs._hit_file));
134     hs._hit_file=NULL;
135     }
136     }
137     void BAMHitFactory::openStream(HitStream& hs) {
138     if (hs._hit_file==NULL) {
139     if (hs._hit_file_name.empty())
140     //err_die("Error: invalid HitStream set for BAMHitFactory(file name missing)\n");
141     return; //invalid stream, could be just a place holder
142     //open the file here if not already open
143     string fext=getFext(hs._hit_file_name);
144     if (fext=="sam")
145     hs._hit_file = samopen(hs._hit_file_name.c_str(), "r", 0);
146     else
147     hs._hit_file = samopen(hs._hit_file_name.c_str(), "rb", 0);
148    
149     samfile_t* sam_file=(samfile_t*)(hs._hit_file);
150    
151     if (sam_file == NULL)
152     err_die("Error opening SAM file %s\n", hs._hit_file_name.c_str());
153     if (sam_file->header == NULL)
154     err_die("Error: no SAM header found for file %s\n", hs._hit_file_name.c_str());
155     memset(&_next_hit, 0, sizeof(_next_hit));
156     //_beginning = bgzf_tell(sam_file->x.bam);
157     _sam_header=sam_file->header;
158     if (inspect_header(hs) == false)
159     err_die("Error: invalid SAM header for file %s\n",
160     hs._hit_file_name.c_str());
161     }
162     }
163    
164     void BAMHitFactory::closeStream(HitStream& hs) {
165     if (hs._hit_file) {
166     samclose((samfile_t*)(hs._hit_file));
167     }
168     hs._hit_file=NULL;
169     _sam_header=NULL;
170     }
171    
172     void BAMHitFactory::rewind(HitStream& hs)
173     {
174     /*
175     if (_hit_file && ((samfile_t*)_hit_file)->x.bam)
176     {
177     bgzf_seek(((samfile_t*)_hit_file)->x.bam, _beginning, SEEK_SET);
178     _eof = false;
179     }
180     */
181     this->closeStream(hs);
182     this->openStream(hs);
183     }
184    
185     string BAMHitFactory::hitfile_rec(HitStream& hs, const char* hit_buf) {
186     const bam1_t* bamrec=(const bam1_t*)hit_buf;
187     char* tamline=bam_format1(((samfile_t*)(hs._hit_file))->header, bamrec);
188     string sam_line(tamline);
189     free(tamline);
190     return sam_line;
191     }
192    
193     bool BAMHitFactory::next_record(HitStream& hs, const char*& buf, size_t& buf_size) {
194     if (_next_hit.data) {
195     free(_next_hit.data);
196     _next_hit.data = NULL;
197     }
198     _sam_header=((samfile_t*)(hs._hit_file))->header; //needed by get_hit_from_buf later on
199     if (hs.eof() || !hs.ready()) return false;
200    
201     //mark_curr_pos();
202    
203     memset(&_next_hit, 0, sizeof(_next_hit));
204    
205     int bytes_read = samread((samfile_t*)(hs._hit_file), &_next_hit);
206     if (bytes_read <= 0) {
207     hs._eof = true;
208     return false;
209     }
210     buf = (const char*)&_next_hit;
211     buf_size = bytes_read;
212     return true;
213     }
214    
215    
216     BowtieHit HitFactory::create_hit(const string& insert_name,
217     const string& ref_name,
218     int left,
219     const vector<CigarOp>& cigar,
220     bool antisense_aln,
221     bool antisense_splice,
222     unsigned char edit_dist,
223     unsigned char splice_mms,
224     bool end)
225     {
226     uint64_t insert_id = _insert_table.get_id(insert_name);
227     uint32_t reference_id = _ref_table.get_id(ref_name, NULL, 0);
228    
229     return BowtieHit(reference_id,
230     insert_id,
231     left,
232     cigar,
233     antisense_aln,
234     antisense_splice,
235     edit_dist,
236     splice_mms,
237     end);
238     }
239    
240     BowtieHit HitFactory::create_hit(const string& insert_name,
241     const string& ref_name,
242     uint32_t left,
243     uint32_t read_len,
244     bool antisense_aln,
245     unsigned char edit_dist,
246     bool end)
247     {
248     uint64_t insert_id = _insert_table.get_id(insert_name);
249     uint32_t reference_id = _ref_table.get_id(ref_name, NULL, 0);
250    
251     return BowtieHit(reference_id,
252     insert_id,
253     left,
254     read_len,
255     antisense_aln,
256     edit_dist,
257     end);
258     }
259    
260     bool BowtieHitFactory::get_hit_from_buf(const char* orig_bwt_buf,
261     BowtieHit& bh,
262     bool strip_slash,
263     char* name_out,
264     char* name_tags,
265     char* seq,
266     char* qual)
267     {
268     if (!orig_bwt_buf || !*orig_bwt_buf)
269     return false;
270    
271     static const int buf_size = 2048;
272    
273     char bwt_buf[buf_size];
274     strcpy(bwt_buf, orig_bwt_buf);
275     //const char* bwt_fmt_str = "%s %c %s %d %s %s %d %s";
276    
277     char orientation;
278    
279     //int bwtf_ret = 0;
280     //uint32_t seqid = 0;
281    
282     char text_name[buf_size];
283     unsigned int text_offset;
284    
285    
286     //unsigned int other_occs;
287     char mismatches[buf_size];
288     //memset(mismatches, 0, sizeof(mismatches));
289    
290     const char* buf = bwt_buf;
291     char* name = get_token((char**)&buf,"\t");
292     char* orientation_str = get_token((char**)&buf,"\t");
293     char* text_name_str = get_token((char**)&buf,"\t");
294    
295     strcpy(text_name, text_name_str);
296    
297     char* text_offset_str = get_token((char**)&buf,"\t");
298     char* seq_str = get_token((char**)&buf,"\t");
299     if (seq)
300     strcpy(seq, seq_str);
301    
302     const char* qual_str = get_token((char**)&buf,"\t");
303     if (qual)
304     strcpy(qual, qual_str);
305    
306     /*const char* other_occs_str =*/ get_token((char**)&buf, "\t");
307     mismatches[0] = 0;
308     char* mismatches_str = get_token((char**)&buf, "\t");
309     if (mismatches_str)
310     strcpy(mismatches, mismatches_str);
311    
312     orientation = orientation_str[0];
313     text_offset = atoi(text_offset_str);
314    
315     bool end = true;
316     unsigned int seg_offset = 0;
317     unsigned int seg_num = 0;
318     unsigned int num_segs = 0;
319    
320     // Copy the tag out of the name field before we might wipe it out
321     char* pipe = strrchr(name, '|');
322     if (pipe)
323     {
324     if (name_tags)
325     strcpy(name_tags, pipe);
326    
327     char* tag_buf = pipe + 1;
328     if (strchr(tag_buf, ':'))
329     {
330     sscanf(tag_buf, "%u:%u:%u", &seg_offset, &seg_num, &num_segs);
331     if (seg_num + 1 == num_segs)
332     end = true;
333     else
334     end = false;
335     }
336    
337     *pipe = 0;
338     }
339     // Stripping the slash and number following it gives the insert name
340     char* slash = strrchr(name, '/');
341     if (strip_slash && slash)
342     *slash = 0;
343    
344     size_t read_len = strlen(seq_str);
345    
346     // Add this alignment to the table of hits for this half of the
347     // Bowtie map
348    
349     //vector<string> mismatch_toks;
350     char* pch = strtok (mismatches,",");
351     unsigned char num_mismatches = 0;
352     while (pch != NULL)
353     {
354     char* colon = strchr(pch, ':');
355     if (colon)
356     {
357     num_mismatches++;
358     }
359     //mismatch_toks.push_back(pch);
360     pch = strtok (NULL, ",");
361     }
362    
363     bh = create_hit(name,
364     text_name,
365     text_offset,
366     (int)read_len,
367     orientation == '-',
368     num_mismatches,
369     end);
370    
371     return true;
372     }
373    
374     int anchor_mismatch = 0;
375    
376     bool SplicedBowtieHitFactory::get_hit_from_buf(const char* orig_bwt_buf,
377     BowtieHit& bh,
378     bool strip_slash,
379     char* name_out,
380     char* name_tags,
381     char* seq,
382     char* qual)
383     {
384     if (!orig_bwt_buf || !*orig_bwt_buf)
385     return false;
386    
387     static const int buf_size = 2048;
388    
389     char bwt_buf[buf_size];
390     strcpy(bwt_buf, orig_bwt_buf);
391     //const char* bwt_fmt_str = "%s %c %s %d %s %s %d %s";
392    
393     char orientation;
394     char text_name[buf_size];
395     unsigned int text_offset;
396     char mismatches[buf_size];
397     //memset(mismatches, 0, sizeof(mismatches));
398    
399     char* buf = bwt_buf;
400     char* name = get_token((char**)&buf,"\t");
401     char* orientation_str = get_token((char**)&buf,"\t");
402     char* text_name_str = get_token((char**)&buf,"\t");
403     strcpy(text_name, text_name_str);
404    
405     char* text_offset_str = get_token((char**)&buf,"\t");
406     char* seq_str = get_token((char**)&buf,"\t");
407     if (seq)
408     strcpy(seq, seq_str);
409    
410     const char* qual_str = get_token((char**)&buf,"\t");
411     if (qual)
412     strcpy(qual, qual_str);
413    
414     /*const char* other_occs_str =*/ get_token((char**)&buf, "\t");
415     mismatches[0] = 0;
416     char* mismatches_str = get_token((char**)&buf, "\t");
417     if (mismatches_str)
418     strcpy(mismatches, mismatches_str);
419    
420     orientation = orientation_str[0];
421     text_offset = atoi(text_offset_str);
422    
423     bool end = true;
424     unsigned int seg_offset = 0;
425     unsigned int seg_num = 0;
426     unsigned int num_segs = 0;
427    
428     // Copy the tag out of the name field before we might wipe it out
429     char* pipe = strrchr(name, '|');
430     if (pipe)
431     {
432     if (name_tags)
433     strcpy(name_tags, pipe);
434    
435     char* tag_buf = pipe + 1;
436     if (strchr(tag_buf, ':'))
437     {
438     sscanf(tag_buf, "%u:%u:%u", &seg_offset, &seg_num, &num_segs);
439     if (seg_num + 1 == num_segs)
440     end = true;
441     else
442     end = false;
443     }
444    
445     *pipe = 0;
446     }
447     // Stripping the slash and number following it gives the insert name
448     char* slash = strrchr(name, '/');
449     if (strip_slash && slash)
450     *slash = 0;
451    
452     //int read_len = strlen(sequence);
453    
454     // Add this alignment to the table of hits for this half of the
455     // Bowtie map
456    
457     // Parse the text_name field to recover the splice coords
458     vector<string> toks;
459    
460     tokenize_strict(text_name, "|", toks);
461    
462     int num_extra_toks = (int)toks.size() - 6;
463    
464     if (num_extra_toks >= 0)
465     {
466     static const uint8_t left_window_edge_field = 1;
467     static const uint8_t splice_field = 2;
468     //static const uint8_t right_window_edge_field = 3;
469     static const uint8_t junction_type_field = 4;
470     static const uint8_t strand_field = 5;
471    
472     string contig = toks[0];
473     for (int t = 1; t <= num_extra_toks; ++t)
474     {
475     contig += "|";
476     contig += toks[t];
477     }
478    
479     vector<string> splice_toks;
480     tokenize(toks[num_extra_toks + splice_field], "-", splice_toks);
481     if (splice_toks.size() != 2)
482     {
483     fprintf(stderr, "Warning: found malformed splice record, skipping:\n");
484     //fprintf(stderr, "%s (token: %s)\n", text_name,
485     // toks[num_extra_toks + splice_field].c_str());
486     return false;
487     }
488    
489     //
490     // check for an insertion hit
491     //
492     if(toks[num_extra_toks + junction_type_field] == "ins")
493     {
494     int8_t spliced_read_len = strlen(seq_str);
495     /*
496     * The 0-based position of the left edge of the alignment. Note that this
497     * value may need to be futher corrected to account for the presence of
498     * of the insertion.
499     */
500     uint32_t left = atoi(toks[num_extra_toks + left_window_edge_field].c_str()) + text_offset;
501     uint32_t right = left + spliced_read_len - 1;
502    
503     /*
504     * The 0-based position of the last genomic sequence before the insertion
505     */
506     uint32_t left_splice_pos = atoi(splice_toks[0].c_str());
507    
508     string insertedSequence = splice_toks[1];
509     /*
510     * The 0-based position of the first genomic sequence after teh insertion
511     */
512     uint32_t right_splice_pos = left_splice_pos + 1;
513     if(left > left_splice_pos){
514     /*
515     * The genomic position of the left edge of the alignment needs to be corrected
516     * If the alignment does not extend into the insertion, simply subtract the length
517     * of the inserted sequence, otherwise, just set it equal to the right edge
518     */
519     left = left - insertedSequence.length();
520     if(left < right_splice_pos){
521     left = right_splice_pos;
522     }
523     }
524     if(right > left_splice_pos){
525     right = right - insertedSequence.length();
526     if(right < left_splice_pos){
527     right = left_splice_pos;
528     }
529     }
530     /*
531     * Now, right and left should be properly transformed into genomic coordinates
532     * We should be able to deduce how much the alignment matches the insertion
533     * simply based on the length of the read
534     */
535     int left_match_length = 0;
536     if(left <= left_splice_pos){
537     left_match_length = left_splice_pos - left + 1;
538     }
539     int right_match_length = 0;
540     if(right >= right_splice_pos){
541     right_match_length = right - right_splice_pos + 1;
542     }
543     int insertion_match_length = spliced_read_len - left_match_length - right_match_length;
544    
545     if(left_match_length <= 0 || right_match_length <= 0 || insertion_match_length <= 0)
546     return false;
547    
548     string junction_strand = toks[num_extra_toks + strand_field];
549     if(junction_strand != "rev" && junction_strand != "fwd"){
550     fprintf(stderr, "Malformed insertion record\n");
551     return false;
552     }
553    
554     char* pch = strtok( mismatches, ",");
555     unsigned char num_mismatches = 0;
556    
557     /*
558     * remember that text_offset holds the left end of the
559     * alignment, relative to the start of the contig
560     */
561    
562     /*
563     * The 0-based relative position of the left-most character
564     * before the insertion in the contig
565     */
566     int relative_splice_pos = left_splice_pos - atoi(toks[num_extra_toks + left_window_edge_field].c_str());
567     while (pch != NULL)
568     {
569     char* colon = strchr(pch, ':');
570     if (colon)
571     {
572     *colon = 0;
573     int mismatch_pos = atoi(pch);
574    
575     /*
576     * for reversely mapped reads,
577     * find the correct mismatched position.
578     */
579     if(orientation == '-'){
580     mismatch_pos = spliced_read_len - mismatch_pos - 1;
581     }
582    
583     /*
584     * Only count mismatches outside of the insertion region
585     * If there is a mismatch within the insertion,
586     * disallow this hit
587     */
588     if(mismatch_pos + text_offset <= relative_splice_pos || mismatch_pos + text_offset > relative_splice_pos + insertedSequence.length()){
589     num_mismatches++;
590     }else{
591     return false;
592     }
593     }
594     pch = strtok (NULL, ",");
595     }
596    
597    
598     vector<CigarOp> cigar;
599     cigar.push_back(CigarOp(MATCH, left_match_length));
600     cigar.push_back(CigarOp(INS, insertion_match_length));
601     cigar.push_back(CigarOp(MATCH, right_match_length));
602    
603     /*
604     * For now, disallow hits that don't span
605     * the insertion. If we allow these types of hits,
606     * then long_spanning.cpp needs to be updated
607     * in order to intelligently merge these kinds
608     * of reads back together
609     *
610     * Following code has been changed to allow segment that end
611     * in an insertion
612     */
613     bh = create_hit(name,
614     contig,
615     left,
616     cigar,
617     orientation == '-',
618     junction_strand == "rev",
619     num_mismatches,
620     0,
621     end);
622     return true;
623     }
624    
625     else
626     {
627     uint32_t left = atoi(toks[num_extra_toks + left_window_edge_field].c_str()) + text_offset;
628     int spliced_read_len = strlen(seq_str);
629     int8_t left_splice_pos = atoi(splice_toks[0].c_str()) - left + 1;
630     if(left_splice_pos > spliced_read_len) left_splice_pos = spliced_read_len;
631     int8_t right_splice_pos = spliced_read_len - left_splice_pos;
632    
633     if (right_splice_pos <= 0 || left_splice_pos <= 0)
634     return false;
635    
636     if (orientation == '+')
637     {
638     if (left_splice_pos + seg_offset < _anchor_length){
639     return false;
640     }
641     }
642     else
643     {
644     if (right_splice_pos + seg_offset < _anchor_length)
645     return false;
646     }
647     //uint32_t right = atoi(splice_toks[1].c_str()) + right_splice_pos;
648     //atoi(toks[right_window_edge_field].c_str());
649     int gap_len = atoi(splice_toks[1].c_str()) - atoi(splice_toks[0].c_str()) - 1;
650    
651     string junction_strand = toks[num_extra_toks + strand_field];
652     if (!(junction_strand == "rev" || junction_strand == "fwd")||
653     !(orientation == '-' || orientation == '+'))
654     {
655     fprintf(stderr, "Warning: found malformed splice record, skipping\n");
656     //fprintf(stderr, "junction_strand=%s, orientation='%c'\n",
657     // junction_strand.c_str(), orientation);
658     return false;
659     }
660    
661     //vector<string> mismatch_toks;
662     char* pch = strtok (mismatches,",");
663     int mismatches_in_anchor = 0;
664     unsigned char num_mismatches = 0;
665     while (pch != NULL)
666     {
667     char* colon = strchr(pch, ':');
668     if (colon)
669     {
670     *colon = 0;
671     num_mismatches++;
672     int mismatch_pos = atoi(pch);
673     if ((orientation == '+' && abs(mismatch_pos - left_splice_pos) < (int)min_anchor_len) ||
674     (orientation == '-' && abs(((int)spliced_read_len - left_splice_pos + 1) - mismatch_pos)) < (int)min_anchor_len)
675     mismatches_in_anchor++;
676     }
677     //mismatch_toks.push_back(pch);
678     pch = strtok (NULL, ",");
679     }
680    
681     // FIXME: we probably should exclude these hits somewhere, but this
682     // isn't the right place
683     vector<CigarOp> cigar;
684     cigar.push_back(CigarOp(MATCH, left_splice_pos));
685     if(toks[num_extra_toks + junction_type_field] == "del"){
686     cigar.push_back(CigarOp(DEL, gap_len));
687     }else{
688     cigar.push_back(CigarOp(REF_SKIP, gap_len));
689     }
690     cigar.push_back(CigarOp(MATCH, right_splice_pos));
691    
692     bh = create_hit(name,
693     contig,
694     left,
695     cigar,
696     orientation == '-',
697     junction_strand == "rev",
698     num_mismatches,
699     mismatches_in_anchor,
700     end);
701     return true;
702     }
703     }
704     else
705     {
706     fprintf(stderr, "Warning: found malformed splice record, skipping\n");
707     //fprintf(stderr, "%s\n", orig_bwt_buf);
708     // continue;
709     return false;
710     }
711    
712     return false;
713     }
714    
715     bool SAMHitFactory::get_hit_from_buf(const char* orig_bwt_buf,
716     BowtieHit& bh,
717     bool strip_slash,
718     char* name_out,
719     char* name_tags,
720     char* seq,
721     char* qual)
722     {
723     if (!orig_bwt_buf || !*orig_bwt_buf)
724     return false;
725     char bwt_buf[2048];
726     strcpy(bwt_buf, orig_bwt_buf);
727     // Are we still in the header region?
728     if (bwt_buf[0] == '@')
729     return false;
730    
731     char* buf = bwt_buf;
732     char* name = get_token((char**)&buf,"\t");
733     char* sam_flag_str = get_token((char**)&buf,"\t");
734     char* text_name = get_token((char**)&buf,"\t");
735     char* text_offset_str = get_token((char**)&buf,"\t");
736     const char* map_qual_str = get_token((char**)&buf,"\t");
737     char* cigar_str = get_token((char**)&buf,"\t");
738     const char* mate_ref_str = get_token((char**)&buf,"\t");
739     const char* mate_pos_str = get_token((char**)&buf,"\t");
740     const char* inferred_insert_sz_str = get_token((char**)&buf,"\t");
741    
742     const char* seq_str = get_token((char**)&buf,"\t");
743     if (seq)
744     strcpy(seq, seq_str);
745    
746     const char* qual_str = get_token((char**)&buf,"\t");
747     if (qual)
748     strcpy(qual, qual_str);
749    
750     if (!name ||
751     !sam_flag_str ||
752     !text_name ||
753     !text_offset_str ||
754     !map_qual_str ||
755     !cigar_str ||
756     !mate_ref_str ||
757     !mate_pos_str ||
758     !inferred_insert_sz_str ||
759     !seq_str ||
760     !qual_str)
761     {
762     // truncated or malformed SAM record
763     return false;
764     }
765    
766    
767     int sam_flag = atoi(sam_flag_str);
768     int text_offset = atoi(text_offset_str);
769    
770     bool end = true;
771     unsigned int seg_offset = 0;
772     unsigned int seg_num = 0;
773     unsigned int num_segs = 0;
774    
775     // Copy the tag out of the name field before we might wipe it out
776     char* pipe = strrchr(name, '|');
777     if (pipe)
778     {
779     if (name_tags)
780     strcpy(name_tags, pipe);
781    
782     char* tag_buf = pipe + 1;
783     if (strchr(tag_buf, ':'))
784     {
785     sscanf(tag_buf, "%u:%u:%u", &seg_offset, &seg_num, &num_segs);
786     if (seg_num + 1 == num_segs)
787     end = true;
788     else
789     end = false;
790     }
791    
792     *pipe = 0;
793     }
794    
795     // Stripping the slash and number following it gives the insert name
796     char* slash = strrchr(name, '/');
797     if (strip_slash && slash)
798     *slash = 0;
799    
800     char* p_cig = cigar_str;
801     //int len = strlen(sequence);
802     vector<CigarOp> cigar;
803     bool spliced_alignment = false;
804     // Mostly pilfered direct from the SAM tools:
805     while (*p_cig)
806     {
807     char* t;
808     int length = (int)strtol(p_cig, &t, 10);
809     if (length <= 0)
810     {
811     //fprintf (stderr, "CIGAR op has zero length\n");
812     return false;
813     }
814     char op_char = toupper(*t);
815     CigarOpCode opcode;
816     if (op_char == 'M') opcode = MATCH;
817     else if (op_char == 'I') opcode = INS;
818     else if (op_char == 'D') opcode = DEL;
819     else if (op_char == 'N')
820     {
821     if (length > max_report_intron_length)
822     return false;
823     opcode = REF_SKIP;
824     spliced_alignment = true;
825     }
826     else if (op_char == 'S') opcode = SOFT_CLIP;
827     else if (op_char == 'H') opcode = HARD_CLIP;
828     else if (op_char == 'P') opcode = PAD;
829     else
830     {
831     fprintf (stderr, "invalid CIGAR operation\n");
832     return false;
833     }
834     p_cig = t + 1;
835     //i += length;
836     cigar.push_back(CigarOp(opcode, length));
837     }
838     if (*p_cig)
839     {
840     fprintf (stderr, "unmatched CIGAR operation\n");
841     return false;
842     }
843    
844     //vector<string> attributes;
845     //tokenize(tag_buf, " \t",attributes);
846    
847     bool antisense_splice = false;
848     unsigned char num_mismatches = 0;
849     unsigned char num_splice_anchor_mismatches = 0;
850     const char* tag_buf = buf;
851    
852     // while((tag_buf = get_token((char**)&buf,"\t")))
853     // {
854     // vector<string> tuple_fields;
855     // tokenize(tag_buf,":", tuple_fields);
856     // if (tuple_fields.size() == 3)
857     // {
858     // if (tuple_fields[0] == "XS")
859     // {
860     // if (tuple_fields[2] == "-")
861     // antisense_splice = true;
862     // }
863     // else if (tuple_fields[0] == "NM")
864     // {
865     // num_mismatches = atoi(tuple_fields[2].c_str());
866     // }
867     // else if (tuple_fields[0] == "NS")
868     // {
869     // num_splice_anchor_mismatches = atoi(tuple_fields[2].c_str());
870     // }
871     // else
872     // {
873     // fprintf(stderr, "%s attribute not supported\n", tuple_fields[0].c_str());
874     // return false;
875     // }
876     // }
877     // }
878    
879     while((tag_buf = get_token((char**)&buf,"\t")))
880     {
881     vector<string> tuple_fields;
882     tokenize(tag_buf,":", tuple_fields);
883     if (tuple_fields.size() == 3)
884     {
885     if (tuple_fields[0] == "XS")
886     {
887     if (tuple_fields[2] == "-")
888     antisense_splice = true;
889     }
890     else if (tuple_fields[0] == "NM")
891     {
892     num_mismatches = atoi(tuple_fields[2].c_str());
893     }
894     else if (tuple_fields[0] == "NS")
895     {
896     //ignored for now
897     }
898     else
899     {
900     //fprintf(stderr, "%s attribute not supported\n", tuple_fields[0].c_str());
901     //return false;
902     }
903     }
904     }
905    
906    
907     /*
908     * By convention,the NM field of the SAM record
909     * counts an insertion or deletion. I dont' think
910     * we want the mismatch count in the BowtieHit
911     * record to reflect this. Therefore, subtract out
912     * the mismatches due to in/dels
913     */
914     for(vector<CigarOp>::const_iterator itr = cigar.begin(); itr != cigar.end(); ++itr){
915     if(itr->opcode == INS){
916     num_mismatches -= itr->length;
917     }
918     if(itr->opcode == DEL){
919     num_mismatches -= itr->length;
920     }
921     }
922     // vector<string> toks;
923     // tokenize(tag_buf, ":", toks);
924     if (spliced_alignment)
925     {
926     bh = create_hit(name,
927     text_name,
928     text_offset - 1,
929     cigar,
930     sam_flag & 0x0010,
931     antisense_splice,
932     num_mismatches,
933     num_splice_anchor_mismatches,
934     end);
935     return true;
936    
937     }
938     else
939     {
940     //assert(cigar.size() == 1 && cigar[0].opcode == MATCH);
941    
942     bh = create_hit(name,
943     text_name,
944     text_offset - 1, // SAM files are 1-indexed
945     cigar,
946     sam_flag & 0x0010,
947     false,
948     num_mismatches,
949     0,
950     end);
951     return true;
952     }
953    
954     return false;
955     }
956    
957 gpertea 69 bool SplicedSAMHitFactory::get_hit_from_buf(const char* orig_bwt_buf,
958     BowtieHit& bh,
959     bool strip_slash,
960     char* name_out,
961     char* name_tags,
962     char* seq,
963     char* qual)
964     {
965     if (!orig_bwt_buf || !*orig_bwt_buf)
966     return false;
967 gpertea 29
968 gpertea 69 char bwt_buf[2048];
969     strcpy(bwt_buf, orig_bwt_buf);
970     // Are we still in the header region?
971     if (bwt_buf[0] == '@')
972     return false;
973 gpertea 29
974 gpertea 69 //char orientation;
975     /*
976     char mismatches[buf_size];
977    
978     //char* orientation_str = get_token((char**)&buf,"\t");
979    
980     mismatches[0] = 0;
981     char* mismatches_str = get_token((char**)&buf, "\t");
982     if (mismatches_str)
983     strcpy(mismatches, mismatches_str);
984    
985     orientation = orientation_str[0];
986     text_offset = atoi(text_offset_str);
987     */
988    
989     char* buf = bwt_buf;
990     char* name = get_token((char**)&buf,"\t");
991     char* sam_flag_str = get_token((char**)&buf,"\t");
992     char* text_name = get_token((char**)&buf,"\t");
993     char* text_offset_str = get_token((char**)&buf,"\t");
994     const char* map_qual_str = get_token((char**)&buf,"\t");
995     char* cigar_str = get_token((char**)&buf,"\t");
996     const char* mate_ref_str = get_token((char**)&buf,"\t");
997     const char* mate_pos_str = get_token((char**)&buf,"\t");
998     const char* inferred_insert_sz_str = get_token((char**)&buf,"\t");
999    
1000     const char* seq_str = get_token((char**)&buf,"\t");
1001     if (seq)
1002     strcpy(seq, seq_str);
1003    
1004     const char* qual_str = get_token((char**)&buf,"\t");
1005     if (qual)
1006     strcpy(qual, qual_str);
1007    
1008     if (!name ||
1009     !sam_flag_str ||
1010     !text_name ||
1011     !text_offset_str ||
1012     !map_qual_str ||
1013     !cigar_str ||
1014     !mate_ref_str ||
1015     !mate_pos_str ||
1016     !inferred_insert_sz_str ||
1017     !seq_str ||
1018     !qual_str)
1019     {
1020     // truncated or malformed SAM record
1021     return false;
1022     }
1023    
1024    
1025     int sam_flag = atoi(sam_flag_str);
1026     int text_offset = atoi(text_offset_str);
1027    
1028     //##############################################
1029     bool end = true;
1030     unsigned int seg_offset = 0;
1031     unsigned int seg_num = 0;
1032     unsigned int num_segs = 0;
1033    
1034     // Copy the tag out of the name field before we might wipe it out
1035     char* pipe = strrchr(name, '|');
1036     if (pipe)
1037     {
1038     if (name_tags)
1039     strcpy(name_tags, pipe);
1040    
1041     char* tag_buf = pipe + 1;
1042     if (strchr(tag_buf, ':'))
1043     {
1044     sscanf(tag_buf, "%u:%u:%u", &seg_offset, &seg_num, &num_segs);
1045     if (seg_num + 1 == num_segs)
1046     end = true;
1047     else
1048     end = false;
1049     }
1050    
1051     *pipe = 0;
1052     }
1053     // Stripping the slash and number following it gives the insert name
1054     char* slash = strrchr(name, '/');
1055     if (strip_slash && slash)
1056     *slash = 0;
1057    
1058     //int read_len = strlen(sequence);
1059    
1060     // Add this alignment to the table of hits for this half of the
1061     // Bowtie map
1062    
1063     // Parse the text_name field to recover the splice coords
1064     vector<string> toks;
1065    
1066     tokenize_strict(text_name, "|", toks);
1067    
1068     int num_extra_toks = (int)toks.size() - 6;
1069    
1070     if (num_extra_toks >= 0)
1071     {
1072     static const uint8_t left_window_edge_field = 1;
1073     static const uint8_t splice_field = 2;
1074     //static const uint8_t right_window_edge_field = 3;
1075     static const uint8_t junction_type_field = 4;
1076     static const uint8_t strand_field = 5;
1077    
1078     string contig = toks[0];
1079     for (int t = 1; t <= num_extra_toks; ++t)
1080     {
1081     contig += "|";
1082     contig += toks[t];
1083     }
1084    
1085     vector<string> splice_toks;
1086     tokenize(toks[num_extra_toks + splice_field], "-", splice_toks);
1087     if (splice_toks.size() != 2)
1088     {
1089     fprintf(stderr, "Warning: found malformed splice record, skipping:\n");
1090     //fprintf(stderr, "%s (token: %s)\n", text_name,
1091     // toks[num_extra_toks + splice_field].c_str());
1092     return false;
1093     }
1094    
1095     //
1096     // check for an insertion hit
1097     //
1098     if(toks[num_extra_toks + junction_type_field] == "ins")
1099     {
1100     int8_t spliced_read_len = strlen(seq_str);
1101     /*
1102     * The 0-based position of the left edge of the alignment. Note that this
1103     * value may need to be futher corrected to account for the presence of
1104     * of the insertion.
1105     */
1106     uint32_t left = atoi(toks[num_extra_toks + left_window_edge_field].c_str()) + text_offset;
1107     uint32_t right = left + spliced_read_len - 1;
1108    
1109     /*
1110     * The 0-based position of the last genomic sequence before the insertion
1111     */
1112     uint32_t left_splice_pos = atoi(splice_toks[0].c_str());
1113    
1114     string insertedSequence = splice_toks[1];
1115     /*
1116     * The 0-based position of the first genomic sequence after teh insertion
1117     */
1118     uint32_t right_splice_pos = left_splice_pos + 1;
1119     if(left > left_splice_pos){
1120     /*
1121     * The genomic position of the left edge of the alignment needs to be corrected
1122     * If the alignment does not extend into the insertion, simply subtract the length
1123     * of the inserted sequence, otherwise, just set it equal to the right edge
1124     */
1125     left = left - insertedSequence.length();
1126     if(left < right_splice_pos){
1127     left = right_splice_pos;
1128     }
1129     }
1130     if(right > left_splice_pos){
1131     right = right - insertedSequence.length();
1132     if(right < left_splice_pos){
1133     right = left_splice_pos;
1134     }
1135     }
1136     /*
1137     * Now, right and left should be properly transformed into genomic coordinates
1138     * We should be able to deduce how much the alignment matches the insertion
1139     * simply based on the length of the read
1140     */
1141     int left_match_length = 0;
1142     if(left <= left_splice_pos){
1143     left_match_length = left_splice_pos - left + 1;
1144     }
1145     int right_match_length = 0;
1146     if(right >= right_splice_pos){
1147     right_match_length = right - right_splice_pos + 1;
1148     }
1149     int insertion_match_length = spliced_read_len - left_match_length - right_match_length;
1150    
1151     if(left_match_length <= 0 || right_match_length <= 0 || insertion_match_length <= 0)
1152     return false;
1153    
1154     string junction_strand = toks[num_extra_toks + strand_field];
1155     if(junction_strand != "rev" && junction_strand != "fwd"){
1156     fprintf(stderr, "Malformed insertion record\n");
1157     return false;
1158     }
1159    
1160     char* pch = strtok( mismatches, ",");
1161     unsigned char num_mismatches = 0;
1162    
1163     /*
1164     * remember that text_offset holds the left end of the
1165     * alignment, relative to the start of the contig
1166     */
1167    
1168     /*
1169     * The 0-based relative position of the left-most character
1170     * before the insertion in the contig
1171     */
1172     int relative_splice_pos = left_splice_pos - atoi(toks[num_extra_toks + left_window_edge_field].c_str());
1173     while (pch != NULL)
1174     {
1175     char* colon = strchr(pch, ':');
1176     if (colon)
1177     {
1178     *colon = 0;
1179     int mismatch_pos = atoi(pch);
1180    
1181     /*
1182     * for reversely mapped reads,
1183     * find the correct mismatched position.
1184     */
1185     if (sam_flag & 0x0010){
1186     mismatch_pos = spliced_read_len - mismatch_pos - 1;
1187     }
1188    
1189     /*
1190     * Only count mismatches outside of the insertion region
1191     * If there is a mismatch within the insertion,
1192     * disallow this hit
1193     */
1194     if(mismatch_pos + text_offset <= relative_splice_pos || mismatch_pos + text_offset > relative_splice_pos + insertedSequence.length()){
1195     num_mismatches++;
1196     }else{
1197     return false;
1198     }
1199     }
1200     pch = strtok (NULL, ",");
1201     }
1202    
1203    
1204     vector<CigarOp> cigar;
1205     cigar.push_back(CigarOp(MATCH, left_match_length));
1206     cigar.push_back(CigarOp(INS, insertion_match_length));
1207     cigar.push_back(CigarOp(MATCH, right_match_length));
1208    
1209     /*
1210     * For now, disallow hits that don't span
1211     * the insertion. If we allow these types of hits,
1212     * then long_spanning.cpp needs to be updated
1213     * in order to intelligently merge these kinds
1214     * of reads back together
1215     *
1216     * Following code has been changed to allow segment that end
1217     * in an insertion
1218     */
1219     bh = create_hit(name,
1220     contig,
1221     left,
1222     cigar,
1223     sam_flag & 0x0010, //#######
1224     junction_strand == "rev",
1225     num_mismatches,
1226     0,
1227     end);
1228     return true;
1229     }
1230    
1231     else
1232     {
1233     uint32_t left = atoi(toks[num_extra_toks + left_window_edge_field].c_str()) + text_offset;
1234     int spliced_read_len = strlen(seq_str);
1235     int8_t left_splice_pos = atoi(splice_toks[0].c_str()) - left + 1;
1236     if(left_splice_pos > spliced_read_len) left_splice_pos = spliced_read_len;
1237     int8_t right_splice_pos = spliced_read_len - left_splice_pos;
1238    
1239     if (right_splice_pos <= 0 || left_splice_pos <= 0)
1240     return false;
1241    
1242     if ((sam_flag & 0x0010) == 0) //######
1243     {
1244     if (left_splice_pos + seg_offset < _anchor_length){
1245     return false;
1246     }
1247     }
1248     else
1249     {
1250     if (right_splice_pos + seg_offset < _anchor_length)
1251     return false;
1252     }
1253     //uint32_t right = atoi(splice_toks[1].c_str()) + right_splice_pos;
1254     //atoi(toks[right_window_edge_field].c_str());
1255     int gap_len = atoi(splice_toks[1].c_str()) - atoi(splice_toks[0].c_str()) - 1;
1256    
1257     string junction_strand = toks[num_extra_toks + strand_field];
1258     if (!(junction_strand == "rev" || junction_strand == "fwd")||
1259     !(orientation == '-' || orientation == '+'))
1260     {
1261     fprintf(stderr, "Warning: found malformed splice record, skipping\n");
1262     //fprintf(stderr, "junction_strand=%s, orientation='%c'\n",
1263     // junction_strand.c_str(), orientation);
1264     return false;
1265     }
1266    
1267     //vector<string> mismatch_toks;
1268     char* pch = strtok (mismatches,",");
1269     int mismatches_in_anchor = 0;
1270     unsigned char num_mismatches = 0;
1271     while (pch != NULL)
1272     {
1273     char* colon = strchr(pch, ':');
1274     if (colon)
1275     {
1276     *colon = 0;
1277     num_mismatches++;
1278     int mismatch_pos = atoi(pch);
1279     if ((orientation == '+' && abs(mismatch_pos - left_splice_pos) < (int)min_anchor_len) ||
1280     (orientation == '-' && abs(((int)spliced_read_len - left_splice_pos + 1) - mismatch_pos)) < (int)min_anchor_len)
1281     mismatches_in_anchor++;
1282     }
1283     //mismatch_toks.push_back(pch);
1284     pch = strtok (NULL, ",");
1285     }
1286    
1287     // FIXME: we probably should exclude these hits somewhere, but this
1288     // isn't the right place
1289     vector<CigarOp> cigar;
1290     cigar.push_back(CigarOp(MATCH, left_splice_pos));
1291     if(toks[num_extra_toks + junction_type_field] == "del"){
1292     cigar.push_back(CigarOp(DEL, gap_len));
1293     }else{
1294     cigar.push_back(CigarOp(REF_SKIP, gap_len));
1295     }
1296     cigar.push_back(CigarOp(MATCH, right_splice_pos));
1297    
1298     bh = create_hit(name,
1299     contig,
1300     left,
1301     cigar,
1302     orientation == '-',
1303     junction_strand == "rev",
1304     num_mismatches,
1305     mismatches_in_anchor,
1306     end);
1307     return true;
1308     }
1309     }
1310     else
1311     {
1312     fprintf(stderr, "Warning: found malformed splice record, skipping\n");
1313     //fprintf(stderr, "%s\n", orig_bwt_buf);
1314     // continue;
1315     return false;
1316     }
1317    
1318     return false;
1319     }
1320    
1321    
1322 gpertea 29 bool BAMHitFactory::get_hit_from_buf(const char* orig_bwt_buf,
1323     BowtieHit& bh, bool strip_slash,
1324     char* name_out, char* name_tags,
1325     char* seq, char* qual) {
1326     if (_sam_header==NULL)
1327     err_die("Error: no SAM header when BAMHitFactory::get_hit_from_buf()!");
1328     const bam1_t* hit_buf = (const bam1_t*)orig_bwt_buf;
1329    
1330    
1331     uint32_t sam_flag = hit_buf->core.flag;
1332    
1333     int text_offset = hit_buf->core.pos;
1334     int text_mate_pos = hit_buf->core.mpos;
1335     int target_id = hit_buf->core.tid;
1336     int mate_target_id = hit_buf->core.mtid;
1337    
1338     vector<CigarOp> cigar;
1339     bool spliced_alignment = false;
1340     int num_hits = 1;
1341    
1342     double mapQ = hit_buf->core.qual;
1343     long double error_prob;
1344     if (mapQ > 0)
1345     {
1346     long double p = (-1.0 * mapQ) / 10.0;
1347     error_prob = pow(10.0L, p);
1348     }
1349     else
1350     {
1351     error_prob = 1.0;
1352     }
1353    
1354     //header->target_name[c->tid]
1355    
1356     bool end = true;
1357     unsigned int seg_offset = 0;
1358     unsigned int seg_num = 0;
1359     unsigned int num_segs = 0;
1360     // Copy the tag out of the name field before we might wipe it out
1361     char* qname = bam1_qname(hit_buf);
1362     char* pipe = strrchr(qname, '|');
1363     if (pipe)
1364     {
1365     if (name_tags)
1366     strcpy(name_tags, pipe);
1367    
1368     char* tag_buf = pipe + 1;
1369     if (strchr(tag_buf, ':'))
1370     {
1371     sscanf(tag_buf, "%u:%u:%u", &seg_offset, &seg_num, &num_segs);
1372     if (seg_num + 1 == num_segs)
1373     end = true;
1374     else
1375     end = false;
1376     }
1377    
1378     *pipe = 0;
1379     }
1380    
1381    
1382     if (target_id < 0) {
1383     //assert(cigar.size() == 1 && cigar[0].opcode == MATCH);
1384     bh = create_hit(qname,
1385     "*", //ref_name
1386     0, //left coord
1387     0, //read_len
1388     false, //antisense_aln
1389     0, //edit_dist
1390     end);
1391     return true;
1392     }
1393    
1394     //string text_name = ((samfile_t*)(hs._hit_file))->header->target_name[target_id];
1395     string text_name = _sam_header->target_name[target_id];
1396     for (int i = 0; i < hit_buf->core.n_cigar; ++i)
1397     {
1398     //char* t;
1399    
1400     int length = bam1_cigar(hit_buf)[i] >> BAM_CIGAR_SHIFT;
1401     if (length <= 0)
1402     {
1403     fprintf (stderr, "BAM error: CIGAR op has zero length\n");
1404     return false;
1405     }
1406    
1407     CigarOpCode opcode;
1408     switch(bam1_cigar(hit_buf)[i] & BAM_CIGAR_MASK)
1409     {
1410     case BAM_CMATCH: opcode = MATCH; break;
1411     case BAM_CINS: opcode = INS; break;
1412     case BAM_CDEL: opcode = DEL; break;
1413     case BAM_CSOFT_CLIP: opcode = SOFT_CLIP; break;
1414     case BAM_CHARD_CLIP: opcode = HARD_CLIP; break;
1415     case BAM_CPAD: opcode = PAD; break;
1416     case BAM_CREF_SKIP:
1417     opcode = REF_SKIP;
1418     spliced_alignment = true;
1419     if (length > (int)max_report_intron_length)
1420     {
1421     //fprintf(stderr, "Encounter REF_SKIP > max_gene_length, skipping\n");
1422     return false;
1423     }
1424     break;
1425     default:
1426     fprintf (stderr, "BAM read: invalid CIGAR operation\n");
1427     return false;
1428     }
1429     if (opcode != HARD_CLIP)
1430     cigar.push_back(CigarOp(opcode, length));
1431     }
1432    
1433     string mrnm;
1434     if (mate_target_id >= 0) {
1435     if (mate_target_id == target_id) {
1436     //mrnm = ((samfile_t*)(hs._hit_file))->header->target_name[mate_target_id];
1437     mrnm = _sam_header->target_name[mate_target_id];
1438     }
1439     else {
1440     //fprintf(stderr, "Trans-spliced mates are not currently supported, skipping\n");
1441     return false;
1442     }
1443     }
1444     else {
1445     text_mate_pos = 0;
1446     }
1447     //CuffStrand source_strand = CUFF_STRAND_UNKNOWN;
1448     bool antisense_splice=false;
1449     unsigned char num_mismatches = 0;
1450     unsigned char num_splice_anchor_mismatches = 0;
1451    
1452     uint8_t* ptr = bam_aux_get(hit_buf, "XS");
1453     if (ptr) {
1454     char src_strand_char = bam_aux2A(ptr);
1455     if (src_strand_char == '-')
1456     antisense_splice = true;
1457     // else if (src_strand_char == '+')
1458     // source_strand = CUFF_FWD;
1459     }
1460    
1461     ptr = bam_aux_get(hit_buf, "NM");
1462     if (ptr) {
1463     num_mismatches = bam_aux2i(ptr);
1464     }
1465    
1466     ptr = bam_aux_get(hit_buf, "NH");
1467     if (ptr) {
1468     num_hits = bam_aux2i(ptr);
1469     }
1470    
1471     //bool antisense_aln = bam1_strand(hit_buf);
1472    
1473     //if (_rg_props.strandedness() == STRANDED_PROTOCOL && source_strand == CUFF_STRAND_UNKNOWN)
1474     // source_strand = use_stranded_protocol(sam_flag, antisense_aln, _rg_props.mate_strand_mapping());
1475     if (spliced_alignment) {
1476     //if (source_strand == CUFF_STRAND_UNKNOWN) {
1477     // fprintf(stderr, "BAM record error: found spliced alignment without XS attribute\n");
1478     // }
1479     bh = create_hit(qname,
1480     text_name,
1481     text_offset, // BAM files are 0-indexed
1482     cigar,
1483     sam_flag & 0x0010,
1484     antisense_splice,
1485     num_mismatches,
1486     num_splice_anchor_mismatches,
1487     end);
1488    
1489     }
1490     else {
1491     //assert(_rg_props.strandedness() == STRANDED_PROTOCOL || source_strand == CUFF_STRAND_UNKNOWN);
1492     //assert(cigar.size() == 1 && cigar[0].opcode == MATCH);
1493     bh = create_hit(qname,
1494     text_name,
1495     text_offset, // BAM files are 0-indexed
1496     cigar,
1497     sam_flag & 0x0010,
1498     false,
1499     num_mismatches,
1500     0,
1501     end);
1502     }
1503     if (seq!=NULL) {
1504     char *bseq = (char*)bam1_seq(hit_buf);
1505     for(int i=0;i<(hit_buf->core.l_qseq);i++) {
1506     char v = bam1_seqi(bseq,i);
1507     seq[i]=bam_nt16_rev_table[v];
1508     }
1509     seq[hit_buf->core.l_qseq]=0;
1510     }
1511     if (qual!=NULL) {
1512     char *bq = (char*)bam1_qual(hit_buf);
1513     for(int i=0;i<(hit_buf->core.l_qseq);i++) {
1514     qual[i]=bq[i]+33;
1515     }
1516     qual[hit_buf->core.l_qseq]=0;
1517     }
1518     return true;
1519     }
1520    
1521     bool BAMHitFactory::inspect_header(HitStream& hs)
1522     {
1523     bam_header_t* header = ((samfile_t*)(hs._hit_file))->header;
1524    
1525     if (header == NULL) {
1526     fprintf(stderr, "Warning: No BAM header\n");
1527     return false;
1528     }
1529     if (header->l_text == 0) {
1530     fprintf(stderr, "Warning: BAM header has 0 length or is corrupted. Try using 'samtools reheader'.\n");
1531     return false;
1532     }
1533     return true;
1534     }
1535    
1536    
1537     void get_mapped_reads(FILE* bwtf,
1538     HitTable& hits,
1539     HitFactory& hit_factory,
1540     bool strip_slash,
1541     bool verbose)
1542     {
1543    
1544    
1545     char bwt_buf[2048];
1546     uint32_t reads_extracted = 0;
1547    
1548     while (fgets(bwt_buf, 2048, bwtf))
1549     {
1550     // Chomp the newline
1551     char* nl = strrchr(bwt_buf, '\n');
1552     if (nl) *nl = 0;
1553     if (*bwt_buf == 0)
1554     continue;
1555     // Get a new record from the tab-delimited Bowtie map
1556     BowtieHit bh;
1557     if (hit_factory.get_hit_from_buf(bwt_buf, bh, strip_slash))
1558     {
1559     // Only check uniqueness if these hits are spliced
1560     hits.add_hit(bh, true);
1561     }
1562     reads_extracted++;
1563     }
1564    
1565     // This will sort the map by insert id.
1566     hits.finalize();
1567     fprintf(stderr, "Extracted %d alignments from Bowtie map\n", reads_extracted);
1568     }
1569    
1570    
1571     /*
1572     AlignStatus status(const BowtieHit* align)
1573     {
1574     if (!align)
1575     return UNALIGNED;
1576     if (align->contiguous())
1577     return CONTIGUOUS;
1578     return SPLICED;
1579     }
1580     */
1581    
1582     void add_hits_to_coverage(const HitList& hits, vector<unsigned short>& DoC)
1583     {
1584     int max_hit_pos = -1;
1585     for (size_t i = 0; i < hits.size(); ++i)
1586     {
1587     max_hit_pos = max((int)hits[i].right(),max_hit_pos);
1588     }
1589    
1590     if ((int)DoC.size() < max_hit_pos)
1591     DoC.resize(max_hit_pos);
1592    
1593     for (size_t i = 0; i < hits.size(); ++i)
1594     {
1595     const BowtieHit& bh = hits[i];
1596    
1597     // split up the coverage contibution for this reads
1598     size_t j = bh.left();
1599     const vector<CigarOp>& cigar = bh.cigar();
1600    
1601     for (size_t c = 0 ; c < cigar.size(); ++c)
1602     {
1603     switch(cigar[c].opcode)
1604     {
1605     case MATCH:
1606     for (size_t m = 0; m < cigar[c].length; ++m)
1607     {
1608     if (DoC[j + m] < 0xFFFF)
1609     DoC[j + m]++;
1610     }
1611     //fall through this case to REF_SKIP is intentional
1612     case REF_SKIP:
1613     j += cigar[c].length;
1614     break;
1615     default:
1616     break;
1617     }
1618    
1619     }
1620     }
1621     }
1622    
1623     void add_hit_to_coverage(const BowtieHit& bh, vector<unsigned int>& DoC)
1624     {
1625     if ((int)DoC.size() < bh.right())
1626     DoC.resize(bh.right());
1627    
1628     // split up the coverage contibution for this reads
1629     size_t j = bh.left();
1630     const vector<CigarOp>& cigar = bh.cigar();
1631    
1632     for (size_t c = 0 ; c < cigar.size(); ++c)
1633     {
1634     switch(cigar[c].opcode)
1635     {
1636     case MATCH:
1637     for (size_t m = 0; m < cigar[c].length; ++m)
1638     {
1639     if (DoC[j + m] < 0xFFFFFFFF)
1640     DoC[j + m]++;
1641     }
1642     //fall through this case to REF_SKIP is intentional
1643     case REF_SKIP:
1644     j += cigar[c].length;
1645     break;
1646     default:
1647     break;
1648     }
1649    
1650     }
1651     }
1652    
1653     void print_hit(FILE* fout,
1654     const char* read_name,
1655     const BowtieHit& bh,
1656     const char* ref_name,
1657     const char* sequence,
1658     const char* qualities,
1659     bool from_bowtie)
1660     {
1661     string seq;
1662     string quals;
1663     if (sequence)
1664     {
1665     seq = sequence;
1666     quals = qualities;
1667     seq.resize(bh.read_len());
1668     quals.resize(bh.read_len());
1669     }
1670     else
1671     {
1672     seq = "*";
1673     }
1674    
1675     if (qualities)
1676     {
1677     quals = qualities;
1678     quals.resize(bh.read_len());
1679     }
1680     else
1681     {
1682     quals = "*";
1683     }
1684    
1685     uint32_t sam_flag = 0;
1686     if (bh.antisense_align())
1687     {
1688     sam_flag |= 0x0010; // BAM_FREVERSE
1689     if (sequence && !from_bowtie) // if it is from bowtie hit, it's already reversed.
1690     {
1691     reverse_complement(seq);
1692     reverse(quals.begin(), quals.end());
1693     }
1694     }
1695    
1696     uint32_t sam_pos = bh.left() + 1;
1697     uint32_t map_quality = 255;
1698     char cigar[256];
1699     cigar[0] = 0;
1700     string mate_ref_name = "*";
1701     uint32_t mate_pos = 0;
1702     uint32_t insert_size = 0;
1703     //string qualities = "*";
1704    
1705     const vector<CigarOp>& bh_cigar = bh.cigar();
1706    
1707     /*
1708     * In addition to calculating the cigar string,
1709     * we need to figure out how many in/dels are in the
1710     * sequence, so that we can give the correct
1711     * value for the NM tag
1712     */
1713     int indel_distance = 0;
1714     for (size_t c = 0; c < bh_cigar.size(); ++c)
1715     {
1716     char ibuf[64];
1717     sprintf(ibuf, "%d", bh_cigar[c].length);
1718     switch(bh_cigar[c].opcode)
1719     {
1720     case MATCH:
1721     strcat(cigar, ibuf);
1722     strcat(cigar, "M");
1723     break;
1724     case INS:
1725     strcat(cigar, ibuf);
1726     strcat(cigar, "I");
1727     indel_distance += bh_cigar[c].length;
1728     break;
1729     case DEL:
1730     strcat(cigar, ibuf);
1731     strcat(cigar, "D");
1732     indel_distance += bh_cigar[c].length;
1733     break;
1734     case REF_SKIP:
1735     strcat(cigar, ibuf);
1736     strcat(cigar, "N");
1737     break;
1738     default:
1739     break;
1740     }
1741     }
1742    
1743     //string q = string(bh.read_len(), '!');
1744     //string s = string(bh.read_len(), 'N');
1745    
1746     fprintf(fout,
1747     "%s\t%d\t%s\t%d\t%d\t%s\t%s\t%d\t%d\t%s\t%s",
1748     read_name,
1749     sam_flag,
1750     ref_name,
1751     sam_pos,
1752     map_quality,
1753     cigar,
1754     mate_ref_name.c_str(),
1755     mate_pos,
1756     insert_size,
1757     seq.c_str(),
1758     quals.c_str());
1759    
1760     if (!sam_readgroup_id.empty())
1761     fprintf(fout, "\tRG:Z:%s", sam_readgroup_id.c_str());
1762    
1763     fprintf(fout, "\tNM:i:%d", bh.edit_dist() + indel_distance);
1764    
1765     bool containsSplice = false;
1766     for(vector<CigarOp>::const_iterator itr = bh.cigar().begin(); itr != bh.cigar().end(); ++itr){
1767     if(itr->opcode == REF_SKIP){
1768     containsSplice = true;
1769     break;
1770     }
1771     }
1772    
1773     if (containsSplice)
1774     fprintf(fout, "\tXS:A:%c", bh.antisense_splice() ? '-' : '+');
1775    
1776     fprintf(fout, "\n");
1777     }
1778    
1779     void print_bamhit(GBamWriter& wbam,
1780     const char* read_name,
1781     const BowtieHit& bh,
1782     const char* ref_name,
1783     const char* sequence,
1784     const char* qualities,
1785     bool from_bowtie)
1786     {
1787     string seq;
1788     string quals;
1789     if (sequence) {
1790     seq = sequence;
1791     quals = qualities;
1792     seq.resize(bh.read_len());
1793     quals.resize(bh.read_len());
1794     }
1795     else {
1796     seq = "*";
1797     }
1798     if (qualities) {
1799     quals = qualities;
1800     quals.resize(bh.read_len());
1801     }
1802     else
1803     {
1804     quals = "*";
1805     }
1806    
1807     uint32_t sam_flag = 0;
1808     if (bh.antisense_align())
1809     {
1810     sam_flag |= 0x0010; // BAM_FREVERSE
1811     if (sequence && !from_bowtie) // if it is from bowtie hit, it's already reversed.
1812     {
1813     reverse_complement(seq);
1814     reverse(quals.begin(), quals.end());
1815     }
1816     }
1817    
1818     uint32_t sam_pos = bh.left() + 1;
1819     uint32_t map_quality = 255;
1820     char cigar[256];
1821     cigar[0] = 0;
1822     string mate_ref_name = "*";
1823     uint32_t mate_pos = 0;
1824     uint32_t insert_size = 0;
1825     //string qualities = "*";
1826    
1827     const vector<CigarOp>& bh_cigar = bh.cigar();
1828     /*
1829     * In addition to calculating the cigar string,
1830     * we need to figure out how many in/dels are in the
1831     * sequence, so that we can give the correct
1832     * value for the NM tag
1833     */
1834     int indel_distance = 0;
1835     for (size_t c = 0; c < bh_cigar.size(); ++c)
1836     {
1837     char ibuf[64];
1838     sprintf(ibuf, "%d", bh_cigar[c].length);
1839     switch(bh_cigar[c].opcode)
1840     {
1841     case MATCH:
1842     strcat(cigar, ibuf);
1843     strcat(cigar, "M");
1844     break;
1845     case INS:
1846     strcat(cigar, ibuf);
1847     strcat(cigar, "I");
1848     indel_distance += bh_cigar[c].length;
1849     break;
1850     case DEL:
1851     strcat(cigar, ibuf);
1852     strcat(cigar, "D");
1853     indel_distance += bh_cigar[c].length;
1854     break;
1855     case REF_SKIP:
1856     strcat(cigar, ibuf);
1857     strcat(cigar, "N");
1858     break;
1859     default:
1860     break;
1861     }
1862     }
1863    
1864     //string q = string(bh.read_len(), '!');
1865     //string s = string(bh.read_len(), 'N');
1866     /*
1867     fprintf(fout,
1868     "%s\t%d\t%s\t%d\t%d\t%s\t%s\t%d\t%d\t%s\t%s",
1869     read_name,
1870     sam_flag,
1871     ref_name,
1872     sam_pos,
1873     map_quality,
1874     cigar,
1875     mate_ref_name.c_str(),
1876     mate_pos,
1877     insert_size,
1878     seq.c_str(),
1879     quals.c_str());
1880    
1881     fprintf(fout, "\tNM:i:%d", bh.edit_dist() + indel_distance);
1882     */
1883     vector<string> auxdata;
1884    
1885     if (!sam_readgroup_id.empty())
1886     {
1887     string nm("RG:Z:");
1888     nm += sam_readgroup_id;
1889     auxdata.push_back(nm);
1890     }
1891    
1892     string nm("NM:i:");
1893     str_appendInt(nm, bh.edit_dist() + indel_distance);
1894     auxdata.push_back(nm);
1895     bool containsSplice = false;
1896     for(vector<CigarOp>::const_iterator itr = bh.cigar().begin(); itr != bh.cigar().end(); ++itr)
1897     if(itr->opcode == REF_SKIP){
1898     containsSplice = true;
1899     break;
1900     }
1901    
1902     if (containsSplice) {
1903     //fprintf(fout, "\tXS:A:%c", bh.antisense_splice() ? '-' : '+');
1904     nm="XS:A:";
1905     nm+=(char)(bh.antisense_splice() ? '-' : '+');
1906     auxdata.push_back(nm);
1907     }
1908    
1909     GBamRecord *brec = wbam.new_record(read_name, sam_flag, ref_name, sam_pos, map_quality,
1910     cigar, mate_ref_name.c_str(), mate_pos,
1911     insert_size, seq.c_str(), quals.c_str(), &auxdata);
1912    
1913    
1914    
1915     wbam.write(brec);
1916     delete brec;
1917     }
1918    
1919     /**
1920     * Print a vector of cigar operations to a file.
1921     * @param bh_cigar A vector of CigarOps
1922     * @return a string representation of the cigar string
1923     */
1924     std::string print_cigar(vector<CigarOp>& bh_cigar){
1925     char cigar[256];
1926     cigar[0] = 0;
1927     for (size_t c = 0; c < bh_cigar.size(); ++c)
1928     {
1929     char ibuf[64];
1930     sprintf(ibuf, "%d", bh_cigar[c].length);
1931     switch(bh_cigar[c].opcode)
1932     {
1933     case MATCH:
1934     strcat(cigar, ibuf);
1935     strcat(cigar, "M");
1936     break;
1937     case INS:
1938     strcat(cigar, ibuf);
1939     strcat(cigar, "I");
1940     break;
1941     case DEL:
1942     strcat(cigar, ibuf);
1943     strcat(cigar, "D");
1944     break;
1945     case REF_SKIP:
1946     strcat(cigar, ibuf);
1947     strcat(cigar, "N");
1948     break;
1949     default:
1950     break;
1951     }
1952     }
1953     string result(cigar);
1954     return result;
1955     }
1956    
1957     bool BowtieHit::check_editdist_consistency(const RefSequenceTable& rt)
1958     {
1959     RefSequenceTable::Sequence* ref_str = rt.get_seq(_ref_id);
1960     if (!ref_str)
1961     return false;
1962    
1963     const seqan::Dna5String ref_seq = seqan::infix(*ref_str, _left, right());
1964    
1965     size_t pos_seq = 0;
1966     size_t pos_ref = 0;
1967     size_t mismatch = 0;
1968     for (size_t i = 0; i < _cigar.size(); ++i)
1969     {
1970     CigarOp cigar = _cigar[i];
1971     switch(cigar.opcode)
1972     {
1973     case MATCH:
1974     {
1975     for (size_t j = 0; j < cigar.length; ++j)
1976     {
1977     if (_seq[pos_seq++] != ref_seq[pos_ref++])
1978     ++mismatch;
1979     }
1980     }
1981     break;
1982     case INS:
1983     {
1984     pos_seq += cigar.length;
1985     }
1986     break;
1987    
1988     case DEL:
1989     case REF_SKIP:
1990     {
1991     pos_ref += cigar.length;
1992     }
1993     break;
1994    
1995     default:
1996     break;
1997     }
1998     }
1999    
2000     return mismatch == _edit_dist;
2001     }