ViewVC Help
View File | Revision Log | Show Annotations | View Changeset | Root Listing
root/gclib/tophat_cpp/bwt_map.cpp
Revision: 97
Committed: Thu Oct 6 19:45:36 2011 UTC (8 years, 8 months ago) by gpertea
File size: 52897 byte(s)
Log Message:
by default bam2fastx shows unmapped only

Line User Rev File contents
1 gpertea 29 /*
2     * bwt_map.cpp
3     * TopHat
4     *
5     * Created by Cole Trapnell on 11/17/08.
6     * Copyright 2008 Cole Trapnell. All rights reserved.
7     *
8     */
9    
10     #ifdef HAVE_CONFIG_H
11     #include <config.h>
12     #endif
13    
14     #include <cassert>
15     #include <cstdlib>
16     #include <iostream>
17     #include <set>
18     #include <vector>
19     #include <cmath>
20    
21     #include "common.h"
22     #include "bwt_map.h"
23     #include "tokenize.h"
24     #include "reads.h"
25    
26     using namespace std;
27    
28     void HitTable::add_hit(const BowtieHit& bh, bool check_uniqueness)
29     {
30     uint32_t reference_id = bh.ref_id();
31    
32     pair<RefHits::iterator, bool> ret =
33     _hits_for_ref.insert(make_pair(reference_id, HitList()));
34     HitList& hl = ret.first->second;
35    
36     if (check_uniqueness)
37     {
38     // Check uniqueness, in case we are adding spliced hits from
39     // several spliced alignment sources (e.g. de novo hashing + Bowtie
40     // against a user-supplied index). We don't want to count the same
41     // alignment twice if it happened to be found by more than one method
42     HitList::const_iterator lb = lower_bound(hl.begin(), hl.end(), bh, hit_insert_id_lt);
43     HitList::const_iterator ub = upper_bound(hl.begin(), hl.end(), bh, hit_insert_id_lt);
44     for (; lb != ub && lb != hl.end(); ++lb)
45     {
46     if (*lb == bh)
47     {
48     //fprintf(stderr, "Chucking duplicate read %d by identity\n", bh.insert_id());
49     return;
50     }
51    
52     if (lb->insert_id() == bh.insert_id() &&
53     lb->ref_id() == bh.ref_id() &&
54     lb->antisense_align() == bh.antisense_align())
55     {
56     // If we get here, we may be looking at the same alignment
57     // However, spanning_reads may report a shorter, trimmed alignment
58     // so not all fields will be equal. If they just disagree on the
59     // ends, and don't indicate a different junction coord, the
60     // alignments are the same.
61    
62     if ((lb->left() <= bh.left() && lb->right() >= bh.right()) ||
63     (bh.left() <= lb->left() && bh.right() >= lb->right()))
64     {
65     vector<pair<int,int> > lb_gaps, bh_gaps;
66     lb->gaps(lb_gaps);
67     bh.gaps(bh_gaps);
68     if (lb_gaps == bh_gaps)
69     {
70     // One alignment is contained in the other, they agree on
71     // where the gaps, if any, are, and they share an id
72     // => this is a redundant aligment, so toss it
73     //fprintf(stderr, "Chucking duplicate read %d by gap agreement\n", bh.insert_id());
74     return;
75     }
76     }
77     }
78     }
79     }
80     _total_hits++;
81     hl.push_back(bh);
82     }
83    
84     bool hit_insert_id_lt(const BowtieHit& h1, const BowtieHit& h2)
85     {
86     return h1.insert_id() < h2.insert_id();
87     }
88    
89     void LineHitFactory::openStream(HitStream& hs)
90     {
91     if (hs._hit_file==NULL && !hs._hit_file_name.empty()) {
92     //open the file for HitStream here
93     hs._hit_file=fopen(hs._hit_file_name.c_str(),"r");
94     if (hs._hit_file==NULL)
95     err_die("Error opening HitStream file %s\n",hs._hit_file_name.c_str());
96     return;
97     }
98     if (hs._fzpipe!=NULL) {
99     hs._hit_file=hs._fzpipe->file;
100     }
101     }
102    
103     void LineHitFactory::rewind(HitStream& hs)
104     {
105     if (hs._fzpipe!=NULL) {
106     hs._fzpipe->rewind();
107     hs._hit_file=hs._fzpipe->file;
108     }
109     else if (hs._hit_file) ::rewind((FILE*)(hs._hit_file));
110     }
111    
112     bool LineHitFactory::next_record(HitStream& hs, const char*& buf, size_t& buf_size) {
113     FILE* f=(FILE *)(hs._hit_file);
114     bool new_rec = (fgets(_hit_buf, _hit_buf_max_sz - 1, f)!=NULL);
115     if (!new_rec || feof(f)) {
116     hs._eof=true;
117     return false;
118     }
119     ++_line_num;
120     char* nl = strrchr(_hit_buf, '\n');
121     if (nl) *nl = 0;
122     buf = _hit_buf;
123     buf_size = _hit_buf_max_sz - 1;
124     return true;
125     }
126    
127     void LineHitFactory::closeStream(HitStream& hs) {
128     if (hs._fzpipe!=NULL) {
129     hs._fzpipe->close();
130     return;
131     }
132     if (hs._hit_file!=NULL) {
133     fclose((FILE*)(hs._hit_file));
134     hs._hit_file=NULL;
135     }
136     }
137     void BAMHitFactory::openStream(HitStream& hs) {
138     if (hs._hit_file==NULL) {
139     if (hs._hit_file_name.empty())
140     //err_die("Error: invalid HitStream set for BAMHitFactory(file name missing)\n");
141     return; //invalid stream, could be just a place holder
142     //open the file here if not already open
143     string fext=getFext(hs._hit_file_name);
144     if (fext=="sam")
145     hs._hit_file = samopen(hs._hit_file_name.c_str(), "r", 0);
146     else
147     hs._hit_file = samopen(hs._hit_file_name.c_str(), "rb", 0);
148    
149     samfile_t* sam_file=(samfile_t*)(hs._hit_file);
150    
151     if (sam_file == NULL)
152     err_die("Error opening SAM file %s\n", hs._hit_file_name.c_str());
153     if (sam_file->header == NULL)
154     err_die("Error: no SAM header found for file %s\n", hs._hit_file_name.c_str());
155     memset(&_next_hit, 0, sizeof(_next_hit));
156     //_beginning = bgzf_tell(sam_file->x.bam);
157     _sam_header=sam_file->header;
158     if (inspect_header(hs) == false)
159     err_die("Error: invalid SAM header for file %s\n",
160     hs._hit_file_name.c_str());
161     }
162     }
163    
164     void BAMHitFactory::closeStream(HitStream& hs) {
165     if (hs._hit_file) {
166     samclose((samfile_t*)(hs._hit_file));
167     }
168     hs._hit_file=NULL;
169     _sam_header=NULL;
170     }
171    
172     void BAMHitFactory::rewind(HitStream& hs)
173     {
174     /*
175     if (_hit_file && ((samfile_t*)_hit_file)->x.bam)
176     {
177     bgzf_seek(((samfile_t*)_hit_file)->x.bam, _beginning, SEEK_SET);
178     _eof = false;
179     }
180     */
181     this->closeStream(hs);
182     this->openStream(hs);
183     }
184    
185     string BAMHitFactory::hitfile_rec(HitStream& hs, const char* hit_buf) {
186     const bam1_t* bamrec=(const bam1_t*)hit_buf;
187     char* tamline=bam_format1(((samfile_t*)(hs._hit_file))->header, bamrec);
188     string sam_line(tamline);
189     free(tamline);
190     return sam_line;
191     }
192    
193     bool BAMHitFactory::next_record(HitStream& hs, const char*& buf, size_t& buf_size) {
194     if (_next_hit.data) {
195     free(_next_hit.data);
196     _next_hit.data = NULL;
197     }
198     _sam_header=((samfile_t*)(hs._hit_file))->header; //needed by get_hit_from_buf later on
199     if (hs.eof() || !hs.ready()) return false;
200    
201     //mark_curr_pos();
202    
203     memset(&_next_hit, 0, sizeof(_next_hit));
204    
205     int bytes_read = samread((samfile_t*)(hs._hit_file), &_next_hit);
206     if (bytes_read <= 0) {
207     hs._eof = true;
208     return false;
209     }
210     buf = (const char*)&_next_hit;
211     buf_size = bytes_read;
212     return true;
213     }
214    
215    
216     BowtieHit HitFactory::create_hit(const string& insert_name,
217     const string& ref_name,
218     int left,
219     const vector<CigarOp>& cigar,
220     bool antisense_aln,
221     bool antisense_splice,
222     unsigned char edit_dist,
223     unsigned char splice_mms,
224     bool end)
225     {
226     uint64_t insert_id = _insert_table.get_id(insert_name);
227     uint32_t reference_id = _ref_table.get_id(ref_name, NULL, 0);
228    
229     return BowtieHit(reference_id,
230     insert_id,
231     left,
232     cigar,
233     antisense_aln,
234     antisense_splice,
235     edit_dist,
236     splice_mms,
237     end);
238     }
239    
240     BowtieHit HitFactory::create_hit(const string& insert_name,
241     const string& ref_name,
242     uint32_t left,
243     uint32_t read_len,
244     bool antisense_aln,
245     unsigned char edit_dist,
246     bool end)
247     {
248     uint64_t insert_id = _insert_table.get_id(insert_name);
249     uint32_t reference_id = _ref_table.get_id(ref_name, NULL, 0);
250    
251     return BowtieHit(reference_id,
252     insert_id,
253     left,
254     read_len,
255     antisense_aln,
256     edit_dist,
257     end);
258     }
259    
260     bool BowtieHitFactory::get_hit_from_buf(const char* orig_bwt_buf,
261     BowtieHit& bh,
262     bool strip_slash,
263     char* name_out,
264     char* name_tags,
265     char* seq,
266     char* qual)
267     {
268     if (!orig_bwt_buf || !*orig_bwt_buf)
269     return false;
270    
271     static const int buf_size = 2048;
272    
273     char bwt_buf[buf_size];
274     strcpy(bwt_buf, orig_bwt_buf);
275     //const char* bwt_fmt_str = "%s %c %s %d %s %s %d %s";
276    
277     char orientation;
278    
279     //int bwtf_ret = 0;
280     //uint32_t seqid = 0;
281    
282     char text_name[buf_size];
283     unsigned int text_offset;
284    
285    
286     //unsigned int other_occs;
287     char mismatches[buf_size];
288     //memset(mismatches, 0, sizeof(mismatches));
289    
290     const char* buf = bwt_buf;
291     char* name = get_token((char**)&buf,"\t");
292     char* orientation_str = get_token((char**)&buf,"\t");
293     char* text_name_str = get_token((char**)&buf,"\t");
294    
295     strcpy(text_name, text_name_str);
296    
297     char* text_offset_str = get_token((char**)&buf,"\t");
298     char* seq_str = get_token((char**)&buf,"\t");
299     if (seq)
300     strcpy(seq, seq_str);
301    
302     const char* qual_str = get_token((char**)&buf,"\t");
303     if (qual)
304     strcpy(qual, qual_str);
305    
306     /*const char* other_occs_str =*/ get_token((char**)&buf, "\t");
307     mismatches[0] = 0;
308     char* mismatches_str = get_token((char**)&buf, "\t");
309     if (mismatches_str)
310     strcpy(mismatches, mismatches_str);
311    
312     orientation = orientation_str[0];
313     text_offset = atoi(text_offset_str);
314    
315     bool end = true;
316     unsigned int seg_offset = 0;
317     unsigned int seg_num = 0;
318     unsigned int num_segs = 0;
319    
320     // Copy the tag out of the name field before we might wipe it out
321     char* pipe = strrchr(name, '|');
322     if (pipe)
323     {
324     if (name_tags)
325     strcpy(name_tags, pipe);
326    
327     char* tag_buf = pipe + 1;
328     if (strchr(tag_buf, ':'))
329     {
330     sscanf(tag_buf, "%u:%u:%u", &seg_offset, &seg_num, &num_segs);
331     if (seg_num + 1 == num_segs)
332     end = true;
333     else
334     end = false;
335     }
336    
337     *pipe = 0;
338     }
339     // Stripping the slash and number following it gives the insert name
340     char* slash = strrchr(name, '/');
341     if (strip_slash && slash)
342     *slash = 0;
343    
344     size_t read_len = strlen(seq_str);
345    
346     // Add this alignment to the table of hits for this half of the
347     // Bowtie map
348    
349     //vector<string> mismatch_toks;
350     char* pch = strtok (mismatches,",");
351     unsigned char num_mismatches = 0;
352     while (pch != NULL)
353     {
354     char* colon = strchr(pch, ':');
355     if (colon)
356     {
357     num_mismatches++;
358     }
359     //mismatch_toks.push_back(pch);
360     pch = strtok (NULL, ",");
361     }
362    
363     bh = create_hit(name,
364     text_name,
365     text_offset,
366     (int)read_len,
367     orientation == '-',
368     num_mismatches,
369     end);
370    
371     return true;
372     }
373    
374     int anchor_mismatch = 0;
375    
376 gpertea 70
377     void parseSegReadName(char* name, char*& name_tags, bool strip_slash,
378     bool &end, unsigned int &seg_offset, unsigned int& seg_num,
379     unsigned int & num_segs) {
380     char* pipe = strrchr(name, '|');
381     if (pipe)
382     {
383     if (name_tags)
384     strcpy(name_tags, pipe);
385    
386     char* tag_buf = pipe + 1;
387     if (strchr(tag_buf, ':'))
388     {
389     sscanf(tag_buf, "%u:%u:%u", &seg_offset, &seg_num, &num_segs);
390     if (seg_num + 1 == num_segs)
391     end = true;
392     else
393     end = false;
394     }
395    
396     *pipe = 0;
397     }
398    
399     // Stripping the slash and number following it gives the insert name
400     char* slash = strrchr(name, '/');
401     if (strip_slash && slash)
402     *slash = 0;
403     }
404    
405    
406 gpertea 29 bool SplicedBowtieHitFactory::get_hit_from_buf(const char* orig_bwt_buf,
407     BowtieHit& bh,
408     bool strip_slash,
409     char* name_out,
410     char* name_tags,
411     char* seq,
412     char* qual)
413     {
414     if (!orig_bwt_buf || !*orig_bwt_buf)
415     return false;
416    
417     static const int buf_size = 2048;
418    
419     char bwt_buf[buf_size];
420     strcpy(bwt_buf, orig_bwt_buf);
421     //const char* bwt_fmt_str = "%s %c %s %d %s %s %d %s";
422    
423     char orientation;
424     char text_name[buf_size];
425     unsigned int text_offset;
426     char mismatches[buf_size];
427     //memset(mismatches, 0, sizeof(mismatches));
428    
429     char* buf = bwt_buf;
430     char* name = get_token((char**)&buf,"\t");
431     char* orientation_str = get_token((char**)&buf,"\t");
432     char* text_name_str = get_token((char**)&buf,"\t");
433     strcpy(text_name, text_name_str);
434    
435     char* text_offset_str = get_token((char**)&buf,"\t");
436     char* seq_str = get_token((char**)&buf,"\t");
437     if (seq)
438     strcpy(seq, seq_str);
439    
440     const char* qual_str = get_token((char**)&buf,"\t");
441     if (qual)
442     strcpy(qual, qual_str);
443    
444     /*const char* other_occs_str =*/ get_token((char**)&buf, "\t");
445     mismatches[0] = 0;
446     char* mismatches_str = get_token((char**)&buf, "\t");
447     if (mismatches_str)
448     strcpy(mismatches, mismatches_str);
449    
450     orientation = orientation_str[0];
451     text_offset = atoi(text_offset_str);
452    
453     bool end = true;
454     unsigned int seg_offset = 0;
455     unsigned int seg_num = 0;
456     unsigned int num_segs = 0;
457    
458     // Copy the tag out of the name field before we might wipe it out
459 gpertea 70 parseSegReadName(name, name_tags, strip_slash, end, seg_offset, seg_num, num_segs);
460 gpertea 29
461     // Add this alignment to the table of hits for this half of the
462     // Bowtie map
463    
464     // Parse the text_name field to recover the splice coords
465     vector<string> toks;
466    
467     tokenize_strict(text_name, "|", toks);
468    
469     int num_extra_toks = (int)toks.size() - 6;
470    
471     if (num_extra_toks >= 0)
472     {
473     static const uint8_t left_window_edge_field = 1;
474     static const uint8_t splice_field = 2;
475     //static const uint8_t right_window_edge_field = 3;
476     static const uint8_t junction_type_field = 4;
477     static const uint8_t strand_field = 5;
478    
479     string contig = toks[0];
480     for (int t = 1; t <= num_extra_toks; ++t)
481     {
482     contig += "|";
483     contig += toks[t];
484     }
485    
486     vector<string> splice_toks;
487     tokenize(toks[num_extra_toks + splice_field], "-", splice_toks);
488     if (splice_toks.size() != 2)
489     {
490     fprintf(stderr, "Warning: found malformed splice record, skipping:\n");
491     //fprintf(stderr, "%s (token: %s)\n", text_name,
492     // toks[num_extra_toks + splice_field].c_str());
493     return false;
494     }
495    
496     //
497     // check for an insertion hit
498     //
499     if(toks[num_extra_toks + junction_type_field] == "ins")
500     {
501     int8_t spliced_read_len = strlen(seq_str);
502     /*
503     * The 0-based position of the left edge of the alignment. Note that this
504     * value may need to be futher corrected to account for the presence of
505     * of the insertion.
506     */
507     uint32_t left = atoi(toks[num_extra_toks + left_window_edge_field].c_str()) + text_offset;
508     uint32_t right = left + spliced_read_len - 1;
509    
510     /*
511     * The 0-based position of the last genomic sequence before the insertion
512     */
513     uint32_t left_splice_pos = atoi(splice_toks[0].c_str());
514    
515     string insertedSequence = splice_toks[1];
516     /*
517     * The 0-based position of the first genomic sequence after teh insertion
518     */
519     uint32_t right_splice_pos = left_splice_pos + 1;
520     if(left > left_splice_pos){
521     /*
522     * The genomic position of the left edge of the alignment needs to be corrected
523     * If the alignment does not extend into the insertion, simply subtract the length
524     * of the inserted sequence, otherwise, just set it equal to the right edge
525     */
526     left = left - insertedSequence.length();
527     if(left < right_splice_pos){
528     left = right_splice_pos;
529     }
530     }
531     if(right > left_splice_pos){
532     right = right - insertedSequence.length();
533     if(right < left_splice_pos){
534     right = left_splice_pos;
535     }
536     }
537     /*
538     * Now, right and left should be properly transformed into genomic coordinates
539     * We should be able to deduce how much the alignment matches the insertion
540     * simply based on the length of the read
541     */
542     int left_match_length = 0;
543     if(left <= left_splice_pos){
544     left_match_length = left_splice_pos - left + 1;
545     }
546     int right_match_length = 0;
547     if(right >= right_splice_pos){
548     right_match_length = right - right_splice_pos + 1;
549     }
550     int insertion_match_length = spliced_read_len - left_match_length - right_match_length;
551    
552     if(left_match_length <= 0 || right_match_length <= 0 || insertion_match_length <= 0)
553     return false;
554    
555     string junction_strand = toks[num_extra_toks + strand_field];
556     if(junction_strand != "rev" && junction_strand != "fwd"){
557     fprintf(stderr, "Malformed insertion record\n");
558     return false;
559     }
560    
561     char* pch = strtok( mismatches, ",");
562     unsigned char num_mismatches = 0;
563    
564     /*
565     * remember that text_offset holds the left end of the
566     * alignment, relative to the start of the contig
567     */
568    
569     /*
570     * The 0-based relative position of the left-most character
571     * before the insertion in the contig
572     */
573     int relative_splice_pos = left_splice_pos - atoi(toks[num_extra_toks + left_window_edge_field].c_str());
574     while (pch != NULL)
575     {
576     char* colon = strchr(pch, ':');
577     if (colon)
578     {
579     *colon = 0;
580     int mismatch_pos = atoi(pch);
581    
582     /*
583     * for reversely mapped reads,
584     * find the correct mismatched position.
585     */
586     if(orientation == '-'){
587     mismatch_pos = spliced_read_len - mismatch_pos - 1;
588     }
589    
590     /*
591     * Only count mismatches outside of the insertion region
592     * If there is a mismatch within the insertion,
593     * disallow this hit
594     */
595     if(mismatch_pos + text_offset <= relative_splice_pos || mismatch_pos + text_offset > relative_splice_pos + insertedSequence.length()){
596     num_mismatches++;
597     }else{
598     return false;
599     }
600     }
601     pch = strtok (NULL, ",");
602     }
603    
604    
605     vector<CigarOp> cigar;
606     cigar.push_back(CigarOp(MATCH, left_match_length));
607     cigar.push_back(CigarOp(INS, insertion_match_length));
608     cigar.push_back(CigarOp(MATCH, right_match_length));
609    
610     /*
611     * For now, disallow hits that don't span
612     * the insertion. If we allow these types of hits,
613     * then long_spanning.cpp needs to be updated
614     * in order to intelligently merge these kinds
615     * of reads back together
616     *
617     * Following code has been changed to allow segment that end
618     * in an insertion
619     */
620     bh = create_hit(name,
621     contig,
622     left,
623     cigar,
624     orientation == '-',
625     junction_strand == "rev",
626     num_mismatches,
627     0,
628     end);
629     return true;
630     }
631    
632     else
633     {
634     uint32_t left = atoi(toks[num_extra_toks + left_window_edge_field].c_str()) + text_offset;
635     int spliced_read_len = strlen(seq_str);
636     int8_t left_splice_pos = atoi(splice_toks[0].c_str()) - left + 1;
637     if(left_splice_pos > spliced_read_len) left_splice_pos = spliced_read_len;
638     int8_t right_splice_pos = spliced_read_len - left_splice_pos;
639    
640     if (right_splice_pos <= 0 || left_splice_pos <= 0)
641     return false;
642    
643     if (orientation == '+')
644     {
645     if (left_splice_pos + seg_offset < _anchor_length){
646     return false;
647     }
648     }
649     else
650     {
651     if (right_splice_pos + seg_offset < _anchor_length)
652     return false;
653     }
654     //uint32_t right = atoi(splice_toks[1].c_str()) + right_splice_pos;
655     //atoi(toks[right_window_edge_field].c_str());
656     int gap_len = atoi(splice_toks[1].c_str()) - atoi(splice_toks[0].c_str()) - 1;
657    
658     string junction_strand = toks[num_extra_toks + strand_field];
659     if (!(junction_strand == "rev" || junction_strand == "fwd")||
660     !(orientation == '-' || orientation == '+'))
661     {
662     fprintf(stderr, "Warning: found malformed splice record, skipping\n");
663     //fprintf(stderr, "junction_strand=%s, orientation='%c'\n",
664     // junction_strand.c_str(), orientation);
665     return false;
666     }
667    
668     //vector<string> mismatch_toks;
669     char* pch = strtok (mismatches,",");
670     int mismatches_in_anchor = 0;
671     unsigned char num_mismatches = 0;
672     while (pch != NULL)
673     {
674     char* colon = strchr(pch, ':');
675     if (colon)
676     {
677     *colon = 0;
678     num_mismatches++;
679     int mismatch_pos = atoi(pch);
680     if ((orientation == '+' && abs(mismatch_pos - left_splice_pos) < (int)min_anchor_len) ||
681     (orientation == '-' && abs(((int)spliced_read_len - left_splice_pos + 1) - mismatch_pos)) < (int)min_anchor_len)
682     mismatches_in_anchor++;
683     }
684     //mismatch_toks.push_back(pch);
685     pch = strtok (NULL, ",");
686     }
687    
688     // FIXME: we probably should exclude these hits somewhere, but this
689     // isn't the right place
690     vector<CigarOp> cigar;
691     cigar.push_back(CigarOp(MATCH, left_splice_pos));
692     if(toks[num_extra_toks + junction_type_field] == "del"){
693     cigar.push_back(CigarOp(DEL, gap_len));
694     }else{
695     cigar.push_back(CigarOp(REF_SKIP, gap_len));
696     }
697     cigar.push_back(CigarOp(MATCH, right_splice_pos));
698    
699     bh = create_hit(name,
700     contig,
701     left,
702     cigar,
703     orientation == '-',
704     junction_strand == "rev",
705     num_mismatches,
706     mismatches_in_anchor,
707     end);
708     return true;
709     }
710     }
711     else
712     {
713     fprintf(stderr, "Warning: found malformed splice record, skipping\n");
714     //fprintf(stderr, "%s\n", orig_bwt_buf);
715     // continue;
716     return false;
717     }
718    
719     return false;
720     }
721    
722 gpertea 70
723     char parseCigar(vector<CigarOp>& cigar, const char* cigar_str,
724     bool &spliced_alignment) {
725     // Mostly pilfered direct from the SAM tools:
726     const char* p_cig = cigar_str;
727     while (*p_cig)
728     {
729     char* t;
730     int length = (int)strtol(p_cig, &t, 10);
731     if (length <= 0)
732     {
733     //fprintf (stderr, "CIGAR op has zero length\n");
734     return false;
735     }
736     char op_char = toupper(*t);
737     CigarOpCode opcode;
738     if (op_char == 'M') opcode = MATCH;
739     else if (op_char == 'I') opcode = INS;
740     else if (op_char == 'D') opcode = DEL;
741     else if (op_char == 'N')
742     {
743     if (length > max_report_intron_length)
744     return false;
745     opcode = REF_SKIP;
746     spliced_alignment = true;
747     }
748     else if (op_char == 'S') opcode = SOFT_CLIP;
749     else if (op_char == 'H') opcode = HARD_CLIP;
750     else if (op_char == 'P') opcode = PAD;
751     else
752     {
753     fprintf (stderr, "invalid CIGAR operation\n");
754     return false;
755     }
756     p_cig = t + 1;
757     //i += length;
758     cigar.push_back(CigarOp(opcode, length));
759     }
760     if (*p_cig) {
761     fprintf (stderr, "unmatched CIGAR operation\n");
762     return *p_cig;
763     }
764     return 0;
765     }
766    
767     void getSAMmismatches(char* &buf, vector<CigarOp>& cigar,
768     int* mismatches, int& num_mismatches, int& sam_nm, bool& antisense_splice) {
769     const char* tag_buf = buf;
770     sam_nm=0;
771     num_mismatches=0;
772     while((tag_buf = get_token((char**)&buf,"\t")))
773     {
774     vector<string> tuple_fields;
775     tokenize(tag_buf,":", tuple_fields);
776     if (tuple_fields.size() == 3)
777     {
778     if (tuple_fields[0] == "XS")
779     {
780     if (tuple_fields[2] == "-")
781     antisense_splice = true;
782     }
783     else if (tuple_fields[0] == "NM")
784     {
785     sam_nm = atoi(tuple_fields[2].c_str());
786     }
787     else if (tuple_fields[0] == "NS")
788     {
789     //ignored for now
790     }
791     else if (tuple_fields[0] == "MD")
792     {
793     const char* p=tuple_fields[2].c_str();
794     int bi=0; //base offset position in the read
795     while (*p != 0) {
796     if (isdigit(*p)) {
797     int v=atoi(p);
798     do { p++; } while (isdigit(*p));
799     bi+=v;
800     }
801     while (isupper(*p)) {
802     p++;
803     mismatches[num_mismatches]=bi;
804     num_mismatches++;
805     bi++;
806     }
807     if (*p=='^') { //reference deletion
808     p++;
809     while (isupper(*p)) { //insert read bases
810     p++; bi++;
811     }
812     }
813     }
814     }
815     else
816     {
817     //fprintf(stderr, "%s attribute not supported\n", tuple_fields[0].c_str());
818     //return false;
819     }
820     }
821     }
822     /* By convention,the NM field of the SAM record
823     * counts an insertion or deletion. I dont' think
824     * we want the mismatch count in the BowtieHit
825     * record to reflect this. Therefore, subtract out
826     * the mismatches due to in/dels
827     */
828     for(vector<CigarOp>::const_iterator itr = cigar.begin(); itr != cigar.end(); ++itr){
829     if(itr->opcode == INS){
830     sam_nm -= itr->length;
831     }
832     if(itr->opcode == DEL){
833     sam_nm -= itr->length;
834     }
835     }
836     }
837    
838 gpertea 29 bool SAMHitFactory::get_hit_from_buf(const char* orig_bwt_buf,
839     BowtieHit& bh,
840     bool strip_slash,
841     char* name_out,
842     char* name_tags,
843     char* seq,
844     char* qual)
845     {
846     if (!orig_bwt_buf || !*orig_bwt_buf)
847     return false;
848     char bwt_buf[2048];
849     strcpy(bwt_buf, orig_bwt_buf);
850     // Are we still in the header region?
851     if (bwt_buf[0] == '@')
852     return false;
853    
854     char* buf = bwt_buf;
855     char* name = get_token((char**)&buf,"\t");
856     char* sam_flag_str = get_token((char**)&buf,"\t");
857     char* text_name = get_token((char**)&buf,"\t");
858     char* text_offset_str = get_token((char**)&buf,"\t");
859     const char* map_qual_str = get_token((char**)&buf,"\t");
860     char* cigar_str = get_token((char**)&buf,"\t");
861     const char* mate_ref_str = get_token((char**)&buf,"\t");
862     const char* mate_pos_str = get_token((char**)&buf,"\t");
863     const char* inferred_insert_sz_str = get_token((char**)&buf,"\t");
864    
865     const char* seq_str = get_token((char**)&buf,"\t");
866     if (seq)
867     strcpy(seq, seq_str);
868    
869     const char* qual_str = get_token((char**)&buf,"\t");
870     if (qual)
871     strcpy(qual, qual_str);
872    
873     if (!name ||
874     !sam_flag_str ||
875     !text_name ||
876     !text_offset_str ||
877     !map_qual_str ||
878     !cigar_str ||
879     !mate_ref_str ||
880     !mate_pos_str ||
881     !inferred_insert_sz_str ||
882     !seq_str ||
883     !qual_str)
884     {
885     // truncated or malformed SAM record
886     return false;
887     }
888    
889    
890     int sam_flag = atoi(sam_flag_str);
891     int text_offset = atoi(text_offset_str);
892    
893     bool end = true;
894     unsigned int seg_offset = 0;
895     unsigned int seg_num = 0;
896     unsigned int num_segs = 0;
897    
898     // Copy the tag out of the name field before we might wipe it out
899 gpertea 70 parseSegReadName(name, name_tags, strip_slash, end, seg_offset, seg_num, num_segs);
900 gpertea 29
901     vector<CigarOp> cigar;
902     bool spliced_alignment = false;
903 gpertea 70 if (parseCigar(cigar, cigar_str, spliced_alignment)) {
904     return false;
905     }
906 gpertea 29
907     //vector<string> attributes;
908     //tokenize(tag_buf, " \t",attributes);
909 gpertea 70
910 gpertea 29 bool antisense_splice = false;
911 gpertea 70 int num_mismatches = 0;
912     int sam_nm = 0; //the value of the NM tag (edit distance)
913     int mismatches[1024];
914     getSAMmismatches(buf, cigar, mismatches, num_mismatches, sam_nm, antisense_splice);
915 gpertea 29
916     if (spliced_alignment)
917     {
918     bh = create_hit(name,
919     text_name,
920     text_offset - 1,
921     cigar,
922     sam_flag & 0x0010,
923     antisense_splice,
924     num_mismatches,
925 gpertea 70 0,
926 gpertea 29 end);
927     }
928     else
929     {
930     //assert(cigar.size() == 1 && cigar[0].opcode == MATCH);
931     bh = create_hit(name,
932     text_name,
933     text_offset - 1, // SAM files are 1-indexed
934     cigar,
935     sam_flag & 0x0010,
936     false,
937     num_mismatches,
938     0,
939     end);
940     }
941 gpertea 70 return true;
942 gpertea 29 }
943    
944 gpertea 70
945     void spliceCigar(vector<CigarOp>& splcigar, vector<CigarOp>& cigar,
946     int8_t l_len, int mlen, int8_t r_len, CigarOpCode opcode) {
947     //merge the original 'cigar' with the new insert/gap operation
948     // at position l_len into splcigar;
949     //find the original opcode that croses l_len location
950     int ofs=0;
951     bool found=false;
952     for (size_t c = 0 ; c < cigar.size(); ++c)
953     {
954     ofs+=cigar[c].length;
955     if (!found) {
956     if (ofs>=l_len) {
957     found=true;
958 gpertea 97 //TODO:inject new code here, splitting existing code if necessary
959    
960     continue;
961 gpertea 70 }
962     else {
963     //not found yet, just copy these codes
964     splcigar.push_back(cigar[c]);
965     }
966     }
967 gpertea 97 else { //found already
968     //TODO: check this
969     splcigar.push_back(cigar[c]);
970 gpertea 70 }
971     }
972     }
973    
974 gpertea 69 bool SplicedSAMHitFactory::get_hit_from_buf(const char* orig_bwt_buf,
975     BowtieHit& bh,
976     bool strip_slash,
977     char* name_out,
978     char* name_tags,
979     char* seq,
980     char* qual)
981     {
982     if (!orig_bwt_buf || !*orig_bwt_buf)
983     return false;
984 gpertea 29
985 gpertea 69 char bwt_buf[2048];
986     strcpy(bwt_buf, orig_bwt_buf);
987     // Are we still in the header region?
988     if (bwt_buf[0] == '@')
989     return false;
990 gpertea 29
991 gpertea 69 char* buf = bwt_buf;
992     char* name = get_token((char**)&buf,"\t");
993     char* sam_flag_str = get_token((char**)&buf,"\t");
994     char* text_name = get_token((char**)&buf,"\t");
995     char* text_offset_str = get_token((char**)&buf,"\t");
996     const char* map_qual_str = get_token((char**)&buf,"\t");
997     char* cigar_str = get_token((char**)&buf,"\t");
998     const char* mate_ref_str = get_token((char**)&buf,"\t");
999     const char* mate_pos_str = get_token((char**)&buf,"\t");
1000     const char* inferred_insert_sz_str = get_token((char**)&buf,"\t");
1001 gpertea 70 int num_mismatches=0;
1002     int mismatches[1024]; //list of 0-based mismatch positions in this read
1003     //parsed from SAM's MD:Z: tag
1004 gpertea 69 const char* seq_str = get_token((char**)&buf,"\t");
1005     if (seq)
1006     strcpy(seq, seq_str);
1007    
1008     const char* qual_str = get_token((char**)&buf,"\t");
1009     if (qual)
1010     strcpy(qual, qual_str);
1011    
1012     if (!name ||
1013     !sam_flag_str ||
1014     !text_name ||
1015     !text_offset_str ||
1016     !map_qual_str ||
1017     !cigar_str ||
1018     !mate_ref_str ||
1019     !mate_pos_str ||
1020     !inferred_insert_sz_str ||
1021     !seq_str ||
1022     !qual_str)
1023     {
1024     // truncated or malformed SAM record
1025     return false;
1026     }
1027    
1028    
1029     int sam_flag = atoi(sam_flag_str);
1030     int text_offset = atoi(text_offset_str);
1031 gpertea 70 text_offset--; //SAM is 1-based, Bowtie is 0-based
1032 gpertea 69 bool end = true;
1033     unsigned int seg_offset = 0;
1034     unsigned int seg_num = 0;
1035     unsigned int num_segs = 0;
1036    
1037     // Copy the tag out of the name field before we might wipe it out
1038 gpertea 70 parseSegReadName(name, name_tags, strip_slash, end, seg_offset, seg_num, num_segs);
1039 gpertea 69
1040 gpertea 70 vector<CigarOp> samcigar;
1041     bool spliced_alignment = false;
1042     if (parseCigar(samcigar, cigar_str, spliced_alignment)) {
1043     return false;
1044     }
1045 gpertea 69
1046 gpertea 70 bool antisense_splice = false;
1047     int sam_nm = 0;
1048     getSAMmismatches(buf, samcigar, mismatches, num_mismatches, sam_nm, antisense_splice);
1049 gpertea 69
1050 gpertea 70 //##############################################
1051 gpertea 69
1052     // Add this alignment to the table of hits for this half of the
1053     // Bowtie map
1054    
1055     // Parse the text_name field to recover the splice coords
1056     vector<string> toks;
1057    
1058     tokenize_strict(text_name, "|", toks);
1059    
1060     int num_extra_toks = (int)toks.size() - 6;
1061    
1062     if (num_extra_toks >= 0)
1063     {
1064     static const uint8_t left_window_edge_field = 1;
1065     static const uint8_t splice_field = 2;
1066     //static const uint8_t right_window_edge_field = 3;
1067     static const uint8_t junction_type_field = 4;
1068     static const uint8_t strand_field = 5;
1069    
1070     string contig = toks[0];
1071     for (int t = 1; t <= num_extra_toks; ++t)
1072     {
1073     contig += "|";
1074     contig += toks[t];
1075     }
1076    
1077     vector<string> splice_toks;
1078     tokenize(toks[num_extra_toks + splice_field], "-", splice_toks);
1079     if (splice_toks.size() != 2)
1080     {
1081     fprintf(stderr, "Warning: found malformed splice record, skipping:\n");
1082 gpertea 70 //fprintf(stderr, "\t%s (token: %s)\n", text_name,
1083 gpertea 69 // toks[num_extra_toks + splice_field].c_str());
1084     return false;
1085     }
1086 gpertea 70 int spl_num_mismatches=0;
1087 gpertea 69 //
1088     // check for an insertion hit
1089     //
1090     if(toks[num_extra_toks + junction_type_field] == "ins")
1091     {
1092     int8_t spliced_read_len = strlen(seq_str);
1093 gpertea 70 // The 0-based position of the left edge of the alignment. Note that this
1094     // value may need to be futher corrected to account for the presence of
1095     // of the insertion.
1096 gpertea 69 uint32_t left = atoi(toks[num_extra_toks + left_window_edge_field].c_str()) + text_offset;
1097     uint32_t right = left + spliced_read_len - 1;
1098    
1099 gpertea 70 // The 0-based position of the last genomic sequence before the insertion
1100 gpertea 69 uint32_t left_splice_pos = atoi(splice_toks[0].c_str());
1101    
1102     string insertedSequence = splice_toks[1];
1103 gpertea 70 // The 0-based position of the first genomic sequence after the insertion
1104    
1105 gpertea 69 uint32_t right_splice_pos = left_splice_pos + 1;
1106     if(left > left_splice_pos){
1107     /*
1108     * The genomic position of the left edge of the alignment needs to be corrected
1109     * If the alignment does not extend into the insertion, simply subtract the length
1110     * of the inserted sequence, otherwise, just set it equal to the right edge
1111     */
1112     left = left - insertedSequence.length();
1113     if(left < right_splice_pos){
1114     left = right_splice_pos;
1115     }
1116     }
1117     if(right > left_splice_pos){
1118     right = right - insertedSequence.length();
1119     if(right < left_splice_pos){
1120     right = left_splice_pos;
1121     }
1122     }
1123     /*
1124     * Now, right and left should be properly transformed into genomic coordinates
1125     * We should be able to deduce how much the alignment matches the insertion
1126     * simply based on the length of the read
1127     */
1128     int left_match_length = 0;
1129     if(left <= left_splice_pos){
1130     left_match_length = left_splice_pos - left + 1;
1131     }
1132     int right_match_length = 0;
1133     if(right >= right_splice_pos){
1134     right_match_length = right - right_splice_pos + 1;
1135     }
1136     int insertion_match_length = spliced_read_len - left_match_length - right_match_length;
1137    
1138     if(left_match_length <= 0 || right_match_length <= 0 || insertion_match_length <= 0)
1139     return false;
1140    
1141     string junction_strand = toks[num_extra_toks + strand_field];
1142     if(junction_strand != "rev" && junction_strand != "fwd"){
1143     fprintf(stderr, "Malformed insertion record\n");
1144     return false;
1145     }
1146    
1147 gpertea 70 //char* pch = strtok( mismatches, ",");
1148     //unsigned char num_mismatches = 0;
1149     //text_offset holds the left end of the alignment,
1150     //RELATIVE TO the start of the contig
1151 gpertea 69
1152 gpertea 70 //The 0-based relative position of the left-most character
1153     //before the insertion in the contig
1154 gpertea 69 int relative_splice_pos = left_splice_pos - atoi(toks[num_extra_toks + left_window_edge_field].c_str());
1155 gpertea 70 for (int i=0;i<num_mismatches;i++) {
1156     int mismatch_pos = mismatches[i];
1157     // for reversely mapped reads,
1158     //find the correct mismatched position.
1159     if (sam_flag & 0x0010){
1160     mismatch_pos = spliced_read_len - mismatch_pos - 1;
1161     }
1162 gpertea 69
1163 gpertea 70 //Only count mismatches outside of the insertion region
1164     // If there is a mismatch within the insertion,
1165     // disallow this hit
1166     if(mismatch_pos + text_offset <= relative_splice_pos || mismatch_pos + text_offset > relative_splice_pos + insertedSequence.length()){
1167     spl_num_mismatches++;
1168     }else{
1169     return false;
1170 gpertea 69 }
1171     }
1172    
1173 gpertea 70 //TODO: merge with the original cigar string
1174     vector<CigarOp> splcigar;
1175     spliceCigar(splcigar, samcigar, left_match_length, insertion_match_length, right_match_length, INS);
1176 gpertea 69 /*
1177 gpertea 70 splcigar.push_back(CigarOp(MATCH, left_match_length));
1178     splcigar.push_back(CigarOp(INS, insertion_match_length));
1179     splcigar.push_back(CigarOp(MATCH, right_match_length));
1180     */
1181     /*
1182 gpertea 69 * For now, disallow hits that don't span
1183     * the insertion. If we allow these types of hits,
1184     * then long_spanning.cpp needs to be updated
1185     * in order to intelligently merge these kinds
1186     * of reads back together
1187     *
1188     * Following code has been changed to allow segment that end
1189     * in an insertion
1190     */
1191     bh = create_hit(name,
1192     contig,
1193     left,
1194 gpertea 70 //splcigar,
1195     splcigar,
1196     sam_flag & 0x0010,
1197 gpertea 69 junction_strand == "rev",
1198 gpertea 70 spl_num_mismatches,
1199 gpertea 69 0,
1200     end);
1201     return true;
1202 gpertea 70 } //"ins"
1203 gpertea 69 else
1204     {
1205     uint32_t left = atoi(toks[num_extra_toks + left_window_edge_field].c_str()) + text_offset;
1206     int spliced_read_len = strlen(seq_str);
1207     int8_t left_splice_pos = atoi(splice_toks[0].c_str()) - left + 1;
1208     if(left_splice_pos > spliced_read_len) left_splice_pos = spliced_read_len;
1209     int8_t right_splice_pos = spliced_read_len - left_splice_pos;
1210    
1211     if (right_splice_pos <= 0 || left_splice_pos <= 0)
1212     return false;
1213    
1214     if ((sam_flag & 0x0010) == 0) //######
1215     {
1216 gpertea 70 if (left_splice_pos + seg_offset < _anchor_length)
1217     return false;
1218 gpertea 69 }
1219     else
1220     {
1221 gpertea 70 if (right_splice_pos + seg_offset < _anchor_length)
1222     return false;
1223 gpertea 69 }
1224     //uint32_t right = atoi(splice_toks[1].c_str()) + right_splice_pos;
1225     //atoi(toks[right_window_edge_field].c_str());
1226     int gap_len = atoi(splice_toks[1].c_str()) - atoi(splice_toks[0].c_str()) - 1;
1227    
1228     string junction_strand = toks[num_extra_toks + strand_field];
1229 gpertea 70 if (!(junction_strand == "rev" || junction_strand == "fwd"))
1230     // || !(orientation == '-' || orientation == '+'))
1231 gpertea 69 {
1232 gpertea 70 fprintf(stderr, "Warning: found malformed splice record, skipping\n");
1233     //fprintf(stderr, "junction_strand=%s, orientation='%c'\n",
1234     // junction_strand.c_str(), orientation);
1235     return false;
1236 gpertea 69 }
1237    
1238     int mismatches_in_anchor = 0;
1239 gpertea 70 for (int i=0;i<num_mismatches;i++) {
1240     spl_num_mismatches++;
1241     int mismatch_pos = mismatches[i];
1242     if (((sam_flag & 0x0010) == 0 && abs(mismatch_pos - left_splice_pos) < (int)min_anchor_len) ||
1243     ((sam_flag & 0x0010) != 0 &&
1244     abs(((int)spliced_read_len - left_splice_pos + 1) - mismatch_pos)) < (int)min_anchor_len)
1245     mismatches_in_anchor++;
1246     }
1247 gpertea 69
1248     // FIXME: we probably should exclude these hits somewhere, but this
1249     // isn't the right place
1250 gpertea 70 vector<CigarOp> splcigar;
1251     CigarOpCode opcode=(toks[num_extra_toks + junction_type_field] == "del")? DEL : REF_SKIP ;
1252     spliceCigar(splcigar, samcigar, left_splice_pos, gap_len, right_splice_pos, INS);
1253     /*
1254     splcigar.push_back(CigarOp(MATCH, left_splice_pos));
1255 gpertea 69 if(toks[num_extra_toks + junction_type_field] == "del"){
1256 gpertea 70 splcigar.push_back(CigarOp(DEL, gap_len));
1257 gpertea 69 }else{
1258 gpertea 70 splcigar.push_back(CigarOp(REF_SKIP, gap_len));
1259 gpertea 69 }
1260 gpertea 70 splcigar.push_back(CigarOp(MATCH, right_splice_pos));
1261     */
1262 gpertea 69 bh = create_hit(name,
1263     contig,
1264     left,
1265 gpertea 70 splcigar,
1266     (sam_flag & 0x0010),
1267 gpertea 69 junction_strand == "rev",
1268 gpertea 70 spl_num_mismatches,
1269 gpertea 69 mismatches_in_anchor,
1270     end);
1271     return true;
1272     }
1273 gpertea 70 } //parse splice data
1274 gpertea 69 else
1275     {
1276     fprintf(stderr, "Warning: found malformed splice record, skipping\n");
1277     //fprintf(stderr, "%s\n", orig_bwt_buf);
1278     // continue;
1279     return false;
1280     }
1281    
1282     return false;
1283     }
1284    
1285    
1286 gpertea 29 bool BAMHitFactory::get_hit_from_buf(const char* orig_bwt_buf,
1287     BowtieHit& bh, bool strip_slash,
1288     char* name_out, char* name_tags,
1289     char* seq, char* qual) {
1290     if (_sam_header==NULL)
1291     err_die("Error: no SAM header when BAMHitFactory::get_hit_from_buf()!");
1292     const bam1_t* hit_buf = (const bam1_t*)orig_bwt_buf;
1293    
1294    
1295     uint32_t sam_flag = hit_buf->core.flag;
1296    
1297     int text_offset = hit_buf->core.pos;
1298     int text_mate_pos = hit_buf->core.mpos;
1299     int target_id = hit_buf->core.tid;
1300     int mate_target_id = hit_buf->core.mtid;
1301    
1302     vector<CigarOp> cigar;
1303     bool spliced_alignment = false;
1304     int num_hits = 1;
1305    
1306     double mapQ = hit_buf->core.qual;
1307     long double error_prob;
1308     if (mapQ > 0)
1309     {
1310     long double p = (-1.0 * mapQ) / 10.0;
1311     error_prob = pow(10.0L, p);
1312     }
1313     else
1314     {
1315     error_prob = 1.0;
1316     }
1317    
1318     //header->target_name[c->tid]
1319    
1320     bool end = true;
1321     unsigned int seg_offset = 0;
1322     unsigned int seg_num = 0;
1323     unsigned int num_segs = 0;
1324     // Copy the tag out of the name field before we might wipe it out
1325     char* qname = bam1_qname(hit_buf);
1326     char* pipe = strrchr(qname, '|');
1327     if (pipe)
1328     {
1329     if (name_tags)
1330     strcpy(name_tags, pipe);
1331    
1332     char* tag_buf = pipe + 1;
1333     if (strchr(tag_buf, ':'))
1334     {
1335     sscanf(tag_buf, "%u:%u:%u", &seg_offset, &seg_num, &num_segs);
1336     if (seg_num + 1 == num_segs)
1337     end = true;
1338     else
1339     end = false;
1340     }
1341    
1342     *pipe = 0;
1343     }
1344    
1345    
1346     if (target_id < 0) {
1347     //assert(cigar.size() == 1 && cigar[0].opcode == MATCH);
1348     bh = create_hit(qname,
1349     "*", //ref_name
1350     0, //left coord
1351     0, //read_len
1352     false, //antisense_aln
1353     0, //edit_dist
1354     end);
1355     return true;
1356     }
1357    
1358     //string text_name = ((samfile_t*)(hs._hit_file))->header->target_name[target_id];
1359     string text_name = _sam_header->target_name[target_id];
1360     for (int i = 0; i < hit_buf->core.n_cigar; ++i)
1361     {
1362     //char* t;
1363    
1364     int length = bam1_cigar(hit_buf)[i] >> BAM_CIGAR_SHIFT;
1365     if (length <= 0)
1366     {
1367     fprintf (stderr, "BAM error: CIGAR op has zero length\n");
1368     return false;
1369     }
1370    
1371     CigarOpCode opcode;
1372     switch(bam1_cigar(hit_buf)[i] & BAM_CIGAR_MASK)
1373     {
1374     case BAM_CMATCH: opcode = MATCH; break;
1375     case BAM_CINS: opcode = INS; break;
1376     case BAM_CDEL: opcode = DEL; break;
1377     case BAM_CSOFT_CLIP: opcode = SOFT_CLIP; break;
1378     case BAM_CHARD_CLIP: opcode = HARD_CLIP; break;
1379     case BAM_CPAD: opcode = PAD; break;
1380     case BAM_CREF_SKIP:
1381     opcode = REF_SKIP;
1382     spliced_alignment = true;
1383     if (length > (int)max_report_intron_length)
1384     {
1385     //fprintf(stderr, "Encounter REF_SKIP > max_gene_length, skipping\n");
1386     return false;
1387     }
1388     break;
1389     default:
1390     fprintf (stderr, "BAM read: invalid CIGAR operation\n");
1391     return false;
1392     }
1393     if (opcode != HARD_CLIP)
1394     cigar.push_back(CigarOp(opcode, length));
1395     }
1396    
1397     string mrnm;
1398     if (mate_target_id >= 0) {
1399     if (mate_target_id == target_id) {
1400     //mrnm = ((samfile_t*)(hs._hit_file))->header->target_name[mate_target_id];
1401     mrnm = _sam_header->target_name[mate_target_id];
1402     }
1403     else {
1404     //fprintf(stderr, "Trans-spliced mates are not currently supported, skipping\n");
1405     return false;
1406     }
1407     }
1408     else {
1409     text_mate_pos = 0;
1410     }
1411     //CuffStrand source_strand = CUFF_STRAND_UNKNOWN;
1412     bool antisense_splice=false;
1413     unsigned char num_mismatches = 0;
1414     unsigned char num_splice_anchor_mismatches = 0;
1415    
1416     uint8_t* ptr = bam_aux_get(hit_buf, "XS");
1417     if (ptr) {
1418     char src_strand_char = bam_aux2A(ptr);
1419     if (src_strand_char == '-')
1420     antisense_splice = true;
1421     // else if (src_strand_char == '+')
1422     // source_strand = CUFF_FWD;
1423     }
1424    
1425     ptr = bam_aux_get(hit_buf, "NM");
1426     if (ptr) {
1427     num_mismatches = bam_aux2i(ptr);
1428     }
1429    
1430     ptr = bam_aux_get(hit_buf, "NH");
1431     if (ptr) {
1432     num_hits = bam_aux2i(ptr);
1433     }
1434    
1435     //bool antisense_aln = bam1_strand(hit_buf);
1436    
1437     //if (_rg_props.strandedness() == STRANDED_PROTOCOL && source_strand == CUFF_STRAND_UNKNOWN)
1438     // source_strand = use_stranded_protocol(sam_flag, antisense_aln, _rg_props.mate_strand_mapping());
1439     if (spliced_alignment) {
1440     //if (source_strand == CUFF_STRAND_UNKNOWN) {
1441     // fprintf(stderr, "BAM record error: found spliced alignment without XS attribute\n");
1442     // }
1443     bh = create_hit(qname,
1444     text_name,
1445     text_offset, // BAM files are 0-indexed
1446     cigar,
1447     sam_flag & 0x0010,
1448     antisense_splice,
1449     num_mismatches,
1450     num_splice_anchor_mismatches,
1451     end);
1452    
1453     }
1454     else {
1455     //assert(_rg_props.strandedness() == STRANDED_PROTOCOL || source_strand == CUFF_STRAND_UNKNOWN);
1456     //assert(cigar.size() == 1 && cigar[0].opcode == MATCH);
1457     bh = create_hit(qname,
1458     text_name,
1459     text_offset, // BAM files are 0-indexed
1460     cigar,
1461     sam_flag & 0x0010,
1462     false,
1463     num_mismatches,
1464     0,
1465     end);
1466     }
1467     if (seq!=NULL) {
1468     char *bseq = (char*)bam1_seq(hit_buf);
1469     for(int i=0;i<(hit_buf->core.l_qseq);i++) {
1470     char v = bam1_seqi(bseq,i);
1471     seq[i]=bam_nt16_rev_table[v];
1472     }
1473     seq[hit_buf->core.l_qseq]=0;
1474     }
1475     if (qual!=NULL) {
1476     char *bq = (char*)bam1_qual(hit_buf);
1477     for(int i=0;i<(hit_buf->core.l_qseq);i++) {
1478     qual[i]=bq[i]+33;
1479     }
1480     qual[hit_buf->core.l_qseq]=0;
1481     }
1482     return true;
1483     }
1484    
1485     bool BAMHitFactory::inspect_header(HitStream& hs)
1486     {
1487     bam_header_t* header = ((samfile_t*)(hs._hit_file))->header;
1488    
1489     if (header == NULL) {
1490     fprintf(stderr, "Warning: No BAM header\n");
1491     return false;
1492     }
1493     if (header->l_text == 0) {
1494     fprintf(stderr, "Warning: BAM header has 0 length or is corrupted. Try using 'samtools reheader'.\n");
1495     return false;
1496     }
1497     return true;
1498     }
1499    
1500    
1501     void get_mapped_reads(FILE* bwtf,
1502     HitTable& hits,
1503     HitFactory& hit_factory,
1504     bool strip_slash,
1505     bool verbose)
1506     {
1507    
1508    
1509     char bwt_buf[2048];
1510     uint32_t reads_extracted = 0;
1511    
1512     while (fgets(bwt_buf, 2048, bwtf))
1513     {
1514     // Chomp the newline
1515     char* nl = strrchr(bwt_buf, '\n');
1516     if (nl) *nl = 0;
1517     if (*bwt_buf == 0)
1518     continue;
1519     // Get a new record from the tab-delimited Bowtie map
1520     BowtieHit bh;
1521     if (hit_factory.get_hit_from_buf(bwt_buf, bh, strip_slash))
1522     {
1523     // Only check uniqueness if these hits are spliced
1524     hits.add_hit(bh, true);
1525     }
1526     reads_extracted++;
1527     }
1528    
1529     // This will sort the map by insert id.
1530     hits.finalize();
1531     fprintf(stderr, "Extracted %d alignments from Bowtie map\n", reads_extracted);
1532     }
1533    
1534    
1535     /*
1536     AlignStatus status(const BowtieHit* align)
1537     {
1538     if (!align)
1539     return UNALIGNED;
1540     if (align->contiguous())
1541     return CONTIGUOUS;
1542     return SPLICED;
1543     }
1544     */
1545    
1546     void add_hits_to_coverage(const HitList& hits, vector<unsigned short>& DoC)
1547     {
1548     int max_hit_pos = -1;
1549     for (size_t i = 0; i < hits.size(); ++i)
1550     {
1551     max_hit_pos = max((int)hits[i].right(),max_hit_pos);
1552     }
1553    
1554     if ((int)DoC.size() < max_hit_pos)
1555     DoC.resize(max_hit_pos);
1556    
1557     for (size_t i = 0; i < hits.size(); ++i)
1558     {
1559     const BowtieHit& bh = hits[i];
1560    
1561     // split up the coverage contibution for this reads
1562     size_t j = bh.left();
1563     const vector<CigarOp>& cigar = bh.cigar();
1564    
1565     for (size_t c = 0 ; c < cigar.size(); ++c)
1566     {
1567     switch(cigar[c].opcode)
1568     {
1569     case MATCH:
1570     for (size_t m = 0; m < cigar[c].length; ++m)
1571     {
1572     if (DoC[j + m] < 0xFFFF)
1573     DoC[j + m]++;
1574     }
1575     //fall through this case to REF_SKIP is intentional
1576     case REF_SKIP:
1577     j += cigar[c].length;
1578     break;
1579     default:
1580     break;
1581     }
1582    
1583     }
1584     }
1585     }
1586    
1587     void add_hit_to_coverage(const BowtieHit& bh, vector<unsigned int>& DoC)
1588     {
1589     if ((int)DoC.size() < bh.right())
1590     DoC.resize(bh.right());
1591    
1592     // split up the coverage contibution for this reads
1593     size_t j = bh.left();
1594     const vector<CigarOp>& cigar = bh.cigar();
1595    
1596     for (size_t c = 0 ; c < cigar.size(); ++c)
1597     {
1598     switch(cigar[c].opcode)
1599     {
1600     case MATCH:
1601     for (size_t m = 0; m < cigar[c].length; ++m)
1602     {
1603     if (DoC[j + m] < 0xFFFFFFFF)
1604     DoC[j + m]++;
1605     }
1606     //fall through this case to REF_SKIP is intentional
1607     case REF_SKIP:
1608     j += cigar[c].length;
1609     break;
1610     default:
1611     break;
1612     }
1613    
1614     }
1615     }
1616    
1617     void print_hit(FILE* fout,
1618     const char* read_name,
1619     const BowtieHit& bh,
1620     const char* ref_name,
1621     const char* sequence,
1622     const char* qualities,
1623     bool from_bowtie)
1624     {
1625     string seq;
1626     string quals;
1627     if (sequence)
1628     {
1629     seq = sequence;
1630     quals = qualities;
1631     seq.resize(bh.read_len());
1632     quals.resize(bh.read_len());
1633     }
1634     else
1635     {
1636     seq = "*";
1637     }
1638    
1639     if (qualities)
1640     {
1641     quals = qualities;
1642     quals.resize(bh.read_len());
1643     }
1644     else
1645     {
1646     quals = "*";
1647     }
1648    
1649     uint32_t sam_flag = 0;
1650     if (bh.antisense_align())
1651     {
1652     sam_flag |= 0x0010; // BAM_FREVERSE
1653     if (sequence && !from_bowtie) // if it is from bowtie hit, it's already reversed.
1654     {
1655     reverse_complement(seq);
1656     reverse(quals.begin(), quals.end());
1657     }
1658     }
1659    
1660     uint32_t sam_pos = bh.left() + 1;
1661     uint32_t map_quality = 255;
1662     char cigar[256];
1663     cigar[0] = 0;
1664     string mate_ref_name = "*";
1665     uint32_t mate_pos = 0;
1666     uint32_t insert_size = 0;
1667     //string qualities = "*";
1668    
1669     const vector<CigarOp>& bh_cigar = bh.cigar();
1670    
1671     /*
1672     * In addition to calculating the cigar string,
1673     * we need to figure out how many in/dels are in the
1674     * sequence, so that we can give the correct
1675     * value for the NM tag
1676     */
1677     int indel_distance = 0;
1678     for (size_t c = 0; c < bh_cigar.size(); ++c)
1679     {
1680     char ibuf[64];
1681     sprintf(ibuf, "%d", bh_cigar[c].length);
1682     switch(bh_cigar[c].opcode)
1683     {
1684     case MATCH:
1685     strcat(cigar, ibuf);
1686     strcat(cigar, "M");
1687     break;
1688     case INS:
1689     strcat(cigar, ibuf);
1690     strcat(cigar, "I");
1691     indel_distance += bh_cigar[c].length;
1692     break;
1693     case DEL:
1694     strcat(cigar, ibuf);
1695     strcat(cigar, "D");
1696     indel_distance += bh_cigar[c].length;
1697     break;
1698     case REF_SKIP:
1699     strcat(cigar, ibuf);
1700     strcat(cigar, "N");
1701     break;
1702     default:
1703     break;
1704     }
1705     }
1706    
1707     //string q = string(bh.read_len(), '!');
1708     //string s = string(bh.read_len(), 'N');
1709    
1710     fprintf(fout,
1711     "%s\t%d\t%s\t%d\t%d\t%s\t%s\t%d\t%d\t%s\t%s",
1712     read_name,
1713     sam_flag,
1714     ref_name,
1715     sam_pos,
1716     map_quality,
1717     cigar,
1718     mate_ref_name.c_str(),
1719     mate_pos,
1720     insert_size,
1721     seq.c_str(),
1722     quals.c_str());
1723    
1724     if (!sam_readgroup_id.empty())
1725     fprintf(fout, "\tRG:Z:%s", sam_readgroup_id.c_str());
1726    
1727     fprintf(fout, "\tNM:i:%d", bh.edit_dist() + indel_distance);
1728    
1729     bool containsSplice = false;
1730     for(vector<CigarOp>::const_iterator itr = bh.cigar().begin(); itr != bh.cigar().end(); ++itr){
1731     if(itr->opcode == REF_SKIP){
1732     containsSplice = true;
1733     break;
1734     }
1735     }
1736    
1737     if (containsSplice)
1738     fprintf(fout, "\tXS:A:%c", bh.antisense_splice() ? '-' : '+');
1739    
1740     fprintf(fout, "\n");
1741     }
1742    
1743     void print_bamhit(GBamWriter& wbam,
1744     const char* read_name,
1745     const BowtieHit& bh,
1746     const char* ref_name,
1747     const char* sequence,
1748     const char* qualities,
1749     bool from_bowtie)
1750     {
1751     string seq;
1752     string quals;
1753     if (sequence) {
1754     seq = sequence;
1755     quals = qualities;
1756     seq.resize(bh.read_len());
1757     quals.resize(bh.read_len());
1758     }
1759     else {
1760     seq = "*";
1761     }
1762     if (qualities) {
1763     quals = qualities;
1764     quals.resize(bh.read_len());
1765     }
1766     else
1767     {
1768     quals = "*";
1769     }
1770    
1771     uint32_t sam_flag = 0;
1772     if (bh.antisense_align())
1773     {
1774     sam_flag |= 0x0010; // BAM_FREVERSE
1775     if (sequence && !from_bowtie) // if it is from bowtie hit, it's already reversed.
1776     {
1777     reverse_complement(seq);
1778     reverse(quals.begin(), quals.end());
1779     }
1780     }
1781    
1782     uint32_t sam_pos = bh.left() + 1;
1783     uint32_t map_quality = 255;
1784     char cigar[256];
1785     cigar[0] = 0;
1786     string mate_ref_name = "*";
1787     uint32_t mate_pos = 0;
1788     uint32_t insert_size = 0;
1789     //string qualities = "*";
1790    
1791     const vector<CigarOp>& bh_cigar = bh.cigar();
1792     /*
1793     * In addition to calculating the cigar string,
1794     * we need to figure out how many in/dels are in the
1795     * sequence, so that we can give the correct
1796     * value for the NM tag
1797     */
1798     int indel_distance = 0;
1799     for (size_t c = 0; c < bh_cigar.size(); ++c)
1800     {
1801     char ibuf[64];
1802     sprintf(ibuf, "%d", bh_cigar[c].length);
1803     switch(bh_cigar[c].opcode)
1804     {
1805     case MATCH:
1806     strcat(cigar, ibuf);
1807     strcat(cigar, "M");
1808     break;
1809     case INS:
1810     strcat(cigar, ibuf);
1811     strcat(cigar, "I");
1812     indel_distance += bh_cigar[c].length;
1813     break;
1814     case DEL:
1815     strcat(cigar, ibuf);
1816     strcat(cigar, "D");
1817     indel_distance += bh_cigar[c].length;
1818     break;
1819     case REF_SKIP:
1820     strcat(cigar, ibuf);
1821     strcat(cigar, "N");
1822     break;
1823     default:
1824     break;
1825     }
1826     }
1827    
1828     //string q = string(bh.read_len(), '!');
1829     //string s = string(bh.read_len(), 'N');
1830     /*
1831     fprintf(fout,
1832     "%s\t%d\t%s\t%d\t%d\t%s\t%s\t%d\t%d\t%s\t%s",
1833     read_name,
1834     sam_flag,
1835     ref_name,
1836     sam_pos,
1837     map_quality,
1838     cigar,
1839     mate_ref_name.c_str(),
1840     mate_pos,
1841     insert_size,
1842     seq.c_str(),
1843     quals.c_str());
1844    
1845     fprintf(fout, "\tNM:i:%d", bh.edit_dist() + indel_distance);
1846     */
1847     vector<string> auxdata;
1848    
1849     if (!sam_readgroup_id.empty())
1850     {
1851     string nm("RG:Z:");
1852     nm += sam_readgroup_id;
1853     auxdata.push_back(nm);
1854     }
1855    
1856     string nm("NM:i:");
1857     str_appendInt(nm, bh.edit_dist() + indel_distance);
1858     auxdata.push_back(nm);
1859     bool containsSplice = false;
1860     for(vector<CigarOp>::const_iterator itr = bh.cigar().begin(); itr != bh.cigar().end(); ++itr)
1861     if(itr->opcode == REF_SKIP){
1862     containsSplice = true;
1863     break;
1864     }
1865    
1866     if (containsSplice) {
1867     //fprintf(fout, "\tXS:A:%c", bh.antisense_splice() ? '-' : '+');
1868     nm="XS:A:";
1869     nm+=(char)(bh.antisense_splice() ? '-' : '+');
1870     auxdata.push_back(nm);
1871     }
1872    
1873     GBamRecord *brec = wbam.new_record(read_name, sam_flag, ref_name, sam_pos, map_quality,
1874     cigar, mate_ref_name.c_str(), mate_pos,
1875     insert_size, seq.c_str(), quals.c_str(), &auxdata);
1876    
1877    
1878    
1879     wbam.write(brec);
1880     delete brec;
1881     }
1882    
1883     /**
1884     * Print a vector of cigar operations to a file.
1885     * @param bh_cigar A vector of CigarOps
1886     * @return a string representation of the cigar string
1887     */
1888     std::string print_cigar(vector<CigarOp>& bh_cigar){
1889     char cigar[256];
1890     cigar[0] = 0;
1891     for (size_t c = 0; c < bh_cigar.size(); ++c)
1892     {
1893     char ibuf[64];
1894     sprintf(ibuf, "%d", bh_cigar[c].length);
1895     switch(bh_cigar[c].opcode)
1896     {
1897     case MATCH:
1898     strcat(cigar, ibuf);
1899     strcat(cigar, "M");
1900     break;
1901     case INS:
1902     strcat(cigar, ibuf);
1903     strcat(cigar, "I");
1904     break;
1905     case DEL:
1906     strcat(cigar, ibuf);
1907     strcat(cigar, "D");
1908     break;
1909     case REF_SKIP:
1910     strcat(cigar, ibuf);
1911     strcat(cigar, "N");
1912     break;
1913     default:
1914     break;
1915     }
1916     }
1917     string result(cigar);
1918     return result;
1919     }
1920    
1921     bool BowtieHit::check_editdist_consistency(const RefSequenceTable& rt)
1922     {
1923     RefSequenceTable::Sequence* ref_str = rt.get_seq(_ref_id);
1924     if (!ref_str)
1925     return false;
1926    
1927     const seqan::Dna5String ref_seq = seqan::infix(*ref_str, _left, right());
1928    
1929     size_t pos_seq = 0;
1930     size_t pos_ref = 0;
1931     size_t mismatch = 0;
1932     for (size_t i = 0; i < _cigar.size(); ++i)
1933     {
1934     CigarOp cigar = _cigar[i];
1935     switch(cigar.opcode)
1936     {
1937     case MATCH:
1938     {
1939     for (size_t j = 0; j < cigar.length; ++j)
1940     {
1941     if (_seq[pos_seq++] != ref_seq[pos_ref++])
1942     ++mismatch;
1943     }
1944     }
1945     break;
1946     case INS:
1947     {
1948     pos_seq += cigar.length;
1949     }
1950     break;
1951    
1952     case DEL:
1953     case REF_SKIP:
1954     {
1955     pos_ref += cigar.length;
1956     }
1957     break;
1958    
1959     default:
1960     break;
1961     }
1962     }
1963    
1964     return mismatch == _edit_dist;
1965     }