ViewVC Help
View File | Revision Log | Show Annotations | View Changeset | Root Listing
root/gclib/tophat_cpp/bwt_map.cpp
Revision: 70
Committed: Tue Sep 20 19:32:58 2011 UTC (8 years, 8 months ago) by gpertea
File size: 52769 byte(s)
Log Message:
wip : spliceCigar()

Line User Rev File contents
1 gpertea 29 /*
2     * bwt_map.cpp
3     * TopHat
4     *
5     * Created by Cole Trapnell on 11/17/08.
6     * Copyright 2008 Cole Trapnell. All rights reserved.
7     *
8     */
9    
10     #ifdef HAVE_CONFIG_H
11     #include <config.h>
12     #endif
13    
14     #include <cassert>
15     #include <cstdlib>
16     #include <iostream>
17     #include <set>
18     #include <vector>
19     #include <cmath>
20    
21     #include "common.h"
22     #include "bwt_map.h"
23     #include "tokenize.h"
24     #include "reads.h"
25    
26     using namespace std;
27    
28     void HitTable::add_hit(const BowtieHit& bh, bool check_uniqueness)
29     {
30     uint32_t reference_id = bh.ref_id();
31    
32     pair<RefHits::iterator, bool> ret =
33     _hits_for_ref.insert(make_pair(reference_id, HitList()));
34     HitList& hl = ret.first->second;
35    
36     if (check_uniqueness)
37     {
38     // Check uniqueness, in case we are adding spliced hits from
39     // several spliced alignment sources (e.g. de novo hashing + Bowtie
40     // against a user-supplied index). We don't want to count the same
41     // alignment twice if it happened to be found by more than one method
42     HitList::const_iterator lb = lower_bound(hl.begin(), hl.end(), bh, hit_insert_id_lt);
43     HitList::const_iterator ub = upper_bound(hl.begin(), hl.end(), bh, hit_insert_id_lt);
44     for (; lb != ub && lb != hl.end(); ++lb)
45     {
46     if (*lb == bh)
47     {
48     //fprintf(stderr, "Chucking duplicate read %d by identity\n", bh.insert_id());
49     return;
50     }
51    
52     if (lb->insert_id() == bh.insert_id() &&
53     lb->ref_id() == bh.ref_id() &&
54     lb->antisense_align() == bh.antisense_align())
55     {
56     // If we get here, we may be looking at the same alignment
57     // However, spanning_reads may report a shorter, trimmed alignment
58     // so not all fields will be equal. If they just disagree on the
59     // ends, and don't indicate a different junction coord, the
60     // alignments are the same.
61    
62     if ((lb->left() <= bh.left() && lb->right() >= bh.right()) ||
63     (bh.left() <= lb->left() && bh.right() >= lb->right()))
64     {
65     vector<pair<int,int> > lb_gaps, bh_gaps;
66     lb->gaps(lb_gaps);
67     bh.gaps(bh_gaps);
68     if (lb_gaps == bh_gaps)
69     {
70     // One alignment is contained in the other, they agree on
71     // where the gaps, if any, are, and they share an id
72     // => this is a redundant aligment, so toss it
73     //fprintf(stderr, "Chucking duplicate read %d by gap agreement\n", bh.insert_id());
74     return;
75     }
76     }
77     }
78     }
79     }
80     _total_hits++;
81     hl.push_back(bh);
82     }
83    
84     bool hit_insert_id_lt(const BowtieHit& h1, const BowtieHit& h2)
85     {
86     return h1.insert_id() < h2.insert_id();
87     }
88    
89     void LineHitFactory::openStream(HitStream& hs)
90     {
91     if (hs._hit_file==NULL && !hs._hit_file_name.empty()) {
92     //open the file for HitStream here
93     hs._hit_file=fopen(hs._hit_file_name.c_str(),"r");
94     if (hs._hit_file==NULL)
95     err_die("Error opening HitStream file %s\n",hs._hit_file_name.c_str());
96     return;
97     }
98     if (hs._fzpipe!=NULL) {
99     hs._hit_file=hs._fzpipe->file;
100     }
101     }
102    
103     void LineHitFactory::rewind(HitStream& hs)
104     {
105     if (hs._fzpipe!=NULL) {
106     hs._fzpipe->rewind();
107     hs._hit_file=hs._fzpipe->file;
108     }
109     else if (hs._hit_file) ::rewind((FILE*)(hs._hit_file));
110     }
111    
112     bool LineHitFactory::next_record(HitStream& hs, const char*& buf, size_t& buf_size) {
113     FILE* f=(FILE *)(hs._hit_file);
114     bool new_rec = (fgets(_hit_buf, _hit_buf_max_sz - 1, f)!=NULL);
115     if (!new_rec || feof(f)) {
116     hs._eof=true;
117     return false;
118     }
119     ++_line_num;
120     char* nl = strrchr(_hit_buf, '\n');
121     if (nl) *nl = 0;
122     buf = _hit_buf;
123     buf_size = _hit_buf_max_sz - 1;
124     return true;
125     }
126    
127     void LineHitFactory::closeStream(HitStream& hs) {
128     if (hs._fzpipe!=NULL) {
129     hs._fzpipe->close();
130     return;
131     }
132     if (hs._hit_file!=NULL) {
133     fclose((FILE*)(hs._hit_file));
134     hs._hit_file=NULL;
135     }
136     }
137     void BAMHitFactory::openStream(HitStream& hs) {
138     if (hs._hit_file==NULL) {
139     if (hs._hit_file_name.empty())
140     //err_die("Error: invalid HitStream set for BAMHitFactory(file name missing)\n");
141     return; //invalid stream, could be just a place holder
142     //open the file here if not already open
143     string fext=getFext(hs._hit_file_name);
144     if (fext=="sam")
145     hs._hit_file = samopen(hs._hit_file_name.c_str(), "r", 0);
146     else
147     hs._hit_file = samopen(hs._hit_file_name.c_str(), "rb", 0);
148    
149     samfile_t* sam_file=(samfile_t*)(hs._hit_file);
150    
151     if (sam_file == NULL)
152     err_die("Error opening SAM file %s\n", hs._hit_file_name.c_str());
153     if (sam_file->header == NULL)
154     err_die("Error: no SAM header found for file %s\n", hs._hit_file_name.c_str());
155     memset(&_next_hit, 0, sizeof(_next_hit));
156     //_beginning = bgzf_tell(sam_file->x.bam);
157     _sam_header=sam_file->header;
158     if (inspect_header(hs) == false)
159     err_die("Error: invalid SAM header for file %s\n",
160     hs._hit_file_name.c_str());
161     }
162     }
163    
164     void BAMHitFactory::closeStream(HitStream& hs) {
165     if (hs._hit_file) {
166     samclose((samfile_t*)(hs._hit_file));
167     }
168     hs._hit_file=NULL;
169     _sam_header=NULL;
170     }
171    
172     void BAMHitFactory::rewind(HitStream& hs)
173     {
174     /*
175     if (_hit_file && ((samfile_t*)_hit_file)->x.bam)
176     {
177     bgzf_seek(((samfile_t*)_hit_file)->x.bam, _beginning, SEEK_SET);
178     _eof = false;
179     }
180     */
181     this->closeStream(hs);
182     this->openStream(hs);
183     }
184    
185     string BAMHitFactory::hitfile_rec(HitStream& hs, const char* hit_buf) {
186     const bam1_t* bamrec=(const bam1_t*)hit_buf;
187     char* tamline=bam_format1(((samfile_t*)(hs._hit_file))->header, bamrec);
188     string sam_line(tamline);
189     free(tamline);
190     return sam_line;
191     }
192    
193     bool BAMHitFactory::next_record(HitStream& hs, const char*& buf, size_t& buf_size) {
194     if (_next_hit.data) {
195     free(_next_hit.data);
196     _next_hit.data = NULL;
197     }
198     _sam_header=((samfile_t*)(hs._hit_file))->header; //needed by get_hit_from_buf later on
199     if (hs.eof() || !hs.ready()) return false;
200    
201     //mark_curr_pos();
202    
203     memset(&_next_hit, 0, sizeof(_next_hit));
204    
205     int bytes_read = samread((samfile_t*)(hs._hit_file), &_next_hit);
206     if (bytes_read <= 0) {
207     hs._eof = true;
208     return false;
209     }
210     buf = (const char*)&_next_hit;
211     buf_size = bytes_read;
212     return true;
213     }
214    
215    
216     BowtieHit HitFactory::create_hit(const string& insert_name,
217     const string& ref_name,
218     int left,
219     const vector<CigarOp>& cigar,
220     bool antisense_aln,
221     bool antisense_splice,
222     unsigned char edit_dist,
223     unsigned char splice_mms,
224     bool end)
225     {
226     uint64_t insert_id = _insert_table.get_id(insert_name);
227     uint32_t reference_id = _ref_table.get_id(ref_name, NULL, 0);
228    
229     return BowtieHit(reference_id,
230     insert_id,
231     left,
232     cigar,
233     antisense_aln,
234     antisense_splice,
235     edit_dist,
236     splice_mms,
237     end);
238     }
239    
240     BowtieHit HitFactory::create_hit(const string& insert_name,
241     const string& ref_name,
242     uint32_t left,
243     uint32_t read_len,
244     bool antisense_aln,
245     unsigned char edit_dist,
246     bool end)
247     {
248     uint64_t insert_id = _insert_table.get_id(insert_name);
249     uint32_t reference_id = _ref_table.get_id(ref_name, NULL, 0);
250    
251     return BowtieHit(reference_id,
252     insert_id,
253     left,
254     read_len,
255     antisense_aln,
256     edit_dist,
257     end);
258     }
259    
260     bool BowtieHitFactory::get_hit_from_buf(const char* orig_bwt_buf,
261     BowtieHit& bh,
262     bool strip_slash,
263     char* name_out,
264     char* name_tags,
265     char* seq,
266     char* qual)
267     {
268     if (!orig_bwt_buf || !*orig_bwt_buf)
269     return false;
270    
271     static const int buf_size = 2048;
272    
273     char bwt_buf[buf_size];
274     strcpy(bwt_buf, orig_bwt_buf);
275     //const char* bwt_fmt_str = "%s %c %s %d %s %s %d %s";
276    
277     char orientation;
278    
279     //int bwtf_ret = 0;
280     //uint32_t seqid = 0;
281    
282     char text_name[buf_size];
283     unsigned int text_offset;
284    
285    
286     //unsigned int other_occs;
287     char mismatches[buf_size];
288     //memset(mismatches, 0, sizeof(mismatches));
289    
290     const char* buf = bwt_buf;
291     char* name = get_token((char**)&buf,"\t");
292     char* orientation_str = get_token((char**)&buf,"\t");
293     char* text_name_str = get_token((char**)&buf,"\t");
294    
295     strcpy(text_name, text_name_str);
296    
297     char* text_offset_str = get_token((char**)&buf,"\t");
298     char* seq_str = get_token((char**)&buf,"\t");
299     if (seq)
300     strcpy(seq, seq_str);
301    
302     const char* qual_str = get_token((char**)&buf,"\t");
303     if (qual)
304     strcpy(qual, qual_str);
305    
306     /*const char* other_occs_str =*/ get_token((char**)&buf, "\t");
307     mismatches[0] = 0;
308     char* mismatches_str = get_token((char**)&buf, "\t");
309     if (mismatches_str)
310     strcpy(mismatches, mismatches_str);
311    
312     orientation = orientation_str[0];
313     text_offset = atoi(text_offset_str);
314    
315     bool end = true;
316     unsigned int seg_offset = 0;
317     unsigned int seg_num = 0;
318     unsigned int num_segs = 0;
319    
320     // Copy the tag out of the name field before we might wipe it out
321     char* pipe = strrchr(name, '|');
322     if (pipe)
323     {
324     if (name_tags)
325     strcpy(name_tags, pipe);
326    
327     char* tag_buf = pipe + 1;
328     if (strchr(tag_buf, ':'))
329     {
330     sscanf(tag_buf, "%u:%u:%u", &seg_offset, &seg_num, &num_segs);
331     if (seg_num + 1 == num_segs)
332     end = true;
333     else
334     end = false;
335     }
336    
337     *pipe = 0;
338     }
339     // Stripping the slash and number following it gives the insert name
340     char* slash = strrchr(name, '/');
341     if (strip_slash && slash)
342     *slash = 0;
343    
344     size_t read_len = strlen(seq_str);
345    
346     // Add this alignment to the table of hits for this half of the
347     // Bowtie map
348    
349     //vector<string> mismatch_toks;
350     char* pch = strtok (mismatches,",");
351     unsigned char num_mismatches = 0;
352     while (pch != NULL)
353     {
354     char* colon = strchr(pch, ':');
355     if (colon)
356     {
357     num_mismatches++;
358     }
359     //mismatch_toks.push_back(pch);
360     pch = strtok (NULL, ",");
361     }
362    
363     bh = create_hit(name,
364     text_name,
365     text_offset,
366     (int)read_len,
367     orientation == '-',
368     num_mismatches,
369     end);
370    
371     return true;
372     }
373    
374     int anchor_mismatch = 0;
375    
376 gpertea 70
377     void parseSegReadName(char* name, char*& name_tags, bool strip_slash,
378     bool &end, unsigned int &seg_offset, unsigned int& seg_num,
379     unsigned int & num_segs) {
380     char* pipe = strrchr(name, '|');
381     if (pipe)
382     {
383     if (name_tags)
384     strcpy(name_tags, pipe);
385    
386     char* tag_buf = pipe + 1;
387     if (strchr(tag_buf, ':'))
388     {
389     sscanf(tag_buf, "%u:%u:%u", &seg_offset, &seg_num, &num_segs);
390     if (seg_num + 1 == num_segs)
391     end = true;
392     else
393     end = false;
394     }
395    
396     *pipe = 0;
397     }
398    
399     // Stripping the slash and number following it gives the insert name
400     char* slash = strrchr(name, '/');
401     if (strip_slash && slash)
402     *slash = 0;
403     }
404    
405    
406 gpertea 29 bool SplicedBowtieHitFactory::get_hit_from_buf(const char* orig_bwt_buf,
407     BowtieHit& bh,
408     bool strip_slash,
409     char* name_out,
410     char* name_tags,
411     char* seq,
412     char* qual)
413     {
414     if (!orig_bwt_buf || !*orig_bwt_buf)
415     return false;
416    
417     static const int buf_size = 2048;
418    
419     char bwt_buf[buf_size];
420     strcpy(bwt_buf, orig_bwt_buf);
421     //const char* bwt_fmt_str = "%s %c %s %d %s %s %d %s";
422    
423     char orientation;
424     char text_name[buf_size];
425     unsigned int text_offset;
426     char mismatches[buf_size];
427     //memset(mismatches, 0, sizeof(mismatches));
428    
429     char* buf = bwt_buf;
430     char* name = get_token((char**)&buf,"\t");
431     char* orientation_str = get_token((char**)&buf,"\t");
432     char* text_name_str = get_token((char**)&buf,"\t");
433     strcpy(text_name, text_name_str);
434    
435     char* text_offset_str = get_token((char**)&buf,"\t");
436     char* seq_str = get_token((char**)&buf,"\t");
437     if (seq)
438     strcpy(seq, seq_str);
439    
440     const char* qual_str = get_token((char**)&buf,"\t");
441     if (qual)
442     strcpy(qual, qual_str);
443    
444     /*const char* other_occs_str =*/ get_token((char**)&buf, "\t");
445     mismatches[0] = 0;
446     char* mismatches_str = get_token((char**)&buf, "\t");
447     if (mismatches_str)
448     strcpy(mismatches, mismatches_str);
449    
450     orientation = orientation_str[0];
451     text_offset = atoi(text_offset_str);
452    
453     bool end = true;
454     unsigned int seg_offset = 0;
455     unsigned int seg_num = 0;
456     unsigned int num_segs = 0;
457    
458     // Copy the tag out of the name field before we might wipe it out
459 gpertea 70 parseSegReadName(name, name_tags, strip_slash, end, seg_offset, seg_num, num_segs);
460 gpertea 29
461     // Add this alignment to the table of hits for this half of the
462     // Bowtie map
463    
464     // Parse the text_name field to recover the splice coords
465     vector<string> toks;
466    
467     tokenize_strict(text_name, "|", toks);
468    
469     int num_extra_toks = (int)toks.size() - 6;
470    
471     if (num_extra_toks >= 0)
472     {
473     static const uint8_t left_window_edge_field = 1;
474     static const uint8_t splice_field = 2;
475     //static const uint8_t right_window_edge_field = 3;
476     static const uint8_t junction_type_field = 4;
477     static const uint8_t strand_field = 5;
478    
479     string contig = toks[0];
480     for (int t = 1; t <= num_extra_toks; ++t)
481     {
482     contig += "|";
483     contig += toks[t];
484     }
485    
486     vector<string> splice_toks;
487     tokenize(toks[num_extra_toks + splice_field], "-", splice_toks);
488     if (splice_toks.size() != 2)
489     {
490     fprintf(stderr, "Warning: found malformed splice record, skipping:\n");
491     //fprintf(stderr, "%s (token: %s)\n", text_name,
492     // toks[num_extra_toks + splice_field].c_str());
493     return false;
494     }
495    
496     //
497     // check for an insertion hit
498     //
499     if(toks[num_extra_toks + junction_type_field] == "ins")
500     {
501     int8_t spliced_read_len = strlen(seq_str);
502     /*
503     * The 0-based position of the left edge of the alignment. Note that this
504     * value may need to be futher corrected to account for the presence of
505     * of the insertion.
506     */
507     uint32_t left = atoi(toks[num_extra_toks + left_window_edge_field].c_str()) + text_offset;
508     uint32_t right = left + spliced_read_len - 1;
509    
510     /*
511     * The 0-based position of the last genomic sequence before the insertion
512     */
513     uint32_t left_splice_pos = atoi(splice_toks[0].c_str());
514    
515     string insertedSequence = splice_toks[1];
516     /*
517     * The 0-based position of the first genomic sequence after teh insertion
518     */
519     uint32_t right_splice_pos = left_splice_pos + 1;
520     if(left > left_splice_pos){
521     /*
522     * The genomic position of the left edge of the alignment needs to be corrected
523     * If the alignment does not extend into the insertion, simply subtract the length
524     * of the inserted sequence, otherwise, just set it equal to the right edge
525     */
526     left = left - insertedSequence.length();
527     if(left < right_splice_pos){
528     left = right_splice_pos;
529     }
530     }
531     if(right > left_splice_pos){
532     right = right - insertedSequence.length();
533     if(right < left_splice_pos){
534     right = left_splice_pos;
535     }
536     }
537     /*
538     * Now, right and left should be properly transformed into genomic coordinates
539     * We should be able to deduce how much the alignment matches the insertion
540     * simply based on the length of the read
541     */
542     int left_match_length = 0;
543     if(left <= left_splice_pos){
544     left_match_length = left_splice_pos - left + 1;
545     }
546     int right_match_length = 0;
547     if(right >= right_splice_pos){
548     right_match_length = right - right_splice_pos + 1;
549     }
550     int insertion_match_length = spliced_read_len - left_match_length - right_match_length;
551    
552     if(left_match_length <= 0 || right_match_length <= 0 || insertion_match_length <= 0)
553     return false;
554    
555     string junction_strand = toks[num_extra_toks + strand_field];
556     if(junction_strand != "rev" && junction_strand != "fwd"){
557     fprintf(stderr, "Malformed insertion record\n");
558     return false;
559     }
560    
561     char* pch = strtok( mismatches, ",");
562     unsigned char num_mismatches = 0;
563    
564     /*
565     * remember that text_offset holds the left end of the
566     * alignment, relative to the start of the contig
567     */
568    
569     /*
570     * The 0-based relative position of the left-most character
571     * before the insertion in the contig
572     */
573     int relative_splice_pos = left_splice_pos - atoi(toks[num_extra_toks + left_window_edge_field].c_str());
574     while (pch != NULL)
575     {
576     char* colon = strchr(pch, ':');
577     if (colon)
578     {
579     *colon = 0;
580     int mismatch_pos = atoi(pch);
581    
582     /*
583     * for reversely mapped reads,
584     * find the correct mismatched position.
585     */
586     if(orientation == '-'){
587     mismatch_pos = spliced_read_len - mismatch_pos - 1;
588     }
589    
590     /*
591     * Only count mismatches outside of the insertion region
592     * If there is a mismatch within the insertion,
593     * disallow this hit
594     */
595     if(mismatch_pos + text_offset <= relative_splice_pos || mismatch_pos + text_offset > relative_splice_pos + insertedSequence.length()){
596     num_mismatches++;
597     }else{
598     return false;
599     }
600     }
601     pch = strtok (NULL, ",");
602     }
603    
604    
605     vector<CigarOp> cigar;
606     cigar.push_back(CigarOp(MATCH, left_match_length));
607     cigar.push_back(CigarOp(INS, insertion_match_length));
608     cigar.push_back(CigarOp(MATCH, right_match_length));
609    
610     /*
611     * For now, disallow hits that don't span
612     * the insertion. If we allow these types of hits,
613     * then long_spanning.cpp needs to be updated
614     * in order to intelligently merge these kinds
615     * of reads back together
616     *
617     * Following code has been changed to allow segment that end
618     * in an insertion
619     */
620     bh = create_hit(name,
621     contig,
622     left,
623     cigar,
624     orientation == '-',
625     junction_strand == "rev",
626     num_mismatches,
627     0,
628     end);
629     return true;
630     }
631    
632     else
633     {
634     uint32_t left = atoi(toks[num_extra_toks + left_window_edge_field].c_str()) + text_offset;
635     int spliced_read_len = strlen(seq_str);
636     int8_t left_splice_pos = atoi(splice_toks[0].c_str()) - left + 1;
637     if(left_splice_pos > spliced_read_len) left_splice_pos = spliced_read_len;
638     int8_t right_splice_pos = spliced_read_len - left_splice_pos;
639    
640     if (right_splice_pos <= 0 || left_splice_pos <= 0)
641     return false;
642    
643     if (orientation == '+')
644     {
645     if (left_splice_pos + seg_offset < _anchor_length){
646     return false;
647     }
648     }
649     else
650     {
651     if (right_splice_pos + seg_offset < _anchor_length)
652     return false;
653     }
654     //uint32_t right = atoi(splice_toks[1].c_str()) + right_splice_pos;
655     //atoi(toks[right_window_edge_field].c_str());
656     int gap_len = atoi(splice_toks[1].c_str()) - atoi(splice_toks[0].c_str()) - 1;
657    
658     string junction_strand = toks[num_extra_toks + strand_field];
659     if (!(junction_strand == "rev" || junction_strand == "fwd")||
660     !(orientation == '-' || orientation == '+'))
661     {
662     fprintf(stderr, "Warning: found malformed splice record, skipping\n");
663     //fprintf(stderr, "junction_strand=%s, orientation='%c'\n",
664     // junction_strand.c_str(), orientation);
665     return false;
666     }
667    
668     //vector<string> mismatch_toks;
669     char* pch = strtok (mismatches,",");
670     int mismatches_in_anchor = 0;
671     unsigned char num_mismatches = 0;
672     while (pch != NULL)
673     {
674     char* colon = strchr(pch, ':');
675     if (colon)
676     {
677     *colon = 0;
678     num_mismatches++;
679     int mismatch_pos = atoi(pch);
680     if ((orientation == '+' && abs(mismatch_pos - left_splice_pos) < (int)min_anchor_len) ||
681     (orientation == '-' && abs(((int)spliced_read_len - left_splice_pos + 1) - mismatch_pos)) < (int)min_anchor_len)
682     mismatches_in_anchor++;
683     }
684     //mismatch_toks.push_back(pch);
685     pch = strtok (NULL, ",");
686     }
687    
688     // FIXME: we probably should exclude these hits somewhere, but this
689     // isn't the right place
690     vector<CigarOp> cigar;
691     cigar.push_back(CigarOp(MATCH, left_splice_pos));
692     if(toks[num_extra_toks + junction_type_field] == "del"){
693     cigar.push_back(CigarOp(DEL, gap_len));
694     }else{
695     cigar.push_back(CigarOp(REF_SKIP, gap_len));
696     }
697     cigar.push_back(CigarOp(MATCH, right_splice_pos));
698    
699     bh = create_hit(name,
700     contig,
701     left,
702     cigar,
703     orientation == '-',
704     junction_strand == "rev",
705     num_mismatches,
706     mismatches_in_anchor,
707     end);
708     return true;
709     }
710     }
711     else
712     {
713     fprintf(stderr, "Warning: found malformed splice record, skipping\n");
714     //fprintf(stderr, "%s\n", orig_bwt_buf);
715     // continue;
716     return false;
717     }
718    
719     return false;
720     }
721    
722 gpertea 70
723     char parseCigar(vector<CigarOp>& cigar, const char* cigar_str,
724     bool &spliced_alignment) {
725     // Mostly pilfered direct from the SAM tools:
726     const char* p_cig = cigar_str;
727     while (*p_cig)
728     {
729     char* t;
730     int length = (int)strtol(p_cig, &t, 10);
731     if (length <= 0)
732     {
733     //fprintf (stderr, "CIGAR op has zero length\n");
734     return false;
735     }
736     char op_char = toupper(*t);
737     CigarOpCode opcode;
738     if (op_char == 'M') opcode = MATCH;
739     else if (op_char == 'I') opcode = INS;
740     else if (op_char == 'D') opcode = DEL;
741     else if (op_char == 'N')
742     {
743     if (length > max_report_intron_length)
744     return false;
745     opcode = REF_SKIP;
746     spliced_alignment = true;
747     }
748     else if (op_char == 'S') opcode = SOFT_CLIP;
749     else if (op_char == 'H') opcode = HARD_CLIP;
750     else if (op_char == 'P') opcode = PAD;
751     else
752     {
753     fprintf (stderr, "invalid CIGAR operation\n");
754     return false;
755     }
756     p_cig = t + 1;
757     //i += length;
758     cigar.push_back(CigarOp(opcode, length));
759     }
760     if (*p_cig) {
761     fprintf (stderr, "unmatched CIGAR operation\n");
762     return *p_cig;
763     }
764     return 0;
765     }
766    
767     void getSAMmismatches(char* &buf, vector<CigarOp>& cigar,
768     int* mismatches, int& num_mismatches, int& sam_nm, bool& antisense_splice) {
769     const char* tag_buf = buf;
770     sam_nm=0;
771     num_mismatches=0;
772     while((tag_buf = get_token((char**)&buf,"\t")))
773     {
774     vector<string> tuple_fields;
775     tokenize(tag_buf,":", tuple_fields);
776     if (tuple_fields.size() == 3)
777     {
778     if (tuple_fields[0] == "XS")
779     {
780     if (tuple_fields[2] == "-")
781     antisense_splice = true;
782     }
783     else if (tuple_fields[0] == "NM")
784     {
785     sam_nm = atoi(tuple_fields[2].c_str());
786     }
787     else if (tuple_fields[0] == "NS")
788     {
789     //ignored for now
790     }
791     else if (tuple_fields[0] == "MD")
792     {
793     const char* p=tuple_fields[2].c_str();
794     int bi=0; //base offset position in the read
795     while (*p != 0) {
796     if (isdigit(*p)) {
797     int v=atoi(p);
798     do { p++; } while (isdigit(*p));
799     bi+=v;
800     }
801     while (isupper(*p)) {
802     p++;
803     mismatches[num_mismatches]=bi;
804     num_mismatches++;
805     bi++;
806     }
807     if (*p=='^') { //reference deletion
808     p++;
809     while (isupper(*p)) { //insert read bases
810     p++; bi++;
811     }
812     }
813     }
814     }
815     else
816     {
817     //fprintf(stderr, "%s attribute not supported\n", tuple_fields[0].c_str());
818     //return false;
819     }
820     }
821     }
822     /* By convention,the NM field of the SAM record
823     * counts an insertion or deletion. I dont' think
824     * we want the mismatch count in the BowtieHit
825     * record to reflect this. Therefore, subtract out
826     * the mismatches due to in/dels
827     */
828     for(vector<CigarOp>::const_iterator itr = cigar.begin(); itr != cigar.end(); ++itr){
829     if(itr->opcode == INS){
830     sam_nm -= itr->length;
831     }
832     if(itr->opcode == DEL){
833     sam_nm -= itr->length;
834     }
835     }
836     }
837    
838 gpertea 29 bool SAMHitFactory::get_hit_from_buf(const char* orig_bwt_buf,
839     BowtieHit& bh,
840     bool strip_slash,
841     char* name_out,
842     char* name_tags,
843     char* seq,
844     char* qual)
845     {
846     if (!orig_bwt_buf || !*orig_bwt_buf)
847     return false;
848     char bwt_buf[2048];
849     strcpy(bwt_buf, orig_bwt_buf);
850     // Are we still in the header region?
851     if (bwt_buf[0] == '@')
852     return false;
853    
854     char* buf = bwt_buf;
855     char* name = get_token((char**)&buf,"\t");
856     char* sam_flag_str = get_token((char**)&buf,"\t");
857     char* text_name = get_token((char**)&buf,"\t");
858     char* text_offset_str = get_token((char**)&buf,"\t");
859     const char* map_qual_str = get_token((char**)&buf,"\t");
860     char* cigar_str = get_token((char**)&buf,"\t");
861     const char* mate_ref_str = get_token((char**)&buf,"\t");
862     const char* mate_pos_str = get_token((char**)&buf,"\t");
863     const char* inferred_insert_sz_str = get_token((char**)&buf,"\t");
864    
865     const char* seq_str = get_token((char**)&buf,"\t");
866     if (seq)
867     strcpy(seq, seq_str);
868    
869     const char* qual_str = get_token((char**)&buf,"\t");
870     if (qual)
871     strcpy(qual, qual_str);
872    
873     if (!name ||
874     !sam_flag_str ||
875     !text_name ||
876     !text_offset_str ||
877     !map_qual_str ||
878     !cigar_str ||
879     !mate_ref_str ||
880     !mate_pos_str ||
881     !inferred_insert_sz_str ||
882     !seq_str ||
883     !qual_str)
884     {
885     // truncated or malformed SAM record
886     return false;
887     }
888    
889    
890     int sam_flag = atoi(sam_flag_str);
891     int text_offset = atoi(text_offset_str);
892    
893     bool end = true;
894     unsigned int seg_offset = 0;
895     unsigned int seg_num = 0;
896     unsigned int num_segs = 0;
897    
898     // Copy the tag out of the name field before we might wipe it out
899 gpertea 70 parseSegReadName(name, name_tags, strip_slash, end, seg_offset, seg_num, num_segs);
900 gpertea 29
901     vector<CigarOp> cigar;
902     bool spliced_alignment = false;
903 gpertea 70 if (parseCigar(cigar, cigar_str, spliced_alignment)) {
904     return false;
905     }
906 gpertea 29
907     //vector<string> attributes;
908     //tokenize(tag_buf, " \t",attributes);
909 gpertea 70
910 gpertea 29 bool antisense_splice = false;
911 gpertea 70 int num_mismatches = 0;
912     int sam_nm = 0; //the value of the NM tag (edit distance)
913     int mismatches[1024];
914     getSAMmismatches(buf, cigar, mismatches, num_mismatches, sam_nm, antisense_splice);
915 gpertea 29
916     if (spliced_alignment)
917     {
918     bh = create_hit(name,
919     text_name,
920     text_offset - 1,
921     cigar,
922     sam_flag & 0x0010,
923     antisense_splice,
924     num_mismatches,
925 gpertea 70 0,
926 gpertea 29 end);
927     }
928     else
929     {
930     //assert(cigar.size() == 1 && cigar[0].opcode == MATCH);
931     bh = create_hit(name,
932     text_name,
933     text_offset - 1, // SAM files are 1-indexed
934     cigar,
935     sam_flag & 0x0010,
936     false,
937     num_mismatches,
938     0,
939     end);
940     }
941 gpertea 70 return true;
942 gpertea 29 }
943    
944 gpertea 70
945     void spliceCigar(vector<CigarOp>& splcigar, vector<CigarOp>& cigar,
946     int8_t l_len, int mlen, int8_t r_len, CigarOpCode opcode) {
947     //merge the original 'cigar' with the new insert/gap operation
948     // at position l_len into splcigar;
949     //find the original opcode that croses l_len location
950     int ofs=0;
951     bool found=false;
952     for (size_t c = 0 ; c < cigar.size(); ++c)
953     {
954     ofs+=cigar[c].length;
955     if (!found) {
956     if (ofs>=l_len) {
957     found=true;
958     //inject new code here
959     }
960     else {
961     //not found yet, just copy these codes
962     splcigar.push_back(cigar[c]);
963     }
964     }
965     else { //found yet
966    
967     }
968     }
969     }
970    
971 gpertea 69 bool SplicedSAMHitFactory::get_hit_from_buf(const char* orig_bwt_buf,
972     BowtieHit& bh,
973     bool strip_slash,
974     char* name_out,
975     char* name_tags,
976     char* seq,
977     char* qual)
978     {
979     if (!orig_bwt_buf || !*orig_bwt_buf)
980     return false;
981 gpertea 29
982 gpertea 69 char bwt_buf[2048];
983     strcpy(bwt_buf, orig_bwt_buf);
984     // Are we still in the header region?
985     if (bwt_buf[0] == '@')
986     return false;
987 gpertea 29
988 gpertea 69 char* buf = bwt_buf;
989     char* name = get_token((char**)&buf,"\t");
990     char* sam_flag_str = get_token((char**)&buf,"\t");
991     char* text_name = get_token((char**)&buf,"\t");
992     char* text_offset_str = get_token((char**)&buf,"\t");
993     const char* map_qual_str = get_token((char**)&buf,"\t");
994     char* cigar_str = get_token((char**)&buf,"\t");
995     const char* mate_ref_str = get_token((char**)&buf,"\t");
996     const char* mate_pos_str = get_token((char**)&buf,"\t");
997     const char* inferred_insert_sz_str = get_token((char**)&buf,"\t");
998 gpertea 70 int num_mismatches=0;
999     int mismatches[1024]; //list of 0-based mismatch positions in this read
1000     //parsed from SAM's MD:Z: tag
1001 gpertea 69 const char* seq_str = get_token((char**)&buf,"\t");
1002     if (seq)
1003     strcpy(seq, seq_str);
1004    
1005     const char* qual_str = get_token((char**)&buf,"\t");
1006     if (qual)
1007     strcpy(qual, qual_str);
1008    
1009     if (!name ||
1010     !sam_flag_str ||
1011     !text_name ||
1012     !text_offset_str ||
1013     !map_qual_str ||
1014     !cigar_str ||
1015     !mate_ref_str ||
1016     !mate_pos_str ||
1017     !inferred_insert_sz_str ||
1018     !seq_str ||
1019     !qual_str)
1020     {
1021     // truncated or malformed SAM record
1022     return false;
1023     }
1024    
1025    
1026     int sam_flag = atoi(sam_flag_str);
1027     int text_offset = atoi(text_offset_str);
1028 gpertea 70 text_offset--; //SAM is 1-based, Bowtie is 0-based
1029 gpertea 69 bool end = true;
1030     unsigned int seg_offset = 0;
1031     unsigned int seg_num = 0;
1032     unsigned int num_segs = 0;
1033    
1034     // Copy the tag out of the name field before we might wipe it out
1035 gpertea 70 parseSegReadName(name, name_tags, strip_slash, end, seg_offset, seg_num, num_segs);
1036 gpertea 69
1037 gpertea 70 vector<CigarOp> samcigar;
1038     bool spliced_alignment = false;
1039     if (parseCigar(samcigar, cigar_str, spliced_alignment)) {
1040     return false;
1041     }
1042 gpertea 69
1043 gpertea 70 bool antisense_splice = false;
1044     int sam_nm = 0;
1045     getSAMmismatches(buf, samcigar, mismatches, num_mismatches, sam_nm, antisense_splice);
1046 gpertea 69
1047 gpertea 70 //##############################################
1048 gpertea 69
1049     // Add this alignment to the table of hits for this half of the
1050     // Bowtie map
1051    
1052     // Parse the text_name field to recover the splice coords
1053     vector<string> toks;
1054    
1055     tokenize_strict(text_name, "|", toks);
1056    
1057     int num_extra_toks = (int)toks.size() - 6;
1058    
1059     if (num_extra_toks >= 0)
1060     {
1061     static const uint8_t left_window_edge_field = 1;
1062     static const uint8_t splice_field = 2;
1063     //static const uint8_t right_window_edge_field = 3;
1064     static const uint8_t junction_type_field = 4;
1065     static const uint8_t strand_field = 5;
1066    
1067     string contig = toks[0];
1068     for (int t = 1; t <= num_extra_toks; ++t)
1069     {
1070     contig += "|";
1071     contig += toks[t];
1072     }
1073    
1074     vector<string> splice_toks;
1075     tokenize(toks[num_extra_toks + splice_field], "-", splice_toks);
1076     if (splice_toks.size() != 2)
1077     {
1078     fprintf(stderr, "Warning: found malformed splice record, skipping:\n");
1079 gpertea 70 //fprintf(stderr, "\t%s (token: %s)\n", text_name,
1080 gpertea 69 // toks[num_extra_toks + splice_field].c_str());
1081     return false;
1082     }
1083 gpertea 70 int spl_num_mismatches=0;
1084 gpertea 69 //
1085     // check for an insertion hit
1086     //
1087     if(toks[num_extra_toks + junction_type_field] == "ins")
1088     {
1089     int8_t spliced_read_len = strlen(seq_str);
1090 gpertea 70 // The 0-based position of the left edge of the alignment. Note that this
1091     // value may need to be futher corrected to account for the presence of
1092     // of the insertion.
1093 gpertea 69 uint32_t left = atoi(toks[num_extra_toks + left_window_edge_field].c_str()) + text_offset;
1094     uint32_t right = left + spliced_read_len - 1;
1095    
1096 gpertea 70 // The 0-based position of the last genomic sequence before the insertion
1097 gpertea 69 uint32_t left_splice_pos = atoi(splice_toks[0].c_str());
1098    
1099     string insertedSequence = splice_toks[1];
1100 gpertea 70 // The 0-based position of the first genomic sequence after the insertion
1101    
1102 gpertea 69 uint32_t right_splice_pos = left_splice_pos + 1;
1103     if(left > left_splice_pos){
1104     /*
1105     * The genomic position of the left edge of the alignment needs to be corrected
1106     * If the alignment does not extend into the insertion, simply subtract the length
1107     * of the inserted sequence, otherwise, just set it equal to the right edge
1108     */
1109     left = left - insertedSequence.length();
1110     if(left < right_splice_pos){
1111     left = right_splice_pos;
1112     }
1113     }
1114     if(right > left_splice_pos){
1115     right = right - insertedSequence.length();
1116     if(right < left_splice_pos){
1117     right = left_splice_pos;
1118     }
1119     }
1120     /*
1121     * Now, right and left should be properly transformed into genomic coordinates
1122     * We should be able to deduce how much the alignment matches the insertion
1123     * simply based on the length of the read
1124     */
1125     int left_match_length = 0;
1126     if(left <= left_splice_pos){
1127     left_match_length = left_splice_pos - left + 1;
1128     }
1129     int right_match_length = 0;
1130     if(right >= right_splice_pos){
1131     right_match_length = right - right_splice_pos + 1;
1132     }
1133     int insertion_match_length = spliced_read_len - left_match_length - right_match_length;
1134    
1135     if(left_match_length <= 0 || right_match_length <= 0 || insertion_match_length <= 0)
1136     return false;
1137    
1138     string junction_strand = toks[num_extra_toks + strand_field];
1139     if(junction_strand != "rev" && junction_strand != "fwd"){
1140     fprintf(stderr, "Malformed insertion record\n");
1141     return false;
1142     }
1143    
1144 gpertea 70 //char* pch = strtok( mismatches, ",");
1145     //unsigned char num_mismatches = 0;
1146     //text_offset holds the left end of the alignment,
1147     //RELATIVE TO the start of the contig
1148 gpertea 69
1149 gpertea 70 //The 0-based relative position of the left-most character
1150     //before the insertion in the contig
1151 gpertea 69 int relative_splice_pos = left_splice_pos - atoi(toks[num_extra_toks + left_window_edge_field].c_str());
1152 gpertea 70 for (int i=0;i<num_mismatches;i++) {
1153     int mismatch_pos = mismatches[i];
1154     // for reversely mapped reads,
1155     //find the correct mismatched position.
1156     if (sam_flag & 0x0010){
1157     mismatch_pos = spliced_read_len - mismatch_pos - 1;
1158     }
1159 gpertea 69
1160 gpertea 70 //Only count mismatches outside of the insertion region
1161     // If there is a mismatch within the insertion,
1162     // disallow this hit
1163     if(mismatch_pos + text_offset <= relative_splice_pos || mismatch_pos + text_offset > relative_splice_pos + insertedSequence.length()){
1164     spl_num_mismatches++;
1165     }else{
1166     return false;
1167 gpertea 69 }
1168     }
1169    
1170 gpertea 70 //TODO: merge with the original cigar string
1171     vector<CigarOp> splcigar;
1172     spliceCigar(splcigar, samcigar, left_match_length, insertion_match_length, right_match_length, INS);
1173 gpertea 69 /*
1174 gpertea 70 splcigar.push_back(CigarOp(MATCH, left_match_length));
1175     splcigar.push_back(CigarOp(INS, insertion_match_length));
1176     splcigar.push_back(CigarOp(MATCH, right_match_length));
1177     */
1178     /*
1179 gpertea 69 * For now, disallow hits that don't span
1180     * the insertion. If we allow these types of hits,
1181     * then long_spanning.cpp needs to be updated
1182     * in order to intelligently merge these kinds
1183     * of reads back together
1184     *
1185     * Following code has been changed to allow segment that end
1186     * in an insertion
1187     */
1188     bh = create_hit(name,
1189     contig,
1190     left,
1191 gpertea 70 //splcigar,
1192     splcigar,
1193     sam_flag & 0x0010,
1194 gpertea 69 junction_strand == "rev",
1195 gpertea 70 spl_num_mismatches,
1196 gpertea 69 0,
1197     end);
1198     return true;
1199 gpertea 70 } //"ins"
1200 gpertea 69 else
1201     {
1202     uint32_t left = atoi(toks[num_extra_toks + left_window_edge_field].c_str()) + text_offset;
1203     int spliced_read_len = strlen(seq_str);
1204     int8_t left_splice_pos = atoi(splice_toks[0].c_str()) - left + 1;
1205     if(left_splice_pos > spliced_read_len) left_splice_pos = spliced_read_len;
1206     int8_t right_splice_pos = spliced_read_len - left_splice_pos;
1207    
1208     if (right_splice_pos <= 0 || left_splice_pos <= 0)
1209     return false;
1210    
1211     if ((sam_flag & 0x0010) == 0) //######
1212     {
1213 gpertea 70 if (left_splice_pos + seg_offset < _anchor_length)
1214     return false;
1215 gpertea 69 }
1216     else
1217     {
1218 gpertea 70 if (right_splice_pos + seg_offset < _anchor_length)
1219     return false;
1220 gpertea 69 }
1221     //uint32_t right = atoi(splice_toks[1].c_str()) + right_splice_pos;
1222     //atoi(toks[right_window_edge_field].c_str());
1223     int gap_len = atoi(splice_toks[1].c_str()) - atoi(splice_toks[0].c_str()) - 1;
1224    
1225     string junction_strand = toks[num_extra_toks + strand_field];
1226 gpertea 70 if (!(junction_strand == "rev" || junction_strand == "fwd"))
1227     // || !(orientation == '-' || orientation == '+'))
1228 gpertea 69 {
1229 gpertea 70 fprintf(stderr, "Warning: found malformed splice record, skipping\n");
1230     //fprintf(stderr, "junction_strand=%s, orientation='%c'\n",
1231     // junction_strand.c_str(), orientation);
1232     return false;
1233 gpertea 69 }
1234    
1235     int mismatches_in_anchor = 0;
1236 gpertea 70 for (int i=0;i<num_mismatches;i++) {
1237     spl_num_mismatches++;
1238     int mismatch_pos = mismatches[i];
1239     if (((sam_flag & 0x0010) == 0 && abs(mismatch_pos - left_splice_pos) < (int)min_anchor_len) ||
1240     ((sam_flag & 0x0010) != 0 &&
1241     abs(((int)spliced_read_len - left_splice_pos + 1) - mismatch_pos)) < (int)min_anchor_len)
1242     mismatches_in_anchor++;
1243     }
1244 gpertea 69
1245     // FIXME: we probably should exclude these hits somewhere, but this
1246     // isn't the right place
1247 gpertea 70 vector<CigarOp> splcigar;
1248     CigarOpCode opcode=(toks[num_extra_toks + junction_type_field] == "del")? DEL : REF_SKIP ;
1249     spliceCigar(splcigar, samcigar, left_splice_pos, gap_len, right_splice_pos, INS);
1250     /*
1251     splcigar.push_back(CigarOp(MATCH, left_splice_pos));
1252 gpertea 69 if(toks[num_extra_toks + junction_type_field] == "del"){
1253 gpertea 70 splcigar.push_back(CigarOp(DEL, gap_len));
1254 gpertea 69 }else{
1255 gpertea 70 splcigar.push_back(CigarOp(REF_SKIP, gap_len));
1256 gpertea 69 }
1257 gpertea 70 splcigar.push_back(CigarOp(MATCH, right_splice_pos));
1258     */
1259 gpertea 69 bh = create_hit(name,
1260     contig,
1261     left,
1262 gpertea 70 splcigar,
1263     (sam_flag & 0x0010),
1264 gpertea 69 junction_strand == "rev",
1265 gpertea 70 spl_num_mismatches,
1266 gpertea 69 mismatches_in_anchor,
1267     end);
1268     return true;
1269     }
1270 gpertea 70 } //parse splice data
1271 gpertea 69 else
1272     {
1273     fprintf(stderr, "Warning: found malformed splice record, skipping\n");
1274     //fprintf(stderr, "%s\n", orig_bwt_buf);
1275     // continue;
1276     return false;
1277     }
1278    
1279     return false;
1280     }
1281    
1282    
1283 gpertea 29 bool BAMHitFactory::get_hit_from_buf(const char* orig_bwt_buf,
1284     BowtieHit& bh, bool strip_slash,
1285     char* name_out, char* name_tags,
1286     char* seq, char* qual) {
1287     if (_sam_header==NULL)
1288     err_die("Error: no SAM header when BAMHitFactory::get_hit_from_buf()!");
1289     const bam1_t* hit_buf = (const bam1_t*)orig_bwt_buf;
1290    
1291    
1292     uint32_t sam_flag = hit_buf->core.flag;
1293    
1294     int text_offset = hit_buf->core.pos;
1295     int text_mate_pos = hit_buf->core.mpos;
1296     int target_id = hit_buf->core.tid;
1297     int mate_target_id = hit_buf->core.mtid;
1298    
1299     vector<CigarOp> cigar;
1300     bool spliced_alignment = false;
1301     int num_hits = 1;
1302    
1303     double mapQ = hit_buf->core.qual;
1304     long double error_prob;
1305     if (mapQ > 0)
1306     {
1307     long double p = (-1.0 * mapQ) / 10.0;
1308     error_prob = pow(10.0L, p);
1309     }
1310     else
1311     {
1312     error_prob = 1.0;
1313     }
1314    
1315     //header->target_name[c->tid]
1316    
1317     bool end = true;
1318     unsigned int seg_offset = 0;
1319     unsigned int seg_num = 0;
1320     unsigned int num_segs = 0;
1321     // Copy the tag out of the name field before we might wipe it out
1322     char* qname = bam1_qname(hit_buf);
1323     char* pipe = strrchr(qname, '|');
1324     if (pipe)
1325     {
1326     if (name_tags)
1327     strcpy(name_tags, pipe);
1328    
1329     char* tag_buf = pipe + 1;
1330     if (strchr(tag_buf, ':'))
1331     {
1332     sscanf(tag_buf, "%u:%u:%u", &seg_offset, &seg_num, &num_segs);
1333     if (seg_num + 1 == num_segs)
1334     end = true;
1335     else
1336     end = false;
1337     }
1338    
1339     *pipe = 0;
1340     }
1341    
1342    
1343     if (target_id < 0) {
1344     //assert(cigar.size() == 1 && cigar[0].opcode == MATCH);
1345     bh = create_hit(qname,
1346     "*", //ref_name
1347     0, //left coord
1348     0, //read_len
1349     false, //antisense_aln
1350     0, //edit_dist
1351     end);
1352     return true;
1353     }
1354    
1355     //string text_name = ((samfile_t*)(hs._hit_file))->header->target_name[target_id];
1356     string text_name = _sam_header->target_name[target_id];
1357     for (int i = 0; i < hit_buf->core.n_cigar; ++i)
1358     {
1359     //char* t;
1360    
1361     int length = bam1_cigar(hit_buf)[i] >> BAM_CIGAR_SHIFT;
1362     if (length <= 0)
1363     {
1364     fprintf (stderr, "BAM error: CIGAR op has zero length\n");
1365     return false;
1366     }
1367    
1368     CigarOpCode opcode;
1369     switch(bam1_cigar(hit_buf)[i] & BAM_CIGAR_MASK)
1370     {
1371     case BAM_CMATCH: opcode = MATCH; break;
1372     case BAM_CINS: opcode = INS; break;
1373     case BAM_CDEL: opcode = DEL; break;
1374     case BAM_CSOFT_CLIP: opcode = SOFT_CLIP; break;
1375     case BAM_CHARD_CLIP: opcode = HARD_CLIP; break;
1376     case BAM_CPAD: opcode = PAD; break;
1377     case BAM_CREF_SKIP:
1378     opcode = REF_SKIP;
1379     spliced_alignment = true;
1380     if (length > (int)max_report_intron_length)
1381     {
1382     //fprintf(stderr, "Encounter REF_SKIP > max_gene_length, skipping\n");
1383     return false;
1384     }
1385     break;
1386     default:
1387     fprintf (stderr, "BAM read: invalid CIGAR operation\n");
1388     return false;
1389     }
1390     if (opcode != HARD_CLIP)
1391     cigar.push_back(CigarOp(opcode, length));
1392     }
1393    
1394     string mrnm;
1395     if (mate_target_id >= 0) {
1396     if (mate_target_id == target_id) {
1397     //mrnm = ((samfile_t*)(hs._hit_file))->header->target_name[mate_target_id];
1398     mrnm = _sam_header->target_name[mate_target_id];
1399     }
1400     else {
1401     //fprintf(stderr, "Trans-spliced mates are not currently supported, skipping\n");
1402     return false;
1403     }
1404     }
1405     else {
1406     text_mate_pos = 0;
1407     }
1408     //CuffStrand source_strand = CUFF_STRAND_UNKNOWN;
1409     bool antisense_splice=false;
1410     unsigned char num_mismatches = 0;
1411     unsigned char num_splice_anchor_mismatches = 0;
1412    
1413     uint8_t* ptr = bam_aux_get(hit_buf, "XS");
1414     if (ptr) {
1415     char src_strand_char = bam_aux2A(ptr);
1416     if (src_strand_char == '-')
1417     antisense_splice = true;
1418     // else if (src_strand_char == '+')
1419     // source_strand = CUFF_FWD;
1420     }
1421    
1422     ptr = bam_aux_get(hit_buf, "NM");
1423     if (ptr) {
1424     num_mismatches = bam_aux2i(ptr);
1425     }
1426    
1427     ptr = bam_aux_get(hit_buf, "NH");
1428     if (ptr) {
1429     num_hits = bam_aux2i(ptr);
1430     }
1431    
1432     //bool antisense_aln = bam1_strand(hit_buf);
1433    
1434     //if (_rg_props.strandedness() == STRANDED_PROTOCOL && source_strand == CUFF_STRAND_UNKNOWN)
1435     // source_strand = use_stranded_protocol(sam_flag, antisense_aln, _rg_props.mate_strand_mapping());
1436     if (spliced_alignment) {
1437     //if (source_strand == CUFF_STRAND_UNKNOWN) {
1438     // fprintf(stderr, "BAM record error: found spliced alignment without XS attribute\n");
1439     // }
1440     bh = create_hit(qname,
1441     text_name,
1442     text_offset, // BAM files are 0-indexed
1443     cigar,
1444     sam_flag & 0x0010,
1445     antisense_splice,
1446     num_mismatches,
1447     num_splice_anchor_mismatches,
1448     end);
1449    
1450     }
1451     else {
1452     //assert(_rg_props.strandedness() == STRANDED_PROTOCOL || source_strand == CUFF_STRAND_UNKNOWN);
1453     //assert(cigar.size() == 1 && cigar[0].opcode == MATCH);
1454     bh = create_hit(qname,
1455     text_name,
1456     text_offset, // BAM files are 0-indexed
1457     cigar,
1458     sam_flag & 0x0010,
1459     false,
1460     num_mismatches,
1461     0,
1462     end);
1463     }
1464     if (seq!=NULL) {
1465     char *bseq = (char*)bam1_seq(hit_buf);
1466     for(int i=0;i<(hit_buf->core.l_qseq);i++) {
1467     char v = bam1_seqi(bseq,i);
1468     seq[i]=bam_nt16_rev_table[v];
1469     }
1470     seq[hit_buf->core.l_qseq]=0;
1471     }
1472     if (qual!=NULL) {
1473     char *bq = (char*)bam1_qual(hit_buf);
1474     for(int i=0;i<(hit_buf->core.l_qseq);i++) {
1475     qual[i]=bq[i]+33;
1476     }
1477     qual[hit_buf->core.l_qseq]=0;
1478     }
1479     return true;
1480     }
1481    
1482     bool BAMHitFactory::inspect_header(HitStream& hs)
1483     {
1484     bam_header_t* header = ((samfile_t*)(hs._hit_file))->header;
1485    
1486     if (header == NULL) {
1487     fprintf(stderr, "Warning: No BAM header\n");
1488     return false;
1489     }
1490     if (header->l_text == 0) {
1491     fprintf(stderr, "Warning: BAM header has 0 length or is corrupted. Try using 'samtools reheader'.\n");
1492     return false;
1493     }
1494     return true;
1495     }
1496    
1497    
1498     void get_mapped_reads(FILE* bwtf,
1499     HitTable& hits,
1500     HitFactory& hit_factory,
1501     bool strip_slash,
1502     bool verbose)
1503     {
1504    
1505    
1506     char bwt_buf[2048];
1507     uint32_t reads_extracted = 0;
1508    
1509     while (fgets(bwt_buf, 2048, bwtf))
1510     {
1511     // Chomp the newline
1512     char* nl = strrchr(bwt_buf, '\n');
1513     if (nl) *nl = 0;
1514     if (*bwt_buf == 0)
1515     continue;
1516     // Get a new record from the tab-delimited Bowtie map
1517     BowtieHit bh;
1518     if (hit_factory.get_hit_from_buf(bwt_buf, bh, strip_slash))
1519     {
1520     // Only check uniqueness if these hits are spliced
1521     hits.add_hit(bh, true);
1522     }
1523     reads_extracted++;
1524     }
1525    
1526     // This will sort the map by insert id.
1527     hits.finalize();
1528     fprintf(stderr, "Extracted %d alignments from Bowtie map\n", reads_extracted);
1529     }
1530    
1531    
1532     /*
1533     AlignStatus status(const BowtieHit* align)
1534     {
1535     if (!align)
1536     return UNALIGNED;
1537     if (align->contiguous())
1538     return CONTIGUOUS;
1539     return SPLICED;
1540     }
1541     */
1542    
1543     void add_hits_to_coverage(const HitList& hits, vector<unsigned short>& DoC)
1544     {
1545     int max_hit_pos = -1;
1546     for (size_t i = 0; i < hits.size(); ++i)
1547     {
1548     max_hit_pos = max((int)hits[i].right(),max_hit_pos);
1549     }
1550    
1551     if ((int)DoC.size() < max_hit_pos)
1552     DoC.resize(max_hit_pos);
1553    
1554     for (size_t i = 0; i < hits.size(); ++i)
1555     {
1556     const BowtieHit& bh = hits[i];
1557    
1558     // split up the coverage contibution for this reads
1559     size_t j = bh.left();
1560     const vector<CigarOp>& cigar = bh.cigar();
1561    
1562     for (size_t c = 0 ; c < cigar.size(); ++c)
1563     {
1564     switch(cigar[c].opcode)
1565     {
1566     case MATCH:
1567     for (size_t m = 0; m < cigar[c].length; ++m)
1568     {
1569     if (DoC[j + m] < 0xFFFF)
1570     DoC[j + m]++;
1571     }
1572     //fall through this case to REF_SKIP is intentional
1573     case REF_SKIP:
1574     j += cigar[c].length;
1575     break;
1576     default:
1577     break;
1578     }
1579    
1580     }
1581     }
1582     }
1583    
1584     void add_hit_to_coverage(const BowtieHit& bh, vector<unsigned int>& DoC)
1585     {
1586     if ((int)DoC.size() < bh.right())
1587     DoC.resize(bh.right());
1588    
1589     // split up the coverage contibution for this reads
1590     size_t j = bh.left();
1591     const vector<CigarOp>& cigar = bh.cigar();
1592    
1593     for (size_t c = 0 ; c < cigar.size(); ++c)
1594     {
1595     switch(cigar[c].opcode)
1596     {
1597     case MATCH:
1598     for (size_t m = 0; m < cigar[c].length; ++m)
1599     {
1600     if (DoC[j + m] < 0xFFFFFFFF)
1601     DoC[j + m]++;
1602     }
1603     //fall through this case to REF_SKIP is intentional
1604     case REF_SKIP:
1605     j += cigar[c].length;
1606     break;
1607     default:
1608     break;
1609     }
1610    
1611     }
1612     }
1613    
1614     void print_hit(FILE* fout,
1615     const char* read_name,
1616     const BowtieHit& bh,
1617     const char* ref_name,
1618     const char* sequence,
1619     const char* qualities,
1620     bool from_bowtie)
1621     {
1622     string seq;
1623     string quals;
1624     if (sequence)
1625     {
1626     seq = sequence;
1627     quals = qualities;
1628     seq.resize(bh.read_len());
1629     quals.resize(bh.read_len());
1630     }
1631     else
1632     {
1633     seq = "*";
1634     }
1635    
1636     if (qualities)
1637     {
1638     quals = qualities;
1639     quals.resize(bh.read_len());
1640     }
1641     else
1642     {
1643     quals = "*";
1644     }
1645    
1646     uint32_t sam_flag = 0;
1647     if (bh.antisense_align())
1648     {
1649     sam_flag |= 0x0010; // BAM_FREVERSE
1650     if (sequence && !from_bowtie) // if it is from bowtie hit, it's already reversed.
1651     {
1652     reverse_complement(seq);
1653     reverse(quals.begin(), quals.end());
1654     }
1655     }
1656    
1657     uint32_t sam_pos = bh.left() + 1;
1658     uint32_t map_quality = 255;
1659     char cigar[256];
1660     cigar[0] = 0;
1661     string mate_ref_name = "*";
1662     uint32_t mate_pos = 0;
1663     uint32_t insert_size = 0;
1664     //string qualities = "*";
1665    
1666     const vector<CigarOp>& bh_cigar = bh.cigar();
1667    
1668     /*
1669     * In addition to calculating the cigar string,
1670     * we need to figure out how many in/dels are in the
1671     * sequence, so that we can give the correct
1672     * value for the NM tag
1673     */
1674     int indel_distance = 0;
1675     for (size_t c = 0; c < bh_cigar.size(); ++c)
1676     {
1677     char ibuf[64];
1678     sprintf(ibuf, "%d", bh_cigar[c].length);
1679     switch(bh_cigar[c].opcode)
1680     {
1681     case MATCH:
1682     strcat(cigar, ibuf);
1683     strcat(cigar, "M");
1684     break;
1685     case INS:
1686     strcat(cigar, ibuf);
1687     strcat(cigar, "I");
1688     indel_distance += bh_cigar[c].length;
1689     break;
1690     case DEL:
1691     strcat(cigar, ibuf);
1692     strcat(cigar, "D");
1693     indel_distance += bh_cigar[c].length;
1694     break;
1695     case REF_SKIP:
1696     strcat(cigar, ibuf);
1697     strcat(cigar, "N");
1698     break;
1699     default:
1700     break;
1701     }
1702     }
1703    
1704     //string q = string(bh.read_len(), '!');
1705     //string s = string(bh.read_len(), 'N');
1706    
1707     fprintf(fout,
1708     "%s\t%d\t%s\t%d\t%d\t%s\t%s\t%d\t%d\t%s\t%s",
1709     read_name,
1710     sam_flag,
1711     ref_name,
1712     sam_pos,
1713     map_quality,
1714     cigar,
1715     mate_ref_name.c_str(),
1716     mate_pos,
1717     insert_size,
1718     seq.c_str(),
1719     quals.c_str());
1720    
1721     if (!sam_readgroup_id.empty())
1722     fprintf(fout, "\tRG:Z:%s", sam_readgroup_id.c_str());
1723    
1724     fprintf(fout, "\tNM:i:%d", bh.edit_dist() + indel_distance);
1725    
1726     bool containsSplice = false;
1727     for(vector<CigarOp>::const_iterator itr = bh.cigar().begin(); itr != bh.cigar().end(); ++itr){
1728     if(itr->opcode == REF_SKIP){
1729     containsSplice = true;
1730     break;
1731     }
1732     }
1733    
1734     if (containsSplice)
1735     fprintf(fout, "\tXS:A:%c", bh.antisense_splice() ? '-' : '+');
1736    
1737     fprintf(fout, "\n");
1738     }
1739    
1740     void print_bamhit(GBamWriter& wbam,
1741     const char* read_name,
1742     const BowtieHit& bh,
1743     const char* ref_name,
1744     const char* sequence,
1745     const char* qualities,
1746     bool from_bowtie)
1747     {
1748     string seq;
1749     string quals;
1750     if (sequence) {
1751     seq = sequence;
1752     quals = qualities;
1753     seq.resize(bh.read_len());
1754     quals.resize(bh.read_len());
1755     }
1756     else {
1757     seq = "*";
1758     }
1759     if (qualities) {
1760     quals = qualities;
1761     quals.resize(bh.read_len());
1762     }
1763     else
1764     {
1765     quals = "*";
1766     }
1767    
1768     uint32_t sam_flag = 0;
1769     if (bh.antisense_align())
1770     {
1771     sam_flag |= 0x0010; // BAM_FREVERSE
1772     if (sequence && !from_bowtie) // if it is from bowtie hit, it's already reversed.
1773     {
1774     reverse_complement(seq);
1775     reverse(quals.begin(), quals.end());
1776     }
1777     }
1778    
1779     uint32_t sam_pos = bh.left() + 1;
1780     uint32_t map_quality = 255;
1781     char cigar[256];
1782     cigar[0] = 0;
1783     string mate_ref_name = "*";
1784     uint32_t mate_pos = 0;
1785     uint32_t insert_size = 0;
1786     //string qualities = "*";
1787    
1788     const vector<CigarOp>& bh_cigar = bh.cigar();
1789     /*
1790     * In addition to calculating the cigar string,
1791     * we need to figure out how many in/dels are in the
1792     * sequence, so that we can give the correct
1793     * value for the NM tag
1794     */
1795     int indel_distance = 0;
1796     for (size_t c = 0; c < bh_cigar.size(); ++c)
1797     {
1798     char ibuf[64];
1799     sprintf(ibuf, "%d", bh_cigar[c].length);
1800     switch(bh_cigar[c].opcode)
1801     {
1802     case MATCH:
1803     strcat(cigar, ibuf);
1804     strcat(cigar, "M");
1805     break;
1806     case INS:
1807     strcat(cigar, ibuf);
1808     strcat(cigar, "I");
1809     indel_distance += bh_cigar[c].length;
1810     break;
1811     case DEL:
1812     strcat(cigar, ibuf);
1813     strcat(cigar, "D");
1814     indel_distance += bh_cigar[c].length;
1815     break;
1816     case REF_SKIP:
1817     strcat(cigar, ibuf);
1818     strcat(cigar, "N");
1819     break;
1820     default:
1821     break;
1822     }
1823     }
1824    
1825     //string q = string(bh.read_len(), '!');
1826     //string s = string(bh.read_len(), 'N');
1827     /*
1828     fprintf(fout,
1829     "%s\t%d\t%s\t%d\t%d\t%s\t%s\t%d\t%d\t%s\t%s",
1830     read_name,
1831     sam_flag,
1832     ref_name,
1833     sam_pos,
1834     map_quality,
1835     cigar,
1836     mate_ref_name.c_str(),
1837     mate_pos,
1838     insert_size,
1839     seq.c_str(),
1840     quals.c_str());
1841    
1842     fprintf(fout, "\tNM:i:%d", bh.edit_dist() + indel_distance);
1843     */
1844     vector<string> auxdata;
1845    
1846     if (!sam_readgroup_id.empty())
1847     {
1848     string nm("RG:Z:");
1849     nm += sam_readgroup_id;
1850     auxdata.push_back(nm);
1851     }
1852    
1853     string nm("NM:i:");
1854     str_appendInt(nm, bh.edit_dist() + indel_distance);
1855     auxdata.push_back(nm);
1856     bool containsSplice = false;
1857     for(vector<CigarOp>::const_iterator itr = bh.cigar().begin(); itr != bh.cigar().end(); ++itr)
1858     if(itr->opcode == REF_SKIP){
1859     containsSplice = true;
1860     break;
1861     }
1862    
1863     if (containsSplice) {
1864     //fprintf(fout, "\tXS:A:%c", bh.antisense_splice() ? '-' : '+');
1865     nm="XS:A:";
1866     nm+=(char)(bh.antisense_splice() ? '-' : '+');
1867     auxdata.push_back(nm);
1868     }
1869    
1870     GBamRecord *brec = wbam.new_record(read_name, sam_flag, ref_name, sam_pos, map_quality,
1871     cigar, mate_ref_name.c_str(), mate_pos,
1872     insert_size, seq.c_str(), quals.c_str(), &auxdata);
1873    
1874    
1875    
1876     wbam.write(brec);
1877     delete brec;
1878     }
1879    
1880     /**
1881     * Print a vector of cigar operations to a file.
1882     * @param bh_cigar A vector of CigarOps
1883     * @return a string representation of the cigar string
1884     */
1885     std::string print_cigar(vector<CigarOp>& bh_cigar){
1886     char cigar[256];
1887     cigar[0] = 0;
1888     for (size_t c = 0; c < bh_cigar.size(); ++c)
1889     {
1890     char ibuf[64];
1891     sprintf(ibuf, "%d", bh_cigar[c].length);
1892     switch(bh_cigar[c].opcode)
1893     {
1894     case MATCH:
1895     strcat(cigar, ibuf);
1896     strcat(cigar, "M");
1897     break;
1898     case INS:
1899     strcat(cigar, ibuf);
1900     strcat(cigar, "I");
1901     break;
1902     case DEL:
1903     strcat(cigar, ibuf);
1904     strcat(cigar, "D");
1905     break;
1906     case REF_SKIP:
1907     strcat(cigar, ibuf);
1908     strcat(cigar, "N");
1909     break;
1910     default:
1911     break;
1912     }
1913     }
1914     string result(cigar);
1915     return result;
1916     }
1917    
1918     bool BowtieHit::check_editdist_consistency(const RefSequenceTable& rt)
1919     {
1920     RefSequenceTable::Sequence* ref_str = rt.get_seq(_ref_id);
1921     if (!ref_str)
1922     return false;
1923    
1924     const seqan::Dna5String ref_seq = seqan::infix(*ref_str, _left, right());
1925    
1926     size_t pos_seq = 0;
1927     size_t pos_ref = 0;
1928     size_t mismatch = 0;
1929     for (size_t i = 0; i < _cigar.size(); ++i)
1930     {
1931     CigarOp cigar = _cigar[i];
1932     switch(cigar.opcode)
1933     {
1934     case MATCH:
1935     {
1936     for (size_t j = 0; j < cigar.length; ++j)
1937     {
1938     if (_seq[pos_seq++] != ref_seq[pos_ref++])
1939     ++mismatch;
1940     }
1941     }
1942     break;
1943     case INS:
1944     {
1945     pos_seq += cigar.length;
1946     }
1947     break;
1948    
1949     case DEL:
1950     case REF_SKIP:
1951     {
1952     pos_ref += cigar.length;
1953     }
1954     break;
1955    
1956     default:
1957     break;
1958     }
1959     }
1960    
1961     return mismatch == _edit_dist;
1962     }