ViewVC Help
View File | Revision Log | Show Annotations | View Changeset | Root Listing
root/gclib/tophat_cpp/bwt_map.cpp
Revision: 135
Committed: Mon Dec 12 22:28:38 2011 UTC (8 years, 5 months ago) by gpertea
File size: 52959 byte(s)
Log Message:
wip - SplicedSAMHitFactory() still not implemented

Line User Rev File contents
1 gpertea 29 /*
2     * bwt_map.cpp
3     * TopHat
4     *
5     * Created by Cole Trapnell on 11/17/08.
6     * Copyright 2008 Cole Trapnell. All rights reserved.
7     *
8     */
9    
10     #ifdef HAVE_CONFIG_H
11     #include <config.h>
12     #endif
13    
14     #include <cassert>
15     #include <cstdlib>
16     #include <iostream>
17     #include <set>
18     #include <vector>
19     #include <cmath>
20    
21     #include "common.h"
22     #include "bwt_map.h"
23     #include "tokenize.h"
24     #include "reads.h"
25    
26     using namespace std;
27    
28     void HitTable::add_hit(const BowtieHit& bh, bool check_uniqueness)
29     {
30     uint32_t reference_id = bh.ref_id();
31    
32     pair<RefHits::iterator, bool> ret =
33     _hits_for_ref.insert(make_pair(reference_id, HitList()));
34     HitList& hl = ret.first->second;
35    
36     if (check_uniqueness)
37     {
38     // Check uniqueness, in case we are adding spliced hits from
39     // several spliced alignment sources (e.g. de novo hashing + Bowtie
40     // against a user-supplied index). We don't want to count the same
41     // alignment twice if it happened to be found by more than one method
42     HitList::const_iterator lb = lower_bound(hl.begin(), hl.end(), bh, hit_insert_id_lt);
43     HitList::const_iterator ub = upper_bound(hl.begin(), hl.end(), bh, hit_insert_id_lt);
44     for (; lb != ub && lb != hl.end(); ++lb)
45     {
46     if (*lb == bh)
47     {
48     //fprintf(stderr, "Chucking duplicate read %d by identity\n", bh.insert_id());
49     return;
50     }
51    
52     if (lb->insert_id() == bh.insert_id() &&
53     lb->ref_id() == bh.ref_id() &&
54     lb->antisense_align() == bh.antisense_align())
55     {
56     // If we get here, we may be looking at the same alignment
57     // However, spanning_reads may report a shorter, trimmed alignment
58     // so not all fields will be equal. If they just disagree on the
59     // ends, and don't indicate a different junction coord, the
60     // alignments are the same.
61    
62     if ((lb->left() <= bh.left() && lb->right() >= bh.right()) ||
63     (bh.left() <= lb->left() && bh.right() >= lb->right()))
64     {
65     vector<pair<int,int> > lb_gaps, bh_gaps;
66     lb->gaps(lb_gaps);
67     bh.gaps(bh_gaps);
68     if (lb_gaps == bh_gaps)
69     {
70     // One alignment is contained in the other, they agree on
71     // where the gaps, if any, are, and they share an id
72     // => this is a redundant aligment, so toss it
73     //fprintf(stderr, "Chucking duplicate read %d by gap agreement\n", bh.insert_id());
74     return;
75     }
76     }
77     }
78     }
79     }
80     _total_hits++;
81     hl.push_back(bh);
82     }
83    
84     bool hit_insert_id_lt(const BowtieHit& h1, const BowtieHit& h2)
85     {
86     return h1.insert_id() < h2.insert_id();
87     }
88    
89     void LineHitFactory::openStream(HitStream& hs)
90     {
91     if (hs._hit_file==NULL && !hs._hit_file_name.empty()) {
92     //open the file for HitStream here
93     hs._hit_file=fopen(hs._hit_file_name.c_str(),"r");
94     if (hs._hit_file==NULL)
95     err_die("Error opening HitStream file %s\n",hs._hit_file_name.c_str());
96     return;
97     }
98     if (hs._fzpipe!=NULL) {
99     hs._hit_file=hs._fzpipe->file;
100     }
101     }
102    
103     void LineHitFactory::rewind(HitStream& hs)
104     {
105     if (hs._fzpipe!=NULL) {
106     hs._fzpipe->rewind();
107     hs._hit_file=hs._fzpipe->file;
108     }
109     else if (hs._hit_file) ::rewind((FILE*)(hs._hit_file));
110     }
111    
112     bool LineHitFactory::next_record(HitStream& hs, const char*& buf, size_t& buf_size) {
113     FILE* f=(FILE *)(hs._hit_file);
114     bool new_rec = (fgets(_hit_buf, _hit_buf_max_sz - 1, f)!=NULL);
115     if (!new_rec || feof(f)) {
116     hs._eof=true;
117     return false;
118     }
119     ++_line_num;
120     char* nl = strrchr(_hit_buf, '\n');
121     if (nl) *nl = 0;
122     buf = _hit_buf;
123     buf_size = _hit_buf_max_sz - 1;
124     return true;
125     }
126    
127     void LineHitFactory::closeStream(HitStream& hs) {
128     if (hs._fzpipe!=NULL) {
129     hs._fzpipe->close();
130     return;
131     }
132     if (hs._hit_file!=NULL) {
133     fclose((FILE*)(hs._hit_file));
134     hs._hit_file=NULL;
135     }
136     }
137     void BAMHitFactory::openStream(HitStream& hs) {
138     if (hs._hit_file==NULL) {
139     if (hs._hit_file_name.empty())
140     //err_die("Error: invalid HitStream set for BAMHitFactory(file name missing)\n");
141     return; //invalid stream, could be just a place holder
142     //open the file here if not already open
143     string fext=getFext(hs._hit_file_name);
144     if (fext=="sam")
145     hs._hit_file = samopen(hs._hit_file_name.c_str(), "r", 0);
146     else
147     hs._hit_file = samopen(hs._hit_file_name.c_str(), "rb", 0);
148    
149     samfile_t* sam_file=(samfile_t*)(hs._hit_file);
150    
151     if (sam_file == NULL)
152     err_die("Error opening SAM file %s\n", hs._hit_file_name.c_str());
153     if (sam_file->header == NULL)
154     err_die("Error: no SAM header found for file %s\n", hs._hit_file_name.c_str());
155     memset(&_next_hit, 0, sizeof(_next_hit));
156     //_beginning = bgzf_tell(sam_file->x.bam);
157     _sam_header=sam_file->header;
158     if (inspect_header(hs) == false)
159     err_die("Error: invalid SAM header for file %s\n",
160     hs._hit_file_name.c_str());
161     }
162     }
163    
164     void BAMHitFactory::closeStream(HitStream& hs) {
165     if (hs._hit_file) {
166     samclose((samfile_t*)(hs._hit_file));
167     }
168     hs._hit_file=NULL;
169     _sam_header=NULL;
170     }
171    
172     void BAMHitFactory::rewind(HitStream& hs)
173     {
174     /*
175     if (_hit_file && ((samfile_t*)_hit_file)->x.bam)
176     {
177     bgzf_seek(((samfile_t*)_hit_file)->x.bam, _beginning, SEEK_SET);
178     _eof = false;
179     }
180     */
181     this->closeStream(hs);
182     this->openStream(hs);
183     }
184    
185     string BAMHitFactory::hitfile_rec(HitStream& hs, const char* hit_buf) {
186     const bam1_t* bamrec=(const bam1_t*)hit_buf;
187     char* tamline=bam_format1(((samfile_t*)(hs._hit_file))->header, bamrec);
188     string sam_line(tamline);
189     free(tamline);
190     return sam_line;
191     }
192    
193     bool BAMHitFactory::next_record(HitStream& hs, const char*& buf, size_t& buf_size) {
194     if (_next_hit.data) {
195     free(_next_hit.data);
196     _next_hit.data = NULL;
197     }
198     _sam_header=((samfile_t*)(hs._hit_file))->header; //needed by get_hit_from_buf later on
199     if (hs.eof() || !hs.ready()) return false;
200    
201     //mark_curr_pos();
202    
203     memset(&_next_hit, 0, sizeof(_next_hit));
204    
205     int bytes_read = samread((samfile_t*)(hs._hit_file), &_next_hit);
206     if (bytes_read <= 0) {
207     hs._eof = true;
208     return false;
209     }
210     buf = (const char*)&_next_hit;
211     buf_size = bytes_read;
212     return true;
213     }
214    
215    
216     BowtieHit HitFactory::create_hit(const string& insert_name,
217     const string& ref_name,
218     int left,
219     const vector<CigarOp>& cigar,
220     bool antisense_aln,
221     bool antisense_splice,
222     unsigned char edit_dist,
223     unsigned char splice_mms,
224     bool end)
225     {
226     uint64_t insert_id = _insert_table.get_id(insert_name);
227     uint32_t reference_id = _ref_table.get_id(ref_name, NULL, 0);
228    
229     return BowtieHit(reference_id,
230     insert_id,
231     left,
232     cigar,
233     antisense_aln,
234     antisense_splice,
235     edit_dist,
236     splice_mms,
237     end);
238     }
239    
240     BowtieHit HitFactory::create_hit(const string& insert_name,
241     const string& ref_name,
242     uint32_t left,
243     uint32_t read_len,
244     bool antisense_aln,
245     unsigned char edit_dist,
246     bool end)
247     {
248     uint64_t insert_id = _insert_table.get_id(insert_name);
249     uint32_t reference_id = _ref_table.get_id(ref_name, NULL, 0);
250    
251     return BowtieHit(reference_id,
252     insert_id,
253     left,
254     read_len,
255     antisense_aln,
256     edit_dist,
257     end);
258     }
259    
260     bool BowtieHitFactory::get_hit_from_buf(const char* orig_bwt_buf,
261     BowtieHit& bh,
262     bool strip_slash,
263     char* name_out,
264     char* name_tags,
265     char* seq,
266     char* qual)
267     {
268     if (!orig_bwt_buf || !*orig_bwt_buf)
269     return false;
270    
271     static const int buf_size = 2048;
272    
273     char bwt_buf[buf_size];
274     strcpy(bwt_buf, orig_bwt_buf);
275     //const char* bwt_fmt_str = "%s %c %s %d %s %s %d %s";
276    
277     char orientation;
278    
279     //int bwtf_ret = 0;
280     //uint32_t seqid = 0;
281    
282     char text_name[buf_size];
283     unsigned int text_offset;
284    
285    
286     //unsigned int other_occs;
287     char mismatches[buf_size];
288     //memset(mismatches, 0, sizeof(mismatches));
289    
290     const char* buf = bwt_buf;
291     char* name = get_token((char**)&buf,"\t");
292     char* orientation_str = get_token((char**)&buf,"\t");
293     char* text_name_str = get_token((char**)&buf,"\t");
294    
295     strcpy(text_name, text_name_str);
296    
297     char* text_offset_str = get_token((char**)&buf,"\t");
298     char* seq_str = get_token((char**)&buf,"\t");
299     if (seq)
300     strcpy(seq, seq_str);
301    
302     const char* qual_str = get_token((char**)&buf,"\t");
303     if (qual)
304     strcpy(qual, qual_str);
305    
306     /*const char* other_occs_str =*/ get_token((char**)&buf, "\t");
307     mismatches[0] = 0;
308     char* mismatches_str = get_token((char**)&buf, "\t");
309     if (mismatches_str)
310     strcpy(mismatches, mismatches_str);
311    
312     orientation = orientation_str[0];
313     text_offset = atoi(text_offset_str);
314    
315     bool end = true;
316     unsigned int seg_offset = 0;
317     unsigned int seg_num = 0;
318     unsigned int num_segs = 0;
319    
320     // Copy the tag out of the name field before we might wipe it out
321     char* pipe = strrchr(name, '|');
322     if (pipe)
323     {
324     if (name_tags)
325     strcpy(name_tags, pipe);
326    
327     char* tag_buf = pipe + 1;
328     if (strchr(tag_buf, ':'))
329     {
330     sscanf(tag_buf, "%u:%u:%u", &seg_offset, &seg_num, &num_segs);
331     if (seg_num + 1 == num_segs)
332     end = true;
333     else
334     end = false;
335     }
336    
337     *pipe = 0;
338     }
339     // Stripping the slash and number following it gives the insert name
340     char* slash = strrchr(name, '/');
341     if (strip_slash && slash)
342     *slash = 0;
343    
344     size_t read_len = strlen(seq_str);
345    
346     // Add this alignment to the table of hits for this half of the
347     // Bowtie map
348    
349     //vector<string> mismatch_toks;
350     char* pch = strtok (mismatches,",");
351     unsigned char num_mismatches = 0;
352     while (pch != NULL)
353     {
354     char* colon = strchr(pch, ':');
355     if (colon)
356     {
357     num_mismatches++;
358     }
359     //mismatch_toks.push_back(pch);
360     pch = strtok (NULL, ",");
361     }
362    
363     bh = create_hit(name,
364     text_name,
365     text_offset,
366     (int)read_len,
367     orientation == '-',
368     num_mismatches,
369     end);
370    
371     return true;
372     }
373    
374     int anchor_mismatch = 0;
375    
376 gpertea 70
377     void parseSegReadName(char* name, char*& name_tags, bool strip_slash,
378     bool &end, unsigned int &seg_offset, unsigned int& seg_num,
379     unsigned int & num_segs) {
380     char* pipe = strrchr(name, '|');
381     if (pipe)
382     {
383     if (name_tags)
384     strcpy(name_tags, pipe);
385    
386     char* tag_buf = pipe + 1;
387     if (strchr(tag_buf, ':'))
388     {
389     sscanf(tag_buf, "%u:%u:%u", &seg_offset, &seg_num, &num_segs);
390     if (seg_num + 1 == num_segs)
391     end = true;
392     else
393     end = false;
394     }
395    
396     *pipe = 0;
397     }
398    
399     // Stripping the slash and number following it gives the insert name
400     char* slash = strrchr(name, '/');
401     if (strip_slash && slash)
402     *slash = 0;
403     }
404    
405    
406 gpertea 29 bool SplicedBowtieHitFactory::get_hit_from_buf(const char* orig_bwt_buf,
407     BowtieHit& bh,
408     bool strip_slash,
409     char* name_out,
410     char* name_tags,
411     char* seq,
412     char* qual)
413     {
414     if (!orig_bwt_buf || !*orig_bwt_buf)
415     return false;
416    
417     static const int buf_size = 2048;
418    
419     char bwt_buf[buf_size];
420     strcpy(bwt_buf, orig_bwt_buf);
421     //const char* bwt_fmt_str = "%s %c %s %d %s %s %d %s";
422    
423     char orientation;
424     char text_name[buf_size];
425     unsigned int text_offset;
426     char mismatches[buf_size];
427     //memset(mismatches, 0, sizeof(mismatches));
428    
429     char* buf = bwt_buf;
430     char* name = get_token((char**)&buf,"\t");
431     char* orientation_str = get_token((char**)&buf,"\t");
432     char* text_name_str = get_token((char**)&buf,"\t");
433     strcpy(text_name, text_name_str);
434    
435     char* text_offset_str = get_token((char**)&buf,"\t");
436     char* seq_str = get_token((char**)&buf,"\t");
437     if (seq)
438     strcpy(seq, seq_str);
439    
440     const char* qual_str = get_token((char**)&buf,"\t");
441     if (qual)
442     strcpy(qual, qual_str);
443    
444     /*const char* other_occs_str =*/ get_token((char**)&buf, "\t");
445     mismatches[0] = 0;
446     char* mismatches_str = get_token((char**)&buf, "\t");
447     if (mismatches_str)
448     strcpy(mismatches, mismatches_str);
449    
450     orientation = orientation_str[0];
451     text_offset = atoi(text_offset_str);
452    
453     bool end = true;
454     unsigned int seg_offset = 0;
455     unsigned int seg_num = 0;
456     unsigned int num_segs = 0;
457    
458     // Copy the tag out of the name field before we might wipe it out
459 gpertea 70 parseSegReadName(name, name_tags, strip_slash, end, seg_offset, seg_num, num_segs);
460 gpertea 29
461     // Add this alignment to the table of hits for this half of the
462     // Bowtie map
463    
464     // Parse the text_name field to recover the splice coords
465     vector<string> toks;
466    
467     tokenize_strict(text_name, "|", toks);
468    
469     int num_extra_toks = (int)toks.size() - 6;
470    
471     if (num_extra_toks >= 0)
472     {
473     static const uint8_t left_window_edge_field = 1;
474     static const uint8_t splice_field = 2;
475     //static const uint8_t right_window_edge_field = 3;
476     static const uint8_t junction_type_field = 4;
477     static const uint8_t strand_field = 5;
478    
479     string contig = toks[0];
480     for (int t = 1; t <= num_extra_toks; ++t)
481     {
482     contig += "|";
483     contig += toks[t];
484     }
485    
486     vector<string> splice_toks;
487     tokenize(toks[num_extra_toks + splice_field], "-", splice_toks);
488     if (splice_toks.size() != 2)
489     {
490     fprintf(stderr, "Warning: found malformed splice record, skipping:\n");
491     //fprintf(stderr, "%s (token: %s)\n", text_name,
492     // toks[num_extra_toks + splice_field].c_str());
493     return false;
494     }
495    
496     //
497     // check for an insertion hit
498     //
499     if(toks[num_extra_toks + junction_type_field] == "ins")
500     {
501     int8_t spliced_read_len = strlen(seq_str);
502     /*
503     * The 0-based position of the left edge of the alignment. Note that this
504     * value may need to be futher corrected to account for the presence of
505     * of the insertion.
506     */
507     uint32_t left = atoi(toks[num_extra_toks + left_window_edge_field].c_str()) + text_offset;
508     uint32_t right = left + spliced_read_len - 1;
509    
510     /*
511     * The 0-based position of the last genomic sequence before the insertion
512     */
513     uint32_t left_splice_pos = atoi(splice_toks[0].c_str());
514    
515     string insertedSequence = splice_toks[1];
516     /*
517     * The 0-based position of the first genomic sequence after teh insertion
518     */
519     uint32_t right_splice_pos = left_splice_pos + 1;
520     if(left > left_splice_pos){
521     /*
522     * The genomic position of the left edge of the alignment needs to be corrected
523     * If the alignment does not extend into the insertion, simply subtract the length
524     * of the inserted sequence, otherwise, just set it equal to the right edge
525     */
526     left = left - insertedSequence.length();
527     if(left < right_splice_pos){
528     left = right_splice_pos;
529     }
530     }
531     if(right > left_splice_pos){
532     right = right - insertedSequence.length();
533     if(right < left_splice_pos){
534     right = left_splice_pos;
535     }
536     }
537     /*
538     * Now, right and left should be properly transformed into genomic coordinates
539     * We should be able to deduce how much the alignment matches the insertion
540     * simply based on the length of the read
541     */
542     int left_match_length = 0;
543     if(left <= left_splice_pos){
544     left_match_length = left_splice_pos - left + 1;
545     }
546     int right_match_length = 0;
547     if(right >= right_splice_pos){
548     right_match_length = right - right_splice_pos + 1;
549     }
550     int insertion_match_length = spliced_read_len - left_match_length - right_match_length;
551    
552     if(left_match_length <= 0 || right_match_length <= 0 || insertion_match_length <= 0)
553     return false;
554    
555     string junction_strand = toks[num_extra_toks + strand_field];
556     if(junction_strand != "rev" && junction_strand != "fwd"){
557     fprintf(stderr, "Malformed insertion record\n");
558     return false;
559     }
560    
561     char* pch = strtok( mismatches, ",");
562     unsigned char num_mismatches = 0;
563    
564     /*
565     * remember that text_offset holds the left end of the
566     * alignment, relative to the start of the contig
567     */
568    
569     /*
570     * The 0-based relative position of the left-most character
571     * before the insertion in the contig
572     */
573     int relative_splice_pos = left_splice_pos - atoi(toks[num_extra_toks + left_window_edge_field].c_str());
574     while (pch != NULL)
575     {
576     char* colon = strchr(pch, ':');
577     if (colon)
578     {
579     *colon = 0;
580     int mismatch_pos = atoi(pch);
581    
582     /*
583     * for reversely mapped reads,
584     * find the correct mismatched position.
585     */
586     if(orientation == '-'){
587     mismatch_pos = spliced_read_len - mismatch_pos - 1;
588     }
589    
590     /*
591     * Only count mismatches outside of the insertion region
592     * If there is a mismatch within the insertion,
593     * disallow this hit
594     */
595     if(mismatch_pos + text_offset <= relative_splice_pos || mismatch_pos + text_offset > relative_splice_pos + insertedSequence.length()){
596     num_mismatches++;
597     }else{
598     return false;
599     }
600     }
601     pch = strtok (NULL, ",");
602     }
603    
604    
605     vector<CigarOp> cigar;
606     cigar.push_back(CigarOp(MATCH, left_match_length));
607     cigar.push_back(CigarOp(INS, insertion_match_length));
608     cigar.push_back(CigarOp(MATCH, right_match_length));
609    
610     /*
611     * For now, disallow hits that don't span
612     * the insertion. If we allow these types of hits,
613     * then long_spanning.cpp needs to be updated
614     * in order to intelligently merge these kinds
615     * of reads back together
616     *
617     * Following code has been changed to allow segment that end
618     * in an insertion
619     */
620     bh = create_hit(name,
621     contig,
622     left,
623     cigar,
624     orientation == '-',
625     junction_strand == "rev",
626     num_mismatches,
627     0,
628     end);
629     return true;
630     }
631    
632     else
633     {
634     uint32_t left = atoi(toks[num_extra_toks + left_window_edge_field].c_str()) + text_offset;
635     int spliced_read_len = strlen(seq_str);
636     int8_t left_splice_pos = atoi(splice_toks[0].c_str()) - left + 1;
637     if(left_splice_pos > spliced_read_len) left_splice_pos = spliced_read_len;
638     int8_t right_splice_pos = spliced_read_len - left_splice_pos;
639    
640     if (right_splice_pos <= 0 || left_splice_pos <= 0)
641     return false;
642    
643     if (orientation == '+')
644     {
645     if (left_splice_pos + seg_offset < _anchor_length){
646     return false;
647     }
648     }
649     else
650     {
651     if (right_splice_pos + seg_offset < _anchor_length)
652     return false;
653     }
654     //uint32_t right = atoi(splice_toks[1].c_str()) + right_splice_pos;
655     //atoi(toks[right_window_edge_field].c_str());
656     int gap_len = atoi(splice_toks[1].c_str()) - atoi(splice_toks[0].c_str()) - 1;
657    
658     string junction_strand = toks[num_extra_toks + strand_field];
659     if (!(junction_strand == "rev" || junction_strand == "fwd")||
660     !(orientation == '-' || orientation == '+'))
661     {
662     fprintf(stderr, "Warning: found malformed splice record, skipping\n");
663     //fprintf(stderr, "junction_strand=%s, orientation='%c'\n",
664     // junction_strand.c_str(), orientation);
665     return false;
666     }
667    
668     //vector<string> mismatch_toks;
669     char* pch = strtok (mismatches,",");
670     int mismatches_in_anchor = 0;
671     unsigned char num_mismatches = 0;
672     while (pch != NULL)
673     {
674     char* colon = strchr(pch, ':');
675     if (colon)
676     {
677     *colon = 0;
678     num_mismatches++;
679     int mismatch_pos = atoi(pch);
680     if ((orientation == '+' && abs(mismatch_pos - left_splice_pos) < (int)min_anchor_len) ||
681     (orientation == '-' && abs(((int)spliced_read_len - left_splice_pos + 1) - mismatch_pos)) < (int)min_anchor_len)
682     mismatches_in_anchor++;
683     }
684     //mismatch_toks.push_back(pch);
685     pch = strtok (NULL, ",");
686     }
687    
688     // FIXME: we probably should exclude these hits somewhere, but this
689     // isn't the right place
690     vector<CigarOp> cigar;
691     cigar.push_back(CigarOp(MATCH, left_splice_pos));
692     if(toks[num_extra_toks + junction_type_field] == "del"){
693     cigar.push_back(CigarOp(DEL, gap_len));
694     }else{
695     cigar.push_back(CigarOp(REF_SKIP, gap_len));
696     }
697     cigar.push_back(CigarOp(MATCH, right_splice_pos));
698    
699     bh = create_hit(name,
700     contig,
701     left,
702     cigar,
703     orientation == '-',
704     junction_strand == "rev",
705     num_mismatches,
706     mismatches_in_anchor,
707     end);
708     return true;
709     }
710     }
711     else
712     {
713     fprintf(stderr, "Warning: found malformed splice record, skipping\n");
714     //fprintf(stderr, "%s\n", orig_bwt_buf);
715     // continue;
716     return false;
717     }
718    
719     return false;
720     }
721    
722 gpertea 70
723     char parseCigar(vector<CigarOp>& cigar, const char* cigar_str,
724     bool &spliced_alignment) {
725     // Mostly pilfered direct from the SAM tools:
726     const char* p_cig = cigar_str;
727     while (*p_cig)
728     {
729     char* t;
730     int length = (int)strtol(p_cig, &t, 10);
731     if (length <= 0)
732     {
733     //fprintf (stderr, "CIGAR op has zero length\n");
734     return false;
735     }
736     char op_char = toupper(*t);
737     CigarOpCode opcode;
738     if (op_char == 'M') opcode = MATCH;
739     else if (op_char == 'I') opcode = INS;
740     else if (op_char == 'D') opcode = DEL;
741     else if (op_char == 'N')
742     {
743     if (length > max_report_intron_length)
744     return false;
745     opcode = REF_SKIP;
746     spliced_alignment = true;
747     }
748     else if (op_char == 'S') opcode = SOFT_CLIP;
749     else if (op_char == 'H') opcode = HARD_CLIP;
750     else if (op_char == 'P') opcode = PAD;
751     else
752     {
753     fprintf (stderr, "invalid CIGAR operation\n");
754     return false;
755     }
756     p_cig = t + 1;
757     //i += length;
758     cigar.push_back(CigarOp(opcode, length));
759     }
760     if (*p_cig) {
761     fprintf (stderr, "unmatched CIGAR operation\n");
762     return *p_cig;
763     }
764     return 0;
765     }
766    
767     void getSAMmismatches(char* &buf, vector<CigarOp>& cigar,
768     int* mismatches, int& num_mismatches, int& sam_nm, bool& antisense_splice) {
769     const char* tag_buf = buf;
770     sam_nm=0;
771     num_mismatches=0;
772     while((tag_buf = get_token((char**)&buf,"\t")))
773     {
774     vector<string> tuple_fields;
775     tokenize(tag_buf,":", tuple_fields);
776     if (tuple_fields.size() == 3)
777     {
778     if (tuple_fields[0] == "XS")
779     {
780     if (tuple_fields[2] == "-")
781     antisense_splice = true;
782     }
783     else if (tuple_fields[0] == "NM")
784     {
785     sam_nm = atoi(tuple_fields[2].c_str());
786     }
787     else if (tuple_fields[0] == "NS")
788     {
789     //ignored for now
790     }
791     else if (tuple_fields[0] == "MD")
792     {
793     const char* p=tuple_fields[2].c_str();
794     int bi=0; //base offset position in the read
795     while (*p != 0) {
796     if (isdigit(*p)) {
797     int v=atoi(p);
798     do { p++; } while (isdigit(*p));
799     bi+=v;
800     }
801     while (isupper(*p)) {
802     p++;
803     mismatches[num_mismatches]=bi;
804     num_mismatches++;
805     bi++;
806     }
807     if (*p=='^') { //reference deletion
808     p++;
809     while (isupper(*p)) { //insert read bases
810     p++; bi++;
811     }
812     }
813     }
814     }
815     else
816     {
817     //fprintf(stderr, "%s attribute not supported\n", tuple_fields[0].c_str());
818     //return false;
819     }
820     }
821     }
822     /* By convention,the NM field of the SAM record
823     * counts an insertion or deletion. I dont' think
824     * we want the mismatch count in the BowtieHit
825     * record to reflect this. Therefore, subtract out
826     * the mismatches due to in/dels
827     */
828     for(vector<CigarOp>::const_iterator itr = cigar.begin(); itr != cigar.end(); ++itr){
829     if(itr->opcode == INS){
830     sam_nm -= itr->length;
831     }
832     if(itr->opcode == DEL){
833     sam_nm -= itr->length;
834     }
835     }
836     }
837    
838 gpertea 29 bool SAMHitFactory::get_hit_from_buf(const char* orig_bwt_buf,
839     BowtieHit& bh,
840     bool strip_slash,
841     char* name_out,
842     char* name_tags,
843     char* seq,
844     char* qual)
845     {
846     if (!orig_bwt_buf || !*orig_bwt_buf)
847     return false;
848     char bwt_buf[2048];
849     strcpy(bwt_buf, orig_bwt_buf);
850     // Are we still in the header region?
851     if (bwt_buf[0] == '@')
852     return false;
853    
854     char* buf = bwt_buf;
855     char* name = get_token((char**)&buf,"\t");
856     char* sam_flag_str = get_token((char**)&buf,"\t");
857     char* text_name = get_token((char**)&buf,"\t");
858     char* text_offset_str = get_token((char**)&buf,"\t");
859     const char* map_qual_str = get_token((char**)&buf,"\t");
860     char* cigar_str = get_token((char**)&buf,"\t");
861     const char* mate_ref_str = get_token((char**)&buf,"\t");
862     const char* mate_pos_str = get_token((char**)&buf,"\t");
863     const char* inferred_insert_sz_str = get_token((char**)&buf,"\t");
864    
865     const char* seq_str = get_token((char**)&buf,"\t");
866     if (seq)
867     strcpy(seq, seq_str);
868    
869     const char* qual_str = get_token((char**)&buf,"\t");
870     if (qual)
871     strcpy(qual, qual_str);
872    
873     if (!name ||
874     !sam_flag_str ||
875     !text_name ||
876     !text_offset_str ||
877     !map_qual_str ||
878     !cigar_str ||
879     !mate_ref_str ||
880     !mate_pos_str ||
881     !inferred_insert_sz_str ||
882     !seq_str ||
883     !qual_str)
884     {
885     // truncated or malformed SAM record
886     return false;
887     }
888    
889    
890     int sam_flag = atoi(sam_flag_str);
891     int text_offset = atoi(text_offset_str);
892    
893     bool end = true;
894     unsigned int seg_offset = 0;
895     unsigned int seg_num = 0;
896     unsigned int num_segs = 0;
897    
898     // Copy the tag out of the name field before we might wipe it out
899 gpertea 70 parseSegReadName(name, name_tags, strip_slash, end, seg_offset, seg_num, num_segs);
900 gpertea 29
901     vector<CigarOp> cigar;
902     bool spliced_alignment = false;
903 gpertea 70 if (parseCigar(cigar, cigar_str, spliced_alignment)) {
904     return false;
905     }
906 gpertea 29
907     //vector<string> attributes;
908     //tokenize(tag_buf, " \t",attributes);
909 gpertea 70
910 gpertea 29 bool antisense_splice = false;
911 gpertea 70 int num_mismatches = 0;
912     int sam_nm = 0; //the value of the NM tag (edit distance)
913     int mismatches[1024];
914     getSAMmismatches(buf, cigar, mismatches, num_mismatches, sam_nm, antisense_splice);
915 gpertea 29
916     if (spliced_alignment)
917     {
918     bh = create_hit(name,
919     text_name,
920     text_offset - 1,
921     cigar,
922     sam_flag & 0x0010,
923     antisense_splice,
924     num_mismatches,
925 gpertea 70 0,
926 gpertea 29 end);
927     }
928     else
929     {
930     //assert(cigar.size() == 1 && cigar[0].opcode == MATCH);
931     bh = create_hit(name,
932     text_name,
933     text_offset - 1, // SAM files are 1-indexed
934     cigar,
935     sam_flag & 0x0010,
936     false,
937     num_mismatches,
938     0,
939     end);
940     }
941 gpertea 70 return true;
942 gpertea 29 }
943    
944 gpertea 70
945     void spliceCigar(vector<CigarOp>& splcigar, vector<CigarOp>& cigar,
946     int8_t l_len, int mlen, int8_t r_len, CigarOpCode opcode) {
947     //merge the original 'cigar' with the new insert/gap operation
948     // at position l_len into splcigar;
949     //find the original opcode that croses l_len location
950     int ofs=0;
951     bool found=false;
952     for (size_t c = 0 ; c < cigar.size(); ++c)
953     {
954     ofs+=cigar[c].length;
955     if (!found) {
956     if (ofs>=l_len) {
957     found=true;
958 gpertea 97 //TODO:inject new code here, splitting existing code if necessary
959    
960     continue;
961 gpertea 70 }
962     else {
963     //not found yet, just copy these codes
964     splcigar.push_back(cigar[c]);
965     }
966     }
967 gpertea 97 else { //found already
968     //TODO: check this
969     splcigar.push_back(cigar[c]);
970 gpertea 70 }
971     }
972     }
973    
974 gpertea 69 bool SplicedSAMHitFactory::get_hit_from_buf(const char* orig_bwt_buf,
975     BowtieHit& bh,
976     bool strip_slash,
977     char* name_out,
978     char* name_tags,
979     char* seq,
980     char* qual)
981     {
982     if (!orig_bwt_buf || !*orig_bwt_buf)
983     return false;
984 gpertea 29
985 gpertea 69 char bwt_buf[2048];
986     strcpy(bwt_buf, orig_bwt_buf);
987     // Are we still in the header region?
988     if (bwt_buf[0] == '@')
989     return false;
990 gpertea 29
991 gpertea 69 char* buf = bwt_buf;
992     char* name = get_token((char**)&buf,"\t");
993     char* sam_flag_str = get_token((char**)&buf,"\t");
994     char* text_name = get_token((char**)&buf,"\t");
995     char* text_offset_str = get_token((char**)&buf,"\t");
996     const char* map_qual_str = get_token((char**)&buf,"\t");
997     char* cigar_str = get_token((char**)&buf,"\t");
998     const char* mate_ref_str = get_token((char**)&buf,"\t");
999     const char* mate_pos_str = get_token((char**)&buf,"\t");
1000     const char* inferred_insert_sz_str = get_token((char**)&buf,"\t");
1001 gpertea 70 int num_mismatches=0;
1002     int mismatches[1024]; //list of 0-based mismatch positions in this read
1003     //parsed from SAM's MD:Z: tag
1004 gpertea 69 const char* seq_str = get_token((char**)&buf,"\t");
1005     if (seq)
1006     strcpy(seq, seq_str);
1007    
1008     const char* qual_str = get_token((char**)&buf,"\t");
1009     if (qual)
1010     strcpy(qual, qual_str);
1011    
1012     if (!name ||
1013     !sam_flag_str ||
1014     !text_name ||
1015     !text_offset_str ||
1016     !map_qual_str ||
1017     !cigar_str ||
1018     !mate_ref_str ||
1019     !mate_pos_str ||
1020     !inferred_insert_sz_str ||
1021     !seq_str ||
1022     !qual_str)
1023     {
1024     // truncated or malformed SAM record
1025     return false;
1026     }
1027    
1028    
1029     int sam_flag = atoi(sam_flag_str);
1030     int text_offset = atoi(text_offset_str);
1031 gpertea 70 text_offset--; //SAM is 1-based, Bowtie is 0-based
1032 gpertea 69 bool end = true;
1033     unsigned int seg_offset = 0;
1034     unsigned int seg_num = 0;
1035     unsigned int num_segs = 0;
1036    
1037     // Copy the tag out of the name field before we might wipe it out
1038 gpertea 70 parseSegReadName(name, name_tags, strip_slash, end, seg_offset, seg_num, num_segs);
1039 gpertea 69
1040 gpertea 70 vector<CigarOp> samcigar;
1041     bool spliced_alignment = false;
1042     if (parseCigar(samcigar, cigar_str, spliced_alignment)) {
1043     return false;
1044     }
1045 gpertea 69
1046 gpertea 70 bool antisense_splice = false;
1047     int sam_nm = 0;
1048 gpertea 135 getSAMmismatches(buf, samcigar, mismatches, num_mismatches, sam_nm, antisense_splice);
1049 gpertea 69
1050 gpertea 70 //##############################################
1051 gpertea 69
1052     // Add this alignment to the table of hits for this half of the
1053     // Bowtie map
1054    
1055     // Parse the text_name field to recover the splice coords
1056     vector<string> toks;
1057    
1058     tokenize_strict(text_name, "|", toks);
1059    
1060     int num_extra_toks = (int)toks.size() - 6;
1061    
1062     if (num_extra_toks >= 0)
1063     {
1064     static const uint8_t left_window_edge_field = 1;
1065     static const uint8_t splice_field = 2;
1066     //static const uint8_t right_window_edge_field = 3;
1067     static const uint8_t junction_type_field = 4;
1068     static const uint8_t strand_field = 5;
1069    
1070     string contig = toks[0];
1071     for (int t = 1; t <= num_extra_toks; ++t)
1072     {
1073     contig += "|";
1074     contig += toks[t];
1075     }
1076    
1077     vector<string> splice_toks;
1078     tokenize(toks[num_extra_toks + splice_field], "-", splice_toks);
1079     if (splice_toks.size() != 2)
1080     {
1081     fprintf(stderr, "Warning: found malformed splice record, skipping:\n");
1082 gpertea 70 //fprintf(stderr, "\t%s (token: %s)\n", text_name,
1083 gpertea 69 // toks[num_extra_toks + splice_field].c_str());
1084     return false;
1085     }
1086 gpertea 70 int spl_num_mismatches=0;
1087 gpertea 69 //
1088     // check for an insertion hit
1089     //
1090     if(toks[num_extra_toks + junction_type_field] == "ins")
1091     {
1092     int8_t spliced_read_len = strlen(seq_str);
1093 gpertea 70 // The 0-based position of the left edge of the alignment. Note that this
1094     // value may need to be futher corrected to account for the presence of
1095     // of the insertion.
1096 gpertea 69 uint32_t left = atoi(toks[num_extra_toks + left_window_edge_field].c_str()) + text_offset;
1097     uint32_t right = left + spliced_read_len - 1;
1098    
1099 gpertea 70 // The 0-based position of the last genomic sequence before the insertion
1100 gpertea 69 uint32_t left_splice_pos = atoi(splice_toks[0].c_str());
1101    
1102     string insertedSequence = splice_toks[1];
1103 gpertea 70 // The 0-based position of the first genomic sequence after the insertion
1104    
1105 gpertea 69 uint32_t right_splice_pos = left_splice_pos + 1;
1106     if(left > left_splice_pos){
1107     /*
1108     * The genomic position of the left edge of the alignment needs to be corrected
1109     * If the alignment does not extend into the insertion, simply subtract the length
1110     * of the inserted sequence, otherwise, just set it equal to the right edge
1111     */
1112     left = left - insertedSequence.length();
1113     if(left < right_splice_pos){
1114     left = right_splice_pos;
1115     }
1116     }
1117     if(right > left_splice_pos){
1118     right = right - insertedSequence.length();
1119     if(right < left_splice_pos){
1120     right = left_splice_pos;
1121     }
1122     }
1123     /*
1124     * Now, right and left should be properly transformed into genomic coordinates
1125     * We should be able to deduce how much the alignment matches the insertion
1126     * simply based on the length of the read
1127     */
1128     int left_match_length = 0;
1129     if(left <= left_splice_pos){
1130     left_match_length = left_splice_pos - left + 1;
1131     }
1132     int right_match_length = 0;
1133     if(right >= right_splice_pos){
1134     right_match_length = right - right_splice_pos + 1;
1135     }
1136     int insertion_match_length = spliced_read_len - left_match_length - right_match_length;
1137    
1138     if(left_match_length <= 0 || right_match_length <= 0 || insertion_match_length <= 0)
1139     return false;
1140    
1141     string junction_strand = toks[num_extra_toks + strand_field];
1142     if(junction_strand != "rev" && junction_strand != "fwd"){
1143     fprintf(stderr, "Malformed insertion record\n");
1144     return false;
1145     }
1146    
1147 gpertea 70 //char* pch = strtok( mismatches, ",");
1148     //unsigned char num_mismatches = 0;
1149     //text_offset holds the left end of the alignment,
1150     //RELATIVE TO the start of the contig
1151 gpertea 69
1152 gpertea 70 //The 0-based relative position of the left-most character
1153     //before the insertion in the contig
1154 gpertea 69 int relative_splice_pos = left_splice_pos - atoi(toks[num_extra_toks + left_window_edge_field].c_str());
1155 gpertea 70 for (int i=0;i<num_mismatches;i++) {
1156     int mismatch_pos = mismatches[i];
1157     // for reversely mapped reads,
1158     //find the correct mismatched position.
1159     if (sam_flag & 0x0010){
1160     mismatch_pos = spliced_read_len - mismatch_pos - 1;
1161     }
1162 gpertea 69
1163 gpertea 70 //Only count mismatches outside of the insertion region
1164     // If there is a mismatch within the insertion,
1165     // disallow this hit
1166     if(mismatch_pos + text_offset <= relative_splice_pos || mismatch_pos + text_offset > relative_splice_pos + insertedSequence.length()){
1167     spl_num_mismatches++;
1168     }else{
1169     return false;
1170 gpertea 69 }
1171     }
1172    
1173 gpertea 135 //TODO: merge with the original cigar string
1174 gpertea 70 vector<CigarOp> splcigar;
1175     spliceCigar(splcigar, samcigar, left_match_length, insertion_match_length, right_match_length, INS);
1176 gpertea 69 /*
1177 gpertea 70 splcigar.push_back(CigarOp(MATCH, left_match_length));
1178     splcigar.push_back(CigarOp(INS, insertion_match_length));
1179     splcigar.push_back(CigarOp(MATCH, right_match_length));
1180     */
1181     /*
1182 gpertea 69 * For now, disallow hits that don't span
1183     * the insertion. If we allow these types of hits,
1184     * then long_spanning.cpp needs to be updated
1185     * in order to intelligently merge these kinds
1186     * of reads back together
1187     *
1188     * Following code has been changed to allow segment that end
1189     * in an insertion
1190     */
1191     bh = create_hit(name,
1192     contig,
1193     left,
1194 gpertea 70 //splcigar,
1195     splcigar,
1196     sam_flag & 0x0010,
1197 gpertea 69 junction_strand == "rev",
1198 gpertea 70 spl_num_mismatches,
1199 gpertea 69 0,
1200     end);
1201     return true;
1202 gpertea 70 } //"ins"
1203 gpertea 69 else
1204     {
1205     uint32_t left = atoi(toks[num_extra_toks + left_window_edge_field].c_str()) + text_offset;
1206     int spliced_read_len = strlen(seq_str);
1207     int8_t left_splice_pos = atoi(splice_toks[0].c_str()) - left + 1;
1208     if(left_splice_pos > spliced_read_len) left_splice_pos = spliced_read_len;
1209     int8_t right_splice_pos = spliced_read_len - left_splice_pos;
1210    
1211     if (right_splice_pos <= 0 || left_splice_pos <= 0)
1212     return false;
1213    
1214     if ((sam_flag & 0x0010) == 0) //######
1215     {
1216 gpertea 135 if (left_splice_pos + seg_offset < _anchor_length)
1217     return false;
1218     }
1219     else
1220     {
1221     if (right_splice_pos + seg_offset < _anchor_length)
1222     return false;
1223     }
1224 gpertea 69 //uint32_t right = atoi(splice_toks[1].c_str()) + right_splice_pos;
1225     //atoi(toks[right_window_edge_field].c_str());
1226     int gap_len = atoi(splice_toks[1].c_str()) - atoi(splice_toks[0].c_str()) - 1;
1227    
1228     string junction_strand = toks[num_extra_toks + strand_field];
1229 gpertea 70 if (!(junction_strand == "rev" || junction_strand == "fwd"))
1230     // || !(orientation == '-' || orientation == '+'))
1231 gpertea 69 {
1232 gpertea 135 fprintf(stderr, "Warning: found malformed splice record, skipping\n");
1233     //fprintf(stderr, "junction_strand=%s, orientation='%c'\n",
1234     // junction_strand.c_str(), orientation);
1235     return false;
1236 gpertea 69 }
1237    
1238     int mismatches_in_anchor = 0;
1239 gpertea 70 for (int i=0;i<num_mismatches;i++) {
1240     spl_num_mismatches++;
1241     int mismatch_pos = mismatches[i];
1242     if (((sam_flag & 0x0010) == 0 && abs(mismatch_pos - left_splice_pos) < (int)min_anchor_len) ||
1243     ((sam_flag & 0x0010) != 0 &&
1244     abs(((int)spliced_read_len - left_splice_pos + 1) - mismatch_pos)) < (int)min_anchor_len)
1245     mismatches_in_anchor++;
1246     }
1247 gpertea 69
1248     // FIXME: we probably should exclude these hits somewhere, but this
1249     // isn't the right place
1250 gpertea 70 vector<CigarOp> splcigar;
1251     CigarOpCode opcode=(toks[num_extra_toks + junction_type_field] == "del")? DEL : REF_SKIP ;
1252 gpertea 135 spliceCigar(splcigar, samcigar, left_splice_pos, gap_len, right_splice_pos, opcode);
1253 gpertea 70 /*
1254     splcigar.push_back(CigarOp(MATCH, left_splice_pos));
1255 gpertea 69 if(toks[num_extra_toks + junction_type_field] == "del"){
1256 gpertea 70 splcigar.push_back(CigarOp(DEL, gap_len));
1257 gpertea 69 }else{
1258 gpertea 70 splcigar.push_back(CigarOp(REF_SKIP, gap_len));
1259 gpertea 69 }
1260 gpertea 70 splcigar.push_back(CigarOp(MATCH, right_splice_pos));
1261     */
1262 gpertea 69 bh = create_hit(name,
1263     contig,
1264     left,
1265 gpertea 70 splcigar,
1266     (sam_flag & 0x0010),
1267 gpertea 69 junction_strand == "rev",
1268 gpertea 70 spl_num_mismatches,
1269 gpertea 69 mismatches_in_anchor,
1270     end);
1271     return true;
1272     }
1273 gpertea 70 } //parse splice data
1274 gpertea 69 else
1275     {
1276     fprintf(stderr, "Warning: found malformed splice record, skipping\n");
1277     //fprintf(stderr, "%s\n", orig_bwt_buf);
1278     // continue;
1279     return false;
1280     }
1281    
1282     return false;
1283     }
1284    
1285    
1286 gpertea 135
1287 gpertea 29 bool BAMHitFactory::get_hit_from_buf(const char* orig_bwt_buf,
1288     BowtieHit& bh, bool strip_slash,
1289     char* name_out, char* name_tags,
1290     char* seq, char* qual) {
1291     if (_sam_header==NULL)
1292     err_die("Error: no SAM header when BAMHitFactory::get_hit_from_buf()!");
1293     const bam1_t* hit_buf = (const bam1_t*)orig_bwt_buf;
1294    
1295    
1296     uint32_t sam_flag = hit_buf->core.flag;
1297    
1298     int text_offset = hit_buf->core.pos;
1299     int text_mate_pos = hit_buf->core.mpos;
1300     int target_id = hit_buf->core.tid;
1301     int mate_target_id = hit_buf->core.mtid;
1302    
1303     vector<CigarOp> cigar;
1304     bool spliced_alignment = false;
1305     int num_hits = 1;
1306    
1307     double mapQ = hit_buf->core.qual;
1308     long double error_prob;
1309     if (mapQ > 0)
1310     {
1311     long double p = (-1.0 * mapQ) / 10.0;
1312     error_prob = pow(10.0L, p);
1313     }
1314     else
1315     {
1316     error_prob = 1.0;
1317     }
1318    
1319     //header->target_name[c->tid]
1320    
1321     bool end = true;
1322     unsigned int seg_offset = 0;
1323     unsigned int seg_num = 0;
1324     unsigned int num_segs = 0;
1325     // Copy the tag out of the name field before we might wipe it out
1326     char* qname = bam1_qname(hit_buf);
1327     char* pipe = strrchr(qname, '|');
1328     if (pipe)
1329     {
1330     if (name_tags)
1331     strcpy(name_tags, pipe);
1332    
1333     char* tag_buf = pipe + 1;
1334     if (strchr(tag_buf, ':'))
1335     {
1336     sscanf(tag_buf, "%u:%u:%u", &seg_offset, &seg_num, &num_segs);
1337     if (seg_num + 1 == num_segs)
1338     end = true;
1339     else
1340     end = false;
1341     }
1342    
1343     *pipe = 0;
1344     }
1345    
1346    
1347     if (target_id < 0) {
1348     //assert(cigar.size() == 1 && cigar[0].opcode == MATCH);
1349     bh = create_hit(qname,
1350     "*", //ref_name
1351     0, //left coord
1352     0, //read_len
1353     false, //antisense_aln
1354     0, //edit_dist
1355     end);
1356     return true;
1357     }
1358    
1359     //string text_name = ((samfile_t*)(hs._hit_file))->header->target_name[target_id];
1360     string text_name = _sam_header->target_name[target_id];
1361     for (int i = 0; i < hit_buf->core.n_cigar; ++i)
1362     {
1363     //char* t;
1364    
1365     int length = bam1_cigar(hit_buf)[i] >> BAM_CIGAR_SHIFT;
1366     if (length <= 0)
1367     {
1368     fprintf (stderr, "BAM error: CIGAR op has zero length\n");
1369     return false;
1370     }
1371    
1372     CigarOpCode opcode;
1373     switch(bam1_cigar(hit_buf)[i] & BAM_CIGAR_MASK)
1374     {
1375     case BAM_CMATCH: opcode = MATCH; break;
1376     case BAM_CINS: opcode = INS; break;
1377     case BAM_CDEL: opcode = DEL; break;
1378     case BAM_CSOFT_CLIP: opcode = SOFT_CLIP; break;
1379     case BAM_CHARD_CLIP: opcode = HARD_CLIP; break;
1380     case BAM_CPAD: opcode = PAD; break;
1381     case BAM_CREF_SKIP:
1382     opcode = REF_SKIP;
1383     spliced_alignment = true;
1384     if (length > (int)max_report_intron_length)
1385     {
1386     //fprintf(stderr, "Encounter REF_SKIP > max_gene_length, skipping\n");
1387     return false;
1388     }
1389     break;
1390     default:
1391     fprintf (stderr, "BAM read: invalid CIGAR operation\n");
1392     return false;
1393     }
1394     if (opcode != HARD_CLIP)
1395     cigar.push_back(CigarOp(opcode, length));
1396     }
1397    
1398     string mrnm;
1399     if (mate_target_id >= 0) {
1400     if (mate_target_id == target_id) {
1401     //mrnm = ((samfile_t*)(hs._hit_file))->header->target_name[mate_target_id];
1402     mrnm = _sam_header->target_name[mate_target_id];
1403     }
1404     else {
1405     //fprintf(stderr, "Trans-spliced mates are not currently supported, skipping\n");
1406     return false;
1407     }
1408     }
1409     else {
1410     text_mate_pos = 0;
1411     }
1412     //CuffStrand source_strand = CUFF_STRAND_UNKNOWN;
1413     bool antisense_splice=false;
1414     unsigned char num_mismatches = 0;
1415     unsigned char num_splice_anchor_mismatches = 0;
1416    
1417     uint8_t* ptr = bam_aux_get(hit_buf, "XS");
1418     if (ptr) {
1419     char src_strand_char = bam_aux2A(ptr);
1420     if (src_strand_char == '-')
1421     antisense_splice = true;
1422     // else if (src_strand_char == '+')
1423     // source_strand = CUFF_FWD;
1424     }
1425    
1426     ptr = bam_aux_get(hit_buf, "NM");
1427     if (ptr) {
1428     num_mismatches = bam_aux2i(ptr);
1429     }
1430    
1431     ptr = bam_aux_get(hit_buf, "NH");
1432     if (ptr) {
1433     num_hits = bam_aux2i(ptr);
1434     }
1435    
1436     //bool antisense_aln = bam1_strand(hit_buf);
1437    
1438     //if (_rg_props.strandedness() == STRANDED_PROTOCOL && source_strand == CUFF_STRAND_UNKNOWN)
1439     // source_strand = use_stranded_protocol(sam_flag, antisense_aln, _rg_props.mate_strand_mapping());
1440     if (spliced_alignment) {
1441     //if (source_strand == CUFF_STRAND_UNKNOWN) {
1442     // fprintf(stderr, "BAM record error: found spliced alignment without XS attribute\n");
1443     // }
1444     bh = create_hit(qname,
1445     text_name,
1446     text_offset, // BAM files are 0-indexed
1447     cigar,
1448     sam_flag & 0x0010,
1449     antisense_splice,
1450     num_mismatches,
1451     num_splice_anchor_mismatches,
1452     end);
1453    
1454     }
1455     else {
1456     //assert(_rg_props.strandedness() == STRANDED_PROTOCOL || source_strand == CUFF_STRAND_UNKNOWN);
1457     //assert(cigar.size() == 1 && cigar[0].opcode == MATCH);
1458     bh = create_hit(qname,
1459     text_name,
1460     text_offset, // BAM files are 0-indexed
1461     cigar,
1462     sam_flag & 0x0010,
1463     false,
1464     num_mismatches,
1465     0,
1466     end);
1467     }
1468     if (seq!=NULL) {
1469     char *bseq = (char*)bam1_seq(hit_buf);
1470     for(int i=0;i<(hit_buf->core.l_qseq);i++) {
1471     char v = bam1_seqi(bseq,i);
1472     seq[i]=bam_nt16_rev_table[v];
1473     }
1474     seq[hit_buf->core.l_qseq]=0;
1475     }
1476     if (qual!=NULL) {
1477     char *bq = (char*)bam1_qual(hit_buf);
1478     for(int i=0;i<(hit_buf->core.l_qseq);i++) {
1479     qual[i]=bq[i]+33;
1480     }
1481     qual[hit_buf->core.l_qseq]=0;
1482     }
1483     return true;
1484     }
1485    
1486     bool BAMHitFactory::inspect_header(HitStream& hs)
1487     {
1488     bam_header_t* header = ((samfile_t*)(hs._hit_file))->header;
1489    
1490     if (header == NULL) {
1491     fprintf(stderr, "Warning: No BAM header\n");
1492     return false;
1493     }
1494     if (header->l_text == 0) {
1495     fprintf(stderr, "Warning: BAM header has 0 length or is corrupted. Try using 'samtools reheader'.\n");
1496     return false;
1497     }
1498     return true;
1499     }
1500    
1501    
1502     void get_mapped_reads(FILE* bwtf,
1503     HitTable& hits,
1504     HitFactory& hit_factory,
1505     bool strip_slash,
1506     bool verbose)
1507     {
1508    
1509    
1510     char bwt_buf[2048];
1511     uint32_t reads_extracted = 0;
1512    
1513     while (fgets(bwt_buf, 2048, bwtf))
1514     {
1515     // Chomp the newline
1516     char* nl = strrchr(bwt_buf, '\n');
1517     if (nl) *nl = 0;
1518     if (*bwt_buf == 0)
1519     continue;
1520     // Get a new record from the tab-delimited Bowtie map
1521     BowtieHit bh;
1522     if (hit_factory.get_hit_from_buf(bwt_buf, bh, strip_slash))
1523     {
1524     // Only check uniqueness if these hits are spliced
1525     hits.add_hit(bh, true);
1526     }
1527     reads_extracted++;
1528     }
1529    
1530     // This will sort the map by insert id.
1531     hits.finalize();
1532     fprintf(stderr, "Extracted %d alignments from Bowtie map\n", reads_extracted);
1533     }
1534    
1535    
1536     /*
1537     AlignStatus status(const BowtieHit* align)
1538     {
1539     if (!align)
1540     return UNALIGNED;
1541     if (align->contiguous())
1542     return CONTIGUOUS;
1543     return SPLICED;
1544     }
1545     */
1546    
1547     void add_hits_to_coverage(const HitList& hits, vector<unsigned short>& DoC)
1548     {
1549     int max_hit_pos = -1;
1550     for (size_t i = 0; i < hits.size(); ++i)
1551     {
1552     max_hit_pos = max((int)hits[i].right(),max_hit_pos);
1553     }
1554    
1555     if ((int)DoC.size() < max_hit_pos)
1556     DoC.resize(max_hit_pos);
1557    
1558     for (size_t i = 0; i < hits.size(); ++i)
1559     {
1560     const BowtieHit& bh = hits[i];
1561    
1562     // split up the coverage contibution for this reads
1563     size_t j = bh.left();
1564     const vector<CigarOp>& cigar = bh.cigar();
1565    
1566     for (size_t c = 0 ; c < cigar.size(); ++c)
1567     {
1568     switch(cigar[c].opcode)
1569     {
1570     case MATCH:
1571     for (size_t m = 0; m < cigar[c].length; ++m)
1572     {
1573     if (DoC[j + m] < 0xFFFF)
1574     DoC[j + m]++;
1575     }
1576     //fall through this case to REF_SKIP is intentional
1577     case REF_SKIP:
1578     j += cigar[c].length;
1579     break;
1580     default:
1581     break;
1582     }
1583    
1584     }
1585     }
1586     }
1587    
1588     void add_hit_to_coverage(const BowtieHit& bh, vector<unsigned int>& DoC)
1589     {
1590     if ((int)DoC.size() < bh.right())
1591     DoC.resize(bh.right());
1592    
1593     // split up the coverage contibution for this reads
1594     size_t j = bh.left();
1595     const vector<CigarOp>& cigar = bh.cigar();
1596    
1597     for (size_t c = 0 ; c < cigar.size(); ++c)
1598     {
1599     switch(cigar[c].opcode)
1600     {
1601     case MATCH:
1602     for (size_t m = 0; m < cigar[c].length; ++m)
1603     {
1604 gpertea 135 if (DoC[j + m] < VMAXINT32)
1605 gpertea 29 DoC[j + m]++;
1606     }
1607     //fall through this case to REF_SKIP is intentional
1608     case REF_SKIP:
1609     j += cigar[c].length;
1610     break;
1611     default:
1612     break;
1613     }
1614    
1615     }
1616     }
1617    
1618     void print_hit(FILE* fout,
1619     const char* read_name,
1620     const BowtieHit& bh,
1621     const char* ref_name,
1622     const char* sequence,
1623     const char* qualities,
1624     bool from_bowtie)
1625     {
1626     string seq;
1627     string quals;
1628     if (sequence)
1629     {
1630     seq = sequence;
1631     quals = qualities;
1632     seq.resize(bh.read_len());
1633     quals.resize(bh.read_len());
1634     }
1635     else
1636     {
1637     seq = "*";
1638     }
1639    
1640     if (qualities)
1641     {
1642     quals = qualities;
1643     quals.resize(bh.read_len());
1644     }
1645     else
1646     {
1647     quals = "*";
1648     }
1649    
1650     uint32_t sam_flag = 0;
1651     if (bh.antisense_align())
1652     {
1653     sam_flag |= 0x0010; // BAM_FREVERSE
1654     if (sequence && !from_bowtie) // if it is from bowtie hit, it's already reversed.
1655     {
1656     reverse_complement(seq);
1657     reverse(quals.begin(), quals.end());
1658     }
1659     }
1660    
1661     uint32_t sam_pos = bh.left() + 1;
1662     uint32_t map_quality = 255;
1663     char cigar[256];
1664     cigar[0] = 0;
1665     string mate_ref_name = "*";
1666     uint32_t mate_pos = 0;
1667     uint32_t insert_size = 0;
1668     //string qualities = "*";
1669    
1670     const vector<CigarOp>& bh_cigar = bh.cigar();
1671    
1672     /*
1673     * In addition to calculating the cigar string,
1674     * we need to figure out how many in/dels are in the
1675     * sequence, so that we can give the correct
1676     * value for the NM tag
1677     */
1678     int indel_distance = 0;
1679     for (size_t c = 0; c < bh_cigar.size(); ++c)
1680     {
1681     char ibuf[64];
1682     sprintf(ibuf, "%d", bh_cigar[c].length);
1683     switch(bh_cigar[c].opcode)
1684     {
1685     case MATCH:
1686     strcat(cigar, ibuf);
1687     strcat(cigar, "M");
1688     break;
1689     case INS:
1690     strcat(cigar, ibuf);
1691     strcat(cigar, "I");
1692     indel_distance += bh_cigar[c].length;
1693     break;
1694     case DEL:
1695     strcat(cigar, ibuf);
1696     strcat(cigar, "D");
1697     indel_distance += bh_cigar[c].length;
1698     break;
1699     case REF_SKIP:
1700     strcat(cigar, ibuf);
1701     strcat(cigar, "N");
1702     break;
1703     default:
1704     break;
1705     }
1706     }
1707    
1708     //string q = string(bh.read_len(), '!');
1709     //string s = string(bh.read_len(), 'N');
1710    
1711     fprintf(fout,
1712     "%s\t%d\t%s\t%d\t%d\t%s\t%s\t%d\t%d\t%s\t%s",
1713     read_name,
1714     sam_flag,
1715     ref_name,
1716     sam_pos,
1717     map_quality,
1718     cigar,
1719     mate_ref_name.c_str(),
1720     mate_pos,
1721     insert_size,
1722     seq.c_str(),
1723     quals.c_str());
1724    
1725     if (!sam_readgroup_id.empty())
1726     fprintf(fout, "\tRG:Z:%s", sam_readgroup_id.c_str());
1727    
1728     fprintf(fout, "\tNM:i:%d", bh.edit_dist() + indel_distance);
1729    
1730     bool containsSplice = false;
1731     for(vector<CigarOp>::const_iterator itr = bh.cigar().begin(); itr != bh.cigar().end(); ++itr){
1732     if(itr->opcode == REF_SKIP){
1733     containsSplice = true;
1734     break;
1735     }
1736     }
1737    
1738     if (containsSplice)
1739     fprintf(fout, "\tXS:A:%c", bh.antisense_splice() ? '-' : '+');
1740    
1741     fprintf(fout, "\n");
1742     }
1743    
1744     void print_bamhit(GBamWriter& wbam,
1745     const char* read_name,
1746     const BowtieHit& bh,
1747     const char* ref_name,
1748     const char* sequence,
1749     const char* qualities,
1750     bool from_bowtie)
1751     {
1752     string seq;
1753     string quals;
1754     if (sequence) {
1755     seq = sequence;
1756     quals = qualities;
1757     seq.resize(bh.read_len());
1758     quals.resize(bh.read_len());
1759     }
1760     else {
1761     seq = "*";
1762     }
1763     if (qualities) {
1764     quals = qualities;
1765     quals.resize(bh.read_len());
1766     }
1767     else
1768     {
1769     quals = "*";
1770     }
1771    
1772     uint32_t sam_flag = 0;
1773     if (bh.antisense_align())
1774     {
1775     sam_flag |= 0x0010; // BAM_FREVERSE
1776     if (sequence && !from_bowtie) // if it is from bowtie hit, it's already reversed.
1777     {
1778     reverse_complement(seq);
1779     reverse(quals.begin(), quals.end());
1780     }
1781     }
1782    
1783     uint32_t sam_pos = bh.left() + 1;
1784     uint32_t map_quality = 255;
1785     char cigar[256];
1786     cigar[0] = 0;
1787     string mate_ref_name = "*";
1788     uint32_t mate_pos = 0;
1789     uint32_t insert_size = 0;
1790     //string qualities = "*";
1791    
1792     const vector<CigarOp>& bh_cigar = bh.cigar();
1793     /*
1794     * In addition to calculating the cigar string,
1795     * we need to figure out how many in/dels are in the
1796     * sequence, so that we can give the correct
1797     * value for the NM tag
1798     */
1799     int indel_distance = 0;
1800     for (size_t c = 0; c < bh_cigar.size(); ++c)
1801     {
1802     char ibuf[64];
1803     sprintf(ibuf, "%d", bh_cigar[c].length);
1804     switch(bh_cigar[c].opcode)
1805     {
1806     case MATCH:
1807     strcat(cigar, ibuf);
1808     strcat(cigar, "M");
1809     break;
1810     case INS:
1811     strcat(cigar, ibuf);
1812     strcat(cigar, "I");
1813     indel_distance += bh_cigar[c].length;
1814     break;
1815     case DEL:
1816     strcat(cigar, ibuf);
1817     strcat(cigar, "D");
1818     indel_distance += bh_cigar[c].length;
1819     break;
1820     case REF_SKIP:
1821     strcat(cigar, ibuf);
1822     strcat(cigar, "N");
1823     break;
1824     default:
1825     break;
1826     }
1827     }
1828    
1829     //string q = string(bh.read_len(), '!');
1830     //string s = string(bh.read_len(), 'N');
1831     /*
1832     fprintf(fout,
1833     "%s\t%d\t%s\t%d\t%d\t%s\t%s\t%d\t%d\t%s\t%s",
1834     read_name,
1835     sam_flag,
1836     ref_name,
1837     sam_pos,
1838     map_quality,
1839     cigar,
1840     mate_ref_name.c_str(),
1841     mate_pos,
1842     insert_size,
1843     seq.c_str(),
1844     quals.c_str());
1845    
1846     fprintf(fout, "\tNM:i:%d", bh.edit_dist() + indel_distance);
1847     */
1848     vector<string> auxdata;
1849    
1850     if (!sam_readgroup_id.empty())
1851     {
1852     string nm("RG:Z:");
1853     nm += sam_readgroup_id;
1854     auxdata.push_back(nm);
1855     }
1856    
1857     string nm("NM:i:");
1858     str_appendInt(nm, bh.edit_dist() + indel_distance);
1859     auxdata.push_back(nm);
1860     bool containsSplice = false;
1861     for(vector<CigarOp>::const_iterator itr = bh.cigar().begin(); itr != bh.cigar().end(); ++itr)
1862     if(itr->opcode == REF_SKIP){
1863     containsSplice = true;
1864     break;
1865     }
1866    
1867     if (containsSplice) {
1868     //fprintf(fout, "\tXS:A:%c", bh.antisense_splice() ? '-' : '+');
1869     nm="XS:A:";
1870     nm+=(char)(bh.antisense_splice() ? '-' : '+');
1871     auxdata.push_back(nm);
1872     }
1873    
1874     GBamRecord *brec = wbam.new_record(read_name, sam_flag, ref_name, sam_pos, map_quality,
1875     cigar, mate_ref_name.c_str(), mate_pos,
1876     insert_size, seq.c_str(), quals.c_str(), &auxdata);
1877    
1878    
1879    
1880     wbam.write(brec);
1881     delete brec;
1882     }
1883    
1884     /**
1885     * Print a vector of cigar operations to a file.
1886     * @param bh_cigar A vector of CigarOps
1887     * @return a string representation of the cigar string
1888     */
1889     std::string print_cigar(vector<CigarOp>& bh_cigar){
1890     char cigar[256];
1891     cigar[0] = 0;
1892     for (size_t c = 0; c < bh_cigar.size(); ++c)
1893     {
1894     char ibuf[64];
1895     sprintf(ibuf, "%d", bh_cigar[c].length);
1896     switch(bh_cigar[c].opcode)
1897     {
1898     case MATCH:
1899     strcat(cigar, ibuf);
1900     strcat(cigar, "M");
1901     break;
1902     case INS:
1903     strcat(cigar, ibuf);
1904     strcat(cigar, "I");
1905     break;
1906     case DEL:
1907     strcat(cigar, ibuf);
1908     strcat(cigar, "D");
1909     break;
1910     case REF_SKIP:
1911     strcat(cigar, ibuf);
1912     strcat(cigar, "N");
1913     break;
1914     default:
1915     break;
1916     }
1917     }
1918     string result(cigar);
1919     return result;
1920     }
1921    
1922     bool BowtieHit::check_editdist_consistency(const RefSequenceTable& rt)
1923     {
1924     RefSequenceTable::Sequence* ref_str = rt.get_seq(_ref_id);
1925     if (!ref_str)
1926     return false;
1927    
1928     const seqan::Dna5String ref_seq = seqan::infix(*ref_str, _left, right());
1929    
1930     size_t pos_seq = 0;
1931     size_t pos_ref = 0;
1932     size_t mismatch = 0;
1933     for (size_t i = 0; i < _cigar.size(); ++i)
1934     {
1935     CigarOp cigar = _cigar[i];
1936     switch(cigar.opcode)
1937     {
1938     case MATCH:
1939     {
1940     for (size_t j = 0; j < cigar.length; ++j)
1941     {
1942     if (_seq[pos_seq++] != ref_seq[pos_ref++])
1943     ++mismatch;
1944     }
1945     }
1946     break;
1947     case INS:
1948     {
1949     pos_seq += cigar.length;
1950     }
1951     break;
1952    
1953     case DEL:
1954     case REF_SKIP:
1955     {
1956     pos_ref += cigar.length;
1957     }
1958     break;
1959    
1960     default:
1961     break;
1962     }
1963     }
1964    
1965     return mismatch == _edit_dist;
1966     }