ViewVC Help
View File | Revision Log | Show Annotations | View Changeset | Root Listing
root/gclib/tophat_cpp/tophat_reports.cpp
(Generate patch)
# Line 56 | Line 56
56   void get_seqs(istream& ref_stream,
57                RefSequenceTable& rt,
58                bool keep_seqs = true)
59 < {
59 > {    
60    while(ref_stream.good() && !ref_stream.eof())
61      {
62        RefSequenceTable::Sequence* ref_str = new RefSequenceTable::Sequence();
# Line 68 | Line 68
68            name.resize(space_pos);
69          }
70        seqan::read(ref_stream, *ref_str, Fasta());
71 <
71 >      
72        rt.get_id(name, keep_seqs ? ref_str : NULL, 0);
73      }
74   }
75  
76 + struct cmp_read_alignment
77 + {
78 +  cmp_read_alignment(const JunctionSet& gtf_juncs) :
79 +    _gtf_juncs(&gtf_juncs) {}
80 +    
81 +  bool operator() (const BowtieHit& l, const BowtieHit& r) const
82 +  {
83 +    FragmentAlignmentGrade gl(l, *_gtf_juncs);
84 +    FragmentAlignmentGrade gr(r, *_gtf_juncs);
85 +    bool better = gr < gl;
86 +    bool worse = gl < gr;
87 +
88 +    if (better && !worse)
89 +      return true;
90 +    else        
91 +      return false;
92 +  }
93 +
94 +  const JunctionSet* _gtf_juncs;
95 + };
96 +
97   void read_best_alignments(const HitsForRead& hits_for_read,
98 <                              FragmentAlignmentGrade& best_grade,
99 <                              HitsForRead& best_hits,
100 <                              const JunctionSet& gtf_junctions)
98 >                          FragmentAlignmentGrade& best_grade,
99 >                          HitsForRead& best_hits,
100 >                          const JunctionSet& gtf_junctions)
101   {
102    const vector<BowtieHit>& hits = hits_for_read.hits;
103    for (size_t i = 0; i < hits.size(); ++i)
104      {
105 <      if (hits[i].edit_dist()>max_read_mismatches) continue;
106 <
86 <      FragmentAlignmentGrade g(hits[i], gtf_junctions);
105 >      if (hits[i].edit_dist() > max_read_mismatches)
106 >        continue;
107  
108 <      // Is the new status better than the current best one?
109 <      if (best_grade < g)
110 <      {
111 <        if (!report_secondary_alignments)
112 <          best_hits.hits.clear();
108 >      if (report_secondary_alignments)
109 >        {
110 >          best_hits.hits.push_back(hits[i]);
111 >        }
112 >      else
113 >        {
114 >          FragmentAlignmentGrade g(hits[i], gtf_junctions);
115 >          
116 >          // Is the new status better than the current best one?
117 >          if (best_grade < g)
118 >            {
119 >              best_hits.hits.clear();
120 >              best_grade = g;
121 >              best_hits.hits.push_back(hits[i]);
122 >            }
123 >          else if (!(g < best_grade)) // is it just as good?
124 >            {
125 >              best_grade.num_alignments++;
126 >              best_hits.hits.push_back(hits[i]);
127 >            }
128 >        }
129 >    }
130  
131 <        best_grade = g;
132 <        best_hits.hits.push_back(hits[i]);
133 <      }
134 <      else if (!(g < best_grade)) // is it just as good?
135 <      {
136 <        best_grade.num_alignments++;
100 <        best_hits.hits.push_back(hits[i]);
101 <      }
131 >  if (report_secondary_alignments && best_hits.hits.size() > 0)
132 >    {
133 >      cmp_read_alignment cmp(gtf_junctions);
134 >      sort(best_hits.hits.begin(), best_hits.hits.end(), cmp);
135 >      best_grade = FragmentAlignmentGrade(best_hits.hits[0], gtf_junctions);
136 >      best_grade.num_alignments = best_hits.hits.size();
137      }
138   }
139  
140 < void pair_best_alignments(const HitsForRead& left_hits,
106 <                          const HitsForRead& right_hits,
107 <                          InsertAlignmentGrade& best_grade,
108 <                          HitsForRead& left_best_hits,
109 <                          HitsForRead& right_best_hits)
140 > void set_insert_alignment_grade(const BowtieHit& lh, const BowtieHit& rh, InsertAlignmentGrade& grade)
141   {
142    // max mate inner distance (genomic)
143    int min_mate_inner_dist = inner_dist_mean - inner_dist_std_dev;
144    if (max_mate_inner_dist == -1)
145      max_mate_inner_dist = inner_dist_mean + inner_dist_std_dev;
146 +  
147 +  bool fusion = false;
148 +  if (fusion_search)
149 +    {
150 +      bool left_fusion = lh.fusion_opcode() != FUSION_NOTHING;
151 +      bool right_fusion = rh.fusion_opcode() != FUSION_NOTHING;
152 +      if (left_fusion && right_fusion)
153 +        return;
154 +      
155 +      fusion = left_fusion || right_fusion;
156 +      if (!fusion && lh.ref_id() != rh.ref_id())
157 +        fusion = true;
158 +      
159 +      if (!fusion && lh.ref_id() == rh.ref_id())
160 +        {
161 +          if (lh.antisense_align() == rh.antisense_align())
162 +            fusion = true;
163 +          else
164 +            {
165 +              int inter_dist = 0;
166 +              if (lh.antisense_align())
167 +                inter_dist = lh.left() - rh.right();
168 +              else
169 +                inter_dist = rh.left() - lh.right();
170 +              
171 +              if (inter_dist < -(int)max_insertion_length || inter_dist > (int)fusion_min_dist)
172 +                fusion = true;
173 +            }
174 +        }
175 +    }
176 +
177 +  grade = InsertAlignmentGrade(lh, rh, min_mate_inner_dist, max_mate_inner_dist, fusion);
178 + }
179 +
180 + struct cmp_pair_alignment
181 + {
182 +  cmp_pair_alignment(const JunctionSet& gtf_juncs) :
183 +    _gtf_juncs(&gtf_juncs) {}
184 +    
185 +  bool operator() (const pair<BowtieHit, BowtieHit>& l, const pair<BowtieHit, BowtieHit>& r) const
186 +  {
187 +    InsertAlignmentGrade gl; set_insert_alignment_grade(l.first, l.second, gl);
188 +    InsertAlignmentGrade gr; set_insert_alignment_grade(r.first, r.second, gr);
189 +
190 +    bool better = gr < gl;
191 +    bool worse = gl < gr;
192 +
193 +    if (better && !worse)
194 +      return true;
195 +    else        
196 +      return false;
197 +  }
198 +
199 +  const JunctionSet* _gtf_juncs;
200 + };
201  
202 + void pair_best_alignments(const HitsForRead& left_hits,
203 +                          const HitsForRead& right_hits,
204 +                          InsertAlignmentGrade& best_grade,
205 +                          vector<pair<BowtieHit, BowtieHit> >& best_hits,
206 +                          const JunctionSet& gtf_junctions)
207 + {
208    const vector<BowtieHit>& left = left_hits.hits;
209    const vector<BowtieHit>& right = right_hits.hits;
210 <
210 >  
211    for (size_t i = 0; i < left.size(); ++i)
212      {
213        const BowtieHit& lh = left[i];
# Line 125 | Line 217
217            const BowtieHit& rh = right[j];
218            if (rh.edit_dist() > max_read_mismatches) continue;
219  
220 <          bool fusion = false;
129 <          if (fusion_search)
130 <            {
131 <              bool left_fusion = lh.fusion_opcode() != FUSION_NOTHING;
132 <              bool right_fusion = rh.fusion_opcode() != FUSION_NOTHING;
133 <              if (left_fusion && right_fusion)
134 <                continue;
135 <
136 <              fusion = left_fusion || right_fusion;
137 <              if (!fusion && lh.ref_id() != rh.ref_id())
138 <                fusion = true;
220 >          InsertAlignmentGrade g; set_insert_alignment_grade(lh, rh, g);
221  
222 <              if (!fusion && lh.ref_id() == rh.ref_id())
223 <                {
224 <                  if (lh.antisense_align() == rh.antisense_align())
143 <                    fusion = true;
144 <                  else
145 <                    {
146 <                      int inter_dist = 0;
147 <                      if (lh.antisense_align())
148 <                        inter_dist = lh.left() - rh.right();
149 <                      else
150 <                        inter_dist = rh.left() - lh.right();
151 <
152 <                      if (inter_dist < -(int)max_insertion_length || inter_dist > (int)fusion_min_dist)
153 <                        fusion = true;
154 <                    }
155 <                }
222 >          if (report_secondary_alignments)
223 >            {
224 >              best_hits.push_back(make_pair(lh, rh));
225              }
226 <
158 <          InsertAlignmentGrade g(lh, rh, min_mate_inner_dist, max_mate_inner_dist, fusion);
159 <
160 <          // Is the new status better than the current best one?
161 <          if (best_grade < g)
226 >          else
227              {
228 <              if (!report_secondary_alignments)
228 >              // Is the new status better than the current best one?
229 >              if (best_grade < g)
230                  {
231 <                  left_best_hits.hits.clear();
232 <                  right_best_hits.hits.clear();
231 >                  best_hits.clear();
232 >                  best_grade = g;
233 >                  best_hits.push_back(make_pair(lh, rh));
234 >                }
235 >              else if (!(g < best_grade))
236 >                {
237 >                  best_hits.push_back(make_pair(lh, rh));
238 >                  best_grade.num_alignments++;
239                  }
168
169              best_grade = g;
170              left_best_hits.hits.push_back(lh);
171              right_best_hits.hits.push_back(rh);
172            }
173          else if (!(g < best_grade))
174            {
175              left_best_hits.hits.push_back(lh);
176              right_best_hits.hits.push_back(rh);
177              best_grade.num_alignments++;
240              }
241          }
242      }
243 +
244 +    if (report_secondary_alignments && best_hits.size() > 0)
245 +    {
246 +      cmp_pair_alignment cmp(gtf_junctions);
247 +      sort(best_hits.begin(), best_hits.end(), cmp);
248 +      set_insert_alignment_grade(best_hits[0].first, best_hits[0].second, best_grade);
249 +      best_grade.num_alignments = best_hits.size();
250 +    }
251   }
252  
253   enum FragmentType {FRAG_UNPAIRED, FRAG_LEFT, FRAG_RIGHT};
# Line 187 | Line 257
257                   int num_hits, const BowtieHit* next_hit, int hitIndex) {
258    bool XS_found = false;
259    if (sam_toks.size()>11) {
260 <
260 >    
261      for (size_t i=11;i<sam_toks.size();++i) {
262        if (sam_toks[i].find("NH:i:")==string::npos &&
263            sam_toks[i].find("XS:i:")==string::npos)
# Line 289 | Line 359
359          {
360            fusion_alignment = true;
361            XF_index = i;
362 <
362 >          
363            vector<string> tuple_fields;
364            tokenize(tok.c_str(), " ", tuple_fields);
365            vector<string> contigs;
# Line 298 | Line 368
368              {
369                ref_name = contigs[0];
370                ref_name2 = contigs[1];
371 <            }
372 <
371 >            }  
372 >          
373            extract_partial_hits(bh, tuple_fields[4].c_str(), tuple_fields[5].c_str(),
374                                 cigar1, cigar2, left_seq, right_seq,
375                                 left_qual, right_qual, left1, left2);
376 <
376 >          
377            break;
378          }
379      }
380 <
380 >  
381    string qname(read_alt_name);
382    size_t slash_pos=qname.rfind('/');
383    if (slash_pos!=string::npos)
# Line 329 | Line 399
399    }
400    if (!primary)
401      flag |= 0x100;
402 <
402 >  
403    int gpos=isdigit(sam_toks[3][0]) ? atoi(sam_toks[3].c_str()) : 0;
404    int mapQ=255;
405    if (grade.num_alignments > 1)  {
# Line 366 | Line 436
436      bam_writer.write(bamrec);
437      delete bamrec;
438    }
439 <
439 >    
440    return true;
441   }
442  
# Line 416 | Line 486
486        if (strncmp(tok.c_str(), "XF", 2) == 0)
487          {
488            fusion_alignment = true;
489 <
489 >          
490            vector<string> tuple_fields;
491            tokenize(tok.c_str(), " ", tuple_fields);
492            vector<string> contigs;
# Line 425 | Line 495
495              {
496                ref_name = contigs[0];
497                ref_name2 = contigs[1];
498 <            }
499 <
498 >            }  
499 >          
500            extract_partial_hits(bh, tuple_fields[4].c_str(), tuple_fields[5].c_str(),
501                                 cigar1, cigar2, left_seq, right_seq,
502                                 left_qual, right_qual, left1, left2);
503 <
503 >          
504            break;
505          }
506      }
507 <
507 >  
508    int gpos=isdigit(sam_toks[3][0]) ? atoi(sam_toks[3].c_str()) : 0;
509    int mapQ=255;
510    if (grade.num_alignments > 1) {
# Line 450 | Line 520
520        tlen = bh.left() < partner->left() ? partner->right() - bh.left() :
521          partner->left() - bh.right();
522      }
523 <
523 >    
524      else { //partner on different chromosome/contig
525        sam_toks[6] = rt.get_name(partner->ref_id());
526      }
# Line 509 | Line 579
579      lex_hit_sort(const RefSequenceTable& rt, const HitsForRead& hits)
580          : _rt(rt), _hits(hits)
581      {}
582 <
582 >    
583      bool operator()(const uint32_t& l, const uint32_t& r) const
584      {
585          const BowtieHit& lhs = _hits.hits[l];
# Line 522 | Line 592
592  
593          return lhs.left() < rhs.left();
594      }
595 <
595 >    
596      const RefSequenceTable& _rt;
597      const HitsForRead& _hits;
598   };
# Line 537 | Line 607
607                            )
608   {
609      assert(!read.alt_name.empty());
610 +    if (hits.hits.empty())
611 +      return;
612 +    
613      lex_hit_sort s(rt, hits);
614      vector<uint32_t> index_vector;
615  
616      size_t i;
617      for (i = 0; i < hits.hits.size(); ++i)
618          index_vector.push_back(i);
619 <
619 >    
620      sort(index_vector.begin(), index_vector.end(), s);
621  
622 <    size_t primaryHit = (hits.hits.empty()? 0: random() % hits.hits.size());
622 >    size_t primaryHit = 0;
623 >    if (!report_secondary_alignments)
624 >      primaryHit = random() % hits.hits.size();
625 >    
626      bool multipleHits = (hits.hits.size() > 1);
627      for (i = 0; i < hits.hits.size(); ++i)
628        {
553        bool primary = (i == primaryHit);
629          size_t index = index_vector[i];
630 +        bool primary = (index == primaryHit);
631          const BowtieHit& bh = hits.hits[index];
632          rewrite_sam_record(bam_writer, rt,
633                             bh,
# Line 567 | Line 643
643   }
644  
645   void print_sam_for_pair(const RefSequenceTable& rt,
646 <                        const HitsForRead& left_hits,
571 <                        const HitsForRead& right_hits,
646 >                        const vector<pair<BowtieHit, BowtieHit> >& best_hits,
647                          const InsertAlignmentGrade& grade,
648                          ReadStream& left_reads_file,
649                          ReadStream& right_reads_file,
# Line 578 | Line 653
653                          uint64_t begin_id = 0,
654                          uint64_t end_id = std::numeric_limits<uint64_t>::max())
655   {
581    assert (left_hits.insert_id == right_hits.insert_id);
582
656      Read left_read;
657      Read right_read;
658 <    assert (left_hits.hits.size() == right_hits.hits.size() ||
659 <            (left_hits.hits.empty() || right_hits.hits.empty()));
658 >    if (best_hits.empty())
659 >      return;
660  
661 +    size_t i;
662 +    HitsForRead right_hits;
663 +    for (i = 0; i < best_hits.size(); ++i)
664 +      right_hits.hits.push_back(best_hits[i].second);
665 +    
666      size_t primaryHit = 0;
667      vector<uint32_t> index_vector;
668 <    if(right_hits.hits.size() > 0)
669 <    {
670 <      lex_hit_sort s(rt, right_hits);
671 <      for (size_t i = 0; i < right_hits.hits.size(); ++i)
672 <        index_vector.push_back(i);
595 <
596 <          sort(index_vector.begin(), index_vector.end(), s);
597 <          primaryHit = random() % right_hits.hits.size();
598 <      }
599 <    else if (left_hits.hits.size() > 0)
600 <    {
601 <          lex_hit_sort s(rt, left_hits);
602 <          for (size_t i = 0; i < left_hits.hits.size(); ++i)
603 <              index_vector.push_back(i);
604 <
605 <          sort(index_vector.begin(), index_vector.end(), s);
606 <          primaryHit = random() % left_hits.hits.size();
607 <      }
668 >    lex_hit_sort s(rt, right_hits);
669 >    for (i = 0; i < right_hits.hits.size(); ++i)
670 >      index_vector.push_back(i);
671 >    
672 >    sort(index_vector.begin(), index_vector.end(), s);
673  
674 <    bool got_left_read = left_reads_file.getRead(left_hits.insert_id, left_read,
674 >    if (!report_secondary_alignments)
675 >      primaryHit = random() % right_hits.hits.size();
676 >    
677 >    bool got_left_read = left_reads_file.getRead(best_hits[0].first.insert_id(), left_read,
678                                                   reads_format, false, begin_id, end_id,
679                                                   left_um_out, false);
680 <
681 <    bool got_right_read = right_reads_file.getRead(right_hits.insert_id, right_read,
680 >    
681 >    bool got_right_read = right_reads_file.getRead(best_hits[0].first.insert_id(), right_read,
682                                                     reads_format, false, begin_id, end_id,
683                                                     right_um_out, false);
684 <
685 <    if (left_hits.hits.size() == right_hits.hits.size())
686 <    {
687 <        assert (got_left_read && got_right_read);
688 <        bool multipleHits = (left_hits.hits.size() > 1);
689 <        for (size_t i = 0; i < right_hits.hits.size(); ++i)
690 <        {
691 <            bool primary = (i == primaryHit);
692 <            size_t index = index_vector[i];
693 <            const BowtieHit& right_bh = right_hits.hits[index];
694 <            const BowtieHit& left_bh = left_hits.hits[index];
695 <
696 <            rewrite_sam_record(bam_writer, rt,
697 <                               right_bh,
698 <                               right_bh.hitfile_rec().c_str(),
699 <                               right_read.alt_name.c_str(),
700 <                               grade,
701 <                               FRAG_RIGHT,
702 <                               &left_bh,
703 <                               right_hits.hits.size(),
704 <                               (i < right_hits.hits.size() - 1) ? &(right_hits.hits[index_vector[i+1]]) : NULL,
705 <                               primary,
706 <                               (multipleHits? i: -1));
707 <            rewrite_sam_record(bam_writer, rt,
708 <                               left_bh,
709 <                               left_bh.hitfile_rec().c_str(),
710 <                               left_read.alt_name.c_str(),
711 <                               grade,
712 <                               FRAG_LEFT,
713 <                               &right_bh,
714 <                               left_hits.hits.size(),
715 <                               (i < left_hits.hits.size() - 1) ? &(left_hits.hits[index_vector[i+1]]) : NULL,
716 <                               primary,
649 <                               (multipleHits? i: -1));
650 <        }
651 <    }
652 <    else if (left_hits.hits.empty())
653 <    { //only right read was mapped properly
654 <        /*if (right_um_out) {
655 <          //write it in the mapped file with the #MAPPED# flag
656 <          fprintf(right_um_out, "@%s #MAPPED#\n%s\n+\n%s\n", right_read.alt_name.c_str(),
657 <                              right_read.seq.c_str(), right_read.qual.c_str());
658 <          }
659 <        */
660 <        for (size_t i = 0; i < right_hits.hits.size(); ++i)
661 <        {
662 <            bool primary = (i == primaryHit);
663 <            size_t index = index_vector[i];
664 <            const BowtieHit& bh = right_hits.hits[index];
665 <
666 <            rewrite_sam_record(bam_writer, rt,
667 <                               bh,
668 <                               bh.hitfile_rec().c_str(),
669 <                               right_read.alt_name.c_str(),
670 <                               grade,
671 <                               FRAG_RIGHT,
672 <                               NULL,
673 <                               right_hits.hits.size(),
674 <                               (i < right_hits.hits.size() - 1) ? &(right_hits.hits[index_vector[i+1]]) : NULL,
675 <                               primary,
676 <                               -1);
677 <
678 <        }
679 <    }
680 <    else if (right_hits.hits.empty())
681 <    { //only left read was mapped properly
682 <      /*
683 <      if (left_um_out) {
684 <        //write it in the mapped file with the #MAPPED# flag
685 <        fprintf(left_um_out, "@%s #MAPPED#\n%s\n+\n%s\n", left_read.alt_name.c_str(), left_read.seq.c_str(),
686 <                            left_read.qual.c_str());
687 <        }
688 <      */
689 <        for (size_t i = 0; i < left_hits.hits.size(); ++i)
690 <        {
691 <            bool primary = (i == primaryHit);
692 <            size_t index = index_vector[i];
693 <            const BowtieHit& bh = left_hits.hits[index];
694 <            rewrite_sam_record(bam_writer, rt,
695 <                               bh,
696 <                               bh.hitfile_rec().c_str(),
697 <                               left_read.alt_name.c_str(),
698 <                               grade,
699 <                               FRAG_LEFT,
700 <                               NULL,
701 <                               left_hits.hits.size(),
702 <                               (i < left_hits.hits.size() - 1) ? &(left_hits.hits[index_vector[i+1]]) : NULL,
703 <                               primary,
704 <                               -1);
705 <
706 <        }
707 <    }
708 <    else
709 <    {
710 <        assert (false);
711 <    }
684 >    
685 >    assert (got_left_read && got_right_read);
686 >    bool multipleHits = (best_hits.size() > 1);
687 >    for (i = 0; i < best_hits.size(); ++i)
688 >      {
689 >        size_t index = index_vector[i];
690 >        bool primary = (index == primaryHit);
691 >        const BowtieHit& right_bh = best_hits[index].second;
692 >        const BowtieHit& left_bh = best_hits[index].first;
693 >        
694 >        rewrite_sam_record(bam_writer, rt,
695 >                           right_bh,
696 >                           right_bh.hitfile_rec().c_str(),
697 >                           right_read.alt_name.c_str(),
698 >                           grade,
699 >                           FRAG_RIGHT,
700 >                           &left_bh,
701 >                           best_hits.size(),
702 >                           (i < best_hits.size() - 1) ? &(best_hits[index_vector[i+1]].second) : NULL,
703 >                           primary,
704 >                           (multipleHits? i: -1));
705 >        rewrite_sam_record(bam_writer, rt,
706 >                           left_bh,
707 >                           left_bh.hitfile_rec().c_str(),
708 >                           left_read.alt_name.c_str(),
709 >                           grade,
710 >                           FRAG_LEFT,
711 >                           &right_bh,
712 >                           best_hits.size(),
713 >                           (i < best_hits.size() - 1) ? &(best_hits[index_vector[i+1]].first) : NULL,
714 >                           primary,
715 >                           (multipleHits? i: -1));
716 >      }
717   }
718  
719   /**
# Line 751 | Line 756
756   #if 0
757        vector<Fusion> new_fusions;
758        fusions_from_spliced_hit(bh, new_fusions);
759 <
759 >      
760        for(size_t i = 0; i < new_fusions.size(); ++i)
761          {
762            Fusion fusion = new_fusions[i];
# Line 792 | Line 797
797   {
798    HitsForRead remaining;
799    remaining.insert_id = hits.insert_id;
800 <
800 >  
801    for (size_t i = 0; i < hits.hits.size(); ++i)
802      {
803        BowtieHit& bh = hits.hits[i];
804 +      if (bh.edit_dist() > max_read_mismatches)
805 +        continue;
806 +      
807        bool filter_hit = false;
808        if (!bh.contiguous())
809          {
# Line 839 | Line 847
847  
848      HitsForRead curr_left_hit_group;
849      HitsForRead curr_right_hit_group;
850 <
850 >    
851      l_hs.next_read_hits(curr_left_hit_group);
852      r_hs.next_read_hits(curr_right_hit_group);
853 <
853 >    
854      uint32_t curr_left_obs_order = it.observation_order(curr_left_hit_group.insert_id);
855      uint32_t curr_right_obs_order = it.observation_order(curr_right_hit_group.insert_id);
856  
# Line 857 | Line 865
865            {
866              HitsForRead best_hits;
867              best_hits.insert_id = curr_left_obs_order;
868 <            FragmentAlignmentGrade grade;
869 <
868 >            FragmentAlignmentGrade best_grade;
869 >            
870              // Process hits for left singleton, select best alignments
871 <            read_best_alignments(curr_left_hit_group, grade, best_hits, *gtf_junctions);
871 >            read_best_alignments(curr_left_hit_group, best_grade, best_hits, *gtf_junctions);
872              update_junctions(best_hits, *junctions);
873              update_fusions(best_hits, *rt, *fusions);
874 <
874 >            
875              // Get next hit group
876              l_hs.next_read_hits(curr_left_hit_group);
877              curr_left_obs_order = it.observation_order(curr_left_hit_group.insert_id);
878            }
879 <
879 >        
880          // Chew up right singletons
881          while (curr_left_obs_order > curr_right_obs_order &&
882                 curr_right_obs_order < end_id &&
# Line 876 | Line 884
884            {
885              HitsForRead best_hits;
886              best_hits.insert_id = curr_right_obs_order;
887 +            FragmentAlignmentGrade best_grade;
888              if (curr_right_obs_order >= begin_id)
889                {
881                FragmentAlignmentGrade grade;
890                  // Process hit for right singleton, select best alignments
891 <                read_best_alignments(curr_right_hit_group,grade, best_hits, *gtf_junctions);
891 >                read_best_alignments(curr_right_hit_group, best_grade, best_hits, *gtf_junctions);
892                  update_junctions(best_hits, *junctions);
893                  update_fusions(best_hits, *rt, *fusions);
894                }
895 <
895 >            
896              // Get next hit group
897              r_hs.next_read_hits(curr_right_hit_group);
898              curr_right_obs_order = it.observation_order(curr_right_hit_group.insert_id);
899            }
900 <
900 >        
901          // Since we have both left hits and right hits for this insert,
902          // Find the best pairing and print both
903          while (curr_left_obs_order == curr_right_obs_order &&
904                 curr_left_obs_order < end_id &&
905                 curr_left_obs_order != VMAXINT32)
906            {
907 <            if (curr_left_hit_group.hits.empty())
900 <              {
901 <                HitsForRead right_best_hits;
902 <                right_best_hits.insert_id = curr_right_obs_order;
903 <
904 <                FragmentAlignmentGrade grade;
905 <                read_best_alignments(curr_right_hit_group, grade, right_best_hits, *gtf_junctions);
906 <                update_junctions(right_best_hits, *junctions);
907 <                update_fusions(right_best_hits, *rt, *fusions);
908 <              }
909 <            else if (curr_right_hit_group.hits.empty())
910 <              {
911 <                HitsForRead left_best_hits;
912 <                left_best_hits.insert_id = curr_left_obs_order;
907 >            vector<pair<BowtieHit, BowtieHit> > best_hits;
908  
909 <                FragmentAlignmentGrade grade;
910 <                // Process hits for left singleton, select best alignments
911 <                read_best_alignments(curr_left_hit_group, grade, left_best_hits, *gtf_junctions);
912 <                update_junctions(left_best_hits, *junctions);
913 <                update_fusions(left_best_hits, *rt, *fusions);
914 <              }
920 <            else
921 <              {
922 <                HitsForRead left_best_hits;
923 <                HitsForRead right_best_hits;
924 <                left_best_hits.insert_id = curr_left_obs_order;
925 <                right_best_hits.insert_id = curr_right_obs_order;
909 >            InsertAlignmentGrade grade;
910 >            pair_best_alignments(curr_left_hit_group,
911 >                                 curr_right_hit_group,
912 >                                 grade,
913 >                                 best_hits,
914 >                                 *gtf_junctions);
915  
916 <                // daehwan - apply gtf_junctions here, too!
916 >            HitsForRead single_best_hits;
917 >            single_best_hits.insert_id = curr_left_obs_order;
918  
919 <                InsertAlignmentGrade grade;
920 <                pair_best_alignments(curr_left_hit_group,
921 <                                     curr_right_hit_group,
922 <                                     grade,
923 <                                     left_best_hits,
934 <                                     right_best_hits);
935 <                update_junctions(left_best_hits, *junctions);
936 <                update_junctions(right_best_hits, *junctions);
937 <                update_fusions(left_best_hits, *rt, *fusions);
938 <                update_fusions(right_best_hits, *rt, *fusions);
919 >            // this is too much memory copy - we will make it efficient soon
920 >            for (size_t i = 0; i < best_hits.size(); ++i)
921 >              {
922 >                single_best_hits.hits.push_back(best_hits[i].first);
923 >                single_best_hits.hits.push_back(best_hits[i].second);
924                }
925  
926 +            update_junctions(single_best_hits, *junctions);
927 +            update_fusions(single_best_hits, *rt, *fusions);
928 +                    
929              l_hs.next_read_hits(curr_left_hit_group);
930              curr_left_obs_order = it.observation_order(curr_left_hit_group.insert_id);
931 <
931 >            
932              r_hs.next_read_hits(curr_right_hit_group);
933              curr_right_obs_order = it.observation_order(curr_right_hit_group.insert_id);
934            }
# Line 948 | Line 936
936  
937      for (size_t i = 0; i < hit_factories.size(); ++i)
938        delete hit_factories[i];
939 <
939 >    
940      hit_factories.clear();
941    }
942  
# Line 979 | Line 967
967  
968      if (left_reads_offset > 0)
969        left_reads_file.seek(left_reads_offset);
970 <
970 >    
971      if (!zpacker.empty()) left_um_fname+=".z";
972      FZPipe left_um_out;
973      if (left_um_out.openWrite(left_um_fname.c_str(), zpacker)==NULL)
974        err_die("Error: cannot open file %s for writing!\n",left_um_fname.c_str());
975 <
975 >    
976      ReadStream right_reads_file(right_reads_fname);
977      if (right_reads_offset > 0)
978        right_reads_file.seek(right_reads_offset);
979 <
979 >    
980      FZPipe right_um_out;
981      if (right_reads_fname != "")
982        {
# Line 1007 | Line 995
995      HitStream right_hs(right_map_fname, hit_factories.back(), false, true, true, true);
996      if (right_map_offset > 0)
997        right_hs.seek(right_map_offset);
998 <
998 >    
999      HitsForRead curr_left_hit_group;
1000      HitsForRead curr_right_hit_group;
1001 <
1001 >    
1002      left_hs.next_read_hits(curr_left_hit_group);
1003      right_hs.next_read_hits(curr_right_hit_group);
1004 <
1004 >    
1005      uint64_t curr_left_obs_order = it.observation_order(curr_left_hit_group.insert_id);
1006      uint64_t curr_right_obs_order = it.observation_order(curr_right_hit_group.insert_id);
1007  
# Line 1046 | Line 1034
1034                                                  right_um_out.file, true);
1035                //assert(got_read);
1036              }
1037 <
1037 >    
1038              exclude_hits_on_filtered_junctions(*junctions, curr_left_hit_group);
1039  
1040              // Process hits for left singleton, select best alignments
1041              read_best_alignments(curr_left_hit_group, grade, best_hits, *gtf_junctions);
1042  
1043 <            if (best_hits.hits.size()>0 && best_hits.hits.size() <= max_multihits)
1043 >            if (best_hits.hits.size() > max_multihits)
1044 >              {
1045 >                best_hits.hits.erase(best_hits.hits.begin() + max_multihits, best_hits.hits.end());
1046 >                grade.num_alignments = best_hits.hits.size();
1047 >              }
1048 >
1049 >            if (best_hits.hits.size() > 0)
1050                {
1051                  update_junctions(best_hits, *final_junctions);
1052                  update_insertions_and_deletions(best_hits, *final_insertions, *final_deletions);
# Line 1069 | Line 1063
1063              // Get next hit group
1064              left_hs.next_read_hits(curr_left_hit_group);
1065              curr_left_obs_order = it.observation_order(curr_left_hit_group.insert_id);
1066 <          } //left singletons
1067 <
1066 >          } //left singletons
1067 >        
1068          // Chew up right singletons
1069          while (curr_left_obs_order > curr_right_obs_order &&
1070                 curr_right_obs_order < end_id &&
# Line 1100 | Line 1094
1094  
1095                  // Process hit for right singleton, select best alignments
1096                  read_best_alignments(curr_right_hit_group, grade, best_hits, *gtf_junctions);
1097 <                if (best_hits.hits.size()>0 && best_hits.hits.size() <= max_multihits)
1097 >
1098 >                if (best_hits.hits.size() > max_multihits)
1099 >                  {
1100 >                    best_hits.hits.erase(best_hits.hits.begin() + max_multihits, best_hits.hits.end());
1101 >                    grade.num_alignments = best_hits.hits.size();
1102 >                  }
1103 >                
1104 >                if (best_hits.hits.size() > 0)
1105                    {
1106                      update_junctions(best_hits, *final_junctions);
1107                      update_insertions_and_deletions(best_hits, *final_insertions, *final_deletions);
1108                      update_fusions(best_hits, *rt, *final_fusions, *fusions);
1109 <
1109 >                    
1110                      print_sam_for_single(*rt,
1111                                           best_hits,
1112                                           grade,
# Line 1120 | Line 1121
1121              right_hs.next_read_hits(curr_right_hit_group);
1122              curr_right_obs_order = it.observation_order(curr_right_hit_group.insert_id);
1123            }
1124 <
1124 >        
1125          // Since we have both left hits and right hits for this insert,
1126          // Find the best pairing and print both
1127          while (curr_left_obs_order == curr_right_obs_order &&
# Line 1132 | Line 1133
1133              if (curr_left_hit_group.hits.empty())
1134                {   //only right read mapped
1135                  //write it in the mapped file with the #MAPPED# flag
1136 <
1136 >                
1137                  bool got_read=right_reads_file.getRead(curr_left_obs_order, r_read,
1138                                                         reads_format, false, begin_id, end_id,
1139                                                         right_um_out.file, false);
# Line 1143 | Line 1144
1144                  */
1145                  HitsForRead right_best_hits;
1146                  right_best_hits.insert_id = curr_right_obs_order;
1147 <
1147 >                
1148                  FragmentAlignmentGrade grade;
1149                  read_best_alignments(curr_right_hit_group, grade, right_best_hits, *gtf_junctions);
1150  
1151 <                if (right_best_hits.hits.size()>0 && right_best_hits.hits.size() <= max_multihits)
1151 >                if (right_best_hits.hits.size() > max_multihits)
1152 >                  {
1153 >                    right_best_hits.hits.erase(right_best_hits.hits.begin() + max_multihits, right_best_hits.hits.end());
1154 >                    grade.num_alignments = right_best_hits.hits.size();
1155 >                  }
1156 >                
1157 >                if (right_best_hits.hits.size() > 0)
1158                    {
1159                      update_junctions(right_best_hits, *final_junctions);
1160                      update_insertions_and_deletions(right_best_hits, *final_insertions, *final_deletions);
1161                      update_fusions(right_best_hits, *rt, *final_fusions, *fusions);
1162 <
1162 >                    
1163                      print_sam_for_single(*rt,
1164                                           right_best_hits,
1165                                           grade,
# Line 1178 | Line 1185
1185                  FragmentAlignmentGrade grade;
1186                  // Process hits for left singleton, select best alignments
1187                  read_best_alignments(curr_left_hit_group, grade, left_best_hits, *gtf_junctions);
1188 <                if (left_best_hits.hits.size()>0 && left_best_hits.hits.size() <= max_multihits)
1188 >
1189 >                if (left_best_hits.hits.size() > max_multihits)
1190 >                  {
1191 >                    left_best_hits.hits.erase(left_best_hits.hits.begin() + max_multihits, left_best_hits.hits.end());
1192 >                    grade.num_alignments = left_best_hits.hits.size();
1193 >                  }
1194 >                
1195 >                if (left_best_hits.hits.size() > 0)
1196                    {
1197                      update_junctions(left_best_hits, *final_junctions);
1198                      update_insertions_and_deletions(left_best_hits, *final_insertions, *final_deletions);
1199                      update_fusions(left_best_hits, *rt, *final_fusions, *fusions);
1200 <
1200 >                    
1201                      print_sam_for_single(*rt,
1202                                           left_best_hits,
1203                                           grade,
# Line 1194 | Line 1208
1208                    }
1209                }
1210              else
1211 <              {   //hits for both left and right reads
1212 <                HitsForRead left_best_hits;
1199 <                HitsForRead right_best_hits;
1200 <                left_best_hits.insert_id = curr_left_obs_order;
1201 <                right_best_hits.insert_id = curr_right_obs_order;
1211 >              {
1212 >                vector<pair<BowtieHit, BowtieHit> > best_hits;
1213  
1214                  InsertAlignmentGrade grade;
1215                  pair_best_alignments(curr_left_hit_group,
1216                                       curr_right_hit_group,
1217                                       grade,
1218 <                                     left_best_hits,
1219 <                                     right_best_hits);
1218 >                                     best_hits,
1219 >                                     *gtf_junctions);
1220  
1221 <                if (left_best_hits.hits.size()>0 && left_best_hits.hits.size() <= max_multihits &&
1211 <                    right_best_hits.hits.size()>0 && right_best_hits.hits.size() <= max_multihits)
1221 >                if (best_hits.size() > max_multihits)
1222                    {
1223 <                    update_junctions(left_best_hits, *final_junctions);
1224 <                    update_junctions(right_best_hits, *final_junctions);
1225 <                    update_insertions_and_deletions(left_best_hits, *final_insertions, *final_deletions);
1216 <                    update_insertions_and_deletions(right_best_hits, *final_insertions, *final_deletions);
1223 >                    best_hits.erase(best_hits.begin() + max_multihits, best_hits.end());
1224 >                    grade.num_alignments = best_hits.size();
1225 >                  }
1226  
1227 <                    pair_support(left_best_hits, right_best_hits, *final_fusions, *fusions);
1228 <                    update_fusions(left_best_hits, *rt, *final_fusions, *fusions);
1229 <                    update_fusions(right_best_hits, *rt, *final_fusions, *fusions);
1227 >                //hits for both left and right reads
1228 >                HitsForRead single_best_hits;
1229 >                single_best_hits.insert_id = curr_left_obs_order;
1230 >                for (size_t i = 0; i < best_hits.size(); ++i)
1231 >                  {
1232 >                    single_best_hits.hits.push_back(best_hits[i].first);
1233 >                    single_best_hits.hits.push_back(best_hits[i].second);
1234 >                  }
1235 >                
1236 >                if (single_best_hits.hits.size() > 0)
1237 >                  {
1238 >                    update_junctions(single_best_hits, *final_junctions);
1239 >                    update_insertions_and_deletions(single_best_hits, *final_insertions, *final_deletions);
1240 >
1241 >                    pair_support(best_hits, *final_fusions, *fusions);
1242 >                    update_fusions(single_best_hits, *rt, *final_fusions, *fusions);
1243  
1244                      print_sam_for_pair(*rt,
1245 <                                       left_best_hits,
1224 <                                       right_best_hits,
1245 >                                       best_hits,
1246                                         grade,
1247                                         left_reads_file,
1248                                         right_reads_file,
# Line 1232 | Line 1253
1253                                         end_id);
1254                    }
1255                }
1256 <
1256 >            
1257              left_hs.next_read_hits(curr_left_hit_group);
1258              curr_left_obs_order = it.observation_order(curr_left_hit_group.insert_id);
1259 <
1259 >            
1260              right_hs.next_read_hits(curr_right_hit_group);
1261              curr_right_obs_order = it.observation_order(curr_right_hit_group.insert_id);
1262            }
# Line 1307 | Line 1328
1328      get_seqs(ref_stream, rt, true);
1329  
1330    srandom(1);
1331 <
1331 >  
1332    JunctionSet gtf_junctions;
1333    if (!gtf_juncs.empty())
1334      {
# Line 1377 | Line 1398
1398        worker.rt = &rt;
1399  
1400        worker.right_map_offset = 0;
1401 <
1401 >      
1402        if (i == 0)
1403          {
1404            worker.begin_id = 0;
# Line 1392 | Line 1413
1413            if (offsets_size == 4)
1414              worker.right_map_offset = offsets[i-1][1];
1415          }
1416 <
1416 >      
1417        worker.end_id = (i+1 < num_threads) ? read_ids[i] : std::numeric_limits<uint64_t>::max();
1418  
1419        if (num_threads > 1)
# Line 1426 | Line 1447
1447    // Read hits, extract junctions, and toss the ones that arent strongly enough supported.
1448  
1449    filter_junctions(junctions, gtf_junctions);
1450 <
1450 >  
1451    //size_t num_juncs_after_filter = junctions.size();
1452    //fprintf(stderr, "Filtered %lu junctions\n",
1453    //     num_unfiltered_juncs - num_juncs_after_filter);
# Line 1441 | Line 1462
1462              small_overhangs++;
1463              }
1464      }
1465 <
1465 >    
1466          if (small_overhangs >0)
1467          fprintf(stderr, "Warning: %lu small overhang junctions!\n", (long unsigned int)small_overhangs);
1468    */
# Line 1461 | Line 1482
1482        worker.sam_header_fname = sam_header;
1483        sprintf(filename, "%s/unmapped_left%d.fq", output_dir.c_str(), i);
1484        worker.left_um_fname = filename;
1485 <
1485 >      
1486        if (right_reads_fname != "")
1487          {
1488            sprintf(filename, "%s/unmapped_right%d.fq", output_dir.c_str(), i);
1489            worker.right_um_fname = filename;
1490          }
1491 <
1491 >      
1492        worker.left_reads_fname = left_reads_fname;
1493        worker.left_map_fname = left_map_fname;
1494        worker.right_reads_fname = right_reads_fname;
# Line 1484 | Line 1505
1505  
1506        worker.right_reads_offset = 0;
1507        worker.right_map_offset = 0;
1508 <
1508 >      
1509        if (i == 0)
1510          {
1511            worker.begin_id = 0;
# Line 1494 | Line 1515
1515        else
1516          {
1517            size_t offsets_size = offsets[i-1].size();
1518 <
1518 >          
1519            worker.begin_id = read_ids[i-1];
1520            worker.left_reads_offset = offsets[i-1][offsets_size - 2];
1521            worker.left_map_offset = offsets[i-1].back();
# Line 1504 | Line 1525
1525                worker.right_map_offset = offsets[i-1][1];
1526              }
1527          }
1528 <
1528 >      
1529        worker.end_id = (i+1 < num_threads) ? read_ids[i] : std::numeric_limits<uint64_t>::max();
1530  
1531        if (num_threads > 1)
# Line 1532 | Line 1553
1553  
1554        merge_with(final_insertions, vfinal_insertions[i]);
1555        vfinal_insertions[i].clear();
1556 <
1556 >      
1557        merge_with(final_deletions, vfinal_deletions[i]);
1558        vfinal_deletions[i].clear();
1559  
# Line 1542 | Line 1563
1563  
1564    fprintf (stderr, "Reporting final accepted alignments...");
1565    fprintf (stderr, "done.\n");
1566 <
1566 >  
1567    //small_overhangs = 0;
1568    for (JunctionSet::iterator i = final_junctions.begin(); i != final_junctions.end();)
1569      {
# Line 1555 | Line 1576
1576            ++i;
1577          }
1578      }
1579 <
1579 >  
1580    //    if (small_overhangs > 0)
1581    //            fprintf(stderr, "Warning: %lu small overhang junctions!\n", small_overhangs);
1582  
1583    fprintf (stderr, "Printing junction BED track...");
1584    print_junctions(junctions_out, final_junctions, rt);
1585    fprintf (stderr, "done\n");
1586 <
1586 >  
1587    fprintf (stderr, "Printing insertions...");
1588    print_insertions(insertions_out, final_insertions,rt);
1589    fclose(insertions_out);
1590    fprintf (stderr, "done\n");
1591 <
1591 >  
1592    fprintf (stderr, "Printing deletions...");
1593    print_deletions(deletions_out, final_deletions, rt);
1594    fclose(deletions_out);
# Line 1580 | Line 1601
1601        fclose(fusions_out);
1602        fprintf (stderr, "done\n");
1603      }
1604 <
1604 >  
1605    fprintf(stderr, "Found %lu junctions from happy spliced reads\n", (long unsigned int)final_junctions.size());
1606   }
1607  
1608   void print_usage()
1609   {
1610          fprintf(stderr, "Usage:   tophat_reports <junctions.bed> <insertions.vcf> <deletions.vcf> <accepted_hits.sam> <left_map1,...,left_mapN> <left_reads.fq>  [right_map1,...,right_mapN] [right_reads.fq]\n");
1611 <
1611 >    
1612          //      fprintf(stderr, "Usage:   tophat_reports <coverage.wig> <junctions.bed> <accepted_hits.sam> <map1.bwtout> [splice_map1.sbwtout]\n");
1613   }
1614  
# Line 1595 | Line 1616
1616   {
1617    fprintf(stderr, "tophat_reports v%s (%s)\n", PACKAGE_VERSION, SVN_REVISION);
1618    fprintf(stderr, "---------------------------------------\n");
1619 <
1619 >  
1620    reads_format = FASTQ;
1621 <
1621 >  
1622    int parse_ret = parse_options(argc, argv, print_usage);
1623    if (parse_ret)
1624      return parse_ret;
1625 <
1625 >  
1626    if(optind >= argc)
1627      {
1628        print_usage();
1629        return 1;
1630      }
1631 <
1631 >  
1632    string ref_file_name = argv[optind++];
1633 <
1633 >  
1634    if(optind >= argc)
1635      {
1636        print_usage();
1637        return 1;
1638      }
1639 <
1639 >  
1640    string junctions_file_name = argv[optind++];
1641 <
1641 >  
1642    if(optind >= argc)
1643      {
1644        print_usage();
1645        return 1;
1646      }
1647 <
1647 >  
1648    string insertions_file_name = argv[optind++];
1649    if(optind >= argc)
1650      {
1651        print_usage();
1652        return 1;
1653      }
1654 <
1654 >  
1655    string deletions_file_name = argv[optind++];
1656    if(optind >= argc)
1657      {
1658        print_usage();
1659        return 1;
1660      }
1661 <
1661 >  
1662    string fusions_file_name = argv[optind++];
1663    if(optind >= argc)
1664      {
1665        print_usage();
1666        return 1;
1667      }
1668 <
1668 >  
1669    string accepted_hits_file_name = argv[optind++];
1670    if(optind >= argc)
1671      {
1672        print_usage();
1673        return 1;
1674      }
1675 <
1675 >  
1676    string left_map_filename = argv[optind++];
1677    if(optind >= argc)
1678      {
1679        print_usage();
1680        return 1;
1681      }
1682 <
1682 >  
1683    string left_reads_filename = argv[optind++];
1684    string unzcmd=getUnpackCmd(left_reads_filename, false);
1685 <
1685 >  
1686    string right_map_filename;
1687    string right_reads_filename;
1688 <
1688 >  
1689    if (optind < argc)
1690      {
1691        right_map_filename = argv[optind++];
# Line 1682 | Line 1703
1703                ref_file_name.c_str());
1704        exit(1);
1705      }
1706 <
1706 >  
1707    FILE* junctions_file = fopen(junctions_file_name.c_str(), "w");
1708    if (junctions_file == NULL)
1709      {
# Line 1690 | Line 1711
1711                junctions_file_name.c_str());
1712        exit(1);
1713      }
1714 <
1714 >  
1715    FILE* insertions_file = fopen(insertions_file_name.c_str(), "w");
1716    if (insertions_file == NULL)
1717      {
# Line 1698 | Line 1719
1719                insertions_file_name.c_str());
1720        exit(1);
1721      }
1722 <
1722 >  
1723    FILE* deletions_file = fopen(deletions_file_name.c_str(), "w");
1724    if (deletions_file == NULL)
1725      {
# Line 1706 | Line 1727
1727                deletions_file_name.c_str());
1728        exit(1);
1729      }
1730 <
1730 >  
1731    FILE* fusions_file = NULL;
1732    if (fusion_search)
1733      {
# Line 1718 | Line 1739
1739            exit(1);
1740          }
1741      }
1742 <
1742 >  
1743    driver(accepted_hits_file_name,
1744           ref_stream,
1745           left_map_filename,
# Line 1729 | Line 1750
1750           insertions_file,
1751           deletions_file,
1752           fusions_file);
1753 <
1753 >  
1754    return 0;
1755   }

Diff Legend

Removed lines
+ Added lines
< Changed lines
> Changed lines