ViewVC Help
View File | Revision Log | Show Annotations | View Changeset | Root Listing
root/gclib/tophat_cpp/long_spanning_reads.cpp
(Generate patch)
# Line 162 | Line 162
162      }
163   }
164  
165 + BowtieHit merge_chain_color(RefSequenceTable& rt,
166 +                            const string& read_seq,
167 +                            const string& read_quals,
168 +                            std::set<Junction>& possible_juncs,
169 +                            std::set<Insertion>& possible_insertions,
170 +                            list<BowtieHit>& hit_chain)
171 + {
172 +  bool antisense = hit_chain.front().antisense_align();
173 +  uint32_t reference_id = hit_chain.front().ref_id();
174 +  uint64_t insert_id = hit_chain.front().insert_id();
175 +
176 +  int left = hit_chain.front().left();
177 +
178 +  list<BowtieHit>::iterator prev_hit = hit_chain.begin();
179 +  list<BowtieHit>::iterator curr_hit = ++(hit_chain.begin());
180 +
181 +  string seq;
182 +  string qual;
183 +  int old_read_length = 0;
184 +  int first_seg_length = hit_chain.front().seq().length();
185 +  for (list<BowtieHit>::iterator i = hit_chain.begin(); i != hit_chain.end(); ++i)
186 +    {
187 +      seq += i->seq();
188 +      qual += i->qual();
189 +      old_read_length += i->read_len();
190 +    }
191 +
192 +  string rev_read_seq, rev_read_quals;
193 +  if (color && antisense)
194 +    {
195 +      rev_read_seq = read_seq;
196 +      reverse(rev_read_seq.begin() + 1, rev_read_seq.end());
197 +
198 +      rev_read_quals = read_quals;
199 +      reverse(rev_read_quals.begin(), rev_read_quals.end());
200 +    }
201 +
202 +  while (curr_hit != hit_chain.end() && prev_hit != hit_chain.end())
203 +    {
204 +      /*
205 +       * Note that the gap size may be negative, since left() and right() return
206 +       * signed integers, this will be OK.
207 +       */
208 +      int gap  = curr_hit->left() - prev_hit->right();
209 +      if (gap < -(int)max_insertion_length ||
210 +    (gap > (int)max_deletion_length &&
211 +     (gap < min_report_intron_length || gap > max_report_intron_length)))
212 +  {
213 +    return BowtieHit();
214 +  }
215 +
216 +      ++prev_hit;
217 +      ++curr_hit;
218 +    }
219 +
220 +  prev_hit = hit_chain.begin();
221 +  curr_hit = ++(hit_chain.begin());
222 +
223 +  RefSequenceTable::Sequence* ref_str = rt.get_seq(prev_hit->ref_id());
224 +  if (!ref_str)
225 +    return BowtieHit();
226 +
227 +  int curr_seg_index = 1;
228 +  while (curr_hit != hit_chain.end() && prev_hit != hit_chain.end())
229 +    {
230 +      /*
231 +       * This code is assuming that the cigar strings end and start with a match
232 +       * While segment alignments can actually end with a junction, insertion or deletion, the hope
233 +       * is that in those cases, the right and left ends of the alignments will correctly
234 +       * line up, so we won't get to this bit of code
235 +       */
236 +      if (prev_hit->cigar().back().opcode != MATCH || curr_hit->cigar().front().opcode != MATCH)
237 +  {
238 +    return BowtieHit();
239 +  }
240 +
241 +      if (prev_hit->is_spliced() && curr_hit->is_spliced() && prev_hit->antisense_splice() != curr_hit->antisense_splice())
242 +  {
243 +    /*
244 +     * There is no way that we can splice these together into a valid
245 +     * alignment
246 +     */
247 +    return BowtieHit();
248 +  }
249 +
250 +      bool found_closure = false;
251 +      /*
252 +       * Note that antisense_splice is the same for prev and curr
253 +       */
254 +      bool antisense_closure = prev_hit->is_spliced() ? prev_hit->antisense_splice() : curr_hit->antisense_splice();
255 +      vector<CigarOp> new_cigar;
256 +      int new_left = -1;
257 +      int mismatch = 0;
258 +
259 +      /*
260 +       * Take the length of matched bases in each segment into consideration for closures,
261 +       * this can be a problem for reads of variable lengths.
262 +       */
263 +      int prev_right_end_match_length = prev_hit->cigar().back().length;
264 +      int curr_left_end_match_length = curr_hit->cigar().front().length;
265 +
266 +      if (prev_hit->right() > curr_hit->left())
267 +  {
268 +    std::set<Insertion>::iterator lb, ub;
269 +    /*
270 +     * Note, this offset is determined by the min-anchor length supplied to
271 +     * juncs_db, which is currently hard-coded at 3 in tophat.py
272 +     * as this value determines what sort of read segments
273 +     * should be able to align directly to the splice sequences
274 +     */
275 +    int left_boundary = prev_hit->right() - 4;
276 +    int right_boundary = curr_hit->left() + 4;
277 +
278 +    /*
279 +     * Create a dummy sequence to represent the maximum possible insertion
280 +     */
281 +    std::string maxInsertedSequence = "";
282 +    maxInsertedSequence.resize(max_insertion_length,'A');
283 +
284 +    lb = possible_insertions.upper_bound(Insertion(reference_id, left_boundary, ""));
285 +    ub = possible_insertions.upper_bound(Insertion(reference_id, right_boundary, maxInsertedSequence));
286 +
287 +    int reference_mismatch = 0;
288 +
289 +    while (lb != ub && lb != possible_insertions.end())
290 +      {
291 +        /*
292 +         * In the following code, we will check to make sure that the segments have the proper
293 +         * separation and sequence for the insertions, and generate the appropriate merged bowtie hit
294 +         * In general, reads with insertions must match the inserted sequence exactly.
295 +         */
296 +        if (((int)lb->sequence.size()) == (prev_hit->right() - curr_hit->left()))
297 +    {
298 +      /*
299 +       * Check we have enough matched bases on prev or curr segment.
300 +       */
301 +      int insert_to_prev_right = prev_hit->right() - lb->left - 1;
302 +      int curr_left_to_insert = lb->left - curr_hit->left() + 1;
303 +      if (insert_to_prev_right > prev_right_end_match_length || curr_left_to_insert > curr_left_end_match_length)
304 +        {
305 +          ++lb;
306 +          continue;
307 +        }
308 +
309 +      /*
310 +       * Keep track of how many mismatches were made to the genome in the region
311 +       * where we should actually be matching against the insertion
312 +       */
313 +      int this_reference_mismatch = 0;
314 +      int insertion_mismatch = 0;
315 +      int insertion_len = lb->sequence.length();
316 +      const seqan::Dna5String insertionSequence = seqan::Dna5String(lb->sequence);
317 +
318 +      /*
319 +       * First check to see if we need to adjust number of observed errors for the left (prev)
320 +       * hit. This is only the case if this segment runs into the insertion. To be consistent
321 +       * with bwt_map.cpp, we will not allow a segment to have errors in the insertion region
322 +       */
323 +      string colorSegmentSequence_prev;
324 +      if (insert_to_prev_right > 0)
325 +        {
326 +          const seqan::Dna5String referenceSequence = seqan::infix(*ref_str, lb->left + 1, prev_hit->right());
327 +          const seqan::Dna5String oldSegmentSequence = seqan::Dna5String(prev_hit->seq().substr(prev_hit->seq().length() - insert_to_prev_right));
328 +
329 +          if (color)
330 +      {
331 +        string color;
332 +
333 +        if (antisense)
334 +            color = rev_read_seq.substr(rev_read_seq.length() - (curr_seg_index * segment_length) - insert_to_prev_right - 2, insert_to_prev_right + 1);
335 +        else
336 +            color = read_seq.substr(curr_seg_index * segment_length - insert_to_prev_right, insert_to_prev_right + 1);
337 +
338 +        color[0] = prev_hit->seq()[segment_length - insert_to_prev_right - 1];
339 +        colorSegmentSequence_prev = convert_color_to_bp(color);
340 +      }
341 +
342 +          const seqan::Dna5String newSegmentSequence = color ? colorSegmentSequence_prev : oldSegmentSequence;
343 +
344 +          /*
345 +           * Scan right in the read until we run out of read
346 +           */
347 +          for (int read_index = 0; read_index < insert_to_prev_right; ++read_index)
348 +      {
349 +        /*
350 +         * Any mismatch to the insertion is a failure
351 +         */
352 +        if (referenceSequence[read_index] == 'N' || referenceSequence[read_index] != oldSegmentSequence[read_index])
353 +          {
354 +            ++this_reference_mismatch;
355 +          }
356 +
357 +        if (read_index < insertion_len)
358 +          {
359 +            if (insertionSequence[read_index] == 'N' || insertionSequence[read_index] != newSegmentSequence[read_index])
360 +        {
361 +          ++insertion_mismatch;
362 +          break;
363 +        }
364 +          }
365 +        else
366 +          {
367 +            if (referenceSequence[read_index - insertion_len] == 'N' ||
368 +          referenceSequence[read_index - insertion_len] != newSegmentSequence[read_index])
369 +        {
370 +          --this_reference_mismatch;
371 +        }
372 +          }
373 +      }
374 +        }
375 +
376 +      string colorSegmentSequence_curr;
377 +      if (curr_left_to_insert > 0)
378 +        {
379 +          const seqan::Dna5String referenceSequence = seqan::infix(*ref_str, curr_hit->left(), lb->left + 1);
380 +          const seqan::Dna5String oldSegmentSequence = seqan::Dna5String(curr_hit->seq().substr(0, curr_left_to_insert));
381 +
382 +          if (color)
383 +      {
384 +        string color;
385 +        if (antisense)
386 +            color = rev_read_seq.substr(rev_read_seq.length() - (curr_seg_index * segment_length), curr_left_to_insert);
387 +        else
388 +            color = read_seq.substr(curr_seg_index * segment_length + 2, curr_left_to_insert);
389 +
390 +        color.push_back(curr_hit->seq()[curr_left_to_insert]);
391 +        reverse(color.begin(), color.end());
392 +        string bp = convert_color_to_bp(color);
393 +        reverse(bp.begin(), bp.end());
394 +        colorSegmentSequence_curr = bp;
395 +      }
396 +
397 +          const seqan::Dna5String newSegmentSequence = color ? colorSegmentSequence_curr : oldSegmentSequence;
398 +
399 +          /*
400 +           * Scan left in the read until
401 +           * We ran out of read sequence (insertion extends past segment)
402 +           */
403 +          for (int read_index = 0; read_index < curr_left_to_insert; ++read_index)
404 +      {
405 +        int segmentPosition = curr_left_to_insert - read_index - 1;
406 +        int insertionPosition = insertion_len - read_index - 1;
407 +
408 +        if (referenceSequence[segmentPosition] == 'N' || (referenceSequence[segmentPosition] != oldSegmentSequence[segmentPosition]))
409 +          {
410 +            ++this_reference_mismatch;
411 +          }
412 +
413 +        if (read_index < insertion_len)
414 +          {
415 +            if (insertionSequence[insertionPosition] == 'N' || (insertionSequence[insertionPosition] != newSegmentSequence[segmentPosition]))
416 +        {
417 +          ++insertion_mismatch;
418 +          break;
419 +        }
420 +          }
421 +        else
422 +          {
423 +            if (referenceSequence[segmentPosition + insertion_len] == 'N' ||
424 +          (referenceSequence[segmentPosition + insertion_len] != newSegmentSequence[segmentPosition]))
425 +        {
426 +          --this_reference_mismatch;
427 +        }
428 +          }
429 +      }
430 +        }
431 +
432 +      if (found_closure)
433 +        {
434 +    fprintf(stderr, "Warning: multiple closures found for insertion read # %d\n", (int)insert_id);
435 +    return BowtieHit();
436 +        }
437 +
438 +      if (insertion_mismatch == 0)
439 +        {
440 +    reference_mismatch = this_reference_mismatch;
441 +    mismatch = -reference_mismatch;
442 +    found_closure = true;
443 +    new_left = prev_hit->left();
444 +    new_cigar = prev_hit->cigar();
445 +
446 +    /*
447 +     * Need to make a new insert operation between the two match character that begin
448 +     * and end the intersection of these two junction. Note that we necessarily assume
449 +     * that this insertion can't span beyond the boundaries of these reads. That should
450 +     * probably be better enforced somewhere
451 +     */
452 +
453 +    new_cigar.back().length -= insert_to_prev_right;
454 +    if (new_cigar.back().length <= 0)
455 +      new_cigar.pop_back();
456 +
457 +    new_cigar.push_back(CigarOp(INS, lb->sequence.size()));
458 +    vector<CigarOp> new_right_cigar = curr_hit->cigar();
459 +    new_right_cigar.front().length += (insert_to_prev_right - lb->sequence.size());
460 +
461 +    /*
462 +     * Finish stitching together the new cigar string
463 +     */
464 +    size_t c = new_right_cigar.front().length > 0 ? 0 : 1;
465 +    for (; c < new_right_cigar.size(); ++c)
466 +      {
467 +        new_cigar.push_back(new_right_cigar[c]);
468 +      }
469 +
470 +    if (color)
471 +      {
472 +        if (insert_to_prev_right > 0)
473 +          seq.replace(first_seg_length + (curr_seg_index - 1) * segment_length - insert_to_prev_right, insert_to_prev_right, colorSegmentSequence_prev);
474 +
475 +        if (curr_left_to_insert > 0)
476 +          seq.replace(first_seg_length + (curr_seg_index - 1) * segment_length, curr_left_to_insert, colorSegmentSequence_curr);
477 +      }
478 +        }
479 +    }
480 +        ++lb;
481 +  }
482 +
483 +  if (!found_closure)
484 +  {
485 +    return BowtieHit();
486 +  }
487 +      }
488 +
489 +      /*
490 +       * Stitch segments together using juctions or deletions if necessary.
491 +       */
492 +      else if (prev_hit->right() < curr_hit->left())
493 +  {
494 +    std::set<Junction>::iterator lb, ub;
495 +
496 +    int left_boundary = prev_hit->right() - 4;
497 +    int right_boundary = curr_hit->left() + 4;
498 +
499 +    lb = possible_juncs.upper_bound(Junction(reference_id, left_boundary, right_boundary - 8, true));
500 +    ub = possible_juncs.lower_bound(Junction(reference_id, left_boundary + 8, right_boundary, false));
501 +
502 +    int new_diff_mismatches = 0xff;
503 +    while (lb != ub && lb != possible_juncs.end())
504 +      {
505 +        int dist_to_left = lb->left - prev_hit->right() + 1;
506 +        int dist_to_right = lb->right - curr_hit->left();
507 +
508 +        if (abs(dist_to_left) <= 4 && abs(dist_to_right) <= 4 && dist_to_left == dist_to_right)
509 +    {
510 +      /*
511 +       * Check we have enough matched bases on prev or curr segment.
512 +       */
513 +      if (dist_to_left > curr_left_end_match_length || -dist_to_left > prev_right_end_match_length )
514 +        {
515 +          ++lb;
516 +          continue;
517 +        }
518 +
519 +      Dna5String new_cmp_str, old_cmp_str;
520 +      int new_mismatch = 0, old_mismatch = 0;
521 +      string new_patch_str; // this is for colorspace reads
522 +      if (dist_to_left > 0)
523 +        {
524 +          new_cmp_str = seqan::infix(*ref_str, prev_hit->right(), lb->left + 1);
525 +          old_cmp_str = seqan::infix(*ref_str, curr_hit->left(), lb->right);
526 +
527 +          string new_seq;
528 +          if (color)
529 +      {
530 +        string ref = DnaString_to_string(seqan::infix(*ref_str, prev_hit->right() - 1, lb->left + 1));
531 +
532 +        string color, qual;
533 +        if (antisense)
534 +          {
535 +            color = rev_read_seq.substr(rev_read_seq.length() - (curr_seg_index * segment_length) - 1, dist_to_left);
536 +            qual = rev_read_quals.substr(rev_read_quals.length() - (curr_seg_index * segment_length) - 1, dist_to_left);
537 +          }
538 +          else
539 +          {
540 +            color = read_seq.substr(1 + curr_seg_index * segment_length, dist_to_left);
541 +            qual = read_quals.substr(curr_seg_index * segment_length, dist_to_left);
542 +          }
543 +
544 +        BWA_decode(color, qual, ref, new_seq);
545 +        new_seq = new_seq.substr(1);
546 +      }
547 +
548 +          const string& curr_old_seq = curr_hit->seq();
549 +          const string& curr_seq = color ? new_seq : curr_hit->seq();
550 +          for (int i = 0; i < dist_to_left; ++i)
551 +      {
552 +        if (curr_seq[i] != new_cmp_str[i])
553 +          ++new_mismatch;
554 +
555 +        if (curr_old_seq[i] != old_cmp_str[i])
556 +          ++old_mismatch;
557 +      }
558 +
559 +          if (color)
560 +      new_patch_str = curr_seq.substr(0, dist_to_left);
561 +        }
562 +      else if (dist_to_left < 0)
563 +        {
564 +          new_cmp_str = seqan::infix(*ref_str, lb->right, curr_hit->left());
565 +          old_cmp_str = seqan::infix(*ref_str, lb->left + 1, prev_hit->right());
566 +
567 +          size_t abs_dist = -dist_to_left;
568 +          string new_seq;
569 +          if (color)
570 +      {
571 +        string ref = DnaString_to_string(seqan::infix(*ref_str, lb->left, lb->left + 1));
572 +        ref += DnaString_to_string(seqan::infix(*ref_str, lb->right, curr_hit->left()));
573 +
574 +        string color, qual;
575 +        if (antisense)
576 +          {
577 +            color = rev_read_seq.substr(rev_read_seq.length() - (curr_seg_index * segment_length) - 1 - abs_dist, abs_dist);
578 +            qual = rev_read_quals.substr(rev_read_quals.length() - (curr_seg_index * segment_length) - 1 - abs_dist, abs_dist);
579 +          }
580 +          else
581 +          {
582 +            color = read_seq.substr(1 + curr_seg_index * segment_length - abs_dist, abs_dist);
583 +            qual = read_quals.substr(curr_seg_index * segment_length - abs_dist, abs_dist);
584 +          }
585 +
586 +        BWA_decode(color, qual, ref, new_seq);
587 +        new_seq = new_seq.substr(1);
588 +      }
589 +
590 +          const string& prev_old_seq = prev_hit->seq();
591 +          size_t prev_old_seq_len = prev_old_seq.length();
592 +          const string& prev_seq = color ? new_seq : prev_hit->seq();
593 +          size_t prev_seq_len = prev_seq.length();
594 +          for (size_t i = 0; i < abs_dist; ++i)
595 +      {
596 +        if (prev_seq[prev_seq_len - (abs_dist - i)] != new_cmp_str[i])
597 +          ++new_mismatch;
598 +        if (prev_old_seq[prev_old_seq_len - (abs_dist - i)] != old_cmp_str[i])
599 +          ++old_mismatch;
600 +      }
601 +
602 +          if (color)
603 +      new_patch_str = prev_seq.substr(prev_seq_len - abs_dist, abs_dist);
604 +        }
605 +
606 +      int temp_diff_mismatches = new_mismatch - old_mismatch;
607 +      if (temp_diff_mismatches >= new_diff_mismatches || new_mismatch >= 2)
608 +        {
609 +          ++lb;
610 +          continue;
611 +        }
612 +
613 +      if (color)
614 +        {
615 +          /*
616 +           * We need to recover the origianl sequence.
617 +           */
618 +          if (found_closure)
619 +      {
620 +        seq.replace(first_seg_length + (curr_seg_index - 1) * segment_length - 4, 8,
621 +              prev_hit->seq().substr(prev_hit->seq().length() - 4) + curr_hit->seq().substr(0, 4));
622 +      }
623 +
624 +          if (dist_to_left > 0)
625 +      seq.replace(first_seg_length + (curr_seg_index - 1) * segment_length, dist_to_left, new_patch_str);
626 +          else if (dist_to_left < 0)
627 +      seq.replace(first_seg_length + (curr_seg_index - 1) * segment_length + dist_to_left, -dist_to_left, new_patch_str);
628 +        }
629 +
630 +      new_diff_mismatches = temp_diff_mismatches;
631 +
632 +      new_left = prev_hit->left();
633 +      new_cigar = prev_hit->cigar();
634 +
635 +      int new_left_back_len = new_cigar.back().length;
636 +      new_left_back_len += dist_to_left;
637 +
638 +      vector<CigarOp> new_right_cig = curr_hit->cigar();
639 +      int new_right_front_len = new_right_cig.front().length;
640 +      new_right_front_len -= dist_to_right;
641 +      if (new_left_back_len > 0)
642 +        new_cigar.back().length = new_left_back_len;
643 +      else
644 +        new_cigar.pop_back();
645 +
646 +      /*
647 +       * FIXME, currently just differentiating between a deletion and a
648 +       * reference skip based on length. However, would probably be better
649 +       * to denote the difference explicitly, this would allow the user
650 +       * to supply their own (very large) deletions
651 +       */
652 +      if ((lb->right - lb->left - 1) <= max_deletion_length)
653 +        {
654 +          new_cigar.push_back(CigarOp(DEL, lb->right - lb->left - 1));
655 +          antisense_closure = prev_hit->is_spliced() ? prev_hit->antisense_splice() : curr_hit->antisense_splice();
656 +        }
657 +      else
658 +        {
659 +          new_cigar.push_back(CigarOp(REF_SKIP, lb->right - lb->left - 1));
660 +          antisense_closure = lb->antisense;
661 +        }
662 +
663 +      new_right_cig.front().length = new_right_front_len;
664 +      size_t c = new_right_front_len > 0 ? 0 : 1;
665 +      for (; c < new_right_cig.size(); ++c)
666 +        new_cigar.push_back(new_right_cig[c]);
667 +
668 +      mismatch = new_diff_mismatches;
669 +      found_closure = true;
670 +    }
671 +        ++lb;
672 +      }
673 +
674 +    if (!found_closure)
675 +      {
676 +        return BowtieHit();
677 +      }
678 +  }
679 +
680 +      if (found_closure)
681 +  {
682 +    bool end = false;
683 +    BowtieHit merged_hit(reference_id,
684 +                         reference_id,
685 +             insert_id,
686 +             new_left,
687 +             new_cigar,
688 +             antisense,
689 +             antisense_closure,
690 +             prev_hit->edit_dist() + curr_hit->edit_dist() + mismatch,
691 +             prev_hit->splice_mms() + curr_hit->splice_mms(),
692 +             end);
693 +
694 +    if (curr_seg_index > 1)
695 +      merged_hit.seq(seq.substr(first_seg_length + (curr_seg_index - 1) * segment_length, 2 * segment_length));
696 +    else
697 +      merged_hit.seq(seq.substr(0, first_seg_length + segment_length));
698 +
699 +    prev_hit = hit_chain.erase(prev_hit, ++curr_hit);
700 +    /*
701 +     * prev_hit now points PAST the last element removed
702 +     */
703 +    prev_hit = hit_chain.insert(prev_hit, merged_hit);
704 +    /*
705 +     * merged_hit has been inserted before the old position of
706 +     * prev_hit. New location of prev_hit is merged_hit
707 +     */
708 +    curr_hit = prev_hit;
709 +    ++curr_hit;
710 +    ++curr_seg_index;
711 +    continue;
712 +  }
713 +
714 +      ++prev_hit;
715 +      ++curr_hit;
716 +      ++curr_seg_index;
717 +    }
718 +
719 +  bool saw_antisense_splice = false;
720 +  bool saw_sense_splice = false;
721 +  vector<CigarOp> long_cigar;
722 +  int num_mismatches = 0;
723 +  int num_splice_mms = 0;
724 +  for (list<BowtieHit>::iterator s = hit_chain.begin(); s != hit_chain.end(); ++s)
725 +    {
726 +      num_mismatches += s->edit_dist();
727 +      num_splice_mms += s->splice_mms();
728 +
729 +      /*
730 +       * Check whether the sequence contains any reference skips. Previously,
731 +       * this was just a check to see whether the sequence was contiguous; however
732 +       * we don't want to count an indel event as a splice
733 +       */
734 +      bool containsSplice = s->is_spliced();
735 +      if (containsSplice)
736 +  {
737 +    if (s->antisense_splice())
738 +      {
739 +        if (saw_sense_splice)
740 +    return BowtieHit();
741 +        saw_antisense_splice = true;
742 +      }
743 +    else
744 +      {
745 +        if (saw_antisense_splice)
746 +    return BowtieHit();
747 +        saw_sense_splice = true;
748 +      }
749 +  }
750 +      const vector<CigarOp>& cigar = s->cigar();
751 +      if (long_cigar.empty())
752 +  {
753 +    long_cigar = cigar;
754 +  }
755 +      else
756 +  {
757 +    CigarOp& last = long_cigar.back();
758 +    /*
759 +     * If necessary, merge the back and front
760 +     * cigar operations
761 +     */
762 +    if(last.opcode == cigar[0].opcode){
763 +      last.length += cigar[0].length;
764 +      for (size_t b = 1; b < cigar.size(); ++b)
765 +        {
766 +    long_cigar.push_back(cigar[b]);
767 +        }
768 +    }else{
769 +      for(size_t b = 0; b < cigar.size(); ++b)
770 +        {
771 +    long_cigar.push_back(cigar[b]);
772 +        }
773 +    }
774 +  }
775 +    }
776 +
777 +  bool end = false;
778 +  BowtieHit new_hit(reference_id,
779 +                    reference_id,
780 +        insert_id,
781 +        left,
782 +        long_cigar,
783 +        antisense,
784 +        saw_antisense_splice,
785 +        num_mismatches,
786 +        num_splice_mms,
787 +        end);
788 +
789 +  new_hit.seq(seq);
790 +  new_hit.qual(qual);
791 +
792 +  int new_read_len = new_hit.read_len();
793 +  if (new_read_len != old_read_length || !new_hit.check_editdist_consistency(rt))
794 +    {
795 +      fprintf(stderr, "Warning: malformed closure\n");
796 +      return BowtieHit();
797 +    }
798 +
799 +  return new_hit;
800 + }
801 +
802   BowtieHit merge_chain(RefSequenceTable& rt,
803                        const string& read_seq,
804                        const string& read_quals,
# Line 1320 | Line 1957
1957    if (fusion_dir == FUSION_NOTHING || fusion_dir == FUSION_FF || fusion_dir == FUSION_RR)
1958      {
1959        new_hit.seq(seq);
1960 <      new_hit.qual(qual);
1960 >      if (bowtie2)
1961 >        {
1962 >          // for the time being, let's compare "seq" and "read_seq"
1963 >          if (seq != read_seq)
1964 >            {
1965 >              string temp_qual = read_quals;
1966 >              reverse(temp_qual.begin(), temp_qual.end());
1967 >              new_hit.qual(temp_qual);
1968 >            }
1969 >          else
1970 >            new_hit.qual(read_quals);
1971 >        }
1972 >      else
1973 >        new_hit.qual(qual);
1974      }
1975    else
1976      {
# Line 1360 | Line 2010
2010      }
2011  
2012    int new_read_len = new_hit.read_len();
2013 <  if (new_read_len != old_read_length || !new_hit.check_editdist_consistency(rt))
2013 >  if (new_read_len != old_read_length || !new_hit.check_editdist_consistency(rt, bDebug))
2014      {
2015        // daehwan
2016        if (bDebug)
# Line 1516 | Line 2166
2166              }
2167          }
2168  
2169 <      bh = merge_chain(rt,
2170 <                       read_seq,
2171 <                       read_quals,
2172 <                       possible_juncs,
2173 <                       possible_insertions,
2174 <                       possible_fusions,
2175 <                       hit_chain,
2176 <                       fusion_dir);
2169 >      // todo: merge_chain_color needs to be merged into merge_chain fuction.
2170 >      if (color)
2171 >        bh = merge_chain_color(rt,
2172 >                         read_seq,
2173 >                         read_quals,
2174 >                         possible_juncs,
2175 >                         possible_insertions,
2176 >                               hit_chain);
2177 >      else
2178 >        bh = merge_chain(rt,
2179 >                         read_seq,
2180 >                         read_quals,
2181 >                         possible_juncs,
2182 >                         possible_insertions,
2183 >                         possible_fusions,
2184 >                         hit_chain,
2185 >                         fusion_dir);
2186      }
2187    else
2188      {
# Line 1620 | Line 2279
2279                     << "\t"
2280                     << (currHit->antisense_align2() ? "-" : "+")
2281                     << endl;
2282 <              cout << "daehwan - curr corrds: "
2282 >              cout << "daehwan - curr coords: "
2283                     << currHit->left()
2284                     << "\t"
2285                     << currHit->right()
# Line 1915 | Line 2574
2574   {      
2575    vector<BowtieHit> seg_hit_stack;
2576    bool join_success = false;
2577 <
2577 >  
2578    // ignore segments that map to more than this many places.
2579    if (bowtie2)
2580      {
1922      const size_t max_seg_hits = max_multihits * 2;
2581        for (size_t s = 0; s < seg_hits_for_read.size(); ++s)
2582          {
2583 <          if (seg_hits_for_read[s].hits.size() > max_seg_hits)
2583 >          if (seg_hits_for_read[s].hits.size() > max_seg_multihits)
2584              return join_success;
2585          }
2586      }
# Line 1931 | Line 2589
2589      {
2590        BowtieHit& bh = seg_hits_for_read[0].hits[i];
2591  
2592 +      // daehwan - remove this
2593 +      //if (bh.insert_id() == 811745)
2594 +      //bDebug = true;
2595 +      
2596        if (bh.fusion_opcode() == FUSION_RR)
2597          seg_hit_stack.push_back(bh.reverse());
2598        else
# Line 1947 | Line 2609
2609                                    seg_hit_stack,
2610                                    joined_hits);
2611        if (success)
2612 <  join_success = true;
2612 >        join_success = true;
2613        seg_hit_stack.pop_back();
2614      }
2615 <
2615 >  
2616    return join_success;
2617   }
2618  
# Line 1968 | Line 2630
2630        readstream.seek(read_offset);
2631  
2632      bool need_seq = true;
2633 <    bool need_qual = color;
2633 >    bool need_qual = true;
2634  
2635      vector<HitStream> contig_hits;
2636      vector<HitStream> spliced_hits;
# Line 2131 | Line 2793
2793                          if (joined_hits[i].fusion_opcode() != FUSION_NOTHING)
2794                            ref_name2 = rt->get_name(joined_hits[i].ref_id2());
2795  
2796 <                        if (color && !color_out)
2796 >                        vector<string> extra_fields;
2797 >
2798 >                        if (!color)
2799 >                          bowtie_sam_extra(joined_hits[i], *rt, extra_fields);
2800 >
2801 >                        if (color)
2802                            print_bamhit(bam_writer, read.name.c_str(), joined_hits[i], ref_name, ref_name2,
2803 <                                       joined_hits[i].seq().c_str(), joined_hits[i].qual().c_str(), true);
2803 >                                       joined_hits[i].seq().c_str(), joined_hits[i].qual().c_str(), true, &extra_fields);
2804                          else
2805                            print_bamhit(bam_writer, read.name.c_str(), joined_hits[i], ref_name, ref_name2,
2806 <                                       read.seq.c_str(), read.qual.c_str(), false);
2806 >                                       read.seq.c_str(), read.qual.c_str(), false,  &extra_fields);
2807                        }
2808                    }
2809                  else

Diff Legend

Removed lines
+ Added lines
< Changed lines
> Changed lines