ViewVC Help
View File | Revision Log | Show Annotations | View Changeset | Root Listing
root/gclib/tophat_cpp/long_spanning_reads.cpp
(Generate patch)
# Line 31 | Line 31
31   #include <seqan/modifier.h>
32   #include <getopt.h>
33  
34 + #include <boost/thread.hpp>
35 +
36   #include "common.h"
37 + #include "utils.h"
38   #include "bwt_map.h"
39   #include "tokenize.h"
40   #include "segments.h"
# Line 40 | Line 43
43   #include "junctions.h"
44   #include "insertions.h"
45   #include "deletions.h"
46 + #include "fusions.h"
47  
48   using namespace seqan;
49   using namespace std;
50  
51 + // daehwan
52 + bool bDebug = false;
53 +
54   void print_usage()
55   {
56      fprintf(stderr, "Usage:   long_spanning_reads <reads.fa/.fq> <possible_juncs1,...,possible_juncsN> <possible_insertions1,...,possible_insertionsN> <possible_deletions1,...,possible_deletionsN> <seg1.bwtout,...,segN.bwtout> [spliced_seg1.bwtout,...,spliced_segN.bwtout]\n");
# Line 57 | Line 64
64  
65   void get_seqs(istream& ref_stream,
66          RefSequenceTable& rt,
67 <        bool keep_seqs = true,
61 <        bool strip_slash = false)
67 >        bool keep_seqs = true)
68   {
69    while(ref_stream.good() &&
70    !ref_stream.eof())
# Line 118 | Line 124
124    HitsForRead& curr_seg_hits = seg_hits_for_read[right_file];
125  
126    if (right_file < (int)spliced_hits.size() && right_file >= 0)
127 <    {
127 >    {    
128        // Scan forward in the spliced hits file for this hit group
129        HitsForRead spliced_group;
130        HitsForRead curr_spliced_group;
# Line 157 | Line 163
163   }
164  
165   BowtieHit merge_chain(RefSequenceTable& rt,
166 <          const string& read_seq,
167 <          const string& read_quals,
168 <          std::set<Junction>& possible_juncs,
169 <          std::set<Insertion>& possible_insertions,
170 <          list<BowtieHit>& hit_chain)
166 >                      const string& read_seq,
167 >                      const string& read_quals,
168 >                      std::set<Junction>& possible_juncs,
169 >                      std::set<Insertion>& possible_insertions,
170 >                      std::set<Fusion>& possible_fusions,
171 >                      list<BowtieHit>& hit_chain,
172 >                      int fusion_dir = FUSION_NOTHING)
173   {
174    bool antisense = hit_chain.front().antisense_align();
167  uint32_t reference_id = hit_chain.front().ref_id();
175    uint64_t insert_id = hit_chain.front().insert_id();
176 <
177 <  int left = hit_chain.front().left();
178 <
176 >  
177 >  const int left = hit_chain.front().left();    
178 >  
179    list<BowtieHit>::iterator prev_hit = hit_chain.begin();
180    list<BowtieHit>::iterator curr_hit = ++(hit_chain.begin());
181 <
181 >  
182    string seq;
183    string qual;
184    int old_read_length = 0;
# Line 188 | Line 195
195      {
196        rev_read_seq = read_seq;
197        reverse(rev_read_seq.begin() + 1, rev_read_seq.end());
198 <
198 >      
199        rev_read_quals = read_quals;
200        reverse(rev_read_quals.begin(), rev_read_quals.end());
201      }
202  
203 +  size_t num_fusions = prev_hit->fusion_opcode() == FUSION_NOTHING ? 0 : 1;
204 +  bool fusion_passed = false;
205    while (curr_hit != hit_chain.end() && prev_hit != hit_chain.end())
206      {
207 <      /*
208 <       * Note that the gap size may be negative, since left() and right() return
200 <       * signed integers, this will be OK.
201 <       */
202 <      int gap  = curr_hit->left() - prev_hit->right();
203 <      if (gap < -(int)max_insertion_length ||
204 <    (gap > (int)max_deletion_length &&
205 <     (gap < min_report_intron_length || gap > max_report_intron_length)))
206 <  {
207 <    return BowtieHit();
208 <  }
207 >      if (prev_hit->ref_id() != prev_hit->ref_id2() || prev_hit->ref_id2() != curr_hit->ref_id())
208 >          fusion_passed = true;
209  
210 +      if (prev_hit->ref_id2() != curr_hit->ref_id())
211 +        ++num_fusions;
212 +      
213 +      if (curr_hit->fusion_opcode() != FUSION_NOTHING)
214 +        ++num_fusions;
215 +
216 +      if (prev_hit->ref_id2() == curr_hit->ref_id())
217 +        {
218 +          bool reversed = false;
219 +          if ((fusion_dir == FUSION_FR && fusion_passed) || (fusion_dir == FUSION_RF && !fusion_passed))
220 +            reversed = true;
221 +          
222 +          /*
223 +           * Note that the gap size may be negative, since left() and right() return
224 +           * signed integers, this will be OK.
225 +           */
226 +          int gap;
227 +          if (reversed)
228 +            gap = prev_hit->right() - curr_hit->left();
229 +          else
230 +            gap = curr_hit->left() - prev_hit->right();
231 +
232 +          // daehwan
233 +          if (bDebug)
234 +            {
235 +              cout << "curr left: " << curr_hit->left() << endl
236 +                   << "prev right: " << prev_hit->right() << endl
237 +                   << "gap: " << gap << endl;
238 +            }
239 +          
240 +          if (gap < -(int)max_insertion_length ||
241 +              (gap > (int)max_deletion_length &&
242 +               (gap < min_report_intron_length || gap > min(max_report_intron_length, (int)fusion_min_dist))))
243 +            {
244 +              fusion_passed = true;
245 +              ++num_fusions;
246 +            }
247 +        }
248 +
249 +      if (num_fusions >= 2)
250 +          return BowtieHit();
251 +      
252        ++prev_hit;
253        ++curr_hit;
254      }
255 <
255 >  
256    prev_hit = hit_chain.begin();
257    curr_hit = ++(hit_chain.begin());
258  
259 <  RefSequenceTable::Sequence* ref_str = rt.get_seq(prev_hit->ref_id());
260 <  if (!ref_str)
261 <    return BowtieHit();
262 <
259 >  // daehwan
260 >  if (bDebug)
261 >    {
262 >      cout << "daehwan - test" << endl;
263 >    }
264 >  
265    int curr_seg_index = 1;
266 +  fusion_passed = false;
267    while (curr_hit != hit_chain.end() && prev_hit != hit_chain.end())
268      {
269 +      // daehwan
270 +      if (bDebug)
271 +        {
272 +          cout << "daehwan - start - stitch" << endl;
273 +          cout << "prev right: " << prev_hit->right() << endl;
274 +          cout << "curr left: " << curr_hit->left() << endl;
275 +          cout << "prev back: " << prev_hit->cigar().back().opcode << endl;
276 +          cout << "curr front: " << curr_hit->cigar().front().opcode << endl;
277 +          cout << "prev refs: " << prev_hit->ref_id() << "-" << prev_hit->ref_id2() << endl;
278 +          cout << "curr refs: " << curr_hit->ref_id() << "-" << curr_hit->ref_id2() << endl;
279 +        }
280 +
281 +      if (prev_hit->fusion_opcode() != FUSION_NOTHING || prev_hit->ref_id2() != curr_hit->ref_id())
282 +        fusion_passed = true;
283 +      
284        /*
285         * This code is assuming that the cigar strings end and start with a match
286         * While segment alignments can actually end with a junction, insertion or deletion, the hope
287         * is that in those cases, the right and left ends of the alignments will correctly
288         * line up, so we won't get to this bit of code
289         */
290 <      if (prev_hit->cigar().back().opcode != MATCH || curr_hit->cigar().front().opcode != MATCH)
291 <  {
292 <    return BowtieHit();
293 <  }
294 <
290 >      if (!(prev_hit->cigar().back().opcode == MATCH || curr_hit->cigar().front().opcode == MATCH ||
291 >            prev_hit->cigar().back().opcode == mATCH || curr_hit->cigar().front().opcode == mATCH))
292 >        {
293 >          return BowtieHit();
294 >        }
295 >
296 >      // daehwan
297 >      if (bDebug)
298 >        {
299 >          cout << "daehwan - pass - enough matched bases" << endl;
300 >        }
301 >      
302        if (prev_hit->is_spliced() && curr_hit->is_spliced() && prev_hit->antisense_splice() != curr_hit->antisense_splice())
303 <  {
304 <    /*
305 <     * There is no way that we can splice these together into a valid
306 <     * alignment
307 <     */
308 <    return BowtieHit();
309 <  }
303 >        {
304 >          /*
305 >           * There is no way that we can splice these together into a valid
306 >           * alignment
307 >           */
308 >          return BowtieHit();
309 >        }
310  
311        bool found_closure = false;
312        /*
# Line 249 | Line 316
316        vector<CigarOp> new_cigar;
317        int new_left = -1;
318        int mismatch = 0;
319 <
319 >      
320        /*
321         * Take the length of matched bases in each segment into consideration for closures,
322         * this can be a problem for reads of variable lengths.
# Line 257 | Line 324
324        int prev_right_end_match_length = prev_hit->cigar().back().length;
325        int curr_left_end_match_length = curr_hit->cigar().front().length;
326  
327 <      if (prev_hit->right() > curr_hit->left())
261 <  {
262 <    std::set<Insertion>::iterator lb, ub;
263 <    /*
264 <     * Note, this offset is determined by the min-anchor length supplied to
265 <     * juncs_db, which is currently hard-coded at 3 in tophat.py
266 <     * as this value determines what sort of read segments
267 <     * should be able to align directly to the splice sequences
268 <     */
269 <    int left_boundary = prev_hit->right() - 4;
270 <    int right_boundary = curr_hit->left() + 4;
271 <
272 <    /*
273 <     * Create a dummy sequence to represent the maximum possible insertion
274 <     */
275 <    std::string maxInsertedSequence = "";
276 <    maxInsertedSequence.resize(max_insertion_length,'A');
327 >      bool check_fusion = prev_hit->ref_id2() != curr_hit->ref_id();
328  
329 <    lb = possible_insertions.upper_bound(Insertion(reference_id, left_boundary, ""));
330 <    ub = possible_insertions.upper_bound(Insertion(reference_id, right_boundary, maxInsertedSequence));
331 <
332 <    int reference_mismatch = 0;
333 <
334 <    while (lb != ub && lb != possible_insertions.end())
335 <      {
336 <        /*
337 <         * In the following code, we will check to make sure that the segments have the proper
338 <         * separation and sequence for the insertions, and generate the appropriate merged bowtie hit
339 <         * In general, reads with insertions must match the inserted sequence exactly.
340 <         */
341 <        if (((int)lb->sequence.size()) == (prev_hit->right() - curr_hit->left()))
342 <    {
343 <      /*
344 <       * Check we have enough matched bases on prev or curr segment.
345 <       */
346 <      int insert_to_prev_right = prev_hit->right() - lb->left - 1;
347 <      int curr_left_to_insert = lb->left - curr_hit->left() + 1;
348 <      if (insert_to_prev_right > prev_right_end_match_length || curr_left_to_insert > curr_left_end_match_length)
349 <        {
350 <          ++lb;
351 <          continue;
352 <        }
353 <
354 <      /*
355 <       * Keep track of how many mismatches were made to the genome in the region
356 <       * where we should actually be matching against the insertion
357 <       */
358 <      int this_reference_mismatch = 0;
359 <      int insertion_mismatch = 0;
360 <      int insertion_len = lb->sequence.length();
361 <      const seqan::Dna5String insertionSequence = seqan::Dna5String(lb->sequence);
362 <
363 <      /*
364 <       * First check to see if we need to adjust number of observed errors for the left (prev)
365 <       * hit. This is only the case if this segment runs into the insertion. To be consistent
366 <       * with bwt_map.cpp, we will not allow a segment to have errors in the insertion region
367 <       */
368 <      string colorSegmentSequence_prev;
369 <      if (insert_to_prev_right > 0)
370 <        {
371 <          const seqan::Dna5String referenceSequence = seqan::infix(*ref_str, lb->left + 1, prev_hit->right());
372 <          const seqan::Dna5String oldSegmentSequence = seqan::Dna5String(prev_hit->seq().substr(prev_hit->seq().length() - insert_to_prev_right));
373 <
374 <          if (color)
375 <      {
376 <        string color;
377 <
378 <        if (antisense)
379 <            color = rev_read_seq.substr(rev_read_seq.length() - (curr_seg_index * segment_length) - insert_to_prev_right - 2, insert_to_prev_right + 1);
380 <        else
381 <            color = read_seq.substr(curr_seg_index * segment_length - insert_to_prev_right, insert_to_prev_right + 1);
382 <
383 <        color[0] = prev_hit->seq()[segment_length - insert_to_prev_right - 1];
384 <        colorSegmentSequence_prev = convert_color_to_bp(color);
385 <      }
386 <
387 <          const seqan::Dna5String newSegmentSequence = color ? colorSegmentSequence_prev : oldSegmentSequence;
388 <
389 <          /*
390 <           * Scan right in the read until we run out of read
391 <           */
392 <          for (int read_index = 0; read_index < insert_to_prev_right; ++read_index)
393 <      {
394 <        /*
395 <         * Any mismatch to the insertion is a failure
396 <         */
397 <        if (referenceSequence[read_index] == 'N' || referenceSequence[read_index] != oldSegmentSequence[read_index])
398 <          {
399 <            ++this_reference_mismatch;
400 <          }
401 <
402 <        if (read_index < insertion_len)
403 <          {
404 <            if (insertionSequence[read_index] == 'N' || insertionSequence[read_index] != newSegmentSequence[read_index])
405 <        {
406 <          ++insertion_mismatch;
407 <          break;
408 <        }
409 <          }
410 <        else
411 <          {
412 <            if (referenceSequence[read_index - insertion_len] == 'N' ||
413 <          referenceSequence[read_index - insertion_len] != newSegmentSequence[read_index])
414 <        {
415 <          --this_reference_mismatch;
416 <        }
417 <          }
418 <      }
419 <        }
420 <
421 <      string colorSegmentSequence_curr;
422 <      if (curr_left_to_insert > 0)
423 <        {
424 <          const seqan::Dna5String referenceSequence = seqan::infix(*ref_str, curr_hit->left(), lb->left + 1);
425 <          const seqan::Dna5String oldSegmentSequence = seqan::Dna5String(curr_hit->seq().substr(0, curr_left_to_insert));
426 <
427 <          if (color)
428 <      {
429 <        string color;
430 <        if (antisense)
431 <            color = rev_read_seq.substr(rev_read_seq.length() - (curr_seg_index * segment_length), curr_left_to_insert);
432 <        else
433 <            color = read_seq.substr(curr_seg_index * segment_length + 2, curr_left_to_insert);
434 <
435 <        color.push_back(curr_hit->seq()[curr_left_to_insert]);
436 <        reverse(color.begin(), color.end());
437 <        string bp = convert_color_to_bp(color);
438 <        reverse(bp.begin(), bp.end());
439 <        colorSegmentSequence_curr = bp;
440 <      }
441 <
442 <          const seqan::Dna5String newSegmentSequence = color ? colorSegmentSequence_curr : oldSegmentSequence;
443 <
444 <          /*
445 <           * Scan left in the read until
446 <           * We ran out of read sequence (insertion extends past segment)
447 <           */
448 <          for (int read_index = 0; read_index < curr_left_to_insert; ++read_index)
449 <      {
450 <        int segmentPosition = curr_left_to_insert - read_index - 1;
451 <        int insertionPosition = insertion_len - read_index - 1;
452 <
453 <        if (referenceSequence[segmentPosition] == 'N' || (referenceSequence[segmentPosition] != oldSegmentSequence[segmentPosition]))
454 <          {
455 <            ++this_reference_mismatch;
456 <          }
457 <
458 <        if (read_index < insertion_len)
459 <          {
460 <            if (insertionSequence[insertionPosition] == 'N' || (insertionSequence[insertionPosition] != newSegmentSequence[segmentPosition]))
461 <        {
462 <          ++insertion_mismatch;
463 <          break;
464 <        }
465 <          }
466 <        else
467 <          {
468 <            if (referenceSequence[segmentPosition + insertion_len] == 'N' ||
469 <          (referenceSequence[segmentPosition + insertion_len] != newSegmentSequence[segmentPosition]))
470 <        {
471 <          --this_reference_mismatch;
472 <        }
473 <          }
474 <      }
475 <        }
329 >      if (prev_hit->ref_id2() == curr_hit->ref_id())
330 >        {
331 >          // daehwan
332 >          if (bDebug)
333 >            {
334 >              cout << "daehwan - start - junction or insertion" << endl;
335 >              cout << "prev right: " << prev_hit->right() << endl;
336 >              cout << "curr left: " << curr_hit->left() << endl;
337 >              cout << "prev refs: " << prev_hit->ref_id() << "-" << prev_hit->ref_id2() << endl;
338 >              cout << "curr refs: " << curr_hit->ref_id() << "-" << curr_hit->ref_id2() << endl;
339 >            }
340 >
341 >          bool reversed = false;
342 >          if ((fusion_dir == FUSION_FR && fusion_passed) || (fusion_dir == FUSION_RF && !fusion_passed))
343 >            reversed = true;
344 >
345 >          uint32_t reference_id = prev_hit->ref_id2();
346 >          RefSequenceTable::Sequence* ref_str = rt.get_seq(reference_id);
347 >          
348 >          int left_boundary, right_boundary;
349 >          if (reversed)
350 >            {
351 >              left_boundary = curr_hit->left() - 4;
352 >              right_boundary = prev_hit->right() + 4;
353 >            }
354 >          else
355 >            {
356 >              left_boundary = prev_hit->right() - 4;
357 >              right_boundary = curr_hit->left() + 4;
358 >            }
359 >
360 >          int dist_btw_two;
361 >          if (reversed)
362 >            dist_btw_two = prev_hit->right() - curr_hit->left();
363 >          else
364 >            dist_btw_two = curr_hit->left() - prev_hit->right();
365 >
366 >          if (dist_btw_two < 0 && dist_btw_two >= -max_insertion_length && prev_hit->antisense_align2() == curr_hit->antisense_align())
367 >            {
368 >              std::set<Insertion>::iterator lb, ub;
369 >              
370 >              /*
371 >               * Create a dummy sequence to represent the maximum possible insertion
372 >               */
373 >              std::string maxInsertedSequence = "";
374 >              maxInsertedSequence.resize(max_insertion_length,'A');
375 >
376 >              lb = possible_insertions.upper_bound(Insertion(reference_id, left_boundary, ""));
377 >              ub = possible_insertions.upper_bound(Insertion(reference_id, right_boundary, maxInsertedSequence));      
378 >              
379 >              int reference_mismatch = 0;
380 >              
381 >              while (lb != ub && lb != possible_insertions.end())
382 >                {
383 >                  /*
384 >                   * In the following code, we will check to make sure that the segments have the proper
385 >                   * separation and sequence for the insertions, and generate the appropriate merged bowtie hit
386 >                   * In general, reads with insertions must match the inserted sequence exactly.
387 >                   */
388 >                  if (((int)lb->sequence.size()) == (reversed ? curr_hit->left() - prev_hit->right() : prev_hit->right() - curr_hit->left()))
389 >                    {
390 >                      /*
391 >                       * Check we have enough matched bases on prev or curr segment.
392 >                       */
393 >                      int insert_to_prev_right, curr_left_to_insert;
394 >                      if (reversed)
395 >                        {
396 >                          insert_to_prev_right = lb->left - prev_hit->right();
397 >                          curr_left_to_insert = curr_hit->left() - lb->left;
398 >                        }
399 >                      else
400 >                        {
401 >                          insert_to_prev_right = prev_hit->right() - lb->left - 1;
402 >                          curr_left_to_insert = lb->left - curr_hit->left() + 1;
403 >                        }
404 >
405 >                      if (insert_to_prev_right > prev_right_end_match_length || curr_left_to_insert > curr_left_end_match_length)
406 >                        {
407 >                          ++lb;
408 >                          continue;
409 >                        }
410 >
411 >                      // daehwan
412 >                      if (bDebug)
413 >                        {
414 >                          cout << "insert_to_prev_right: " << insert_to_prev_right << endl;
415 >                          cout << "curr_left_to_insert: " << curr_left_to_insert << endl;
416 >                          cout << "curr_seg_index: " << curr_seg_index << endl;
417 >                        }
418 >                      
419 >                      /*
420 >                       * Keep track of how many mismatches were made to the genome in the region
421 >                       * where we should actually be matching against the insertion
422 >                       */
423 >                      int this_reference_mismatch = 0;
424 >                      int insertion_mismatch = 0;
425 >                      int insertion_len = lb->sequence.length();
426 >                      seqan::Dna5String insertionSequence = seqan::Dna5String(lb->sequence);
427 >                      if (reversed)
428 >                        {
429 >                          seqan::convertInPlace(insertionSequence, seqan::FunctorComplement<seqan::Dna>());
430 >                          seqan::reverseInPlace(insertionSequence);
431 >                        }
432 >                      
433 >                      /*
434 >                       * First check to see if we need to adjust number of observed errors for the left (prev)
435 >                       * hit. This is only the case if this segment runs into the insertion. To be consistent
436 >                       * with bwt_map.cpp, we will not allow a segment to have errors in the insertion region
437 >                       */
438 >                      string colorSegmentSequence_prev;
439 >                      if (insert_to_prev_right > 0)
440 >                        {
441 >                          seqan::Dna5String referenceSequence, oldSegmentSequence;
442 >
443 >                          if (reversed)
444 >                            {
445 >                              referenceSequence = seqan::infix(*ref_str, prev_hit->right() + 1, lb->left + 1);
446 >                              seqan::convertInPlace(referenceSequence, seqan::FunctorComplement<seqan::Dna>());
447 >                              seqan::reverseInPlace(referenceSequence);
448 >
449 >                              string temp;
450 >
451 >                              // daehwan
452 >                              if (bDebug)
453 >                                {
454 >                                  cout << "reversed: " << read_seq.length() << " " << read_seq << endl;
455 >                                }
456 >                              
457 >                              temp = read_seq.substr(curr_seg_index * segment_length - insert_to_prev_right, insert_to_prev_right);
458 >                              oldSegmentSequence = seqan::Dna5String(temp);
459 >                            }
460 >                          else
461 >                            {
462 >                              referenceSequence = seqan::infix(*ref_str, lb->left + 1, prev_hit->right());
463 >
464 >                              // daehwan
465 >                              if (bDebug)
466 >                                {
467 >                                  cout << "non-reversed: " << prev_hit->seq() << endl;
468 >                                }
469 >
470 >                              oldSegmentSequence = seqan::Dna5String(prev_hit->seq().substr(prev_hit->seq().length() - insert_to_prev_right));
471 >                            }
472 >                          
473 >                          if (color)
474 >                            {
475 >                              string color;
476 >                              
477 >                              if (antisense)
478 >                                color = rev_read_seq.substr(rev_read_seq.length() - (curr_seg_index * segment_length) - insert_to_prev_right - 2, insert_to_prev_right + 1);
479 >                              else
480 >                                color = read_seq.substr(curr_seg_index * segment_length - insert_to_prev_right, insert_to_prev_right + 1);
481 >                              
482 >                              color[0] = prev_hit->seq()[segment_length - insert_to_prev_right - 1];
483 >                              colorSegmentSequence_prev = convert_color_to_bp(color);
484 >                            }
485 >                          
486 >                          const seqan::Dna5String newSegmentSequence = color ? colorSegmentSequence_prev : oldSegmentSequence;
487 >
488 >                          // daehwan
489 >                          if (bDebug)
490 >                            {
491 >                              cout << "ref: " << referenceSequence << endl;
492 >                              cout << "old: " << oldSegmentSequence << endl;
493 >                              cout << "ins: " << insertionSequence << endl;
494 >                            }
495 >                          
496 >                          /*
497 >                           * Scan right in the read until we run out of read
498 >                           */
499 >                          for (int read_index = 0; read_index < insert_to_prev_right; ++read_index)
500 >                            {
501 >                              /*  
502 >                               * Any mismatch to the insertion is a failure
503 >                               */
504 >                              if (referenceSequence[read_index] == 'N' || referenceSequence[read_index] != oldSegmentSequence[read_index])
505 >                                {
506 >                                  ++this_reference_mismatch;
507 >                                }
508 >                              
509 >                              if (read_index < insertion_len)
510 >                                {
511 >                                  if (insertionSequence[read_index] == 'N' || insertionSequence[read_index] != newSegmentSequence[read_index])
512 >                                    {
513 >                                      ++insertion_mismatch;
514 >                                      break;
515 >                                    }
516 >                                }
517 >                              else
518 >                                {
519 >                                  if (referenceSequence[read_index - insertion_len] == 'N' ||
520 >                                      referenceSequence[read_index - insertion_len] != newSegmentSequence[read_index])
521 >                                    {
522 >                                      --this_reference_mismatch;
523 >                                    }
524 >                                }
525 >                            }
526 >                        }
527 >                      
528 >                      string colorSegmentSequence_curr;
529 >                      if (curr_left_to_insert > 0)
530 >                        {
531 >                          seqan::Dna5String referenceSequence, oldSegmentSequence;
532 >                          if (reversed)
533 >                            {
534 >                              referenceSequence = seqan::infix(*ref_str, lb->left + 1, curr_hit->left() + 1);
535 >                              seqan::convertInPlace(referenceSequence, seqan::FunctorComplement<seqan::Dna>());
536 >                              seqan::reverseInPlace(referenceSequence);
537 >
538 >                              string temp = read_seq.substr(curr_seg_index * segment_length, curr_left_to_insert);
539 >                              oldSegmentSequence = seqan::Dna5String(temp);
540 >                            }
541 >                          else
542 >                            {
543 >                              referenceSequence = seqan::infix(*ref_str, curr_hit->left(), lb->left + 1);
544 >                              oldSegmentSequence = seqan::Dna5String(curr_hit->seq().substr(0, curr_left_to_insert));
545 >                            }
546 >                          
547 >                          if (color)
548 >                            {
549 >                              string color;
550 >                              if (antisense)
551 >                                color = rev_read_seq.substr(rev_read_seq.length() - (curr_seg_index * segment_length), curr_left_to_insert);
552 >                              else
553 >                                color = read_seq.substr(curr_seg_index * segment_length + 2, curr_left_to_insert);
554 >                              
555 >                              color.push_back(curr_hit->seq()[curr_left_to_insert]);
556 >                              reverse(color.begin(), color.end());
557 >                              string bp = convert_color_to_bp(color);
558 >                              reverse(bp.begin(), bp.end());
559 >                              colorSegmentSequence_curr = bp;
560 >                            }
561 >                          
562 >                          const seqan::Dna5String newSegmentSequence = color ? colorSegmentSequence_curr : oldSegmentSequence;
563 >
564 >                          // daehwan
565 >                          if (bDebug)
566 >                            {
567 >                              cout << "ref: " << referenceSequence << endl;
568 >                              cout << "old: " << oldSegmentSequence << endl;
569 >                              cout << "ins: " << insertionSequence << endl;
570 >                            }
571 >                      
572 >                          /*
573 >                           * Scan left in the read until
574 >                           * We ran out of read sequence (insertion extends past segment)
575 >                           */
576 >                          for (int read_index = 0; read_index < curr_left_to_insert; ++read_index)
577 >                            {
578 >                              int segmentPosition = curr_left_to_insert - read_index - 1;
579 >                              int insertionPosition = insertion_len - read_index - 1;
580 >                              
581 >                              if (referenceSequence[segmentPosition] == 'N' || (referenceSequence[segmentPosition] != oldSegmentSequence[segmentPosition]))
582 >                                {
583 >                                  ++this_reference_mismatch;
584 >                                }
585 >                              
586 >                              if (read_index < insertion_len)
587 >                                {
588 >                                  if (insertionSequence[insertionPosition] == 'N' || (insertionSequence[insertionPosition] != newSegmentSequence[segmentPosition]))
589 >                                    {
590 >                                      ++insertion_mismatch;
591 >                                      break;
592 >                                    }
593 >                                }
594 >                              else
595 >                                {
596 >                                  if (referenceSequence[segmentPosition + insertion_len] == 'N' ||
597 >                                      (referenceSequence[segmentPosition + insertion_len] != newSegmentSequence[segmentPosition]))
598 >                                    {
599 >                                      --this_reference_mismatch;
600 >                                    }
601 >                                }
602 >                            }
603 >                        }
604 >                      
605 >                      if (found_closure)
606 >                        {
607 >                          // fprintf(stderr, "Warning: multiple closures found for insertion read # %d\n", (int)insert_id);
608 >                          return BowtieHit();
609 >                        }
610 >
611 >                      if (insertion_mismatch == 0)
612 >                        {
613 >                          reference_mismatch = this_reference_mismatch;
614 >                          mismatch = -reference_mismatch;
615 >                          found_closure = true;
616 >                          new_left = prev_hit->left();
617 >                          new_cigar = prev_hit->cigar();
618 >                          
619 >                          /*
620 >                           * Need to make a new insert operation between the two match character that begin
621 >                           * and end the intersection of these two junction. Note that we necessarily assume
622 >                           * that this insertion can't span beyond the boundaries of these reads. That should
623 >                           * probably be better enforced somewhere
624 >                           */
625 >
626 >                          new_cigar.back().length -= insert_to_prev_right;
627 >                          
628 >                          if (new_cigar.back().length <= 0)
629 >                            new_cigar.pop_back();
630 >
631 >                          if (reversed)
632 >                            new_cigar.push_back(CigarOp(iNS, lb->sequence.size()));
633 >                          else
634 >                            new_cigar.push_back(CigarOp(INS, lb->sequence.size()));
635 >                          
636 >                          vector<CigarOp> new_right_cigar = curr_hit->cigar();
637 >                          new_right_cigar.front().length += (insert_to_prev_right - lb->sequence.size());
638 >                          
639 >                          /*
640 >                           * Finish stitching together the new cigar string
641 >                           */
642 >                          size_t c = new_right_cigar.front().length > 0 ? 0 : 1;
643 >                          for (; c < new_right_cigar.size(); ++c)
644 >                            {
645 >                              new_cigar.push_back(new_right_cigar[c]);
646 >                            }
647 >                          
648 >                          if (color)
649 >                            {
650 >                              if (insert_to_prev_right > 0)
651 >                                seq.replace(first_seg_length + (curr_seg_index - 1) * segment_length - insert_to_prev_right, insert_to_prev_right, colorSegmentSequence_prev);
652 >                              
653 >                              if (curr_left_to_insert > 0)
654 >                                seq.replace(first_seg_length + (curr_seg_index - 1) * segment_length, curr_left_to_insert, colorSegmentSequence_curr);
655 >                            }
656 >                        }
657 >                    }
658 >                  ++lb;
659 >                }
660 >              
661 >              if (!found_closure)
662 >                {
663 >                  return BowtieHit();
664 >                }
665 >            }
666 >
667 >          /*
668 >           * Stitch segments together using juctions or deletions if necessary.
669 >           */
670 >          else if (dist_btw_two > 0 && dist_btw_two <= max_report_intron_length && prev_hit->antisense_align2() == curr_hit->antisense_align())
671 >            {
672 >              std::set<Junction>::iterator lb, ub;
673 >
674 >              // daehwan
675 >              if (bDebug)
676 >                {
677 >                  cout << "junction" << endl;
678 >                  cout << "min: " << left_boundary << "-" << right_boundary - 8 << endl;
679 >                  cout << "max: " << left_boundary + 8 << "-" << right_boundary << endl;
680 >                }
681 >
682 >              lb = possible_juncs.upper_bound(Junction(reference_id, left_boundary, right_boundary - 8, true));
683 >              ub = possible_juncs.lower_bound(Junction(reference_id, left_boundary + 8, right_boundary, false));
684 >              
685 >              int new_diff_mismatches = 0xff;
686 >              while (lb != ub && lb != possible_juncs.end())
687 >                {
688 >                  int dist_to_left, dist_to_right;
689 >                  if (reversed)
690 >                    {
691 >                      dist_to_left = lb->left - curr_hit->left();
692 >                      dist_to_right = lb->right - prev_hit->right() - 1;
693 >                    }
694 >                  else
695 >                    {
696 >                      dist_to_left = lb->left - prev_hit->right() + 1;
697 >                      dist_to_right = lb->right - curr_hit->left();
698 >                    }
699 >                  
700 >                  if (abs(dist_to_left) <= 4 && abs(dist_to_right) <= 4 && dist_to_left == dist_to_right)
701 >                    {
702 >                      /*
703 >                       * Check we have enough matched bases on prev or curr segment.
704 >                       */
705 >                      if ((reversed && (dist_to_left > prev_right_end_match_length || -dist_to_left > curr_left_end_match_length)) ||
706 >                          (!reversed && (dist_to_left > curr_left_end_match_length || -dist_to_left > prev_right_end_match_length)))
707 >                        {
708 >                          ++lb;
709 >                          continue;
710 >                        }
711 >
712 >                      // daehwan
713 >                      if (bDebug)
714 >                        {
715 >                          cout << "candidate junction: " << endl;
716 >                          cout << "coords: " << lb->left << "-" << lb->right << endl;
717 >                          cout << "dist to left: " << dist_to_left << endl;
718 >                        }
719 >                      
720 >                      Dna5String new_cmp_str, old_cmp_str;
721 >                      int new_mismatch = 0, old_mismatch = 0;
722 >                      string new_patch_str; // this is for colorspace reads
723 >                      if (dist_to_left > 0)
724 >                        {
725 >                          if (reversed)
726 >                            {
727 >                              new_cmp_str = seqan::infix(*ref_str, curr_hit->left() + 1, lb->left + 1);
728 >                              seqan::convertInPlace(new_cmp_str, seqan::FunctorComplement<seqan::Dna>());
729 >                              seqan::reverseInPlace(new_cmp_str);
730 >                              
731 >                              old_cmp_str = seqan::infix(*ref_str, prev_hit->right() + 1, lb->right);
732 >                              seqan::convertInPlace(old_cmp_str, seqan::FunctorComplement<seqan::Dna>());
733 >                              seqan::reverseInPlace(old_cmp_str);
734 >                            }
735 >                          else
736 >                            {
737 >                              new_cmp_str = seqan::infix(*ref_str, prev_hit->right(), lb->left + 1);
738 >                              old_cmp_str = seqan::infix(*ref_str, curr_hit->left(), lb->right);
739 >                            }
740 >                          
741 >                          string new_seq;
742 >                          if (color)
743 >                            {
744 >                              string ref = DnaString_to_string(seqan::infix(*ref_str, prev_hit->right() - 1, lb->left + 1));
745 >                              
746 >                              string color, qual;
747 >                              if (antisense)
748 >                                {
749 >                                  color = rev_read_seq.substr(rev_read_seq.length() - (curr_seg_index * segment_length) - 1, dist_to_left);
750 >                                  qual = rev_read_quals.substr(rev_read_quals.length() - (curr_seg_index * segment_length) - 1, dist_to_left);
751 >                                }
752 >                              else
753 >                                {
754 >                                  color = read_seq.substr(1 + curr_seg_index * segment_length, dist_to_left);
755 >                                  qual = read_quals.substr(curr_seg_index * segment_length, dist_to_left);
756 >                                }
757 >                              
758 >                              BWA_decode(color, qual, ref, new_seq);
759 >                              new_seq = new_seq.substr(1);
760 >                            }
761 >
762 >                          string curr_hit_seq;
763 >                          if (reversed)
764 >                            curr_hit_seq = read_seq.substr(curr_seg_index * segment_length - dist_to_left, dist_to_left);
765 >                          else
766 >                            curr_hit_seq = curr_hit->seq();
767 >                          
768 >                          const string& curr_old_seq = curr_hit_seq;
769 >                          const string& curr_seq = color ? new_seq : curr_hit_seq;
770 >                          for (int i = 0; i < dist_to_left; ++i)
771 >                            {
772 >                              if (curr_seq[i] != new_cmp_str[i])
773 >                                ++new_mismatch;
774 >                              
775 >                              if (curr_old_seq[i] != old_cmp_str[i])
776 >                                ++old_mismatch;
777 >                            }
778 >                          
779 >                          if (color)
780 >                            new_patch_str = curr_seq.substr(0, dist_to_left);
781 >                        }
782 >                      else if (dist_to_left < 0)
783 >                        {
784 >                          if (reversed)
785 >                            {
786 >                              new_cmp_str = seqan::infix(*ref_str, lb->right, prev_hit->right() + 1);
787 >                              seqan::convertInPlace(new_cmp_str, seqan::FunctorComplement<seqan::Dna>());
788 >                              seqan::reverseInPlace(new_cmp_str);
789 >
790 >                              old_cmp_str = seqan::infix(*ref_str, lb->left + 1, curr_hit->left() + 1);
791 >                              seqan::convertInPlace(old_cmp_str, seqan::FunctorComplement<seqan::Dna>());
792 >                              seqan::reverseInPlace(old_cmp_str);
793 >                            }
794 >                          else
795 >                            {
796 >                              new_cmp_str = seqan::infix(*ref_str, lb->right, curr_hit->left());
797 >                              old_cmp_str = seqan::infix(*ref_str, lb->left + 1, prev_hit->right());
798 >                            }
799 >
800 >                          size_t abs_dist = -dist_to_left;
801 >                          string new_seq;
802 >                          if (color)
803 >                            {
804 >                              string ref = DnaString_to_string(seqan::infix(*ref_str, lb->left, lb->left + 1));
805 >                              ref += DnaString_to_string(seqan::infix(*ref_str, lb->right, curr_hit->left()));
806 >                              
807 >                              string color, qual;
808 >                              if (antisense)
809 >                                {
810 >                                  color = rev_read_seq.substr(rev_read_seq.length() - (curr_seg_index * segment_length) - 1 - abs_dist, abs_dist);
811 >                                  qual = rev_read_quals.substr(rev_read_quals.length() - (curr_seg_index * segment_length) - 1 - abs_dist, abs_dist);
812 >                                }
813 >                              else
814 >                                {
815 >                                  color = read_seq.substr(1 + curr_seg_index * segment_length - abs_dist, abs_dist);
816 >                                  qual = read_quals.substr(curr_seg_index * segment_length - abs_dist, abs_dist);
817 >                                }
818 >                              
819 >                              BWA_decode(color, qual, ref, new_seq);
820 >                              new_seq = new_seq.substr(1);
821 >                            }
822 >
823 >                          string prev_hit_seq;
824 >                          if (reversed)
825 >                            prev_hit_seq = read_seq.substr(curr_seg_index * segment_length, abs_dist);
826 >                          else
827 >                            prev_hit_seq = prev_hit->seq();
828 >
829 >                          // daehwan
830 >                          if (bDebug)
831 >                            {
832 >                              cout << "reverse: " << (int)reversed << endl;
833 >                              cout << "new cmp str: " << new_cmp_str << endl;
834 >                              cout << "old cmp str: " << old_cmp_str << endl;
835 >                              cout << "hit seq: " << prev_hit_seq << endl;
836 >                              cout << "curr seq: " << curr_hit->seq() << endl;
837 >
838 >                              cout << read_seq
839 >                                   << endl;
840 >                              cout << read_seq.substr(first_seg_length + (curr_seg_index - 1) * segment_length, segment_length)
841 >                                   << endl;
842 >                            }
843 >                          
844 >                          const string& prev_old_seq = prev_hit_seq;
845 >                          size_t prev_old_seq_len = prev_old_seq.length();
846 >                          const string& prev_seq = color ? new_seq : prev_hit_seq;
847 >                          size_t prev_seq_len = prev_seq.length();
848 >                          for (size_t i = 0; i < abs_dist; ++i)
849 >                            {
850 >                              if (prev_seq[prev_seq_len - (abs_dist - i)] != new_cmp_str[i])
851 >                                ++new_mismatch;
852 >                              if (prev_old_seq[prev_old_seq_len - (abs_dist - i)] != old_cmp_str[i])
853 >                                ++old_mismatch;
854 >                            }
855 >
856 >                          if (color)
857 >                            new_patch_str = prev_seq.substr(prev_seq_len - abs_dist, abs_dist);
858 >                        }
859 >                      
860 >                      int temp_diff_mismatches = new_mismatch - old_mismatch;
861 >
862 >                      // daehwan
863 >                      if (bDebug)
864 >                        {
865 >                          cout << "new mismatch: " << new_mismatch << endl;
866 >                          cout << "old mismatch: " << old_mismatch << endl;
867 >                          cout << "new_diff_mismatch: " << new_diff_mismatches << endl;
868 >                          cout << "temp mismatch: " << temp_diff_mismatches << endl;
869 >                        }
870 >                      
871 >                      if (temp_diff_mismatches >= new_diff_mismatches || new_mismatch >= 2)
872 >                        {
873 >                          ++lb;
874 >                          continue;
875 >                        }
876 >
877 >                      if (color)
878 >                        {
879 >                          /*
880 >                           * We need to recover the origianl sequence.
881 >                           */
882 >                          if (found_closure)
883 >                            {
884 >                              seq.replace(first_seg_length + (curr_seg_index - 1) * segment_length - 4, 8,
885 >                                          prev_hit->seq().substr(prev_hit->seq().length() - 4) + curr_hit->seq().substr(0, 4));
886 >                            }
887 >                          
888 >                          if (dist_to_left > 0)
889 >                            seq.replace(first_seg_length + (curr_seg_index - 1) * segment_length, dist_to_left, new_patch_str);
890 >                          else if (dist_to_left < 0)
891 >                            seq.replace(first_seg_length + (curr_seg_index - 1) * segment_length + dist_to_left, -dist_to_left, new_patch_str);
892 >                        }
893 >                      
894 >                      new_diff_mismatches = temp_diff_mismatches;
895 >                      
896 >                      new_left = prev_hit->left();
897 >                      new_cigar = prev_hit->cigar();
898 >                      
899 >                      int new_left_back_len = new_cigar.back().length;
900 >                      if (reversed)
901 >                        new_left_back_len -= dist_to_left;
902 >                      else
903 >                        new_left_back_len += dist_to_left;
904 >                      
905 >                      vector<CigarOp> new_right_cig = curr_hit->cigar();
906 >                      int new_right_front_len = new_right_cig.front().length;
907 >                      if (reversed)
908 >                        new_right_front_len += dist_to_right;
909 >                      else
910 >                        new_right_front_len -= dist_to_right;
911 >                      if (new_left_back_len > 0)
912 >                        new_cigar.back().length = new_left_back_len;                                      
913 >                      else
914 >                        new_cigar.pop_back();
915 >                      
916 >                      /*
917 >                       * FIXME, currently just differentiating between a deletion and a
918 >                       * reference skip based on length. However, would probably be better
919 >                       * to denote the difference explicitly, this would allow the user
920 >                       * to supply their own (very large) deletions
921 >                       */
922 >                      if ((lb->right - lb->left - 1) <= max_deletion_length)
923 >                        {
924 >                          if (reversed)
925 >                            new_cigar.push_back(CigarOp(dEL, lb->right - lb->left - 1));
926 >                          else
927 >                            new_cigar.push_back(CigarOp(DEL, lb->right - lb->left - 1));
928 >                          antisense_closure = prev_hit->is_spliced() ? prev_hit->antisense_splice() : curr_hit->antisense_splice();
929 >                        }
930 >                      else
931 >                        {
932 >                          if (reversed)
933 >                            new_cigar.push_back(CigarOp(rEF_SKIP, lb->right - lb->left - 1));
934 >                          else
935 >                            new_cigar.push_back(CigarOp(REF_SKIP, lb->right - lb->left - 1));
936 >                          antisense_closure = lb->antisense;
937 >                        }
938 >                      
939 >                      new_right_cig.front().length = new_right_front_len;
940 >                      size_t c = new_right_front_len > 0 ? 0 : 1;
941 >                      for (; c < new_right_cig.size(); ++c)
942 >                        new_cigar.push_back(new_right_cig[c]);
943 >                      
944 >                      mismatch = new_diff_mismatches;
945 >                      found_closure = true;
946 >                    }
947 >                  ++lb;
948 >                }
949 >
950 >              if (!found_closure)
951 >                {
952 >                  return BowtieHit();
953 >                }
954 >            }
955 >          else if (!(dist_btw_two == 0 && prev_hit->antisense_align2() == curr_hit->antisense_align()))
956 >             check_fusion = true;
957 >        }
958 >
959 >      if (check_fusion)
960 >        {
961 >          std::set<Fusion>::iterator lb, ub;
962 >
963 >          uint32_t ref_id1 = prev_hit->ref_id2();
964 >          uint32_t ref_id2 = curr_hit->ref_id();
965 >          uint32_t left = prev_hit->right() - 4;
966 >          uint32_t right = curr_hit->left() - 4;
967 >
968 >          // daehwan
969 >          if (bDebug)
970 >            {
971 >              cout << "daehwan - start - fusion" << endl
972 >                   << "ref_id1: " << ref_id1 << endl
973 >                   << "ref_id2: " << ref_id2 << endl
974 >                   << "left: " << left << endl
975 >                   << "right: " << right << endl
976 >                   << "dir: " << fusion_dir << endl;
977 >            }
978 >
979 >          bool reversed = false;
980 >          if (fusion_dir != FUSION_FF &&
981 >              (ref_id2 < ref_id1 || (ref_id1 == ref_id2 && left > right)))
982 >            {
983 >              reversed = true;
984 >              
985 >              uint32_t temp = ref_id1;
986 >              ref_id1 = ref_id2;
987 >              ref_id2 = temp;
988 >              
989 >              temp = left;
990 >              left = right;
991 >              right = temp;
992 >            }
993 >              
994 >          lb = possible_fusions.upper_bound(Fusion(ref_id1, ref_id2, left, right));
995 >          ub = possible_fusions.lower_bound(Fusion(ref_id1, ref_id2, left + 8, right + 8));
996 >
997 >          RefSequenceTable::Sequence* ref_str = rt.get_seq(prev_hit->ref_id2());
998 >          RefSequenceTable::Sequence* ref_str2 = rt.get_seq(curr_hit->ref_id());
999 >          
1000 >          int new_diff_mismatches = 0xff;
1001 >          while (lb != ub && lb != possible_fusions.end())
1002 >            {
1003 >              int lb_left = lb->left;
1004 >              int lb_right = lb->right;
1005 >              
1006 >              if (reversed)
1007 >                {
1008 >                  lb_left = lb->right;
1009 >                  lb_right = lb->left;
1010 >                }
1011 >
1012 >              int dist_to_left, dist_to_right;
1013 >              if (fusion_dir == FUSION_RF)
1014 >                dist_to_left = prev_hit->right() - lb_left + 1;
1015 >              else
1016 >                dist_to_left = lb_left - prev_hit->right() + 1;
1017 >              
1018 >              if (fusion_dir == FUSION_FR)
1019 >                dist_to_right = curr_hit->left() - lb_right;
1020 >              else
1021 >                dist_to_right = lb_right - curr_hit->left();
1022 >
1023 >              // daehwan
1024 >              if (bDebug)
1025 >                {
1026 >                  cout << "daehwan - fusion gap" << endl;
1027 >                  cout << "dist left: " << dist_to_left << endl;
1028 >                  cout << "dist right: " << dist_to_right << endl;
1029 >                }
1030 >              
1031 >              if (abs(dist_to_left) <= 4 && abs(dist_to_right) <= 4 && dist_to_left == dist_to_right)
1032 >                {
1033 >                  /*
1034 >                   * Check we have enough matched bases on prev or curr segment.
1035 >                   */
1036 >                  if (dist_to_left > curr_left_end_match_length || -dist_to_left > prev_right_end_match_length)
1037 >                    {
1038 >                      ++lb;
1039 >                      continue;
1040 >                    }
1041 >
1042 >                  Dna5String new_cmp_str, old_cmp_str;
1043 >                  int new_mismatch = 0, old_mismatch = 0;
1044 >                  string new_patch_str; // this is for colorspace reads
1045 >                  if (dist_to_left > 0)
1046 >                    {
1047 >                      if (fusion_dir == FUSION_RF)
1048 >                        {
1049 >                          new_cmp_str = seqan::infix(*ref_str, lb_left, prev_hit->right() + 1);
1050 >                          seqan::convertInPlace(new_cmp_str, seqan::FunctorComplement<seqan::Dna>());
1051 >                          seqan::reverseInPlace(new_cmp_str);
1052 >                        }
1053 >                      else
1054 >                        new_cmp_str = seqan::infix(*ref_str, prev_hit->right(), lb_left + 1);
1055 >
1056 >                      if (fusion_dir == FUSION_FR)
1057 >                        {
1058 >                          old_cmp_str = seqan::infix(*ref_str2, lb_right + 1, curr_hit->left() + 1);
1059 >                          seqan::convertInPlace(old_cmp_str, seqan::FunctorComplement<seqan::Dna>());
1060 >                          seqan::reverseInPlace(old_cmp_str);
1061 >                        }
1062 >                      else
1063 >                        old_cmp_str = seqan::infix(*ref_str2, curr_hit->left(), lb_right);
1064 >
1065 >                      // daehwan
1066 >                      if (bDebug)
1067 >                        {
1068 >                          cout << "new str: " << new_cmp_str << endl;
1069 >                          cout << "old str: " << old_cmp_str << endl;
1070 >                          cout << "curr seq: " << curr_hit->seq() << endl;
1071 >                        }
1072 >
1073 >                      string curr_hit_seq;
1074 >                      if (fusion_dir == FUSION_FF || fusion_dir == FUSION_RR)
1075 >                        curr_hit_seq = curr_hit->seq();
1076 >                      else
1077 >                        curr_hit_seq = read_seq.substr(curr_seg_index * segment_length, segment_length);
1078 >                      
1079 >                      string new_seq;
1080 >                      const string& curr_old_seq = curr_hit_seq;
1081 >                      const string& curr_seq = color ? new_seq : curr_hit_seq;
1082 >                      for (int i = 0; i < dist_to_left; ++i)
1083 >                        {
1084 >                          if (curr_seq[i] != new_cmp_str[i])
1085 >                            ++new_mismatch;
1086 >                          
1087 >                          if (curr_old_seq[i] != old_cmp_str[i])
1088 >                            ++old_mismatch;
1089 >                        }
1090 >                    }
1091 >                  else if (dist_to_left < 0)
1092 >                    {
1093 >                      if (fusion_dir == FUSION_FR)
1094 >                        {
1095 >                          new_cmp_str = seqan::infix(*ref_str2, curr_hit->left() + 1, lb_right + 1);
1096 >                          seqan::convertInPlace(new_cmp_str, seqan::FunctorComplement<seqan::Dna>());
1097 >                          seqan::reverseInPlace(new_cmp_str);
1098 >                        }
1099 >                      else
1100 >                        new_cmp_str = seqan::infix(*ref_str2, lb_right, curr_hit->left());
1101 >
1102 >                      if (fusion_dir == FUSION_RF)
1103 >                        {
1104 >                          old_cmp_str = seqan::infix(*ref_str, prev_hit->right() + 1, lb_left);
1105 >                          seqan::convertInPlace(old_cmp_str, seqan::FunctorComplement<seqan::Dna>());
1106 >                          seqan::reverseInPlace(old_cmp_str);
1107 >                        }
1108 >                      else
1109 >                        old_cmp_str = seqan::infix(*ref_str, lb_left + 1, prev_hit->right());
1110 >
1111 >                      string prev_hit_seq;
1112 >                      if (fusion_dir == FUSION_FF || fusion_dir == FUSION_RR)
1113 >                        prev_hit_seq = prev_hit->seq();
1114 >                      else
1115 >                        prev_hit_seq = read_seq.substr((curr_seg_index - 1) * segment_length, segment_length);
1116 >
1117 >                      size_t abs_dist = -dist_to_left;
1118 >                      string new_seq;
1119 >                          
1120 >                      const string& prev_old_seq = prev_hit_seq;
1121 >                      size_t prev_old_seq_len = prev_old_seq.length();
1122 >                      const string& prev_seq = color ? new_seq : prev_hit_seq;
1123 >                      size_t prev_seq_len = prev_seq.length();
1124 >                      for (size_t i = 0; i < abs_dist; ++i)
1125 >                        {
1126 >                          if (prev_seq[prev_seq_len - (abs_dist - i)] != new_cmp_str[i])
1127 >                            ++new_mismatch;
1128 >                          if (prev_old_seq[prev_old_seq_len - (abs_dist - i)] != old_cmp_str[i])
1129 >                            ++old_mismatch;
1130 >                        }
1131 >                    }
1132 >
1133 >                  int temp_diff_mismatches = new_mismatch - old_mismatch;
1134 >                  if (temp_diff_mismatches >= new_diff_mismatches || new_mismatch >= 2)
1135 >                    {
1136 >                      ++lb;
1137 >                      continue;
1138 >                    }
1139 >                  new_diff_mismatches = temp_diff_mismatches;
1140 >
1141 >                  new_left = prev_hit->left();
1142 >                  new_cigar = prev_hit->cigar();
1143 >                      
1144 >                  int new_left_back_len = new_cigar.back().length;
1145 >                  new_left_back_len += dist_to_left;
1146 >                  
1147 >                  vector<CigarOp> new_right_cig = curr_hit->cigar();
1148 >                  int new_right_front_len = new_right_cig.front().length;
1149 >                  new_right_front_len -= dist_to_right;
1150 >                  if (new_left_back_len > 0)
1151 >                    new_cigar.back().length = new_left_back_len;                                          
1152 >                  else
1153 >                    new_cigar.pop_back();
1154 >
1155 >                  new_cigar.push_back(CigarOp((CigarOpCode)fusion_dir, lb_right));
1156 >                  antisense_closure = prev_hit->is_spliced() ? prev_hit->antisense_splice() : curr_hit->antisense_splice();
1157 >                  
1158 >                  new_right_cig.front().length = new_right_front_len;
1159 >                  size_t c = new_right_front_len > 0 ? 0 : 1;
1160 >                  for (; c < new_right_cig.size(); ++c)
1161 >                    new_cigar.push_back(new_right_cig[c]);
1162 >                  
1163 >                  mismatch = new_diff_mismatches;
1164 >                  found_closure = true;
1165 >                  ++num_fusions;
1166 >
1167 >                  // daehwan
1168 >                  if (bDebug)
1169 >                    {
1170 >                      cout << "daehwan - fusion gap - found" << endl;
1171 >                    }
1172 >
1173 >                }
1174 >              ++lb;
1175 >            }
1176 >
1177 >          // daehwan
1178 >          if (bDebug)
1179 >            {
1180 >              cout << "daehwan2 - end - fusion: " << (found_closure ? "found" : "not found") << endl;
1181 >            }
1182 >          
1183 >          if (!found_closure)
1184 >            {
1185 >              return BowtieHit();
1186 >            }
1187 >        }
1188  
1189        if (found_closure)
1190 <        {
1191 <    fprintf(stderr, "Warning: multiple closures found for insertion read # %d\n", (int)insert_id);
1192 <    return BowtieHit();
1193 <        }
1194 <
1195 <      if (insertion_mismatch == 0)
1196 <        {
1197 <    reference_mismatch = this_reference_mismatch;
1198 <    mismatch = -reference_mismatch;
1199 <    found_closure = true;
1200 <    new_left = prev_hit->left();
1201 <    new_cigar = prev_hit->cigar();
1202 <
1203 <    /*
1204 <     * Need to make a new insert operation between the two match character that begin
1205 <     * and end the intersection of these two junction. Note that we necessarily assume
1206 <     * that this insertion can't span beyond the boundaries of these reads. That should
1207 <     * probably be better enforced somewhere
1208 <     */
1209 <
1210 <    new_cigar.back().length -= insert_to_prev_right;
1211 <    if (new_cigar.back().length <= 0)
1212 <      new_cigar.pop_back();
1213 <
1214 <    new_cigar.push_back(CigarOp(INS, lb->sequence.size()));
1215 <    vector<CigarOp> new_right_cigar = curr_hit->cigar();
1216 <    new_right_cigar.front().length += (insert_to_prev_right - lb->sequence.size());
1217 <
1218 <    /*
1219 <     * Finish stitching together the new cigar string
1220 <     */
1221 <    size_t c = new_right_cigar.front().length > 0 ? 0 : 1;
1222 <    for (; c < new_right_cigar.size(); ++c)
1223 <      {
1224 <        new_cigar.push_back(new_right_cigar[c]);
1225 <      }
1226 <
1227 <    if (color)
1228 <      {
1229 <        if (insert_to_prev_right > 0)
1230 <          seq.replace(first_seg_length + (curr_seg_index - 1) * segment_length - insert_to_prev_right, insert_to_prev_right, colorSegmentSequence_prev);
1231 <
1232 <        if (curr_left_to_insert > 0)
1233 <          seq.replace(first_seg_length + (curr_seg_index - 1) * segment_length, curr_left_to_insert, colorSegmentSequence_curr);
1234 <      }
1235 <        }
1236 <    }
1237 <        ++lb;
1238 <  }
476 <
477 <  if (!found_closure)
478 <  {
479 <    return BowtieHit();
480 <  }
481 <      }
482 <
483 <      /*
484 <       * Stitch segments together using juctions or deletions if necessary.
485 <       */
486 <      else if (prev_hit->right() < curr_hit->left())
487 <  {
488 <    std::set<Junction>::iterator lb, ub;
489 <
490 <    int left_boundary = prev_hit->right() - 4;
491 <    int right_boundary = curr_hit->left() + 4;
492 <
493 <    lb = possible_juncs.upper_bound(Junction(reference_id, left_boundary, right_boundary - 8, true));
494 <    ub = possible_juncs.lower_bound(Junction(reference_id, left_boundary + 8, right_boundary, false));
495 <
496 <    int new_diff_mismatches = 0xff;
497 <    while (lb != ub && lb != possible_juncs.end())
498 <      {
499 <        int dist_to_left = lb->left - prev_hit->right() + 1;
500 <        int dist_to_right = lb->right - curr_hit->left();
501 <
502 <        if (abs(dist_to_left) <= 4 && abs(dist_to_right) <= 4 && dist_to_left == dist_to_right)
503 <    {
504 <      /*
505 <       * Check we have enough matched bases on prev or curr segment.
506 <       */
507 <      if (dist_to_left > curr_left_end_match_length || -dist_to_left > prev_right_end_match_length )
508 <        {
509 <          ++lb;
510 <          continue;
511 <        }
512 <
513 <      Dna5String new_cmp_str, old_cmp_str;
514 <      int new_mismatch = 0, old_mismatch = 0;
515 <      string new_patch_str; // this is for colorspace reads
516 <      if (dist_to_left > 0)
517 <        {
518 <          new_cmp_str = seqan::infix(*ref_str, prev_hit->right(), lb->left + 1);
519 <          old_cmp_str = seqan::infix(*ref_str, curr_hit->left(), lb->right);
520 <
521 <          string new_seq;
522 <          if (color)
523 <      {
524 <        string ref = DnaString_to_string(seqan::infix(*ref_str, prev_hit->right() - 1, lb->left + 1));
525 <
526 <        string color, qual;
527 <        if (antisense)
528 <          {
529 <            color = rev_read_seq.substr(rev_read_seq.length() - (curr_seg_index * segment_length) - 1, dist_to_left);
530 <            qual = rev_read_quals.substr(rev_read_quals.length() - (curr_seg_index * segment_length) - 1, dist_to_left);
531 <          }
532 <          else
533 <          {
534 <            color = read_seq.substr(1 + curr_seg_index * segment_length, dist_to_left);
535 <            qual = read_quals.substr(curr_seg_index * segment_length, dist_to_left);
536 <          }
537 <
538 <        BWA_decode(color, qual, ref, new_seq);
539 <        new_seq = new_seq.substr(1);
540 <      }
541 <
542 <          const string& curr_old_seq = curr_hit->seq();
543 <          const string& curr_seq = color ? new_seq : curr_hit->seq();
544 <          for (int i = 0; i < dist_to_left; ++i)
545 <      {
546 <        if (curr_seq[i] != new_cmp_str[i])
547 <          ++new_mismatch;
548 <
549 <        if (curr_old_seq[i] != old_cmp_str[i])
550 <          ++old_mismatch;
551 <      }
552 <
553 <          if (color)
554 <      new_patch_str = curr_seq.substr(0, dist_to_left);
555 <        }
556 <      else if (dist_to_left < 0)
557 <        {
558 <          new_cmp_str = seqan::infix(*ref_str, lb->right, curr_hit->left());
559 <          old_cmp_str = seqan::infix(*ref_str, lb->left + 1, prev_hit->right());
560 <
561 <          size_t abs_dist = -dist_to_left;
562 <          string new_seq;
563 <          if (color)
564 <      {
565 <        string ref = DnaString_to_string(seqan::infix(*ref_str, lb->left, lb->left + 1));
566 <        ref += DnaString_to_string(seqan::infix(*ref_str, lb->right, curr_hit->left()));
567 <
568 <        string color, qual;
569 <        if (antisense)
570 <          {
571 <            color = rev_read_seq.substr(rev_read_seq.length() - (curr_seg_index * segment_length) - 1 - abs_dist, abs_dist);
572 <            qual = rev_read_quals.substr(rev_read_quals.length() - (curr_seg_index * segment_length) - 1 - abs_dist, abs_dist);
573 <          }
574 <          else
575 <          {
576 <            color = read_seq.substr(1 + curr_seg_index * segment_length - abs_dist, abs_dist);
577 <            qual = read_quals.substr(curr_seg_index * segment_length - abs_dist, abs_dist);
578 <          }
579 <
580 <        BWA_decode(color, qual, ref, new_seq);
581 <        new_seq = new_seq.substr(1);
582 <      }
583 <
584 <          const string& prev_old_seq = prev_hit->seq();
585 <          size_t prev_old_seq_len = prev_old_seq.length();
586 <          const string& prev_seq = color ? new_seq : prev_hit->seq();
587 <          size_t prev_seq_len = prev_seq.length();
588 <          for (size_t i = 0; i < abs_dist; ++i)
589 <      {
590 <        if (prev_seq[prev_seq_len - (abs_dist - i)] != new_cmp_str[i])
591 <          ++new_mismatch;
592 <        if (prev_old_seq[prev_old_seq_len - (abs_dist - i)] != old_cmp_str[i])
593 <          ++old_mismatch;
594 <      }
595 <
596 <          if (color)
597 <      new_patch_str = prev_seq.substr(prev_seq_len - abs_dist, abs_dist);
598 <        }
599 <
600 <      int temp_diff_mismatches = new_mismatch - old_mismatch;
601 <      if (temp_diff_mismatches >= new_diff_mismatches || new_mismatch >= 2)
602 <        {
603 <          ++lb;
604 <          continue;
605 <        }
606 <
607 <      if (color)
608 <        {
609 <          /*
610 <           * We need to recover the origianl sequence.
611 <           */
612 <          if (found_closure)
613 <      {
614 <        seq.replace(first_seg_length + (curr_seg_index - 1) * segment_length - 4, 8,
615 <              prev_hit->seq().substr(prev_hit->seq().length() - 4) + curr_hit->seq().substr(0, 4));
616 <      }
617 <
618 <          if (dist_to_left > 0)
619 <      seq.replace(first_seg_length + (curr_seg_index - 1) * segment_length, dist_to_left, new_patch_str);
620 <          else if (dist_to_left < 0)
621 <      seq.replace(first_seg_length + (curr_seg_index - 1) * segment_length + dist_to_left, -dist_to_left, new_patch_str);
622 <        }
623 <
624 <      new_diff_mismatches = temp_diff_mismatches;
625 <
626 <      new_left = prev_hit->left();
627 <      new_cigar = prev_hit->cigar();
628 <
629 <      int new_left_back_len = new_cigar.back().length;
630 <      new_left_back_len += dist_to_left;
631 <
632 <      vector<CigarOp> new_right_cig = curr_hit->cigar();
633 <      int new_right_front_len = new_right_cig.front().length;
634 <      new_right_front_len -= dist_to_right;
635 <      if (new_left_back_len > 0)
636 <        new_cigar.back().length = new_left_back_len;
637 <      else
638 <        new_cigar.pop_back();
639 <
640 <      /*
641 <       * FIXME, currently just differentiating between a deletion and a
642 <       * reference skip based on length. However, would probably be better
643 <       * to denote the difference explicitly, this would allow the user
644 <       * to supply their own (very large) deletions
645 <       */
646 <      if ((lb->right - lb->left - 1) <= max_deletion_length)
647 <        {
648 <          new_cigar.push_back(CigarOp(DEL, lb->right - lb->left - 1));
649 <          antisense_closure = prev_hit->is_spliced() ? prev_hit->antisense_splice() : curr_hit->antisense_splice();
650 <        }
651 <      else
652 <        {
653 <          new_cigar.push_back(CigarOp(REF_SKIP, lb->right - lb->left - 1));
654 <          antisense_closure = lb->antisense;
655 <        }
656 <
657 <      new_right_cig.front().length = new_right_front_len;
658 <      size_t c = new_right_front_len > 0 ? 0 : 1;
659 <      for (; c < new_right_cig.size(); ++c)
660 <        new_cigar.push_back(new_right_cig[c]);
661 <
662 <      mismatch = new_diff_mismatches;
663 <      found_closure = true;
664 <    }
665 <        ++lb;
666 <      }
667 <
668 <    if (!found_closure)
669 <      {
670 <        return BowtieHit();
671 <      }
672 <  }
673 <
674 <      if (found_closure)
675 <  {
676 <    bool end = false;
677 <    BowtieHit merged_hit(reference_id,
678 <             insert_id,
679 <             new_left,
680 <             new_cigar,
681 <             antisense,
682 <             antisense_closure,
683 <             prev_hit->edit_dist() + curr_hit->edit_dist() + mismatch,
684 <             prev_hit->splice_mms() + curr_hit->splice_mms(),
685 <             end);
686 <
687 <    if (curr_seg_index > 1)
688 <      merged_hit.seq(seq.substr(first_seg_length + (curr_seg_index - 1) * segment_length, 2 * segment_length));
689 <    else
690 <      merged_hit.seq(seq.substr(0, first_seg_length + segment_length));
691 <
692 <    prev_hit = hit_chain.erase(prev_hit, ++curr_hit);
693 <    /*
694 <     * prev_hit now points PAST the last element removed
695 <     */
696 <    prev_hit = hit_chain.insert(prev_hit, merged_hit);
697 <    /*
698 <     * merged_hit has been inserted before the old position of
699 <     * prev_hit. New location of prev_hit is merged_hit
700 <     */
701 <    curr_hit = prev_hit;
702 <    ++curr_hit;
703 <    ++curr_seg_index;
704 <    continue;
705 <  }
706 <
1190 >        {
1191 >          bool end = false;
1192 >          BowtieHit merged_hit(prev_hit->ref_id(),
1193 >                               curr_hit->ref_id2(),
1194 >                               insert_id,
1195 >                               new_left,
1196 >                               new_cigar,
1197 >                               antisense,
1198 >                               antisense_closure,
1199 >                               prev_hit->edit_dist() + curr_hit->edit_dist() + mismatch,
1200 >                               prev_hit->splice_mms() + curr_hit->splice_mms(),
1201 >                               end);
1202 >
1203 >          // daehwan - should fix this for SOLiD dataset
1204 >          merged_hit.seq(prev_hit->seq() + curr_hit->seq());
1205 >
1206 >          // daehwan
1207 >          if (bDebug)
1208 >            {
1209 >              cout << "fusing of " << merged_hit.left() << " and " << merged_hit.right() << endl;
1210 >              cout << print_cigar(merged_hit.cigar()) << endl;
1211 >              if (!merged_hit.check_editdist_consistency(rt))
1212 >                {
1213 >                  cout << "btw " << print_cigar(prev_hit->cigar()) << " and " << print_cigar(curr_hit->cigar()) << endl;
1214 >                  cout << "this is malformed hit" << endl;
1215 >                }
1216 >            }
1217 >
1218 >          prev_hit = hit_chain.erase(prev_hit, ++curr_hit);
1219 >          /*
1220 >           * prev_hit now points PAST the last element removed
1221 >           */
1222 >          prev_hit = hit_chain.insert(prev_hit, merged_hit);
1223 >          /*
1224 >           * merged_hit has been inserted before the old position of
1225 >           * prev_hit. New location of prev_hit is merged_hit
1226 >           */
1227 >          curr_hit = prev_hit;
1228 >          ++curr_hit;
1229 >          ++curr_seg_index;
1230 >          continue;
1231 >        }
1232 >
1233 >      // daehwan
1234 >      if (bDebug)
1235 >        {
1236 >          cout << "daehwan - test 0.3" << endl;
1237 >        }
1238 >      
1239        ++prev_hit;
1240        ++curr_hit;
1241        ++curr_seg_index;
1242      }
1243  
1244 +  // daehwan
1245 +  if (bDebug)
1246 +    {
1247 +      cout << "daehwan - test2" << endl;
1248 +    }
1249 +  
1250    bool saw_antisense_splice = false;
1251    bool saw_sense_splice = false;
1252    vector<CigarOp> long_cigar;
# Line 726 | Line 1264
1264         */
1265        bool containsSplice = s->is_spliced();
1266        if (containsSplice)
1267 <  {
1268 <    if (s->antisense_splice())
1269 <      {
1270 <        if (saw_sense_splice)
1271 <    return BowtieHit();
1272 <        saw_antisense_splice = true;
1273 <      }
1274 <    else
1275 <      {
1276 <        if (saw_antisense_splice)
1277 <    return BowtieHit();
1278 <        saw_sense_splice = true;
1279 <      }
1280 <  }
1267 >        {
1268 >          if (s->antisense_splice())
1269 >            {
1270 >              if (saw_sense_splice)
1271 >                return BowtieHit();
1272 >              saw_antisense_splice = true;
1273 >            }
1274 >          else
1275 >            {
1276 >              if (saw_antisense_splice)
1277 >                return BowtieHit();
1278 >              saw_sense_splice = true;
1279 >            }
1280 >        }
1281        const vector<CigarOp>& cigar = s->cigar();
1282        if (long_cigar.empty())
1283 <  {
1284 <    long_cigar = cigar;
1285 <  }
1283 >        {
1284 >          long_cigar = cigar;
1285 >        }
1286        else
1287 <  {
1288 <    CigarOp& last = long_cigar.back();
1289 <    /*
1290 <     * If necessary, merge the back and front
1291 <     * cigar operations
1292 <     */
1293 <    if(last.opcode == cigar[0].opcode){
1294 <      last.length += cigar[0].length;
1295 <      for (size_t b = 1; b < cigar.size(); ++b)
1296 <        {
1297 <    long_cigar.push_back(cigar[b]);
1298 <        }
1299 <    }else{
1300 <      for(size_t b = 0; b < cigar.size(); ++b)
1301 <        {
1302 <    long_cigar.push_back(cigar[b]);
1303 <        }
1304 <    }
1305 <  }
1287 >        {
1288 >          CigarOp& last = long_cigar.back();
1289 >          /*
1290 >           * If necessary, merge the back and front
1291 >           * cigar operations
1292 >           */
1293 >          if(last.opcode == cigar[0].opcode){
1294 >            last.length += cigar[0].length;
1295 >            for (size_t b = 1; b < cigar.size(); ++b)
1296 >              {
1297 >                long_cigar.push_back(cigar[b]);
1298 >              }
1299 >          }else{
1300 >            for(size_t b = 0; b < cigar.size(); ++b)
1301 >              {
1302 >                long_cigar.push_back(cigar[b]);
1303 >              }
1304 >          }
1305 >        }
1306      }
1307  
1308    bool end = false;
1309 <  BowtieHit new_hit(reference_id,
1310 <        insert_id,
1311 <        left,
1312 <        long_cigar,
1313 <        antisense,
1314 <        saw_antisense_splice,
1315 <        num_mismatches,
1316 <        num_splice_mms,
1317 <        end);
1309 >  BowtieHit new_hit(hit_chain.front().ref_id(),
1310 >                    hit_chain.back().ref_id2(),
1311 >                    insert_id,
1312 >                    left,
1313 >                    long_cigar,
1314 >                    antisense,
1315 >                    saw_antisense_splice,
1316 >                    num_mismatches,
1317 >                    num_splice_mms,
1318 >                    end);
1319  
1320 <  new_hit.seq(seq);
1321 <  new_hit.qual(qual);
1320 >  if (fusion_dir == FUSION_NOTHING || fusion_dir == FUSION_FF || fusion_dir == FUSION_RR)
1321 >    {
1322 >      new_hit.seq(seq);
1323 >      new_hit.qual(qual);
1324 >    }
1325 >  else
1326 >    {
1327 >      new_hit.seq(read_seq);
1328 >      new_hit.qual(read_quals);
1329 >    }
1330 >
1331 >  bool do_reverse = new_hit.ref_id() > new_hit.ref_id2();
1332 >  if (new_hit.ref_id() == new_hit.ref_id2())
1333 >    {
1334 >      vector<Fusion> fusions;
1335 >      bool auto_sort = false;
1336 >      fusions_from_spliced_hit(new_hit, fusions, auto_sort);
1337 >      if (fusions.size() > 0)
1338 >        {
1339 >          const Fusion& fusion = fusions[0];
1340 >          do_reverse = fusion.left > fusion.right;
1341 >        }
1342 >    }
1343 >
1344 >  if (do_reverse)
1345 >    {
1346 >      new_hit = new_hit.reverse();
1347 >    }
1348 >
1349 >  if (fusion_dir == FUSION_RF || fusion_dir == FUSION_RR)
1350 >    {
1351 >      new_hit.antisense_align(!new_hit.antisense_align());
1352 >    }
1353 >
1354 >  // daehwan
1355 >  if (bDebug)
1356 >    {
1357 >      cout << "daehwan - test3" << endl;
1358 >      cout << new_hit.left() << " " << print_cigar(new_hit.cigar()) << endl;
1359 >      cout << new_hit.ref_id() << "-" << new_hit.ref_id2() << ": " << new_hit.fusion_opcode() << endl;
1360 >    }
1361  
1362    int new_read_len = new_hit.read_len();
1363    if (new_read_len != old_read_length || !new_hit.check_editdist_consistency(rt))
1364      {
1365 <      fprintf(stderr, "Warning: malformed closure\n");
1365 >      // daehwan
1366 >      if (bDebug)
1367 >        {
1368 >          cout << "Warning: " << new_hit.insert_id() << " malformed closure: " << print_cigar(new_hit.cigar()) << endl;
1369 >          exit(1);
1370 >        }
1371 >      
1372 >      fprintf(stderr, "Warning: %d malformed closure\n", new_hit.insert_id());
1373        return BowtieHit();
1374      }
1375  
1376    return new_hit;
1377   }
1378  
1379 +
1380   int multi_closure = 0;
1381   int anchor_too_short = 0;
1382   int gap_too_short = 0;
# Line 798 | Line 1384
1384   bool valid_hit(const BowtieHit& bh)
1385   {
1386    if (bh.insert_id())
1387 <  {
1388 <    /*
1389 <     * validate the cigar chain - no gaps shorter than an intron, etc.
1390 <     * also,
1391 <     *   -Don't start or end with an indel or refskip
1392 <     *  -Only a match operation is allowed is allowed
1393 <     *   adjacent to an indel or refskip
1394 <     *      -Indels should confrom to length restrictions
1395 <     */
1396 <    const CigarOp* prevCig = &(bh.cigar()[0]);
1397 <    const CigarOp* currCig = &(bh.cigar()[1]);
1398 <    for (size_t i = 1; i < bh.cigar().size(); ++i){
1399 <      currCig = &(bh.cigar()[i]);
1400 <      if(currCig->opcode != MATCH && prevCig->opcode != MATCH){
1401 <        return false;
1402 <      }
1403 <      if(currCig->opcode == INS){
1404 <        if(currCig->length > max_insertion_length){
1405 <          return false;
1406 <        }
1407 <      }
1408 <      if(currCig->opcode == DEL){
1409 <        if(currCig->length > max_deletion_length){
1410 <          return false;
1411 <        }
1412 <      }
1413 <      if(currCig->opcode == REF_SKIP){
1414 <        if(currCig->length < (uint64_t)min_report_intron_length){
1415 <          gap_too_short++;
1416 <          return false;
1417 <        }
1418 <      }
1419 <      prevCig = currCig;
1387 >    {  
1388 >      /*        
1389 >       * validate the cigar chain - no gaps shorter than an intron, etc.
1390 >       * also,
1391 >       *        -Don't start or end with an indel or refskip
1392 >       *        -Only a match operation is allowed is allowed
1393 >       *         adjacent to an indel or refskip
1394 >       *      -Indels should confrom to length restrictions
1395 >       */
1396 >      const CigarOp* prevCig = &(bh.cigar()[0]);
1397 >      const CigarOp* currCig = &(bh.cigar()[1]);
1398 >      for (size_t i = 1; i < bh.cigar().size(); ++i){
1399 >        currCig = &(bh.cigar()[i]);
1400 >        if(!(currCig->opcode == MATCH || currCig->opcode == mATCH) &&
1401 >           !(prevCig->opcode == MATCH || prevCig->opcode == mATCH)){
1402 >          return false;
1403 >        }
1404 >        if(currCig->opcode == INS || currCig->opcode == iNS){
1405 >          if(currCig->length > max_insertion_length){
1406 >            return false;
1407 >          }
1408 >        }
1409 >        if(currCig->opcode == DEL || currCig->opcode == dEL){
1410 >          if(currCig->length > max_deletion_length){
1411 >            return false;
1412 >          }
1413 >        }
1414 >        if(currCig->opcode == REF_SKIP || currCig->opcode == rEF_SKIP){
1415 >          if(currCig->length < (uint64_t)min_report_intron_length){
1416 >            gap_too_short++;
1417 >            return false;
1418 >          }
1419 >        }
1420 >        prevCig = currCig;
1421 >      }
1422 >      if (!(bh.cigar().front().opcode == MATCH || bh.cigar().front().opcode == mATCH) ||
1423 >          !(bh.cigar().back().opcode == MATCH || bh.cigar().back().opcode == mATCH)/* ||
1424 >                                                                                      (int)bh.cigar().front().length < min_anchor_len||
1425 >                                                                                      (int)bh.cigar().back().length < min_anchor_len*/ )
1426 >        {
1427 >          anchor_too_short++;
1428 >          return false;
1429 >        }      
1430      }
1431 <    if (bh.cigar().front().opcode != MATCH ||
836 <      bh.cigar().back().opcode != MATCH /* ||
837 <      (int)bh.cigar().front().length < min_anchor_len||
838 <      (int)bh.cigar().back().length < min_anchor_len*/ )
1431 >  else
1432      {
1433 <      anchor_too_short++;
1433 >      multi_closure++;
1434        return false;
1435      }
1436 <  }
844 <  else
845 <  {
846 <    multi_closure++;
847 <    return false;
848 <  }
849 <
1436 >  
1437    return true;
1438   }
1439  
1440   void merge_segment_chain(RefSequenceTable& rt,
1441 <       const string& read_seq,
1442 <       const string& read_quals,
1443 <       std::set<Junction>& possible_juncs,
1444 <       std::set<Insertion>& possible_insertions,
1445 <       vector<BowtieHit>& hits,
1446 <       vector<BowtieHit>& merged_hits)
1441 >                         const string& read_seq,
1442 >                         const string& read_quals,
1443 >                         std::set<Junction>& possible_juncs,
1444 >                         std::set<Insertion>& possible_insertions,
1445 >                         std::set<Fusion>& possible_fusions,
1446 >                         vector<BowtieHit>& hits,
1447 >                         vector<BowtieHit>& merged_hits,
1448 >                         int fusion_dir = FUSION_NOTHING)
1449   {
1450    if (hits.size() == 0)
1451      return;
# Line 865 | Line 1454
1454    if (hits.size() > 1)
1455      {
1456        list<BowtieHit> hit_chain;
1457 <
1458 <      if (hits.front().antisense_align())
1459 <  copy(hits.rbegin(), hits.rend(), back_inserter(hit_chain));
1457 >      if (fusion_dir == FUSION_NOTHING || fusion_dir == FUSION_FF || fusion_dir == FUSION_RR)
1458 >        {
1459 >          if (hits.front().antisense_align())
1460 >            copy(hits.rbegin(), hits.rend(), back_inserter(hit_chain));
1461 >          else
1462 >            copy(hits.begin(), hits.end(), back_inserter(hit_chain));
1463 >        }
1464        else
1465 <  copy(hits.begin(), hits.end(), back_inserter(hit_chain));
1466 <
1467 <      bh = merge_chain(rt, read_seq, read_quals, possible_juncs, possible_insertions, hit_chain);
1465 >        {
1466 >          bool bSawFusion = false;
1467 >          for (size_t i = 0; i < hits.size(); ++i)
1468 >            {
1469 >              bool pushed = false;
1470 >              if (!bSawFusion)
1471 >                {
1472 >                  if (i > 0)
1473 >                    {
1474 >                      if (hits[i-1].ref_id() != hits[i].ref_id())
1475 >                        bSawFusion = true;
1476 >                      else if(hits[i-1].antisense_align() != hits[i].antisense_align())
1477 >                        bSawFusion = true;
1478 >                      else
1479 >                        {
1480 >                          int dist = 0;
1481 >                          if (hits[i].antisense_align())
1482 >                            dist = hits[i-1].left() - hits[i].right();
1483 >                          else
1484 >                            dist = hits[i].left() - hits[i-1].right();
1485 >
1486 >                          if (dist >= max_report_intron_length || dist < -(int)max_insertion_length)
1487 >                            bSawFusion = true;
1488 >                        }
1489 >                    }
1490 >                }
1491 >              
1492 >              if (hits[i].fusion_opcode() == FUSION_NOTHING &&
1493 >                  ((fusion_dir == FUSION_FR && bSawFusion) || (fusion_dir == FUSION_RF && !bSawFusion)) &&
1494 >                  hits[i].left() < hits[i].right())
1495 >                {
1496 >                  hit_chain.push_back(hits[i].reverse());
1497 >                  pushed = true;
1498 >                }
1499 >
1500 >              if (i > 0 &&
1501 >                  hits[i].fusion_opcode() != FUSION_NOTHING &&
1502 >                  hits[i].ref_id() != hits[i-1].ref_id())
1503 >                {
1504 >                  hit_chain.push_back(hits[i].reverse());
1505 >                  pushed = true;
1506 >                }
1507 >
1508 >              if (!bSawFusion)
1509 >                {
1510 >                  if (hits[i].fusion_opcode() != FUSION_NOTHING)
1511 >                    bSawFusion = true;
1512 >                }
1513 >
1514 >              if (!pushed)
1515 >                hit_chain.push_back(hits[i]);
1516 >            }
1517 >        }
1518 >
1519 >      bh = merge_chain(rt,
1520 >                       read_seq,
1521 >                       read_quals,
1522 >                       possible_juncs,
1523 >                       possible_insertions,
1524 >                       possible_fusions,
1525 >                       hit_chain,
1526 >                       fusion_dir);
1527      }
1528    else
1529      {
# Line 883 | Line 1535
1535   }
1536  
1537   bool dfs_seg_hits(RefSequenceTable& rt,
1538 <      const string& read_seq,
1539 <      const string& read_quals,
1540 <      std::set<Junction>& possible_juncs,
1541 <      std::set<Insertion>& possible_insertions,
1542 <      vector<HitsForRead>& seg_hits_for_read,
1543 <      size_t curr,
1544 <      vector<BowtieHit>& seg_hit_stack,
1545 <      vector<BowtieHit>& joined_hits)
1538 >                  const string& read_seq,
1539 >                  const string& read_quals,
1540 >                  std::set<Junction>& possible_juncs,
1541 >                  std::set<Insertion>& possible_insertions,
1542 >                  std::set<Fusion>& possible_fusions,
1543 >                  vector<HitsForRead>& seg_hits_for_read,
1544 >                  size_t curr,
1545 >                  vector<BowtieHit>& seg_hit_stack,
1546 >                  vector<BowtieHit>& joined_hits,
1547 >                  int fusion_dir = FUSION_NOTHING)
1548   {
1549    assert (!seg_hit_stack.empty());
1550    bool join_success = false;
1551  
1552    if (curr < seg_hits_for_read.size())
899  {
900    for (size_t i = 0; i < seg_hits_for_read[curr].hits.size(); ++i)
1553      {
1554 <      BowtieHit& bh = seg_hits_for_read[curr].hits[i];
1555 <      BowtieHit& back = seg_hit_stack.back();
1556 <
1557 <      bool consistent_sense = bh.antisense_align() == back.antisense_align();
1558 <      bool same_contig = bh.ref_id() == back.ref_id();
1559 <
1560 <      // FIXME: when we have stranded reads, we need to fix this condition
1561 <      //bool consistent_strand = (bh.contiguous() || back.contiguous() ||
1562 <      //              (bh.antisense_splice() == back.antisense_splice()));
1563 <      if (consistent_sense && same_contig /*&& consistent_strand*/)
1564 <      {
1565 <        if (bh.antisense_align())
1566 <        {
1567 <          unsigned int bh_r = bh.right();
1568 <          unsigned int back_left = seg_hit_stack.back().left();
1569 <          if ((bh_r + max_report_intron_length >= back_left &&
1570 <            back_left >= bh_r) || (back_left + max_insertion_length >= bh_r && back_left < bh_r))
1571 <          {
1572 <            // these hits are compatible, so push bh onto the
1573 <            // stack, recurse, and pop it when done.
1574 <            seg_hit_stack.push_back(bh);
1575 <            bool success = dfs_seg_hits(rt,
1576 <                      read_seq,
1577 <                      read_quals,
1578 <                      possible_juncs,
1579 <                      possible_insertions,
1580 <                      seg_hits_for_read,
1581 <                      curr + 1,
1582 <                      seg_hit_stack,
1583 <                      joined_hits);
1584 <            if (success)
1585 <              join_success = true;
1586 <
1587 <            seg_hit_stack.pop_back();
1588 <          }
1589 <        }
1590 <        else
1591 <        {
1592 <          unsigned int bh_l = bh.left();
1593 <
1594 <          unsigned int back_right = seg_hit_stack.back().right();
1595 <          if ((back_right + max_report_intron_length >= bh_l  &&
1596 <            bh_l >= back_right) || (bh_l + max_insertion_length >= back_right && bh_l < back_right))
1597 <          {
1598 <            // these hits are compatible, so push bh onto the
1599 <            // stack, recurse, and pop it when done.
1600 <            seg_hit_stack.push_back(bh);
1601 <            bool success = dfs_seg_hits(rt,
1602 <                      read_seq,
1603 <                      read_quals,
1604 <                      possible_juncs,
1605 <                      possible_insertions,
1606 <                      seg_hits_for_read,
1607 <                      curr + 1,
1608 <                      seg_hit_stack,
1609 <                      joined_hits);
1610 <
1611 <            if (success)
1612 <              join_success = true;
1613 <
1614 <            seg_hit_stack.pop_back();
1615 <          }
1616 <        }
1617 <      }
1554 >      for (size_t i = 0; i < seg_hits_for_read[curr].hits.size(); ++i)
1555 >        {
1556 >          /*
1557 >           * As we reverse segments depending on directions like FR or RF,
1558 >           * it's necessary to recover the original segments.
1559 >           */
1560 >          BowtieHit bh = seg_hits_for_read[curr].hits[i];
1561 >          BowtieHit bh_prev = seg_hit_stack.back();
1562 >          
1563 >          BowtieHit* prevHit = &bh_prev;
1564 >          BowtieHit* currHit = &bh;
1565 >
1566 >          // daehwan
1567 >          // if (bh.insert_id() == 11921733)
1568 >          //  bDebug = true;
1569 >
1570 >          /*
1571 >           * Each segment has at most one fusion by assumption,
1572 >           */
1573 >          bool prevHit_fused = prevHit->fusion_opcode() != FUSION_NOTHING;
1574 >          bool currHit_fused = currHit->fusion_opcode() != FUSION_NOTHING;
1575 >
1576 >          /*
1577 >           * Count the number of fusions on prev and curr segments,
1578 >           * but this doesn't take into account the gap (if exists) between the two segments.
1579 >           */
1580 >          size_t num_fusions = prevHit_fused ? 1 : 0;
1581 >          num_fusions += currHit_fused ? 1 : 0;
1582 >
1583 >          int dir = prevHit_fused ? prevHit->fusion_opcode() : currHit->fusion_opcode();
1584 >
1585 >          /*
1586 >           * We don't allow reads that span more than two fusion points.
1587 >           */
1588 >          if (num_fusions >= 2)
1589 >            continue;
1590 >
1591 >          if (fusion_dir != FUSION_NOTHING && currHit_fused)
1592 >            continue;
1593 >
1594 >          if (fusion_dir == FUSION_FF || fusion_dir == FUSION_RR)
1595 >            {
1596 >              if ((currHit->antisense_align() && currHit->ref_id() != prevHit->ref_id()) ||
1597 >                  (!currHit->antisense_align() && currHit->ref_id() != prevHit->ref_id2()))
1598 >                continue;
1599 >            }
1600 >
1601 >          if (bDebug)
1602 >            {
1603 >              cout << "daehwan - prev ref: " << prevHit->ref_id() << "-" << prevHit->ref_id2() << ": "
1604 >                   << print_cigar(prevHit->cigar()) << endl;
1605 >              cout << "daehwan - prev sense: "
1606 >                   << (prevHit->antisense_align() ? "-" : "+")
1607 >                   << "\t"
1608 >                   << (prevHit->antisense_align2() ? "-" : "+")
1609 >                   << endl;
1610 >              cout << "daehwan - prev coords: "
1611 >                   << prevHit->left()
1612 >                   << "\t"
1613 >                   << prevHit->right()
1614 >                   << endl;
1615 >              
1616 >              cout << "daehwan - curr ref: " << currHit->ref_id() << "-" << currHit->ref_id2() << ": "
1617 >                   << print_cigar(currHit->cigar()) << endl;
1618 >              cout << "daehwan - curr sense: "
1619 >                   << (currHit->antisense_align() ? "-" : "+")
1620 >                   << "\t"
1621 >                   << (currHit->antisense_align2() ? "-" : "+")
1622 >                   << endl;
1623 >              cout << "daehwan - curr corrds: "
1624 >                   << currHit->left()
1625 >                   << "\t"
1626 >                   << currHit->right()
1627 >                   << endl;
1628 >            }
1629 >
1630 >          if ((fusion_dir == FUSION_FR || fusion_dir == FUSION_RF) &&
1631 >              prevHit->ref_id2() != currHit->ref_id())
1632 >            continue;
1633 >
1634 >          if ((fusion_dir == FUSION_FR && !currHit->antisense_align()) ||
1635 >              (fusion_dir == FUSION_RF && currHit->antisense_align()))
1636 >            continue;
1637 >
1638 >          if (currHit_fused && dir == FUSION_RR)
1639 >            *currHit = currHit->reverse();
1640 >
1641 >          if (fusion_dir == FUSION_FR || fusion_dir == FUSION_RF ||
1642 >              (currHit_fused && currHit->ref_id() == currHit->ref_id2() && (dir == FUSION_FR || dir == FUSION_RF)))
1643 >            {
1644 >              if (currHit_fused)
1645 >                {
1646 >                  if ((dir == FUSION_FR && currHit->antisense_align()) ||
1647 >                      (dir == FUSION_RF && !currHit->antisense_align()))
1648 >                    *currHit = currHit->reverse();
1649 >                }
1650 >              else
1651 >                {
1652 >                  if (fusion_dir == FUSION_FR && currHit->antisense_align())
1653 >                    *currHit = currHit->reverse();
1654 >                }
1655 >            }
1656 >          /*
1657 >           * Switch prevHit and currHit in FUSION_NOTHING and FUSION_FF cases
1658 >           * to make it easier to check the distance in the gap between the two segments.
1659 >           */
1660 >          else if ((num_fusions == 0 && prevHit->antisense_align() && currHit->antisense_align() && prevHit->ref_id() == currHit->ref_id() && prevHit->left() <= currHit->right() + max_report_intron_length && prevHit->left() + max_insertion_length >= currHit->right()) ||
1661 >                   (num_fusions == 1 && (dir == FUSION_FF || dir == FUSION_RR)&&
1662 >                    ((!prevHit_fused && prevHit->antisense_align()) || (!currHit_fused && currHit->antisense_align())))
1663 >                   )
1664 >            {
1665 >              BowtieHit* tempHit = prevHit;
1666 >              prevHit = currHit;
1667 >              currHit = tempHit;
1668 >            }
1669 >          else if (num_fusions == 0)
1670 >            {
1671 >              if (prevHit->ref_id2() == currHit->ref_id() && prevHit->antisense_align() == currHit->antisense_align())
1672 >                {
1673 >                  int dist = 0;
1674 >                  if (prevHit->antisense_align())
1675 >                    dist = prevHit->left() - currHit->right();
1676 >                  else
1677 >                    dist = currHit->left() - prevHit->right();
1678 >                  
1679 >                  if (dist > max_report_intron_length || dist < -(int)max_insertion_length)
1680 >                    {
1681 >                      if ((prevHit->antisense_align() && prevHit->left() > currHit->left()) ||
1682 >                          (!prevHit->antisense_align() && prevHit->left() < currHit->left()))
1683 >                        dir = FUSION_FF;
1684 >                      else
1685 >                        dir = FUSION_RR;
1686 >                    }
1687 >                }
1688 >              else
1689 >                {
1690 >                  if (prevHit->antisense_align() == currHit->antisense_align())
1691 >                    {
1692 >                      if ((prevHit->antisense_align() && prevHit->ref_id() > currHit->ref_id()) ||
1693 >                          (!prevHit->antisense_align() && prevHit->ref_id() < currHit->ref_id()))
1694 >                        dir = FUSION_FF;
1695 >                      else
1696 >                        dir = FUSION_RR;
1697 >                    }
1698 >                  else if (!prevHit->antisense_align())
1699 >                    dir = FUSION_FR;
1700 >                  else
1701 >                    dir = FUSION_RF;
1702 >
1703 >                  if (dir == FUSION_FR)
1704 >                    *currHit = currHit->reverse();
1705 >                  else if(dir == FUSION_RF)
1706 >                    *prevHit = prevHit->reverse();
1707 >                }
1708 >            }
1709 >
1710 >          // daehwan - test
1711 >          if (bDebug)
1712 >            {
1713 >              cout << "insert id: " << prevHit->insert_id() << endl;
1714 >              cout << "(" << curr - 1 << ") prev: " << prevHit->seq() << " : " << (prevHit->fusion_opcode() != FUSION_NOTHING ? "fused" : "no") << endl;
1715 >              cout << "(" << curr << ") curr: " << currHit->seq() << " : " << (currHit->fusion_opcode() != FUSION_NOTHING ? "fused" : "no") << endl;
1716 >              cout << "prev ref: " << prevHit->ref_id() << "-" << prevHit->ref_id2() << ": " << print_cigar(prevHit->cigar()) << endl;
1717 >              cout << "curr ref: " << currHit->ref_id() << "-" << currHit->ref_id2() << ": " << print_cigar(currHit->cigar()) << endl;
1718 >              cout << "prev coords: "
1719 >                   << prevHit->left()
1720 >                   << "\t"
1721 >                   << prevHit->right()
1722 >                   << endl;
1723 >              cout << "curr corrds: "
1724 >                   << currHit->left()
1725 >                   << "\t"
1726 >                   << currHit->right()
1727 >                   << endl;
1728 >              cout << "prev sense: "
1729 >                   << (prevHit->antisense_align() ? "-" : "+")
1730 >                   << "\t"
1731 >                   << (prevHit->antisense_align2() ? "-" : "+")
1732 >                   << endl;
1733 >              cout << "curr sense: "
1734 >                   << (currHit->antisense_align() ? "-" : "+")
1735 >                   << "\t"
1736 >                   << (currHit->antisense_align2() ? "-" : "+")
1737 >                   << endl;
1738 >            }
1739 >
1740 >          if (num_fusions == 1)
1741 >            {
1742 >              // daehwan
1743 >              if (bDebug)
1744 >                {
1745 >                  cout << "direction: " << (int)dir << endl;
1746 >                }
1747 >
1748 >              /*
1749 >               * orient the fused segment, which depends on a fusion direction.
1750 >               */
1751 >              if (dir != FUSION_FF && dir != FUSION_RR)
1752 >                {
1753 >                  bool prevHit_rep = false;
1754 >                  bool currHit_rep = false;
1755 >
1756 >                  if (prevHit_fused)
1757 >                    {
1758 >                      if ((dir == FUSION_FR && !currHit->antisense_align()) ||
1759 >                          (dir == FUSION_RF && currHit->antisense_align()))
1760 >                        continue;
1761 >
1762 >                      if (prevHit->ref_id2() != currHit->ref_id())
1763 >                        prevHit_rep = true;
1764 >                      else if ((dir == FUSION_FR && prevHit->antisense_align()) ||
1765 >                               (dir == FUSION_RF && !prevHit->antisense_align()))
1766 >                        prevHit_rep = true;
1767 >                    }
1768 >                  
1769 >                  if (currHit_fused)
1770 >                    {
1771 >                      if ((dir == FUSION_FR && prevHit->antisense_align()) ||
1772 >                          (dir == FUSION_RF && !prevHit->antisense_align()))
1773 >                        continue;
1774 >                      
1775 >                      if (currHit->ref_id() != prevHit->ref_id2())
1776 >                        currHit_rep = true;
1777 >                    }
1778 >                  
1779 >                  if (bDebug)
1780 >                    {
1781 >                      if (prevHit_rep) cout << "1. reversed in prev" << endl;
1782 >                      if (currHit_rep) cout << "1. reversed in curr" << endl;
1783 >                    }
1784 >
1785 >                  if (prevHit_rep)
1786 >                    *prevHit = prevHit->reverse();
1787 >                  if (currHit_rep)
1788 >                    *currHit = currHit->reverse();
1789 >
1790 >                  prevHit_rep = false;
1791 >                  currHit_rep = false;
1792 >
1793 >                  if (prevHit_fused)
1794 >                    {
1795 >                      if (prevHit->is_forwarding_right() != currHit->is_forwarding_left())
1796 >                        currHit_rep = true;
1797 >                    }
1798 >                  else
1799 >                    {
1800 >                      if (prevHit->is_forwarding_right() != currHit->is_forwarding_left())
1801 >                        prevHit_rep = true;
1802 >                    }
1803 >
1804 >                  if (prevHit_rep)
1805 >                    *prevHit = prevHit->reverse();
1806 >                  if (currHit_rep)
1807 >                    *currHit = currHit->reverse();
1808 >
1809 >                  // daehwan
1810 >                  if (bDebug)
1811 >                    {
1812 >                      if (prevHit_rep) cout << "2. reversed in prev" << endl;
1813 >                      if (currHit_rep) cout << "2. reversed in curr" << endl;
1814 >                    }
1815 >                }            
1816 >            }
1817 >
1818 >          bool same_contig = prevHit->ref_id2() == currHit->ref_id();
1819 >          if (!same_contig && num_fusions > 0)
1820 >            continue;
1821 >
1822 >          if (same_contig && num_fusions >= 1)
1823 >            {
1824 >              if (prevHit->antisense_align2() != currHit->antisense_align())
1825 >                continue;
1826 >            }
1827 >
1828 >          int bh_l = 0, back_right = 0, dist = 0;
1829 >          if (same_contig)
1830 >            {
1831 >              if ((fusion_dir == FUSION_FR || fusion_dir == FUSION_RF || dir == FUSION_FR || dir == FUSION_RF) &&
1832 >                  prevHit->antisense_align2())
1833 >                {
1834 >                  bh_l = prevHit->right() + 1;
1835 >                  back_right = currHit->left() + 1;
1836 >                }
1837 >              else
1838 >                {
1839 >                  bh_l = currHit->left();
1840 >                  back_right = prevHit->right();
1841 >                }
1842 >
1843 >              dist = bh_l - back_right;
1844 >            }
1845 >
1846 >          // daehwan - pass
1847 >          if (bDebug)
1848 >            {
1849 >              cout << "daehwan - pass" << endl;
1850 >              cout << "prev coords: " << prevHit->left() << "\t" << prevHit->right() << endl;
1851 >              cout << "curr coords: " << currHit->left() << "\t" << currHit->right() << endl;
1852 >            }
1853 >
1854 >          if (!same_contig ||
1855 >              (same_contig && num_fusions == 0 && dir != FUSION_NOTHING && fusion_dir == FUSION_NOTHING) ||
1856 >              (same_contig && dist <= max_report_intron_length && dist >= -(int)max_insertion_length && prevHit->is_forwarding_right() == currHit->is_forwarding_left()))
1857 >            {
1858 >              // daehwan
1859 >              if (bDebug)
1860 >                {
1861 >                  cout << "daehwan - really passed!!" << endl;
1862 >                }
1863 >
1864 >              BowtieHit tempHit = seg_hit_stack.back();
1865 >              seg_hit_stack.back() = bh_prev;
1866 >              
1867 >              // these hits are compatible, so push bh onto the
1868 >              // stack, recurse, and pop it when done.
1869 >              seg_hit_stack.push_back(bh);
1870 >              bool success = dfs_seg_hits(rt,
1871 >                                          read_seq,
1872 >                                          read_quals,
1873 >                                          possible_juncs,
1874 >                                          possible_insertions,
1875 >                                          possible_fusions,
1876 >                                          seg_hits_for_read,
1877 >                                          curr + 1,
1878 >                                          seg_hit_stack,
1879 >                                          joined_hits,
1880 >                                          dir == FUSION_NOTHING ? fusion_dir : dir);
1881 >              
1882 >              if (success)
1883 >                join_success = true;
1884 >              
1885 >              seg_hit_stack.pop_back();
1886 >              seg_hit_stack.back() = tempHit;
1887 >            }
1888 >        }
1889      }
967  }
1890    else
1891 <  {
1892 <    merge_segment_chain(rt,
1893 <            read_seq,
1894 <            read_quals,
1895 <            possible_juncs,
1896 <            possible_insertions,
1897 <            seg_hit_stack,
1898 <            joined_hits);
1899 <    return join_success = true;
1900 <  }
1891 >    {
1892 >      merge_segment_chain(rt,
1893 >                          read_seq,
1894 >                          read_quals,
1895 >                          possible_juncs,
1896 >                          possible_insertions,
1897 >                          possible_fusions,
1898 >                          seg_hit_stack,
1899 >                          joined_hits,
1900 >                          fusion_dir);
1901 >
1902 >      return join_success = true;
1903 >    }
1904    return join_success;
1905   }
1906  
1907   bool join_segments_for_read(RefSequenceTable& rt,
1908 <          const string& read_seq,
1909 <          const string& read_quals,
1910 <          std::set<Junction>& possible_juncs,
1911 <          std::set<Insertion>& possible_insertions,
1912 <          vector<HitsForRead>& seg_hits_for_read,
1913 <          vector<BowtieHit>& joined_hits)
1914 < {
1908 >                            const string& read_seq,
1909 >                            const string& read_quals,
1910 >                            std::set<Junction>& possible_juncs,
1911 >                            std::set<Insertion>& possible_insertions,
1912 >                            std::set<Fusion>& possible_fusions,
1913 >                            vector<HitsForRead>& seg_hits_for_read,
1914 >                            vector<BowtieHit>& joined_hits)
1915 > {      
1916    vector<BowtieHit> seg_hit_stack;
1917    bool join_success = false;
1918  
1919 +  // ignore segments that map to more than this many places.
1920 +  if (bowtie2)
1921 +    {
1922 +      const size_t max_seg_hits = max_multihits * 2;
1923 +      for (size_t s = 0; s < seg_hits_for_read.size(); ++s)
1924 +        {
1925 +          if (seg_hits_for_read[s].hits.size() > max_seg_hits)
1926 +            return join_success;
1927 +        }
1928 +    }
1929 +
1930    for (size_t i = 0; i < seg_hits_for_read[0].hits.size(); ++i)
1931      {
1932        BowtieHit& bh = seg_hits_for_read[0].hits[i];
1933 <      seg_hit_stack.push_back(bh);
1933 >
1934 >      if (bh.fusion_opcode() == FUSION_RR)
1935 >        seg_hit_stack.push_back(bh.reverse());
1936 >      else
1937 >        seg_hit_stack.push_back(bh);
1938 >      
1939        bool success = dfs_seg_hits(rt,
1940 <          read_seq,
1941 <          read_quals,
1942 <          possible_juncs,
1943 <          possible_insertions,
1944 <          seg_hits_for_read,
1945 <          1,
1946 <          seg_hit_stack,
1947 <          joined_hits);
1940 >                                  read_seq,
1941 >                                  read_quals,
1942 >                                  possible_juncs,
1943 >                                  possible_insertions,
1944 >                                  possible_fusions,
1945 >                                  seg_hits_for_read,
1946 >                                  1,
1947 >                                  seg_hit_stack,
1948 >                                  joined_hits);
1949        if (success)
1950    join_success = true;
1951        seg_hit_stack.pop_back();
# Line 1011 | Line 1954
1954    return join_success;
1955   }
1956  
1957 < void join_segment_hits(GBamWriter& bam_writer, std::set<Junction>& possible_juncs,
1015 <           std::set<Insertion>& possible_insertions,
1016 <           ReadTable& unmapped_reads,
1017 <           RefSequenceTable& rt,
1018 <           //FILE* reads_file,
1019 <           ReadStream& readstream,
1020 <           vector<HitStream>& contig_hits,
1021 <           vector<HitStream>& spliced_hits)
1957 > struct JoinSegmentsWorker
1958   {
1959 <  uint32_t curr_contig_obs_order = VMAXINT32;
1024 <  HitStream* first_seg_contig_stream = NULL;
1025 <  uint64_t next_contig_id = 0;
1026 <
1027 <  if (contig_hits.size())
1028 <    {
1029 <      first_seg_contig_stream = &(contig_hits.front());
1030 <      next_contig_id = first_seg_contig_stream->next_group_id();
1031 <      curr_contig_obs_order = unmapped_reads.observation_order(next_contig_id);
1032 <    }
1033 <
1034 <  HitsForRead curr_hit_group;
1035 <
1036 <  uint32_t curr_spliced_obs_order = VMAXINT32;
1037 <  HitStream* first_seg_spliced_stream = NULL;
1038 <  uint64_t next_spliced_id = 0;
1039 <
1040 <  if (spliced_hits.size())
1041 <    {
1042 <      first_seg_spliced_stream = &(spliced_hits.front());
1043 <      next_spliced_id = first_seg_spliced_stream->next_group_id();
1044 <      curr_spliced_obs_order = unmapped_reads.observation_order(next_spliced_id);
1045 <    }
1046 <
1047 <  while(curr_contig_obs_order != VMAXINT32 ||
1048 <  curr_spliced_obs_order != VMAXINT32)
1049 <    {
1050 <      uint32_t read_in_process;
1051 <      vector<HitsForRead> seg_hits_for_read;
1052 <      seg_hits_for_read.resize(contig_hits.size());
1053 <
1054 <      if (curr_contig_obs_order < curr_spliced_obs_order)
1055 <  {
1056 <    first_seg_contig_stream->next_read_hits(curr_hit_group);
1057 <    seg_hits_for_read.front() = curr_hit_group;
1058 <
1059 <    next_contig_id = first_seg_contig_stream->next_group_id();
1060 <    uint32_t next_order = unmapped_reads.observation_order(next_contig_id);
1061 <
1062 <    read_in_process = curr_contig_obs_order;
1063 <    curr_contig_obs_order = next_order;
1064 <  }
1065 <      else if  (curr_spliced_obs_order < curr_contig_obs_order)
1959 >  void operator()()
1960    {
1961 <    first_seg_spliced_stream->next_read_hits(curr_hit_group);
1962 <    seg_hits_for_read.front() = curr_hit_group;
1963 <
1964 <    next_spliced_id = first_seg_spliced_stream->next_group_id();
1965 <    uint32_t next_order = unmapped_reads.observation_order(next_spliced_id);
1966 <
1967 <    read_in_process = curr_spliced_obs_order;
1968 <    curr_spliced_obs_order = next_order;
1969 <  }
1970 <      else if (curr_contig_obs_order == curr_spliced_obs_order &&
1971 <         curr_contig_obs_order != VMAXINT32 &&
1972 <         curr_spliced_obs_order != VMAXINT32)
1973 <  {
1974 <    first_seg_contig_stream->next_read_hits(curr_hit_group);
1975 <
1976 <    HitsForRead curr_spliced_group;
1977 <    first_seg_spliced_stream->next_read_hits(curr_spliced_group);
1978 <
1979 <    curr_hit_group.hits.insert(curr_hit_group.hits.end(),
1980 <             curr_spliced_group.hits.begin(),
1981 <             curr_spliced_group.hits.end());
1982 <    seg_hits_for_read.front() = curr_hit_group;
1983 <    read_in_process = curr_spliced_obs_order;
1984 <
1985 <    next_contig_id = first_seg_contig_stream->next_group_id();
1986 <    uint32_t next_order = unmapped_reads.observation_order(next_contig_id);
1987 <
1988 <    next_spliced_id = first_seg_spliced_stream->next_group_id();
1989 <    uint32_t next_spliced_order = unmapped_reads.observation_order(next_spliced_id);
1990 <
1991 <    curr_spliced_obs_order = next_spliced_order;
1992 <    curr_contig_obs_order = next_order;
1993 <  }
1994 <      else
1995 <  {
1996 <    break;
1997 <  }
1998 <
1999 <      if (contig_hits.size() > 1)
2000 <      {
2001 <        look_right_for_hit_group(unmapped_reads,
2002 <              contig_hits,
2003 <              0,
2004 <              spliced_hits,
2005 <              curr_hit_group,
2006 <              seg_hits_for_read);
2007 <      }
2008 <
2009 <      size_t last_non_empty = seg_hits_for_read.size() - 1;
2010 <      while(last_non_empty >= 0 && seg_hits_for_read[last_non_empty].hits.empty())
2011 <      {
2012 <        --last_non_empty;
2013 <      }
2014 <
2015 <      seg_hits_for_read.resize(last_non_empty + 1);
2016 <      if (!seg_hits_for_read[last_non_empty].hits[0].end())
2017 <         continue;
2018 <
2019 <      if (!seg_hits_for_read.empty() && !seg_hits_for_read[0].hits.empty())
2020 <  {
2021 <    uint64_t insert_id = seg_hits_for_read[0].hits[0].insert_id();
2022 <    Read read;
2023 <    /* if (get_read_from_stream(insert_id,
2024 <           reads_file,
2025 <           reads_format,
2026 <           false,
2027 <           read)) */
2028 <    if (readstream.getRead(insert_id, read))
2029 <      {
2030 <        vector<BowtieHit> joined_hits;
2031 <        join_segments_for_read(rt,
2032 <             read.seq.c_str(),
2033 <             read.qual.c_str(),
2034 <             possible_juncs,
2035 <             possible_insertions,
2036 <             seg_hits_for_read,
2037 <             joined_hits);
2038 <
2039 <        sort(joined_hits.begin(), joined_hits.end());
2040 <        vector<BowtieHit>::iterator new_end = unique(joined_hits.begin(), joined_hits.end());
2041 <        joined_hits.erase(new_end, joined_hits.end());
2042 <
2043 <        for (size_t i = 0; i < joined_hits.size(); i++)
2044 <        {
2045 <          const char* ref_name = rt.get_name(joined_hits[i].ref_id());
2046 <          if (color && !color_out)
2047 <            //print_hit(stdout, read_name, joined_hits[i], ref_name, joined_hits[i].seq().c_str(), joined_hits[i].qual().c_str(), true);
2048 <            print_bamhit(bam_writer, read.name.c_str(), joined_hits[i], ref_name, joined_hits[i].seq().c_str(),
2049 <                                         joined_hits[i].qual().c_str(), true);
2050 <          else
2051 <            print_bamhit(bam_writer, read.name.c_str(), joined_hits[i], ref_name,
2052 <                                          read.seq.c_str(), read.qual.c_str(), false);
2053 <            //print_hit(stdout, read_name, joined_hits[i], ref_name, read_seq, read_quals, false);
2054 <        }
2055 <      }
2056 <    else
2057 <      {
2058 <        err_die("Error: could not get read # %d from stream\n",
2059 <               read_in_process);
2060 <      }
2061 <  }
2062 <      else
2063 <  {
2064 <    //fprintf(stderr, "Warning: couldn't join segments for read # %d\n", read_in_process);
2065 <  }
2066 < }
2067 < }
2068 <
2069 < void driver(GBamWriter& bam_writer, istream& ref_stream,
2070 <      vector<FILE*>& possible_juncs_files,
2071 <      vector<FILE*>& possible_insertions_files,
2072 <      vector<FILE*>& possible_deletions_files,
2073 <      vector<FZPipe>& spliced_segmap_files, //.sam.z files
2074 <       vector<string>& segmap_fnames, //.bam files
2075 <      ReadStream& readstream)
1961 >    ReadTable it;
1962 >    GBamWriter bam_writer(bam_output_fname.c_str(), sam_header_fname.c_str());
1963 >    ReadStream readstream(reads_fname);
1964 >    if (readstream.file() == NULL)
1965 >      err_die("Error: cannot open %s for reading\n", reads_fname.c_str());
1966 >
1967 >    if (read_offset > 0)
1968 >      readstream.seek(read_offset);
1969 >
1970 >    bool need_seq = true;
1971 >    bool need_qual = color;
1972 >
1973 >    vector<HitStream> contig_hits;
1974 >    vector<HitStream> spliced_hits;
1975 >    vector<HitFactory*> factories;
1976 >    for (size_t i = 0; i < segmap_fnames.size(); ++i)
1977 >      {
1978 >        HitFactory* fac = new BAMHitFactory(it, *rt);
1979 >        factories.push_back(fac);
1980 >        HitStream hs(segmap_fnames[i], fac, false, false, false, need_seq, need_qual);
1981 >        
1982 >        if (seg_offsets[i] > 0)
1983 >          hs.seek(seg_offsets[i]);
1984 >
1985 >        contig_hits.push_back(hs);
1986 >      }
1987 >    
1988 >    for (size_t i = 0; i < spliced_segmap_fnames.size(); ++i)
1989 >      {
1990 >        int anchor_length = 0;
1991 >        HitFactory* fac = new SplicedBAMHitFactory(it, *rt, anchor_length);
1992 >        factories.push_back(fac);
1993 >        
1994 >        HitStream hs(spliced_segmap_fnames[i], fac, true, false, false, need_seq, need_qual);
1995 >        if (spliced_seg_offsets[i] > 0)
1996 >          hs.seek(spliced_seg_offsets[i]);
1997 >
1998 >        spliced_hits.push_back(hs);
1999 >      }
2000 >
2001 >    uint32_t curr_contig_obs_order = VMAXINT32;
2002 >    HitStream* first_seg_contig_stream = NULL;
2003 >    uint64_t next_contig_id = 0;
2004 >    
2005 >    if (contig_hits.size() > 0)
2006 >      {
2007 >        first_seg_contig_stream = &(contig_hits.front());
2008 >        next_contig_id = first_seg_contig_stream->next_group_id();
2009 >        curr_contig_obs_order = it.observation_order(next_contig_id);
2010 >      }
2011 >
2012 >    HitsForRead curr_hit_group;
2013 >    
2014 >    uint32_t curr_spliced_obs_order = VMAXINT32;
2015 >    HitStream* first_seg_spliced_stream = NULL;
2016 >    uint64_t next_spliced_id = 0;
2017 >    
2018 >    if (spliced_hits.size() > 0)
2019 >      {
2020 >        first_seg_spliced_stream = &(spliced_hits.front());
2021 >        next_spliced_id = first_seg_spliced_stream->next_group_id();
2022 >        curr_spliced_obs_order = it.observation_order(next_spliced_id);
2023 >      }
2024 >
2025 >    while((curr_contig_obs_order != VMAXINT32 || curr_spliced_obs_order != VMAXINT32) &&
2026 >          (curr_contig_obs_order < end_id || curr_spliced_obs_order < end_id))
2027 >      {
2028 >        uint32_t read_in_process;
2029 >        vector<HitsForRead> seg_hits_for_read;
2030 >        seg_hits_for_read.resize(contig_hits.size());
2031 >
2032 >        if (curr_contig_obs_order < curr_spliced_obs_order)
2033 >          {
2034 >            first_seg_contig_stream->next_read_hits(curr_hit_group);
2035 >            seg_hits_for_read.front() = curr_hit_group;
2036 >            
2037 >            next_contig_id = first_seg_contig_stream->next_group_id();
2038 >            uint32_t next_order = it.observation_order(next_contig_id);
2039 >            
2040 >            read_in_process = curr_contig_obs_order;
2041 >            curr_contig_obs_order = next_order;
2042 >          }
2043 >        else if  (curr_spliced_obs_order < curr_contig_obs_order)
2044 >          {
2045 >            first_seg_spliced_stream->next_read_hits(curr_hit_group);
2046 >            seg_hits_for_read.front() = curr_hit_group;
2047 >                
2048 >            next_spliced_id = first_seg_spliced_stream->next_group_id();
2049 >            uint32_t next_order = it.observation_order(next_spliced_id);
2050 >
2051 >            read_in_process = curr_spliced_obs_order;
2052 >            curr_spliced_obs_order = next_order;
2053 >
2054 >            if (read_in_process < begin_id)
2055 >              continue;
2056 >          }
2057 >        else if (curr_contig_obs_order == curr_spliced_obs_order &&
2058 >                 curr_contig_obs_order != VMAXINT32 &&
2059 >                 curr_spliced_obs_order != VMAXINT32)
2060 >          {
2061 >            first_seg_contig_stream->next_read_hits(curr_hit_group);
2062 >            
2063 >            HitsForRead curr_spliced_group;
2064 >            first_seg_spliced_stream->next_read_hits(curr_spliced_group);
2065 >            
2066 >            curr_hit_group.hits.insert(curr_hit_group.hits.end(),
2067 >                                       curr_spliced_group.hits.begin(),
2068 >                                       curr_spliced_group.hits.end());
2069 >            seg_hits_for_read.front() = curr_hit_group;
2070 >            read_in_process = curr_spliced_obs_order;
2071 >            
2072 >            next_contig_id = first_seg_contig_stream->next_group_id();
2073 >            uint32_t next_order = it.observation_order(next_contig_id);
2074 >            
2075 >            next_spliced_id = first_seg_spliced_stream->next_group_id();
2076 >            uint32_t next_spliced_order = it.observation_order(next_spliced_id);
2077 >            
2078 >            curr_spliced_obs_order = next_spliced_order;
2079 >            curr_contig_obs_order = next_order;
2080 >          }
2081 >        else
2082 >          {
2083 >            break;
2084 >          }
2085 >        
2086 >        if (contig_hits.size() > 1)
2087 >          {
2088 >            look_right_for_hit_group(it,
2089 >                                     contig_hits,
2090 >                                     0,
2091 >                                     spliced_hits,
2092 >                                     curr_hit_group,
2093 >                                     seg_hits_for_read);
2094 >          }
2095 >
2096 >        size_t last_non_empty = seg_hits_for_read.size() - 1;
2097 >        while(last_non_empty >= 0 && seg_hits_for_read[last_non_empty].hits.empty())
2098 >          {
2099 >            --last_non_empty;
2100 >          }
2101 >        
2102 >        seg_hits_for_read.resize(last_non_empty + 1);
2103 >        if (!seg_hits_for_read[last_non_empty].hits[0].end())
2104 >          continue;
2105 >        
2106 >        if (!seg_hits_for_read.empty() && !seg_hits_for_read[0].hits.empty())
2107 >          {
2108 >            uint64_t insert_id = seg_hits_for_read[0].hits[0].insert_id();
2109 >            if (insert_id >= begin_id && insert_id < end_id)
2110 >              {
2111 >                Read read;
2112 >                if (readstream.getRead(insert_id, read))
2113 >                  {
2114 >                    vector<BowtieHit> joined_hits;
2115 >                    join_segments_for_read(*rt,
2116 >                                           read.seq.c_str(),
2117 >                                           read.qual.c_str(),
2118 >                                           *possible_juncs,
2119 >                                           *possible_insertions,
2120 >                                           *possible_fusions,
2121 >                                           seg_hits_for_read,
2122 >                                           joined_hits);
2123 >                    
2124 >                    sort(joined_hits.begin(), joined_hits.end());
2125 >                    vector<BowtieHit>::iterator new_end = unique(joined_hits.begin(), joined_hits.end());
2126 >                    joined_hits.erase(new_end, joined_hits.end());
2127 >                    for (size_t i = 0; i < joined_hits.size(); i++)
2128 >                      {
2129 >                        const char* ref_name = rt->get_name(joined_hits[i].ref_id());
2130 >                        const char* ref_name2 = "";
2131 >                        if (joined_hits[i].fusion_opcode() != FUSION_NOTHING)
2132 >                          ref_name2 = rt->get_name(joined_hits[i].ref_id2());
2133 >
2134 >                        if (color && !color_out)
2135 >                          print_bamhit(bam_writer, read.name.c_str(), joined_hits[i], ref_name, ref_name2,
2136 >                                       joined_hits[i].seq().c_str(), joined_hits[i].qual().c_str(), true);
2137 >                        else
2138 >                          print_bamhit(bam_writer, read.name.c_str(), joined_hits[i], ref_name, ref_name2,
2139 >                                       read.seq.c_str(), read.qual.c_str(), false);
2140 >                      }
2141 >                  }
2142 >                else
2143 >                  {
2144 >                    err_die("Error: could not get read # %d from stream\n",
2145 >                            read_in_process);
2146 >                  }
2147 >              }
2148 >          }
2149 >        else
2150 >          {
2151 >            //fprintf(stderr, "Warning: couldn't join segments for read # %d\n", read_in_process);
2152 >          }
2153 >      }
2154 >
2155 >    for (size_t fac = 0; fac < factories.size(); ++fac)
2156 >      {
2157 >        delete factories[fac];
2158 >      }
2159 >    factories.clear();
2160 >  }
2161 >
2162 >  string bam_output_fname;
2163 >  string sam_header_fname;
2164 >  string reads_fname;
2165 >
2166 >  vector<string> segmap_fnames;
2167 >  vector<string> spliced_segmap_fnames;
2168 >  
2169 >  std::set<Junction>* possible_juncs;
2170 >  std::set<Insertion>* possible_insertions;
2171 >  std::set<Fusion>* possible_fusions;
2172 >  
2173 >  RefSequenceTable* rt;
2174 >
2175 >  uint64_t begin_id;
2176 >  uint64_t end_id;
2177 >  int64_t read_offset;
2178 >  vector<int64_t> seg_offsets;
2179 >  vector<int64_t> spliced_seg_offsets;
2180 > };
2181 >
2182 > void driver(const string& bam_output_fname,
2183 >            istream& ref_stream,
2184 >            vector<FILE*>& possible_juncs_files,
2185 >            vector<FILE*>& possible_insertions_files,
2186 >            vector<FILE*>& possible_deletions_files,
2187 >            vector<FILE*>& possible_fusions_files,
2188 >            vector<string>& spliced_segmap_fnames, //.bam files
2189 >            vector<string>& segmap_fnames, //.bam files
2190 >            const string& reads_fname)
2191   {
2192 +  if (!parallel)
2193 +    num_threads = 1;
2194 +
2195    if (segmap_fnames.size() == 0)
2196    {
2197      fprintf(stderr, "No hits to process, exiting\n");
2198      exit(0);
2199    }
2200  
2201 <  RefSequenceTable rt(true, true);
2201 >  RefSequenceTable rt(sam_header, true);
2202    fprintf (stderr, "Loading reference sequences...\n");
2203 <  get_seqs(ref_stream, rt, true, false);
2203 >  get_seqs(ref_stream, rt, true);
2204      fprintf (stderr, "        reference sequences loaded.\n");
1193  ReadTable it;
1194
1195  bool need_seq = true;
1196  bool need_qual = color;
1197  //rewind(reads_file);
1198  readstream.rewind();
1199
1200  //vector<HitTable> seg_hits;
1201  vector<HitStream> contig_hits;
1202  vector<HitStream> spliced_hits;
1203
1204  vector<HitFactory*> factories;
1205  for (size_t i = 0; i < segmap_fnames.size(); ++i)
1206  {
1207    HitFactory* fac = new BAMHitFactory(it, rt);
1208    factories.push_back(fac);
1209    HitStream hs(segmap_fnames[i], fac, false, false, false, need_seq, need_qual);
1210    contig_hits.push_back(hs);
1211  }
1212
1213  fprintf(stderr, "Loading spliced hits...");
1214
1215  //vector<vector<pair<uint32_t, HitsForRead> > > spliced_hits;
1216  for (size_t i = 0; i < spliced_segmap_files.size(); ++i)
1217  {
1218    int anchor_length = 0;
1219
1220    // we allow read alignments even 1bp overlapping with juctions.
1221    // if (i == 0 || i == spliced_seg_files.size() - 1)
1222    // anchor_length = min_anchor_len;
1223
1224    HitFactory* fac = new SplicedSAMHitFactory(it,
1225                    rt,
1226                    anchor_length);
1227    factories.push_back(fac);
1228
1229    HitStream hs(spliced_segmap_files[i].file, fac, true, false, false, need_seq, need_qual);
1230    spliced_hits.push_back(hs);
1231
1232  }
1233  fprintf(stderr, "done\n");
2205  
2206    fprintf(stderr, "Loading junctions...");
2207    std::set<Junction> possible_juncs;
# Line 1254 | Line 2225
2225    }
2226    fprintf(stderr, "done\n");
2227  
1257
2228    fprintf(stderr, "Loading deletions...");
2229  
2230    for (size_t i = 0; i < possible_deletions_files.size(); ++i)
# Line 1286 | Line 2256
2256    }
2257    fprintf(stderr, "done\n");
2258  
1289
2259    /*
2260     * Read the insertions from the list of insertion
2261     * files into a set
2262     */
2263 +  fprintf(stderr, "Loading insertions...");
2264    std::set<Insertion> possible_insertions;
2265    for (size_t i=0; i < possible_insertions_files.size(); ++i)
2266    {
# Line 1320 | Line 2290
2290        possible_insertions.insert(Insertion(ref_id, left_coord, sequence));
2291      }
2292    }
2293 +  fprintf(stderr, "done\n");
2294  
2295 <  join_segment_hits(bam_writer, possible_juncs, possible_insertions, it, rt, readstream, contig_hits, spliced_hits);
2296 <  //join_segment_hits(possible_juncs, possible_insertions, it, rt, reads_file.file, contig_hits, spliced_hits);
2295 >  vector<uint64_t> read_ids;
2296 >  vector<vector<int64_t> > offsets;
2297 >  if (num_threads > 1)
2298 >    {
2299 >      vector<string> fnames;
2300 >      fnames.push_back(reads_fname);
2301 >      fnames.insert(fnames.end(), spliced_segmap_fnames.rbegin(), spliced_segmap_fnames.rend());
2302 >      fnames.insert(fnames.end(), segmap_fnames.rbegin(), segmap_fnames.rend());
2303 >      bool enough_data = calculate_offsets(fnames, read_ids, offsets);
2304 >      if (!enough_data)
2305 >        num_threads = 1;
2306 >    }
2307 >
2308 >  std::set<Fusion> possible_fusions;
2309 >  if (fusion_search)
2310 >    {
2311 >      fprintf(stderr, "Loading fusions...");
2312 >      for (size_t i=0; i < possible_fusions_files.size(); ++i)
2313 >        {
2314 >          char splice_buf[2048];        
2315 >          FILE* fusions_file = possible_fusions_files[i];
2316 >          if(!fusions_file){
2317 >            continue;
2318 >          }
2319 >          while(fgets(splice_buf, 2048, fusions_file)){
2320 >            char* nl = strrchr(splice_buf, '\n');
2321 >            char* buf = splice_buf;
2322 >            if (nl) *nl = 0;
2323 >            
2324 >            char* ref_name1 = strsep((char**)&buf, "\t");
2325 >            char* scan_left_coord = strsep((char**)&buf, "\t");
2326 >            char* ref_name2 = strsep((char**)&buf, "\t");
2327 >            char* scan_right_coord = strsep((char**)&buf, "\t");
2328 >            char* scan_dir = strsep((char**)&buf, "\t");
2329 >            
2330 >            if (!ref_name1 || !scan_left_coord || !ref_name2 || !scan_right_coord || !scan_dir)
2331 >              {
2332 >                fprintf(stderr,"Error: malformed insertion coordinate record\n");
2333 >                exit(1);
2334 >              }
2335 >            
2336 >            uint32_t ref_id1 = rt.get_id(ref_name1,NULL,0);
2337 >            uint32_t ref_id2 = rt.get_id(ref_name2,NULL,0);
2338 >            uint32_t left_coord = atoi(scan_left_coord);
2339 >            uint32_t right_coord = atoi(scan_right_coord);
2340 >            uint32_t dir = FUSION_FF;
2341 >            if (strcmp(scan_dir, "fr") == 0)
2342 >              dir = FUSION_FR;
2343 >            else if(strcmp(scan_dir, "rf") == 0)
2344 >              dir = FUSION_RF;
2345 >            else if (strcmp(scan_dir, "rr") == 0)
2346 >              dir = FUSION_RR;
2347 >            
2348 >            possible_fusions.insert(Fusion(ref_id1, ref_id2, left_coord, right_coord, dir));
2349 >          }
2350 >        }
2351 >      fprintf(stderr, "done\n");
2352 >    }
2353 >      
2354 >  vector<boost::thread*> threads;
2355 >  for (int i = 0; i < num_threads; ++i)
2356 >    {
2357 >      JoinSegmentsWorker worker;
2358  
2359 <  /*for (size_t i = 0; i < segmap_fnames.size(); ++i)
2360 <      segmap_fnames[i].close();
2361 <  */
2362 <  for (size_t i = 0; i < spliced_segmap_files.size(); ++i)
2363 <        spliced_segmap_files[i].close();
2364 <  readstream.close();
2359 >      if (num_threads == 1)
2360 >        worker.bam_output_fname = bam_output_fname;
2361 >      else
2362 >        {
2363 >          string filename_base = bam_output_fname.substr(0, bam_output_fname.length() - 4);
2364 >          char filename[1024] = {0};
2365 >          sprintf(filename, "%s%d.bam", filename_base.c_str(), i);
2366 >          worker.bam_output_fname = filename;
2367 >        }
2368 >      
2369 >      worker.sam_header_fname = sam_header;
2370 >      worker.reads_fname = reads_fname;
2371 >      worker.segmap_fnames = segmap_fnames;
2372 >      worker.spliced_segmap_fnames = spliced_segmap_fnames;
2373 >      worker.possible_juncs = &possible_juncs;
2374 >      worker.possible_insertions = &possible_insertions;
2375 >      worker.possible_fusions = &possible_fusions;
2376 >      worker.rt = &rt;
2377 >      if (i == 0)
2378 >        {
2379 >          worker.begin_id = 0;
2380 >          worker.seg_offsets = vector<int64_t>(segmap_fnames.size(), 0);
2381 >          worker.spliced_seg_offsets = vector<int64_t>(spliced_segmap_fnames.size(), 0);
2382 >          worker.read_offset = 0;
2383 >        }
2384 >      else
2385 >        {
2386 >          worker.begin_id = read_ids[i-1];
2387 >          worker.seg_offsets.insert(worker.seg_offsets.end(),
2388 >                                    offsets[i-1].rbegin(),
2389 >                                    offsets[i-1].rbegin() + segmap_fnames.size());
2390 >          worker.spliced_seg_offsets.insert(worker.spliced_seg_offsets.end(),
2391 >                                            offsets[i-1].rbegin() + segmap_fnames.size(),
2392 >                                            offsets[i-1].rend() - 1);
2393 >          worker.read_offset = offsets[i-1][0];
2394 >        }
2395 >      
2396 >      worker.end_id = (i+1 < num_threads) ? read_ids[i] : std::numeric_limits<uint64_t>::max();
2397  
2398 <  for (size_t fac = 0; fac < factories.size(); ++fac)
2399 <  {
2400 <    delete factories[fac];
2401 <  }
2398 >      if (num_threads > 1)
2399 >        threads.push_back(new boost::thread(worker));
2400 >      else
2401 >        worker();
2402 >    }
2403 >
2404 >  for (size_t i = 0; i < threads.size(); ++i)
2405 >    {
2406 >      threads[i]->join();
2407 >      delete threads[i];
2408 >      threads[i] = NULL;
2409 >    }
2410 >  threads.clear();
2411 >  
2412   } //driver
2413  
2414   int main(int argc, char** argv)
# Line 1380 | Line 2454
2454    string insertions_file_list = argv[optind++];
2455    if(optind >= argc)
2456        {
2457 <  print_usage();
2458 <  return 1;
2457 >        print_usage();
2458 >        return 1;
2459        }
2460 +  
2461 +  string deletions_file_list = argv[optind++];
2462 +  
2463 +  if(optind >= argc)
2464 +    {
2465 +      print_usage();
2466 +      return 1;
2467 +    }
2468  
2469 <    string deletions_file_list = argv[optind++];
2470 <
2471 <    if(optind >= argc)
2472 <      {
2473 <  print_usage();
2474 <  return 1;
2475 <      }
2476 <
2477 <
2469 >  string fusions_file_list = argv[optind++];
2470 >  
2471 >  if(optind >= argc)
2472 >    {
2473 >      print_usage();
2474 >      return 1;
2475 >    }
2476 >  
2477 >  string bam_output_fname = argv[optind++];
2478 >  
2479 >  if(optind >= argc)
2480 >    {
2481 >      print_usage();
2482 >      return 1;
2483 >    }
2484 >  
2485    string segmap_file_list = argv[optind++];
2486    string spliced_segmap_flist;
2487    if(optind < argc)
# Line 1409 | Line 2498
2498    //FILE* reads_file = fopen(reads_file_name.c_str(), "r");
2499    vector<string> segmap_file_names;
2500    tokenize(segmap_file_list, ",",segmap_file_names);
1412  ReadStream readstream(reads_file_name);
1413  if (readstream.file()==NULL)
1414     err_die("Error: cannot open %s for reading\n",
1415        reads_file_name.c_str());
2501  
2502    vector<string> juncs_file_names;
2503    vector<FILE*> juncs_files;
# Line 1472 | Line 2557
2557  
2558    vector<string> spliced_segmap_file_names;
2559    vector<FZPipe> spliced_segmap_files;
1475  //these are headerless .sam.z files
2560    string unzcmd;
2561    tokenize(spliced_segmap_flist, ",",spliced_segmap_file_names);
2562 <  for (size_t i = 0; i < spliced_segmap_file_names.size(); ++i)
2562 >
2563 >  vector<string> fusions_file_names;
2564 >  vector<FILE*> fusions_files;
2565 >  tokenize(fusions_file_list, ",",fusions_file_names);
2566 >  for (size_t i = 0; i < fusions_file_names.size(); ++i)
2567      {
2568        fprintf(stderr, "Opening %s for reading\n",
2569 <        spliced_segmap_file_names[i].c_str());
2570 <      if (unzcmd.empty())
2571 <          unzcmd=getUnpackCmd(spliced_segmap_file_names[i],false);
2572 <      FZPipe spliced_seg_file(spliced_segmap_file_names[i], unzcmd);
2573 <      if (spliced_seg_file.file == NULL)
2574 <        {
2575 <        err_die("Error: cannot open %s for reading\n",
2576 <             spliced_segmap_file_names[i].c_str());
2577 <        }
2578 <        spliced_segmap_files.push_back(spliced_seg_file);
2579 <    }
2569 >              fusions_file_names[i].c_str());
2570 >      FILE* fusions_file = fopen(fusions_file_names[i].c_str(), "r");
2571 >      if (fusions_file == NULL)
2572 >        {
2573 >          fprintf(stderr, "Warning: cannot open %s for reading\n",
2574 >                  fusions_file_names[i].c_str());
2575 >          continue;
2576 >        }
2577 >      fusions_files.push_back(fusions_file);
2578 >    }
2579 >  
2580 >  
2581 >  driver(bam_output_fname,
2582 >         ref_stream,
2583 >         juncs_files,
2584 >         insertions_files,
2585 >         deletions_files,
2586 >         fusions_files,
2587 >         spliced_segmap_file_names,
2588 >         segmap_file_names,
2589 >         reads_file_name);
2590  
1493  if (spliced_segmap_files.size())
1494    {
1495      if (spliced_segmap_files.size() != segmap_file_names.size())
1496        {
1497        err_die("Error: each segment file must have a corresponding spliced segment file\n");
1498        }
1499    }
1500  GBamWriter bam_writer("-", sam_header.c_str());
1501  driver(bam_writer, ref_stream, juncs_files, insertions_files, deletions_files, spliced_segmap_files, segmap_file_names, readstream);
1502  //driver(ref_stream, juncs_files, insertions_files, deletions_files, spliced_segment_files, segment_files, reads_file);
2591    return 0;
2592   }

Diff Legend

Removed lines
+ Added lines
< Changed lines
> Changed lines