ViewVC Help
View File | Revision Log | Show Annotations | View Changeset | Root Listing
root/gclib/tophat_cpp/long_spanning_reads.cpp
Revision: 159
Committed: Mon Jan 30 22:38:08 2012 UTC (8 years, 8 months ago) by gpertea
File size: 99258 byte(s)
Log Message:
wip

Line User Rev File contents
1 gpertea 29 #ifdef HAVE_CONFIG_H
2     #include <config.h>
3     #endif
4    
5     /*
6     * long_spanning_reads.cpp
7     * TopHat
8     *
9     * Created by Cole Trapnell on 2/5/09.
10     * Copyright 2009 Cole Trapnell. All rights reserved.
11     *
12     */
13    
14     #include <cassert>
15     #include <cstdio>
16     #include <vector>
17     #include <string>
18     #include <map>
19     #include <algorithm>
20     #include <set>
21     #include <cstdlib>
22     #include <cstring>
23     #include <bitset>
24     //#include <stdexcept>
25     #include <iostream>
26     #include <fstream>
27     #include <sstream>
28     #include <seqan/sequence.h>
29     #include <seqan/find.h>
30     #include <seqan/file.h>
31     #include <seqan/modifier.h>
32     #include <getopt.h>
33    
34 gpertea 154 #include <boost/thread.hpp>
35    
36 gpertea 29 #include "common.h"
37 gpertea 154 #include "utils.h"
38 gpertea 29 #include "bwt_map.h"
39     #include "tokenize.h"
40     #include "segments.h"
41     #include "reads.h"
42    
43     #include "junctions.h"
44     #include "insertions.h"
45     #include "deletions.h"
46 gpertea 154 #include "fusions.h"
47 gpertea 29
48     using namespace seqan;
49     using namespace std;
50    
51 gpertea 154 // daehwan
52     bool bDebug = false;
53    
54 gpertea 29 void print_usage()
55     {
56     fprintf(stderr, "Usage: long_spanning_reads <reads.fa/.fq> <possible_juncs1,...,possible_juncsN> <possible_insertions1,...,possible_insertionsN> <possible_deletions1,...,possible_deletionsN> <seg1.bwtout,...,segN.bwtout> [spliced_seg1.bwtout,...,spliced_segN.bwtout]\n");
57     }
58    
59     bool key_lt(const pair<uint32_t, HitsForRead>& lhs,
60     const pair<uint32_t, HitsForRead>& rhs)
61     {
62     return lhs.first < rhs.first;
63     }
64    
65     void get_seqs(istream& ref_stream,
66     RefSequenceTable& rt,
67 gpertea 154 bool keep_seqs = true)
68 gpertea 29 {
69     while(ref_stream.good() &&
70     !ref_stream.eof())
71     {
72     RefSequenceTable::Sequence* ref_str = new RefSequenceTable::Sequence();
73     string name;
74     readMeta(ref_stream, name, Fasta());
75     string::size_type space_pos = name.find_first_of(" \t\r");
76     if (space_pos != string::npos)
77     {
78     name.resize(space_pos);
79     }
80     seqan::read(ref_stream, *ref_str, Fasta());
81    
82     rt.get_id(name, keep_seqs ? ref_str : NULL, 0);
83     }
84     }
85    
86    
87     void look_right_for_hit_group(ReadTable& unmapped_reads,
88     vector<HitStream>& contig_hits,
89     size_t curr_file,
90     vector<HitStream>& spliced_hits,
91     const HitsForRead& targets,
92     vector<HitsForRead>& seg_hits_for_read)
93     {
94     int right_file = curr_file + 1;
95    
96     HitStream& right = contig_hits[right_file];
97     uint64_t curr_next_group_id = targets.insert_id;
98     int curr_order = unmapped_reads.observation_order(curr_next_group_id);
99    
100     assert (curr_order != -1);
101     while(true)
102     {
103     HitsForRead hit_group;
104     uint64_t right_next_group_id = right.next_group_id();
105     int right_order = unmapped_reads.observation_order(right_next_group_id);
106    
107     // If we would have seen the hits by now, bail out.
108     if (curr_order < right_order || right_order == -1)
109     {
110     break;
111     }
112     if (right.next_read_hits(hit_group))
113     {
114     if (hit_group.insert_id == targets.insert_id)
115     {
116     // Some of the targets may be missing, we need to
117     // process them individually
118     seg_hits_for_read[right_file] = hit_group;
119     break;
120     }
121     }
122     }
123    
124     HitsForRead& curr_seg_hits = seg_hits_for_read[right_file];
125    
126     if (right_file < (int)spliced_hits.size() && right_file >= 0)
127 gpertea 154 {
128 gpertea 29 // Scan forward in the spliced hits file for this hit group
129     HitsForRead spliced_group;
130     HitsForRead curr_spliced_group;
131     while (spliced_hits[right_file].next_group_id() > 0 &&
132     spliced_hits[right_file].next_group_id() <= (uint32_t)curr_order)
133     {
134     spliced_hits[right_file].next_read_hits(curr_spliced_group);
135    
136     if (curr_spliced_group.insert_id == (uint32_t)curr_order)
137     {
138     spliced_group = curr_spliced_group;
139     break;
140     }
141     }
142    
143     if (!spliced_group.hits.empty())
144     {
145     curr_seg_hits.insert_id = spliced_group.insert_id;
146     curr_seg_hits.hits.insert(curr_seg_hits.hits.end(),
147     spliced_group.hits.begin(),
148     spliced_group.hits.end());
149     }
150     }
151    
152     if (curr_seg_hits.hits.empty())
153     return;
154     else if (right_file + 1 < (int)contig_hits.size())
155     {
156     look_right_for_hit_group(unmapped_reads,
157     contig_hits,
158     curr_file + 1,
159     spliced_hits,
160     curr_seg_hits,
161     seg_hits_for_read);
162     }
163     }
164    
165 gpertea 159 BowtieHit merge_chain_color(RefSequenceTable& rt,
166     const string& read_seq,
167     const string& read_quals,
168     std::set<Junction>& possible_juncs,
169     std::set<Insertion>& possible_insertions,
170     list<BowtieHit>& hit_chain)
171     {
172     bool antisense = hit_chain.front().antisense_align();
173     uint32_t reference_id = hit_chain.front().ref_id();
174     uint64_t insert_id = hit_chain.front().insert_id();
175    
176     int left = hit_chain.front().left();
177    
178     list<BowtieHit>::iterator prev_hit = hit_chain.begin();
179     list<BowtieHit>::iterator curr_hit = ++(hit_chain.begin());
180    
181     string seq;
182     string qual;
183     int old_read_length = 0;
184     int first_seg_length = hit_chain.front().seq().length();
185     for (list<BowtieHit>::iterator i = hit_chain.begin(); i != hit_chain.end(); ++i)
186     {
187     seq += i->seq();
188     qual += i->qual();
189     old_read_length += i->read_len();
190     }
191    
192     string rev_read_seq, rev_read_quals;
193     if (color && antisense)
194     {
195     rev_read_seq = read_seq;
196     reverse(rev_read_seq.begin() + 1, rev_read_seq.end());
197    
198     rev_read_quals = read_quals;
199     reverse(rev_read_quals.begin(), rev_read_quals.end());
200     }
201    
202     while (curr_hit != hit_chain.end() && prev_hit != hit_chain.end())
203     {
204     /*
205     * Note that the gap size may be negative, since left() and right() return
206     * signed integers, this will be OK.
207     */
208     int gap = curr_hit->left() - prev_hit->right();
209     if (gap < -(int)max_insertion_length ||
210     (gap > (int)max_deletion_length &&
211     (gap < min_report_intron_length || gap > max_report_intron_length)))
212     {
213     return BowtieHit();
214     }
215    
216     ++prev_hit;
217     ++curr_hit;
218     }
219    
220     prev_hit = hit_chain.begin();
221     curr_hit = ++(hit_chain.begin());
222    
223     RefSequenceTable::Sequence* ref_str = rt.get_seq(prev_hit->ref_id());
224     if (!ref_str)
225     return BowtieHit();
226    
227     int curr_seg_index = 1;
228     while (curr_hit != hit_chain.end() && prev_hit != hit_chain.end())
229     {
230     /*
231     * This code is assuming that the cigar strings end and start with a match
232     * While segment alignments can actually end with a junction, insertion or deletion, the hope
233     * is that in those cases, the right and left ends of the alignments will correctly
234     * line up, so we won't get to this bit of code
235     */
236     if (prev_hit->cigar().back().opcode != MATCH || curr_hit->cigar().front().opcode != MATCH)
237     {
238     return BowtieHit();
239     }
240    
241     if (prev_hit->is_spliced() && curr_hit->is_spliced() && prev_hit->antisense_splice() != curr_hit->antisense_splice())
242     {
243     /*
244     * There is no way that we can splice these together into a valid
245     * alignment
246     */
247     return BowtieHit();
248     }
249    
250     bool found_closure = false;
251     /*
252     * Note that antisense_splice is the same for prev and curr
253     */
254     bool antisense_closure = prev_hit->is_spliced() ? prev_hit->antisense_splice() : curr_hit->antisense_splice();
255     vector<CigarOp> new_cigar;
256     int new_left = -1;
257     int mismatch = 0;
258    
259     /*
260     * Take the length of matched bases in each segment into consideration for closures,
261     * this can be a problem for reads of variable lengths.
262     */
263     int prev_right_end_match_length = prev_hit->cigar().back().length;
264     int curr_left_end_match_length = curr_hit->cigar().front().length;
265    
266     if (prev_hit->right() > curr_hit->left())
267     {
268     std::set<Insertion>::iterator lb, ub;
269     /*
270     * Note, this offset is determined by the min-anchor length supplied to
271     * juncs_db, which is currently hard-coded at 3 in tophat.py
272     * as this value determines what sort of read segments
273     * should be able to align directly to the splice sequences
274     */
275     int left_boundary = prev_hit->right() - 4;
276     int right_boundary = curr_hit->left() + 4;
277    
278     /*
279     * Create a dummy sequence to represent the maximum possible insertion
280     */
281     std::string maxInsertedSequence = "";
282     maxInsertedSequence.resize(max_insertion_length,'A');
283    
284     lb = possible_insertions.upper_bound(Insertion(reference_id, left_boundary, ""));
285     ub = possible_insertions.upper_bound(Insertion(reference_id, right_boundary, maxInsertedSequence));
286    
287     int reference_mismatch = 0;
288    
289     while (lb != ub && lb != possible_insertions.end())
290     {
291     /*
292     * In the following code, we will check to make sure that the segments have the proper
293     * separation and sequence for the insertions, and generate the appropriate merged bowtie hit
294     * In general, reads with insertions must match the inserted sequence exactly.
295     */
296     if (((int)lb->sequence.size()) == (prev_hit->right() - curr_hit->left()))
297     {
298     /*
299     * Check we have enough matched bases on prev or curr segment.
300     */
301     int insert_to_prev_right = prev_hit->right() - lb->left - 1;
302     int curr_left_to_insert = lb->left - curr_hit->left() + 1;
303     if (insert_to_prev_right > prev_right_end_match_length || curr_left_to_insert > curr_left_end_match_length)
304     {
305     ++lb;
306     continue;
307     }
308    
309     /*
310     * Keep track of how many mismatches were made to the genome in the region
311     * where we should actually be matching against the insertion
312     */
313     int this_reference_mismatch = 0;
314     int insertion_mismatch = 0;
315     int insertion_len = lb->sequence.length();
316     const seqan::Dna5String insertionSequence = seqan::Dna5String(lb->sequence);
317    
318     /*
319     * First check to see if we need to adjust number of observed errors for the left (prev)
320     * hit. This is only the case if this segment runs into the insertion. To be consistent
321     * with bwt_map.cpp, we will not allow a segment to have errors in the insertion region
322     */
323     string colorSegmentSequence_prev;
324     if (insert_to_prev_right > 0)
325     {
326     const seqan::Dna5String referenceSequence = seqan::infix(*ref_str, lb->left + 1, prev_hit->right());
327     const seqan::Dna5String oldSegmentSequence = seqan::Dna5String(prev_hit->seq().substr(prev_hit->seq().length() - insert_to_prev_right));
328    
329     if (color)
330     {
331     string color;
332    
333     if (antisense)
334     color = rev_read_seq.substr(rev_read_seq.length() - (curr_seg_index * segment_length) - insert_to_prev_right - 2, insert_to_prev_right + 1);
335     else
336     color = read_seq.substr(curr_seg_index * segment_length - insert_to_prev_right, insert_to_prev_right + 1);
337    
338     color[0] = prev_hit->seq()[segment_length - insert_to_prev_right - 1];
339     colorSegmentSequence_prev = convert_color_to_bp(color);
340     }
341    
342     const seqan::Dna5String newSegmentSequence = color ? colorSegmentSequence_prev : oldSegmentSequence;
343    
344     /*
345     * Scan right in the read until we run out of read
346     */
347     for (int read_index = 0; read_index < insert_to_prev_right; ++read_index)
348     {
349     /*
350     * Any mismatch to the insertion is a failure
351     */
352     if (referenceSequence[read_index] == 'N' || referenceSequence[read_index] != oldSegmentSequence[read_index])
353     {
354     ++this_reference_mismatch;
355     }
356    
357     if (read_index < insertion_len)
358     {
359     if (insertionSequence[read_index] == 'N' || insertionSequence[read_index] != newSegmentSequence[read_index])
360     {
361     ++insertion_mismatch;
362     break;
363     }
364     }
365     else
366     {
367     if (referenceSequence[read_index - insertion_len] == 'N' ||
368     referenceSequence[read_index - insertion_len] != newSegmentSequence[read_index])
369     {
370     --this_reference_mismatch;
371     }
372     }
373     }
374     }
375    
376     string colorSegmentSequence_curr;
377     if (curr_left_to_insert > 0)
378     {
379     const seqan::Dna5String referenceSequence = seqan::infix(*ref_str, curr_hit->left(), lb->left + 1);
380     const seqan::Dna5String oldSegmentSequence = seqan::Dna5String(curr_hit->seq().substr(0, curr_left_to_insert));
381    
382     if (color)
383     {
384     string color;
385     if (antisense)
386     color = rev_read_seq.substr(rev_read_seq.length() - (curr_seg_index * segment_length), curr_left_to_insert);
387     else
388     color = read_seq.substr(curr_seg_index * segment_length + 2, curr_left_to_insert);
389    
390     color.push_back(curr_hit->seq()[curr_left_to_insert]);
391     reverse(color.begin(), color.end());
392     string bp = convert_color_to_bp(color);
393     reverse(bp.begin(), bp.end());
394     colorSegmentSequence_curr = bp;
395     }
396    
397     const seqan::Dna5String newSegmentSequence = color ? colorSegmentSequence_curr : oldSegmentSequence;
398    
399     /*
400     * Scan left in the read until
401     * We ran out of read sequence (insertion extends past segment)
402     */
403     for (int read_index = 0; read_index < curr_left_to_insert; ++read_index)
404     {
405     int segmentPosition = curr_left_to_insert - read_index - 1;
406     int insertionPosition = insertion_len - read_index - 1;
407    
408     if (referenceSequence[segmentPosition] == 'N' || (referenceSequence[segmentPosition] != oldSegmentSequence[segmentPosition]))
409     {
410     ++this_reference_mismatch;
411     }
412    
413     if (read_index < insertion_len)
414     {
415     if (insertionSequence[insertionPosition] == 'N' || (insertionSequence[insertionPosition] != newSegmentSequence[segmentPosition]))
416     {
417     ++insertion_mismatch;
418     break;
419     }
420     }
421     else
422     {
423     if (referenceSequence[segmentPosition + insertion_len] == 'N' ||
424     (referenceSequence[segmentPosition + insertion_len] != newSegmentSequence[segmentPosition]))
425     {
426     --this_reference_mismatch;
427     }
428     }
429     }
430     }
431    
432     if (found_closure)
433     {
434     fprintf(stderr, "Warning: multiple closures found for insertion read # %d\n", (int)insert_id);
435     return BowtieHit();
436     }
437    
438     if (insertion_mismatch == 0)
439     {
440     reference_mismatch = this_reference_mismatch;
441     mismatch = -reference_mismatch;
442     found_closure = true;
443     new_left = prev_hit->left();
444     new_cigar = prev_hit->cigar();
445    
446     /*
447     * Need to make a new insert operation between the two match character that begin
448     * and end the intersection of these two junction. Note that we necessarily assume
449     * that this insertion can't span beyond the boundaries of these reads. That should
450     * probably be better enforced somewhere
451     */
452    
453     new_cigar.back().length -= insert_to_prev_right;
454     if (new_cigar.back().length <= 0)
455     new_cigar.pop_back();
456    
457     new_cigar.push_back(CigarOp(INS, lb->sequence.size()));
458     vector<CigarOp> new_right_cigar = curr_hit->cigar();
459     new_right_cigar.front().length += (insert_to_prev_right - lb->sequence.size());
460    
461     /*
462     * Finish stitching together the new cigar string
463     */
464     size_t c = new_right_cigar.front().length > 0 ? 0 : 1;
465     for (; c < new_right_cigar.size(); ++c)
466     {
467     new_cigar.push_back(new_right_cigar[c]);
468     }
469    
470     if (color)
471     {
472     if (insert_to_prev_right > 0)
473     seq.replace(first_seg_length + (curr_seg_index - 1) * segment_length - insert_to_prev_right, insert_to_prev_right, colorSegmentSequence_prev);
474    
475     if (curr_left_to_insert > 0)
476     seq.replace(first_seg_length + (curr_seg_index - 1) * segment_length, curr_left_to_insert, colorSegmentSequence_curr);
477     }
478     }
479     }
480     ++lb;
481     }
482    
483     if (!found_closure)
484     {
485     return BowtieHit();
486     }
487     }
488    
489     /*
490     * Stitch segments together using juctions or deletions if necessary.
491     */
492     else if (prev_hit->right() < curr_hit->left())
493     {
494     std::set<Junction>::iterator lb, ub;
495    
496     int left_boundary = prev_hit->right() - 4;
497     int right_boundary = curr_hit->left() + 4;
498    
499     lb = possible_juncs.upper_bound(Junction(reference_id, left_boundary, right_boundary - 8, true));
500     ub = possible_juncs.lower_bound(Junction(reference_id, left_boundary + 8, right_boundary, false));
501    
502     int new_diff_mismatches = 0xff;
503     while (lb != ub && lb != possible_juncs.end())
504     {
505     int dist_to_left = lb->left - prev_hit->right() + 1;
506     int dist_to_right = lb->right - curr_hit->left();
507    
508     if (abs(dist_to_left) <= 4 && abs(dist_to_right) <= 4 && dist_to_left == dist_to_right)
509     {
510     /*
511     * Check we have enough matched bases on prev or curr segment.
512     */
513     if (dist_to_left > curr_left_end_match_length || -dist_to_left > prev_right_end_match_length )
514     {
515     ++lb;
516     continue;
517     }
518    
519     Dna5String new_cmp_str, old_cmp_str;
520     int new_mismatch = 0, old_mismatch = 0;
521     string new_patch_str; // this is for colorspace reads
522     if (dist_to_left > 0)
523     {
524     new_cmp_str = seqan::infix(*ref_str, prev_hit->right(), lb->left + 1);
525     old_cmp_str = seqan::infix(*ref_str, curr_hit->left(), lb->right);
526    
527     string new_seq;
528     if (color)
529     {
530     string ref = DnaString_to_string(seqan::infix(*ref_str, prev_hit->right() - 1, lb->left + 1));
531    
532     string color, qual;
533     if (antisense)
534     {
535     color = rev_read_seq.substr(rev_read_seq.length() - (curr_seg_index * segment_length) - 1, dist_to_left);
536     qual = rev_read_quals.substr(rev_read_quals.length() - (curr_seg_index * segment_length) - 1, dist_to_left);
537     }
538     else
539     {
540     color = read_seq.substr(1 + curr_seg_index * segment_length, dist_to_left);
541     qual = read_quals.substr(curr_seg_index * segment_length, dist_to_left);
542     }
543    
544     BWA_decode(color, qual, ref, new_seq);
545     new_seq = new_seq.substr(1);
546     }
547    
548     const string& curr_old_seq = curr_hit->seq();
549     const string& curr_seq = color ? new_seq : curr_hit->seq();
550     for (int i = 0; i < dist_to_left; ++i)
551     {
552     if (curr_seq[i] != new_cmp_str[i])
553     ++new_mismatch;
554    
555     if (curr_old_seq[i] != old_cmp_str[i])
556     ++old_mismatch;
557     }
558    
559     if (color)
560     new_patch_str = curr_seq.substr(0, dist_to_left);
561     }
562     else if (dist_to_left < 0)
563     {
564     new_cmp_str = seqan::infix(*ref_str, lb->right, curr_hit->left());
565     old_cmp_str = seqan::infix(*ref_str, lb->left + 1, prev_hit->right());
566    
567     size_t abs_dist = -dist_to_left;
568     string new_seq;
569     if (color)
570     {
571     string ref = DnaString_to_string(seqan::infix(*ref_str, lb->left, lb->left + 1));
572     ref += DnaString_to_string(seqan::infix(*ref_str, lb->right, curr_hit->left()));
573    
574     string color, qual;
575     if (antisense)
576     {
577     color = rev_read_seq.substr(rev_read_seq.length() - (curr_seg_index * segment_length) - 1 - abs_dist, abs_dist);
578     qual = rev_read_quals.substr(rev_read_quals.length() - (curr_seg_index * segment_length) - 1 - abs_dist, abs_dist);
579     }
580     else
581     {
582     color = read_seq.substr(1 + curr_seg_index * segment_length - abs_dist, abs_dist);
583     qual = read_quals.substr(curr_seg_index * segment_length - abs_dist, abs_dist);
584     }
585    
586     BWA_decode(color, qual, ref, new_seq);
587     new_seq = new_seq.substr(1);
588     }
589    
590     const string& prev_old_seq = prev_hit->seq();
591     size_t prev_old_seq_len = prev_old_seq.length();
592     const string& prev_seq = color ? new_seq : prev_hit->seq();
593     size_t prev_seq_len = prev_seq.length();
594     for (size_t i = 0; i < abs_dist; ++i)
595     {
596     if (prev_seq[prev_seq_len - (abs_dist - i)] != new_cmp_str[i])
597     ++new_mismatch;
598     if (prev_old_seq[prev_old_seq_len - (abs_dist - i)] != old_cmp_str[i])
599     ++old_mismatch;
600     }
601    
602     if (color)
603     new_patch_str = prev_seq.substr(prev_seq_len - abs_dist, abs_dist);
604     }
605    
606     int temp_diff_mismatches = new_mismatch - old_mismatch;
607     if (temp_diff_mismatches >= new_diff_mismatches || new_mismatch >= 2)
608     {
609     ++lb;
610     continue;
611     }
612    
613     if (color)
614     {
615     /*
616     * We need to recover the origianl sequence.
617     */
618     if (found_closure)
619     {
620     seq.replace(first_seg_length + (curr_seg_index - 1) * segment_length - 4, 8,
621     prev_hit->seq().substr(prev_hit->seq().length() - 4) + curr_hit->seq().substr(0, 4));
622     }
623    
624     if (dist_to_left > 0)
625     seq.replace(first_seg_length + (curr_seg_index - 1) * segment_length, dist_to_left, new_patch_str);
626     else if (dist_to_left < 0)
627     seq.replace(first_seg_length + (curr_seg_index - 1) * segment_length + dist_to_left, -dist_to_left, new_patch_str);
628     }
629    
630     new_diff_mismatches = temp_diff_mismatches;
631    
632     new_left = prev_hit->left();
633     new_cigar = prev_hit->cigar();
634    
635     int new_left_back_len = new_cigar.back().length;
636     new_left_back_len += dist_to_left;
637    
638     vector<CigarOp> new_right_cig = curr_hit->cigar();
639     int new_right_front_len = new_right_cig.front().length;
640     new_right_front_len -= dist_to_right;
641     if (new_left_back_len > 0)
642     new_cigar.back().length = new_left_back_len;
643     else
644     new_cigar.pop_back();
645    
646     /*
647     * FIXME, currently just differentiating between a deletion and a
648     * reference skip based on length. However, would probably be better
649     * to denote the difference explicitly, this would allow the user
650     * to supply their own (very large) deletions
651     */
652     if ((lb->right - lb->left - 1) <= max_deletion_length)
653     {
654     new_cigar.push_back(CigarOp(DEL, lb->right - lb->left - 1));
655     antisense_closure = prev_hit->is_spliced() ? prev_hit->antisense_splice() : curr_hit->antisense_splice();
656     }
657     else
658     {
659     new_cigar.push_back(CigarOp(REF_SKIP, lb->right - lb->left - 1));
660     antisense_closure = lb->antisense;
661     }
662    
663     new_right_cig.front().length = new_right_front_len;
664     size_t c = new_right_front_len > 0 ? 0 : 1;
665     for (; c < new_right_cig.size(); ++c)
666     new_cigar.push_back(new_right_cig[c]);
667    
668     mismatch = new_diff_mismatches;
669     found_closure = true;
670     }
671     ++lb;
672     }
673    
674     if (!found_closure)
675     {
676     return BowtieHit();
677     }
678     }
679    
680     if (found_closure)
681     {
682     bool end = false;
683     BowtieHit merged_hit(reference_id,
684     reference_id,
685     insert_id,
686     new_left,
687     new_cigar,
688     antisense,
689     antisense_closure,
690     prev_hit->edit_dist() + curr_hit->edit_dist() + mismatch,
691     prev_hit->splice_mms() + curr_hit->splice_mms(),
692     end);
693    
694     if (curr_seg_index > 1)
695     merged_hit.seq(seq.substr(first_seg_length + (curr_seg_index - 1) * segment_length, 2 * segment_length));
696     else
697     merged_hit.seq(seq.substr(0, first_seg_length + segment_length));
698    
699     prev_hit = hit_chain.erase(prev_hit, ++curr_hit);
700     /*
701     * prev_hit now points PAST the last element removed
702     */
703     prev_hit = hit_chain.insert(prev_hit, merged_hit);
704     /*
705     * merged_hit has been inserted before the old position of
706     * prev_hit. New location of prev_hit is merged_hit
707     */
708     curr_hit = prev_hit;
709     ++curr_hit;
710     ++curr_seg_index;
711     continue;
712     }
713    
714     ++prev_hit;
715     ++curr_hit;
716     ++curr_seg_index;
717     }
718    
719     bool saw_antisense_splice = false;
720     bool saw_sense_splice = false;
721     vector<CigarOp> long_cigar;
722     int num_mismatches = 0;
723     int num_splice_mms = 0;
724     for (list<BowtieHit>::iterator s = hit_chain.begin(); s != hit_chain.end(); ++s)
725     {
726     num_mismatches += s->edit_dist();
727     num_splice_mms += s->splice_mms();
728    
729     /*
730     * Check whether the sequence contains any reference skips. Previously,
731     * this was just a check to see whether the sequence was contiguous; however
732     * we don't want to count an indel event as a splice
733     */
734     bool containsSplice = s->is_spliced();
735     if (containsSplice)
736     {
737     if (s->antisense_splice())
738     {
739     if (saw_sense_splice)
740     return BowtieHit();
741     saw_antisense_splice = true;
742     }
743     else
744     {
745     if (saw_antisense_splice)
746     return BowtieHit();
747     saw_sense_splice = true;
748     }
749     }
750     const vector<CigarOp>& cigar = s->cigar();
751     if (long_cigar.empty())
752     {
753     long_cigar = cigar;
754     }
755     else
756     {
757     CigarOp& last = long_cigar.back();
758     /*
759     * If necessary, merge the back and front
760     * cigar operations
761     */
762     if(last.opcode == cigar[0].opcode){
763     last.length += cigar[0].length;
764     for (size_t b = 1; b < cigar.size(); ++b)
765     {
766     long_cigar.push_back(cigar[b]);
767     }
768     }else{
769     for(size_t b = 0; b < cigar.size(); ++b)
770     {
771     long_cigar.push_back(cigar[b]);
772     }
773     }
774     }
775     }
776    
777     bool end = false;
778     BowtieHit new_hit(reference_id,
779     reference_id,
780     insert_id,
781     left,
782     long_cigar,
783     antisense,
784     saw_antisense_splice,
785     num_mismatches,
786     num_splice_mms,
787     end);
788    
789     new_hit.seq(seq);
790     new_hit.qual(qual);
791    
792     int new_read_len = new_hit.read_len();
793     if (new_read_len != old_read_length || !new_hit.check_editdist_consistency(rt))
794     {
795     fprintf(stderr, "Warning: malformed closure\n");
796     return BowtieHit();
797     }
798    
799     return new_hit;
800     }
801    
802 gpertea 29 BowtieHit merge_chain(RefSequenceTable& rt,
803 gpertea 154 const string& read_seq,
804     const string& read_quals,
805     std::set<Junction>& possible_juncs,
806     std::set<Insertion>& possible_insertions,
807     std::set<Fusion>& possible_fusions,
808     list<BowtieHit>& hit_chain,
809     int fusion_dir = FUSION_NOTHING)
810 gpertea 29 {
811     bool antisense = hit_chain.front().antisense_align();
812     uint64_t insert_id = hit_chain.front().insert_id();
813 gpertea 154
814     const int left = hit_chain.front().left();
815    
816 gpertea 29 list<BowtieHit>::iterator prev_hit = hit_chain.begin();
817     list<BowtieHit>::iterator curr_hit = ++(hit_chain.begin());
818 gpertea 154
819 gpertea 29 string seq;
820     string qual;
821     int old_read_length = 0;
822     int first_seg_length = hit_chain.front().seq().length();
823     for (list<BowtieHit>::iterator i = hit_chain.begin(); i != hit_chain.end(); ++i)
824     {
825     seq += i->seq();
826     qual += i->qual();
827     old_read_length += i->read_len();
828     }
829    
830     string rev_read_seq, rev_read_quals;
831     if (color && antisense)
832     {
833     rev_read_seq = read_seq;
834     reverse(rev_read_seq.begin() + 1, rev_read_seq.end());
835 gpertea 154
836 gpertea 29 rev_read_quals = read_quals;
837     reverse(rev_read_quals.begin(), rev_read_quals.end());
838     }
839    
840 gpertea 154 size_t num_fusions = prev_hit->fusion_opcode() == FUSION_NOTHING ? 0 : 1;
841     bool fusion_passed = false;
842 gpertea 29 while (curr_hit != hit_chain.end() && prev_hit != hit_chain.end())
843     {
844 gpertea 154 if (prev_hit->ref_id() != prev_hit->ref_id2() || prev_hit->ref_id2() != curr_hit->ref_id())
845     fusion_passed = true;
846 gpertea 29
847 gpertea 154 if (prev_hit->ref_id2() != curr_hit->ref_id())
848     ++num_fusions;
849    
850     if (curr_hit->fusion_opcode() != FUSION_NOTHING)
851     ++num_fusions;
852    
853     if (prev_hit->ref_id2() == curr_hit->ref_id())
854     {
855     bool reversed = false;
856     if ((fusion_dir == FUSION_FR && fusion_passed) || (fusion_dir == FUSION_RF && !fusion_passed))
857     reversed = true;
858    
859     /*
860     * Note that the gap size may be negative, since left() and right() return
861     * signed integers, this will be OK.
862     */
863     int gap;
864     if (reversed)
865     gap = prev_hit->right() - curr_hit->left();
866     else
867     gap = curr_hit->left() - prev_hit->right();
868    
869     // daehwan
870     if (bDebug)
871     {
872     cout << "curr left: " << curr_hit->left() << endl
873     << "prev right: " << prev_hit->right() << endl
874     << "gap: " << gap << endl;
875     }
876    
877     if (gap < -(int)max_insertion_length ||
878     (gap > (int)max_deletion_length &&
879     (gap < min_report_intron_length || gap > min(max_report_intron_length, (int)fusion_min_dist))))
880     {
881     fusion_passed = true;
882     ++num_fusions;
883     }
884     }
885    
886     if (num_fusions >= 2)
887     return BowtieHit();
888    
889 gpertea 29 ++prev_hit;
890     ++curr_hit;
891     }
892 gpertea 154
893 gpertea 29 prev_hit = hit_chain.begin();
894     curr_hit = ++(hit_chain.begin());
895    
896 gpertea 154 // daehwan
897     if (bDebug)
898     {
899     cout << "daehwan - test" << endl;
900     }
901    
902 gpertea 29 int curr_seg_index = 1;
903 gpertea 154 fusion_passed = false;
904 gpertea 29 while (curr_hit != hit_chain.end() && prev_hit != hit_chain.end())
905     {
906 gpertea 154 // daehwan
907     if (bDebug)
908     {
909     cout << "daehwan - start - stitch" << endl;
910     cout << "prev right: " << prev_hit->right() << endl;
911     cout << "curr left: " << curr_hit->left() << endl;
912     cout << "prev back: " << prev_hit->cigar().back().opcode << endl;
913     cout << "curr front: " << curr_hit->cigar().front().opcode << endl;
914     cout << "prev refs: " << prev_hit->ref_id() << "-" << prev_hit->ref_id2() << endl;
915     cout << "curr refs: " << curr_hit->ref_id() << "-" << curr_hit->ref_id2() << endl;
916     }
917    
918     if (prev_hit->fusion_opcode() != FUSION_NOTHING || prev_hit->ref_id2() != curr_hit->ref_id())
919     fusion_passed = true;
920    
921 gpertea 29 /*
922     * This code is assuming that the cigar strings end and start with a match
923     * While segment alignments can actually end with a junction, insertion or deletion, the hope
924     * is that in those cases, the right and left ends of the alignments will correctly
925     * line up, so we won't get to this bit of code
926     */
927 gpertea 154 if (!(prev_hit->cigar().back().opcode == MATCH || curr_hit->cigar().front().opcode == MATCH ||
928     prev_hit->cigar().back().opcode == mATCH || curr_hit->cigar().front().opcode == mATCH))
929     {
930     return BowtieHit();
931     }
932 gpertea 29
933 gpertea 154 // daehwan
934     if (bDebug)
935     {
936     cout << "daehwan - pass - enough matched bases" << endl;
937     }
938    
939 gpertea 29 if (prev_hit->is_spliced() && curr_hit->is_spliced() && prev_hit->antisense_splice() != curr_hit->antisense_splice())
940 gpertea 154 {
941     /*
942     * There is no way that we can splice these together into a valid
943     * alignment
944     */
945     return BowtieHit();
946     }
947 gpertea 29
948     bool found_closure = false;
949     /*
950     * Note that antisense_splice is the same for prev and curr
951     */
952     bool antisense_closure = prev_hit->is_spliced() ? prev_hit->antisense_splice() : curr_hit->antisense_splice();
953     vector<CigarOp> new_cigar;
954     int new_left = -1;
955     int mismatch = 0;
956 gpertea 154
957 gpertea 29 /*
958     * Take the length of matched bases in each segment into consideration for closures,
959     * this can be a problem for reads of variable lengths.
960     */
961     int prev_right_end_match_length = prev_hit->cigar().back().length;
962     int curr_left_end_match_length = curr_hit->cigar().front().length;
963    
964 gpertea 154 bool check_fusion = prev_hit->ref_id2() != curr_hit->ref_id();
965 gpertea 29
966 gpertea 154 if (prev_hit->ref_id2() == curr_hit->ref_id())
967     {
968     // daehwan
969     if (bDebug)
970     {
971     cout << "daehwan - start - junction or insertion" << endl;
972     cout << "prev right: " << prev_hit->right() << endl;
973     cout << "curr left: " << curr_hit->left() << endl;
974     cout << "prev refs: " << prev_hit->ref_id() << "-" << prev_hit->ref_id2() << endl;
975     cout << "curr refs: " << curr_hit->ref_id() << "-" << curr_hit->ref_id2() << endl;
976     }
977 gpertea 29
978 gpertea 154 bool reversed = false;
979     if ((fusion_dir == FUSION_FR && fusion_passed) || (fusion_dir == FUSION_RF && !fusion_passed))
980     reversed = true;
981 gpertea 29
982 gpertea 154 uint32_t reference_id = prev_hit->ref_id2();
983     RefSequenceTable::Sequence* ref_str = rt.get_seq(reference_id);
984    
985     int left_boundary, right_boundary;
986     if (reversed)
987     {
988     left_boundary = curr_hit->left() - 4;
989     right_boundary = prev_hit->right() + 4;
990     }
991     else
992     {
993     left_boundary = prev_hit->right() - 4;
994     right_boundary = curr_hit->left() + 4;
995     }
996 gpertea 29
997 gpertea 154 int dist_btw_two;
998     if (reversed)
999     dist_btw_two = prev_hit->right() - curr_hit->left();
1000     else
1001     dist_btw_two = curr_hit->left() - prev_hit->right();
1002 gpertea 29
1003 gpertea 154 if (dist_btw_two < 0 && dist_btw_two >= -max_insertion_length && prev_hit->antisense_align2() == curr_hit->antisense_align())
1004     {
1005     std::set<Insertion>::iterator lb, ub;
1006    
1007     /*
1008     * Create a dummy sequence to represent the maximum possible insertion
1009     */
1010     std::string maxInsertedSequence = "";
1011     maxInsertedSequence.resize(max_insertion_length,'A');
1012 gpertea 29
1013 gpertea 154 lb = possible_insertions.upper_bound(Insertion(reference_id, left_boundary, ""));
1014     ub = possible_insertions.upper_bound(Insertion(reference_id, right_boundary, maxInsertedSequence));
1015    
1016     int reference_mismatch = 0;
1017    
1018     while (lb != ub && lb != possible_insertions.end())
1019     {
1020     /*
1021     * In the following code, we will check to make sure that the segments have the proper
1022     * separation and sequence for the insertions, and generate the appropriate merged bowtie hit
1023     * In general, reads with insertions must match the inserted sequence exactly.
1024     */
1025     if (((int)lb->sequence.size()) == (reversed ? curr_hit->left() - prev_hit->right() : prev_hit->right() - curr_hit->left()))
1026     {
1027     /*
1028     * Check we have enough matched bases on prev or curr segment.
1029     */
1030     int insert_to_prev_right, curr_left_to_insert;
1031     if (reversed)
1032     {
1033     insert_to_prev_right = lb->left - prev_hit->right();
1034     curr_left_to_insert = curr_hit->left() - lb->left;
1035     }
1036     else
1037     {
1038     insert_to_prev_right = prev_hit->right() - lb->left - 1;
1039     curr_left_to_insert = lb->left - curr_hit->left() + 1;
1040     }
1041 gpertea 29
1042 gpertea 154 if (insert_to_prev_right > prev_right_end_match_length || curr_left_to_insert > curr_left_end_match_length)
1043     {
1044     ++lb;
1045     continue;
1046     }
1047 gpertea 29
1048 gpertea 154 // daehwan
1049     if (bDebug)
1050     {
1051     cout << "insert_to_prev_right: " << insert_to_prev_right << endl;
1052     cout << "curr_left_to_insert: " << curr_left_to_insert << endl;
1053     cout << "curr_seg_index: " << curr_seg_index << endl;
1054     }
1055    
1056     /*
1057     * Keep track of how many mismatches were made to the genome in the region
1058     * where we should actually be matching against the insertion
1059     */
1060     int this_reference_mismatch = 0;
1061     int insertion_mismatch = 0;
1062     int insertion_len = lb->sequence.length();
1063     seqan::Dna5String insertionSequence = seqan::Dna5String(lb->sequence);
1064     if (reversed)
1065     {
1066     seqan::convertInPlace(insertionSequence, seqan::FunctorComplement<seqan::Dna>());
1067     seqan::reverseInPlace(insertionSequence);
1068     }
1069    
1070     /*
1071     * First check to see if we need to adjust number of observed errors for the left (prev)
1072     * hit. This is only the case if this segment runs into the insertion. To be consistent
1073     * with bwt_map.cpp, we will not allow a segment to have errors in the insertion region
1074     */
1075     string colorSegmentSequence_prev;
1076     if (insert_to_prev_right > 0)
1077     {
1078     seqan::Dna5String referenceSequence, oldSegmentSequence;
1079 gpertea 29
1080 gpertea 154 if (reversed)
1081     {
1082     referenceSequence = seqan::infix(*ref_str, prev_hit->right() + 1, lb->left + 1);
1083     seqan::convertInPlace(referenceSequence, seqan::FunctorComplement<seqan::Dna>());
1084     seqan::reverseInPlace(referenceSequence);
1085 gpertea 29
1086 gpertea 154 string temp;
1087 gpertea 29
1088 gpertea 154 // daehwan
1089     if (bDebug)
1090     {
1091     cout << "reversed: " << read_seq.length() << " " << read_seq << endl;
1092     }
1093    
1094     temp = read_seq.substr(curr_seg_index * segment_length - insert_to_prev_right, insert_to_prev_right);
1095     oldSegmentSequence = seqan::Dna5String(temp);
1096     }
1097     else
1098     {
1099     referenceSequence = seqan::infix(*ref_str, lb->left + 1, prev_hit->right());
1100 gpertea 29
1101 gpertea 154 // daehwan
1102     if (bDebug)
1103     {
1104     cout << "non-reversed: " << prev_hit->seq() << endl;
1105     }
1106 gpertea 29
1107 gpertea 154 oldSegmentSequence = seqan::Dna5String(prev_hit->seq().substr(prev_hit->seq().length() - insert_to_prev_right));
1108     }
1109    
1110     if (color)
1111     {
1112     string color;
1113    
1114     if (antisense)
1115     color = rev_read_seq.substr(rev_read_seq.length() - (curr_seg_index * segment_length) - insert_to_prev_right - 2, insert_to_prev_right + 1);
1116     else
1117     color = read_seq.substr(curr_seg_index * segment_length - insert_to_prev_right, insert_to_prev_right + 1);
1118    
1119     color[0] = prev_hit->seq()[segment_length - insert_to_prev_right - 1];
1120     colorSegmentSequence_prev = convert_color_to_bp(color);
1121     }
1122    
1123     const seqan::Dna5String newSegmentSequence = color ? colorSegmentSequence_prev : oldSegmentSequence;
1124 gpertea 29
1125 gpertea 154 // daehwan
1126     if (bDebug)
1127     {
1128     cout << "ref: " << referenceSequence << endl;
1129     cout << "old: " << oldSegmentSequence << endl;
1130     cout << "ins: " << insertionSequence << endl;
1131     }
1132    
1133     /*
1134     * Scan right in the read until we run out of read
1135     */
1136     for (int read_index = 0; read_index < insert_to_prev_right; ++read_index)
1137     {
1138     /*
1139     * Any mismatch to the insertion is a failure
1140     */
1141     if (referenceSequence[read_index] == 'N' || referenceSequence[read_index] != oldSegmentSequence[read_index])
1142     {
1143     ++this_reference_mismatch;
1144     }
1145    
1146     if (read_index < insertion_len)
1147     {
1148     if (insertionSequence[read_index] == 'N' || insertionSequence[read_index] != newSegmentSequence[read_index])
1149     {
1150     ++insertion_mismatch;
1151     break;
1152     }
1153     }
1154     else
1155     {
1156     if (referenceSequence[read_index - insertion_len] == 'N' ||
1157     referenceSequence[read_index - insertion_len] != newSegmentSequence[read_index])
1158     {
1159     --this_reference_mismatch;
1160     }
1161     }
1162     }
1163     }
1164    
1165     string colorSegmentSequence_curr;
1166     if (curr_left_to_insert > 0)
1167     {
1168     seqan::Dna5String referenceSequence, oldSegmentSequence;
1169     if (reversed)
1170     {
1171     referenceSequence = seqan::infix(*ref_str, lb->left + 1, curr_hit->left() + 1);
1172     seqan::convertInPlace(referenceSequence, seqan::FunctorComplement<seqan::Dna>());
1173     seqan::reverseInPlace(referenceSequence);
1174 gpertea 29
1175 gpertea 154 string temp = read_seq.substr(curr_seg_index * segment_length, curr_left_to_insert);
1176     oldSegmentSequence = seqan::Dna5String(temp);
1177     }
1178     else
1179     {
1180     referenceSequence = seqan::infix(*ref_str, curr_hit->left(), lb->left + 1);
1181     oldSegmentSequence = seqan::Dna5String(curr_hit->seq().substr(0, curr_left_to_insert));
1182     }
1183    
1184     if (color)
1185     {
1186     string color;
1187     if (antisense)
1188     color = rev_read_seq.substr(rev_read_seq.length() - (curr_seg_index * segment_length), curr_left_to_insert);
1189     else
1190     color = read_seq.substr(curr_seg_index * segment_length + 2, curr_left_to_insert);
1191    
1192     color.push_back(curr_hit->seq()[curr_left_to_insert]);
1193     reverse(color.begin(), color.end());
1194     string bp = convert_color_to_bp(color);
1195     reverse(bp.begin(), bp.end());
1196     colorSegmentSequence_curr = bp;
1197     }
1198    
1199     const seqan::Dna5String newSegmentSequence = color ? colorSegmentSequence_curr : oldSegmentSequence;
1200 gpertea 29
1201 gpertea 154 // daehwan
1202     if (bDebug)
1203     {
1204     cout << "ref: " << referenceSequence << endl;
1205     cout << "old: " << oldSegmentSequence << endl;
1206     cout << "ins: " << insertionSequence << endl;
1207     }
1208    
1209     /*
1210     * Scan left in the read until
1211     * We ran out of read sequence (insertion extends past segment)
1212     */
1213     for (int read_index = 0; read_index < curr_left_to_insert; ++read_index)
1214     {
1215     int segmentPosition = curr_left_to_insert - read_index - 1;
1216     int insertionPosition = insertion_len - read_index - 1;
1217    
1218     if (referenceSequence[segmentPosition] == 'N' || (referenceSequence[segmentPosition] != oldSegmentSequence[segmentPosition]))
1219     {
1220     ++this_reference_mismatch;
1221     }
1222    
1223     if (read_index < insertion_len)
1224     {
1225     if (insertionSequence[insertionPosition] == 'N' || (insertionSequence[insertionPosition] != newSegmentSequence[segmentPosition]))
1226     {
1227     ++insertion_mismatch;
1228     break;
1229     }
1230     }
1231     else
1232     {
1233     if (referenceSequence[segmentPosition + insertion_len] == 'N' ||
1234     (referenceSequence[segmentPosition + insertion_len] != newSegmentSequence[segmentPosition]))
1235     {
1236     --this_reference_mismatch;
1237     }
1238     }
1239     }
1240     }
1241    
1242     if (found_closure)
1243     {
1244     // fprintf(stderr, "Warning: multiple closures found for insertion read # %d\n", (int)insert_id);
1245     return BowtieHit();
1246     }
1247 gpertea 29
1248 gpertea 154 if (insertion_mismatch == 0)
1249     {
1250     reference_mismatch = this_reference_mismatch;
1251     mismatch = -reference_mismatch;
1252     found_closure = true;
1253     new_left = prev_hit->left();
1254     new_cigar = prev_hit->cigar();
1255    
1256     /*
1257     * Need to make a new insert operation between the two match character that begin
1258     * and end the intersection of these two junction. Note that we necessarily assume
1259     * that this insertion can't span beyond the boundaries of these reads. That should
1260     * probably be better enforced somewhere
1261     */
1262 gpertea 29
1263 gpertea 154 new_cigar.back().length -= insert_to_prev_right;
1264    
1265     if (new_cigar.back().length <= 0)
1266     new_cigar.pop_back();
1267 gpertea 29
1268 gpertea 154 if (reversed)
1269     new_cigar.push_back(CigarOp(iNS, lb->sequence.size()));
1270     else
1271     new_cigar.push_back(CigarOp(INS, lb->sequence.size()));
1272    
1273     vector<CigarOp> new_right_cigar = curr_hit->cigar();
1274     new_right_cigar.front().length += (insert_to_prev_right - lb->sequence.size());
1275    
1276     /*
1277     * Finish stitching together the new cigar string
1278     */
1279     size_t c = new_right_cigar.front().length > 0 ? 0 : 1;
1280     for (; c < new_right_cigar.size(); ++c)
1281     {
1282     new_cigar.push_back(new_right_cigar[c]);
1283     }
1284    
1285     if (color)
1286     {
1287     if (insert_to_prev_right > 0)
1288     seq.replace(first_seg_length + (curr_seg_index - 1) * segment_length - insert_to_prev_right, insert_to_prev_right, colorSegmentSequence_prev);
1289    
1290     if (curr_left_to_insert > 0)
1291     seq.replace(first_seg_length + (curr_seg_index - 1) * segment_length, curr_left_to_insert, colorSegmentSequence_curr);
1292     }
1293     }
1294     }
1295     ++lb;
1296     }
1297    
1298     if (!found_closure)
1299     {
1300     return BowtieHit();
1301     }
1302     }
1303 gpertea 29
1304 gpertea 154 /*
1305     * Stitch segments together using juctions or deletions if necessary.
1306     */
1307     else if (dist_btw_two > 0 && dist_btw_two <= max_report_intron_length && prev_hit->antisense_align2() == curr_hit->antisense_align())
1308     {
1309     std::set<Junction>::iterator lb, ub;
1310 gpertea 29
1311 gpertea 154 // daehwan
1312     if (bDebug)
1313     {
1314     cout << "junction" << endl;
1315     cout << "min: " << left_boundary << "-" << right_boundary - 8 << endl;
1316     cout << "max: " << left_boundary + 8 << "-" << right_boundary << endl;
1317     }
1318 gpertea 29
1319 gpertea 154 lb = possible_juncs.upper_bound(Junction(reference_id, left_boundary, right_boundary - 8, true));
1320     ub = possible_juncs.lower_bound(Junction(reference_id, left_boundary + 8, right_boundary, false));
1321    
1322     int new_diff_mismatches = 0xff;
1323     while (lb != ub && lb != possible_juncs.end())
1324     {
1325     int dist_to_left, dist_to_right;
1326     if (reversed)
1327     {
1328     dist_to_left = lb->left - curr_hit->left();
1329     dist_to_right = lb->right - prev_hit->right() - 1;
1330     }
1331     else
1332     {
1333     dist_to_left = lb->left - prev_hit->right() + 1;
1334     dist_to_right = lb->right - curr_hit->left();
1335     }
1336    
1337     if (abs(dist_to_left) <= 4 && abs(dist_to_right) <= 4 && dist_to_left == dist_to_right)
1338     {
1339     /*
1340     * Check we have enough matched bases on prev or curr segment.
1341     */
1342     if ((reversed && (dist_to_left > prev_right_end_match_length || -dist_to_left > curr_left_end_match_length)) ||
1343     (!reversed && (dist_to_left > curr_left_end_match_length || -dist_to_left > prev_right_end_match_length)))
1344     {
1345     ++lb;
1346     continue;
1347     }
1348 gpertea 29
1349 gpertea 154 // daehwan
1350     if (bDebug)
1351     {
1352     cout << "candidate junction: " << endl;
1353     cout << "coords: " << lb->left << "-" << lb->right << endl;
1354     cout << "dist to left: " << dist_to_left << endl;
1355     }
1356    
1357     Dna5String new_cmp_str, old_cmp_str;
1358     int new_mismatch = 0, old_mismatch = 0;
1359     string new_patch_str; // this is for colorspace reads
1360     if (dist_to_left > 0)
1361     {
1362     if (reversed)
1363     {
1364     new_cmp_str = seqan::infix(*ref_str, curr_hit->left() + 1, lb->left + 1);
1365     seqan::convertInPlace(new_cmp_str, seqan::FunctorComplement<seqan::Dna>());
1366     seqan::reverseInPlace(new_cmp_str);
1367    
1368     old_cmp_str = seqan::infix(*ref_str, prev_hit->right() + 1, lb->right);
1369     seqan::convertInPlace(old_cmp_str, seqan::FunctorComplement<seqan::Dna>());
1370     seqan::reverseInPlace(old_cmp_str);
1371     }
1372     else
1373     {
1374     new_cmp_str = seqan::infix(*ref_str, prev_hit->right(), lb->left + 1);
1375     old_cmp_str = seqan::infix(*ref_str, curr_hit->left(), lb->right);
1376     }
1377    
1378     string new_seq;
1379     if (color)
1380     {
1381     string ref = DnaString_to_string(seqan::infix(*ref_str, prev_hit->right() - 1, lb->left + 1));
1382    
1383     string color, qual;
1384     if (antisense)
1385     {
1386     color = rev_read_seq.substr(rev_read_seq.length() - (curr_seg_index * segment_length) - 1, dist_to_left);
1387     qual = rev_read_quals.substr(rev_read_quals.length() - (curr_seg_index * segment_length) - 1, dist_to_left);
1388     }
1389     else
1390     {
1391     color = read_seq.substr(1 + curr_seg_index * segment_length, dist_to_left);
1392     qual = read_quals.substr(curr_seg_index * segment_length, dist_to_left);
1393     }
1394    
1395     BWA_decode(color, qual, ref, new_seq);
1396     new_seq = new_seq.substr(1);
1397     }
1398 gpertea 29
1399 gpertea 154 string curr_hit_seq;
1400     if (reversed)
1401     curr_hit_seq = read_seq.substr(curr_seg_index * segment_length - dist_to_left, dist_to_left);
1402     else
1403     curr_hit_seq = curr_hit->seq();
1404    
1405     const string& curr_old_seq = curr_hit_seq;
1406     const string& curr_seq = color ? new_seq : curr_hit_seq;
1407     for (int i = 0; i < dist_to_left; ++i)
1408     {
1409     if (curr_seq[i] != new_cmp_str[i])
1410     ++new_mismatch;
1411    
1412     if (curr_old_seq[i] != old_cmp_str[i])
1413     ++old_mismatch;
1414     }
1415    
1416     if (color)
1417     new_patch_str = curr_seq.substr(0, dist_to_left);
1418     }
1419     else if (dist_to_left < 0)
1420     {
1421     if (reversed)
1422     {
1423     new_cmp_str = seqan::infix(*ref_str, lb->right, prev_hit->right() + 1);
1424     seqan::convertInPlace(new_cmp_str, seqan::FunctorComplement<seqan::Dna>());
1425     seqan::reverseInPlace(new_cmp_str);
1426 gpertea 29
1427 gpertea 154 old_cmp_str = seqan::infix(*ref_str, lb->left + 1, curr_hit->left() + 1);
1428     seqan::convertInPlace(old_cmp_str, seqan::FunctorComplement<seqan::Dna>());
1429     seqan::reverseInPlace(old_cmp_str);
1430     }
1431     else
1432     {
1433     new_cmp_str = seqan::infix(*ref_str, lb->right, curr_hit->left());
1434     old_cmp_str = seqan::infix(*ref_str, lb->left + 1, prev_hit->right());
1435     }
1436 gpertea 29
1437 gpertea 154 size_t abs_dist = -dist_to_left;
1438     string new_seq;
1439     if (color)
1440     {
1441     string ref = DnaString_to_string(seqan::infix(*ref_str, lb->left, lb->left + 1));
1442     ref += DnaString_to_string(seqan::infix(*ref_str, lb->right, curr_hit->left()));
1443    
1444     string color, qual;
1445     if (antisense)
1446     {
1447     color = rev_read_seq.substr(rev_read_seq.length() - (curr_seg_index * segment_length) - 1 - abs_dist, abs_dist);
1448     qual = rev_read_quals.substr(rev_read_quals.length() - (curr_seg_index * segment_length) - 1 - abs_dist, abs_dist);
1449     }
1450     else
1451     {
1452     color = read_seq.substr(1 + curr_seg_index * segment_length - abs_dist, abs_dist);
1453     qual = read_quals.substr(curr_seg_index * segment_length - abs_dist, abs_dist);
1454     }
1455    
1456     BWA_decode(color, qual, ref, new_seq);
1457     new_seq = new_seq.substr(1);
1458     }
1459 gpertea 29
1460 gpertea 154 string prev_hit_seq;
1461     if (reversed)
1462     prev_hit_seq = read_seq.substr(curr_seg_index * segment_length, abs_dist);
1463     else
1464     prev_hit_seq = prev_hit->seq();
1465 gpertea 29
1466 gpertea 154 // daehwan
1467     if (bDebug)
1468     {
1469     cout << "reverse: " << (int)reversed << endl;
1470     cout << "new cmp str: " << new_cmp_str << endl;
1471     cout << "old cmp str: " << old_cmp_str << endl;
1472     cout << "hit seq: " << prev_hit_seq << endl;
1473     cout << "curr seq: " << curr_hit->seq() << endl;
1474 gpertea 29
1475 gpertea 154 cout << read_seq
1476     << endl;
1477     cout << read_seq.substr(first_seg_length + (curr_seg_index - 1) * segment_length, segment_length)
1478     << endl;
1479     }
1480    
1481     const string& prev_old_seq = prev_hit_seq;
1482     size_t prev_old_seq_len = prev_old_seq.length();
1483     const string& prev_seq = color ? new_seq : prev_hit_seq;
1484     size_t prev_seq_len = prev_seq.length();
1485     for (size_t i = 0; i < abs_dist; ++i)
1486     {
1487     if (prev_seq[prev_seq_len - (abs_dist - i)] != new_cmp_str[i])
1488     ++new_mismatch;
1489     if (prev_old_seq[prev_old_seq_len - (abs_dist - i)] != old_cmp_str[i])
1490     ++old_mismatch;
1491     }
1492 gpertea 29
1493 gpertea 154 if (color)
1494     new_patch_str = prev_seq.substr(prev_seq_len - abs_dist, abs_dist);
1495     }
1496    
1497     int temp_diff_mismatches = new_mismatch - old_mismatch;
1498 gpertea 29
1499 gpertea 154 // daehwan
1500     if (bDebug)
1501     {
1502     cout << "new mismatch: " << new_mismatch << endl;
1503     cout << "old mismatch: " << old_mismatch << endl;
1504     cout << "new_diff_mismatch: " << new_diff_mismatches << endl;
1505     cout << "temp mismatch: " << temp_diff_mismatches << endl;
1506     }
1507    
1508     if (temp_diff_mismatches >= new_diff_mismatches || new_mismatch >= 2)
1509     {
1510     ++lb;
1511     continue;
1512     }
1513 gpertea 29
1514 gpertea 154 if (color)
1515     {
1516     /*
1517     * We need to recover the origianl sequence.
1518     */
1519     if (found_closure)
1520     {
1521     seq.replace(first_seg_length + (curr_seg_index - 1) * segment_length - 4, 8,
1522     prev_hit->seq().substr(prev_hit->seq().length() - 4) + curr_hit->seq().substr(0, 4));
1523     }
1524    
1525     if (dist_to_left > 0)
1526     seq.replace(first_seg_length + (curr_seg_index - 1) * segment_length, dist_to_left, new_patch_str);
1527     else if (dist_to_left < 0)
1528     seq.replace(first_seg_length + (curr_seg_index - 1) * segment_length + dist_to_left, -dist_to_left, new_patch_str);
1529     }
1530    
1531     new_diff_mismatches = temp_diff_mismatches;
1532    
1533     new_left = prev_hit->left();
1534     new_cigar = prev_hit->cigar();
1535    
1536     int new_left_back_len = new_cigar.back().length;
1537     if (reversed)
1538     new_left_back_len -= dist_to_left;
1539     else
1540     new_left_back_len += dist_to_left;
1541    
1542     vector<CigarOp> new_right_cig = curr_hit->cigar();
1543     int new_right_front_len = new_right_cig.front().length;
1544     if (reversed)
1545     new_right_front_len += dist_to_right;
1546     else
1547     new_right_front_len -= dist_to_right;
1548     if (new_left_back_len > 0)
1549     new_cigar.back().length = new_left_back_len;
1550     else
1551     new_cigar.pop_back();
1552    
1553     /*
1554     * FIXME, currently just differentiating between a deletion and a
1555     * reference skip based on length. However, would probably be better
1556     * to denote the difference explicitly, this would allow the user
1557     * to supply their own (very large) deletions
1558     */
1559     if ((lb->right - lb->left - 1) <= max_deletion_length)
1560     {
1561     if (reversed)
1562     new_cigar.push_back(CigarOp(dEL, lb->right - lb->left - 1));
1563     else
1564     new_cigar.push_back(CigarOp(DEL, lb->right - lb->left - 1));
1565     antisense_closure = prev_hit->is_spliced() ? prev_hit->antisense_splice() : curr_hit->antisense_splice();
1566     }
1567     else
1568     {
1569     if (reversed)
1570     new_cigar.push_back(CigarOp(rEF_SKIP, lb->right - lb->left - 1));
1571     else
1572     new_cigar.push_back(CigarOp(REF_SKIP, lb->right - lb->left - 1));
1573     antisense_closure = lb->antisense;
1574     }
1575    
1576     new_right_cig.front().length = new_right_front_len;
1577     size_t c = new_right_front_len > 0 ? 0 : 1;
1578     for (; c < new_right_cig.size(); ++c)
1579     new_cigar.push_back(new_right_cig[c]);
1580    
1581     mismatch = new_diff_mismatches;
1582     found_closure = true;
1583     }
1584     ++lb;
1585     }
1586 gpertea 29
1587 gpertea 154 if (!found_closure)
1588     {
1589     return BowtieHit();
1590     }
1591     }
1592     else if (!(dist_btw_two == 0 && prev_hit->antisense_align2() == curr_hit->antisense_align()))
1593     check_fusion = true;
1594     }
1595 gpertea 29
1596 gpertea 154 if (check_fusion)
1597     {
1598     std::set<Fusion>::iterator lb, ub;
1599 gpertea 29
1600 gpertea 154 uint32_t ref_id1 = prev_hit->ref_id2();
1601     uint32_t ref_id2 = curr_hit->ref_id();
1602     uint32_t left = prev_hit->right() - 4;
1603     uint32_t right = curr_hit->left() - 4;
1604 gpertea 29
1605 gpertea 154 // daehwan
1606     if (bDebug)
1607     {
1608     cout << "daehwan - start - fusion" << endl
1609     << "ref_id1: " << ref_id1 << endl
1610     << "ref_id2: " << ref_id2 << endl
1611     << "left: " << left << endl
1612     << "right: " << right << endl
1613     << "dir: " << fusion_dir << endl;
1614     }
1615 gpertea 29
1616 gpertea 154 bool reversed = false;
1617     if (fusion_dir != FUSION_FF &&
1618     (ref_id2 < ref_id1 || (ref_id1 == ref_id2 && left > right)))
1619     {
1620     reversed = true;
1621    
1622     uint32_t temp = ref_id1;
1623     ref_id1 = ref_id2;
1624     ref_id2 = temp;
1625    
1626     temp = left;
1627     left = right;
1628     right = temp;
1629     }
1630    
1631     lb = possible_fusions.upper_bound(Fusion(ref_id1, ref_id2, left, right));
1632     ub = possible_fusions.lower_bound(Fusion(ref_id1, ref_id2, left + 8, right + 8));
1633 gpertea 29
1634 gpertea 154 RefSequenceTable::Sequence* ref_str = rt.get_seq(prev_hit->ref_id2());
1635     RefSequenceTable::Sequence* ref_str2 = rt.get_seq(curr_hit->ref_id());
1636    
1637     int new_diff_mismatches = 0xff;
1638     while (lb != ub && lb != possible_fusions.end())
1639     {
1640     int lb_left = lb->left;
1641     int lb_right = lb->right;
1642    
1643     if (reversed)
1644     {
1645     lb_left = lb->right;
1646     lb_right = lb->left;
1647     }
1648 gpertea 29
1649 gpertea 154 int dist_to_left, dist_to_right;
1650     if (fusion_dir == FUSION_RF)
1651     dist_to_left = prev_hit->right() - lb_left + 1;
1652     else
1653     dist_to_left = lb_left - prev_hit->right() + 1;
1654    
1655     if (fusion_dir == FUSION_FR)
1656     dist_to_right = curr_hit->left() - lb_right;
1657     else
1658     dist_to_right = lb_right - curr_hit->left();
1659 gpertea 29
1660 gpertea 154 // daehwan
1661     if (bDebug)
1662     {
1663     cout << "daehwan - fusion gap" << endl;
1664     cout << "dist left: " << dist_to_left << endl;
1665     cout << "dist right: " << dist_to_right << endl;
1666     }
1667    
1668     if (abs(dist_to_left) <= 4 && abs(dist_to_right) <= 4 && dist_to_left == dist_to_right)
1669     {
1670     /*
1671     * Check we have enough matched bases on prev or curr segment.
1672     */
1673     if (dist_to_left > curr_left_end_match_length || -dist_to_left > prev_right_end_match_length)
1674     {
1675     ++lb;
1676     continue;
1677     }
1678 gpertea 29
1679 gpertea 154 Dna5String new_cmp_str, old_cmp_str;
1680     int new_mismatch = 0, old_mismatch = 0;
1681     string new_patch_str; // this is for colorspace reads
1682     if (dist_to_left > 0)
1683     {
1684     if (fusion_dir == FUSION_RF)
1685     {
1686     new_cmp_str = seqan::infix(*ref_str, lb_left, prev_hit->right() + 1);
1687     seqan::convertInPlace(new_cmp_str, seqan::FunctorComplement<seqan::Dna>());
1688     seqan::reverseInPlace(new_cmp_str);
1689     }
1690     else
1691     new_cmp_str = seqan::infix(*ref_str, prev_hit->right(), lb_left + 1);
1692 gpertea 29
1693 gpertea 154 if (fusion_dir == FUSION_FR)
1694     {
1695     old_cmp_str = seqan::infix(*ref_str2, lb_right + 1, curr_hit->left() + 1);
1696     seqan::convertInPlace(old_cmp_str, seqan::FunctorComplement<seqan::Dna>());
1697     seqan::reverseInPlace(old_cmp_str);
1698     }
1699     else
1700     old_cmp_str = seqan::infix(*ref_str2, curr_hit->left(), lb_right);
1701 gpertea 29
1702 gpertea 154 // daehwan
1703     if (bDebug)
1704     {
1705     cout << "new str: " << new_cmp_str << endl;
1706     cout << "old str: " << old_cmp_str << endl;
1707     cout << "curr seq: " << curr_hit->seq() << endl;
1708     }
1709 gpertea 29
1710 gpertea 154 string curr_hit_seq;
1711     if (fusion_dir == FUSION_FF || fusion_dir == FUSION_RR)
1712     curr_hit_seq = curr_hit->seq();
1713     else
1714     curr_hit_seq = read_seq.substr(curr_seg_index * segment_length, segment_length);
1715    
1716     string new_seq;
1717     const string& curr_old_seq = curr_hit_seq;
1718     const string& curr_seq = color ? new_seq : curr_hit_seq;
1719     for (int i = 0; i < dist_to_left; ++i)
1720     {
1721     if (curr_seq[i] != new_cmp_str[i])
1722     ++new_mismatch;
1723    
1724     if (curr_old_seq[i] != old_cmp_str[i])
1725     ++old_mismatch;
1726     }
1727     }
1728     else if (dist_to_left < 0)
1729     {
1730     if (fusion_dir == FUSION_FR)
1731     {
1732     new_cmp_str = seqan::infix(*ref_str2, curr_hit->left() + 1, lb_right + 1);
1733     seqan::convertInPlace(new_cmp_str, seqan::FunctorComplement<seqan::Dna>());
1734     seqan::reverseInPlace(new_cmp_str);
1735     }
1736     else
1737     new_cmp_str = seqan::infix(*ref_str2, lb_right, curr_hit->left());
1738 gpertea 29
1739 gpertea 154 if (fusion_dir == FUSION_RF)
1740     {
1741     old_cmp_str = seqan::infix(*ref_str, prev_hit->right() + 1, lb_left);
1742     seqan::convertInPlace(old_cmp_str, seqan::FunctorComplement<seqan::Dna>());
1743     seqan::reverseInPlace(old_cmp_str);
1744     }
1745     else
1746     old_cmp_str = seqan::infix(*ref_str, lb_left + 1, prev_hit->right());
1747 gpertea 29
1748 gpertea 154 string prev_hit_seq;
1749     if (fusion_dir == FUSION_FF || fusion_dir == FUSION_RR)
1750     prev_hit_seq = prev_hit->seq();
1751     else
1752     prev_hit_seq = read_seq.substr((curr_seg_index - 1) * segment_length, segment_length);
1753 gpertea 29
1754 gpertea 154 size_t abs_dist = -dist_to_left;
1755     string new_seq;
1756    
1757     const string& prev_old_seq = prev_hit_seq;
1758     size_t prev_old_seq_len = prev_old_seq.length();
1759     const string& prev_seq = color ? new_seq : prev_hit_seq;
1760     size_t prev_seq_len = prev_seq.length();
1761     for (size_t i = 0; i < abs_dist; ++i)
1762     {
1763     if (prev_seq[prev_seq_len - (abs_dist - i)] != new_cmp_str[i])
1764     ++new_mismatch;
1765     if (prev_old_seq[prev_old_seq_len - (abs_dist - i)] != old_cmp_str[i])
1766     ++old_mismatch;
1767     }
1768     }
1769 gpertea 29
1770 gpertea 154 int temp_diff_mismatches = new_mismatch - old_mismatch;
1771     if (temp_diff_mismatches >= new_diff_mismatches || new_mismatch >= 2)
1772     {
1773     ++lb;
1774     continue;
1775     }
1776     new_diff_mismatches = temp_diff_mismatches;
1777 gpertea 29
1778 gpertea 154 new_left = prev_hit->left();
1779     new_cigar = prev_hit->cigar();
1780    
1781     int new_left_back_len = new_cigar.back().length;
1782     new_left_back_len += dist_to_left;
1783    
1784     vector<CigarOp> new_right_cig = curr_hit->cigar();
1785     int new_right_front_len = new_right_cig.front().length;
1786     new_right_front_len -= dist_to_right;
1787     if (new_left_back_len > 0)
1788     new_cigar.back().length = new_left_back_len;
1789     else
1790     new_cigar.pop_back();
1791 gpertea 29
1792 gpertea 154 new_cigar.push_back(CigarOp((CigarOpCode)fusion_dir, lb_right));
1793     antisense_closure = prev_hit->is_spliced() ? prev_hit->antisense_splice() : curr_hit->antisense_splice();
1794    
1795     new_right_cig.front().length = new_right_front_len;
1796     size_t c = new_right_front_len > 0 ? 0 : 1;
1797     for (; c < new_right_cig.size(); ++c)
1798     new_cigar.push_back(new_right_cig[c]);
1799    
1800     mismatch = new_diff_mismatches;
1801     found_closure = true;
1802     ++num_fusions;
1803 gpertea 29
1804 gpertea 154 // daehwan
1805     if (bDebug)
1806     {
1807     cout << "daehwan - fusion gap - found" << endl;
1808     }
1809 gpertea 29
1810 gpertea 154 }
1811     ++lb;
1812     }
1813 gpertea 29
1814 gpertea 154 // daehwan
1815     if (bDebug)
1816     {
1817     cout << "daehwan2 - end - fusion: " << (found_closure ? "found" : "not found") << endl;
1818     }
1819    
1820     if (!found_closure)
1821     {
1822     return BowtieHit();
1823     }
1824     }
1825 gpertea 29
1826 gpertea 154 if (found_closure)
1827     {
1828     bool end = false;
1829     BowtieHit merged_hit(prev_hit->ref_id(),
1830     curr_hit->ref_id2(),
1831     insert_id,
1832     new_left,
1833     new_cigar,
1834     antisense,
1835     antisense_closure,
1836     prev_hit->edit_dist() + curr_hit->edit_dist() + mismatch,
1837     prev_hit->splice_mms() + curr_hit->splice_mms(),
1838     end);
1839 gpertea 29
1840 gpertea 154 // daehwan - should fix this for SOLiD dataset
1841     merged_hit.seq(prev_hit->seq() + curr_hit->seq());
1842 gpertea 29
1843 gpertea 154 // daehwan
1844     if (bDebug)
1845     {
1846     cout << "fusing of " << merged_hit.left() << " and " << merged_hit.right() << endl;
1847     cout << print_cigar(merged_hit.cigar()) << endl;
1848     if (!merged_hit.check_editdist_consistency(rt))
1849     {
1850     cout << "btw " << print_cigar(prev_hit->cigar()) << " and " << print_cigar(curr_hit->cigar()) << endl;
1851     cout << "this is malformed hit" << endl;
1852     }
1853     }
1854 gpertea 29
1855 gpertea 154 prev_hit = hit_chain.erase(prev_hit, ++curr_hit);
1856     /*
1857     * prev_hit now points PAST the last element removed
1858     */
1859     prev_hit = hit_chain.insert(prev_hit, merged_hit);
1860     /*
1861     * merged_hit has been inserted before the old position of
1862     * prev_hit. New location of prev_hit is merged_hit
1863     */
1864     curr_hit = prev_hit;
1865     ++curr_hit;
1866     ++curr_seg_index;
1867     continue;
1868     }
1869 gpertea 29
1870 gpertea 154 // daehwan
1871     if (bDebug)
1872     {
1873     cout << "daehwan - test 0.3" << endl;
1874     }
1875    
1876 gpertea 29 ++prev_hit;
1877     ++curr_hit;
1878     ++curr_seg_index;
1879     }
1880    
1881 gpertea 154 // daehwan
1882     if (bDebug)
1883     {
1884     cout << "daehwan - test2" << endl;
1885     }
1886    
1887 gpertea 29 bool saw_antisense_splice = false;
1888     bool saw_sense_splice = false;
1889     vector<CigarOp> long_cigar;
1890     int num_mismatches = 0;
1891     int num_splice_mms = 0;
1892     for (list<BowtieHit>::iterator s = hit_chain.begin(); s != hit_chain.end(); ++s)
1893     {
1894     num_mismatches += s->edit_dist();
1895     num_splice_mms += s->splice_mms();
1896    
1897     /*
1898     * Check whether the sequence contains any reference skips. Previously,
1899     * this was just a check to see whether the sequence was contiguous; however
1900     * we don't want to count an indel event as a splice
1901     */
1902     bool containsSplice = s->is_spliced();
1903     if (containsSplice)
1904 gpertea 154 {
1905     if (s->antisense_splice())
1906     {
1907     if (saw_sense_splice)
1908     return BowtieHit();
1909     saw_antisense_splice = true;
1910     }
1911     else
1912     {
1913     if (saw_antisense_splice)
1914     return BowtieHit();
1915     saw_sense_splice = true;
1916     }
1917     }
1918 gpertea 29 const vector<CigarOp>& cigar = s->cigar();
1919     if (long_cigar.empty())
1920 gpertea 154 {
1921     long_cigar = cigar;
1922     }
1923 gpertea 29 else
1924 gpertea 154 {
1925     CigarOp& last = long_cigar.back();
1926     /*
1927     * If necessary, merge the back and front
1928     * cigar operations
1929     */
1930     if(last.opcode == cigar[0].opcode){
1931     last.length += cigar[0].length;
1932     for (size_t b = 1; b < cigar.size(); ++b)
1933     {
1934     long_cigar.push_back(cigar[b]);
1935     }
1936     }else{
1937     for(size_t b = 0; b < cigar.size(); ++b)
1938     {
1939     long_cigar.push_back(cigar[b]);
1940     }
1941     }
1942     }
1943 gpertea 29 }
1944    
1945     bool end = false;
1946 gpertea 154 BowtieHit new_hit(hit_chain.front().ref_id(),
1947     hit_chain.back().ref_id2(),
1948     insert_id,
1949     left,
1950     long_cigar,
1951     antisense,
1952     saw_antisense_splice,
1953     num_mismatches,
1954     num_splice_mms,
1955     end);
1956 gpertea 29
1957 gpertea 154 if (fusion_dir == FUSION_NOTHING || fusion_dir == FUSION_FF || fusion_dir == FUSION_RR)
1958     {
1959     new_hit.seq(seq);
1960 gpertea 159 if (bowtie2)
1961     {
1962     // for the time being, let's compare "seq" and "read_seq"
1963     if (seq != read_seq)
1964     {
1965     string temp_qual = read_quals;
1966     reverse(temp_qual.begin(), temp_qual.end());
1967     new_hit.qual(temp_qual);
1968     }
1969     else
1970     new_hit.qual(read_quals);
1971     }
1972     else
1973     new_hit.qual(qual);
1974 gpertea 154 }
1975     else
1976     {
1977     new_hit.seq(read_seq);
1978     new_hit.qual(read_quals);
1979     }
1980 gpertea 29
1981 gpertea 154 bool do_reverse = new_hit.ref_id() > new_hit.ref_id2();
1982     if (new_hit.ref_id() == new_hit.ref_id2())
1983     {
1984     vector<Fusion> fusions;
1985     bool auto_sort = false;
1986     fusions_from_spliced_hit(new_hit, fusions, auto_sort);
1987     if (fusions.size() > 0)
1988     {
1989     const Fusion& fusion = fusions[0];
1990     do_reverse = fusion.left > fusion.right;
1991     }
1992     }
1993    
1994     if (do_reverse)
1995     {
1996     new_hit = new_hit.reverse();
1997     }
1998    
1999     if (fusion_dir == FUSION_RF || fusion_dir == FUSION_RR)
2000     {
2001     new_hit.antisense_align(!new_hit.antisense_align());
2002     }
2003    
2004     // daehwan
2005     if (bDebug)
2006     {
2007     cout << "daehwan - test3" << endl;
2008     cout << new_hit.left() << " " << print_cigar(new_hit.cigar()) << endl;
2009     cout << new_hit.ref_id() << "-" << new_hit.ref_id2() << ": " << new_hit.fusion_opcode() << endl;
2010     }
2011    
2012 gpertea 29 int new_read_len = new_hit.read_len();
2013 gpertea 159 if (new_read_len != old_read_length || !new_hit.check_editdist_consistency(rt, bDebug))
2014 gpertea 29 {
2015 gpertea 154 // daehwan
2016     if (bDebug)
2017     {
2018     cout << "Warning: " << new_hit.insert_id() << " malformed closure: " << print_cigar(new_hit.cigar()) << endl;
2019     exit(1);
2020     }
2021    
2022     fprintf(stderr, "Warning: %d malformed closure\n", new_hit.insert_id());
2023 gpertea 29 return BowtieHit();
2024     }
2025    
2026     return new_hit;
2027     }
2028    
2029 gpertea 154
2030 gpertea 29 int multi_closure = 0;
2031     int anchor_too_short = 0;
2032     int gap_too_short = 0;
2033    
2034     bool valid_hit(const BowtieHit& bh)
2035     {
2036     if (bh.insert_id())
2037 gpertea 154 {
2038     /*
2039     * validate the cigar chain - no gaps shorter than an intron, etc.
2040     * also,
2041     * -Don't start or end with an indel or refskip
2042     * -Only a match operation is allowed is allowed
2043     * adjacent to an indel or refskip
2044     * -Indels should confrom to length restrictions
2045     */
2046     const CigarOp* prevCig = &(bh.cigar()[0]);
2047     const CigarOp* currCig = &(bh.cigar()[1]);
2048     for (size_t i = 1; i < bh.cigar().size(); ++i){
2049     currCig = &(bh.cigar()[i]);
2050     if(!(currCig->opcode == MATCH || currCig->opcode == mATCH) &&
2051     !(prevCig->opcode == MATCH || prevCig->opcode == mATCH)){
2052     return false;
2053     }
2054     if(currCig->opcode == INS || currCig->opcode == iNS){
2055     if(currCig->length > max_insertion_length){
2056     return false;
2057     }
2058     }
2059     if(currCig->opcode == DEL || currCig->opcode == dEL){
2060     if(currCig->length > max_deletion_length){
2061     return false;
2062     }
2063     }
2064     if(currCig->opcode == REF_SKIP || currCig->opcode == rEF_SKIP){
2065     if(currCig->length < (uint64_t)min_report_intron_length){
2066     gap_too_short++;
2067     return false;
2068     }
2069     }
2070     prevCig = currCig;
2071 gpertea 29 }
2072 gpertea 154 if (!(bh.cigar().front().opcode == MATCH || bh.cigar().front().opcode == mATCH) ||
2073     !(bh.cigar().back().opcode == MATCH || bh.cigar().back().opcode == mATCH)/* ||
2074     (int)bh.cigar().front().length < min_anchor_len||
2075     (int)bh.cigar().back().length < min_anchor_len*/ )
2076     {
2077     anchor_too_short++;
2078     return false;
2079     }
2080 gpertea 29 }
2081 gpertea 154 else
2082 gpertea 29 {
2083 gpertea 154 multi_closure++;
2084 gpertea 29 return false;
2085     }
2086 gpertea 154
2087 gpertea 29 return true;
2088     }
2089    
2090     void merge_segment_chain(RefSequenceTable& rt,
2091 gpertea 154 const string& read_seq,
2092     const string& read_quals,
2093     std::set<Junction>& possible_juncs,
2094     std::set<Insertion>& possible_insertions,
2095     std::set<Fusion>& possible_fusions,
2096     vector<BowtieHit>& hits,
2097     vector<BowtieHit>& merged_hits,
2098     int fusion_dir = FUSION_NOTHING)
2099 gpertea 29 {
2100     if (hits.size() == 0)
2101     return;
2102    
2103     BowtieHit bh;
2104     if (hits.size() > 1)
2105     {
2106     list<BowtieHit> hit_chain;
2107 gpertea 154 if (fusion_dir == FUSION_NOTHING || fusion_dir == FUSION_FF || fusion_dir == FUSION_RR)
2108     {
2109     if (hits.front().antisense_align())
2110     copy(hits.rbegin(), hits.rend(), back_inserter(hit_chain));
2111     else
2112     copy(hits.begin(), hits.end(), back_inserter(hit_chain));
2113     }
2114 gpertea 29 else
2115 gpertea 154 {
2116     bool bSawFusion = false;
2117     for (size_t i = 0; i < hits.size(); ++i)
2118     {
2119     bool pushed = false;
2120     if (!bSawFusion)
2121     {
2122     if (i > 0)
2123     {
2124     if (hits[i-1].ref_id() != hits[i].ref_id())
2125     bSawFusion = true;
2126     else if(hits[i-1].antisense_align() != hits[i].antisense_align())
2127     bSawFusion = true;
2128     else
2129     {
2130     int dist = 0;
2131     if (hits[i].antisense_align())
2132     dist = hits[i-1].left() - hits[i].right();
2133     else
2134     dist = hits[i].left() - hits[i-1].right();
2135 gpertea 29
2136 gpertea 154 if (dist >= max_report_intron_length || dist < -(int)max_insertion_length)
2137     bSawFusion = true;
2138     }
2139     }
2140     }
2141    
2142     if (hits[i].fusion_opcode() == FUSION_NOTHING &&
2143     ((fusion_dir == FUSION_FR && bSawFusion) || (fusion_dir == FUSION_RF && !bSawFusion)) &&
2144     hits[i].left() < hits[i].right())
2145     {
2146     hit_chain.push_back(hits[i].reverse());
2147     pushed = true;
2148     }
2149    
2150     if (i > 0 &&
2151     hits[i].fusion_opcode() != FUSION_NOTHING &&
2152     hits[i].ref_id() != hits[i-1].ref_id())
2153     {
2154     hit_chain.push_back(hits[i].reverse());
2155     pushed = true;
2156     }
2157    
2158     if (!bSawFusion)
2159     {
2160     if (hits[i].fusion_opcode() != FUSION_NOTHING)
2161     bSawFusion = true;
2162     }
2163    
2164     if (!pushed)
2165     hit_chain.push_back(hits[i]);
2166     }
2167     }
2168    
2169 gpertea 159 // todo: merge_chain_color needs to be merged into merge_chain fuction.
2170     if (color)
2171     bh = merge_chain_color(rt,
2172     read_seq,
2173     read_quals,
2174     possible_juncs,
2175     possible_insertions,
2176     hit_chain);
2177     else
2178     bh = merge_chain(rt,
2179     read_seq,
2180     read_quals,
2181     possible_juncs,
2182     possible_insertions,
2183     possible_fusions,
2184     hit_chain,
2185     fusion_dir);
2186 gpertea 29 }
2187     else
2188     {
2189     bh = hits[0];
2190     }
2191    
2192     if (valid_hit(bh))
2193     merged_hits.push_back(bh);
2194     }
2195    
2196     bool dfs_seg_hits(RefSequenceTable& rt,
2197 gpertea 154 const string& read_seq,
2198     const string& read_quals,
2199     std::set<Junction>& possible_juncs,
2200     std::set<Insertion>& possible_insertions,
2201     std::set<Fusion>& possible_fusions,
2202     vector<HitsForRead>& seg_hits_for_read,
2203     size_t curr,
2204     vector<BowtieHit>& seg_hit_stack,
2205     vector<BowtieHit>& joined_hits,
2206     int fusion_dir = FUSION_NOTHING)
2207 gpertea 29 {
2208     assert (!seg_hit_stack.empty());
2209     bool join_success = false;
2210    
2211     if (curr < seg_hits_for_read.size())
2212     {
2213 gpertea 154 for (size_t i = 0; i < seg_hits_for_read[curr].hits.size(); ++i)
2214     {
2215     /*
2216     * As we reverse segments depending on directions like FR or RF,
2217     * it's necessary to recover the original segments.
2218     */
2219     BowtieHit bh = seg_hits_for_read[curr].hits[i];
2220     BowtieHit bh_prev = seg_hit_stack.back();
2221    
2222     BowtieHit* prevHit = &bh_prev;
2223     BowtieHit* currHit = &bh;
2224 gpertea 29
2225 gpertea 154 // daehwan
2226     // if (bh.insert_id() == 11921733)
2227     // bDebug = true;
2228 gpertea 29
2229 gpertea 154 /*
2230     * Each segment has at most one fusion by assumption,
2231     */
2232     bool prevHit_fused = prevHit->fusion_opcode() != FUSION_NOTHING;
2233     bool currHit_fused = currHit->fusion_opcode() != FUSION_NOTHING;
2234 gpertea 29
2235 gpertea 154 /*
2236     * Count the number of fusions on prev and curr segments,
2237     * but this doesn't take into account the gap (if exists) between the two segments.
2238     */
2239     size_t num_fusions = prevHit_fused ? 1 : 0;
2240     num_fusions += currHit_fused ? 1 : 0;
2241 gpertea 29
2242 gpertea 154 int dir = prevHit_fused ? prevHit->fusion_opcode() : currHit->fusion_opcode();
2243 gpertea 29
2244 gpertea 154 /*
2245     * We don't allow reads that span more than two fusion points.
2246     */
2247     if (num_fusions >= 2)
2248     continue;
2249 gpertea 29
2250 gpertea 154 if (fusion_dir != FUSION_NOTHING && currHit_fused)
2251     continue;
2252    
2253     if (fusion_dir == FUSION_FF || fusion_dir == FUSION_RR)
2254     {
2255     if ((currHit->antisense_align() && currHit->ref_id() != prevHit->ref_id()) ||
2256     (!currHit->antisense_align() && currHit->ref_id() != prevHit->ref_id2()))
2257     continue;
2258     }
2259    
2260     if (bDebug)
2261     {
2262     cout << "daehwan - prev ref: " << prevHit->ref_id() << "-" << prevHit->ref_id2() << ": "
2263     << print_cigar(prevHit->cigar()) << endl;
2264     cout << "daehwan - prev sense: "
2265     << (prevHit->antisense_align() ? "-" : "+")
2266     << "\t"
2267     << (prevHit->antisense_align2() ? "-" : "+")
2268     << endl;
2269     cout << "daehwan - prev coords: "
2270     << prevHit->left()
2271     << "\t"
2272     << prevHit->right()
2273     << endl;
2274    
2275     cout << "daehwan - curr ref: " << currHit->ref_id() << "-" << currHit->ref_id2() << ": "
2276     << print_cigar(currHit->cigar()) << endl;
2277     cout << "daehwan - curr sense: "
2278     << (currHit->antisense_align() ? "-" : "+")
2279     << "\t"
2280     << (currHit->antisense_align2() ? "-" : "+")
2281     << endl;
2282 gpertea 159 cout << "daehwan - curr coords: "
2283 gpertea 154 << currHit->left()
2284     << "\t"
2285     << currHit->right()
2286     << endl;
2287     }
2288    
2289     if ((fusion_dir == FUSION_FR || fusion_dir == FUSION_RF) &&
2290     prevHit->ref_id2() != currHit->ref_id())
2291     continue;
2292    
2293     if ((fusion_dir == FUSION_FR && !currHit->antisense_align()) ||
2294     (fusion_dir == FUSION_RF && currHit->antisense_align()))
2295     continue;
2296    
2297     if (currHit_fused && dir == FUSION_RR)
2298     *currHit = currHit->reverse();
2299    
2300     if (fusion_dir == FUSION_FR || fusion_dir == FUSION_RF ||
2301     (currHit_fused && currHit->ref_id() == currHit->ref_id2() && (dir == FUSION_FR || dir == FUSION_RF)))
2302     {
2303     if (currHit_fused)
2304     {
2305     if ((dir == FUSION_FR && currHit->antisense_align()) ||
2306     (dir == FUSION_RF && !currHit->antisense_align()))
2307     *currHit = currHit->reverse();
2308     }
2309     else
2310     {
2311     if (fusion_dir == FUSION_FR && currHit->antisense_align())
2312     *currHit = currHit->reverse();
2313     }
2314     }
2315     /*
2316     * Switch prevHit and currHit in FUSION_NOTHING and FUSION_FF cases
2317     * to make it easier to check the distance in the gap between the two segments.
2318     */
2319     else if ((num_fusions == 0 && prevHit->antisense_align() && currHit->antisense_align() && prevHit->ref_id() == currHit->ref_id() && prevHit->left() <= currHit->right() + max_report_intron_length && prevHit->left() + max_insertion_length >= currHit->right()) ||
2320     (num_fusions == 1 && (dir == FUSION_FF || dir == FUSION_RR)&&
2321     ((!prevHit_fused && prevHit->antisense_align()) || (!currHit_fused && currHit->antisense_align())))
2322     )
2323     {
2324     BowtieHit* tempHit = prevHit;
2325     prevHit = currHit;
2326     currHit = tempHit;
2327     }
2328     else if (num_fusions == 0)
2329     {
2330     if (prevHit->ref_id2() == currHit->ref_id() && prevHit->antisense_align() == currHit->antisense_align())
2331     {
2332     int dist = 0;
2333     if (prevHit->antisense_align())
2334     dist = prevHit->left() - currHit->right();
2335     else
2336     dist = currHit->left() - prevHit->right();
2337    
2338     if (dist > max_report_intron_length || dist < -(int)max_insertion_length)
2339     {
2340     if ((prevHit->antisense_align() && prevHit->left() > currHit->left()) ||
2341     (!prevHit->antisense_align() && prevHit->left() < currHit->left()))
2342     dir = FUSION_FF;
2343     else
2344     dir = FUSION_RR;
2345     }
2346     }
2347     else
2348     {
2349     if (prevHit->antisense_align() == currHit->antisense_align())
2350     {
2351     if ((prevHit->antisense_align() && prevHit->ref_id() > currHit->ref_id()) ||
2352     (!prevHit->antisense_align() && prevHit->ref_id() < currHit->ref_id()))
2353     dir = FUSION_FF;
2354     else
2355     dir = FUSION_RR;
2356     }
2357     else if (!prevHit->antisense_align())
2358     dir = FUSION_FR;
2359     else
2360     dir = FUSION_RF;
2361    
2362     if (dir == FUSION_FR)
2363     *currHit = currHit->reverse();
2364     else if(dir == FUSION_RF)
2365     *prevHit = prevHit->reverse();
2366     }
2367     }
2368    
2369     // daehwan - test
2370     if (bDebug)
2371     {
2372     cout << "insert id: " << prevHit->insert_id() << endl;
2373     cout << "(" << curr - 1 << ") prev: " << prevHit->seq() << " : " << (prevHit->fusion_opcode() != FUSION_NOTHING ? "fused" : "no") << endl;
2374     cout << "(" << curr << ") curr: " << currHit->seq() << " : " << (currHit->fusion_opcode() != FUSION_NOTHING ? "fused" : "no") << endl;
2375     cout << "prev ref: " << prevHit->ref_id() << "-" << prevHit->ref_id2() << ": " << print_cigar(prevHit->cigar()) << endl;
2376     cout << "curr ref: " << currHit->ref_id() << "-" << currHit->ref_id2() << ": " << print_cigar(currHit->cigar()) << endl;
2377     cout << "prev coords: "
2378     << prevHit->left()
2379     << "\t"
2380     << prevHit->right()
2381     << endl;
2382     cout << "curr corrds: "
2383     << currHit->left()
2384     << "\t"
2385     << currHit->right()
2386     << endl;
2387     cout << "prev sense: "
2388     << (prevHit->antisense_align() ? "-" : "+")
2389     << "\t"
2390     << (prevHit->antisense_align2() ? "-" : "+")
2391     << endl;
2392     cout << "curr sense: "
2393     << (currHit->antisense_align() ? "-" : "+")
2394     << "\t"
2395     << (currHit->antisense_align2() ? "-" : "+")
2396     << endl;
2397     }
2398    
2399     if (num_fusions == 1)
2400     {
2401     // daehwan
2402     if (bDebug)
2403     {
2404     cout << "direction: " << (int)dir << endl;
2405     }
2406    
2407     /*
2408     * orient the fused segment, which depends on a fusion direction.
2409     */
2410     if (dir != FUSION_FF && dir != FUSION_RR)
2411     {
2412     bool prevHit_rep = false;
2413     bool currHit_rep = false;
2414    
2415     if (prevHit_fused)
2416     {
2417     if ((dir == FUSION_FR && !currHit->antisense_align()) ||
2418     (dir == FUSION_RF && currHit->antisense_align()))
2419     continue;
2420    
2421     if (prevHit->ref_id2() != currHit->ref_id())
2422     prevHit_rep = true;
2423     else if ((dir == FUSION_FR && prevHit->antisense_align()) ||
2424     (dir == FUSION_RF && !prevHit->antisense_align()))
2425     prevHit_rep = true;
2426     }
2427    
2428     if (currHit_fused)
2429     {
2430     if ((dir == FUSION_FR && prevHit->antisense_align()) ||
2431     (dir == FUSION_RF && !prevHit->antisense_align()))
2432     continue;
2433    
2434     if (currHit->ref_id() != prevHit->ref_id2())
2435     currHit_rep = true;
2436     }
2437    
2438     if (bDebug)
2439     {
2440     if (prevHit_rep) cout << "1. reversed in prev" << endl;
2441     if (currHit_rep) cout << "1. reversed in curr" << endl;
2442     }
2443    
2444     if (prevHit_rep)
2445     *prevHit = prevHit->reverse();
2446     if (currHit_rep)
2447     *currHit = currHit->reverse();
2448    
2449     prevHit_rep = false;
2450     currHit_rep = false;
2451    
2452     if (prevHit_fused)
2453     {
2454     if (prevHit->is_forwarding_right() != currHit->is_forwarding_left())
2455     currHit_rep = true;
2456     }
2457     else
2458     {
2459     if (prevHit->is_forwarding_right() != currHit->is_forwarding_left())
2460     prevHit_rep = true;
2461     }
2462    
2463     if (prevHit_rep)
2464     *prevHit = prevHit->reverse();
2465     if (currHit_rep)
2466     *currHit = currHit->reverse();
2467    
2468     // daehwan
2469     if (bDebug)
2470     {
2471     if (prevHit_rep) cout << "2. reversed in prev" << endl;
2472     if (currHit_rep) cout << "2. reversed in curr" << endl;
2473     }
2474     }
2475     }
2476    
2477     bool same_contig = prevHit->ref_id2() == currHit->ref_id();
2478     if (!same_contig && num_fusions > 0)
2479     continue;
2480    
2481     if (same_contig && num_fusions >= 1)
2482     {
2483     if (prevHit->antisense_align2() != currHit->antisense_align())
2484     continue;
2485     }
2486    
2487     int bh_l = 0, back_right = 0, dist = 0;
2488     if (same_contig)
2489     {
2490     if ((fusion_dir == FUSION_FR || fusion_dir == FUSION_RF || dir == FUSION_FR || dir == FUSION_RF) &&
2491     prevHit->antisense_align2())
2492     {
2493     bh_l = prevHit->right() + 1;
2494     back_right = currHit->left() + 1;
2495     }
2496     else
2497     {
2498     bh_l = currHit->left();
2499     back_right = prevHit->right();
2500     }
2501    
2502     dist = bh_l - back_right;
2503     }
2504    
2505     // daehwan - pass
2506     if (bDebug)
2507     {
2508     cout << "daehwan - pass" << endl;
2509     cout << "prev coords: " << prevHit->left() << "\t" << prevHit->right() << endl;
2510     cout << "curr coords: " << currHit->left() << "\t" << currHit->right() << endl;
2511     }
2512    
2513     if (!same_contig ||
2514     (same_contig && num_fusions == 0 && dir != FUSION_NOTHING && fusion_dir == FUSION_NOTHING) ||
2515     (same_contig && dist <= max_report_intron_length && dist >= -(int)max_insertion_length && prevHit->is_forwarding_right() == currHit->is_forwarding_left()))
2516     {
2517     // daehwan
2518     if (bDebug)
2519     {
2520     cout << "daehwan - really passed!!" << endl;
2521     }
2522    
2523     BowtieHit tempHit = seg_hit_stack.back();
2524     seg_hit_stack.back() = bh_prev;
2525    
2526     // these hits are compatible, so push bh onto the
2527     // stack, recurse, and pop it when done.
2528     seg_hit_stack.push_back(bh);
2529     bool success = dfs_seg_hits(rt,
2530     read_seq,
2531     read_quals,
2532     possible_juncs,
2533     possible_insertions,
2534     possible_fusions,
2535     seg_hits_for_read,
2536     curr + 1,
2537     seg_hit_stack,
2538     joined_hits,
2539     dir == FUSION_NOTHING ? fusion_dir : dir);
2540    
2541     if (success)
2542     join_success = true;
2543    
2544     seg_hit_stack.pop_back();
2545     seg_hit_stack.back() = tempHit;
2546     }
2547     }
2548 gpertea 29 }
2549     else
2550 gpertea 154 {
2551     merge_segment_chain(rt,
2552     read_seq,
2553     read_quals,
2554     possible_juncs,
2555     possible_insertions,
2556     possible_fusions,
2557     seg_hit_stack,
2558     joined_hits,
2559     fusion_dir);
2560    
2561     return join_success = true;
2562     }
2563 gpertea 29 return join_success;
2564     }
2565    
2566     bool join_segments_for_read(RefSequenceTable& rt,
2567 gpertea 154 const string& read_seq,
2568     const string& read_quals,
2569     std::set<Junction>& possible_juncs,
2570     std::set<Insertion>& possible_insertions,
2571     std::set<Fusion>& possible_fusions,
2572     vector<HitsForRead>& seg_hits_for_read,
2573     vector<BowtieHit>& joined_hits)
2574     {
2575 gpertea 29 vector<BowtieHit> seg_hit_stack;
2576     bool join_success = false;
2577 gpertea 159
2578 gpertea 154 // ignore segments that map to more than this many places.
2579     if (bowtie2)
2580     {
2581     for (size_t s = 0; s < seg_hits_for_read.size(); ++s)
2582     {
2583 gpertea 159 if (seg_hits_for_read[s].hits.size() > max_seg_multihits)
2584 gpertea 154 return join_success;
2585     }
2586     }
2587    
2588 gpertea 29 for (size_t i = 0; i < seg_hits_for_read[0].hits.size(); ++i)
2589     {
2590     BowtieHit& bh = seg_hits_for_read[0].hits[i];
2591 gpertea 154
2592 gpertea 159 // daehwan - remove this
2593     //if (bh.insert_id() == 811745)
2594     //bDebug = true;
2595    
2596 gpertea 154 if (bh.fusion_opcode() == FUSION_RR)
2597     seg_hit_stack.push_back(bh.reverse());
2598     else
2599     seg_hit_stack.push_back(bh);
2600    
2601 gpertea 29 bool success = dfs_seg_hits(rt,
2602 gpertea 154 read_seq,
2603     read_quals,
2604     possible_juncs,
2605     possible_insertions,
2606     possible_fusions,
2607     seg_hits_for_read,
2608     1,
2609     seg_hit_stack,
2610     joined_hits);
2611 gpertea 29 if (success)
2612 gpertea 159 join_success = true;
2613 gpertea 29 seg_hit_stack.pop_back();
2614     }
2615 gpertea 159
2616 gpertea 29 return join_success;
2617     }
2618    
2619 gpertea 154 struct JoinSegmentsWorker
2620 gpertea 29 {
2621 gpertea 154 void operator()()
2622     {
2623     ReadTable it;
2624     GBamWriter bam_writer(bam_output_fname.c_str(), sam_header_fname.c_str());
2625     ReadStream readstream(reads_fname);
2626     if (readstream.file() == NULL)
2627     err_die("Error: cannot open %s for reading\n", reads_fname.c_str());
2628 gpertea 29
2629 gpertea 154 if (read_offset > 0)
2630     readstream.seek(read_offset);
2631 gpertea 29
2632 gpertea 154 bool need_seq = true;
2633 gpertea 159 bool need_qual = true;
2634 gpertea 29
2635 gpertea 154 vector<HitStream> contig_hits;
2636     vector<HitStream> spliced_hits;
2637     vector<HitFactory*> factories;
2638     for (size_t i = 0; i < segmap_fnames.size(); ++i)
2639     {
2640     HitFactory* fac = new BAMHitFactory(it, *rt);
2641     factories.push_back(fac);
2642     HitStream hs(segmap_fnames[i], fac, false, false, false, need_seq, need_qual);
2643    
2644     if (seg_offsets[i] > 0)
2645     hs.seek(seg_offsets[i]);
2646 gpertea 29
2647 gpertea 154 contig_hits.push_back(hs);
2648     }
2649    
2650     for (size_t i = 0; i < spliced_segmap_fnames.size(); ++i)
2651     {
2652     int anchor_length = 0;
2653     HitFactory* fac = new SplicedBAMHitFactory(it, *rt, anchor_length);
2654     factories.push_back(fac);
2655    
2656     HitStream hs(spliced_segmap_fnames[i], fac, true, false, false, need_seq, need_qual);
2657     if (spliced_seg_offsets[i] > 0)
2658     hs.seek(spliced_seg_offsets[i]);
2659 gpertea 29
2660 gpertea 154 spliced_hits.push_back(hs);
2661     }
2662 gpertea 29
2663 gpertea 154 uint32_t curr_contig_obs_order = VMAXINT32;
2664     HitStream* first_seg_contig_stream = NULL;
2665     uint64_t next_contig_id = 0;
2666    
2667     if (contig_hits.size() > 0)
2668     {
2669     first_seg_contig_stream = &(contig_hits.front());
2670     next_contig_id = first_seg_contig_stream->next_group_id();
2671     curr_contig_obs_order = it.observation_order(next_contig_id);
2672     }
2673 gpertea 29
2674 gpertea 154 HitsForRead curr_hit_group;
2675    
2676     uint32_t curr_spliced_obs_order = VMAXINT32;
2677     HitStream* first_seg_spliced_stream = NULL;
2678     uint64_t next_spliced_id = 0;
2679    
2680     if (spliced_hits.size() > 0)
2681     {
2682     first_seg_spliced_stream = &(spliced_hits.front());
2683     next_spliced_id = first_seg_spliced_stream->next_group_id();
2684     curr_spliced_obs_order = it.observation_order(next_spliced_id);
2685     }
2686 gpertea 29
2687 gpertea 154 while((curr_contig_obs_order != VMAXINT32 || curr_spliced_obs_order != VMAXINT32) &&
2688     (curr_contig_obs_order < end_id || curr_spliced_obs_order < end_id))
2689     {
2690     uint32_t read_in_process;
2691     vector<HitsForRead> seg_hits_for_read;
2692     seg_hits_for_read.resize(contig_hits.size());
2693 gpertea 29
2694 gpertea 154 if (curr_contig_obs_order < curr_spliced_obs_order)
2695     {
2696     first_seg_contig_stream->next_read_hits(curr_hit_group);
2697     seg_hits_for_read.front() = curr_hit_group;
2698    
2699     next_contig_id = first_seg_contig_stream->next_group_id();
2700     uint32_t next_order = it.observation_order(next_contig_id);
2701    
2702     read_in_process = curr_contig_obs_order;
2703     curr_contig_obs_order = next_order;
2704     }
2705     else if (curr_spliced_obs_order < curr_contig_obs_order)
2706     {
2707     first_seg_spliced_stream->next_read_hits(curr_hit_group);
2708     seg_hits_for_read.front() = curr_hit_group;
2709    
2710     next_spliced_id = first_seg_spliced_stream->next_group_id();
2711     uint32_t next_order = it.observation_order(next_spliced_id);
2712 gpertea 29
2713 gpertea 154 read_in_process = curr_spliced_obs_order;
2714     curr_spliced_obs_order = next_order;
2715 gpertea 29
2716 gpertea 154 if (read_in_process < begin_id)
2717     continue;
2718     }
2719     else if (curr_contig_obs_order == curr_spliced_obs_order &&
2720     curr_contig_obs_order != VMAXINT32 &&
2721     curr_spliced_obs_order != VMAXINT32)
2722     {
2723     first_seg_contig_stream->next_read_hits(curr_hit_group);
2724    
2725     HitsForRead curr_spliced_group;
2726     first_seg_spliced_stream->next_read_hits(curr_spliced_group);
2727    
2728     curr_hit_group.hits.insert(curr_hit_group.hits.end(),
2729     curr_spliced_group.hits.begin(),
2730     curr_spliced_group.hits.end());
2731     seg_hits_for_read.front() = curr_hit_group;
2732     read_in_process = curr_spliced_obs_order;
2733    
2734     next_contig_id = first_seg_contig_stream->next_group_id();
2735     uint32_t next_order = it.observation_order(next_contig_id);
2736    
2737     next_spliced_id = first_seg_spliced_stream->next_group_id();
2738     uint32_t next_spliced_order = it.observation_order(next_spliced_id);
2739    
2740     curr_spliced_obs_order = next_spliced_order;
2741     curr_contig_obs_order = next_order;
2742     }
2743     else
2744     {
2745     break;
2746     }
2747    
2748     if (contig_hits.size() > 1)
2749     {
2750     look_right_for_hit_group(it,
2751     contig_hits,
2752     0,
2753     spliced_hits,
2754     curr_hit_group,
2755     seg_hits_for_read);
2756     }
2757 gpertea 29
2758 gpertea 154 size_t last_non_empty = seg_hits_for_read.size() - 1;
2759     while(last_non_empty >= 0 && seg_hits_for_read[last_non_empty].hits.empty())
2760     {
2761     --last_non_empty;
2762     }
2763    
2764     seg_hits_for_read.resize(last_non_empty + 1);
2765     if (!seg_hits_for_read[last_non_empty].hits[0].end())
2766     continue;
2767    
2768     if (!seg_hits_for_read.empty() && !seg_hits_for_read[0].hits.empty())
2769     {
2770     uint64_t insert_id = seg_hits_for_read[0].hits[0].insert_id();
2771     if (insert_id >= begin_id && insert_id < end_id)
2772     {
2773     Read read;
2774     if (readstream.getRead(insert_id, read))
2775     {
2776     vector<BowtieHit> joined_hits;
2777     join_segments_for_read(*rt,
2778     read.seq.c_str(),
2779     read.qual.c_str(),
2780     *possible_juncs,
2781     *possible_insertions,
2782     *possible_fusions,
2783     seg_hits_for_read,
2784     joined_hits);
2785    
2786     sort(joined_hits.begin(), joined_hits.end());
2787     vector<BowtieHit>::iterator new_end = unique(joined_hits.begin(), joined_hits.end());
2788     joined_hits.erase(new_end, joined_hits.end());
2789     for (size_t i = 0; i < joined_hits.size(); i++)
2790     {
2791     const char* ref_name = rt->get_name(joined_hits[i].ref_id());
2792     const char* ref_name2 = "";
2793     if (joined_hits[i].fusion_opcode() != FUSION_NOTHING)
2794     ref_name2 = rt->get_name(joined_hits[i].ref_id2());
2795 gpertea 29
2796 gpertea 159 vector<string> extra_fields;
2797    
2798     if (!color)
2799     bowtie_sam_extra(joined_hits[i], *rt, extra_fields);
2800    
2801     if (color)
2802 gpertea 154 print_bamhit(bam_writer, read.name.c_str(), joined_hits[i], ref_name, ref_name2,
2803 gpertea 159 joined_hits[i].seq().c_str(), joined_hits[i].qual().c_str(), true, &extra_fields);
2804 gpertea 154 else
2805     print_bamhit(bam_writer, read.name.c_str(), joined_hits[i], ref_name, ref_name2,
2806 gpertea 159 read.seq.c_str(), read.qual.c_str(), false, &extra_fields);
2807 gpertea 154 }
2808     }
2809     else
2810     {
2811     err_die("Error: could not get read # %d from stream\n",
2812     read_in_process);
2813     }
2814     }
2815     }
2816     else
2817     {
2818     //fprintf(stderr, "Warning: couldn't join segments for read # %d\n", read_in_process);
2819     }
2820 gpertea 135 }
2821 gpertea 29
2822 gpertea 154 for (size_t fac = 0; fac < factories.size(); ++fac)
2823 gpertea 135 {
2824 gpertea 154 delete factories[fac];
2825 gpertea 135 }
2826 gpertea 154 factories.clear();
2827     }
2828 gpertea 29
2829 gpertea 154 string bam_output_fname;
2830     string sam_header_fname;
2831     string reads_fname;
2832 gpertea 29
2833 gpertea 154 vector<string> segmap_fnames;
2834     vector<string> spliced_segmap_fnames;
2835    
2836     std::set<Junction>* possible_juncs;
2837     std::set<Insertion>* possible_insertions;
2838     std::set<Fusion>* possible_fusions;
2839    
2840     RefSequenceTable* rt;
2841 gpertea 29
2842 gpertea 154 uint64_t begin_id;
2843     uint64_t end_id;
2844     int64_t read_offset;
2845     vector<int64_t> seg_offsets;
2846     vector<int64_t> spliced_seg_offsets;
2847     };
2848 gpertea 29
2849 gpertea 154 void driver(const string& bam_output_fname,
2850     istream& ref_stream,
2851     vector<FILE*>& possible_juncs_files,
2852     vector<FILE*>& possible_insertions_files,
2853     vector<FILE*>& possible_deletions_files,
2854     vector<FILE*>& possible_fusions_files,
2855     vector<string>& spliced_segmap_fnames, //.bam files
2856     vector<string>& segmap_fnames, //.bam files
2857     const string& reads_fname)
2858     {
2859     if (!parallel)
2860     num_threads = 1;
2861 gpertea 29
2862 gpertea 69 if (segmap_fnames.size() == 0)
2863 gpertea 29 {
2864     fprintf(stderr, "No hits to process, exiting\n");
2865     exit(0);
2866     }
2867    
2868 gpertea 154 RefSequenceTable rt(sam_header, true);
2869 gpertea 29 fprintf (stderr, "Loading reference sequences...\n");
2870 gpertea 154 get_seqs(ref_stream, rt, true);
2871 gpertea 29 fprintf (stderr, " reference sequences loaded.\n");
2872    
2873     fprintf(stderr, "Loading junctions...");
2874     std::set<Junction> possible_juncs;
2875    
2876     for (size_t i = 0; i < possible_juncs_files.size(); ++i)
2877     {
2878     char buf[2048];
2879     while(!feof(possible_juncs_files[i]) &&
2880     fgets(buf, sizeof(buf), possible_juncs_files[i]))
2881     {
2882     char junc_ref_name[256];
2883     int left;
2884     int right;
2885     char orientation;
2886     int ret = sscanf(buf, "%s %d %d %c", junc_ref_name, &left, &right, &orientation);
2887     if (ret != 4)
2888     continue;
2889     uint32_t ref_id = rt.get_id(junc_ref_name, NULL, 0);
2890     possible_juncs.insert(Junction(ref_id, left, right, orientation == '-'));
2891     }
2892     }
2893     fprintf(stderr, "done\n");
2894    
2895     fprintf(stderr, "Loading deletions...");
2896    
2897     for (size_t i = 0; i < possible_deletions_files.size(); ++i)
2898     {
2899     char splice_buf[2048];
2900     FILE* deletions_file = possible_deletions_files[i];
2901     if(!deletions_file){
2902     continue;
2903     }
2904     while(fgets(splice_buf, 2048, deletions_file)){
2905     char* nl = strrchr(splice_buf, '\n');
2906     char* buf = splice_buf;
2907     if (nl) *nl = 0;
2908    
2909     char* ref_name = get_token((char**)&buf, "\t");
2910     char* scan_left_coord = get_token((char**)&buf, "\t");
2911     char* scan_right_coord = get_token((char**)&buf, "\t");
2912    
2913     if (!scan_left_coord || !scan_right_coord)
2914     {
2915     err_die("Error: malformed deletion coordinate record\n");
2916     }
2917    
2918     uint32_t ref_id = rt.get_id(ref_name,NULL,0);
2919     uint32_t left_coord = atoi(scan_left_coord);
2920     uint32_t right_coord = atoi(scan_right_coord);
2921     possible_juncs.insert((Junction)Deletion(ref_id, left_coord - 1,right_coord, false));
2922     }
2923     }
2924     fprintf(stderr, "done\n");
2925    
2926     /*
2927     * Read the insertions from the list of insertion
2928     * files into a set
2929     */
2930 gpertea 154 fprintf(stderr, "Loading insertions...");
2931 gpertea 29 std::set<Insertion> possible_insertions;
2932     for (size_t i=0; i < possible_insertions_files.size(); ++i)
2933     {
2934     char splice_buf[2048];
2935     FILE* insertions_file = possible_insertions_files[i];
2936     if(!insertions_file){
2937     continue;
2938     }
2939     while(fgets(splice_buf, 2048, insertions_file)){
2940     char* nl = strrchr(splice_buf, '\n');
2941     char* buf = splice_buf;
2942     if (nl) *nl = 0;
2943    
2944     char* ref_name = get_token((char**)&buf, "\t");
2945     char* scan_left_coord = get_token((char**)&buf, "\t");
2946     char* scan_right_coord = get_token((char**)&buf, "\t");
2947     char* scan_sequence = get_token((char**)&buf, "\t");
2948    
2949     if (!scan_left_coord || !scan_sequence || !scan_right_coord)
2950     {
2951     err_die("Error: malformed insertion coordinate record\n");
2952     }
2953    
2954     uint32_t ref_id = rt.get_id(ref_name,NULL,0);
2955     uint32_t left_coord = atoi(scan_left_coord);
2956     std::string sequence(scan_sequence);
2957     possible_insertions.insert(Insertion(ref_id, left_coord, sequence));
2958     }
2959     }
2960 gpertea 154 fprintf(stderr, "done\n");
2961 gpertea 29
2962 gpertea 154 vector<uint64_t> read_ids;
2963     vector<vector<int64_t> > offsets;
2964     if (num_threads > 1)
2965     {
2966     vector<string> fnames;
2967     fnames.push_back(reads_fname);
2968     fnames.insert(fnames.end(), spliced_segmap_fnames.rbegin(), spliced_segmap_fnames.rend());
2969     fnames.insert(fnames.end(), segmap_fnames.rbegin(), segmap_fnames.rend());
2970     bool enough_data = calculate_offsets(fnames, read_ids, offsets);
2971     if (!enough_data)
2972     num_threads = 1;
2973     }
2974 gpertea 29
2975 gpertea 154 std::set<Fusion> possible_fusions;
2976     if (fusion_search)
2977     {
2978     fprintf(stderr, "Loading fusions...");
2979     for (size_t i=0; i < possible_fusions_files.size(); ++i)
2980     {
2981     char splice_buf[2048];
2982     FILE* fusions_file = possible_fusions_files[i];
2983     if(!fusions_file){
2984     continue;
2985     }
2986     while(fgets(splice_buf, 2048, fusions_file)){
2987     char* nl = strrchr(splice_buf, '\n');
2988     char* buf = splice_buf;
2989     if (nl) *nl = 0;
2990    
2991     char* ref_name1 = strsep((char**)&buf, "\t");
2992     char* scan_left_coord = strsep((char**)&buf, "\t");
2993     char* ref_name2 = strsep((char**)&buf, "\t");
2994     char* scan_right_coord = strsep((char**)&buf, "\t");
2995     char* scan_dir = strsep((char**)&buf, "\t");
2996    
2997     if (!ref_name1 || !scan_left_coord || !ref_name2 || !scan_right_coord || !scan_dir)
2998     {
2999     fprintf(stderr,"Error: malformed insertion coordinate record\n");
3000     exit(1);
3001     }
3002    
3003     uint32_t ref_id1 = rt.get_id(ref_name1,NULL,0);
3004     uint32_t ref_id2 = rt.get_id(ref_name2,NULL,0);
3005     uint32_t left_coord = atoi(scan_left_coord);
3006     uint32_t right_coord = atoi(scan_right_coord);
3007     uint32_t dir = FUSION_FF;
3008     if (strcmp(scan_dir, "fr") == 0)
3009     dir = FUSION_FR;
3010     else if(strcmp(scan_dir, "rf") == 0)
3011     dir = FUSION_RF;
3012     else if (strcmp(scan_dir, "rr") == 0)
3013     dir = FUSION_RR;
3014    
3015     possible_fusions.insert(Fusion(ref_id1, ref_id2, left_coord, right_coord, dir));
3016     }
3017     }
3018     fprintf(stderr, "done\n");
3019     }
3020    
3021     vector<boost::thread*> threads;
3022     for (int i = 0; i < num_threads; ++i)
3023     {
3024     JoinSegmentsWorker worker;
3025 gpertea 29
3026 gpertea 154 if (num_threads == 1)
3027     worker.bam_output_fname = bam_output_fname;
3028     else
3029     {
3030     string filename_base = bam_output_fname.substr(0, bam_output_fname.length() - 4);
3031     char filename[1024] = {0};
3032     sprintf(filename, "%s%d.bam", filename_base.c_str(), i);
3033     worker.bam_output_fname = filename;
3034     }
3035    
3036     worker.sam_header_fname = sam_header;
3037     worker.reads_fname = reads_fname;
3038     worker.segmap_fnames = segmap_fnames;
3039     worker.spliced_segmap_fnames = spliced_segmap_fnames;
3040     worker.possible_juncs = &possible_juncs;
3041     worker.possible_insertions = &possible_insertions;
3042     worker.possible_fusions = &possible_fusions;
3043     worker.rt = &rt;
3044     if (i == 0)
3045     {
3046     worker.begin_id = 0;
3047     worker.seg_offsets = vector<int64_t>(segmap_fnames.size(), 0);
3048     worker.spliced_seg_offsets = vector<int64_t>(spliced_segmap_fnames.size(), 0);
3049     worker.read_offset = 0;
3050     }
3051     else
3052     {
3053     worker.begin_id = read_ids[i-1];
3054     worker.seg_offsets.insert(worker.seg_offsets.end(),
3055     offsets[i-1].rbegin(),
3056     offsets[i-1].rbegin() + segmap_fnames.size());
3057     worker.spliced_seg_offsets.insert(worker.spliced_seg_offsets.end(),
3058     offsets[i-1].rbegin() + segmap_fnames.size(),
3059     offsets[i-1].rend() - 1);
3060     worker.read_offset = offsets[i-1][0];
3061     }
3062    
3063     worker.end_id = (i+1 < num_threads) ? read_ids[i] : std::numeric_limits<uint64_t>::max();
3064    
3065     if (num_threads > 1)
3066     threads.push_back(new boost::thread(worker));
3067     else
3068     worker();
3069     }
3070    
3071     for (size_t i = 0; i < threads.size(); ++i)
3072     {
3073     threads[i]->join();
3074     delete threads[i];
3075     threads[i] = NULL;
3076     }
3077     threads.clear();
3078    
3079 gpertea 29 } //driver
3080    
3081     int main(int argc, char** argv)
3082     {
3083     fprintf(stderr, "long_spanning_reads v%s (%s)\n", PACKAGE_VERSION, SVN_REVISION);
3084     fprintf(stderr, "--------------------------------------------\n");
3085    
3086     int parse_ret = parse_options(argc, argv, print_usage);
3087     if (parse_ret)
3088     return parse_ret;
3089    
3090     if(optind >= argc)
3091     {
3092     print_usage();
3093     return 1;
3094     }
3095    
3096     string ref_file_name = argv[optind++];
3097    
3098     if(optind >= argc)
3099     {
3100     print_usage();
3101     return 1;
3102     }
3103    
3104    
3105     string reads_file_name = argv[optind++];
3106    
3107     if(optind >= argc)
3108     {
3109     print_usage();
3110     return 1;
3111     }
3112    
3113     string juncs_file_list = argv[optind++];
3114    
3115     if(optind >= argc)
3116     {
3117     print_usage();
3118     return 1;
3119     }
3120    
3121     string insertions_file_list = argv[optind++];
3122     if(optind >= argc)
3123     {
3124 gpertea 154 print_usage();
3125     return 1;
3126 gpertea 29 }
3127 gpertea 154
3128     string deletions_file_list = argv[optind++];
3129    
3130     if(optind >= argc)
3131     {
3132     print_usage();
3133     return 1;
3134     }
3135 gpertea 29
3136 gpertea 154 string fusions_file_list = argv[optind++];
3137    
3138     if(optind >= argc)
3139     {
3140     print_usage();
3141     return 1;
3142     }
3143    
3144     string bam_output_fname = argv[optind++];
3145    
3146     if(optind >= argc)
3147     {
3148     print_usage();
3149     return 1;
3150     }
3151    
3152 gpertea 69 string segmap_file_list = argv[optind++];
3153     string spliced_segmap_flist;
3154 gpertea 29 if(optind < argc)
3155     {
3156 gpertea 69 spliced_segmap_flist = argv[optind++];
3157 gpertea 29 }
3158    
3159     ifstream ref_stream(ref_file_name.c_str(), ifstream::in);
3160     if (!ref_stream.good())
3161     err_die("Error: cannot open %s for reading\n",ref_file_name.c_str());
3162    
3163     checkSamHeader();
3164    
3165     //FILE* reads_file = fopen(reads_file_name.c_str(), "r");
3166 gpertea 69 vector<string> segmap_file_names;
3167     tokenize(segmap_file_list, ",",segmap_file_names);
3168 gpertea 29
3169     vector<string> juncs_file_names;
3170     vector<FILE*> juncs_files;
3171     tokenize(juncs_file_list, ",",juncs_file_names);
3172     for (size_t i = 0; i < juncs_file_names.size(); ++i) {
3173     //fprintf(stderr, "Opening %s for reading\n",
3174     // juncs_file_names[i].c_str());
3175     FILE* juncs_file = fopen(juncs_file_names[i].c_str(), "r");
3176     if (juncs_file == NULL) {
3177     fprintf(stderr, "Warning: cannot open %s for reading\n",
3178     juncs_file_names[i].c_str());
3179     continue;
3180     }
3181     juncs_files.push_back(juncs_file);
3182     }
3183    
3184     /*
3185     * Read in the deletion file names
3186     */
3187     vector<string> deletions_file_names;
3188     vector<FILE*> deletions_files;
3189     tokenize(deletions_file_list, ",",deletions_file_names);
3190     for (size_t i = 0; i < deletions_file_names.size(); ++i)
3191     {
3192     //fprintf(stderr, "Opening %s for reading\n",
3193     // deletions_file_names[i].c_str());
3194     FILE* deletions_file = fopen(deletions_file_names[i].c_str(), "r");
3195     if (deletions_file == NULL) {
3196     fprintf(stderr, "Warning: cannot open %s for reading\n",
3197     deletions_file_names[i].c_str());
3198     continue;
3199     }
3200     deletions_files.push_back(deletions_file);
3201     }
3202    
3203     /*
3204     * Read in the list of filenames that contain
3205     * insertion coordinates
3206     */
3207    
3208     vector<string> insertions_file_names;
3209     vector<FILE*> insertions_files;
3210     tokenize(insertions_file_list, ",",insertions_file_names);
3211     for (size_t i = 0; i < insertions_file_names.size(); ++i)
3212     {
3213     //fprintf(stderr, "Opening %s for reading\n",
3214     // insertions_file_names[i].c_str());
3215     FILE* insertions_file = fopen(insertions_file_names[i].c_str(), "r");
3216     if (insertions_file == NULL)
3217     {
3218     fprintf(stderr, "Warning: cannot open %s for reading\n",
3219     insertions_file_names[i].c_str());
3220     continue;
3221     }
3222     insertions_files.push_back(insertions_file);
3223     }
3224    
3225 gpertea 69 vector<string> spliced_segmap_file_names;
3226     vector<FZPipe> spliced_segmap_files;
3227 gpertea 135 string unzcmd;
3228 gpertea 69 tokenize(spliced_segmap_flist, ",",spliced_segmap_file_names);
3229 gpertea 154
3230     vector<string> fusions_file_names;
3231     vector<FILE*> fusions_files;
3232     tokenize(fusions_file_list, ",",fusions_file_names);
3233     for (size_t i = 0; i < fusions_file_names.size(); ++i)
3234 gpertea 29 {
3235 gpertea 135 fprintf(stderr, "Opening %s for reading\n",
3236 gpertea 154 fusions_file_names[i].c_str());
3237     FILE* fusions_file = fopen(fusions_file_names[i].c_str(), "r");
3238     if (fusions_file == NULL)
3239 gpertea 29 {
3240 gpertea 154 fprintf(stderr, "Warning: cannot open %s for reading\n",
3241     fusions_file_names[i].c_str());
3242     continue;
3243 gpertea 29 }
3244 gpertea 154 fusions_files.push_back(fusions_file);
3245 gpertea 29 }
3246 gpertea 154
3247    
3248     driver(bam_output_fname,
3249     ref_stream,
3250     juncs_files,
3251     insertions_files,
3252     deletions_files,
3253     fusions_files,
3254     spliced_segmap_file_names,
3255     segmap_file_names,
3256     reads_file_name);
3257 gpertea 29
3258     return 0;
3259     }