ViewVC Help
View File | Revision Log | Show Annotations | View Changeset | Root Listing
root/gclib/tophat_cpp/long_spanning_reads.cpp
Revision: 154
Committed: Tue Jan 24 02:29:21 2012 UTC (8 years, 4 months ago) by gpertea
File size: 77245 byte(s)
Log Message:
massive update with Daehwan's work

Line User Rev File contents
1 gpertea 29 #ifdef HAVE_CONFIG_H
2     #include <config.h>
3     #endif
4    
5     /*
6     * long_spanning_reads.cpp
7     * TopHat
8     *
9     * Created by Cole Trapnell on 2/5/09.
10     * Copyright 2009 Cole Trapnell. All rights reserved.
11     *
12     */
13    
14     #include <cassert>
15     #include <cstdio>
16     #include <vector>
17     #include <string>
18     #include <map>
19     #include <algorithm>
20     #include <set>
21     #include <cstdlib>
22     #include <cstring>
23     #include <bitset>
24     //#include <stdexcept>
25     #include <iostream>
26     #include <fstream>
27     #include <sstream>
28     #include <seqan/sequence.h>
29     #include <seqan/find.h>
30     #include <seqan/file.h>
31     #include <seqan/modifier.h>
32     #include <getopt.h>
33    
34 gpertea 154 #include <boost/thread.hpp>
35    
36 gpertea 29 #include "common.h"
37 gpertea 154 #include "utils.h"
38 gpertea 29 #include "bwt_map.h"
39     #include "tokenize.h"
40     #include "segments.h"
41     #include "reads.h"
42    
43     #include "junctions.h"
44     #include "insertions.h"
45     #include "deletions.h"
46 gpertea 154 #include "fusions.h"
47 gpertea 29
48     using namespace seqan;
49     using namespace std;
50    
51 gpertea 154 // daehwan
52     bool bDebug = false;
53    
54 gpertea 29 void print_usage()
55     {
56     fprintf(stderr, "Usage: long_spanning_reads <reads.fa/.fq> <possible_juncs1,...,possible_juncsN> <possible_insertions1,...,possible_insertionsN> <possible_deletions1,...,possible_deletionsN> <seg1.bwtout,...,segN.bwtout> [spliced_seg1.bwtout,...,spliced_segN.bwtout]\n");
57     }
58    
59     bool key_lt(const pair<uint32_t, HitsForRead>& lhs,
60     const pair<uint32_t, HitsForRead>& rhs)
61     {
62     return lhs.first < rhs.first;
63     }
64    
65     void get_seqs(istream& ref_stream,
66     RefSequenceTable& rt,
67 gpertea 154 bool keep_seqs = true)
68 gpertea 29 {
69     while(ref_stream.good() &&
70     !ref_stream.eof())
71     {
72     RefSequenceTable::Sequence* ref_str = new RefSequenceTable::Sequence();
73     string name;
74     readMeta(ref_stream, name, Fasta());
75     string::size_type space_pos = name.find_first_of(" \t\r");
76     if (space_pos != string::npos)
77     {
78     name.resize(space_pos);
79     }
80     seqan::read(ref_stream, *ref_str, Fasta());
81    
82     rt.get_id(name, keep_seqs ? ref_str : NULL, 0);
83     }
84     }
85    
86    
87     void look_right_for_hit_group(ReadTable& unmapped_reads,
88     vector<HitStream>& contig_hits,
89     size_t curr_file,
90     vector<HitStream>& spliced_hits,
91     const HitsForRead& targets,
92     vector<HitsForRead>& seg_hits_for_read)
93     {
94     int right_file = curr_file + 1;
95    
96     HitStream& right = contig_hits[right_file];
97     uint64_t curr_next_group_id = targets.insert_id;
98     int curr_order = unmapped_reads.observation_order(curr_next_group_id);
99    
100     assert (curr_order != -1);
101     while(true)
102     {
103     HitsForRead hit_group;
104     uint64_t right_next_group_id = right.next_group_id();
105     int right_order = unmapped_reads.observation_order(right_next_group_id);
106    
107     // If we would have seen the hits by now, bail out.
108     if (curr_order < right_order || right_order == -1)
109     {
110     break;
111     }
112     if (right.next_read_hits(hit_group))
113     {
114     if (hit_group.insert_id == targets.insert_id)
115     {
116     // Some of the targets may be missing, we need to
117     // process them individually
118     seg_hits_for_read[right_file] = hit_group;
119     break;
120     }
121     }
122     }
123    
124     HitsForRead& curr_seg_hits = seg_hits_for_read[right_file];
125    
126     if (right_file < (int)spliced_hits.size() && right_file >= 0)
127 gpertea 154 {
128 gpertea 29 // Scan forward in the spliced hits file for this hit group
129     HitsForRead spliced_group;
130     HitsForRead curr_spliced_group;
131     while (spliced_hits[right_file].next_group_id() > 0 &&
132     spliced_hits[right_file].next_group_id() <= (uint32_t)curr_order)
133     {
134     spliced_hits[right_file].next_read_hits(curr_spliced_group);
135    
136     if (curr_spliced_group.insert_id == (uint32_t)curr_order)
137     {
138     spliced_group = curr_spliced_group;
139     break;
140     }
141     }
142    
143     if (!spliced_group.hits.empty())
144     {
145     curr_seg_hits.insert_id = spliced_group.insert_id;
146     curr_seg_hits.hits.insert(curr_seg_hits.hits.end(),
147     spliced_group.hits.begin(),
148     spliced_group.hits.end());
149     }
150     }
151    
152     if (curr_seg_hits.hits.empty())
153     return;
154     else if (right_file + 1 < (int)contig_hits.size())
155     {
156     look_right_for_hit_group(unmapped_reads,
157     contig_hits,
158     curr_file + 1,
159     spliced_hits,
160     curr_seg_hits,
161     seg_hits_for_read);
162     }
163     }
164    
165     BowtieHit merge_chain(RefSequenceTable& rt,
166 gpertea 154 const string& read_seq,
167     const string& read_quals,
168     std::set<Junction>& possible_juncs,
169     std::set<Insertion>& possible_insertions,
170     std::set<Fusion>& possible_fusions,
171     list<BowtieHit>& hit_chain,
172     int fusion_dir = FUSION_NOTHING)
173 gpertea 29 {
174     bool antisense = hit_chain.front().antisense_align();
175     uint64_t insert_id = hit_chain.front().insert_id();
176 gpertea 154
177     const int left = hit_chain.front().left();
178    
179 gpertea 29 list<BowtieHit>::iterator prev_hit = hit_chain.begin();
180     list<BowtieHit>::iterator curr_hit = ++(hit_chain.begin());
181 gpertea 154
182 gpertea 29 string seq;
183     string qual;
184     int old_read_length = 0;
185     int first_seg_length = hit_chain.front().seq().length();
186     for (list<BowtieHit>::iterator i = hit_chain.begin(); i != hit_chain.end(); ++i)
187     {
188     seq += i->seq();
189     qual += i->qual();
190     old_read_length += i->read_len();
191     }
192    
193     string rev_read_seq, rev_read_quals;
194     if (color && antisense)
195     {
196     rev_read_seq = read_seq;
197     reverse(rev_read_seq.begin() + 1, rev_read_seq.end());
198 gpertea 154
199 gpertea 29 rev_read_quals = read_quals;
200     reverse(rev_read_quals.begin(), rev_read_quals.end());
201     }
202    
203 gpertea 154 size_t num_fusions = prev_hit->fusion_opcode() == FUSION_NOTHING ? 0 : 1;
204     bool fusion_passed = false;
205 gpertea 29 while (curr_hit != hit_chain.end() && prev_hit != hit_chain.end())
206     {
207 gpertea 154 if (prev_hit->ref_id() != prev_hit->ref_id2() || prev_hit->ref_id2() != curr_hit->ref_id())
208     fusion_passed = true;
209 gpertea 29
210 gpertea 154 if (prev_hit->ref_id2() != curr_hit->ref_id())
211     ++num_fusions;
212    
213     if (curr_hit->fusion_opcode() != FUSION_NOTHING)
214     ++num_fusions;
215    
216     if (prev_hit->ref_id2() == curr_hit->ref_id())
217     {
218     bool reversed = false;
219     if ((fusion_dir == FUSION_FR && fusion_passed) || (fusion_dir == FUSION_RF && !fusion_passed))
220     reversed = true;
221    
222     /*
223     * Note that the gap size may be negative, since left() and right() return
224     * signed integers, this will be OK.
225     */
226     int gap;
227     if (reversed)
228     gap = prev_hit->right() - curr_hit->left();
229     else
230     gap = curr_hit->left() - prev_hit->right();
231    
232     // daehwan
233     if (bDebug)
234     {
235     cout << "curr left: " << curr_hit->left() << endl
236     << "prev right: " << prev_hit->right() << endl
237     << "gap: " << gap << endl;
238     }
239    
240     if (gap < -(int)max_insertion_length ||
241     (gap > (int)max_deletion_length &&
242     (gap < min_report_intron_length || gap > min(max_report_intron_length, (int)fusion_min_dist))))
243     {
244     fusion_passed = true;
245     ++num_fusions;
246     }
247     }
248    
249     if (num_fusions >= 2)
250     return BowtieHit();
251    
252 gpertea 29 ++prev_hit;
253     ++curr_hit;
254     }
255 gpertea 154
256 gpertea 29 prev_hit = hit_chain.begin();
257     curr_hit = ++(hit_chain.begin());
258    
259 gpertea 154 // daehwan
260     if (bDebug)
261     {
262     cout << "daehwan - test" << endl;
263     }
264    
265 gpertea 29 int curr_seg_index = 1;
266 gpertea 154 fusion_passed = false;
267 gpertea 29 while (curr_hit != hit_chain.end() && prev_hit != hit_chain.end())
268     {
269 gpertea 154 // daehwan
270     if (bDebug)
271     {
272     cout << "daehwan - start - stitch" << endl;
273     cout << "prev right: " << prev_hit->right() << endl;
274     cout << "curr left: " << curr_hit->left() << endl;
275     cout << "prev back: " << prev_hit->cigar().back().opcode << endl;
276     cout << "curr front: " << curr_hit->cigar().front().opcode << endl;
277     cout << "prev refs: " << prev_hit->ref_id() << "-" << prev_hit->ref_id2() << endl;
278     cout << "curr refs: " << curr_hit->ref_id() << "-" << curr_hit->ref_id2() << endl;
279     }
280    
281     if (prev_hit->fusion_opcode() != FUSION_NOTHING || prev_hit->ref_id2() != curr_hit->ref_id())
282     fusion_passed = true;
283    
284 gpertea 29 /*
285     * This code is assuming that the cigar strings end and start with a match
286     * While segment alignments can actually end with a junction, insertion or deletion, the hope
287     * is that in those cases, the right and left ends of the alignments will correctly
288     * line up, so we won't get to this bit of code
289     */
290 gpertea 154 if (!(prev_hit->cigar().back().opcode == MATCH || curr_hit->cigar().front().opcode == MATCH ||
291     prev_hit->cigar().back().opcode == mATCH || curr_hit->cigar().front().opcode == mATCH))
292     {
293     return BowtieHit();
294     }
295 gpertea 29
296 gpertea 154 // daehwan
297     if (bDebug)
298     {
299     cout << "daehwan - pass - enough matched bases" << endl;
300     }
301    
302 gpertea 29 if (prev_hit->is_spliced() && curr_hit->is_spliced() && prev_hit->antisense_splice() != curr_hit->antisense_splice())
303 gpertea 154 {
304     /*
305     * There is no way that we can splice these together into a valid
306     * alignment
307     */
308     return BowtieHit();
309     }
310 gpertea 29
311     bool found_closure = false;
312     /*
313     * Note that antisense_splice is the same for prev and curr
314     */
315     bool antisense_closure = prev_hit->is_spliced() ? prev_hit->antisense_splice() : curr_hit->antisense_splice();
316     vector<CigarOp> new_cigar;
317     int new_left = -1;
318     int mismatch = 0;
319 gpertea 154
320 gpertea 29 /*
321     * Take the length of matched bases in each segment into consideration for closures,
322     * this can be a problem for reads of variable lengths.
323     */
324     int prev_right_end_match_length = prev_hit->cigar().back().length;
325     int curr_left_end_match_length = curr_hit->cigar().front().length;
326    
327 gpertea 154 bool check_fusion = prev_hit->ref_id2() != curr_hit->ref_id();
328 gpertea 29
329 gpertea 154 if (prev_hit->ref_id2() == curr_hit->ref_id())
330     {
331     // daehwan
332     if (bDebug)
333     {
334     cout << "daehwan - start - junction or insertion" << endl;
335     cout << "prev right: " << prev_hit->right() << endl;
336     cout << "curr left: " << curr_hit->left() << endl;
337     cout << "prev refs: " << prev_hit->ref_id() << "-" << prev_hit->ref_id2() << endl;
338     cout << "curr refs: " << curr_hit->ref_id() << "-" << curr_hit->ref_id2() << endl;
339     }
340 gpertea 29
341 gpertea 154 bool reversed = false;
342     if ((fusion_dir == FUSION_FR && fusion_passed) || (fusion_dir == FUSION_RF && !fusion_passed))
343     reversed = true;
344 gpertea 29
345 gpertea 154 uint32_t reference_id = prev_hit->ref_id2();
346     RefSequenceTable::Sequence* ref_str = rt.get_seq(reference_id);
347    
348     int left_boundary, right_boundary;
349     if (reversed)
350     {
351     left_boundary = curr_hit->left() - 4;
352     right_boundary = prev_hit->right() + 4;
353     }
354     else
355     {
356     left_boundary = prev_hit->right() - 4;
357     right_boundary = curr_hit->left() + 4;
358     }
359 gpertea 29
360 gpertea 154 int dist_btw_two;
361     if (reversed)
362     dist_btw_two = prev_hit->right() - curr_hit->left();
363     else
364     dist_btw_two = curr_hit->left() - prev_hit->right();
365 gpertea 29
366 gpertea 154 if (dist_btw_two < 0 && dist_btw_two >= -max_insertion_length && prev_hit->antisense_align2() == curr_hit->antisense_align())
367     {
368     std::set<Insertion>::iterator lb, ub;
369    
370     /*
371     * Create a dummy sequence to represent the maximum possible insertion
372     */
373     std::string maxInsertedSequence = "";
374     maxInsertedSequence.resize(max_insertion_length,'A');
375 gpertea 29
376 gpertea 154 lb = possible_insertions.upper_bound(Insertion(reference_id, left_boundary, ""));
377     ub = possible_insertions.upper_bound(Insertion(reference_id, right_boundary, maxInsertedSequence));
378    
379     int reference_mismatch = 0;
380    
381     while (lb != ub && lb != possible_insertions.end())
382     {
383     /*
384     * In the following code, we will check to make sure that the segments have the proper
385     * separation and sequence for the insertions, and generate the appropriate merged bowtie hit
386     * In general, reads with insertions must match the inserted sequence exactly.
387     */
388     if (((int)lb->sequence.size()) == (reversed ? curr_hit->left() - prev_hit->right() : prev_hit->right() - curr_hit->left()))
389     {
390     /*
391     * Check we have enough matched bases on prev or curr segment.
392     */
393     int insert_to_prev_right, curr_left_to_insert;
394     if (reversed)
395     {
396     insert_to_prev_right = lb->left - prev_hit->right();
397     curr_left_to_insert = curr_hit->left() - lb->left;
398     }
399     else
400     {
401     insert_to_prev_right = prev_hit->right() - lb->left - 1;
402     curr_left_to_insert = lb->left - curr_hit->left() + 1;
403     }
404 gpertea 29
405 gpertea 154 if (insert_to_prev_right > prev_right_end_match_length || curr_left_to_insert > curr_left_end_match_length)
406     {
407     ++lb;
408     continue;
409     }
410 gpertea 29
411 gpertea 154 // daehwan
412     if (bDebug)
413     {
414     cout << "insert_to_prev_right: " << insert_to_prev_right << endl;
415     cout << "curr_left_to_insert: " << curr_left_to_insert << endl;
416     cout << "curr_seg_index: " << curr_seg_index << endl;
417     }
418    
419     /*
420     * Keep track of how many mismatches were made to the genome in the region
421     * where we should actually be matching against the insertion
422     */
423     int this_reference_mismatch = 0;
424     int insertion_mismatch = 0;
425     int insertion_len = lb->sequence.length();
426     seqan::Dna5String insertionSequence = seqan::Dna5String(lb->sequence);
427     if (reversed)
428     {
429     seqan::convertInPlace(insertionSequence, seqan::FunctorComplement<seqan::Dna>());
430     seqan::reverseInPlace(insertionSequence);
431     }
432    
433     /*
434     * First check to see if we need to adjust number of observed errors for the left (prev)
435     * hit. This is only the case if this segment runs into the insertion. To be consistent
436     * with bwt_map.cpp, we will not allow a segment to have errors in the insertion region
437     */
438     string colorSegmentSequence_prev;
439     if (insert_to_prev_right > 0)
440     {
441     seqan::Dna5String referenceSequence, oldSegmentSequence;
442 gpertea 29
443 gpertea 154 if (reversed)
444     {
445     referenceSequence = seqan::infix(*ref_str, prev_hit->right() + 1, lb->left + 1);
446     seqan::convertInPlace(referenceSequence, seqan::FunctorComplement<seqan::Dna>());
447     seqan::reverseInPlace(referenceSequence);
448 gpertea 29
449 gpertea 154 string temp;
450 gpertea 29
451 gpertea 154 // daehwan
452     if (bDebug)
453     {
454     cout << "reversed: " << read_seq.length() << " " << read_seq << endl;
455     }
456    
457     temp = read_seq.substr(curr_seg_index * segment_length - insert_to_prev_right, insert_to_prev_right);
458     oldSegmentSequence = seqan::Dna5String(temp);
459     }
460     else
461     {
462     referenceSequence = seqan::infix(*ref_str, lb->left + 1, prev_hit->right());
463 gpertea 29
464 gpertea 154 // daehwan
465     if (bDebug)
466     {
467     cout << "non-reversed: " << prev_hit->seq() << endl;
468     }
469 gpertea 29
470 gpertea 154 oldSegmentSequence = seqan::Dna5String(prev_hit->seq().substr(prev_hit->seq().length() - insert_to_prev_right));
471     }
472    
473     if (color)
474     {
475     string color;
476    
477     if (antisense)
478     color = rev_read_seq.substr(rev_read_seq.length() - (curr_seg_index * segment_length) - insert_to_prev_right - 2, insert_to_prev_right + 1);
479     else
480     color = read_seq.substr(curr_seg_index * segment_length - insert_to_prev_right, insert_to_prev_right + 1);
481    
482     color[0] = prev_hit->seq()[segment_length - insert_to_prev_right - 1];
483     colorSegmentSequence_prev = convert_color_to_bp(color);
484     }
485    
486     const seqan::Dna5String newSegmentSequence = color ? colorSegmentSequence_prev : oldSegmentSequence;
487 gpertea 29
488 gpertea 154 // daehwan
489     if (bDebug)
490     {
491     cout << "ref: " << referenceSequence << endl;
492     cout << "old: " << oldSegmentSequence << endl;
493     cout << "ins: " << insertionSequence << endl;
494     }
495    
496     /*
497     * Scan right in the read until we run out of read
498     */
499     for (int read_index = 0; read_index < insert_to_prev_right; ++read_index)
500     {
501     /*
502     * Any mismatch to the insertion is a failure
503     */
504     if (referenceSequence[read_index] == 'N' || referenceSequence[read_index] != oldSegmentSequence[read_index])
505     {
506     ++this_reference_mismatch;
507     }
508    
509     if (read_index < insertion_len)
510     {
511     if (insertionSequence[read_index] == 'N' || insertionSequence[read_index] != newSegmentSequence[read_index])
512     {
513     ++insertion_mismatch;
514     break;
515     }
516     }
517     else
518     {
519     if (referenceSequence[read_index - insertion_len] == 'N' ||
520     referenceSequence[read_index - insertion_len] != newSegmentSequence[read_index])
521     {
522     --this_reference_mismatch;
523     }
524     }
525     }
526     }
527    
528     string colorSegmentSequence_curr;
529     if (curr_left_to_insert > 0)
530     {
531     seqan::Dna5String referenceSequence, oldSegmentSequence;
532     if (reversed)
533     {
534     referenceSequence = seqan::infix(*ref_str, lb->left + 1, curr_hit->left() + 1);
535     seqan::convertInPlace(referenceSequence, seqan::FunctorComplement<seqan::Dna>());
536     seqan::reverseInPlace(referenceSequence);
537 gpertea 29
538 gpertea 154 string temp = read_seq.substr(curr_seg_index * segment_length, curr_left_to_insert);
539     oldSegmentSequence = seqan::Dna5String(temp);
540     }
541     else
542     {
543     referenceSequence = seqan::infix(*ref_str, curr_hit->left(), lb->left + 1);
544     oldSegmentSequence = seqan::Dna5String(curr_hit->seq().substr(0, curr_left_to_insert));
545     }
546    
547     if (color)
548     {
549     string color;
550     if (antisense)
551     color = rev_read_seq.substr(rev_read_seq.length() - (curr_seg_index * segment_length), curr_left_to_insert);
552     else
553     color = read_seq.substr(curr_seg_index * segment_length + 2, curr_left_to_insert);
554    
555     color.push_back(curr_hit->seq()[curr_left_to_insert]);
556     reverse(color.begin(), color.end());
557     string bp = convert_color_to_bp(color);
558     reverse(bp.begin(), bp.end());
559     colorSegmentSequence_curr = bp;
560     }
561    
562     const seqan::Dna5String newSegmentSequence = color ? colorSegmentSequence_curr : oldSegmentSequence;
563 gpertea 29
564 gpertea 154 // daehwan
565     if (bDebug)
566     {
567     cout << "ref: " << referenceSequence << endl;
568     cout << "old: " << oldSegmentSequence << endl;
569     cout << "ins: " << insertionSequence << endl;
570     }
571    
572     /*
573     * Scan left in the read until
574     * We ran out of read sequence (insertion extends past segment)
575     */
576     for (int read_index = 0; read_index < curr_left_to_insert; ++read_index)
577     {
578     int segmentPosition = curr_left_to_insert - read_index - 1;
579     int insertionPosition = insertion_len - read_index - 1;
580    
581     if (referenceSequence[segmentPosition] == 'N' || (referenceSequence[segmentPosition] != oldSegmentSequence[segmentPosition]))
582     {
583     ++this_reference_mismatch;
584     }
585    
586     if (read_index < insertion_len)
587     {
588     if (insertionSequence[insertionPosition] == 'N' || (insertionSequence[insertionPosition] != newSegmentSequence[segmentPosition]))
589     {
590     ++insertion_mismatch;
591     break;
592     }
593     }
594     else
595     {
596     if (referenceSequence[segmentPosition + insertion_len] == 'N' ||
597     (referenceSequence[segmentPosition + insertion_len] != newSegmentSequence[segmentPosition]))
598     {
599     --this_reference_mismatch;
600     }
601     }
602     }
603     }
604    
605     if (found_closure)
606     {
607     // fprintf(stderr, "Warning: multiple closures found for insertion read # %d\n", (int)insert_id);
608     return BowtieHit();
609     }
610 gpertea 29
611 gpertea 154 if (insertion_mismatch == 0)
612     {
613     reference_mismatch = this_reference_mismatch;
614     mismatch = -reference_mismatch;
615     found_closure = true;
616     new_left = prev_hit->left();
617     new_cigar = prev_hit->cigar();
618    
619     /*
620     * Need to make a new insert operation between the two match character that begin
621     * and end the intersection of these two junction. Note that we necessarily assume
622     * that this insertion can't span beyond the boundaries of these reads. That should
623     * probably be better enforced somewhere
624     */
625 gpertea 29
626 gpertea 154 new_cigar.back().length -= insert_to_prev_right;
627    
628     if (new_cigar.back().length <= 0)
629     new_cigar.pop_back();
630 gpertea 29
631 gpertea 154 if (reversed)
632     new_cigar.push_back(CigarOp(iNS, lb->sequence.size()));
633     else
634     new_cigar.push_back(CigarOp(INS, lb->sequence.size()));
635    
636     vector<CigarOp> new_right_cigar = curr_hit->cigar();
637     new_right_cigar.front().length += (insert_to_prev_right - lb->sequence.size());
638    
639     /*
640     * Finish stitching together the new cigar string
641     */
642     size_t c = new_right_cigar.front().length > 0 ? 0 : 1;
643     for (; c < new_right_cigar.size(); ++c)
644     {
645     new_cigar.push_back(new_right_cigar[c]);
646     }
647    
648     if (color)
649     {
650     if (insert_to_prev_right > 0)
651     seq.replace(first_seg_length + (curr_seg_index - 1) * segment_length - insert_to_prev_right, insert_to_prev_right, colorSegmentSequence_prev);
652    
653     if (curr_left_to_insert > 0)
654     seq.replace(first_seg_length + (curr_seg_index - 1) * segment_length, curr_left_to_insert, colorSegmentSequence_curr);
655     }
656     }
657     }
658     ++lb;
659     }
660    
661     if (!found_closure)
662     {
663     return BowtieHit();
664     }
665     }
666 gpertea 29
667 gpertea 154 /*
668     * Stitch segments together using juctions or deletions if necessary.
669     */
670     else if (dist_btw_two > 0 && dist_btw_two <= max_report_intron_length && prev_hit->antisense_align2() == curr_hit->antisense_align())
671     {
672     std::set<Junction>::iterator lb, ub;
673 gpertea 29
674 gpertea 154 // daehwan
675     if (bDebug)
676     {
677     cout << "junction" << endl;
678     cout << "min: " << left_boundary << "-" << right_boundary - 8 << endl;
679     cout << "max: " << left_boundary + 8 << "-" << right_boundary << endl;
680     }
681 gpertea 29
682 gpertea 154 lb = possible_juncs.upper_bound(Junction(reference_id, left_boundary, right_boundary - 8, true));
683     ub = possible_juncs.lower_bound(Junction(reference_id, left_boundary + 8, right_boundary, false));
684    
685     int new_diff_mismatches = 0xff;
686     while (lb != ub && lb != possible_juncs.end())
687     {
688     int dist_to_left, dist_to_right;
689     if (reversed)
690     {
691     dist_to_left = lb->left - curr_hit->left();
692     dist_to_right = lb->right - prev_hit->right() - 1;
693     }
694     else
695     {
696     dist_to_left = lb->left - prev_hit->right() + 1;
697     dist_to_right = lb->right - curr_hit->left();
698     }
699    
700     if (abs(dist_to_left) <= 4 && abs(dist_to_right) <= 4 && dist_to_left == dist_to_right)
701     {
702     /*
703     * Check we have enough matched bases on prev or curr segment.
704     */
705     if ((reversed && (dist_to_left > prev_right_end_match_length || -dist_to_left > curr_left_end_match_length)) ||
706     (!reversed && (dist_to_left > curr_left_end_match_length || -dist_to_left > prev_right_end_match_length)))
707     {
708     ++lb;
709     continue;
710     }
711 gpertea 29
712 gpertea 154 // daehwan
713     if (bDebug)
714     {
715     cout << "candidate junction: " << endl;
716     cout << "coords: " << lb->left << "-" << lb->right << endl;
717     cout << "dist to left: " << dist_to_left << endl;
718     }
719    
720     Dna5String new_cmp_str, old_cmp_str;
721     int new_mismatch = 0, old_mismatch = 0;
722     string new_patch_str; // this is for colorspace reads
723     if (dist_to_left > 0)
724     {
725     if (reversed)
726     {
727     new_cmp_str = seqan::infix(*ref_str, curr_hit->left() + 1, lb->left + 1);
728     seqan::convertInPlace(new_cmp_str, seqan::FunctorComplement<seqan::Dna>());
729     seqan::reverseInPlace(new_cmp_str);
730    
731     old_cmp_str = seqan::infix(*ref_str, prev_hit->right() + 1, lb->right);
732     seqan::convertInPlace(old_cmp_str, seqan::FunctorComplement<seqan::Dna>());
733     seqan::reverseInPlace(old_cmp_str);
734     }
735     else
736     {
737     new_cmp_str = seqan::infix(*ref_str, prev_hit->right(), lb->left + 1);
738     old_cmp_str = seqan::infix(*ref_str, curr_hit->left(), lb->right);
739     }
740    
741     string new_seq;
742     if (color)
743     {
744     string ref = DnaString_to_string(seqan::infix(*ref_str, prev_hit->right() - 1, lb->left + 1));
745    
746     string color, qual;
747     if (antisense)
748     {
749     color = rev_read_seq.substr(rev_read_seq.length() - (curr_seg_index * segment_length) - 1, dist_to_left);
750     qual = rev_read_quals.substr(rev_read_quals.length() - (curr_seg_index * segment_length) - 1, dist_to_left);
751     }
752     else
753     {
754     color = read_seq.substr(1 + curr_seg_index * segment_length, dist_to_left);
755     qual = read_quals.substr(curr_seg_index * segment_length, dist_to_left);
756     }
757    
758     BWA_decode(color, qual, ref, new_seq);
759     new_seq = new_seq.substr(1);
760     }
761 gpertea 29
762 gpertea 154 string curr_hit_seq;
763     if (reversed)
764     curr_hit_seq = read_seq.substr(curr_seg_index * segment_length - dist_to_left, dist_to_left);
765     else
766     curr_hit_seq = curr_hit->seq();
767    
768     const string& curr_old_seq = curr_hit_seq;
769     const string& curr_seq = color ? new_seq : curr_hit_seq;
770     for (int i = 0; i < dist_to_left; ++i)
771     {
772     if (curr_seq[i] != new_cmp_str[i])
773     ++new_mismatch;
774    
775     if (curr_old_seq[i] != old_cmp_str[i])
776     ++old_mismatch;
777     }
778    
779     if (color)
780     new_patch_str = curr_seq.substr(0, dist_to_left);
781     }
782     else if (dist_to_left < 0)
783     {
784     if (reversed)
785     {
786     new_cmp_str = seqan::infix(*ref_str, lb->right, prev_hit->right() + 1);
787     seqan::convertInPlace(new_cmp_str, seqan::FunctorComplement<seqan::Dna>());
788     seqan::reverseInPlace(new_cmp_str);
789 gpertea 29
790 gpertea 154 old_cmp_str = seqan::infix(*ref_str, lb->left + 1, curr_hit->left() + 1);
791     seqan::convertInPlace(old_cmp_str, seqan::FunctorComplement<seqan::Dna>());
792     seqan::reverseInPlace(old_cmp_str);
793     }
794     else
795     {
796     new_cmp_str = seqan::infix(*ref_str, lb->right, curr_hit->left());
797     old_cmp_str = seqan::infix(*ref_str, lb->left + 1, prev_hit->right());
798     }
799 gpertea 29
800 gpertea 154 size_t abs_dist = -dist_to_left;
801     string new_seq;
802     if (color)
803     {
804     string ref = DnaString_to_string(seqan::infix(*ref_str, lb->left, lb->left + 1));
805     ref += DnaString_to_string(seqan::infix(*ref_str, lb->right, curr_hit->left()));
806    
807     string color, qual;
808     if (antisense)
809     {
810     color = rev_read_seq.substr(rev_read_seq.length() - (curr_seg_index * segment_length) - 1 - abs_dist, abs_dist);
811     qual = rev_read_quals.substr(rev_read_quals.length() - (curr_seg_index * segment_length) - 1 - abs_dist, abs_dist);
812     }
813     else
814     {
815     color = read_seq.substr(1 + curr_seg_index * segment_length - abs_dist, abs_dist);
816     qual = read_quals.substr(curr_seg_index * segment_length - abs_dist, abs_dist);
817     }
818    
819     BWA_decode(color, qual, ref, new_seq);
820     new_seq = new_seq.substr(1);
821     }
822 gpertea 29
823 gpertea 154 string prev_hit_seq;
824     if (reversed)
825     prev_hit_seq = read_seq.substr(curr_seg_index * segment_length, abs_dist);
826     else
827     prev_hit_seq = prev_hit->seq();
828 gpertea 29
829 gpertea 154 // daehwan
830     if (bDebug)
831     {
832     cout << "reverse: " << (int)reversed << endl;
833     cout << "new cmp str: " << new_cmp_str << endl;
834     cout << "old cmp str: " << old_cmp_str << endl;
835     cout << "hit seq: " << prev_hit_seq << endl;
836     cout << "curr seq: " << curr_hit->seq() << endl;
837 gpertea 29
838 gpertea 154 cout << read_seq
839     << endl;
840     cout << read_seq.substr(first_seg_length + (curr_seg_index - 1) * segment_length, segment_length)
841     << endl;
842     }
843    
844     const string& prev_old_seq = prev_hit_seq;
845     size_t prev_old_seq_len = prev_old_seq.length();
846     const string& prev_seq = color ? new_seq : prev_hit_seq;
847     size_t prev_seq_len = prev_seq.length();
848     for (size_t i = 0; i < abs_dist; ++i)
849     {
850     if (prev_seq[prev_seq_len - (abs_dist - i)] != new_cmp_str[i])
851     ++new_mismatch;
852     if (prev_old_seq[prev_old_seq_len - (abs_dist - i)] != old_cmp_str[i])
853     ++old_mismatch;
854     }
855 gpertea 29
856 gpertea 154 if (color)
857     new_patch_str = prev_seq.substr(prev_seq_len - abs_dist, abs_dist);
858     }
859    
860     int temp_diff_mismatches = new_mismatch - old_mismatch;
861 gpertea 29
862 gpertea 154 // daehwan
863     if (bDebug)
864     {
865     cout << "new mismatch: " << new_mismatch << endl;
866     cout << "old mismatch: " << old_mismatch << endl;
867     cout << "new_diff_mismatch: " << new_diff_mismatches << endl;
868     cout << "temp mismatch: " << temp_diff_mismatches << endl;
869     }
870    
871     if (temp_diff_mismatches >= new_diff_mismatches || new_mismatch >= 2)
872     {
873     ++lb;
874     continue;
875     }
876 gpertea 29
877 gpertea 154 if (color)
878     {
879     /*
880     * We need to recover the origianl sequence.
881     */
882     if (found_closure)
883     {
884     seq.replace(first_seg_length + (curr_seg_index - 1) * segment_length - 4, 8,
885     prev_hit->seq().substr(prev_hit->seq().length() - 4) + curr_hit->seq().substr(0, 4));
886     }
887    
888     if (dist_to_left > 0)
889     seq.replace(first_seg_length + (curr_seg_index - 1) * segment_length, dist_to_left, new_patch_str);
890     else if (dist_to_left < 0)
891     seq.replace(first_seg_length + (curr_seg_index - 1) * segment_length + dist_to_left, -dist_to_left, new_patch_str);
892     }
893    
894     new_diff_mismatches = temp_diff_mismatches;
895    
896     new_left = prev_hit->left();
897     new_cigar = prev_hit->cigar();
898    
899     int new_left_back_len = new_cigar.back().length;
900     if (reversed)
901     new_left_back_len -= dist_to_left;
902     else
903     new_left_back_len += dist_to_left;
904    
905     vector<CigarOp> new_right_cig = curr_hit->cigar();
906     int new_right_front_len = new_right_cig.front().length;
907     if (reversed)
908     new_right_front_len += dist_to_right;
909     else
910     new_right_front_len -= dist_to_right;
911     if (new_left_back_len > 0)
912     new_cigar.back().length = new_left_back_len;
913     else
914     new_cigar.pop_back();
915    
916     /*
917     * FIXME, currently just differentiating between a deletion and a
918     * reference skip based on length. However, would probably be better
919     * to denote the difference explicitly, this would allow the user
920     * to supply their own (very large) deletions
921     */
922     if ((lb->right - lb->left - 1) <= max_deletion_length)
923     {
924     if (reversed)
925     new_cigar.push_back(CigarOp(dEL, lb->right - lb->left - 1));
926     else
927     new_cigar.push_back(CigarOp(DEL, lb->right - lb->left - 1));
928     antisense_closure = prev_hit->is_spliced() ? prev_hit->antisense_splice() : curr_hit->antisense_splice();
929     }
930     else
931     {
932     if (reversed)
933     new_cigar.push_back(CigarOp(rEF_SKIP, lb->right - lb->left - 1));
934     else
935     new_cigar.push_back(CigarOp(REF_SKIP, lb->right - lb->left - 1));
936     antisense_closure = lb->antisense;
937     }
938    
939     new_right_cig.front().length = new_right_front_len;
940     size_t c = new_right_front_len > 0 ? 0 : 1;
941     for (; c < new_right_cig.size(); ++c)
942     new_cigar.push_back(new_right_cig[c]);
943    
944     mismatch = new_diff_mismatches;
945     found_closure = true;
946     }
947     ++lb;
948     }
949 gpertea 29
950 gpertea 154 if (!found_closure)
951     {
952     return BowtieHit();
953     }
954     }
955     else if (!(dist_btw_two == 0 && prev_hit->antisense_align2() == curr_hit->antisense_align()))
956     check_fusion = true;
957     }
958 gpertea 29
959 gpertea 154 if (check_fusion)
960     {
961     std::set<Fusion>::iterator lb, ub;
962 gpertea 29
963 gpertea 154 uint32_t ref_id1 = prev_hit->ref_id2();
964     uint32_t ref_id2 = curr_hit->ref_id();
965     uint32_t left = prev_hit->right() - 4;
966     uint32_t right = curr_hit->left() - 4;
967 gpertea 29
968 gpertea 154 // daehwan
969     if (bDebug)
970     {
971     cout << "daehwan - start - fusion" << endl
972     << "ref_id1: " << ref_id1 << endl
973     << "ref_id2: " << ref_id2 << endl
974     << "left: " << left << endl
975     << "right: " << right << endl
976     << "dir: " << fusion_dir << endl;
977     }
978 gpertea 29
979 gpertea 154 bool reversed = false;
980     if (fusion_dir != FUSION_FF &&
981     (ref_id2 < ref_id1 || (ref_id1 == ref_id2 && left > right)))
982     {
983     reversed = true;
984    
985     uint32_t temp = ref_id1;
986     ref_id1 = ref_id2;
987     ref_id2 = temp;
988    
989     temp = left;
990     left = right;
991     right = temp;
992     }
993    
994     lb = possible_fusions.upper_bound(Fusion(ref_id1, ref_id2, left, right));
995     ub = possible_fusions.lower_bound(Fusion(ref_id1, ref_id2, left + 8, right + 8));
996 gpertea 29
997 gpertea 154 RefSequenceTable::Sequence* ref_str = rt.get_seq(prev_hit->ref_id2());
998     RefSequenceTable::Sequence* ref_str2 = rt.get_seq(curr_hit->ref_id());
999    
1000     int new_diff_mismatches = 0xff;
1001     while (lb != ub && lb != possible_fusions.end())
1002     {
1003     int lb_left = lb->left;
1004     int lb_right = lb->right;
1005    
1006     if (reversed)
1007     {
1008     lb_left = lb->right;
1009     lb_right = lb->left;
1010     }
1011 gpertea 29
1012 gpertea 154 int dist_to_left, dist_to_right;
1013     if (fusion_dir == FUSION_RF)
1014     dist_to_left = prev_hit->right() - lb_left + 1;
1015     else
1016     dist_to_left = lb_left - prev_hit->right() + 1;
1017    
1018     if (fusion_dir == FUSION_FR)
1019     dist_to_right = curr_hit->left() - lb_right;
1020     else
1021     dist_to_right = lb_right - curr_hit->left();
1022 gpertea 29
1023 gpertea 154 // daehwan
1024     if (bDebug)
1025     {
1026     cout << "daehwan - fusion gap" << endl;
1027     cout << "dist left: " << dist_to_left << endl;
1028     cout << "dist right: " << dist_to_right << endl;
1029     }
1030    
1031     if (abs(dist_to_left) <= 4 && abs(dist_to_right) <= 4 && dist_to_left == dist_to_right)
1032     {
1033     /*
1034     * Check we have enough matched bases on prev or curr segment.
1035     */
1036     if (dist_to_left > curr_left_end_match_length || -dist_to_left > prev_right_end_match_length)
1037     {
1038     ++lb;
1039     continue;
1040     }
1041 gpertea 29
1042 gpertea 154 Dna5String new_cmp_str, old_cmp_str;
1043     int new_mismatch = 0, old_mismatch = 0;
1044     string new_patch_str; // this is for colorspace reads
1045     if (dist_to_left > 0)
1046     {
1047     if (fusion_dir == FUSION_RF)
1048     {
1049     new_cmp_str = seqan::infix(*ref_str, lb_left, prev_hit->right() + 1);
1050     seqan::convertInPlace(new_cmp_str, seqan::FunctorComplement<seqan::Dna>());
1051     seqan::reverseInPlace(new_cmp_str);
1052     }
1053     else
1054     new_cmp_str = seqan::infix(*ref_str, prev_hit->right(), lb_left + 1);
1055 gpertea 29
1056 gpertea 154 if (fusion_dir == FUSION_FR)
1057     {
1058     old_cmp_str = seqan::infix(*ref_str2, lb_right + 1, curr_hit->left() + 1);
1059     seqan::convertInPlace(old_cmp_str, seqan::FunctorComplement<seqan::Dna>());
1060     seqan::reverseInPlace(old_cmp_str);
1061     }
1062     else
1063     old_cmp_str = seqan::infix(*ref_str2, curr_hit->left(), lb_right);
1064 gpertea 29
1065 gpertea 154 // daehwan
1066     if (bDebug)
1067     {
1068     cout << "new str: " << new_cmp_str << endl;
1069     cout << "old str: " << old_cmp_str << endl;
1070     cout << "curr seq: " << curr_hit->seq() << endl;
1071     }
1072 gpertea 29
1073 gpertea 154 string curr_hit_seq;
1074     if (fusion_dir == FUSION_FF || fusion_dir == FUSION_RR)
1075     curr_hit_seq = curr_hit->seq();
1076     else
1077     curr_hit_seq = read_seq.substr(curr_seg_index * segment_length, segment_length);
1078    
1079     string new_seq;
1080     const string& curr_old_seq = curr_hit_seq;
1081     const string& curr_seq = color ? new_seq : curr_hit_seq;
1082     for (int i = 0; i < dist_to_left; ++i)
1083     {
1084     if (curr_seq[i] != new_cmp_str[i])
1085     ++new_mismatch;
1086    
1087     if (curr_old_seq[i] != old_cmp_str[i])
1088     ++old_mismatch;
1089     }
1090     }
1091     else if (dist_to_left < 0)
1092     {
1093     if (fusion_dir == FUSION_FR)
1094     {
1095     new_cmp_str = seqan::infix(*ref_str2, curr_hit->left() + 1, lb_right + 1);
1096     seqan::convertInPlace(new_cmp_str, seqan::FunctorComplement<seqan::Dna>());
1097     seqan::reverseInPlace(new_cmp_str);
1098     }
1099     else
1100     new_cmp_str = seqan::infix(*ref_str2, lb_right, curr_hit->left());
1101 gpertea 29
1102 gpertea 154 if (fusion_dir == FUSION_RF)
1103     {
1104     old_cmp_str = seqan::infix(*ref_str, prev_hit->right() + 1, lb_left);
1105     seqan::convertInPlace(old_cmp_str, seqan::FunctorComplement<seqan::Dna>());
1106     seqan::reverseInPlace(old_cmp_str);
1107     }
1108     else
1109     old_cmp_str = seqan::infix(*ref_str, lb_left + 1, prev_hit->right());
1110 gpertea 29
1111 gpertea 154 string prev_hit_seq;
1112     if (fusion_dir == FUSION_FF || fusion_dir == FUSION_RR)
1113     prev_hit_seq = prev_hit->seq();
1114     else
1115     prev_hit_seq = read_seq.substr((curr_seg_index - 1) * segment_length, segment_length);
1116 gpertea 29
1117 gpertea 154 size_t abs_dist = -dist_to_left;
1118     string new_seq;
1119    
1120     const string& prev_old_seq = prev_hit_seq;
1121     size_t prev_old_seq_len = prev_old_seq.length();
1122     const string& prev_seq = color ? new_seq : prev_hit_seq;
1123     size_t prev_seq_len = prev_seq.length();
1124     for (size_t i = 0; i < abs_dist; ++i)
1125     {
1126     if (prev_seq[prev_seq_len - (abs_dist - i)] != new_cmp_str[i])
1127     ++new_mismatch;
1128     if (prev_old_seq[prev_old_seq_len - (abs_dist - i)] != old_cmp_str[i])
1129     ++old_mismatch;
1130     }
1131     }
1132 gpertea 29
1133 gpertea 154 int temp_diff_mismatches = new_mismatch - old_mismatch;
1134     if (temp_diff_mismatches >= new_diff_mismatches || new_mismatch >= 2)
1135     {
1136     ++lb;
1137     continue;
1138     }
1139     new_diff_mismatches = temp_diff_mismatches;
1140 gpertea 29
1141 gpertea 154 new_left = prev_hit->left();
1142     new_cigar = prev_hit->cigar();
1143    
1144     int new_left_back_len = new_cigar.back().length;
1145     new_left_back_len += dist_to_left;
1146    
1147     vector<CigarOp> new_right_cig = curr_hit->cigar();
1148     int new_right_front_len = new_right_cig.front().length;
1149     new_right_front_len -= dist_to_right;
1150     if (new_left_back_len > 0)
1151     new_cigar.back().length = new_left_back_len;
1152     else
1153     new_cigar.pop_back();
1154 gpertea 29
1155 gpertea 154 new_cigar.push_back(CigarOp((CigarOpCode)fusion_dir, lb_right));
1156     antisense_closure = prev_hit->is_spliced() ? prev_hit->antisense_splice() : curr_hit->antisense_splice();
1157    
1158     new_right_cig.front().length = new_right_front_len;
1159     size_t c = new_right_front_len > 0 ? 0 : 1;
1160     for (; c < new_right_cig.size(); ++c)
1161     new_cigar.push_back(new_right_cig[c]);
1162    
1163     mismatch = new_diff_mismatches;
1164     found_closure = true;
1165     ++num_fusions;
1166 gpertea 29
1167 gpertea 154 // daehwan
1168     if (bDebug)
1169     {
1170     cout << "daehwan - fusion gap - found" << endl;
1171     }
1172 gpertea 29
1173 gpertea 154 }
1174     ++lb;
1175     }
1176 gpertea 29
1177 gpertea 154 // daehwan
1178     if (bDebug)
1179     {
1180     cout << "daehwan2 - end - fusion: " << (found_closure ? "found" : "not found") << endl;
1181     }
1182    
1183     if (!found_closure)
1184     {
1185     return BowtieHit();
1186     }
1187     }
1188 gpertea 29
1189 gpertea 154 if (found_closure)
1190     {
1191     bool end = false;
1192     BowtieHit merged_hit(prev_hit->ref_id(),
1193     curr_hit->ref_id2(),
1194     insert_id,
1195     new_left,
1196     new_cigar,
1197     antisense,
1198     antisense_closure,
1199     prev_hit->edit_dist() + curr_hit->edit_dist() + mismatch,
1200     prev_hit->splice_mms() + curr_hit->splice_mms(),
1201     end);
1202 gpertea 29
1203 gpertea 154 // daehwan - should fix this for SOLiD dataset
1204     merged_hit.seq(prev_hit->seq() + curr_hit->seq());
1205 gpertea 29
1206 gpertea 154 // daehwan
1207     if (bDebug)
1208     {
1209     cout << "fusing of " << merged_hit.left() << " and " << merged_hit.right() << endl;
1210     cout << print_cigar(merged_hit.cigar()) << endl;
1211     if (!merged_hit.check_editdist_consistency(rt))
1212     {
1213     cout << "btw " << print_cigar(prev_hit->cigar()) << " and " << print_cigar(curr_hit->cigar()) << endl;
1214     cout << "this is malformed hit" << endl;
1215     }
1216     }
1217 gpertea 29
1218 gpertea 154 prev_hit = hit_chain.erase(prev_hit, ++curr_hit);
1219     /*
1220     * prev_hit now points PAST the last element removed
1221     */
1222     prev_hit = hit_chain.insert(prev_hit, merged_hit);
1223     /*
1224     * merged_hit has been inserted before the old position of
1225     * prev_hit. New location of prev_hit is merged_hit
1226     */
1227     curr_hit = prev_hit;
1228     ++curr_hit;
1229     ++curr_seg_index;
1230     continue;
1231     }
1232 gpertea 29
1233 gpertea 154 // daehwan
1234     if (bDebug)
1235     {
1236     cout << "daehwan - test 0.3" << endl;
1237     }
1238    
1239 gpertea 29 ++prev_hit;
1240     ++curr_hit;
1241     ++curr_seg_index;
1242     }
1243    
1244 gpertea 154 // daehwan
1245     if (bDebug)
1246     {
1247     cout << "daehwan - test2" << endl;
1248     }
1249    
1250 gpertea 29 bool saw_antisense_splice = false;
1251     bool saw_sense_splice = false;
1252     vector<CigarOp> long_cigar;
1253     int num_mismatches = 0;
1254     int num_splice_mms = 0;
1255     for (list<BowtieHit>::iterator s = hit_chain.begin(); s != hit_chain.end(); ++s)
1256     {
1257     num_mismatches += s->edit_dist();
1258     num_splice_mms += s->splice_mms();
1259    
1260     /*
1261     * Check whether the sequence contains any reference skips. Previously,
1262     * this was just a check to see whether the sequence was contiguous; however
1263     * we don't want to count an indel event as a splice
1264     */
1265     bool containsSplice = s->is_spliced();
1266     if (containsSplice)
1267 gpertea 154 {
1268     if (s->antisense_splice())
1269     {
1270     if (saw_sense_splice)
1271     return BowtieHit();
1272     saw_antisense_splice = true;
1273     }
1274     else
1275     {
1276     if (saw_antisense_splice)
1277     return BowtieHit();
1278     saw_sense_splice = true;
1279     }
1280     }
1281 gpertea 29 const vector<CigarOp>& cigar = s->cigar();
1282     if (long_cigar.empty())
1283 gpertea 154 {
1284     long_cigar = cigar;
1285     }
1286 gpertea 29 else
1287 gpertea 154 {
1288     CigarOp& last = long_cigar.back();
1289     /*
1290     * If necessary, merge the back and front
1291     * cigar operations
1292     */
1293     if(last.opcode == cigar[0].opcode){
1294     last.length += cigar[0].length;
1295     for (size_t b = 1; b < cigar.size(); ++b)
1296     {
1297     long_cigar.push_back(cigar[b]);
1298     }
1299     }else{
1300     for(size_t b = 0; b < cigar.size(); ++b)
1301     {
1302     long_cigar.push_back(cigar[b]);
1303     }
1304     }
1305     }
1306 gpertea 29 }
1307    
1308     bool end = false;
1309 gpertea 154 BowtieHit new_hit(hit_chain.front().ref_id(),
1310     hit_chain.back().ref_id2(),
1311     insert_id,
1312     left,
1313     long_cigar,
1314     antisense,
1315     saw_antisense_splice,
1316     num_mismatches,
1317     num_splice_mms,
1318     end);
1319 gpertea 29
1320 gpertea 154 if (fusion_dir == FUSION_NOTHING || fusion_dir == FUSION_FF || fusion_dir == FUSION_RR)
1321     {
1322     new_hit.seq(seq);
1323     new_hit.qual(qual);
1324     }
1325     else
1326     {
1327     new_hit.seq(read_seq);
1328     new_hit.qual(read_quals);
1329     }
1330 gpertea 29
1331 gpertea 154 bool do_reverse = new_hit.ref_id() > new_hit.ref_id2();
1332     if (new_hit.ref_id() == new_hit.ref_id2())
1333     {
1334     vector<Fusion> fusions;
1335     bool auto_sort = false;
1336     fusions_from_spliced_hit(new_hit, fusions, auto_sort);
1337     if (fusions.size() > 0)
1338     {
1339     const Fusion& fusion = fusions[0];
1340     do_reverse = fusion.left > fusion.right;
1341     }
1342     }
1343    
1344     if (do_reverse)
1345     {
1346     new_hit = new_hit.reverse();
1347     }
1348    
1349     if (fusion_dir == FUSION_RF || fusion_dir == FUSION_RR)
1350     {
1351     new_hit.antisense_align(!new_hit.antisense_align());
1352     }
1353    
1354     // daehwan
1355     if (bDebug)
1356     {
1357     cout << "daehwan - test3" << endl;
1358     cout << new_hit.left() << " " << print_cigar(new_hit.cigar()) << endl;
1359     cout << new_hit.ref_id() << "-" << new_hit.ref_id2() << ": " << new_hit.fusion_opcode() << endl;
1360     }
1361    
1362 gpertea 29 int new_read_len = new_hit.read_len();
1363     if (new_read_len != old_read_length || !new_hit.check_editdist_consistency(rt))
1364     {
1365 gpertea 154 // daehwan
1366     if (bDebug)
1367     {
1368     cout << "Warning: " << new_hit.insert_id() << " malformed closure: " << print_cigar(new_hit.cigar()) << endl;
1369     exit(1);
1370     }
1371    
1372     fprintf(stderr, "Warning: %d malformed closure\n", new_hit.insert_id());
1373 gpertea 29 return BowtieHit();
1374     }
1375    
1376     return new_hit;
1377     }
1378    
1379 gpertea 154
1380 gpertea 29 int multi_closure = 0;
1381     int anchor_too_short = 0;
1382     int gap_too_short = 0;
1383    
1384     bool valid_hit(const BowtieHit& bh)
1385     {
1386     if (bh.insert_id())
1387 gpertea 154 {
1388     /*
1389     * validate the cigar chain - no gaps shorter than an intron, etc.
1390     * also,
1391     * -Don't start or end with an indel or refskip
1392     * -Only a match operation is allowed is allowed
1393     * adjacent to an indel or refskip
1394     * -Indels should confrom to length restrictions
1395     */
1396     const CigarOp* prevCig = &(bh.cigar()[0]);
1397     const CigarOp* currCig = &(bh.cigar()[1]);
1398     for (size_t i = 1; i < bh.cigar().size(); ++i){
1399     currCig = &(bh.cigar()[i]);
1400     if(!(currCig->opcode == MATCH || currCig->opcode == mATCH) &&
1401     !(prevCig->opcode == MATCH || prevCig->opcode == mATCH)){
1402     return false;
1403     }
1404     if(currCig->opcode == INS || currCig->opcode == iNS){
1405     if(currCig->length > max_insertion_length){
1406     return false;
1407     }
1408     }
1409     if(currCig->opcode == DEL || currCig->opcode == dEL){
1410     if(currCig->length > max_deletion_length){
1411     return false;
1412     }
1413     }
1414     if(currCig->opcode == REF_SKIP || currCig->opcode == rEF_SKIP){
1415     if(currCig->length < (uint64_t)min_report_intron_length){
1416     gap_too_short++;
1417     return false;
1418     }
1419     }
1420     prevCig = currCig;
1421 gpertea 29 }
1422 gpertea 154 if (!(bh.cigar().front().opcode == MATCH || bh.cigar().front().opcode == mATCH) ||
1423     !(bh.cigar().back().opcode == MATCH || bh.cigar().back().opcode == mATCH)/* ||
1424     (int)bh.cigar().front().length < min_anchor_len||
1425     (int)bh.cigar().back().length < min_anchor_len*/ )
1426     {
1427     anchor_too_short++;
1428     return false;
1429     }
1430 gpertea 29 }
1431 gpertea 154 else
1432 gpertea 29 {
1433 gpertea 154 multi_closure++;
1434 gpertea 29 return false;
1435     }
1436 gpertea 154
1437 gpertea 29 return true;
1438     }
1439    
1440     void merge_segment_chain(RefSequenceTable& rt,
1441 gpertea 154 const string& read_seq,
1442     const string& read_quals,
1443     std::set<Junction>& possible_juncs,
1444     std::set<Insertion>& possible_insertions,
1445     std::set<Fusion>& possible_fusions,
1446     vector<BowtieHit>& hits,
1447     vector<BowtieHit>& merged_hits,
1448     int fusion_dir = FUSION_NOTHING)
1449 gpertea 29 {
1450     if (hits.size() == 0)
1451     return;
1452    
1453     BowtieHit bh;
1454     if (hits.size() > 1)
1455     {
1456     list<BowtieHit> hit_chain;
1457 gpertea 154 if (fusion_dir == FUSION_NOTHING || fusion_dir == FUSION_FF || fusion_dir == FUSION_RR)
1458     {
1459     if (hits.front().antisense_align())
1460     copy(hits.rbegin(), hits.rend(), back_inserter(hit_chain));
1461     else
1462     copy(hits.begin(), hits.end(), back_inserter(hit_chain));
1463     }
1464 gpertea 29 else
1465 gpertea 154 {
1466     bool bSawFusion = false;
1467     for (size_t i = 0; i < hits.size(); ++i)
1468     {
1469     bool pushed = false;
1470     if (!bSawFusion)
1471     {
1472     if (i > 0)
1473     {
1474     if (hits[i-1].ref_id() != hits[i].ref_id())
1475     bSawFusion = true;
1476     else if(hits[i-1].antisense_align() != hits[i].antisense_align())
1477     bSawFusion = true;
1478     else
1479     {
1480     int dist = 0;
1481     if (hits[i].antisense_align())
1482     dist = hits[i-1].left() - hits[i].right();
1483     else
1484     dist = hits[i].left() - hits[i-1].right();
1485 gpertea 29
1486 gpertea 154 if (dist >= max_report_intron_length || dist < -(int)max_insertion_length)
1487     bSawFusion = true;
1488     }
1489     }
1490     }
1491    
1492     if (hits[i].fusion_opcode() == FUSION_NOTHING &&
1493     ((fusion_dir == FUSION_FR && bSawFusion) || (fusion_dir == FUSION_RF && !bSawFusion)) &&
1494     hits[i].left() < hits[i].right())
1495     {
1496     hit_chain.push_back(hits[i].reverse());
1497     pushed = true;
1498     }
1499    
1500     if (i > 0 &&
1501     hits[i].fusion_opcode() != FUSION_NOTHING &&
1502     hits[i].ref_id() != hits[i-1].ref_id())
1503     {
1504     hit_chain.push_back(hits[i].reverse());
1505     pushed = true;
1506     }
1507    
1508     if (!bSawFusion)
1509     {
1510     if (hits[i].fusion_opcode() != FUSION_NOTHING)
1511     bSawFusion = true;
1512     }
1513    
1514     if (!pushed)
1515     hit_chain.push_back(hits[i]);
1516     }
1517     }
1518    
1519     bh = merge_chain(rt,
1520     read_seq,
1521     read_quals,
1522     possible_juncs,
1523     possible_insertions,
1524     possible_fusions,
1525     hit_chain,
1526     fusion_dir);
1527 gpertea 29 }
1528     else
1529     {
1530     bh = hits[0];
1531     }
1532    
1533     if (valid_hit(bh))
1534     merged_hits.push_back(bh);
1535     }
1536    
1537     bool dfs_seg_hits(RefSequenceTable& rt,
1538 gpertea 154 const string& read_seq,
1539     const string& read_quals,
1540     std::set<Junction>& possible_juncs,
1541     std::set<Insertion>& possible_insertions,
1542     std::set<Fusion>& possible_fusions,
1543     vector<HitsForRead>& seg_hits_for_read,
1544     size_t curr,
1545     vector<BowtieHit>& seg_hit_stack,
1546     vector<BowtieHit>& joined_hits,
1547     int fusion_dir = FUSION_NOTHING)
1548 gpertea 29 {
1549     assert (!seg_hit_stack.empty());
1550     bool join_success = false;
1551    
1552     if (curr < seg_hits_for_read.size())
1553     {
1554 gpertea 154 for (size_t i = 0; i < seg_hits_for_read[curr].hits.size(); ++i)
1555     {
1556     /*
1557     * As we reverse segments depending on directions like FR or RF,
1558     * it's necessary to recover the original segments.
1559     */
1560     BowtieHit bh = seg_hits_for_read[curr].hits[i];
1561     BowtieHit bh_prev = seg_hit_stack.back();
1562    
1563     BowtieHit* prevHit = &bh_prev;
1564     BowtieHit* currHit = &bh;
1565 gpertea 29
1566 gpertea 154 // daehwan
1567     // if (bh.insert_id() == 11921733)
1568     // bDebug = true;
1569 gpertea 29
1570 gpertea 154 /*
1571     * Each segment has at most one fusion by assumption,
1572     */
1573     bool prevHit_fused = prevHit->fusion_opcode() != FUSION_NOTHING;
1574     bool currHit_fused = currHit->fusion_opcode() != FUSION_NOTHING;
1575 gpertea 29
1576 gpertea 154 /*
1577     * Count the number of fusions on prev and curr segments,
1578     * but this doesn't take into account the gap (if exists) between the two segments.
1579     */
1580     size_t num_fusions = prevHit_fused ? 1 : 0;
1581     num_fusions += currHit_fused ? 1 : 0;
1582 gpertea 29
1583 gpertea 154 int dir = prevHit_fused ? prevHit->fusion_opcode() : currHit->fusion_opcode();
1584 gpertea 29
1585 gpertea 154 /*
1586     * We don't allow reads that span more than two fusion points.
1587     */
1588     if (num_fusions >= 2)
1589     continue;
1590 gpertea 29
1591 gpertea 154 if (fusion_dir != FUSION_NOTHING && currHit_fused)
1592     continue;
1593    
1594     if (fusion_dir == FUSION_FF || fusion_dir == FUSION_RR)
1595     {
1596     if ((currHit->antisense_align() && currHit->ref_id() != prevHit->ref_id()) ||
1597     (!currHit->antisense_align() && currHit->ref_id() != prevHit->ref_id2()))
1598     continue;
1599     }
1600    
1601     if (bDebug)
1602     {
1603     cout << "daehwan - prev ref: " << prevHit->ref_id() << "-" << prevHit->ref_id2() << ": "
1604     << print_cigar(prevHit->cigar()) << endl;
1605     cout << "daehwan - prev sense: "
1606     << (prevHit->antisense_align() ? "-" : "+")
1607     << "\t"
1608     << (prevHit->antisense_align2() ? "-" : "+")
1609     << endl;
1610     cout << "daehwan - prev coords: "
1611     << prevHit->left()
1612     << "\t"
1613     << prevHit->right()
1614     << endl;
1615    
1616     cout << "daehwan - curr ref: " << currHit->ref_id() << "-" << currHit->ref_id2() << ": "
1617     << print_cigar(currHit->cigar()) << endl;
1618     cout << "daehwan - curr sense: "
1619     << (currHit->antisense_align() ? "-" : "+")
1620     << "\t"
1621     << (currHit->antisense_align2() ? "-" : "+")
1622     << endl;
1623     cout << "daehwan - curr corrds: "
1624     << currHit->left()
1625     << "\t"
1626     << currHit->right()
1627     << endl;
1628     }
1629    
1630     if ((fusion_dir == FUSION_FR || fusion_dir == FUSION_RF) &&
1631     prevHit->ref_id2() != currHit->ref_id())
1632     continue;
1633    
1634     if ((fusion_dir == FUSION_FR && !currHit->antisense_align()) ||
1635     (fusion_dir == FUSION_RF && currHit->antisense_align()))
1636     continue;
1637    
1638     if (currHit_fused && dir == FUSION_RR)
1639     *currHit = currHit->reverse();
1640    
1641     if (fusion_dir == FUSION_FR || fusion_dir == FUSION_RF ||
1642     (currHit_fused && currHit->ref_id() == currHit->ref_id2() && (dir == FUSION_FR || dir == FUSION_RF)))
1643     {
1644     if (currHit_fused)
1645     {
1646     if ((dir == FUSION_FR && currHit->antisense_align()) ||
1647     (dir == FUSION_RF && !currHit->antisense_align()))
1648     *currHit = currHit->reverse();
1649     }
1650     else
1651     {
1652     if (fusion_dir == FUSION_FR && currHit->antisense_align())
1653     *currHit = currHit->reverse();
1654     }
1655     }
1656     /*
1657     * Switch prevHit and currHit in FUSION_NOTHING and FUSION_FF cases
1658     * to make it easier to check the distance in the gap between the two segments.
1659     */
1660     else if ((num_fusions == 0 && prevHit->antisense_align() && currHit->antisense_align() && prevHit->ref_id() == currHit->ref_id() && prevHit->left() <= currHit->right() + max_report_intron_length && prevHit->left() + max_insertion_length >= currHit->right()) ||
1661     (num_fusions == 1 && (dir == FUSION_FF || dir == FUSION_RR)&&
1662     ((!prevHit_fused && prevHit->antisense_align()) || (!currHit_fused && currHit->antisense_align())))
1663     )
1664     {
1665     BowtieHit* tempHit = prevHit;
1666     prevHit = currHit;
1667     currHit = tempHit;
1668     }
1669     else if (num_fusions == 0)
1670     {
1671     if (prevHit->ref_id2() == currHit->ref_id() && prevHit->antisense_align() == currHit->antisense_align())
1672     {
1673     int dist = 0;
1674     if (prevHit->antisense_align())
1675     dist = prevHit->left() - currHit->right();
1676     else
1677     dist = currHit->left() - prevHit->right();
1678    
1679     if (dist > max_report_intron_length || dist < -(int)max_insertion_length)
1680     {
1681     if ((prevHit->antisense_align() && prevHit->left() > currHit->left()) ||
1682     (!prevHit->antisense_align() && prevHit->left() < currHit->left()))
1683     dir = FUSION_FF;
1684     else
1685     dir = FUSION_RR;
1686     }
1687     }
1688     else
1689     {
1690     if (prevHit->antisense_align() == currHit->antisense_align())
1691     {
1692     if ((prevHit->antisense_align() && prevHit->ref_id() > currHit->ref_id()) ||
1693     (!prevHit->antisense_align() && prevHit->ref_id() < currHit->ref_id()))
1694     dir = FUSION_FF;
1695     else
1696     dir = FUSION_RR;
1697     }
1698     else if (!prevHit->antisense_align())
1699     dir = FUSION_FR;
1700     else
1701     dir = FUSION_RF;
1702    
1703     if (dir == FUSION_FR)
1704     *currHit = currHit->reverse();
1705     else if(dir == FUSION_RF)
1706     *prevHit = prevHit->reverse();
1707     }
1708     }
1709    
1710     // daehwan - test
1711     if (bDebug)
1712     {
1713     cout << "insert id: " << prevHit->insert_id() << endl;
1714     cout << "(" << curr - 1 << ") prev: " << prevHit->seq() << " : " << (prevHit->fusion_opcode() != FUSION_NOTHING ? "fused" : "no") << endl;
1715     cout << "(" << curr << ") curr: " << currHit->seq() << " : " << (currHit->fusion_opcode() != FUSION_NOTHING ? "fused" : "no") << endl;
1716     cout << "prev ref: " << prevHit->ref_id() << "-" << prevHit->ref_id2() << ": " << print_cigar(prevHit->cigar()) << endl;
1717     cout << "curr ref: " << currHit->ref_id() << "-" << currHit->ref_id2() << ": " << print_cigar(currHit->cigar()) << endl;
1718     cout << "prev coords: "
1719     << prevHit->left()
1720     << "\t"
1721     << prevHit->right()
1722     << endl;
1723     cout << "curr corrds: "
1724     << currHit->left()
1725     << "\t"
1726     << currHit->right()
1727     << endl;
1728     cout << "prev sense: "
1729     << (prevHit->antisense_align() ? "-" : "+")
1730     << "\t"
1731     << (prevHit->antisense_align2() ? "-" : "+")
1732     << endl;
1733     cout << "curr sense: "
1734     << (currHit->antisense_align() ? "-" : "+")
1735     << "\t"
1736     << (currHit->antisense_align2() ? "-" : "+")
1737     << endl;
1738     }
1739    
1740     if (num_fusions == 1)
1741     {
1742     // daehwan
1743     if (bDebug)
1744     {
1745     cout << "direction: " << (int)dir << endl;
1746     }
1747    
1748     /*
1749     * orient the fused segment, which depends on a fusion direction.
1750     */
1751     if (dir != FUSION_FF && dir != FUSION_RR)
1752     {
1753     bool prevHit_rep = false;
1754     bool currHit_rep = false;
1755    
1756     if (prevHit_fused)
1757     {
1758     if ((dir == FUSION_FR && !currHit->antisense_align()) ||
1759     (dir == FUSION_RF && currHit->antisense_align()))
1760     continue;
1761    
1762     if (prevHit->ref_id2() != currHit->ref_id())
1763     prevHit_rep = true;
1764     else if ((dir == FUSION_FR && prevHit->antisense_align()) ||
1765     (dir == FUSION_RF && !prevHit->antisense_align()))
1766     prevHit_rep = true;
1767     }
1768    
1769     if (currHit_fused)
1770     {
1771     if ((dir == FUSION_FR && prevHit->antisense_align()) ||
1772     (dir == FUSION_RF && !prevHit->antisense_align()))
1773     continue;
1774    
1775     if (currHit->ref_id() != prevHit->ref_id2())
1776     currHit_rep = true;
1777     }
1778    
1779     if (bDebug)
1780     {
1781     if (prevHit_rep) cout << "1. reversed in prev" << endl;
1782     if (currHit_rep) cout << "1. reversed in curr" << endl;
1783     }
1784    
1785     if (prevHit_rep)
1786     *prevHit = prevHit->reverse();
1787     if (currHit_rep)
1788     *currHit = currHit->reverse();
1789    
1790     prevHit_rep = false;
1791     currHit_rep = false;
1792    
1793     if (prevHit_fused)
1794     {
1795     if (prevHit->is_forwarding_right() != currHit->is_forwarding_left())
1796     currHit_rep = true;
1797     }
1798     else
1799     {
1800     if (prevHit->is_forwarding_right() != currHit->is_forwarding_left())
1801     prevHit_rep = true;
1802     }
1803    
1804     if (prevHit_rep)
1805     *prevHit = prevHit->reverse();
1806     if (currHit_rep)
1807     *currHit = currHit->reverse();
1808    
1809     // daehwan
1810     if (bDebug)
1811     {
1812     if (prevHit_rep) cout << "2. reversed in prev" << endl;
1813     if (currHit_rep) cout << "2. reversed in curr" << endl;
1814     }
1815     }
1816     }
1817    
1818     bool same_contig = prevHit->ref_id2() == currHit->ref_id();
1819     if (!same_contig && num_fusions > 0)
1820     continue;
1821    
1822     if (same_contig && num_fusions >= 1)
1823     {
1824     if (prevHit->antisense_align2() != currHit->antisense_align())
1825     continue;
1826     }
1827    
1828     int bh_l = 0, back_right = 0, dist = 0;
1829     if (same_contig)
1830     {
1831     if ((fusion_dir == FUSION_FR || fusion_dir == FUSION_RF || dir == FUSION_FR || dir == FUSION_RF) &&
1832     prevHit->antisense_align2())
1833     {
1834     bh_l = prevHit->right() + 1;
1835     back_right = currHit->left() + 1;
1836     }
1837     else
1838     {
1839     bh_l = currHit->left();
1840     back_right = prevHit->right();
1841     }
1842    
1843     dist = bh_l - back_right;
1844     }
1845    
1846     // daehwan - pass
1847     if (bDebug)
1848     {
1849     cout << "daehwan - pass" << endl;
1850     cout << "prev coords: " << prevHit->left() << "\t" << prevHit->right() << endl;
1851     cout << "curr coords: " << currHit->left() << "\t" << currHit->right() << endl;
1852     }
1853    
1854     if (!same_contig ||
1855     (same_contig && num_fusions == 0 && dir != FUSION_NOTHING && fusion_dir == FUSION_NOTHING) ||
1856     (same_contig && dist <= max_report_intron_length && dist >= -(int)max_insertion_length && prevHit->is_forwarding_right() == currHit->is_forwarding_left()))
1857     {
1858     // daehwan
1859     if (bDebug)
1860     {
1861     cout << "daehwan - really passed!!" << endl;
1862     }
1863    
1864     BowtieHit tempHit = seg_hit_stack.back();
1865     seg_hit_stack.back() = bh_prev;
1866    
1867     // these hits are compatible, so push bh onto the
1868     // stack, recurse, and pop it when done.
1869     seg_hit_stack.push_back(bh);
1870     bool success = dfs_seg_hits(rt,
1871     read_seq,
1872     read_quals,
1873     possible_juncs,
1874     possible_insertions,
1875     possible_fusions,
1876     seg_hits_for_read,
1877     curr + 1,
1878     seg_hit_stack,
1879     joined_hits,
1880     dir == FUSION_NOTHING ? fusion_dir : dir);
1881    
1882     if (success)
1883     join_success = true;
1884    
1885     seg_hit_stack.pop_back();
1886     seg_hit_stack.back() = tempHit;
1887     }
1888     }
1889 gpertea 29 }
1890     else
1891 gpertea 154 {
1892     merge_segment_chain(rt,
1893     read_seq,
1894     read_quals,
1895     possible_juncs,
1896     possible_insertions,
1897     possible_fusions,
1898     seg_hit_stack,
1899     joined_hits,
1900     fusion_dir);
1901    
1902     return join_success = true;
1903     }
1904 gpertea 29 return join_success;
1905     }
1906    
1907     bool join_segments_for_read(RefSequenceTable& rt,
1908 gpertea 154 const string& read_seq,
1909     const string& read_quals,
1910     std::set<Junction>& possible_juncs,
1911     std::set<Insertion>& possible_insertions,
1912     std::set<Fusion>& possible_fusions,
1913     vector<HitsForRead>& seg_hits_for_read,
1914     vector<BowtieHit>& joined_hits)
1915     {
1916 gpertea 29 vector<BowtieHit> seg_hit_stack;
1917     bool join_success = false;
1918    
1919 gpertea 154 // ignore segments that map to more than this many places.
1920     if (bowtie2)
1921     {
1922     const size_t max_seg_hits = max_multihits * 2;
1923     for (size_t s = 0; s < seg_hits_for_read.size(); ++s)
1924     {
1925     if (seg_hits_for_read[s].hits.size() > max_seg_hits)
1926     return join_success;
1927     }
1928     }
1929    
1930 gpertea 29 for (size_t i = 0; i < seg_hits_for_read[0].hits.size(); ++i)
1931     {
1932     BowtieHit& bh = seg_hits_for_read[0].hits[i];
1933 gpertea 154
1934     if (bh.fusion_opcode() == FUSION_RR)
1935     seg_hit_stack.push_back(bh.reverse());
1936     else
1937     seg_hit_stack.push_back(bh);
1938    
1939 gpertea 29 bool success = dfs_seg_hits(rt,
1940 gpertea 154 read_seq,
1941     read_quals,
1942     possible_juncs,
1943     possible_insertions,
1944     possible_fusions,
1945     seg_hits_for_read,
1946     1,
1947     seg_hit_stack,
1948     joined_hits);
1949 gpertea 29 if (success)
1950     join_success = true;
1951     seg_hit_stack.pop_back();
1952     }
1953    
1954     return join_success;
1955     }
1956    
1957 gpertea 154 struct JoinSegmentsWorker
1958 gpertea 29 {
1959 gpertea 154 void operator()()
1960     {
1961     ReadTable it;
1962     GBamWriter bam_writer(bam_output_fname.c_str(), sam_header_fname.c_str());
1963     ReadStream readstream(reads_fname);
1964     if (readstream.file() == NULL)
1965     err_die("Error: cannot open %s for reading\n", reads_fname.c_str());
1966 gpertea 29
1967 gpertea 154 if (read_offset > 0)
1968     readstream.seek(read_offset);
1969 gpertea 29
1970 gpertea 154 bool need_seq = true;
1971     bool need_qual = color;
1972 gpertea 29
1973 gpertea 154 vector<HitStream> contig_hits;
1974     vector<HitStream> spliced_hits;
1975     vector<HitFactory*> factories;
1976     for (size_t i = 0; i < segmap_fnames.size(); ++i)
1977     {
1978     HitFactory* fac = new BAMHitFactory(it, *rt);
1979     factories.push_back(fac);
1980     HitStream hs(segmap_fnames[i], fac, false, false, false, need_seq, need_qual);
1981    
1982     if (seg_offsets[i] > 0)
1983     hs.seek(seg_offsets[i]);
1984 gpertea 29
1985 gpertea 154 contig_hits.push_back(hs);
1986     }
1987    
1988     for (size_t i = 0; i < spliced_segmap_fnames.size(); ++i)
1989     {
1990     int anchor_length = 0;
1991     HitFactory* fac = new SplicedBAMHitFactory(it, *rt, anchor_length);
1992     factories.push_back(fac);
1993    
1994     HitStream hs(spliced_segmap_fnames[i], fac, true, false, false, need_seq, need_qual);
1995     if (spliced_seg_offsets[i] > 0)
1996     hs.seek(spliced_seg_offsets[i]);
1997 gpertea 29
1998 gpertea 154 spliced_hits.push_back(hs);
1999     }
2000 gpertea 29
2001 gpertea 154 uint32_t curr_contig_obs_order = VMAXINT32;
2002     HitStream* first_seg_contig_stream = NULL;
2003     uint64_t next_contig_id = 0;
2004    
2005     if (contig_hits.size() > 0)
2006     {
2007     first_seg_contig_stream = &(contig_hits.front());
2008     next_contig_id = first_seg_contig_stream->next_group_id();
2009     curr_contig_obs_order = it.observation_order(next_contig_id);
2010     }
2011 gpertea 29
2012 gpertea 154 HitsForRead curr_hit_group;
2013    
2014     uint32_t curr_spliced_obs_order = VMAXINT32;
2015     HitStream* first_seg_spliced_stream = NULL;
2016     uint64_t next_spliced_id = 0;
2017    
2018     if (spliced_hits.size() > 0)
2019     {
2020     first_seg_spliced_stream = &(spliced_hits.front());
2021     next_spliced_id = first_seg_spliced_stream->next_group_id();
2022     curr_spliced_obs_order = it.observation_order(next_spliced_id);
2023     }
2024 gpertea 29
2025 gpertea 154 while((curr_contig_obs_order != VMAXINT32 || curr_spliced_obs_order != VMAXINT32) &&
2026     (curr_contig_obs_order < end_id || curr_spliced_obs_order < end_id))
2027     {
2028     uint32_t read_in_process;
2029     vector<HitsForRead> seg_hits_for_read;
2030     seg_hits_for_read.resize(contig_hits.size());
2031 gpertea 29
2032 gpertea 154 if (curr_contig_obs_order < curr_spliced_obs_order)
2033     {
2034     first_seg_contig_stream->next_read_hits(curr_hit_group);
2035     seg_hits_for_read.front() = curr_hit_group;
2036    
2037     next_contig_id = first_seg_contig_stream->next_group_id();
2038     uint32_t next_order = it.observation_order(next_contig_id);
2039    
2040     read_in_process = curr_contig_obs_order;
2041     curr_contig_obs_order = next_order;
2042     }
2043     else if (curr_spliced_obs_order < curr_contig_obs_order)
2044     {
2045     first_seg_spliced_stream->next_read_hits(curr_hit_group);
2046     seg_hits_for_read.front() = curr_hit_group;
2047    
2048     next_spliced_id = first_seg_spliced_stream->next_group_id();
2049     uint32_t next_order = it.observation_order(next_spliced_id);
2050 gpertea 29
2051 gpertea 154 read_in_process = curr_spliced_obs_order;
2052     curr_spliced_obs_order = next_order;
2053 gpertea 29
2054 gpertea 154 if (read_in_process < begin_id)
2055     continue;
2056     }
2057     else if (curr_contig_obs_order == curr_spliced_obs_order &&
2058     curr_contig_obs_order != VMAXINT32 &&
2059     curr_spliced_obs_order != VMAXINT32)
2060     {
2061     first_seg_contig_stream->next_read_hits(curr_hit_group);
2062    
2063     HitsForRead curr_spliced_group;
2064     first_seg_spliced_stream->next_read_hits(curr_spliced_group);
2065    
2066     curr_hit_group.hits.insert(curr_hit_group.hits.end(),
2067     curr_spliced_group.hits.begin(),
2068     curr_spliced_group.hits.end());
2069     seg_hits_for_read.front() = curr_hit_group;
2070     read_in_process = curr_spliced_obs_order;
2071    
2072     next_contig_id = first_seg_contig_stream->next_group_id();
2073     uint32_t next_order = it.observation_order(next_contig_id);
2074    
2075     next_spliced_id = first_seg_spliced_stream->next_group_id();
2076     uint32_t next_spliced_order = it.observation_order(next_spliced_id);
2077    
2078     curr_spliced_obs_order = next_spliced_order;
2079     curr_contig_obs_order = next_order;
2080     }
2081     else
2082     {
2083     break;
2084     }
2085    
2086     if (contig_hits.size() > 1)
2087     {
2088     look_right_for_hit_group(it,
2089     contig_hits,
2090     0,
2091     spliced_hits,
2092     curr_hit_group,
2093     seg_hits_for_read);
2094     }
2095 gpertea 29
2096 gpertea 154 size_t last_non_empty = seg_hits_for_read.size() - 1;
2097     while(last_non_empty >= 0 && seg_hits_for_read[last_non_empty].hits.empty())
2098     {
2099     --last_non_empty;
2100     }
2101    
2102     seg_hits_for_read.resize(last_non_empty + 1);
2103     if (!seg_hits_for_read[last_non_empty].hits[0].end())
2104     continue;
2105    
2106     if (!seg_hits_for_read.empty() && !seg_hits_for_read[0].hits.empty())
2107     {
2108     uint64_t insert_id = seg_hits_for_read[0].hits[0].insert_id();
2109     if (insert_id >= begin_id && insert_id < end_id)
2110     {
2111     Read read;
2112     if (readstream.getRead(insert_id, read))
2113     {
2114     vector<BowtieHit> joined_hits;
2115     join_segments_for_read(*rt,
2116     read.seq.c_str(),
2117     read.qual.c_str(),
2118     *possible_juncs,
2119     *possible_insertions,
2120     *possible_fusions,
2121     seg_hits_for_read,
2122     joined_hits);
2123    
2124     sort(joined_hits.begin(), joined_hits.end());
2125     vector<BowtieHit>::iterator new_end = unique(joined_hits.begin(), joined_hits.end());
2126     joined_hits.erase(new_end, joined_hits.end());
2127     for (size_t i = 0; i < joined_hits.size(); i++)
2128     {
2129     const char* ref_name = rt->get_name(joined_hits[i].ref_id());
2130     const char* ref_name2 = "";
2131     if (joined_hits[i].fusion_opcode() != FUSION_NOTHING)
2132     ref_name2 = rt->get_name(joined_hits[i].ref_id2());
2133 gpertea 29
2134 gpertea 154 if (color && !color_out)
2135     print_bamhit(bam_writer, read.name.c_str(), joined_hits[i], ref_name, ref_name2,
2136     joined_hits[i].seq().c_str(), joined_hits[i].qual().c_str(), true);
2137     else
2138     print_bamhit(bam_writer, read.name.c_str(), joined_hits[i], ref_name, ref_name2,
2139     read.seq.c_str(), read.qual.c_str(), false);
2140     }
2141     }
2142     else
2143     {
2144     err_die("Error: could not get read # %d from stream\n",
2145     read_in_process);
2146     }
2147     }
2148     }
2149     else
2150     {
2151     //fprintf(stderr, "Warning: couldn't join segments for read # %d\n", read_in_process);
2152     }
2153 gpertea 135 }
2154 gpertea 29
2155 gpertea 154 for (size_t fac = 0; fac < factories.size(); ++fac)
2156 gpertea 135 {
2157 gpertea 154 delete factories[fac];
2158 gpertea 135 }
2159 gpertea 154 factories.clear();
2160     }
2161 gpertea 29
2162 gpertea 154 string bam_output_fname;
2163     string sam_header_fname;
2164     string reads_fname;
2165 gpertea 29
2166 gpertea 154 vector<string> segmap_fnames;
2167     vector<string> spliced_segmap_fnames;
2168    
2169     std::set<Junction>* possible_juncs;
2170     std::set<Insertion>* possible_insertions;
2171     std::set<Fusion>* possible_fusions;
2172    
2173     RefSequenceTable* rt;
2174 gpertea 29
2175 gpertea 154 uint64_t begin_id;
2176     uint64_t end_id;
2177     int64_t read_offset;
2178     vector<int64_t> seg_offsets;
2179     vector<int64_t> spliced_seg_offsets;
2180     };
2181 gpertea 29
2182 gpertea 154 void driver(const string& bam_output_fname,
2183     istream& ref_stream,
2184     vector<FILE*>& possible_juncs_files,
2185     vector<FILE*>& possible_insertions_files,
2186     vector<FILE*>& possible_deletions_files,
2187     vector<FILE*>& possible_fusions_files,
2188     vector<string>& spliced_segmap_fnames, //.bam files
2189     vector<string>& segmap_fnames, //.bam files
2190     const string& reads_fname)
2191     {
2192     if (!parallel)
2193     num_threads = 1;
2194 gpertea 29
2195 gpertea 69 if (segmap_fnames.size() == 0)
2196 gpertea 29 {
2197     fprintf(stderr, "No hits to process, exiting\n");
2198     exit(0);
2199     }
2200    
2201 gpertea 154 RefSequenceTable rt(sam_header, true);
2202 gpertea 29 fprintf (stderr, "Loading reference sequences...\n");
2203 gpertea 154 get_seqs(ref_stream, rt, true);
2204 gpertea 29 fprintf (stderr, " reference sequences loaded.\n");
2205    
2206     fprintf(stderr, "Loading junctions...");
2207     std::set<Junction> possible_juncs;
2208    
2209     for (size_t i = 0; i < possible_juncs_files.size(); ++i)
2210     {
2211     char buf[2048];
2212     while(!feof(possible_juncs_files[i]) &&
2213     fgets(buf, sizeof(buf), possible_juncs_files[i]))
2214     {
2215     char junc_ref_name[256];
2216     int left;
2217     int right;
2218     char orientation;
2219     int ret = sscanf(buf, "%s %d %d %c", junc_ref_name, &left, &right, &orientation);
2220     if (ret != 4)
2221     continue;
2222     uint32_t ref_id = rt.get_id(junc_ref_name, NULL, 0);
2223     possible_juncs.insert(Junction(ref_id, left, right, orientation == '-'));
2224     }
2225     }
2226     fprintf(stderr, "done\n");
2227    
2228     fprintf(stderr, "Loading deletions...");
2229    
2230     for (size_t i = 0; i < possible_deletions_files.size(); ++i)
2231     {
2232     char splice_buf[2048];
2233     FILE* deletions_file = possible_deletions_files[i];
2234     if(!deletions_file){
2235     continue;
2236     }
2237     while(fgets(splice_buf, 2048, deletions_file)){
2238     char* nl = strrchr(splice_buf, '\n');
2239     char* buf = splice_buf;
2240     if (nl) *nl = 0;
2241    
2242     char* ref_name = get_token((char**)&buf, "\t");
2243     char* scan_left_coord = get_token((char**)&buf, "\t");
2244     char* scan_right_coord = get_token((char**)&buf, "\t");
2245    
2246     if (!scan_left_coord || !scan_right_coord)
2247     {
2248     err_die("Error: malformed deletion coordinate record\n");
2249     }
2250    
2251     uint32_t ref_id = rt.get_id(ref_name,NULL,0);
2252     uint32_t left_coord = atoi(scan_left_coord);
2253     uint32_t right_coord = atoi(scan_right_coord);
2254     possible_juncs.insert((Junction)Deletion(ref_id, left_coord - 1,right_coord, false));
2255     }
2256     }
2257     fprintf(stderr, "done\n");
2258    
2259     /*
2260     * Read the insertions from the list of insertion
2261     * files into a set
2262     */
2263 gpertea 154 fprintf(stderr, "Loading insertions...");
2264 gpertea 29 std::set<Insertion> possible_insertions;
2265     for (size_t i=0; i < possible_insertions_files.size(); ++i)
2266     {
2267     char splice_buf[2048];
2268     FILE* insertions_file = possible_insertions_files[i];
2269     if(!insertions_file){
2270     continue;
2271     }
2272     while(fgets(splice_buf, 2048, insertions_file)){
2273     char* nl = strrchr(splice_buf, '\n');
2274     char* buf = splice_buf;
2275     if (nl) *nl = 0;
2276    
2277     char* ref_name = get_token((char**)&buf, "\t");
2278     char* scan_left_coord = get_token((char**)&buf, "\t");
2279     char* scan_right_coord = get_token((char**)&buf, "\t");
2280     char* scan_sequence = get_token((char**)&buf, "\t");
2281    
2282     if (!scan_left_coord || !scan_sequence || !scan_right_coord)
2283     {
2284     err_die("Error: malformed insertion coordinate record\n");
2285     }
2286    
2287     uint32_t ref_id = rt.get_id(ref_name,NULL,0);
2288     uint32_t left_coord = atoi(scan_left_coord);
2289     std::string sequence(scan_sequence);
2290     possible_insertions.insert(Insertion(ref_id, left_coord, sequence));
2291     }
2292     }
2293 gpertea 154 fprintf(stderr, "done\n");
2294 gpertea 29
2295 gpertea 154 vector<uint64_t> read_ids;
2296     vector<vector<int64_t> > offsets;
2297     if (num_threads > 1)
2298     {
2299     vector<string> fnames;
2300     fnames.push_back(reads_fname);
2301     fnames.insert(fnames.end(), spliced_segmap_fnames.rbegin(), spliced_segmap_fnames.rend());
2302     fnames.insert(fnames.end(), segmap_fnames.rbegin(), segmap_fnames.rend());
2303     bool enough_data = calculate_offsets(fnames, read_ids, offsets);
2304     if (!enough_data)
2305     num_threads = 1;
2306     }
2307 gpertea 29
2308 gpertea 154 std::set<Fusion> possible_fusions;
2309     if (fusion_search)
2310     {
2311     fprintf(stderr, "Loading fusions...");
2312     for (size_t i=0; i < possible_fusions_files.size(); ++i)
2313     {
2314     char splice_buf[2048];
2315     FILE* fusions_file = possible_fusions_files[i];
2316     if(!fusions_file){
2317     continue;
2318     }
2319     while(fgets(splice_buf, 2048, fusions_file)){
2320     char* nl = strrchr(splice_buf, '\n');
2321     char* buf = splice_buf;
2322     if (nl) *nl = 0;
2323    
2324     char* ref_name1 = strsep((char**)&buf, "\t");
2325     char* scan_left_coord = strsep((char**)&buf, "\t");
2326     char* ref_name2 = strsep((char**)&buf, "\t");
2327     char* scan_right_coord = strsep((char**)&buf, "\t");
2328     char* scan_dir = strsep((char**)&buf, "\t");
2329    
2330     if (!ref_name1 || !scan_left_coord || !ref_name2 || !scan_right_coord || !scan_dir)
2331     {
2332     fprintf(stderr,"Error: malformed insertion coordinate record\n");
2333     exit(1);
2334     }
2335    
2336     uint32_t ref_id1 = rt.get_id(ref_name1,NULL,0);
2337     uint32_t ref_id2 = rt.get_id(ref_name2,NULL,0);
2338     uint32_t left_coord = atoi(scan_left_coord);
2339     uint32_t right_coord = atoi(scan_right_coord);
2340     uint32_t dir = FUSION_FF;
2341     if (strcmp(scan_dir, "fr") == 0)
2342     dir = FUSION_FR;
2343     else if(strcmp(scan_dir, "rf") == 0)
2344     dir = FUSION_RF;
2345     else if (strcmp(scan_dir, "rr") == 0)
2346     dir = FUSION_RR;
2347    
2348     possible_fusions.insert(Fusion(ref_id1, ref_id2, left_coord, right_coord, dir));
2349     }
2350     }
2351     fprintf(stderr, "done\n");
2352     }
2353    
2354     vector<boost::thread*> threads;
2355     for (int i = 0; i < num_threads; ++i)
2356     {
2357     JoinSegmentsWorker worker;
2358 gpertea 29
2359 gpertea 154 if (num_threads == 1)
2360     worker.bam_output_fname = bam_output_fname;
2361     else
2362     {
2363     string filename_base = bam_output_fname.substr(0, bam_output_fname.length() - 4);
2364     char filename[1024] = {0};
2365     sprintf(filename, "%s%d.bam", filename_base.c_str(), i);
2366     worker.bam_output_fname = filename;
2367     }
2368    
2369     worker.sam_header_fname = sam_header;
2370     worker.reads_fname = reads_fname;
2371     worker.segmap_fnames = segmap_fnames;
2372     worker.spliced_segmap_fnames = spliced_segmap_fnames;
2373     worker.possible_juncs = &possible_juncs;
2374     worker.possible_insertions = &possible_insertions;
2375     worker.possible_fusions = &possible_fusions;
2376     worker.rt = &rt;
2377     if (i == 0)
2378     {
2379     worker.begin_id = 0;
2380     worker.seg_offsets = vector<int64_t>(segmap_fnames.size(), 0);
2381     worker.spliced_seg_offsets = vector<int64_t>(spliced_segmap_fnames.size(), 0);
2382     worker.read_offset = 0;
2383     }
2384     else
2385     {
2386     worker.begin_id = read_ids[i-1];
2387     worker.seg_offsets.insert(worker.seg_offsets.end(),
2388     offsets[i-1].rbegin(),
2389     offsets[i-1].rbegin() + segmap_fnames.size());
2390     worker.spliced_seg_offsets.insert(worker.spliced_seg_offsets.end(),
2391     offsets[i-1].rbegin() + segmap_fnames.size(),
2392     offsets[i-1].rend() - 1);
2393     worker.read_offset = offsets[i-1][0];
2394     }
2395    
2396     worker.end_id = (i+1 < num_threads) ? read_ids[i] : std::numeric_limits<uint64_t>::max();
2397    
2398     if (num_threads > 1)
2399     threads.push_back(new boost::thread(worker));
2400     else
2401     worker();
2402     }
2403    
2404     for (size_t i = 0; i < threads.size(); ++i)
2405     {
2406     threads[i]->join();
2407     delete threads[i];
2408     threads[i] = NULL;
2409     }
2410     threads.clear();
2411    
2412 gpertea 29 } //driver
2413    
2414     int main(int argc, char** argv)
2415     {
2416     fprintf(stderr, "long_spanning_reads v%s (%s)\n", PACKAGE_VERSION, SVN_REVISION);
2417     fprintf(stderr, "--------------------------------------------\n");
2418    
2419     int parse_ret = parse_options(argc, argv, print_usage);
2420     if (parse_ret)
2421     return parse_ret;
2422    
2423     if(optind >= argc)
2424     {
2425     print_usage();
2426     return 1;
2427     }
2428    
2429     string ref_file_name = argv[optind++];
2430    
2431     if(optind >= argc)
2432     {
2433     print_usage();
2434     return 1;
2435     }
2436    
2437    
2438     string reads_file_name = argv[optind++];
2439    
2440     if(optind >= argc)
2441     {
2442     print_usage();
2443     return 1;
2444     }
2445    
2446     string juncs_file_list = argv[optind++];
2447    
2448     if(optind >= argc)
2449     {
2450     print_usage();
2451     return 1;
2452     }
2453    
2454     string insertions_file_list = argv[optind++];
2455     if(optind >= argc)
2456     {
2457 gpertea 154 print_usage();
2458     return 1;
2459 gpertea 29 }
2460 gpertea 154
2461     string deletions_file_list = argv[optind++];
2462    
2463     if(optind >= argc)
2464     {
2465     print_usage();
2466     return 1;
2467     }
2468 gpertea 29
2469 gpertea 154 string fusions_file_list = argv[optind++];
2470    
2471     if(optind >= argc)
2472     {
2473     print_usage();
2474     return 1;
2475     }
2476    
2477     string bam_output_fname = argv[optind++];
2478    
2479     if(optind >= argc)
2480     {
2481     print_usage();
2482     return 1;
2483     }
2484    
2485 gpertea 69 string segmap_file_list = argv[optind++];
2486     string spliced_segmap_flist;
2487 gpertea 29 if(optind < argc)
2488     {
2489 gpertea 69 spliced_segmap_flist = argv[optind++];
2490 gpertea 29 }
2491    
2492     ifstream ref_stream(ref_file_name.c_str(), ifstream::in);
2493     if (!ref_stream.good())
2494     err_die("Error: cannot open %s for reading\n",ref_file_name.c_str());
2495    
2496     checkSamHeader();
2497    
2498     //FILE* reads_file = fopen(reads_file_name.c_str(), "r");
2499 gpertea 69 vector<string> segmap_file_names;
2500     tokenize(segmap_file_list, ",",segmap_file_names);
2501 gpertea 29
2502     vector<string> juncs_file_names;
2503     vector<FILE*> juncs_files;
2504     tokenize(juncs_file_list, ",",juncs_file_names);
2505     for (size_t i = 0; i < juncs_file_names.size(); ++i) {
2506     //fprintf(stderr, "Opening %s for reading\n",
2507     // juncs_file_names[i].c_str());
2508     FILE* juncs_file = fopen(juncs_file_names[i].c_str(), "r");
2509     if (juncs_file == NULL) {
2510     fprintf(stderr, "Warning: cannot open %s for reading\n",
2511     juncs_file_names[i].c_str());
2512     continue;
2513     }
2514     juncs_files.push_back(juncs_file);
2515     }
2516    
2517     /*
2518     * Read in the deletion file names
2519     */
2520     vector<string> deletions_file_names;
2521     vector<FILE*> deletions_files;
2522     tokenize(deletions_file_list, ",",deletions_file_names);
2523     for (size_t i = 0; i < deletions_file_names.size(); ++i)
2524     {
2525     //fprintf(stderr, "Opening %s for reading\n",
2526     // deletions_file_names[i].c_str());
2527     FILE* deletions_file = fopen(deletions_file_names[i].c_str(), "r");
2528     if (deletions_file == NULL) {
2529     fprintf(stderr, "Warning: cannot open %s for reading\n",
2530     deletions_file_names[i].c_str());
2531     continue;
2532     }
2533     deletions_files.push_back(deletions_file);
2534     }
2535    
2536     /*
2537     * Read in the list of filenames that contain
2538     * insertion coordinates
2539     */
2540    
2541     vector<string> insertions_file_names;
2542     vector<FILE*> insertions_files;
2543     tokenize(insertions_file_list, ",",insertions_file_names);
2544     for (size_t i = 0; i < insertions_file_names.size(); ++i)
2545     {
2546     //fprintf(stderr, "Opening %s for reading\n",
2547     // insertions_file_names[i].c_str());
2548     FILE* insertions_file = fopen(insertions_file_names[i].c_str(), "r");
2549     if (insertions_file == NULL)
2550     {
2551     fprintf(stderr, "Warning: cannot open %s for reading\n",
2552     insertions_file_names[i].c_str());
2553     continue;
2554     }
2555     insertions_files.push_back(insertions_file);
2556     }
2557    
2558 gpertea 69 vector<string> spliced_segmap_file_names;
2559     vector<FZPipe> spliced_segmap_files;
2560 gpertea 135 string unzcmd;
2561 gpertea 69 tokenize(spliced_segmap_flist, ",",spliced_segmap_file_names);
2562 gpertea 154
2563     vector<string> fusions_file_names;
2564     vector<FILE*> fusions_files;
2565     tokenize(fusions_file_list, ",",fusions_file_names);
2566     for (size_t i = 0; i < fusions_file_names.size(); ++i)
2567 gpertea 29 {
2568 gpertea 135 fprintf(stderr, "Opening %s for reading\n",
2569 gpertea 154 fusions_file_names[i].c_str());
2570     FILE* fusions_file = fopen(fusions_file_names[i].c_str(), "r");
2571     if (fusions_file == NULL)
2572 gpertea 29 {
2573 gpertea 154 fprintf(stderr, "Warning: cannot open %s for reading\n",
2574     fusions_file_names[i].c_str());
2575     continue;
2576 gpertea 29 }
2577 gpertea 154 fusions_files.push_back(fusions_file);
2578 gpertea 29 }
2579 gpertea 154
2580    
2581     driver(bam_output_fname,
2582     ref_stream,
2583     juncs_files,
2584     insertions_files,
2585     deletions_files,
2586     fusions_files,
2587     spliced_segmap_file_names,
2588     segmap_file_names,
2589     reads_file_name);
2590 gpertea 29
2591     return 0;
2592     }