ViewVC Help
View File | Revision Log | Show Annotations | View Changeset | Root Listing
root/gclib/tophat_cpp/long_spanning_reads.cpp
Revision: 154
Committed: Tue Jan 24 02:29:21 2012 UTC (7 years, 8 months ago) by gpertea
File size: 77245 byte(s)
Log Message:
massive update with Daehwan's work

Line File contents
1 #ifdef HAVE_CONFIG_H
2 #include <config.h>
3 #endif
4
5 /*
6 * long_spanning_reads.cpp
7 * TopHat
8 *
9 * Created by Cole Trapnell on 2/5/09.
10 * Copyright 2009 Cole Trapnell. All rights reserved.
11 *
12 */
13
14 #include <cassert>
15 #include <cstdio>
16 #include <vector>
17 #include <string>
18 #include <map>
19 #include <algorithm>
20 #include <set>
21 #include <cstdlib>
22 #include <cstring>
23 #include <bitset>
24 //#include <stdexcept>
25 #include <iostream>
26 #include <fstream>
27 #include <sstream>
28 #include <seqan/sequence.h>
29 #include <seqan/find.h>
30 #include <seqan/file.h>
31 #include <seqan/modifier.h>
32 #include <getopt.h>
33
34 #include <boost/thread.hpp>
35
36 #include "common.h"
37 #include "utils.h"
38 #include "bwt_map.h"
39 #include "tokenize.h"
40 #include "segments.h"
41 #include "reads.h"
42
43 #include "junctions.h"
44 #include "insertions.h"
45 #include "deletions.h"
46 #include "fusions.h"
47
48 using namespace seqan;
49 using namespace std;
50
51 // daehwan
52 bool bDebug = false;
53
54 void print_usage()
55 {
56 fprintf(stderr, "Usage: long_spanning_reads <reads.fa/.fq> <possible_juncs1,...,possible_juncsN> <possible_insertions1,...,possible_insertionsN> <possible_deletions1,...,possible_deletionsN> <seg1.bwtout,...,segN.bwtout> [spliced_seg1.bwtout,...,spliced_segN.bwtout]\n");
57 }
58
59 bool key_lt(const pair<uint32_t, HitsForRead>& lhs,
60 const pair<uint32_t, HitsForRead>& rhs)
61 {
62 return lhs.first < rhs.first;
63 }
64
65 void get_seqs(istream& ref_stream,
66 RefSequenceTable& rt,
67 bool keep_seqs = true)
68 {
69 while(ref_stream.good() &&
70 !ref_stream.eof())
71 {
72 RefSequenceTable::Sequence* ref_str = new RefSequenceTable::Sequence();
73 string name;
74 readMeta(ref_stream, name, Fasta());
75 string::size_type space_pos = name.find_first_of(" \t\r");
76 if (space_pos != string::npos)
77 {
78 name.resize(space_pos);
79 }
80 seqan::read(ref_stream, *ref_str, Fasta());
81
82 rt.get_id(name, keep_seqs ? ref_str : NULL, 0);
83 }
84 }
85
86
87 void look_right_for_hit_group(ReadTable& unmapped_reads,
88 vector<HitStream>& contig_hits,
89 size_t curr_file,
90 vector<HitStream>& spliced_hits,
91 const HitsForRead& targets,
92 vector<HitsForRead>& seg_hits_for_read)
93 {
94 int right_file = curr_file + 1;
95
96 HitStream& right = contig_hits[right_file];
97 uint64_t curr_next_group_id = targets.insert_id;
98 int curr_order = unmapped_reads.observation_order(curr_next_group_id);
99
100 assert (curr_order != -1);
101 while(true)
102 {
103 HitsForRead hit_group;
104 uint64_t right_next_group_id = right.next_group_id();
105 int right_order = unmapped_reads.observation_order(right_next_group_id);
106
107 // If we would have seen the hits by now, bail out.
108 if (curr_order < right_order || right_order == -1)
109 {
110 break;
111 }
112 if (right.next_read_hits(hit_group))
113 {
114 if (hit_group.insert_id == targets.insert_id)
115 {
116 // Some of the targets may be missing, we need to
117 // process them individually
118 seg_hits_for_read[right_file] = hit_group;
119 break;
120 }
121 }
122 }
123
124 HitsForRead& curr_seg_hits = seg_hits_for_read[right_file];
125
126 if (right_file < (int)spliced_hits.size() && right_file >= 0)
127 {
128 // Scan forward in the spliced hits file for this hit group
129 HitsForRead spliced_group;
130 HitsForRead curr_spliced_group;
131 while (spliced_hits[right_file].next_group_id() > 0 &&
132 spliced_hits[right_file].next_group_id() <= (uint32_t)curr_order)
133 {
134 spliced_hits[right_file].next_read_hits(curr_spliced_group);
135
136 if (curr_spliced_group.insert_id == (uint32_t)curr_order)
137 {
138 spliced_group = curr_spliced_group;
139 break;
140 }
141 }
142
143 if (!spliced_group.hits.empty())
144 {
145 curr_seg_hits.insert_id = spliced_group.insert_id;
146 curr_seg_hits.hits.insert(curr_seg_hits.hits.end(),
147 spliced_group.hits.begin(),
148 spliced_group.hits.end());
149 }
150 }
151
152 if (curr_seg_hits.hits.empty())
153 return;
154 else if (right_file + 1 < (int)contig_hits.size())
155 {
156 look_right_for_hit_group(unmapped_reads,
157 contig_hits,
158 curr_file + 1,
159 spliced_hits,
160 curr_seg_hits,
161 seg_hits_for_read);
162 }
163 }
164
165 BowtieHit merge_chain(RefSequenceTable& rt,
166 const string& read_seq,
167 const string& read_quals,
168 std::set<Junction>& possible_juncs,
169 std::set<Insertion>& possible_insertions,
170 std::set<Fusion>& possible_fusions,
171 list<BowtieHit>& hit_chain,
172 int fusion_dir = FUSION_NOTHING)
173 {
174 bool antisense = hit_chain.front().antisense_align();
175 uint64_t insert_id = hit_chain.front().insert_id();
176
177 const int left = hit_chain.front().left();
178
179 list<BowtieHit>::iterator prev_hit = hit_chain.begin();
180 list<BowtieHit>::iterator curr_hit = ++(hit_chain.begin());
181
182 string seq;
183 string qual;
184 int old_read_length = 0;
185 int first_seg_length = hit_chain.front().seq().length();
186 for (list<BowtieHit>::iterator i = hit_chain.begin(); i != hit_chain.end(); ++i)
187 {
188 seq += i->seq();
189 qual += i->qual();
190 old_read_length += i->read_len();
191 }
192
193 string rev_read_seq, rev_read_quals;
194 if (color && antisense)
195 {
196 rev_read_seq = read_seq;
197 reverse(rev_read_seq.begin() + 1, rev_read_seq.end());
198
199 rev_read_quals = read_quals;
200 reverse(rev_read_quals.begin(), rev_read_quals.end());
201 }
202
203 size_t num_fusions = prev_hit->fusion_opcode() == FUSION_NOTHING ? 0 : 1;
204 bool fusion_passed = false;
205 while (curr_hit != hit_chain.end() && prev_hit != hit_chain.end())
206 {
207 if (prev_hit->ref_id() != prev_hit->ref_id2() || prev_hit->ref_id2() != curr_hit->ref_id())
208 fusion_passed = true;
209
210 if (prev_hit->ref_id2() != curr_hit->ref_id())
211 ++num_fusions;
212
213 if (curr_hit->fusion_opcode() != FUSION_NOTHING)
214 ++num_fusions;
215
216 if (prev_hit->ref_id2() == curr_hit->ref_id())
217 {
218 bool reversed = false;
219 if ((fusion_dir == FUSION_FR && fusion_passed) || (fusion_dir == FUSION_RF && !fusion_passed))
220 reversed = true;
221
222 /*
223 * Note that the gap size may be negative, since left() and right() return
224 * signed integers, this will be OK.
225 */
226 int gap;
227 if (reversed)
228 gap = prev_hit->right() - curr_hit->left();
229 else
230 gap = curr_hit->left() - prev_hit->right();
231
232 // daehwan
233 if (bDebug)
234 {
235 cout << "curr left: " << curr_hit->left() << endl
236 << "prev right: " << prev_hit->right() << endl
237 << "gap: " << gap << endl;
238 }
239
240 if (gap < -(int)max_insertion_length ||
241 (gap > (int)max_deletion_length &&
242 (gap < min_report_intron_length || gap > min(max_report_intron_length, (int)fusion_min_dist))))
243 {
244 fusion_passed = true;
245 ++num_fusions;
246 }
247 }
248
249 if (num_fusions >= 2)
250 return BowtieHit();
251
252 ++prev_hit;
253 ++curr_hit;
254 }
255
256 prev_hit = hit_chain.begin();
257 curr_hit = ++(hit_chain.begin());
258
259 // daehwan
260 if (bDebug)
261 {
262 cout << "daehwan - test" << endl;
263 }
264
265 int curr_seg_index = 1;
266 fusion_passed = false;
267 while (curr_hit != hit_chain.end() && prev_hit != hit_chain.end())
268 {
269 // daehwan
270 if (bDebug)
271 {
272 cout << "daehwan - start - stitch" << endl;
273 cout << "prev right: " << prev_hit->right() << endl;
274 cout << "curr left: " << curr_hit->left() << endl;
275 cout << "prev back: " << prev_hit->cigar().back().opcode << endl;
276 cout << "curr front: " << curr_hit->cigar().front().opcode << endl;
277 cout << "prev refs: " << prev_hit->ref_id() << "-" << prev_hit->ref_id2() << endl;
278 cout << "curr refs: " << curr_hit->ref_id() << "-" << curr_hit->ref_id2() << endl;
279 }
280
281 if (prev_hit->fusion_opcode() != FUSION_NOTHING || prev_hit->ref_id2() != curr_hit->ref_id())
282 fusion_passed = true;
283
284 /*
285 * This code is assuming that the cigar strings end and start with a match
286 * While segment alignments can actually end with a junction, insertion or deletion, the hope
287 * is that in those cases, the right and left ends of the alignments will correctly
288 * line up, so we won't get to this bit of code
289 */
290 if (!(prev_hit->cigar().back().opcode == MATCH || curr_hit->cigar().front().opcode == MATCH ||
291 prev_hit->cigar().back().opcode == mATCH || curr_hit->cigar().front().opcode == mATCH))
292 {
293 return BowtieHit();
294 }
295
296 // daehwan
297 if (bDebug)
298 {
299 cout << "daehwan - pass - enough matched bases" << endl;
300 }
301
302 if (prev_hit->is_spliced() && curr_hit->is_spliced() && prev_hit->antisense_splice() != curr_hit->antisense_splice())
303 {
304 /*
305 * There is no way that we can splice these together into a valid
306 * alignment
307 */
308 return BowtieHit();
309 }
310
311 bool found_closure = false;
312 /*
313 * Note that antisense_splice is the same for prev and curr
314 */
315 bool antisense_closure = prev_hit->is_spliced() ? prev_hit->antisense_splice() : curr_hit->antisense_splice();
316 vector<CigarOp> new_cigar;
317 int new_left = -1;
318 int mismatch = 0;
319
320 /*
321 * Take the length of matched bases in each segment into consideration for closures,
322 * this can be a problem for reads of variable lengths.
323 */
324 int prev_right_end_match_length = prev_hit->cigar().back().length;
325 int curr_left_end_match_length = curr_hit->cigar().front().length;
326
327 bool check_fusion = prev_hit->ref_id2() != curr_hit->ref_id();
328
329 if (prev_hit->ref_id2() == curr_hit->ref_id())
330 {
331 // daehwan
332 if (bDebug)
333 {
334 cout << "daehwan - start - junction or insertion" << endl;
335 cout << "prev right: " << prev_hit->right() << endl;
336 cout << "curr left: " << curr_hit->left() << endl;
337 cout << "prev refs: " << prev_hit->ref_id() << "-" << prev_hit->ref_id2() << endl;
338 cout << "curr refs: " << curr_hit->ref_id() << "-" << curr_hit->ref_id2() << endl;
339 }
340
341 bool reversed = false;
342 if ((fusion_dir == FUSION_FR && fusion_passed) || (fusion_dir == FUSION_RF && !fusion_passed))
343 reversed = true;
344
345 uint32_t reference_id = prev_hit->ref_id2();
346 RefSequenceTable::Sequence* ref_str = rt.get_seq(reference_id);
347
348 int left_boundary, right_boundary;
349 if (reversed)
350 {
351 left_boundary = curr_hit->left() - 4;
352 right_boundary = prev_hit->right() + 4;
353 }
354 else
355 {
356 left_boundary = prev_hit->right() - 4;
357 right_boundary = curr_hit->left() + 4;
358 }
359
360 int dist_btw_two;
361 if (reversed)
362 dist_btw_two = prev_hit->right() - curr_hit->left();
363 else
364 dist_btw_two = curr_hit->left() - prev_hit->right();
365
366 if (dist_btw_two < 0 && dist_btw_two >= -max_insertion_length && prev_hit->antisense_align2() == curr_hit->antisense_align())
367 {
368 std::set<Insertion>::iterator lb, ub;
369
370 /*
371 * Create a dummy sequence to represent the maximum possible insertion
372 */
373 std::string maxInsertedSequence = "";
374 maxInsertedSequence.resize(max_insertion_length,'A');
375
376 lb = possible_insertions.upper_bound(Insertion(reference_id, left_boundary, ""));
377 ub = possible_insertions.upper_bound(Insertion(reference_id, right_boundary, maxInsertedSequence));
378
379 int reference_mismatch = 0;
380
381 while (lb != ub && lb != possible_insertions.end())
382 {
383 /*
384 * In the following code, we will check to make sure that the segments have the proper
385 * separation and sequence for the insertions, and generate the appropriate merged bowtie hit
386 * In general, reads with insertions must match the inserted sequence exactly.
387 */
388 if (((int)lb->sequence.size()) == (reversed ? curr_hit->left() - prev_hit->right() : prev_hit->right() - curr_hit->left()))
389 {
390 /*
391 * Check we have enough matched bases on prev or curr segment.
392 */
393 int insert_to_prev_right, curr_left_to_insert;
394 if (reversed)
395 {
396 insert_to_prev_right = lb->left - prev_hit->right();
397 curr_left_to_insert = curr_hit->left() - lb->left;
398 }
399 else
400 {
401 insert_to_prev_right = prev_hit->right() - lb->left - 1;
402 curr_left_to_insert = lb->left - curr_hit->left() + 1;
403 }
404
405 if (insert_to_prev_right > prev_right_end_match_length || curr_left_to_insert > curr_left_end_match_length)
406 {
407 ++lb;
408 continue;
409 }
410
411 // daehwan
412 if (bDebug)
413 {
414 cout << "insert_to_prev_right: " << insert_to_prev_right << endl;
415 cout << "curr_left_to_insert: " << curr_left_to_insert << endl;
416 cout << "curr_seg_index: " << curr_seg_index << endl;
417 }
418
419 /*
420 * Keep track of how many mismatches were made to the genome in the region
421 * where we should actually be matching against the insertion
422 */
423 int this_reference_mismatch = 0;
424 int insertion_mismatch = 0;
425 int insertion_len = lb->sequence.length();
426 seqan::Dna5String insertionSequence = seqan::Dna5String(lb->sequence);
427 if (reversed)
428 {
429 seqan::convertInPlace(insertionSequence, seqan::FunctorComplement<seqan::Dna>());
430 seqan::reverseInPlace(insertionSequence);
431 }
432
433 /*
434 * First check to see if we need to adjust number of observed errors for the left (prev)
435 * hit. This is only the case if this segment runs into the insertion. To be consistent
436 * with bwt_map.cpp, we will not allow a segment to have errors in the insertion region
437 */
438 string colorSegmentSequence_prev;
439 if (insert_to_prev_right > 0)
440 {
441 seqan::Dna5String referenceSequence, oldSegmentSequence;
442
443 if (reversed)
444 {
445 referenceSequence = seqan::infix(*ref_str, prev_hit->right() + 1, lb->left + 1);
446 seqan::convertInPlace(referenceSequence, seqan::FunctorComplement<seqan::Dna>());
447 seqan::reverseInPlace(referenceSequence);
448
449 string temp;
450
451 // daehwan
452 if (bDebug)
453 {
454 cout << "reversed: " << read_seq.length() << " " << read_seq << endl;
455 }
456
457 temp = read_seq.substr(curr_seg_index * segment_length - insert_to_prev_right, insert_to_prev_right);
458 oldSegmentSequence = seqan::Dna5String(temp);
459 }
460 else
461 {
462 referenceSequence = seqan::infix(*ref_str, lb->left + 1, prev_hit->right());
463
464 // daehwan
465 if (bDebug)
466 {
467 cout << "non-reversed: " << prev_hit->seq() << endl;
468 }
469
470 oldSegmentSequence = seqan::Dna5String(prev_hit->seq().substr(prev_hit->seq().length() - insert_to_prev_right));
471 }
472
473 if (color)
474 {
475 string color;
476
477 if (antisense)
478 color = rev_read_seq.substr(rev_read_seq.length() - (curr_seg_index * segment_length) - insert_to_prev_right - 2, insert_to_prev_right + 1);
479 else
480 color = read_seq.substr(curr_seg_index * segment_length - insert_to_prev_right, insert_to_prev_right + 1);
481
482 color[0] = prev_hit->seq()[segment_length - insert_to_prev_right - 1];
483 colorSegmentSequence_prev = convert_color_to_bp(color);
484 }
485
486 const seqan::Dna5String newSegmentSequence = color ? colorSegmentSequence_prev : oldSegmentSequence;
487
488 // daehwan
489 if (bDebug)
490 {
491 cout << "ref: " << referenceSequence << endl;
492 cout << "old: " << oldSegmentSequence << endl;
493 cout << "ins: " << insertionSequence << endl;
494 }
495
496 /*
497 * Scan right in the read until we run out of read
498 */
499 for (int read_index = 0; read_index < insert_to_prev_right; ++read_index)
500 {
501 /*
502 * Any mismatch to the insertion is a failure
503 */
504 if (referenceSequence[read_index] == 'N' || referenceSequence[read_index] != oldSegmentSequence[read_index])
505 {
506 ++this_reference_mismatch;
507 }
508
509 if (read_index < insertion_len)
510 {
511 if (insertionSequence[read_index] == 'N' || insertionSequence[read_index] != newSegmentSequence[read_index])
512 {
513 ++insertion_mismatch;
514 break;
515 }
516 }
517 else
518 {
519 if (referenceSequence[read_index - insertion_len] == 'N' ||
520 referenceSequence[read_index - insertion_len] != newSegmentSequence[read_index])
521 {
522 --this_reference_mismatch;
523 }
524 }
525 }
526 }
527
528 string colorSegmentSequence_curr;
529 if (curr_left_to_insert > 0)
530 {
531 seqan::Dna5String referenceSequence, oldSegmentSequence;
532 if (reversed)
533 {
534 referenceSequence = seqan::infix(*ref_str, lb->left + 1, curr_hit->left() + 1);
535 seqan::convertInPlace(referenceSequence, seqan::FunctorComplement<seqan::Dna>());
536 seqan::reverseInPlace(referenceSequence);
537
538 string temp = read_seq.substr(curr_seg_index * segment_length, curr_left_to_insert);
539 oldSegmentSequence = seqan::Dna5String(temp);
540 }
541 else
542 {
543 referenceSequence = seqan::infix(*ref_str, curr_hit->left(), lb->left + 1);
544 oldSegmentSequence = seqan::Dna5String(curr_hit->seq().substr(0, curr_left_to_insert));
545 }
546
547 if (color)
548 {
549 string color;
550 if (antisense)
551 color = rev_read_seq.substr(rev_read_seq.length() - (curr_seg_index * segment_length), curr_left_to_insert);
552 else
553 color = read_seq.substr(curr_seg_index * segment_length + 2, curr_left_to_insert);
554
555 color.push_back(curr_hit->seq()[curr_left_to_insert]);
556 reverse(color.begin(), color.end());
557 string bp = convert_color_to_bp(color);
558 reverse(bp.begin(), bp.end());
559 colorSegmentSequence_curr = bp;
560 }
561
562 const seqan::Dna5String newSegmentSequence = color ? colorSegmentSequence_curr : oldSegmentSequence;
563
564 // daehwan
565 if (bDebug)
566 {
567 cout << "ref: " << referenceSequence << endl;
568 cout << "old: " << oldSegmentSequence << endl;
569 cout << "ins: " << insertionSequence << endl;
570 }
571
572 /*
573 * Scan left in the read until
574 * We ran out of read sequence (insertion extends past segment)
575 */
576 for (int read_index = 0; read_index < curr_left_to_insert; ++read_index)
577 {
578 int segmentPosition = curr_left_to_insert - read_index - 1;
579 int insertionPosition = insertion_len - read_index - 1;
580
581 if (referenceSequence[segmentPosition] == 'N' || (referenceSequence[segmentPosition] != oldSegmentSequence[segmentPosition]))
582 {
583 ++this_reference_mismatch;
584 }
585
586 if (read_index < insertion_len)
587 {
588 if (insertionSequence[insertionPosition] == 'N' || (insertionSequence[insertionPosition] != newSegmentSequence[segmentPosition]))
589 {
590 ++insertion_mismatch;
591 break;
592 }
593 }
594 else
595 {
596 if (referenceSequence[segmentPosition + insertion_len] == 'N' ||
597 (referenceSequence[segmentPosition + insertion_len] != newSegmentSequence[segmentPosition]))
598 {
599 --this_reference_mismatch;
600 }
601 }
602 }
603 }
604
605 if (found_closure)
606 {
607 // fprintf(stderr, "Warning: multiple closures found for insertion read # %d\n", (int)insert_id);
608 return BowtieHit();
609 }
610
611 if (insertion_mismatch == 0)
612 {
613 reference_mismatch = this_reference_mismatch;
614 mismatch = -reference_mismatch;
615 found_closure = true;
616 new_left = prev_hit->left();
617 new_cigar = prev_hit->cigar();
618
619 /*
620 * Need to make a new insert operation between the two match character that begin
621 * and end the intersection of these two junction. Note that we necessarily assume
622 * that this insertion can't span beyond the boundaries of these reads. That should
623 * probably be better enforced somewhere
624 */
625
626 new_cigar.back().length -= insert_to_prev_right;
627
628 if (new_cigar.back().length <= 0)
629 new_cigar.pop_back();
630
631 if (reversed)
632 new_cigar.push_back(CigarOp(iNS, lb->sequence.size()));
633 else
634 new_cigar.push_back(CigarOp(INS, lb->sequence.size()));
635
636 vector<CigarOp> new_right_cigar = curr_hit->cigar();
637 new_right_cigar.front().length += (insert_to_prev_right - lb->sequence.size());
638
639 /*
640 * Finish stitching together the new cigar string
641 */
642 size_t c = new_right_cigar.front().length > 0 ? 0 : 1;
643 for (; c < new_right_cigar.size(); ++c)
644 {
645 new_cigar.push_back(new_right_cigar[c]);
646 }
647
648 if (color)
649 {
650 if (insert_to_prev_right > 0)
651 seq.replace(first_seg_length + (curr_seg_index - 1) * segment_length - insert_to_prev_right, insert_to_prev_right, colorSegmentSequence_prev);
652
653 if (curr_left_to_insert > 0)
654 seq.replace(first_seg_length + (curr_seg_index - 1) * segment_length, curr_left_to_insert, colorSegmentSequence_curr);
655 }
656 }
657 }
658 ++lb;
659 }
660
661 if (!found_closure)
662 {
663 return BowtieHit();
664 }
665 }
666
667 /*
668 * Stitch segments together using juctions or deletions if necessary.
669 */
670 else if (dist_btw_two > 0 && dist_btw_two <= max_report_intron_length && prev_hit->antisense_align2() == curr_hit->antisense_align())
671 {
672 std::set<Junction>::iterator lb, ub;
673
674 // daehwan
675 if (bDebug)
676 {
677 cout << "junction" << endl;
678 cout << "min: " << left_boundary << "-" << right_boundary - 8 << endl;
679 cout << "max: " << left_boundary + 8 << "-" << right_boundary << endl;
680 }
681
682 lb = possible_juncs.upper_bound(Junction(reference_id, left_boundary, right_boundary - 8, true));
683 ub = possible_juncs.lower_bound(Junction(reference_id, left_boundary + 8, right_boundary, false));
684
685 int new_diff_mismatches = 0xff;
686 while (lb != ub && lb != possible_juncs.end())
687 {
688 int dist_to_left, dist_to_right;
689 if (reversed)
690 {
691 dist_to_left = lb->left - curr_hit->left();
692 dist_to_right = lb->right - prev_hit->right() - 1;
693 }
694 else
695 {
696 dist_to_left = lb->left - prev_hit->right() + 1;
697 dist_to_right = lb->right - curr_hit->left();
698 }
699
700 if (abs(dist_to_left) <= 4 && abs(dist_to_right) <= 4 && dist_to_left == dist_to_right)
701 {
702 /*
703 * Check we have enough matched bases on prev or curr segment.
704 */
705 if ((reversed && (dist_to_left > prev_right_end_match_length || -dist_to_left > curr_left_end_match_length)) ||
706 (!reversed && (dist_to_left > curr_left_end_match_length || -dist_to_left > prev_right_end_match_length)))
707 {
708 ++lb;
709 continue;
710 }
711
712 // daehwan
713 if (bDebug)
714 {
715 cout << "candidate junction: " << endl;
716 cout << "coords: " << lb->left << "-" << lb->right << endl;
717 cout << "dist to left: " << dist_to_left << endl;
718 }
719
720 Dna5String new_cmp_str, old_cmp_str;
721 int new_mismatch = 0, old_mismatch = 0;
722 string new_patch_str; // this is for colorspace reads
723 if (dist_to_left > 0)
724 {
725 if (reversed)
726 {
727 new_cmp_str = seqan::infix(*ref_str, curr_hit->left() + 1, lb->left + 1);
728 seqan::convertInPlace(new_cmp_str, seqan::FunctorComplement<seqan::Dna>());
729 seqan::reverseInPlace(new_cmp_str);
730
731 old_cmp_str = seqan::infix(*ref_str, prev_hit->right() + 1, lb->right);
732 seqan::convertInPlace(old_cmp_str, seqan::FunctorComplement<seqan::Dna>());
733 seqan::reverseInPlace(old_cmp_str);
734 }
735 else
736 {
737 new_cmp_str = seqan::infix(*ref_str, prev_hit->right(), lb->left + 1);
738 old_cmp_str = seqan::infix(*ref_str, curr_hit->left(), lb->right);
739 }
740
741 string new_seq;
742 if (color)
743 {
744 string ref = DnaString_to_string(seqan::infix(*ref_str, prev_hit->right() - 1, lb->left + 1));
745
746 string color, qual;
747 if (antisense)
748 {
749 color = rev_read_seq.substr(rev_read_seq.length() - (curr_seg_index * segment_length) - 1, dist_to_left);
750 qual = rev_read_quals.substr(rev_read_quals.length() - (curr_seg_index * segment_length) - 1, dist_to_left);
751 }
752 else
753 {
754 color = read_seq.substr(1 + curr_seg_index * segment_length, dist_to_left);
755 qual = read_quals.substr(curr_seg_index * segment_length, dist_to_left);
756 }
757
758 BWA_decode(color, qual, ref, new_seq);
759 new_seq = new_seq.substr(1);
760 }
761
762 string curr_hit_seq;
763 if (reversed)
764 curr_hit_seq = read_seq.substr(curr_seg_index * segment_length - dist_to_left, dist_to_left);
765 else
766 curr_hit_seq = curr_hit->seq();
767
768 const string& curr_old_seq = curr_hit_seq;
769 const string& curr_seq = color ? new_seq : curr_hit_seq;
770 for (int i = 0; i < dist_to_left; ++i)
771 {
772 if (curr_seq[i] != new_cmp_str[i])
773 ++new_mismatch;
774
775 if (curr_old_seq[i] != old_cmp_str[i])
776 ++old_mismatch;
777 }
778
779 if (color)
780 new_patch_str = curr_seq.substr(0, dist_to_left);
781 }
782 else if (dist_to_left < 0)
783 {
784 if (reversed)
785 {
786 new_cmp_str = seqan::infix(*ref_str, lb->right, prev_hit->right() + 1);
787 seqan::convertInPlace(new_cmp_str, seqan::FunctorComplement<seqan::Dna>());
788 seqan::reverseInPlace(new_cmp_str);
789
790 old_cmp_str = seqan::infix(*ref_str, lb->left + 1, curr_hit->left() + 1);
791 seqan::convertInPlace(old_cmp_str, seqan::FunctorComplement<seqan::Dna>());
792 seqan::reverseInPlace(old_cmp_str);
793 }
794 else
795 {
796 new_cmp_str = seqan::infix(*ref_str, lb->right, curr_hit->left());
797 old_cmp_str = seqan::infix(*ref_str, lb->left + 1, prev_hit->right());
798 }
799
800 size_t abs_dist = -dist_to_left;
801 string new_seq;
802 if (color)
803 {
804 string ref = DnaString_to_string(seqan::infix(*ref_str, lb->left, lb->left + 1));
805 ref += DnaString_to_string(seqan::infix(*ref_str, lb->right, curr_hit->left()));
806
807 string color, qual;
808 if (antisense)
809 {
810 color = rev_read_seq.substr(rev_read_seq.length() - (curr_seg_index * segment_length) - 1 - abs_dist, abs_dist);
811 qual = rev_read_quals.substr(rev_read_quals.length() - (curr_seg_index * segment_length) - 1 - abs_dist, abs_dist);
812 }
813 else
814 {
815 color = read_seq.substr(1 + curr_seg_index * segment_length - abs_dist, abs_dist);
816 qual = read_quals.substr(curr_seg_index * segment_length - abs_dist, abs_dist);
817 }
818
819 BWA_decode(color, qual, ref, new_seq);
820 new_seq = new_seq.substr(1);
821 }
822
823 string prev_hit_seq;
824 if (reversed)
825 prev_hit_seq = read_seq.substr(curr_seg_index * segment_length, abs_dist);
826 else
827 prev_hit_seq = prev_hit->seq();
828
829 // daehwan
830 if (bDebug)
831 {
832 cout << "reverse: " << (int)reversed << endl;
833 cout << "new cmp str: " << new_cmp_str << endl;
834 cout << "old cmp str: " << old_cmp_str << endl;
835 cout << "hit seq: " << prev_hit_seq << endl;
836 cout << "curr seq: " << curr_hit->seq() << endl;
837
838 cout << read_seq
839 << endl;
840 cout << read_seq.substr(first_seg_length + (curr_seg_index - 1) * segment_length, segment_length)
841 << endl;
842 }
843
844 const string& prev_old_seq = prev_hit_seq;
845 size_t prev_old_seq_len = prev_old_seq.length();
846 const string& prev_seq = color ? new_seq : prev_hit_seq;
847 size_t prev_seq_len = prev_seq.length();
848 for (size_t i = 0; i < abs_dist; ++i)
849 {
850 if (prev_seq[prev_seq_len - (abs_dist - i)] != new_cmp_str[i])
851 ++new_mismatch;
852 if (prev_old_seq[prev_old_seq_len - (abs_dist - i)] != old_cmp_str[i])
853 ++old_mismatch;
854 }
855
856 if (color)
857 new_patch_str = prev_seq.substr(prev_seq_len - abs_dist, abs_dist);
858 }
859
860 int temp_diff_mismatches = new_mismatch - old_mismatch;
861
862 // daehwan
863 if (bDebug)
864 {
865 cout << "new mismatch: " << new_mismatch << endl;
866 cout << "old mismatch: " << old_mismatch << endl;
867 cout << "new_diff_mismatch: " << new_diff_mismatches << endl;
868 cout << "temp mismatch: " << temp_diff_mismatches << endl;
869 }
870
871 if (temp_diff_mismatches >= new_diff_mismatches || new_mismatch >= 2)
872 {
873 ++lb;
874 continue;
875 }
876
877 if (color)
878 {
879 /*
880 * We need to recover the origianl sequence.
881 */
882 if (found_closure)
883 {
884 seq.replace(first_seg_length + (curr_seg_index - 1) * segment_length - 4, 8,
885 prev_hit->seq().substr(prev_hit->seq().length() - 4) + curr_hit->seq().substr(0, 4));
886 }
887
888 if (dist_to_left > 0)
889 seq.replace(first_seg_length + (curr_seg_index - 1) * segment_length, dist_to_left, new_patch_str);
890 else if (dist_to_left < 0)
891 seq.replace(first_seg_length + (curr_seg_index - 1) * segment_length + dist_to_left, -dist_to_left, new_patch_str);
892 }
893
894 new_diff_mismatches = temp_diff_mismatches;
895
896 new_left = prev_hit->left();
897 new_cigar = prev_hit->cigar();
898
899 int new_left_back_len = new_cigar.back().length;
900 if (reversed)
901 new_left_back_len -= dist_to_left;
902 else
903 new_left_back_len += dist_to_left;
904
905 vector<CigarOp> new_right_cig = curr_hit->cigar();
906 int new_right_front_len = new_right_cig.front().length;
907 if (reversed)
908 new_right_front_len += dist_to_right;
909 else
910 new_right_front_len -= dist_to_right;
911 if (new_left_back_len > 0)
912 new_cigar.back().length = new_left_back_len;
913 else
914 new_cigar.pop_back();
915
916 /*
917 * FIXME, currently just differentiating between a deletion and a
918 * reference skip based on length. However, would probably be better
919 * to denote the difference explicitly, this would allow the user
920 * to supply their own (very large) deletions
921 */
922 if ((lb->right - lb->left - 1) <= max_deletion_length)
923 {
924 if (reversed)
925 new_cigar.push_back(CigarOp(dEL, lb->right - lb->left - 1));
926 else
927 new_cigar.push_back(CigarOp(DEL, lb->right - lb->left - 1));
928 antisense_closure = prev_hit->is_spliced() ? prev_hit->antisense_splice() : curr_hit->antisense_splice();
929 }
930 else
931 {
932 if (reversed)
933 new_cigar.push_back(CigarOp(rEF_SKIP, lb->right - lb->left - 1));
934 else
935 new_cigar.push_back(CigarOp(REF_SKIP, lb->right - lb->left - 1));
936 antisense_closure = lb->antisense;
937 }
938
939 new_right_cig.front().length = new_right_front_len;
940 size_t c = new_right_front_len > 0 ? 0 : 1;
941 for (; c < new_right_cig.size(); ++c)
942 new_cigar.push_back(new_right_cig[c]);
943
944 mismatch = new_diff_mismatches;
945 found_closure = true;
946 }
947 ++lb;
948 }
949
950 if (!found_closure)
951 {
952 return BowtieHit();
953 }
954 }
955 else if (!(dist_btw_two == 0 && prev_hit->antisense_align2() == curr_hit->antisense_align()))
956 check_fusion = true;
957 }
958
959 if (check_fusion)
960 {
961 std::set<Fusion>::iterator lb, ub;
962
963 uint32_t ref_id1 = prev_hit->ref_id2();
964 uint32_t ref_id2 = curr_hit->ref_id();
965 uint32_t left = prev_hit->right() - 4;
966 uint32_t right = curr_hit->left() - 4;
967
968 // daehwan
969 if (bDebug)
970 {
971 cout << "daehwan - start - fusion" << endl
972 << "ref_id1: " << ref_id1 << endl
973 << "ref_id2: " << ref_id2 << endl
974 << "left: " << left << endl
975 << "right: " << right << endl
976 << "dir: " << fusion_dir << endl;
977 }
978
979 bool reversed = false;
980 if (fusion_dir != FUSION_FF &&
981 (ref_id2 < ref_id1 || (ref_id1 == ref_id2 && left > right)))
982 {
983 reversed = true;
984
985 uint32_t temp = ref_id1;
986 ref_id1 = ref_id2;
987 ref_id2 = temp;
988
989 temp = left;
990 left = right;
991 right = temp;
992 }
993
994 lb = possible_fusions.upper_bound(Fusion(ref_id1, ref_id2, left, right));
995 ub = possible_fusions.lower_bound(Fusion(ref_id1, ref_id2, left + 8, right + 8));
996
997 RefSequenceTable::Sequence* ref_str = rt.get_seq(prev_hit->ref_id2());
998 RefSequenceTable::Sequence* ref_str2 = rt.get_seq(curr_hit->ref_id());
999
1000 int new_diff_mismatches = 0xff;
1001 while (lb != ub && lb != possible_fusions.end())
1002 {
1003 int lb_left = lb->left;
1004 int lb_right = lb->right;
1005
1006 if (reversed)
1007 {
1008 lb_left = lb->right;
1009 lb_right = lb->left;
1010 }
1011
1012 int dist_to_left, dist_to_right;
1013 if (fusion_dir == FUSION_RF)
1014 dist_to_left = prev_hit->right() - lb_left + 1;
1015 else
1016 dist_to_left = lb_left - prev_hit->right() + 1;
1017
1018 if (fusion_dir == FUSION_FR)
1019 dist_to_right = curr_hit->left() - lb_right;
1020 else
1021 dist_to_right = lb_right - curr_hit->left();
1022
1023 // daehwan
1024 if (bDebug)
1025 {
1026 cout << "daehwan - fusion gap" << endl;
1027 cout << "dist left: " << dist_to_left << endl;
1028 cout << "dist right: " << dist_to_right << endl;
1029 }
1030
1031 if (abs(dist_to_left) <= 4 && abs(dist_to_right) <= 4 && dist_to_left == dist_to_right)
1032 {
1033 /*
1034 * Check we have enough matched bases on prev or curr segment.
1035 */
1036 if (dist_to_left > curr_left_end_match_length || -dist_to_left > prev_right_end_match_length)
1037 {
1038 ++lb;
1039 continue;
1040 }
1041
1042 Dna5String new_cmp_str, old_cmp_str;
1043 int new_mismatch = 0, old_mismatch = 0;
1044 string new_patch_str; // this is for colorspace reads
1045 if (dist_to_left > 0)
1046 {
1047 if (fusion_dir == FUSION_RF)
1048 {
1049 new_cmp_str = seqan::infix(*ref_str, lb_left, prev_hit->right() + 1);
1050 seqan::convertInPlace(new_cmp_str, seqan::FunctorComplement<seqan::Dna>());
1051 seqan::reverseInPlace(new_cmp_str);
1052 }
1053 else
1054 new_cmp_str = seqan::infix(*ref_str, prev_hit->right(), lb_left + 1);
1055
1056 if (fusion_dir == FUSION_FR)
1057 {
1058 old_cmp_str = seqan::infix(*ref_str2, lb_right + 1, curr_hit->left() + 1);
1059 seqan::convertInPlace(old_cmp_str, seqan::FunctorComplement<seqan::Dna>());
1060 seqan::reverseInPlace(old_cmp_str);
1061 }
1062 else
1063 old_cmp_str = seqan::infix(*ref_str2, curr_hit->left(), lb_right);
1064
1065 // daehwan
1066 if (bDebug)
1067 {
1068 cout << "new str: " << new_cmp_str << endl;
1069 cout << "old str: " << old_cmp_str << endl;
1070 cout << "curr seq: " << curr_hit->seq() << endl;
1071 }
1072
1073 string curr_hit_seq;
1074 if (fusion_dir == FUSION_FF || fusion_dir == FUSION_RR)
1075 curr_hit_seq = curr_hit->seq();
1076 else
1077 curr_hit_seq = read_seq.substr(curr_seg_index * segment_length, segment_length);
1078
1079 string new_seq;
1080 const string& curr_old_seq = curr_hit_seq;
1081 const string& curr_seq = color ? new_seq : curr_hit_seq;
1082 for (int i = 0; i < dist_to_left; ++i)
1083 {
1084 if (curr_seq[i] != new_cmp_str[i])
1085 ++new_mismatch;
1086
1087 if (curr_old_seq[i] != old_cmp_str[i])
1088 ++old_mismatch;
1089 }
1090 }
1091 else if (dist_to_left < 0)
1092 {
1093 if (fusion_dir == FUSION_FR)
1094 {
1095 new_cmp_str = seqan::infix(*ref_str2, curr_hit->left() + 1, lb_right + 1);
1096 seqan::convertInPlace(new_cmp_str, seqan::FunctorComplement<seqan::Dna>());
1097 seqan::reverseInPlace(new_cmp_str);
1098 }
1099 else
1100 new_cmp_str = seqan::infix(*ref_str2, lb_right, curr_hit->left());
1101
1102 if (fusion_dir == FUSION_RF)
1103 {
1104 old_cmp_str = seqan::infix(*ref_str, prev_hit->right() + 1, lb_left);
1105 seqan::convertInPlace(old_cmp_str, seqan::FunctorComplement<seqan::Dna>());
1106 seqan::reverseInPlace(old_cmp_str);
1107 }
1108 else
1109 old_cmp_str = seqan::infix(*ref_str, lb_left + 1, prev_hit->right());
1110
1111 string prev_hit_seq;
1112 if (fusion_dir == FUSION_FF || fusion_dir == FUSION_RR)
1113 prev_hit_seq = prev_hit->seq();
1114 else
1115 prev_hit_seq = read_seq.substr((curr_seg_index - 1) * segment_length, segment_length);
1116
1117 size_t abs_dist = -dist_to_left;
1118 string new_seq;
1119
1120 const string& prev_old_seq = prev_hit_seq;
1121 size_t prev_old_seq_len = prev_old_seq.length();
1122 const string& prev_seq = color ? new_seq : prev_hit_seq;
1123 size_t prev_seq_len = prev_seq.length();
1124 for (size_t i = 0; i < abs_dist; ++i)
1125 {
1126 if (prev_seq[prev_seq_len - (abs_dist - i)] != new_cmp_str[i])
1127 ++new_mismatch;
1128 if (prev_old_seq[prev_old_seq_len - (abs_dist - i)] != old_cmp_str[i])
1129 ++old_mismatch;
1130 }
1131 }
1132
1133 int temp_diff_mismatches = new_mismatch - old_mismatch;
1134 if (temp_diff_mismatches >= new_diff_mismatches || new_mismatch >= 2)
1135 {
1136 ++lb;
1137 continue;
1138 }
1139 new_diff_mismatches = temp_diff_mismatches;
1140
1141 new_left = prev_hit->left();
1142 new_cigar = prev_hit->cigar();
1143
1144 int new_left_back_len = new_cigar.back().length;
1145 new_left_back_len += dist_to_left;
1146
1147 vector<CigarOp> new_right_cig = curr_hit->cigar();
1148 int new_right_front_len = new_right_cig.front().length;
1149 new_right_front_len -= dist_to_right;
1150 if (new_left_back_len > 0)
1151 new_cigar.back().length = new_left_back_len;
1152 else
1153 new_cigar.pop_back();
1154
1155 new_cigar.push_back(CigarOp((CigarOpCode)fusion_dir, lb_right));
1156 antisense_closure = prev_hit->is_spliced() ? prev_hit->antisense_splice() : curr_hit->antisense_splice();
1157
1158 new_right_cig.front().length = new_right_front_len;
1159 size_t c = new_right_front_len > 0 ? 0 : 1;
1160 for (; c < new_right_cig.size(); ++c)
1161 new_cigar.push_back(new_right_cig[c]);
1162
1163 mismatch = new_diff_mismatches;
1164 found_closure = true;
1165 ++num_fusions;
1166
1167 // daehwan
1168 if (bDebug)
1169 {
1170 cout << "daehwan - fusion gap - found" << endl;
1171 }
1172
1173 }
1174 ++lb;
1175 }
1176
1177 // daehwan
1178 if (bDebug)
1179 {
1180 cout << "daehwan2 - end - fusion: " << (found_closure ? "found" : "not found") << endl;
1181 }
1182
1183 if (!found_closure)
1184 {
1185 return BowtieHit();
1186 }
1187 }
1188
1189 if (found_closure)
1190 {
1191 bool end = false;
1192 BowtieHit merged_hit(prev_hit->ref_id(),
1193 curr_hit->ref_id2(),
1194 insert_id,
1195 new_left,
1196 new_cigar,
1197 antisense,
1198 antisense_closure,
1199 prev_hit->edit_dist() + curr_hit->edit_dist() + mismatch,
1200 prev_hit->splice_mms() + curr_hit->splice_mms(),
1201 end);
1202
1203 // daehwan - should fix this for SOLiD dataset
1204 merged_hit.seq(prev_hit->seq() + curr_hit->seq());
1205
1206 // daehwan
1207 if (bDebug)
1208 {
1209 cout << "fusing of " << merged_hit.left() << " and " << merged_hit.right() << endl;
1210 cout << print_cigar(merged_hit.cigar()) << endl;
1211 if (!merged_hit.check_editdist_consistency(rt))
1212 {
1213 cout << "btw " << print_cigar(prev_hit->cigar()) << " and " << print_cigar(curr_hit->cigar()) << endl;
1214 cout << "this is malformed hit" << endl;
1215 }
1216 }
1217
1218 prev_hit = hit_chain.erase(prev_hit, ++curr_hit);
1219 /*
1220 * prev_hit now points PAST the last element removed
1221 */
1222 prev_hit = hit_chain.insert(prev_hit, merged_hit);
1223 /*
1224 * merged_hit has been inserted before the old position of
1225 * prev_hit. New location of prev_hit is merged_hit
1226 */
1227 curr_hit = prev_hit;
1228 ++curr_hit;
1229 ++curr_seg_index;
1230 continue;
1231 }
1232
1233 // daehwan
1234 if (bDebug)
1235 {
1236 cout << "daehwan - test 0.3" << endl;
1237 }
1238
1239 ++prev_hit;
1240 ++curr_hit;
1241 ++curr_seg_index;
1242 }
1243
1244 // daehwan
1245 if (bDebug)
1246 {
1247 cout << "daehwan - test2" << endl;
1248 }
1249
1250 bool saw_antisense_splice = false;
1251 bool saw_sense_splice = false;
1252 vector<CigarOp> long_cigar;
1253 int num_mismatches = 0;
1254 int num_splice_mms = 0;
1255 for (list<BowtieHit>::iterator s = hit_chain.begin(); s != hit_chain.end(); ++s)
1256 {
1257 num_mismatches += s->edit_dist();
1258 num_splice_mms += s->splice_mms();
1259
1260 /*
1261 * Check whether the sequence contains any reference skips. Previously,
1262 * this was just a check to see whether the sequence was contiguous; however
1263 * we don't want to count an indel event as a splice
1264 */
1265 bool containsSplice = s->is_spliced();
1266 if (containsSplice)
1267 {
1268 if (s->antisense_splice())
1269 {
1270 if (saw_sense_splice)
1271 return BowtieHit();
1272 saw_antisense_splice = true;
1273 }
1274 else
1275 {
1276 if (saw_antisense_splice)
1277 return BowtieHit();
1278 saw_sense_splice = true;
1279 }
1280 }
1281 const vector<CigarOp>& cigar = s->cigar();
1282 if (long_cigar.empty())
1283 {
1284 long_cigar = cigar;
1285 }
1286 else
1287 {
1288 CigarOp& last = long_cigar.back();
1289 /*
1290 * If necessary, merge the back and front
1291 * cigar operations
1292 */
1293 if(last.opcode == cigar[0].opcode){
1294 last.length += cigar[0].length;
1295 for (size_t b = 1; b < cigar.size(); ++b)
1296 {
1297 long_cigar.push_back(cigar[b]);
1298 }
1299 }else{
1300 for(size_t b = 0; b < cigar.size(); ++b)
1301 {
1302 long_cigar.push_back(cigar[b]);
1303 }
1304 }
1305 }
1306 }
1307
1308 bool end = false;
1309 BowtieHit new_hit(hit_chain.front().ref_id(),
1310 hit_chain.back().ref_id2(),
1311 insert_id,
1312 left,
1313 long_cigar,
1314 antisense,
1315 saw_antisense_splice,
1316 num_mismatches,
1317 num_splice_mms,
1318 end);
1319
1320 if (fusion_dir == FUSION_NOTHING || fusion_dir == FUSION_FF || fusion_dir == FUSION_RR)
1321 {
1322 new_hit.seq(seq);
1323 new_hit.qual(qual);
1324 }
1325 else
1326 {
1327 new_hit.seq(read_seq);
1328 new_hit.qual(read_quals);
1329 }
1330
1331 bool do_reverse = new_hit.ref_id() > new_hit.ref_id2();
1332 if (new_hit.ref_id() == new_hit.ref_id2())
1333 {
1334 vector<Fusion> fusions;
1335 bool auto_sort = false;
1336 fusions_from_spliced_hit(new_hit, fusions, auto_sort);
1337 if (fusions.size() > 0)
1338 {
1339 const Fusion& fusion = fusions[0];
1340 do_reverse = fusion.left > fusion.right;
1341 }
1342 }
1343
1344 if (do_reverse)
1345 {
1346 new_hit = new_hit.reverse();
1347 }
1348
1349 if (fusion_dir == FUSION_RF || fusion_dir == FUSION_RR)
1350 {
1351 new_hit.antisense_align(!new_hit.antisense_align());
1352 }
1353
1354 // daehwan
1355 if (bDebug)
1356 {
1357 cout << "daehwan - test3" << endl;
1358 cout << new_hit.left() << " " << print_cigar(new_hit.cigar()) << endl;
1359 cout << new_hit.ref_id() << "-" << new_hit.ref_id2() << ": " << new_hit.fusion_opcode() << endl;
1360 }
1361
1362 int new_read_len = new_hit.read_len();
1363 if (new_read_len != old_read_length || !new_hit.check_editdist_consistency(rt))
1364 {
1365 // daehwan
1366 if (bDebug)
1367 {
1368 cout << "Warning: " << new_hit.insert_id() << " malformed closure: " << print_cigar(new_hit.cigar()) << endl;
1369 exit(1);
1370 }
1371
1372 fprintf(stderr, "Warning: %d malformed closure\n", new_hit.insert_id());
1373 return BowtieHit();
1374 }
1375
1376 return new_hit;
1377 }
1378
1379
1380 int multi_closure = 0;
1381 int anchor_too_short = 0;
1382 int gap_too_short = 0;
1383
1384 bool valid_hit(const BowtieHit& bh)
1385 {
1386 if (bh.insert_id())
1387 {
1388 /*
1389 * validate the cigar chain - no gaps shorter than an intron, etc.
1390 * also,
1391 * -Don't start or end with an indel or refskip
1392 * -Only a match operation is allowed is allowed
1393 * adjacent to an indel or refskip
1394 * -Indels should confrom to length restrictions
1395 */
1396 const CigarOp* prevCig = &(bh.cigar()[0]);
1397 const CigarOp* currCig = &(bh.cigar()[1]);
1398 for (size_t i = 1; i < bh.cigar().size(); ++i){
1399 currCig = &(bh.cigar()[i]);
1400 if(!(currCig->opcode == MATCH || currCig->opcode == mATCH) &&
1401 !(prevCig->opcode == MATCH || prevCig->opcode == mATCH)){
1402 return false;
1403 }
1404 if(currCig->opcode == INS || currCig->opcode == iNS){
1405 if(currCig->length > max_insertion_length){
1406 return false;
1407 }
1408 }
1409 if(currCig->opcode == DEL || currCig->opcode == dEL){
1410 if(currCig->length > max_deletion_length){
1411 return false;
1412 }
1413 }
1414 if(currCig->opcode == REF_SKIP || currCig->opcode == rEF_SKIP){
1415 if(currCig->length < (uint64_t)min_report_intron_length){
1416 gap_too_short++;
1417 return false;
1418 }
1419 }
1420 prevCig = currCig;
1421 }
1422 if (!(bh.cigar().front().opcode == MATCH || bh.cigar().front().opcode == mATCH) ||
1423 !(bh.cigar().back().opcode == MATCH || bh.cigar().back().opcode == mATCH)/* ||
1424 (int)bh.cigar().front().length < min_anchor_len||
1425 (int)bh.cigar().back().length < min_anchor_len*/ )
1426 {
1427 anchor_too_short++;
1428 return false;
1429 }
1430 }
1431 else
1432 {
1433 multi_closure++;
1434 return false;
1435 }
1436
1437 return true;
1438 }
1439
1440 void merge_segment_chain(RefSequenceTable& rt,
1441 const string& read_seq,
1442 const string& read_quals,
1443 std::set<Junction>& possible_juncs,
1444 std::set<Insertion>& possible_insertions,
1445 std::set<Fusion>& possible_fusions,
1446 vector<BowtieHit>& hits,
1447 vector<BowtieHit>& merged_hits,
1448 int fusion_dir = FUSION_NOTHING)
1449 {
1450 if (hits.size() == 0)
1451 return;
1452
1453 BowtieHit bh;
1454 if (hits.size() > 1)
1455 {
1456 list<BowtieHit> hit_chain;
1457 if (fusion_dir == FUSION_NOTHING || fusion_dir == FUSION_FF || fusion_dir == FUSION_RR)
1458 {
1459 if (hits.front().antisense_align())
1460 copy(hits.rbegin(), hits.rend(), back_inserter(hit_chain));
1461 else
1462 copy(hits.begin(), hits.end(), back_inserter(hit_chain));
1463 }
1464 else
1465 {
1466 bool bSawFusion = false;
1467 for (size_t i = 0; i < hits.size(); ++i)
1468 {
1469 bool pushed = false;
1470 if (!bSawFusion)
1471 {
1472 if (i > 0)
1473 {
1474 if (hits[i-1].ref_id() != hits[i].ref_id())
1475 bSawFusion = true;
1476 else if(hits[i-1].antisense_align() != hits[i].antisense_align())
1477 bSawFusion = true;
1478 else
1479 {
1480 int dist = 0;
1481 if (hits[i].antisense_align())
1482 dist = hits[i-1].left() - hits[i].right();
1483 else
1484 dist = hits[i].left() - hits[i-1].right();
1485
1486 if (dist >= max_report_intron_length || dist < -(int)max_insertion_length)
1487 bSawFusion = true;
1488 }
1489 }
1490 }
1491
1492 if (hits[i].fusion_opcode() == FUSION_NOTHING &&
1493 ((fusion_dir == FUSION_FR && bSawFusion) || (fusion_dir == FUSION_RF && !bSawFusion)) &&
1494 hits[i].left() < hits[i].right())
1495 {
1496 hit_chain.push_back(hits[i].reverse());
1497 pushed = true;
1498 }
1499
1500 if (i > 0 &&
1501 hits[i].fusion_opcode() != FUSION_NOTHING &&
1502 hits[i].ref_id() != hits[i-1].ref_id())
1503 {
1504 hit_chain.push_back(hits[i].reverse());
1505 pushed = true;
1506 }
1507
1508 if (!bSawFusion)
1509 {
1510 if (hits[i].fusion_opcode() != FUSION_NOTHING)
1511 bSawFusion = true;
1512 }
1513
1514 if (!pushed)
1515 hit_chain.push_back(hits[i]);
1516 }
1517 }
1518
1519 bh = merge_chain(rt,
1520 read_seq,
1521 read_quals,
1522 possible_juncs,
1523 possible_insertions,
1524 possible_fusions,
1525 hit_chain,
1526 fusion_dir);
1527 }
1528 else
1529 {
1530 bh = hits[0];
1531 }
1532
1533 if (valid_hit(bh))
1534 merged_hits.push_back(bh);
1535 }
1536
1537 bool dfs_seg_hits(RefSequenceTable& rt,
1538 const string& read_seq,
1539 const string& read_quals,
1540 std::set<Junction>& possible_juncs,
1541 std::set<Insertion>& possible_insertions,
1542 std::set<Fusion>& possible_fusions,
1543 vector<HitsForRead>& seg_hits_for_read,
1544 size_t curr,
1545 vector<BowtieHit>& seg_hit_stack,
1546 vector<BowtieHit>& joined_hits,
1547 int fusion_dir = FUSION_NOTHING)
1548 {
1549 assert (!seg_hit_stack.empty());
1550 bool join_success = false;
1551
1552 if (curr < seg_hits_for_read.size())
1553 {
1554 for (size_t i = 0; i < seg_hits_for_read[curr].hits.size(); ++i)
1555 {
1556 /*
1557 * As we reverse segments depending on directions like FR or RF,
1558 * it's necessary to recover the original segments.
1559 */
1560 BowtieHit bh = seg_hits_for_read[curr].hits[i];
1561 BowtieHit bh_prev = seg_hit_stack.back();
1562
1563 BowtieHit* prevHit = &bh_prev;
1564 BowtieHit* currHit = &bh;
1565
1566 // daehwan
1567 // if (bh.insert_id() == 11921733)
1568 // bDebug = true;
1569
1570 /*
1571 * Each segment has at most one fusion by assumption,
1572 */
1573 bool prevHit_fused = prevHit->fusion_opcode() != FUSION_NOTHING;
1574 bool currHit_fused = currHit->fusion_opcode() != FUSION_NOTHING;
1575
1576 /*
1577 * Count the number of fusions on prev and curr segments,
1578 * but this doesn't take into account the gap (if exists) between the two segments.
1579 */
1580 size_t num_fusions = prevHit_fused ? 1 : 0;
1581 num_fusions += currHit_fused ? 1 : 0;
1582
1583 int dir = prevHit_fused ? prevHit->fusion_opcode() : currHit->fusion_opcode();
1584
1585 /*
1586 * We don't allow reads that span more than two fusion points.
1587 */
1588 if (num_fusions >= 2)
1589 continue;
1590
1591 if (fusion_dir != FUSION_NOTHING && currHit_fused)
1592 continue;
1593
1594 if (fusion_dir == FUSION_FF || fusion_dir == FUSION_RR)
1595 {
1596 if ((currHit->antisense_align() && currHit->ref_id() != prevHit->ref_id()) ||
1597 (!currHit->antisense_align() && currHit->ref_id() != prevHit->ref_id2()))
1598 continue;
1599 }
1600
1601 if (bDebug)
1602 {
1603 cout << "daehwan - prev ref: " << prevHit->ref_id() << "-" << prevHit->ref_id2() << ": "
1604 << print_cigar(prevHit->cigar()) << endl;
1605 cout << "daehwan - prev sense: "
1606 << (prevHit->antisense_align() ? "-" : "+")
1607 << "\t"
1608 << (prevHit->antisense_align2() ? "-" : "+")
1609 << endl;
1610 cout << "daehwan - prev coords: "
1611 << prevHit->left()
1612 << "\t"
1613 << prevHit->right()
1614 << endl;
1615
1616 cout << "daehwan - curr ref: " << currHit->ref_id() << "-" << currHit->ref_id2() << ": "
1617 << print_cigar(currHit->cigar()) << endl;
1618 cout << "daehwan - curr sense: "
1619 << (currHit->antisense_align() ? "-" : "+")
1620 << "\t"
1621 << (currHit->antisense_align2() ? "-" : "+")
1622 << endl;
1623 cout << "daehwan - curr corrds: "
1624 << currHit->left()
1625 << "\t"
1626 << currHit->right()
1627 << endl;
1628 }
1629
1630 if ((fusion_dir == FUSION_FR || fusion_dir == FUSION_RF) &&
1631 prevHit->ref_id2() != currHit->ref_id())
1632 continue;
1633
1634 if ((fusion_dir == FUSION_FR && !currHit->antisense_align()) ||
1635 (fusion_dir == FUSION_RF && currHit->antisense_align()))
1636 continue;
1637
1638 if (currHit_fused && dir == FUSION_RR)
1639 *currHit = currHit->reverse();
1640
1641 if (fusion_dir == FUSION_FR || fusion_dir == FUSION_RF ||
1642 (currHit_fused && currHit->ref_id() == currHit->ref_id2() && (dir == FUSION_FR || dir == FUSION_RF)))
1643 {
1644 if (currHit_fused)
1645 {
1646 if ((dir == FUSION_FR && currHit->antisense_align()) ||
1647 (dir == FUSION_RF && !currHit->antisense_align()))
1648 *currHit = currHit->reverse();
1649 }
1650 else
1651 {
1652 if (fusion_dir == FUSION_FR && currHit->antisense_align())
1653 *currHit = currHit->reverse();
1654 }
1655 }
1656 /*
1657 * Switch prevHit and currHit in FUSION_NOTHING and FUSION_FF cases
1658 * to make it easier to check the distance in the gap between the two segments.
1659 */
1660 else if ((num_fusions == 0 && prevHit->antisense_align() && currHit->antisense_align() && prevHit->ref_id() == currHit->ref_id() && prevHit->left() <= currHit->right() + max_report_intron_length && prevHit->left() + max_insertion_length >= currHit->right()) ||
1661 (num_fusions == 1 && (dir == FUSION_FF || dir == FUSION_RR)&&
1662 ((!prevHit_fused && prevHit->antisense_align()) || (!currHit_fused && currHit->antisense_align())))
1663 )
1664 {
1665 BowtieHit* tempHit = prevHit;
1666 prevHit = currHit;
1667 currHit = tempHit;
1668 }
1669 else if (num_fusions == 0)
1670 {
1671 if (prevHit->ref_id2() == currHit->ref_id() && prevHit->antisense_align() == currHit->antisense_align())
1672 {
1673 int dist = 0;
1674 if (prevHit->antisense_align())
1675 dist = prevHit->left() - currHit->right();
1676 else
1677 dist = currHit->left() - prevHit->right();
1678
1679 if (dist > max_report_intron_length || dist < -(int)max_insertion_length)
1680 {
1681 if ((prevHit->antisense_align() && prevHit->left() > currHit->left()) ||
1682 (!prevHit->antisense_align() && prevHit->left() < currHit->left()))
1683 dir = FUSION_FF;
1684 else
1685 dir = FUSION_RR;
1686 }
1687 }
1688 else
1689 {
1690 if (prevHit->antisense_align() == currHit->antisense_align())
1691 {
1692 if ((prevHit->antisense_align() && prevHit->ref_id() > currHit->ref_id()) ||
1693 (!prevHit->antisense_align() && prevHit->ref_id() < currHit->ref_id()))
1694 dir = FUSION_FF;
1695 else
1696 dir = FUSION_RR;
1697 }
1698 else if (!prevHit->antisense_align())
1699 dir = FUSION_FR;
1700 else
1701 dir = FUSION_RF;
1702
1703 if (dir == FUSION_FR)
1704 *currHit = currHit->reverse();
1705 else if(dir == FUSION_RF)
1706 *prevHit = prevHit->reverse();
1707 }
1708 }
1709
1710 // daehwan - test
1711 if (bDebug)
1712 {
1713 cout << "insert id: " << prevHit->insert_id() << endl;
1714 cout << "(" << curr - 1 << ") prev: " << prevHit->seq() << " : " << (prevHit->fusion_opcode() != FUSION_NOTHING ? "fused" : "no") << endl;
1715 cout << "(" << curr << ") curr: " << currHit->seq() << " : " << (currHit->fusion_opcode() != FUSION_NOTHING ? "fused" : "no") << endl;
1716 cout << "prev ref: " << prevHit->ref_id() << "-" << prevHit->ref_id2() << ": " << print_cigar(prevHit->cigar()) << endl;
1717 cout << "curr ref: " << currHit->ref_id() << "-" << currHit->ref_id2() << ": " << print_cigar(currHit->cigar()) << endl;
1718 cout << "prev coords: "
1719 << prevHit->left()
1720 << "\t"
1721 << prevHit->right()
1722 << endl;
1723 cout << "curr corrds: "
1724 << currHit->left()
1725 << "\t"
1726 << currHit->right()
1727 << endl;
1728 cout << "prev sense: "
1729 << (prevHit->antisense_align() ? "-" : "+")
1730 << "\t"
1731 << (prevHit->antisense_align2() ? "-" : "+")
1732 << endl;
1733 cout << "curr sense: "
1734 << (currHit->antisense_align() ? "-" : "+")
1735 << "\t"
1736 << (currHit->antisense_align2() ? "-" : "+")
1737 << endl;
1738 }
1739
1740 if (num_fusions == 1)
1741 {
1742 // daehwan
1743 if (bDebug)
1744 {
1745 cout << "direction: " << (int)dir << endl;
1746 }
1747
1748 /*
1749 * orient the fused segment, which depends on a fusion direction.
1750 */
1751 if (dir != FUSION_FF && dir != FUSION_RR)
1752 {
1753 bool prevHit_rep = false;
1754 bool currHit_rep = false;
1755
1756 if (prevHit_fused)
1757 {
1758 if ((dir == FUSION_FR && !currHit->antisense_align()) ||
1759 (dir == FUSION_RF && currHit->antisense_align()))
1760 continue;
1761
1762 if (prevHit->ref_id2() != currHit->ref_id())
1763 prevHit_rep = true;
1764 else if ((dir == FUSION_FR && prevHit->antisense_align()) ||
1765 (dir == FUSION_RF && !prevHit->antisense_align()))
1766 prevHit_rep = true;
1767 }
1768
1769 if (currHit_fused)
1770 {
1771 if ((dir == FUSION_FR && prevHit->antisense_align()) ||
1772 (dir == FUSION_RF && !prevHit->antisense_align()))
1773 continue;
1774
1775 if (currHit->ref_id() != prevHit->ref_id2())
1776 currHit_rep = true;
1777 }
1778
1779 if (bDebug)
1780 {
1781 if (prevHit_rep) cout << "1. reversed in prev" << endl;
1782 if (currHit_rep) cout << "1. reversed in curr" << endl;
1783 }
1784
1785 if (prevHit_rep)
1786 *prevHit = prevHit->reverse();
1787 if (currHit_rep)
1788 *currHit = currHit->reverse();
1789
1790 prevHit_rep = false;
1791 currHit_rep = false;
1792
1793 if (prevHit_fused)
1794 {
1795 if (prevHit->is_forwarding_right() != currHit->is_forwarding_left())
1796 currHit_rep = true;
1797 }
1798 else
1799 {
1800 if (prevHit->is_forwarding_right() != currHit->is_forwarding_left())
1801 prevHit_rep = true;
1802 }
1803
1804 if (prevHit_rep)
1805 *prevHit = prevHit->reverse();
1806 if (currHit_rep)
1807 *currHit = currHit->reverse();
1808
1809 // daehwan
1810 if (bDebug)
1811 {
1812 if (prevHit_rep) cout << "2. reversed in prev" << endl;
1813 if (currHit_rep) cout << "2. reversed in curr" << endl;
1814 }
1815 }
1816 }
1817
1818 bool same_contig = prevHit->ref_id2() == currHit->ref_id();
1819 if (!same_contig && num_fusions > 0)
1820 continue;
1821
1822 if (same_contig && num_fusions >= 1)
1823 {
1824 if (prevHit->antisense_align2() != currHit->antisense_align())
1825 continue;
1826 }
1827
1828 int bh_l = 0, back_right = 0, dist = 0;
1829 if (same_contig)
1830 {
1831 if ((fusion_dir == FUSION_FR || fusion_dir == FUSION_RF || dir == FUSION_FR || dir == FUSION_RF) &&
1832 prevHit->antisense_align2())
1833 {
1834 bh_l = prevHit->right() + 1;
1835 back_right = currHit->left() + 1;
1836 }
1837 else
1838 {
1839 bh_l = currHit->left();
1840 back_right = prevHit->right();
1841 }
1842
1843 dist = bh_l - back_right;
1844 }
1845
1846 // daehwan - pass
1847 if (bDebug)
1848 {
1849 cout << "daehwan - pass" << endl;
1850 cout << "prev coords: " << prevHit->left() << "\t" << prevHit->right() << endl;
1851 cout << "curr coords: " << currHit->left() << "\t" << currHit->right() << endl;
1852 }
1853
1854 if (!same_contig ||
1855 (same_contig && num_fusions == 0 && dir != FUSION_NOTHING && fusion_dir == FUSION_NOTHING) ||
1856 (same_contig && dist <= max_report_intron_length && dist >= -(int)max_insertion_length && prevHit->is_forwarding_right() == currHit->is_forwarding_left()))
1857 {
1858 // daehwan
1859 if (bDebug)
1860 {
1861 cout << "daehwan - really passed!!" << endl;
1862 }
1863
1864 BowtieHit tempHit = seg_hit_stack.back();
1865 seg_hit_stack.back() = bh_prev;
1866
1867 // these hits are compatible, so push bh onto the
1868 // stack, recurse, and pop it when done.
1869 seg_hit_stack.push_back(bh);
1870 bool success = dfs_seg_hits(rt,
1871 read_seq,
1872 read_quals,
1873 possible_juncs,
1874 possible_insertions,
1875 possible_fusions,
1876 seg_hits_for_read,
1877 curr + 1,
1878 seg_hit_stack,
1879 joined_hits,
1880 dir == FUSION_NOTHING ? fusion_dir : dir);
1881
1882 if (success)
1883 join_success = true;
1884
1885 seg_hit_stack.pop_back();
1886 seg_hit_stack.back() = tempHit;
1887 }
1888 }
1889 }
1890 else
1891 {
1892 merge_segment_chain(rt,
1893 read_seq,
1894 read_quals,
1895 possible_juncs,
1896 possible_insertions,
1897 possible_fusions,
1898 seg_hit_stack,
1899 joined_hits,
1900 fusion_dir);
1901
1902 return join_success = true;
1903 }
1904 return join_success;
1905 }
1906
1907 bool join_segments_for_read(RefSequenceTable& rt,
1908 const string& read_seq,
1909 const string& read_quals,
1910 std::set<Junction>& possible_juncs,
1911 std::set<Insertion>& possible_insertions,
1912 std::set<Fusion>& possible_fusions,
1913 vector<HitsForRead>& seg_hits_for_read,
1914 vector<BowtieHit>& joined_hits)
1915 {
1916 vector<BowtieHit> seg_hit_stack;
1917 bool join_success = false;
1918
1919 // ignore segments that map to more than this many places.
1920 if (bowtie2)
1921 {
1922 const size_t max_seg_hits = max_multihits * 2;
1923 for (size_t s = 0; s < seg_hits_for_read.size(); ++s)
1924 {
1925 if (seg_hits_for_read[s].hits.size() > max_seg_hits)
1926 return join_success;
1927 }
1928 }
1929
1930 for (size_t i = 0; i < seg_hits_for_read[0].hits.size(); ++i)
1931 {
1932 BowtieHit& bh = seg_hits_for_read[0].hits[i];
1933
1934 if (bh.fusion_opcode() == FUSION_RR)
1935 seg_hit_stack.push_back(bh.reverse());
1936 else
1937 seg_hit_stack.push_back(bh);
1938
1939 bool success = dfs_seg_hits(rt,
1940 read_seq,
1941 read_quals,
1942 possible_juncs,
1943 possible_insertions,
1944 possible_fusions,
1945 seg_hits_for_read,
1946 1,
1947 seg_hit_stack,
1948 joined_hits);
1949 if (success)
1950 join_success = true;
1951 seg_hit_stack.pop_back();
1952 }
1953
1954 return join_success;
1955 }
1956
1957 struct JoinSegmentsWorker
1958 {
1959 void operator()()
1960 {
1961 ReadTable it;
1962 GBamWriter bam_writer(bam_output_fname.c_str(), sam_header_fname.c_str());
1963 ReadStream readstream(reads_fname);
1964 if (readstream.file() == NULL)
1965 err_die("Error: cannot open %s for reading\n", reads_fname.c_str());
1966
1967 if (read_offset > 0)
1968 readstream.seek(read_offset);
1969
1970 bool need_seq = true;
1971 bool need_qual = color;
1972
1973 vector<HitStream> contig_hits;
1974 vector<HitStream> spliced_hits;
1975 vector<HitFactory*> factories;
1976 for (size_t i = 0; i < segmap_fnames.size(); ++i)
1977 {
1978 HitFactory* fac = new BAMHitFactory(it, *rt);
1979 factories.push_back(fac);
1980 HitStream hs(segmap_fnames[i], fac, false, false, false, need_seq, need_qual);
1981
1982 if (seg_offsets[i] > 0)
1983 hs.seek(seg_offsets[i]);
1984
1985 contig_hits.push_back(hs);
1986 }
1987
1988 for (size_t i = 0; i < spliced_segmap_fnames.size(); ++i)
1989 {
1990 int anchor_length = 0;
1991 HitFactory* fac = new SplicedBAMHitFactory(it, *rt, anchor_length);
1992 factories.push_back(fac);
1993
1994 HitStream hs(spliced_segmap_fnames[i], fac, true, false, false, need_seq, need_qual);
1995 if (spliced_seg_offsets[i] > 0)
1996 hs.seek(spliced_seg_offsets[i]);
1997
1998 spliced_hits.push_back(hs);
1999 }
2000
2001 uint32_t curr_contig_obs_order = VMAXINT32;
2002 HitStream* first_seg_contig_stream = NULL;
2003 uint64_t next_contig_id = 0;
2004
2005 if (contig_hits.size() > 0)
2006 {
2007 first_seg_contig_stream = &(contig_hits.front());
2008 next_contig_id = first_seg_contig_stream->next_group_id();
2009 curr_contig_obs_order = it.observation_order(next_contig_id);
2010 }
2011
2012 HitsForRead curr_hit_group;
2013
2014 uint32_t curr_spliced_obs_order = VMAXINT32;
2015 HitStream* first_seg_spliced_stream = NULL;
2016 uint64_t next_spliced_id = 0;
2017
2018 if (spliced_hits.size() > 0)
2019 {
2020 first_seg_spliced_stream = &(spliced_hits.front());
2021 next_spliced_id = first_seg_spliced_stream->next_group_id();
2022 curr_spliced_obs_order = it.observation_order(next_spliced_id);
2023 }
2024
2025 while((curr_contig_obs_order != VMAXINT32 || curr_spliced_obs_order != VMAXINT32) &&
2026 (curr_contig_obs_order < end_id || curr_spliced_obs_order < end_id))
2027 {
2028 uint32_t read_in_process;
2029 vector<HitsForRead> seg_hits_for_read;
2030 seg_hits_for_read.resize(contig_hits.size());
2031
2032 if (curr_contig_obs_order < curr_spliced_obs_order)
2033 {
2034 first_seg_contig_stream->next_read_hits(curr_hit_group);
2035 seg_hits_for_read.front() = curr_hit_group;
2036
2037 next_contig_id = first_seg_contig_stream->next_group_id();
2038 uint32_t next_order = it.observation_order(next_contig_id);
2039
2040 read_in_process = curr_contig_obs_order;
2041 curr_contig_obs_order = next_order;
2042 }
2043 else if (curr_spliced_obs_order < curr_contig_obs_order)
2044 {
2045 first_seg_spliced_stream->next_read_hits(curr_hit_group);
2046 seg_hits_for_read.front() = curr_hit_group;
2047
2048 next_spliced_id = first_seg_spliced_stream->next_group_id();
2049 uint32_t next_order = it.observation_order(next_spliced_id);
2050
2051 read_in_process = curr_spliced_obs_order;
2052 curr_spliced_obs_order = next_order;
2053
2054 if (read_in_process < begin_id)
2055 continue;
2056 }
2057 else if (curr_contig_obs_order == curr_spliced_obs_order &&
2058 curr_contig_obs_order != VMAXINT32 &&
2059 curr_spliced_obs_order != VMAXINT32)
2060 {
2061 first_seg_contig_stream->next_read_hits(curr_hit_group);
2062
2063 HitsForRead curr_spliced_group;
2064 first_seg_spliced_stream->next_read_hits(curr_spliced_group);
2065
2066 curr_hit_group.hits.insert(curr_hit_group.hits.end(),
2067 curr_spliced_group.hits.begin(),
2068 curr_spliced_group.hits.end());
2069 seg_hits_for_read.front() = curr_hit_group;
2070 read_in_process = curr_spliced_obs_order;
2071
2072 next_contig_id = first_seg_contig_stream->next_group_id();
2073 uint32_t next_order = it.observation_order(next_contig_id);
2074
2075 next_spliced_id = first_seg_spliced_stream->next_group_id();
2076 uint32_t next_spliced_order = it.observation_order(next_spliced_id);
2077
2078 curr_spliced_obs_order = next_spliced_order;
2079 curr_contig_obs_order = next_order;
2080 }
2081 else
2082 {
2083 break;
2084 }
2085
2086 if (contig_hits.size() > 1)
2087 {
2088 look_right_for_hit_group(it,
2089 contig_hits,
2090 0,
2091 spliced_hits,
2092 curr_hit_group,
2093 seg_hits_for_read);
2094 }
2095
2096 size_t last_non_empty = seg_hits_for_read.size() - 1;
2097 while(last_non_empty >= 0 && seg_hits_for_read[last_non_empty].hits.empty())
2098 {
2099 --last_non_empty;
2100 }
2101
2102 seg_hits_for_read.resize(last_non_empty + 1);
2103 if (!seg_hits_for_read[last_non_empty].hits[0].end())
2104 continue;
2105
2106 if (!seg_hits_for_read.empty() && !seg_hits_for_read[0].hits.empty())
2107 {
2108 uint64_t insert_id = seg_hits_for_read[0].hits[0].insert_id();
2109 if (insert_id >= begin_id && insert_id < end_id)
2110 {
2111 Read read;
2112 if (readstream.getRead(insert_id, read))
2113 {
2114 vector<BowtieHit> joined_hits;
2115 join_segments_for_read(*rt,
2116 read.seq.c_str(),
2117 read.qual.c_str(),
2118 *possible_juncs,
2119 *possible_insertions,
2120 *possible_fusions,
2121 seg_hits_for_read,
2122 joined_hits);
2123
2124 sort(joined_hits.begin(), joined_hits.end());
2125 vector<BowtieHit>::iterator new_end = unique(joined_hits.begin(), joined_hits.end());
2126 joined_hits.erase(new_end, joined_hits.end());
2127 for (size_t i = 0; i < joined_hits.size(); i++)
2128 {
2129 const char* ref_name = rt->get_name(joined_hits[i].ref_id());
2130 const char* ref_name2 = "";
2131 if (joined_hits[i].fusion_opcode() != FUSION_NOTHING)
2132 ref_name2 = rt->get_name(joined_hits[i].ref_id2());
2133
2134 if (color && !color_out)
2135 print_bamhit(bam_writer, read.name.c_str(), joined_hits[i], ref_name, ref_name2,
2136 joined_hits[i].seq().c_str(), joined_hits[i].qual().c_str(), true);
2137 else
2138 print_bamhit(bam_writer, read.name.c_str(), joined_hits[i], ref_name, ref_name2,
2139 read.seq.c_str(), read.qual.c_str(), false);
2140 }
2141 }
2142 else
2143 {
2144 err_die("Error: could not get read # %d from stream\n",
2145 read_in_process);
2146 }
2147 }
2148 }
2149 else
2150 {
2151 //fprintf(stderr, "Warning: couldn't join segments for read # %d\n", read_in_process);
2152 }
2153 }
2154
2155 for (size_t fac = 0; fac < factories.size(); ++fac)
2156 {
2157 delete factories[fac];
2158 }
2159 factories.clear();
2160 }
2161
2162 string bam_output_fname;
2163 string sam_header_fname;
2164 string reads_fname;
2165
2166 vector<string> segmap_fnames;
2167 vector<string> spliced_segmap_fnames;
2168
2169 std::set<Junction>* possible_juncs;
2170 std::set<Insertion>* possible_insertions;
2171 std::set<Fusion>* possible_fusions;
2172
2173 RefSequenceTable* rt;
2174
2175 uint64_t begin_id;
2176 uint64_t end_id;
2177 int64_t read_offset;
2178 vector<int64_t> seg_offsets;
2179 vector<int64_t> spliced_seg_offsets;
2180 };
2181
2182 void driver(const string& bam_output_fname,
2183 istream& ref_stream,
2184 vector<FILE*>& possible_juncs_files,
2185 vector<FILE*>& possible_insertions_files,
2186 vector<FILE*>& possible_deletions_files,
2187 vector<FILE*>& possible_fusions_files,
2188 vector<string>& spliced_segmap_fnames, //.bam files
2189 vector<string>& segmap_fnames, //.bam files
2190 const string& reads_fname)
2191 {
2192 if (!parallel)
2193 num_threads = 1;
2194
2195 if (segmap_fnames.size() == 0)
2196 {
2197 fprintf(stderr, "No hits to process, exiting\n");
2198 exit(0);
2199 }
2200
2201 RefSequenceTable rt(sam_header, true);
2202 fprintf (stderr, "Loading reference sequences...\n");
2203 get_seqs(ref_stream, rt, true);
2204 fprintf (stderr, " reference sequences loaded.\n");
2205
2206 fprintf(stderr, "Loading junctions...");
2207 std::set<Junction> possible_juncs;
2208
2209 for (size_t i = 0; i < possible_juncs_files.size(); ++i)
2210 {
2211 char buf[2048];
2212 while(!feof(possible_juncs_files[i]) &&
2213 fgets(buf, sizeof(buf), possible_juncs_files[i]))
2214 {
2215 char junc_ref_name[256];
2216 int left;
2217 int right;
2218 char orientation;
2219 int ret = sscanf(buf, "%s %d %d %c", junc_ref_name, &left, &right, &orientation);
2220 if (ret != 4)
2221 continue;
2222 uint32_t ref_id = rt.get_id(junc_ref_name, NULL, 0);
2223 possible_juncs.insert(Junction(ref_id, left, right, orientation == '-'));
2224 }
2225 }
2226 fprintf(stderr, "done\n");
2227
2228 fprintf(stderr, "Loading deletions...");
2229
2230 for (size_t i = 0; i < possible_deletions_files.size(); ++i)
2231 {
2232 char splice_buf[2048];
2233 FILE* deletions_file = possible_deletions_files[i];
2234 if(!deletions_file){
2235 continue;
2236 }
2237 while(fgets(splice_buf, 2048, deletions_file)){
2238 char* nl = strrchr(splice_buf, '\n');
2239 char* buf = splice_buf;
2240 if (nl) *nl = 0;
2241
2242 char* ref_name = get_token((char**)&buf, "\t");
2243 char* scan_left_coord = get_token((char**)&buf, "\t");
2244 char* scan_right_coord = get_token((char**)&buf, "\t");
2245
2246 if (!scan_left_coord || !scan_right_coord)
2247 {
2248 err_die("Error: malformed deletion coordinate record\n");
2249 }
2250
2251 uint32_t ref_id = rt.get_id(ref_name,NULL,0);
2252 uint32_t left_coord = atoi(scan_left_coord);
2253 uint32_t right_coord = atoi(scan_right_coord);
2254 possible_juncs.insert((Junction)Deletion(ref_id, left_coord - 1,right_coord, false));
2255 }
2256 }
2257 fprintf(stderr, "done\n");
2258
2259 /*
2260 * Read the insertions from the list of insertion
2261 * files into a set
2262 */
2263 fprintf(stderr, "Loading insertions...");
2264 std::set<Insertion> possible_insertions;
2265 for (size_t i=0; i < possible_insertions_files.size(); ++i)
2266 {
2267 char splice_buf[2048];
2268 FILE* insertions_file = possible_insertions_files[i];
2269 if(!insertions_file){
2270 continue;
2271 }
2272 while(fgets(splice_buf, 2048, insertions_file)){
2273 char* nl = strrchr(splice_buf, '\n');
2274 char* buf = splice_buf;
2275 if (nl) *nl = 0;
2276
2277 char* ref_name = get_token((char**)&buf, "\t");
2278 char* scan_left_coord = get_token((char**)&buf, "\t");
2279 char* scan_right_coord = get_token((char**)&buf, "\t");
2280 char* scan_sequence = get_token((char**)&buf, "\t");
2281
2282 if (!scan_left_coord || !scan_sequence || !scan_right_coord)
2283 {
2284 err_die("Error: malformed insertion coordinate record\n");
2285 }
2286
2287 uint32_t ref_id = rt.get_id(ref_name,NULL,0);
2288 uint32_t left_coord = atoi(scan_left_coord);
2289 std::string sequence(scan_sequence);
2290 possible_insertions.insert(Insertion(ref_id, left_coord, sequence));
2291 }
2292 }
2293 fprintf(stderr, "done\n");
2294
2295 vector<uint64_t> read_ids;
2296 vector<vector<int64_t> > offsets;
2297 if (num_threads > 1)
2298 {
2299 vector<string> fnames;
2300 fnames.push_back(reads_fname);
2301 fnames.insert(fnames.end(), spliced_segmap_fnames.rbegin(), spliced_segmap_fnames.rend());
2302 fnames.insert(fnames.end(), segmap_fnames.rbegin(), segmap_fnames.rend());
2303 bool enough_data = calculate_offsets(fnames, read_ids, offsets);
2304 if (!enough_data)
2305 num_threads = 1;
2306 }
2307
2308 std::set<Fusion> possible_fusions;
2309 if (fusion_search)
2310 {
2311 fprintf(stderr, "Loading fusions...");
2312 for (size_t i=0; i < possible_fusions_files.size(); ++i)
2313 {
2314 char splice_buf[2048];
2315 FILE* fusions_file = possible_fusions_files[i];
2316 if(!fusions_file){
2317 continue;
2318 }
2319 while(fgets(splice_buf, 2048, fusions_file)){
2320 char* nl = strrchr(splice_buf, '\n');
2321 char* buf = splice_buf;
2322 if (nl) *nl = 0;
2323
2324 char* ref_name1 = strsep((char**)&buf, "\t");
2325 char* scan_left_coord = strsep((char**)&buf, "\t");
2326 char* ref_name2 = strsep((char**)&buf, "\t");
2327 char* scan_right_coord = strsep((char**)&buf, "\t");
2328 char* scan_dir = strsep((char**)&buf, "\t");
2329
2330 if (!ref_name1 || !scan_left_coord || !ref_name2 || !scan_right_coord || !scan_dir)
2331 {
2332 fprintf(stderr,"Error: malformed insertion coordinate record\n");
2333 exit(1);
2334 }
2335
2336 uint32_t ref_id1 = rt.get_id(ref_name1,NULL,0);
2337 uint32_t ref_id2 = rt.get_id(ref_name2,NULL,0);
2338 uint32_t left_coord = atoi(scan_left_coord);
2339 uint32_t right_coord = atoi(scan_right_coord);
2340 uint32_t dir = FUSION_FF;
2341 if (strcmp(scan_dir, "fr") == 0)
2342 dir = FUSION_FR;
2343 else if(strcmp(scan_dir, "rf") == 0)
2344 dir = FUSION_RF;
2345 else if (strcmp(scan_dir, "rr") == 0)
2346 dir = FUSION_RR;
2347
2348 possible_fusions.insert(Fusion(ref_id1, ref_id2, left_coord, right_coord, dir));
2349 }
2350 }
2351 fprintf(stderr, "done\n");
2352 }
2353
2354 vector<boost::thread*> threads;
2355 for (int i = 0; i < num_threads; ++i)
2356 {
2357 JoinSegmentsWorker worker;
2358
2359 if (num_threads == 1)
2360 worker.bam_output_fname = bam_output_fname;
2361 else
2362 {
2363 string filename_base = bam_output_fname.substr(0, bam_output_fname.length() - 4);
2364 char filename[1024] = {0};
2365 sprintf(filename, "%s%d.bam", filename_base.c_str(), i);
2366 worker.bam_output_fname = filename;
2367 }
2368
2369 worker.sam_header_fname = sam_header;
2370 worker.reads_fname = reads_fname;
2371 worker.segmap_fnames = segmap_fnames;
2372 worker.spliced_segmap_fnames = spliced_segmap_fnames;
2373 worker.possible_juncs = &possible_juncs;
2374 worker.possible_insertions = &possible_insertions;
2375 worker.possible_fusions = &possible_fusions;
2376 worker.rt = &rt;
2377 if (i == 0)
2378 {
2379 worker.begin_id = 0;
2380 worker.seg_offsets = vector<int64_t>(segmap_fnames.size(), 0);
2381 worker.spliced_seg_offsets = vector<int64_t>(spliced_segmap_fnames.size(), 0);
2382 worker.read_offset = 0;
2383 }
2384 else
2385 {
2386 worker.begin_id = read_ids[i-1];
2387 worker.seg_offsets.insert(worker.seg_offsets.end(),
2388 offsets[i-1].rbegin(),
2389 offsets[i-1].rbegin() + segmap_fnames.size());
2390 worker.spliced_seg_offsets.insert(worker.spliced_seg_offsets.end(),
2391 offsets[i-1].rbegin() + segmap_fnames.size(),
2392 offsets[i-1].rend() - 1);
2393 worker.read_offset = offsets[i-1][0];
2394 }
2395
2396 worker.end_id = (i+1 < num_threads) ? read_ids[i] : std::numeric_limits<uint64_t>::max();
2397
2398 if (num_threads > 1)
2399 threads.push_back(new boost::thread(worker));
2400 else
2401 worker();
2402 }
2403
2404 for (size_t i = 0; i < threads.size(); ++i)
2405 {
2406 threads[i]->join();
2407 delete threads[i];
2408 threads[i] = NULL;
2409 }
2410 threads.clear();
2411
2412 } //driver
2413
2414 int main(int argc, char** argv)
2415 {
2416 fprintf(stderr, "long_spanning_reads v%s (%s)\n", PACKAGE_VERSION, SVN_REVISION);
2417 fprintf(stderr, "--------------------------------------------\n");
2418
2419 int parse_ret = parse_options(argc, argv, print_usage);
2420 if (parse_ret)
2421 return parse_ret;
2422
2423 if(optind >= argc)
2424 {
2425 print_usage();
2426 return 1;
2427 }
2428
2429 string ref_file_name = argv[optind++];
2430
2431 if(optind >= argc)
2432 {
2433 print_usage();
2434 return 1;
2435 }
2436
2437
2438 string reads_file_name = argv[optind++];
2439
2440 if(optind >= argc)
2441 {
2442 print_usage();
2443 return 1;
2444 }
2445
2446 string juncs_file_list = argv[optind++];
2447
2448 if(optind >= argc)
2449 {
2450 print_usage();
2451 return 1;
2452 }
2453
2454 string insertions_file_list = argv[optind++];
2455 if(optind >= argc)
2456 {
2457 print_usage();
2458 return 1;
2459 }
2460
2461 string deletions_file_list = argv[optind++];
2462
2463 if(optind >= argc)
2464 {
2465 print_usage();
2466 return 1;
2467 }
2468
2469 string fusions_file_list = argv[optind++];
2470
2471 if(optind >= argc)
2472 {
2473 print_usage();
2474 return 1;
2475 }
2476
2477 string bam_output_fname = argv[optind++];
2478
2479 if(optind >= argc)
2480 {
2481 print_usage();
2482 return 1;
2483 }
2484
2485 string segmap_file_list = argv[optind++];
2486 string spliced_segmap_flist;
2487 if(optind < argc)
2488 {
2489 spliced_segmap_flist = argv[optind++];
2490 }
2491
2492 ifstream ref_stream(ref_file_name.c_str(), ifstream::in);
2493 if (!ref_stream.good())
2494 err_die("Error: cannot open %s for reading\n",ref_file_name.c_str());
2495
2496 checkSamHeader();
2497
2498 //FILE* reads_file = fopen(reads_file_name.c_str(), "r");
2499 vector<string> segmap_file_names;
2500 tokenize(segmap_file_list, ",",segmap_file_names);
2501
2502 vector<string> juncs_file_names;
2503 vector<FILE*> juncs_files;
2504 tokenize(juncs_file_list, ",",juncs_file_names);
2505 for (size_t i = 0; i < juncs_file_names.size(); ++i) {
2506 //fprintf(stderr, "Opening %s for reading\n",
2507 // juncs_file_names[i].c_str());
2508 FILE* juncs_file = fopen(juncs_file_names[i].c_str(), "r");
2509 if (juncs_file == NULL) {
2510 fprintf(stderr, "Warning: cannot open %s for reading\n",
2511 juncs_file_names[i].c_str());
2512 continue;
2513 }
2514 juncs_files.push_back(juncs_file);
2515 }
2516
2517 /*
2518 * Read in the deletion file names
2519 */
2520 vector<string> deletions_file_names;
2521 vector<FILE*> deletions_files;
2522 tokenize(deletions_file_list, ",",deletions_file_names);
2523 for (size_t i = 0; i < deletions_file_names.size(); ++i)
2524 {
2525 //fprintf(stderr, "Opening %s for reading\n",
2526 // deletions_file_names[i].c_str());
2527 FILE* deletions_file = fopen(deletions_file_names[i].c_str(), "r");
2528 if (deletions_file == NULL) {
2529 fprintf(stderr, "Warning: cannot open %s for reading\n",
2530 deletions_file_names[i].c_str());
2531 continue;
2532 }
2533 deletions_files.push_back(deletions_file);
2534 }
2535
2536 /*
2537 * Read in the list of filenames that contain
2538 * insertion coordinates
2539 */
2540
2541 vector<string> insertions_file_names;
2542 vector<FILE*> insertions_files;
2543 tokenize(insertions_file_list, ",",insertions_file_names);
2544 for (size_t i = 0; i < insertions_file_names.size(); ++i)
2545 {
2546 //fprintf(stderr, "Opening %s for reading\n",
2547 // insertions_file_names[i].c_str());
2548 FILE* insertions_file = fopen(insertions_file_names[i].c_str(), "r");
2549 if (insertions_file == NULL)
2550 {
2551 fprintf(stderr, "Warning: cannot open %s for reading\n",
2552 insertions_file_names[i].c_str());
2553 continue;
2554 }
2555 insertions_files.push_back(insertions_file);
2556 }
2557
2558 vector<string> spliced_segmap_file_names;
2559 vector<FZPipe> spliced_segmap_files;
2560 string unzcmd;
2561 tokenize(spliced_segmap_flist, ",",spliced_segmap_file_names);
2562
2563 vector<string> fusions_file_names;
2564 vector<FILE*> fusions_files;
2565 tokenize(fusions_file_list, ",",fusions_file_names);
2566 for (size_t i = 0; i < fusions_file_names.size(); ++i)
2567 {
2568 fprintf(stderr, "Opening %s for reading\n",
2569 fusions_file_names[i].c_str());
2570 FILE* fusions_file = fopen(fusions_file_names[i].c_str(), "r");
2571 if (fusions_file == NULL)
2572 {
2573 fprintf(stderr, "Warning: cannot open %s for reading\n",
2574 fusions_file_names[i].c_str());
2575 continue;
2576 }
2577 fusions_files.push_back(fusions_file);
2578 }
2579
2580
2581 driver(bam_output_fname,
2582 ref_stream,
2583 juncs_files,
2584 insertions_files,
2585 deletions_files,
2586 fusions_files,
2587 spliced_segmap_file_names,
2588 segmap_file_names,
2589 reads_file_name);
2590
2591 return 0;
2592 }