ViewVC Help
View File | Revision Log | Show Annotations | View Changeset | Root Listing
root/gclib/tophat_cpp/long_spanning_reads.cpp
Revision: 159
Committed: Mon Jan 30 22:38:08 2012 UTC (7 years, 8 months ago) by gpertea
File size: 99258 byte(s)
Log Message:
wip

Line File contents
1 #ifdef HAVE_CONFIG_H
2 #include <config.h>
3 #endif
4
5 /*
6 * long_spanning_reads.cpp
7 * TopHat
8 *
9 * Created by Cole Trapnell on 2/5/09.
10 * Copyright 2009 Cole Trapnell. All rights reserved.
11 *
12 */
13
14 #include <cassert>
15 #include <cstdio>
16 #include <vector>
17 #include <string>
18 #include <map>
19 #include <algorithm>
20 #include <set>
21 #include <cstdlib>
22 #include <cstring>
23 #include <bitset>
24 //#include <stdexcept>
25 #include <iostream>
26 #include <fstream>
27 #include <sstream>
28 #include <seqan/sequence.h>
29 #include <seqan/find.h>
30 #include <seqan/file.h>
31 #include <seqan/modifier.h>
32 #include <getopt.h>
33
34 #include <boost/thread.hpp>
35
36 #include "common.h"
37 #include "utils.h"
38 #include "bwt_map.h"
39 #include "tokenize.h"
40 #include "segments.h"
41 #include "reads.h"
42
43 #include "junctions.h"
44 #include "insertions.h"
45 #include "deletions.h"
46 #include "fusions.h"
47
48 using namespace seqan;
49 using namespace std;
50
51 // daehwan
52 bool bDebug = false;
53
54 void print_usage()
55 {
56 fprintf(stderr, "Usage: long_spanning_reads <reads.fa/.fq> <possible_juncs1,...,possible_juncsN> <possible_insertions1,...,possible_insertionsN> <possible_deletions1,...,possible_deletionsN> <seg1.bwtout,...,segN.bwtout> [spliced_seg1.bwtout,...,spliced_segN.bwtout]\n");
57 }
58
59 bool key_lt(const pair<uint32_t, HitsForRead>& lhs,
60 const pair<uint32_t, HitsForRead>& rhs)
61 {
62 return lhs.first < rhs.first;
63 }
64
65 void get_seqs(istream& ref_stream,
66 RefSequenceTable& rt,
67 bool keep_seqs = true)
68 {
69 while(ref_stream.good() &&
70 !ref_stream.eof())
71 {
72 RefSequenceTable::Sequence* ref_str = new RefSequenceTable::Sequence();
73 string name;
74 readMeta(ref_stream, name, Fasta());
75 string::size_type space_pos = name.find_first_of(" \t\r");
76 if (space_pos != string::npos)
77 {
78 name.resize(space_pos);
79 }
80 seqan::read(ref_stream, *ref_str, Fasta());
81
82 rt.get_id(name, keep_seqs ? ref_str : NULL, 0);
83 }
84 }
85
86
87 void look_right_for_hit_group(ReadTable& unmapped_reads,
88 vector<HitStream>& contig_hits,
89 size_t curr_file,
90 vector<HitStream>& spliced_hits,
91 const HitsForRead& targets,
92 vector<HitsForRead>& seg_hits_for_read)
93 {
94 int right_file = curr_file + 1;
95
96 HitStream& right = contig_hits[right_file];
97 uint64_t curr_next_group_id = targets.insert_id;
98 int curr_order = unmapped_reads.observation_order(curr_next_group_id);
99
100 assert (curr_order != -1);
101 while(true)
102 {
103 HitsForRead hit_group;
104 uint64_t right_next_group_id = right.next_group_id();
105 int right_order = unmapped_reads.observation_order(right_next_group_id);
106
107 // If we would have seen the hits by now, bail out.
108 if (curr_order < right_order || right_order == -1)
109 {
110 break;
111 }
112 if (right.next_read_hits(hit_group))
113 {
114 if (hit_group.insert_id == targets.insert_id)
115 {
116 // Some of the targets may be missing, we need to
117 // process them individually
118 seg_hits_for_read[right_file] = hit_group;
119 break;
120 }
121 }
122 }
123
124 HitsForRead& curr_seg_hits = seg_hits_for_read[right_file];
125
126 if (right_file < (int)spliced_hits.size() && right_file >= 0)
127 {
128 // Scan forward in the spliced hits file for this hit group
129 HitsForRead spliced_group;
130 HitsForRead curr_spliced_group;
131 while (spliced_hits[right_file].next_group_id() > 0 &&
132 spliced_hits[right_file].next_group_id() <= (uint32_t)curr_order)
133 {
134 spliced_hits[right_file].next_read_hits(curr_spliced_group);
135
136 if (curr_spliced_group.insert_id == (uint32_t)curr_order)
137 {
138 spliced_group = curr_spliced_group;
139 break;
140 }
141 }
142
143 if (!spliced_group.hits.empty())
144 {
145 curr_seg_hits.insert_id = spliced_group.insert_id;
146 curr_seg_hits.hits.insert(curr_seg_hits.hits.end(),
147 spliced_group.hits.begin(),
148 spliced_group.hits.end());
149 }
150 }
151
152 if (curr_seg_hits.hits.empty())
153 return;
154 else if (right_file + 1 < (int)contig_hits.size())
155 {
156 look_right_for_hit_group(unmapped_reads,
157 contig_hits,
158 curr_file + 1,
159 spliced_hits,
160 curr_seg_hits,
161 seg_hits_for_read);
162 }
163 }
164
165 BowtieHit merge_chain_color(RefSequenceTable& rt,
166 const string& read_seq,
167 const string& read_quals,
168 std::set<Junction>& possible_juncs,
169 std::set<Insertion>& possible_insertions,
170 list<BowtieHit>& hit_chain)
171 {
172 bool antisense = hit_chain.front().antisense_align();
173 uint32_t reference_id = hit_chain.front().ref_id();
174 uint64_t insert_id = hit_chain.front().insert_id();
175
176 int left = hit_chain.front().left();
177
178 list<BowtieHit>::iterator prev_hit = hit_chain.begin();
179 list<BowtieHit>::iterator curr_hit = ++(hit_chain.begin());
180
181 string seq;
182 string qual;
183 int old_read_length = 0;
184 int first_seg_length = hit_chain.front().seq().length();
185 for (list<BowtieHit>::iterator i = hit_chain.begin(); i != hit_chain.end(); ++i)
186 {
187 seq += i->seq();
188 qual += i->qual();
189 old_read_length += i->read_len();
190 }
191
192 string rev_read_seq, rev_read_quals;
193 if (color && antisense)
194 {
195 rev_read_seq = read_seq;
196 reverse(rev_read_seq.begin() + 1, rev_read_seq.end());
197
198 rev_read_quals = read_quals;
199 reverse(rev_read_quals.begin(), rev_read_quals.end());
200 }
201
202 while (curr_hit != hit_chain.end() && prev_hit != hit_chain.end())
203 {
204 /*
205 * Note that the gap size may be negative, since left() and right() return
206 * signed integers, this will be OK.
207 */
208 int gap = curr_hit->left() - prev_hit->right();
209 if (gap < -(int)max_insertion_length ||
210 (gap > (int)max_deletion_length &&
211 (gap < min_report_intron_length || gap > max_report_intron_length)))
212 {
213 return BowtieHit();
214 }
215
216 ++prev_hit;
217 ++curr_hit;
218 }
219
220 prev_hit = hit_chain.begin();
221 curr_hit = ++(hit_chain.begin());
222
223 RefSequenceTable::Sequence* ref_str = rt.get_seq(prev_hit->ref_id());
224 if (!ref_str)
225 return BowtieHit();
226
227 int curr_seg_index = 1;
228 while (curr_hit != hit_chain.end() && prev_hit != hit_chain.end())
229 {
230 /*
231 * This code is assuming that the cigar strings end and start with a match
232 * While segment alignments can actually end with a junction, insertion or deletion, the hope
233 * is that in those cases, the right and left ends of the alignments will correctly
234 * line up, so we won't get to this bit of code
235 */
236 if (prev_hit->cigar().back().opcode != MATCH || curr_hit->cigar().front().opcode != MATCH)
237 {
238 return BowtieHit();
239 }
240
241 if (prev_hit->is_spliced() && curr_hit->is_spliced() && prev_hit->antisense_splice() != curr_hit->antisense_splice())
242 {
243 /*
244 * There is no way that we can splice these together into a valid
245 * alignment
246 */
247 return BowtieHit();
248 }
249
250 bool found_closure = false;
251 /*
252 * Note that antisense_splice is the same for prev and curr
253 */
254 bool antisense_closure = prev_hit->is_spliced() ? prev_hit->antisense_splice() : curr_hit->antisense_splice();
255 vector<CigarOp> new_cigar;
256 int new_left = -1;
257 int mismatch = 0;
258
259 /*
260 * Take the length of matched bases in each segment into consideration for closures,
261 * this can be a problem for reads of variable lengths.
262 */
263 int prev_right_end_match_length = prev_hit->cigar().back().length;
264 int curr_left_end_match_length = curr_hit->cigar().front().length;
265
266 if (prev_hit->right() > curr_hit->left())
267 {
268 std::set<Insertion>::iterator lb, ub;
269 /*
270 * Note, this offset is determined by the min-anchor length supplied to
271 * juncs_db, which is currently hard-coded at 3 in tophat.py
272 * as this value determines what sort of read segments
273 * should be able to align directly to the splice sequences
274 */
275 int left_boundary = prev_hit->right() - 4;
276 int right_boundary = curr_hit->left() + 4;
277
278 /*
279 * Create a dummy sequence to represent the maximum possible insertion
280 */
281 std::string maxInsertedSequence = "";
282 maxInsertedSequence.resize(max_insertion_length,'A');
283
284 lb = possible_insertions.upper_bound(Insertion(reference_id, left_boundary, ""));
285 ub = possible_insertions.upper_bound(Insertion(reference_id, right_boundary, maxInsertedSequence));
286
287 int reference_mismatch = 0;
288
289 while (lb != ub && lb != possible_insertions.end())
290 {
291 /*
292 * In the following code, we will check to make sure that the segments have the proper
293 * separation and sequence for the insertions, and generate the appropriate merged bowtie hit
294 * In general, reads with insertions must match the inserted sequence exactly.
295 */
296 if (((int)lb->sequence.size()) == (prev_hit->right() - curr_hit->left()))
297 {
298 /*
299 * Check we have enough matched bases on prev or curr segment.
300 */
301 int insert_to_prev_right = prev_hit->right() - lb->left - 1;
302 int curr_left_to_insert = lb->left - curr_hit->left() + 1;
303 if (insert_to_prev_right > prev_right_end_match_length || curr_left_to_insert > curr_left_end_match_length)
304 {
305 ++lb;
306 continue;
307 }
308
309 /*
310 * Keep track of how many mismatches were made to the genome in the region
311 * where we should actually be matching against the insertion
312 */
313 int this_reference_mismatch = 0;
314 int insertion_mismatch = 0;
315 int insertion_len = lb->sequence.length();
316 const seqan::Dna5String insertionSequence = seqan::Dna5String(lb->sequence);
317
318 /*
319 * First check to see if we need to adjust number of observed errors for the left (prev)
320 * hit. This is only the case if this segment runs into the insertion. To be consistent
321 * with bwt_map.cpp, we will not allow a segment to have errors in the insertion region
322 */
323 string colorSegmentSequence_prev;
324 if (insert_to_prev_right > 0)
325 {
326 const seqan::Dna5String referenceSequence = seqan::infix(*ref_str, lb->left + 1, prev_hit->right());
327 const seqan::Dna5String oldSegmentSequence = seqan::Dna5String(prev_hit->seq().substr(prev_hit->seq().length() - insert_to_prev_right));
328
329 if (color)
330 {
331 string color;
332
333 if (antisense)
334 color = rev_read_seq.substr(rev_read_seq.length() - (curr_seg_index * segment_length) - insert_to_prev_right - 2, insert_to_prev_right + 1);
335 else
336 color = read_seq.substr(curr_seg_index * segment_length - insert_to_prev_right, insert_to_prev_right + 1);
337
338 color[0] = prev_hit->seq()[segment_length - insert_to_prev_right - 1];
339 colorSegmentSequence_prev = convert_color_to_bp(color);
340 }
341
342 const seqan::Dna5String newSegmentSequence = color ? colorSegmentSequence_prev : oldSegmentSequence;
343
344 /*
345 * Scan right in the read until we run out of read
346 */
347 for (int read_index = 0; read_index < insert_to_prev_right; ++read_index)
348 {
349 /*
350 * Any mismatch to the insertion is a failure
351 */
352 if (referenceSequence[read_index] == 'N' || referenceSequence[read_index] != oldSegmentSequence[read_index])
353 {
354 ++this_reference_mismatch;
355 }
356
357 if (read_index < insertion_len)
358 {
359 if (insertionSequence[read_index] == 'N' || insertionSequence[read_index] != newSegmentSequence[read_index])
360 {
361 ++insertion_mismatch;
362 break;
363 }
364 }
365 else
366 {
367 if (referenceSequence[read_index - insertion_len] == 'N' ||
368 referenceSequence[read_index - insertion_len] != newSegmentSequence[read_index])
369 {
370 --this_reference_mismatch;
371 }
372 }
373 }
374 }
375
376 string colorSegmentSequence_curr;
377 if (curr_left_to_insert > 0)
378 {
379 const seqan::Dna5String referenceSequence = seqan::infix(*ref_str, curr_hit->left(), lb->left + 1);
380 const seqan::Dna5String oldSegmentSequence = seqan::Dna5String(curr_hit->seq().substr(0, curr_left_to_insert));
381
382 if (color)
383 {
384 string color;
385 if (antisense)
386 color = rev_read_seq.substr(rev_read_seq.length() - (curr_seg_index * segment_length), curr_left_to_insert);
387 else
388 color = read_seq.substr(curr_seg_index * segment_length + 2, curr_left_to_insert);
389
390 color.push_back(curr_hit->seq()[curr_left_to_insert]);
391 reverse(color.begin(), color.end());
392 string bp = convert_color_to_bp(color);
393 reverse(bp.begin(), bp.end());
394 colorSegmentSequence_curr = bp;
395 }
396
397 const seqan::Dna5String newSegmentSequence = color ? colorSegmentSequence_curr : oldSegmentSequence;
398
399 /*
400 * Scan left in the read until
401 * We ran out of read sequence (insertion extends past segment)
402 */
403 for (int read_index = 0; read_index < curr_left_to_insert; ++read_index)
404 {
405 int segmentPosition = curr_left_to_insert - read_index - 1;
406 int insertionPosition = insertion_len - read_index - 1;
407
408 if (referenceSequence[segmentPosition] == 'N' || (referenceSequence[segmentPosition] != oldSegmentSequence[segmentPosition]))
409 {
410 ++this_reference_mismatch;
411 }
412
413 if (read_index < insertion_len)
414 {
415 if (insertionSequence[insertionPosition] == 'N' || (insertionSequence[insertionPosition] != newSegmentSequence[segmentPosition]))
416 {
417 ++insertion_mismatch;
418 break;
419 }
420 }
421 else
422 {
423 if (referenceSequence[segmentPosition + insertion_len] == 'N' ||
424 (referenceSequence[segmentPosition + insertion_len] != newSegmentSequence[segmentPosition]))
425 {
426 --this_reference_mismatch;
427 }
428 }
429 }
430 }
431
432 if (found_closure)
433 {
434 fprintf(stderr, "Warning: multiple closures found for insertion read # %d\n", (int)insert_id);
435 return BowtieHit();
436 }
437
438 if (insertion_mismatch == 0)
439 {
440 reference_mismatch = this_reference_mismatch;
441 mismatch = -reference_mismatch;
442 found_closure = true;
443 new_left = prev_hit->left();
444 new_cigar = prev_hit->cigar();
445
446 /*
447 * Need to make a new insert operation between the two match character that begin
448 * and end the intersection of these two junction. Note that we necessarily assume
449 * that this insertion can't span beyond the boundaries of these reads. That should
450 * probably be better enforced somewhere
451 */
452
453 new_cigar.back().length -= insert_to_prev_right;
454 if (new_cigar.back().length <= 0)
455 new_cigar.pop_back();
456
457 new_cigar.push_back(CigarOp(INS, lb->sequence.size()));
458 vector<CigarOp> new_right_cigar = curr_hit->cigar();
459 new_right_cigar.front().length += (insert_to_prev_right - lb->sequence.size());
460
461 /*
462 * Finish stitching together the new cigar string
463 */
464 size_t c = new_right_cigar.front().length > 0 ? 0 : 1;
465 for (; c < new_right_cigar.size(); ++c)
466 {
467 new_cigar.push_back(new_right_cigar[c]);
468 }
469
470 if (color)
471 {
472 if (insert_to_prev_right > 0)
473 seq.replace(first_seg_length + (curr_seg_index - 1) * segment_length - insert_to_prev_right, insert_to_prev_right, colorSegmentSequence_prev);
474
475 if (curr_left_to_insert > 0)
476 seq.replace(first_seg_length + (curr_seg_index - 1) * segment_length, curr_left_to_insert, colorSegmentSequence_curr);
477 }
478 }
479 }
480 ++lb;
481 }
482
483 if (!found_closure)
484 {
485 return BowtieHit();
486 }
487 }
488
489 /*
490 * Stitch segments together using juctions or deletions if necessary.
491 */
492 else if (prev_hit->right() < curr_hit->left())
493 {
494 std::set<Junction>::iterator lb, ub;
495
496 int left_boundary = prev_hit->right() - 4;
497 int right_boundary = curr_hit->left() + 4;
498
499 lb = possible_juncs.upper_bound(Junction(reference_id, left_boundary, right_boundary - 8, true));
500 ub = possible_juncs.lower_bound(Junction(reference_id, left_boundary + 8, right_boundary, false));
501
502 int new_diff_mismatches = 0xff;
503 while (lb != ub && lb != possible_juncs.end())
504 {
505 int dist_to_left = lb->left - prev_hit->right() + 1;
506 int dist_to_right = lb->right - curr_hit->left();
507
508 if (abs(dist_to_left) <= 4 && abs(dist_to_right) <= 4 && dist_to_left == dist_to_right)
509 {
510 /*
511 * Check we have enough matched bases on prev or curr segment.
512 */
513 if (dist_to_left > curr_left_end_match_length || -dist_to_left > prev_right_end_match_length )
514 {
515 ++lb;
516 continue;
517 }
518
519 Dna5String new_cmp_str, old_cmp_str;
520 int new_mismatch = 0, old_mismatch = 0;
521 string new_patch_str; // this is for colorspace reads
522 if (dist_to_left > 0)
523 {
524 new_cmp_str = seqan::infix(*ref_str, prev_hit->right(), lb->left + 1);
525 old_cmp_str = seqan::infix(*ref_str, curr_hit->left(), lb->right);
526
527 string new_seq;
528 if (color)
529 {
530 string ref = DnaString_to_string(seqan::infix(*ref_str, prev_hit->right() - 1, lb->left + 1));
531
532 string color, qual;
533 if (antisense)
534 {
535 color = rev_read_seq.substr(rev_read_seq.length() - (curr_seg_index * segment_length) - 1, dist_to_left);
536 qual = rev_read_quals.substr(rev_read_quals.length() - (curr_seg_index * segment_length) - 1, dist_to_left);
537 }
538 else
539 {
540 color = read_seq.substr(1 + curr_seg_index * segment_length, dist_to_left);
541 qual = read_quals.substr(curr_seg_index * segment_length, dist_to_left);
542 }
543
544 BWA_decode(color, qual, ref, new_seq);
545 new_seq = new_seq.substr(1);
546 }
547
548 const string& curr_old_seq = curr_hit->seq();
549 const string& curr_seq = color ? new_seq : curr_hit->seq();
550 for (int i = 0; i < dist_to_left; ++i)
551 {
552 if (curr_seq[i] != new_cmp_str[i])
553 ++new_mismatch;
554
555 if (curr_old_seq[i] != old_cmp_str[i])
556 ++old_mismatch;
557 }
558
559 if (color)
560 new_patch_str = curr_seq.substr(0, dist_to_left);
561 }
562 else if (dist_to_left < 0)
563 {
564 new_cmp_str = seqan::infix(*ref_str, lb->right, curr_hit->left());
565 old_cmp_str = seqan::infix(*ref_str, lb->left + 1, prev_hit->right());
566
567 size_t abs_dist = -dist_to_left;
568 string new_seq;
569 if (color)
570 {
571 string ref = DnaString_to_string(seqan::infix(*ref_str, lb->left, lb->left + 1));
572 ref += DnaString_to_string(seqan::infix(*ref_str, lb->right, curr_hit->left()));
573
574 string color, qual;
575 if (antisense)
576 {
577 color = rev_read_seq.substr(rev_read_seq.length() - (curr_seg_index * segment_length) - 1 - abs_dist, abs_dist);
578 qual = rev_read_quals.substr(rev_read_quals.length() - (curr_seg_index * segment_length) - 1 - abs_dist, abs_dist);
579 }
580 else
581 {
582 color = read_seq.substr(1 + curr_seg_index * segment_length - abs_dist, abs_dist);
583 qual = read_quals.substr(curr_seg_index * segment_length - abs_dist, abs_dist);
584 }
585
586 BWA_decode(color, qual, ref, new_seq);
587 new_seq = new_seq.substr(1);
588 }
589
590 const string& prev_old_seq = prev_hit->seq();
591 size_t prev_old_seq_len = prev_old_seq.length();
592 const string& prev_seq = color ? new_seq : prev_hit->seq();
593 size_t prev_seq_len = prev_seq.length();
594 for (size_t i = 0; i < abs_dist; ++i)
595 {
596 if (prev_seq[prev_seq_len - (abs_dist - i)] != new_cmp_str[i])
597 ++new_mismatch;
598 if (prev_old_seq[prev_old_seq_len - (abs_dist - i)] != old_cmp_str[i])
599 ++old_mismatch;
600 }
601
602 if (color)
603 new_patch_str = prev_seq.substr(prev_seq_len - abs_dist, abs_dist);
604 }
605
606 int temp_diff_mismatches = new_mismatch - old_mismatch;
607 if (temp_diff_mismatches >= new_diff_mismatches || new_mismatch >= 2)
608 {
609 ++lb;
610 continue;
611 }
612
613 if (color)
614 {
615 /*
616 * We need to recover the origianl sequence.
617 */
618 if (found_closure)
619 {
620 seq.replace(first_seg_length + (curr_seg_index - 1) * segment_length - 4, 8,
621 prev_hit->seq().substr(prev_hit->seq().length() - 4) + curr_hit->seq().substr(0, 4));
622 }
623
624 if (dist_to_left > 0)
625 seq.replace(first_seg_length + (curr_seg_index - 1) * segment_length, dist_to_left, new_patch_str);
626 else if (dist_to_left < 0)
627 seq.replace(first_seg_length + (curr_seg_index - 1) * segment_length + dist_to_left, -dist_to_left, new_patch_str);
628 }
629
630 new_diff_mismatches = temp_diff_mismatches;
631
632 new_left = prev_hit->left();
633 new_cigar = prev_hit->cigar();
634
635 int new_left_back_len = new_cigar.back().length;
636 new_left_back_len += dist_to_left;
637
638 vector<CigarOp> new_right_cig = curr_hit->cigar();
639 int new_right_front_len = new_right_cig.front().length;
640 new_right_front_len -= dist_to_right;
641 if (new_left_back_len > 0)
642 new_cigar.back().length = new_left_back_len;
643 else
644 new_cigar.pop_back();
645
646 /*
647 * FIXME, currently just differentiating between a deletion and a
648 * reference skip based on length. However, would probably be better
649 * to denote the difference explicitly, this would allow the user
650 * to supply their own (very large) deletions
651 */
652 if ((lb->right - lb->left - 1) <= max_deletion_length)
653 {
654 new_cigar.push_back(CigarOp(DEL, lb->right - lb->left - 1));
655 antisense_closure = prev_hit->is_spliced() ? prev_hit->antisense_splice() : curr_hit->antisense_splice();
656 }
657 else
658 {
659 new_cigar.push_back(CigarOp(REF_SKIP, lb->right - lb->left - 1));
660 antisense_closure = lb->antisense;
661 }
662
663 new_right_cig.front().length = new_right_front_len;
664 size_t c = new_right_front_len > 0 ? 0 : 1;
665 for (; c < new_right_cig.size(); ++c)
666 new_cigar.push_back(new_right_cig[c]);
667
668 mismatch = new_diff_mismatches;
669 found_closure = true;
670 }
671 ++lb;
672 }
673
674 if (!found_closure)
675 {
676 return BowtieHit();
677 }
678 }
679
680 if (found_closure)
681 {
682 bool end = false;
683 BowtieHit merged_hit(reference_id,
684 reference_id,
685 insert_id,
686 new_left,
687 new_cigar,
688 antisense,
689 antisense_closure,
690 prev_hit->edit_dist() + curr_hit->edit_dist() + mismatch,
691 prev_hit->splice_mms() + curr_hit->splice_mms(),
692 end);
693
694 if (curr_seg_index > 1)
695 merged_hit.seq(seq.substr(first_seg_length + (curr_seg_index - 1) * segment_length, 2 * segment_length));
696 else
697 merged_hit.seq(seq.substr(0, first_seg_length + segment_length));
698
699 prev_hit = hit_chain.erase(prev_hit, ++curr_hit);
700 /*
701 * prev_hit now points PAST the last element removed
702 */
703 prev_hit = hit_chain.insert(prev_hit, merged_hit);
704 /*
705 * merged_hit has been inserted before the old position of
706 * prev_hit. New location of prev_hit is merged_hit
707 */
708 curr_hit = prev_hit;
709 ++curr_hit;
710 ++curr_seg_index;
711 continue;
712 }
713
714 ++prev_hit;
715 ++curr_hit;
716 ++curr_seg_index;
717 }
718
719 bool saw_antisense_splice = false;
720 bool saw_sense_splice = false;
721 vector<CigarOp> long_cigar;
722 int num_mismatches = 0;
723 int num_splice_mms = 0;
724 for (list<BowtieHit>::iterator s = hit_chain.begin(); s != hit_chain.end(); ++s)
725 {
726 num_mismatches += s->edit_dist();
727 num_splice_mms += s->splice_mms();
728
729 /*
730 * Check whether the sequence contains any reference skips. Previously,
731 * this was just a check to see whether the sequence was contiguous; however
732 * we don't want to count an indel event as a splice
733 */
734 bool containsSplice = s->is_spliced();
735 if (containsSplice)
736 {
737 if (s->antisense_splice())
738 {
739 if (saw_sense_splice)
740 return BowtieHit();
741 saw_antisense_splice = true;
742 }
743 else
744 {
745 if (saw_antisense_splice)
746 return BowtieHit();
747 saw_sense_splice = true;
748 }
749 }
750 const vector<CigarOp>& cigar = s->cigar();
751 if (long_cigar.empty())
752 {
753 long_cigar = cigar;
754 }
755 else
756 {
757 CigarOp& last = long_cigar.back();
758 /*
759 * If necessary, merge the back and front
760 * cigar operations
761 */
762 if(last.opcode == cigar[0].opcode){
763 last.length += cigar[0].length;
764 for (size_t b = 1; b < cigar.size(); ++b)
765 {
766 long_cigar.push_back(cigar[b]);
767 }
768 }else{
769 for(size_t b = 0; b < cigar.size(); ++b)
770 {
771 long_cigar.push_back(cigar[b]);
772 }
773 }
774 }
775 }
776
777 bool end = false;
778 BowtieHit new_hit(reference_id,
779 reference_id,
780 insert_id,
781 left,
782 long_cigar,
783 antisense,
784 saw_antisense_splice,
785 num_mismatches,
786 num_splice_mms,
787 end);
788
789 new_hit.seq(seq);
790 new_hit.qual(qual);
791
792 int new_read_len = new_hit.read_len();
793 if (new_read_len != old_read_length || !new_hit.check_editdist_consistency(rt))
794 {
795 fprintf(stderr, "Warning: malformed closure\n");
796 return BowtieHit();
797 }
798
799 return new_hit;
800 }
801
802 BowtieHit merge_chain(RefSequenceTable& rt,
803 const string& read_seq,
804 const string& read_quals,
805 std::set<Junction>& possible_juncs,
806 std::set<Insertion>& possible_insertions,
807 std::set<Fusion>& possible_fusions,
808 list<BowtieHit>& hit_chain,
809 int fusion_dir = FUSION_NOTHING)
810 {
811 bool antisense = hit_chain.front().antisense_align();
812 uint64_t insert_id = hit_chain.front().insert_id();
813
814 const int left = hit_chain.front().left();
815
816 list<BowtieHit>::iterator prev_hit = hit_chain.begin();
817 list<BowtieHit>::iterator curr_hit = ++(hit_chain.begin());
818
819 string seq;
820 string qual;
821 int old_read_length = 0;
822 int first_seg_length = hit_chain.front().seq().length();
823 for (list<BowtieHit>::iterator i = hit_chain.begin(); i != hit_chain.end(); ++i)
824 {
825 seq += i->seq();
826 qual += i->qual();
827 old_read_length += i->read_len();
828 }
829
830 string rev_read_seq, rev_read_quals;
831 if (color && antisense)
832 {
833 rev_read_seq = read_seq;
834 reverse(rev_read_seq.begin() + 1, rev_read_seq.end());
835
836 rev_read_quals = read_quals;
837 reverse(rev_read_quals.begin(), rev_read_quals.end());
838 }
839
840 size_t num_fusions = prev_hit->fusion_opcode() == FUSION_NOTHING ? 0 : 1;
841 bool fusion_passed = false;
842 while (curr_hit != hit_chain.end() && prev_hit != hit_chain.end())
843 {
844 if (prev_hit->ref_id() != prev_hit->ref_id2() || prev_hit->ref_id2() != curr_hit->ref_id())
845 fusion_passed = true;
846
847 if (prev_hit->ref_id2() != curr_hit->ref_id())
848 ++num_fusions;
849
850 if (curr_hit->fusion_opcode() != FUSION_NOTHING)
851 ++num_fusions;
852
853 if (prev_hit->ref_id2() == curr_hit->ref_id())
854 {
855 bool reversed = false;
856 if ((fusion_dir == FUSION_FR && fusion_passed) || (fusion_dir == FUSION_RF && !fusion_passed))
857 reversed = true;
858
859 /*
860 * Note that the gap size may be negative, since left() and right() return
861 * signed integers, this will be OK.
862 */
863 int gap;
864 if (reversed)
865 gap = prev_hit->right() - curr_hit->left();
866 else
867 gap = curr_hit->left() - prev_hit->right();
868
869 // daehwan
870 if (bDebug)
871 {
872 cout << "curr left: " << curr_hit->left() << endl
873 << "prev right: " << prev_hit->right() << endl
874 << "gap: " << gap << endl;
875 }
876
877 if (gap < -(int)max_insertion_length ||
878 (gap > (int)max_deletion_length &&
879 (gap < min_report_intron_length || gap > min(max_report_intron_length, (int)fusion_min_dist))))
880 {
881 fusion_passed = true;
882 ++num_fusions;
883 }
884 }
885
886 if (num_fusions >= 2)
887 return BowtieHit();
888
889 ++prev_hit;
890 ++curr_hit;
891 }
892
893 prev_hit = hit_chain.begin();
894 curr_hit = ++(hit_chain.begin());
895
896 // daehwan
897 if (bDebug)
898 {
899 cout << "daehwan - test" << endl;
900 }
901
902 int curr_seg_index = 1;
903 fusion_passed = false;
904 while (curr_hit != hit_chain.end() && prev_hit != hit_chain.end())
905 {
906 // daehwan
907 if (bDebug)
908 {
909 cout << "daehwan - start - stitch" << endl;
910 cout << "prev right: " << prev_hit->right() << endl;
911 cout << "curr left: " << curr_hit->left() << endl;
912 cout << "prev back: " << prev_hit->cigar().back().opcode << endl;
913 cout << "curr front: " << curr_hit->cigar().front().opcode << endl;
914 cout << "prev refs: " << prev_hit->ref_id() << "-" << prev_hit->ref_id2() << endl;
915 cout << "curr refs: " << curr_hit->ref_id() << "-" << curr_hit->ref_id2() << endl;
916 }
917
918 if (prev_hit->fusion_opcode() != FUSION_NOTHING || prev_hit->ref_id2() != curr_hit->ref_id())
919 fusion_passed = true;
920
921 /*
922 * This code is assuming that the cigar strings end and start with a match
923 * While segment alignments can actually end with a junction, insertion or deletion, the hope
924 * is that in those cases, the right and left ends of the alignments will correctly
925 * line up, so we won't get to this bit of code
926 */
927 if (!(prev_hit->cigar().back().opcode == MATCH || curr_hit->cigar().front().opcode == MATCH ||
928 prev_hit->cigar().back().opcode == mATCH || curr_hit->cigar().front().opcode == mATCH))
929 {
930 return BowtieHit();
931 }
932
933 // daehwan
934 if (bDebug)
935 {
936 cout << "daehwan - pass - enough matched bases" << endl;
937 }
938
939 if (prev_hit->is_spliced() && curr_hit->is_spliced() && prev_hit->antisense_splice() != curr_hit->antisense_splice())
940 {
941 /*
942 * There is no way that we can splice these together into a valid
943 * alignment
944 */
945 return BowtieHit();
946 }
947
948 bool found_closure = false;
949 /*
950 * Note that antisense_splice is the same for prev and curr
951 */
952 bool antisense_closure = prev_hit->is_spliced() ? prev_hit->antisense_splice() : curr_hit->antisense_splice();
953 vector<CigarOp> new_cigar;
954 int new_left = -1;
955 int mismatch = 0;
956
957 /*
958 * Take the length of matched bases in each segment into consideration for closures,
959 * this can be a problem for reads of variable lengths.
960 */
961 int prev_right_end_match_length = prev_hit->cigar().back().length;
962 int curr_left_end_match_length = curr_hit->cigar().front().length;
963
964 bool check_fusion = prev_hit->ref_id2() != curr_hit->ref_id();
965
966 if (prev_hit->ref_id2() == curr_hit->ref_id())
967 {
968 // daehwan
969 if (bDebug)
970 {
971 cout << "daehwan - start - junction or insertion" << endl;
972 cout << "prev right: " << prev_hit->right() << endl;
973 cout << "curr left: " << curr_hit->left() << endl;
974 cout << "prev refs: " << prev_hit->ref_id() << "-" << prev_hit->ref_id2() << endl;
975 cout << "curr refs: " << curr_hit->ref_id() << "-" << curr_hit->ref_id2() << endl;
976 }
977
978 bool reversed = false;
979 if ((fusion_dir == FUSION_FR && fusion_passed) || (fusion_dir == FUSION_RF && !fusion_passed))
980 reversed = true;
981
982 uint32_t reference_id = prev_hit->ref_id2();
983 RefSequenceTable::Sequence* ref_str = rt.get_seq(reference_id);
984
985 int left_boundary, right_boundary;
986 if (reversed)
987 {
988 left_boundary = curr_hit->left() - 4;
989 right_boundary = prev_hit->right() + 4;
990 }
991 else
992 {
993 left_boundary = prev_hit->right() - 4;
994 right_boundary = curr_hit->left() + 4;
995 }
996
997 int dist_btw_two;
998 if (reversed)
999 dist_btw_two = prev_hit->right() - curr_hit->left();
1000 else
1001 dist_btw_two = curr_hit->left() - prev_hit->right();
1002
1003 if (dist_btw_two < 0 && dist_btw_two >= -max_insertion_length && prev_hit->antisense_align2() == curr_hit->antisense_align())
1004 {
1005 std::set<Insertion>::iterator lb, ub;
1006
1007 /*
1008 * Create a dummy sequence to represent the maximum possible insertion
1009 */
1010 std::string maxInsertedSequence = "";
1011 maxInsertedSequence.resize(max_insertion_length,'A');
1012
1013 lb = possible_insertions.upper_bound(Insertion(reference_id, left_boundary, ""));
1014 ub = possible_insertions.upper_bound(Insertion(reference_id, right_boundary, maxInsertedSequence));
1015
1016 int reference_mismatch = 0;
1017
1018 while (lb != ub && lb != possible_insertions.end())
1019 {
1020 /*
1021 * In the following code, we will check to make sure that the segments have the proper
1022 * separation and sequence for the insertions, and generate the appropriate merged bowtie hit
1023 * In general, reads with insertions must match the inserted sequence exactly.
1024 */
1025 if (((int)lb->sequence.size()) == (reversed ? curr_hit->left() - prev_hit->right() : prev_hit->right() - curr_hit->left()))
1026 {
1027 /*
1028 * Check we have enough matched bases on prev or curr segment.
1029 */
1030 int insert_to_prev_right, curr_left_to_insert;
1031 if (reversed)
1032 {
1033 insert_to_prev_right = lb->left - prev_hit->right();
1034 curr_left_to_insert = curr_hit->left() - lb->left;
1035 }
1036 else
1037 {
1038 insert_to_prev_right = prev_hit->right() - lb->left - 1;
1039 curr_left_to_insert = lb->left - curr_hit->left() + 1;
1040 }
1041
1042 if (insert_to_prev_right > prev_right_end_match_length || curr_left_to_insert > curr_left_end_match_length)
1043 {
1044 ++lb;
1045 continue;
1046 }
1047
1048 // daehwan
1049 if (bDebug)
1050 {
1051 cout << "insert_to_prev_right: " << insert_to_prev_right << endl;
1052 cout << "curr_left_to_insert: " << curr_left_to_insert << endl;
1053 cout << "curr_seg_index: " << curr_seg_index << endl;
1054 }
1055
1056 /*
1057 * Keep track of how many mismatches were made to the genome in the region
1058 * where we should actually be matching against the insertion
1059 */
1060 int this_reference_mismatch = 0;
1061 int insertion_mismatch = 0;
1062 int insertion_len = lb->sequence.length();
1063 seqan::Dna5String insertionSequence = seqan::Dna5String(lb->sequence);
1064 if (reversed)
1065 {
1066 seqan::convertInPlace(insertionSequence, seqan::FunctorComplement<seqan::Dna>());
1067 seqan::reverseInPlace(insertionSequence);
1068 }
1069
1070 /*
1071 * First check to see if we need to adjust number of observed errors for the left (prev)
1072 * hit. This is only the case if this segment runs into the insertion. To be consistent
1073 * with bwt_map.cpp, we will not allow a segment to have errors in the insertion region
1074 */
1075 string colorSegmentSequence_prev;
1076 if (insert_to_prev_right > 0)
1077 {
1078 seqan::Dna5String referenceSequence, oldSegmentSequence;
1079
1080 if (reversed)
1081 {
1082 referenceSequence = seqan::infix(*ref_str, prev_hit->right() + 1, lb->left + 1);
1083 seqan::convertInPlace(referenceSequence, seqan::FunctorComplement<seqan::Dna>());
1084 seqan::reverseInPlace(referenceSequence);
1085
1086 string temp;
1087
1088 // daehwan
1089 if (bDebug)
1090 {
1091 cout << "reversed: " << read_seq.length() << " " << read_seq << endl;
1092 }
1093
1094 temp = read_seq.substr(curr_seg_index * segment_length - insert_to_prev_right, insert_to_prev_right);
1095 oldSegmentSequence = seqan::Dna5String(temp);
1096 }
1097 else
1098 {
1099 referenceSequence = seqan::infix(*ref_str, lb->left + 1, prev_hit->right());
1100
1101 // daehwan
1102 if (bDebug)
1103 {
1104 cout << "non-reversed: " << prev_hit->seq() << endl;
1105 }
1106
1107 oldSegmentSequence = seqan::Dna5String(prev_hit->seq().substr(prev_hit->seq().length() - insert_to_prev_right));
1108 }
1109
1110 if (color)
1111 {
1112 string color;
1113
1114 if (antisense)
1115 color = rev_read_seq.substr(rev_read_seq.length() - (curr_seg_index * segment_length) - insert_to_prev_right - 2, insert_to_prev_right + 1);
1116 else
1117 color = read_seq.substr(curr_seg_index * segment_length - insert_to_prev_right, insert_to_prev_right + 1);
1118
1119 color[0] = prev_hit->seq()[segment_length - insert_to_prev_right - 1];
1120 colorSegmentSequence_prev = convert_color_to_bp(color);
1121 }
1122
1123 const seqan::Dna5String newSegmentSequence = color ? colorSegmentSequence_prev : oldSegmentSequence;
1124
1125 // daehwan
1126 if (bDebug)
1127 {
1128 cout << "ref: " << referenceSequence << endl;
1129 cout << "old: " << oldSegmentSequence << endl;
1130 cout << "ins: " << insertionSequence << endl;
1131 }
1132
1133 /*
1134 * Scan right in the read until we run out of read
1135 */
1136 for (int read_index = 0; read_index < insert_to_prev_right; ++read_index)
1137 {
1138 /*
1139 * Any mismatch to the insertion is a failure
1140 */
1141 if (referenceSequence[read_index] == 'N' || referenceSequence[read_index] != oldSegmentSequence[read_index])
1142 {
1143 ++this_reference_mismatch;
1144 }
1145
1146 if (read_index < insertion_len)
1147 {
1148 if (insertionSequence[read_index] == 'N' || insertionSequence[read_index] != newSegmentSequence[read_index])
1149 {
1150 ++insertion_mismatch;
1151 break;
1152 }
1153 }
1154 else
1155 {
1156 if (referenceSequence[read_index - insertion_len] == 'N' ||
1157 referenceSequence[read_index - insertion_len] != newSegmentSequence[read_index])
1158 {
1159 --this_reference_mismatch;
1160 }
1161 }
1162 }
1163 }
1164
1165 string colorSegmentSequence_curr;
1166 if (curr_left_to_insert > 0)
1167 {
1168 seqan::Dna5String referenceSequence, oldSegmentSequence;
1169 if (reversed)
1170 {
1171 referenceSequence = seqan::infix(*ref_str, lb->left + 1, curr_hit->left() + 1);
1172 seqan::convertInPlace(referenceSequence, seqan::FunctorComplement<seqan::Dna>());
1173 seqan::reverseInPlace(referenceSequence);
1174
1175 string temp = read_seq.substr(curr_seg_index * segment_length, curr_left_to_insert);
1176 oldSegmentSequence = seqan::Dna5String(temp);
1177 }
1178 else
1179 {
1180 referenceSequence = seqan::infix(*ref_str, curr_hit->left(), lb->left + 1);
1181 oldSegmentSequence = seqan::Dna5String(curr_hit->seq().substr(0, curr_left_to_insert));
1182 }
1183
1184 if (color)
1185 {
1186 string color;
1187 if (antisense)
1188 color = rev_read_seq.substr(rev_read_seq.length() - (curr_seg_index * segment_length), curr_left_to_insert);
1189 else
1190 color = read_seq.substr(curr_seg_index * segment_length + 2, curr_left_to_insert);
1191
1192 color.push_back(curr_hit->seq()[curr_left_to_insert]);
1193 reverse(color.begin(), color.end());
1194 string bp = convert_color_to_bp(color);
1195 reverse(bp.begin(), bp.end());
1196 colorSegmentSequence_curr = bp;
1197 }
1198
1199 const seqan::Dna5String newSegmentSequence = color ? colorSegmentSequence_curr : oldSegmentSequence;
1200
1201 // daehwan
1202 if (bDebug)
1203 {
1204 cout << "ref: " << referenceSequence << endl;
1205 cout << "old: " << oldSegmentSequence << endl;
1206 cout << "ins: " << insertionSequence << endl;
1207 }
1208
1209 /*
1210 * Scan left in the read until
1211 * We ran out of read sequence (insertion extends past segment)
1212 */
1213 for (int read_index = 0; read_index < curr_left_to_insert; ++read_index)
1214 {
1215 int segmentPosition = curr_left_to_insert - read_index - 1;
1216 int insertionPosition = insertion_len - read_index - 1;
1217
1218 if (referenceSequence[segmentPosition] == 'N' || (referenceSequence[segmentPosition] != oldSegmentSequence[segmentPosition]))
1219 {
1220 ++this_reference_mismatch;
1221 }
1222
1223 if (read_index < insertion_len)
1224 {
1225 if (insertionSequence[insertionPosition] == 'N' || (insertionSequence[insertionPosition] != newSegmentSequence[segmentPosition]))
1226 {
1227 ++insertion_mismatch;
1228 break;
1229 }
1230 }
1231 else
1232 {
1233 if (referenceSequence[segmentPosition + insertion_len] == 'N' ||
1234 (referenceSequence[segmentPosition + insertion_len] != newSegmentSequence[segmentPosition]))
1235 {
1236 --this_reference_mismatch;
1237 }
1238 }
1239 }
1240 }
1241
1242 if (found_closure)
1243 {
1244 // fprintf(stderr, "Warning: multiple closures found for insertion read # %d\n", (int)insert_id);
1245 return BowtieHit();
1246 }
1247
1248 if (insertion_mismatch == 0)
1249 {
1250 reference_mismatch = this_reference_mismatch;
1251 mismatch = -reference_mismatch;
1252 found_closure = true;
1253 new_left = prev_hit->left();
1254 new_cigar = prev_hit->cigar();
1255
1256 /*
1257 * Need to make a new insert operation between the two match character that begin
1258 * and end the intersection of these two junction. Note that we necessarily assume
1259 * that this insertion can't span beyond the boundaries of these reads. That should
1260 * probably be better enforced somewhere
1261 */
1262
1263 new_cigar.back().length -= insert_to_prev_right;
1264
1265 if (new_cigar.back().length <= 0)
1266 new_cigar.pop_back();
1267
1268 if (reversed)
1269 new_cigar.push_back(CigarOp(iNS, lb->sequence.size()));
1270 else
1271 new_cigar.push_back(CigarOp(INS, lb->sequence.size()));
1272
1273 vector<CigarOp> new_right_cigar = curr_hit->cigar();
1274 new_right_cigar.front().length += (insert_to_prev_right - lb->sequence.size());
1275
1276 /*
1277 * Finish stitching together the new cigar string
1278 */
1279 size_t c = new_right_cigar.front().length > 0 ? 0 : 1;
1280 for (; c < new_right_cigar.size(); ++c)
1281 {
1282 new_cigar.push_back(new_right_cigar[c]);
1283 }
1284
1285 if (color)
1286 {
1287 if (insert_to_prev_right > 0)
1288 seq.replace(first_seg_length + (curr_seg_index - 1) * segment_length - insert_to_prev_right, insert_to_prev_right, colorSegmentSequence_prev);
1289
1290 if (curr_left_to_insert > 0)
1291 seq.replace(first_seg_length + (curr_seg_index - 1) * segment_length, curr_left_to_insert, colorSegmentSequence_curr);
1292 }
1293 }
1294 }
1295 ++lb;
1296 }
1297
1298 if (!found_closure)
1299 {
1300 return BowtieHit();
1301 }
1302 }
1303
1304 /*
1305 * Stitch segments together using juctions or deletions if necessary.
1306 */
1307 else if (dist_btw_two > 0 && dist_btw_two <= max_report_intron_length && prev_hit->antisense_align2() == curr_hit->antisense_align())
1308 {
1309 std::set<Junction>::iterator lb, ub;
1310
1311 // daehwan
1312 if (bDebug)
1313 {
1314 cout << "junction" << endl;
1315 cout << "min: " << left_boundary << "-" << right_boundary - 8 << endl;
1316 cout << "max: " << left_boundary + 8 << "-" << right_boundary << endl;
1317 }
1318
1319 lb = possible_juncs.upper_bound(Junction(reference_id, left_boundary, right_boundary - 8, true));
1320 ub = possible_juncs.lower_bound(Junction(reference_id, left_boundary + 8, right_boundary, false));
1321
1322 int new_diff_mismatches = 0xff;
1323 while (lb != ub && lb != possible_juncs.end())
1324 {
1325 int dist_to_left, dist_to_right;
1326 if (reversed)
1327 {
1328 dist_to_left = lb->left - curr_hit->left();
1329 dist_to_right = lb->right - prev_hit->right() - 1;
1330 }
1331 else
1332 {
1333 dist_to_left = lb->left - prev_hit->right() + 1;
1334 dist_to_right = lb->right - curr_hit->left();
1335 }
1336
1337 if (abs(dist_to_left) <= 4 && abs(dist_to_right) <= 4 && dist_to_left == dist_to_right)
1338 {
1339 /*
1340 * Check we have enough matched bases on prev or curr segment.
1341 */
1342 if ((reversed && (dist_to_left > prev_right_end_match_length || -dist_to_left > curr_left_end_match_length)) ||
1343 (!reversed && (dist_to_left > curr_left_end_match_length || -dist_to_left > prev_right_end_match_length)))
1344 {
1345 ++lb;
1346 continue;
1347 }
1348
1349 // daehwan
1350 if (bDebug)
1351 {
1352 cout << "candidate junction: " << endl;
1353 cout << "coords: " << lb->left << "-" << lb->right << endl;
1354 cout << "dist to left: " << dist_to_left << endl;
1355 }
1356
1357 Dna5String new_cmp_str, old_cmp_str;
1358 int new_mismatch = 0, old_mismatch = 0;
1359 string new_patch_str; // this is for colorspace reads
1360 if (dist_to_left > 0)
1361 {
1362 if (reversed)
1363 {
1364 new_cmp_str = seqan::infix(*ref_str, curr_hit->left() + 1, lb->left + 1);
1365 seqan::convertInPlace(new_cmp_str, seqan::FunctorComplement<seqan::Dna>());
1366 seqan::reverseInPlace(new_cmp_str);
1367
1368 old_cmp_str = seqan::infix(*ref_str, prev_hit->right() + 1, lb->right);
1369 seqan::convertInPlace(old_cmp_str, seqan::FunctorComplement<seqan::Dna>());
1370 seqan::reverseInPlace(old_cmp_str);
1371 }
1372 else
1373 {
1374 new_cmp_str = seqan::infix(*ref_str, prev_hit->right(), lb->left + 1);
1375 old_cmp_str = seqan::infix(*ref_str, curr_hit->left(), lb->right);
1376 }
1377
1378 string new_seq;
1379 if (color)
1380 {
1381 string ref = DnaString_to_string(seqan::infix(*ref_str, prev_hit->right() - 1, lb->left + 1));
1382
1383 string color, qual;
1384 if (antisense)
1385 {
1386 color = rev_read_seq.substr(rev_read_seq.length() - (curr_seg_index * segment_length) - 1, dist_to_left);
1387 qual = rev_read_quals.substr(rev_read_quals.length() - (curr_seg_index * segment_length) - 1, dist_to_left);
1388 }
1389 else
1390 {
1391 color = read_seq.substr(1 + curr_seg_index * segment_length, dist_to_left);
1392 qual = read_quals.substr(curr_seg_index * segment_length, dist_to_left);
1393 }
1394
1395 BWA_decode(color, qual, ref, new_seq);
1396 new_seq = new_seq.substr(1);
1397 }
1398
1399 string curr_hit_seq;
1400 if (reversed)
1401 curr_hit_seq = read_seq.substr(curr_seg_index * segment_length - dist_to_left, dist_to_left);
1402 else
1403 curr_hit_seq = curr_hit->seq();
1404
1405 const string& curr_old_seq = curr_hit_seq;
1406 const string& curr_seq = color ? new_seq : curr_hit_seq;
1407 for (int i = 0; i < dist_to_left; ++i)
1408 {
1409 if (curr_seq[i] != new_cmp_str[i])
1410 ++new_mismatch;
1411
1412 if (curr_old_seq[i] != old_cmp_str[i])
1413 ++old_mismatch;
1414 }
1415
1416 if (color)
1417 new_patch_str = curr_seq.substr(0, dist_to_left);
1418 }
1419 else if (dist_to_left < 0)
1420 {
1421 if (reversed)
1422 {
1423 new_cmp_str = seqan::infix(*ref_str, lb->right, prev_hit->right() + 1);
1424 seqan::convertInPlace(new_cmp_str, seqan::FunctorComplement<seqan::Dna>());
1425 seqan::reverseInPlace(new_cmp_str);
1426
1427 old_cmp_str = seqan::infix(*ref_str, lb->left + 1, curr_hit->left() + 1);
1428 seqan::convertInPlace(old_cmp_str, seqan::FunctorComplement<seqan::Dna>());
1429 seqan::reverseInPlace(old_cmp_str);
1430 }
1431 else
1432 {
1433 new_cmp_str = seqan::infix(*ref_str, lb->right, curr_hit->left());
1434 old_cmp_str = seqan::infix(*ref_str, lb->left + 1, prev_hit->right());
1435 }
1436
1437 size_t abs_dist = -dist_to_left;
1438 string new_seq;
1439 if (color)
1440 {
1441 string ref = DnaString_to_string(seqan::infix(*ref_str, lb->left, lb->left + 1));
1442 ref += DnaString_to_string(seqan::infix(*ref_str, lb->right, curr_hit->left()));
1443
1444 string color, qual;
1445 if (antisense)
1446 {
1447 color = rev_read_seq.substr(rev_read_seq.length() - (curr_seg_index * segment_length) - 1 - abs_dist, abs_dist);
1448 qual = rev_read_quals.substr(rev_read_quals.length() - (curr_seg_index * segment_length) - 1 - abs_dist, abs_dist);
1449 }
1450 else
1451 {
1452 color = read_seq.substr(1 + curr_seg_index * segment_length - abs_dist, abs_dist);
1453 qual = read_quals.substr(curr_seg_index * segment_length - abs_dist, abs_dist);
1454 }
1455
1456 BWA_decode(color, qual, ref, new_seq);
1457 new_seq = new_seq.substr(1);
1458 }
1459
1460 string prev_hit_seq;
1461 if (reversed)
1462 prev_hit_seq = read_seq.substr(curr_seg_index * segment_length, abs_dist);
1463 else
1464 prev_hit_seq = prev_hit->seq();
1465
1466 // daehwan
1467 if (bDebug)
1468 {
1469 cout << "reverse: " << (int)reversed << endl;
1470 cout << "new cmp str: " << new_cmp_str << endl;
1471 cout << "old cmp str: " << old_cmp_str << endl;
1472 cout << "hit seq: " << prev_hit_seq << endl;
1473 cout << "curr seq: " << curr_hit->seq() << endl;
1474
1475 cout << read_seq
1476 << endl;
1477 cout << read_seq.substr(first_seg_length + (curr_seg_index - 1) * segment_length, segment_length)
1478 << endl;
1479 }
1480
1481 const string& prev_old_seq = prev_hit_seq;
1482 size_t prev_old_seq_len = prev_old_seq.length();
1483 const string& prev_seq = color ? new_seq : prev_hit_seq;
1484 size_t prev_seq_len = prev_seq.length();
1485 for (size_t i = 0; i < abs_dist; ++i)
1486 {
1487 if (prev_seq[prev_seq_len - (abs_dist - i)] != new_cmp_str[i])
1488 ++new_mismatch;
1489 if (prev_old_seq[prev_old_seq_len - (abs_dist - i)] != old_cmp_str[i])
1490 ++old_mismatch;
1491 }
1492
1493 if (color)
1494 new_patch_str = prev_seq.substr(prev_seq_len - abs_dist, abs_dist);
1495 }
1496
1497 int temp_diff_mismatches = new_mismatch - old_mismatch;
1498
1499 // daehwan
1500 if (bDebug)
1501 {
1502 cout << "new mismatch: " << new_mismatch << endl;
1503 cout << "old mismatch: " << old_mismatch << endl;
1504 cout << "new_diff_mismatch: " << new_diff_mismatches << endl;
1505 cout << "temp mismatch: " << temp_diff_mismatches << endl;
1506 }
1507
1508 if (temp_diff_mismatches >= new_diff_mismatches || new_mismatch >= 2)
1509 {
1510 ++lb;
1511 continue;
1512 }
1513
1514 if (color)
1515 {
1516 /*
1517 * We need to recover the origianl sequence.
1518 */
1519 if (found_closure)
1520 {
1521 seq.replace(first_seg_length + (curr_seg_index - 1) * segment_length - 4, 8,
1522 prev_hit->seq().substr(prev_hit->seq().length() - 4) + curr_hit->seq().substr(0, 4));
1523 }
1524
1525 if (dist_to_left > 0)
1526 seq.replace(first_seg_length + (curr_seg_index - 1) * segment_length, dist_to_left, new_patch_str);
1527 else if (dist_to_left < 0)
1528 seq.replace(first_seg_length + (curr_seg_index - 1) * segment_length + dist_to_left, -dist_to_left, new_patch_str);
1529 }
1530
1531 new_diff_mismatches = temp_diff_mismatches;
1532
1533 new_left = prev_hit->left();
1534 new_cigar = prev_hit->cigar();
1535
1536 int new_left_back_len = new_cigar.back().length;
1537 if (reversed)
1538 new_left_back_len -= dist_to_left;
1539 else
1540 new_left_back_len += dist_to_left;
1541
1542 vector<CigarOp> new_right_cig = curr_hit->cigar();
1543 int new_right_front_len = new_right_cig.front().length;
1544 if (reversed)
1545 new_right_front_len += dist_to_right;
1546 else
1547 new_right_front_len -= dist_to_right;
1548 if (new_left_back_len > 0)
1549 new_cigar.back().length = new_left_back_len;
1550 else
1551 new_cigar.pop_back();
1552
1553 /*
1554 * FIXME, currently just differentiating between a deletion and a
1555 * reference skip based on length. However, would probably be better
1556 * to denote the difference explicitly, this would allow the user
1557 * to supply their own (very large) deletions
1558 */
1559 if ((lb->right - lb->left - 1) <= max_deletion_length)
1560 {
1561 if (reversed)
1562 new_cigar.push_back(CigarOp(dEL, lb->right - lb->left - 1));
1563 else
1564 new_cigar.push_back(CigarOp(DEL, lb->right - lb->left - 1));
1565 antisense_closure = prev_hit->is_spliced() ? prev_hit->antisense_splice() : curr_hit->antisense_splice();
1566 }
1567 else
1568 {
1569 if (reversed)
1570 new_cigar.push_back(CigarOp(rEF_SKIP, lb->right - lb->left - 1));
1571 else
1572 new_cigar.push_back(CigarOp(REF_SKIP, lb->right - lb->left - 1));
1573 antisense_closure = lb->antisense;
1574 }
1575
1576 new_right_cig.front().length = new_right_front_len;
1577 size_t c = new_right_front_len > 0 ? 0 : 1;
1578 for (; c < new_right_cig.size(); ++c)
1579 new_cigar.push_back(new_right_cig[c]);
1580
1581 mismatch = new_diff_mismatches;
1582 found_closure = true;
1583 }
1584 ++lb;
1585 }
1586
1587 if (!found_closure)
1588 {
1589 return BowtieHit();
1590 }
1591 }
1592 else if (!(dist_btw_two == 0 && prev_hit->antisense_align2() == curr_hit->antisense_align()))
1593 check_fusion = true;
1594 }
1595
1596 if (check_fusion)
1597 {
1598 std::set<Fusion>::iterator lb, ub;
1599
1600 uint32_t ref_id1 = prev_hit->ref_id2();
1601 uint32_t ref_id2 = curr_hit->ref_id();
1602 uint32_t left = prev_hit->right() - 4;
1603 uint32_t right = curr_hit->left() - 4;
1604
1605 // daehwan
1606 if (bDebug)
1607 {
1608 cout << "daehwan - start - fusion" << endl
1609 << "ref_id1: " << ref_id1 << endl
1610 << "ref_id2: " << ref_id2 << endl
1611 << "left: " << left << endl
1612 << "right: " << right << endl
1613 << "dir: " << fusion_dir << endl;
1614 }
1615
1616 bool reversed = false;
1617 if (fusion_dir != FUSION_FF &&
1618 (ref_id2 < ref_id1 || (ref_id1 == ref_id2 && left > right)))
1619 {
1620 reversed = true;
1621
1622 uint32_t temp = ref_id1;
1623 ref_id1 = ref_id2;
1624 ref_id2 = temp;
1625
1626 temp = left;
1627 left = right;
1628 right = temp;
1629 }
1630
1631 lb = possible_fusions.upper_bound(Fusion(ref_id1, ref_id2, left, right));
1632 ub = possible_fusions.lower_bound(Fusion(ref_id1, ref_id2, left + 8, right + 8));
1633
1634 RefSequenceTable::Sequence* ref_str = rt.get_seq(prev_hit->ref_id2());
1635 RefSequenceTable::Sequence* ref_str2 = rt.get_seq(curr_hit->ref_id());
1636
1637 int new_diff_mismatches = 0xff;
1638 while (lb != ub && lb != possible_fusions.end())
1639 {
1640 int lb_left = lb->left;
1641 int lb_right = lb->right;
1642
1643 if (reversed)
1644 {
1645 lb_left = lb->right;
1646 lb_right = lb->left;
1647 }
1648
1649 int dist_to_left, dist_to_right;
1650 if (fusion_dir == FUSION_RF)
1651 dist_to_left = prev_hit->right() - lb_left + 1;
1652 else
1653 dist_to_left = lb_left - prev_hit->right() + 1;
1654
1655 if (fusion_dir == FUSION_FR)
1656 dist_to_right = curr_hit->left() - lb_right;
1657 else
1658 dist_to_right = lb_right - curr_hit->left();
1659
1660 // daehwan
1661 if (bDebug)
1662 {
1663 cout << "daehwan - fusion gap" << endl;
1664 cout << "dist left: " << dist_to_left << endl;
1665 cout << "dist right: " << dist_to_right << endl;
1666 }
1667
1668 if (abs(dist_to_left) <= 4 && abs(dist_to_right) <= 4 && dist_to_left == dist_to_right)
1669 {
1670 /*
1671 * Check we have enough matched bases on prev or curr segment.
1672 */
1673 if (dist_to_left > curr_left_end_match_length || -dist_to_left > prev_right_end_match_length)
1674 {
1675 ++lb;
1676 continue;
1677 }
1678
1679 Dna5String new_cmp_str, old_cmp_str;
1680 int new_mismatch = 0, old_mismatch = 0;
1681 string new_patch_str; // this is for colorspace reads
1682 if (dist_to_left > 0)
1683 {
1684 if (fusion_dir == FUSION_RF)
1685 {
1686 new_cmp_str = seqan::infix(*ref_str, lb_left, prev_hit->right() + 1);
1687 seqan::convertInPlace(new_cmp_str, seqan::FunctorComplement<seqan::Dna>());
1688 seqan::reverseInPlace(new_cmp_str);
1689 }
1690 else
1691 new_cmp_str = seqan::infix(*ref_str, prev_hit->right(), lb_left + 1);
1692
1693 if (fusion_dir == FUSION_FR)
1694 {
1695 old_cmp_str = seqan::infix(*ref_str2, lb_right + 1, curr_hit->left() + 1);
1696 seqan::convertInPlace(old_cmp_str, seqan::FunctorComplement<seqan::Dna>());
1697 seqan::reverseInPlace(old_cmp_str);
1698 }
1699 else
1700 old_cmp_str = seqan::infix(*ref_str2, curr_hit->left(), lb_right);
1701
1702 // daehwan
1703 if (bDebug)
1704 {
1705 cout << "new str: " << new_cmp_str << endl;
1706 cout << "old str: " << old_cmp_str << endl;
1707 cout << "curr seq: " << curr_hit->seq() << endl;
1708 }
1709
1710 string curr_hit_seq;
1711 if (fusion_dir == FUSION_FF || fusion_dir == FUSION_RR)
1712 curr_hit_seq = curr_hit->seq();
1713 else
1714 curr_hit_seq = read_seq.substr(curr_seg_index * segment_length, segment_length);
1715
1716 string new_seq;
1717 const string& curr_old_seq = curr_hit_seq;
1718 const string& curr_seq = color ? new_seq : curr_hit_seq;
1719 for (int i = 0; i < dist_to_left; ++i)
1720 {
1721 if (curr_seq[i] != new_cmp_str[i])
1722 ++new_mismatch;
1723
1724 if (curr_old_seq[i] != old_cmp_str[i])
1725 ++old_mismatch;
1726 }
1727 }
1728 else if (dist_to_left < 0)
1729 {
1730 if (fusion_dir == FUSION_FR)
1731 {
1732 new_cmp_str = seqan::infix(*ref_str2, curr_hit->left() + 1, lb_right + 1);
1733 seqan::convertInPlace(new_cmp_str, seqan::FunctorComplement<seqan::Dna>());
1734 seqan::reverseInPlace(new_cmp_str);
1735 }
1736 else
1737 new_cmp_str = seqan::infix(*ref_str2, lb_right, curr_hit->left());
1738
1739 if (fusion_dir == FUSION_RF)
1740 {
1741 old_cmp_str = seqan::infix(*ref_str, prev_hit->right() + 1, lb_left);
1742 seqan::convertInPlace(old_cmp_str, seqan::FunctorComplement<seqan::Dna>());
1743 seqan::reverseInPlace(old_cmp_str);
1744 }
1745 else
1746 old_cmp_str = seqan::infix(*ref_str, lb_left + 1, prev_hit->right());
1747
1748 string prev_hit_seq;
1749 if (fusion_dir == FUSION_FF || fusion_dir == FUSION_RR)
1750 prev_hit_seq = prev_hit->seq();
1751 else
1752 prev_hit_seq = read_seq.substr((curr_seg_index - 1) * segment_length, segment_length);
1753
1754 size_t abs_dist = -dist_to_left;
1755 string new_seq;
1756
1757 const string& prev_old_seq = prev_hit_seq;
1758 size_t prev_old_seq_len = prev_old_seq.length();
1759 const string& prev_seq = color ? new_seq : prev_hit_seq;
1760 size_t prev_seq_len = prev_seq.length();
1761 for (size_t i = 0; i < abs_dist; ++i)
1762 {
1763 if (prev_seq[prev_seq_len - (abs_dist - i)] != new_cmp_str[i])
1764 ++new_mismatch;
1765 if (prev_old_seq[prev_old_seq_len - (abs_dist - i)] != old_cmp_str[i])
1766 ++old_mismatch;
1767 }
1768 }
1769
1770 int temp_diff_mismatches = new_mismatch - old_mismatch;
1771 if (temp_diff_mismatches >= new_diff_mismatches || new_mismatch >= 2)
1772 {
1773 ++lb;
1774 continue;
1775 }
1776 new_diff_mismatches = temp_diff_mismatches;
1777
1778 new_left = prev_hit->left();
1779 new_cigar = prev_hit->cigar();
1780
1781 int new_left_back_len = new_cigar.back().length;
1782 new_left_back_len += dist_to_left;
1783
1784 vector<CigarOp> new_right_cig = curr_hit->cigar();
1785 int new_right_front_len = new_right_cig.front().length;
1786 new_right_front_len -= dist_to_right;
1787 if (new_left_back_len > 0)
1788 new_cigar.back().length = new_left_back_len;
1789 else
1790 new_cigar.pop_back();
1791
1792 new_cigar.push_back(CigarOp((CigarOpCode)fusion_dir, lb_right));
1793 antisense_closure = prev_hit->is_spliced() ? prev_hit->antisense_splice() : curr_hit->antisense_splice();
1794
1795 new_right_cig.front().length = new_right_front_len;
1796 size_t c = new_right_front_len > 0 ? 0 : 1;
1797 for (; c < new_right_cig.size(); ++c)
1798 new_cigar.push_back(new_right_cig[c]);
1799
1800 mismatch = new_diff_mismatches;
1801 found_closure = true;
1802 ++num_fusions;
1803
1804 // daehwan
1805 if (bDebug)
1806 {
1807 cout << "daehwan - fusion gap - found" << endl;
1808 }
1809
1810 }
1811 ++lb;
1812 }
1813
1814 // daehwan
1815 if (bDebug)
1816 {
1817 cout << "daehwan2 - end - fusion: " << (found_closure ? "found" : "not found") << endl;
1818 }
1819
1820 if (!found_closure)
1821 {
1822 return BowtieHit();
1823 }
1824 }
1825
1826 if (found_closure)
1827 {
1828 bool end = false;
1829 BowtieHit merged_hit(prev_hit->ref_id(),
1830 curr_hit->ref_id2(),
1831 insert_id,
1832 new_left,
1833 new_cigar,
1834 antisense,
1835 antisense_closure,
1836 prev_hit->edit_dist() + curr_hit->edit_dist() + mismatch,
1837 prev_hit->splice_mms() + curr_hit->splice_mms(),
1838 end);
1839
1840 // daehwan - should fix this for SOLiD dataset
1841 merged_hit.seq(prev_hit->seq() + curr_hit->seq());
1842
1843 // daehwan
1844 if (bDebug)
1845 {
1846 cout << "fusing of " << merged_hit.left() << " and " << merged_hit.right() << endl;
1847 cout << print_cigar(merged_hit.cigar()) << endl;
1848 if (!merged_hit.check_editdist_consistency(rt))
1849 {
1850 cout << "btw " << print_cigar(prev_hit->cigar()) << " and " << print_cigar(curr_hit->cigar()) << endl;
1851 cout << "this is malformed hit" << endl;
1852 }
1853 }
1854
1855 prev_hit = hit_chain.erase(prev_hit, ++curr_hit);
1856 /*
1857 * prev_hit now points PAST the last element removed
1858 */
1859 prev_hit = hit_chain.insert(prev_hit, merged_hit);
1860 /*
1861 * merged_hit has been inserted before the old position of
1862 * prev_hit. New location of prev_hit is merged_hit
1863 */
1864 curr_hit = prev_hit;
1865 ++curr_hit;
1866 ++curr_seg_index;
1867 continue;
1868 }
1869
1870 // daehwan
1871 if (bDebug)
1872 {
1873 cout << "daehwan - test 0.3" << endl;
1874 }
1875
1876 ++prev_hit;
1877 ++curr_hit;
1878 ++curr_seg_index;
1879 }
1880
1881 // daehwan
1882 if (bDebug)
1883 {
1884 cout << "daehwan - test2" << endl;
1885 }
1886
1887 bool saw_antisense_splice = false;
1888 bool saw_sense_splice = false;
1889 vector<CigarOp> long_cigar;
1890 int num_mismatches = 0;
1891 int num_splice_mms = 0;
1892 for (list<BowtieHit>::iterator s = hit_chain.begin(); s != hit_chain.end(); ++s)
1893 {
1894 num_mismatches += s->edit_dist();
1895 num_splice_mms += s->splice_mms();
1896
1897 /*
1898 * Check whether the sequence contains any reference skips. Previously,
1899 * this was just a check to see whether the sequence was contiguous; however
1900 * we don't want to count an indel event as a splice
1901 */
1902 bool containsSplice = s->is_spliced();
1903 if (containsSplice)
1904 {
1905 if (s->antisense_splice())
1906 {
1907 if (saw_sense_splice)
1908 return BowtieHit();
1909 saw_antisense_splice = true;
1910 }
1911 else
1912 {
1913 if (saw_antisense_splice)
1914 return BowtieHit();
1915 saw_sense_splice = true;
1916 }
1917 }
1918 const vector<CigarOp>& cigar = s->cigar();
1919 if (long_cigar.empty())
1920 {
1921 long_cigar = cigar;
1922 }
1923 else
1924 {
1925 CigarOp& last = long_cigar.back();
1926 /*
1927 * If necessary, merge the back and front
1928 * cigar operations
1929 */
1930 if(last.opcode == cigar[0].opcode){
1931 last.length += cigar[0].length;
1932 for (size_t b = 1; b < cigar.size(); ++b)
1933 {
1934 long_cigar.push_back(cigar[b]);
1935 }
1936 }else{
1937 for(size_t b = 0; b < cigar.size(); ++b)
1938 {
1939 long_cigar.push_back(cigar[b]);
1940 }
1941 }
1942 }
1943 }
1944
1945 bool end = false;
1946 BowtieHit new_hit(hit_chain.front().ref_id(),
1947 hit_chain.back().ref_id2(),
1948 insert_id,
1949 left,
1950 long_cigar,
1951 antisense,
1952 saw_antisense_splice,
1953 num_mismatches,
1954 num_splice_mms,
1955 end);
1956
1957 if (fusion_dir == FUSION_NOTHING || fusion_dir == FUSION_FF || fusion_dir == FUSION_RR)
1958 {
1959 new_hit.seq(seq);
1960 if (bowtie2)
1961 {
1962 // for the time being, let's compare "seq" and "read_seq"
1963 if (seq != read_seq)
1964 {
1965 string temp_qual = read_quals;
1966 reverse(temp_qual.begin(), temp_qual.end());
1967 new_hit.qual(temp_qual);
1968 }
1969 else
1970 new_hit.qual(read_quals);
1971 }
1972 else
1973 new_hit.qual(qual);
1974 }
1975 else
1976 {
1977 new_hit.seq(read_seq);
1978 new_hit.qual(read_quals);
1979 }
1980
1981 bool do_reverse = new_hit.ref_id() > new_hit.ref_id2();
1982 if (new_hit.ref_id() == new_hit.ref_id2())
1983 {
1984 vector<Fusion> fusions;
1985 bool auto_sort = false;
1986 fusions_from_spliced_hit(new_hit, fusions, auto_sort);
1987 if (fusions.size() > 0)
1988 {
1989 const Fusion& fusion = fusions[0];
1990 do_reverse = fusion.left > fusion.right;
1991 }
1992 }
1993
1994 if (do_reverse)
1995 {
1996 new_hit = new_hit.reverse();
1997 }
1998
1999 if (fusion_dir == FUSION_RF || fusion_dir == FUSION_RR)
2000 {
2001 new_hit.antisense_align(!new_hit.antisense_align());
2002 }
2003
2004 // daehwan
2005 if (bDebug)
2006 {
2007 cout << "daehwan - test3" << endl;
2008 cout << new_hit.left() << " " << print_cigar(new_hit.cigar()) << endl;
2009 cout << new_hit.ref_id() << "-" << new_hit.ref_id2() << ": " << new_hit.fusion_opcode() << endl;
2010 }
2011
2012 int new_read_len = new_hit.read_len();
2013 if (new_read_len != old_read_length || !new_hit.check_editdist_consistency(rt, bDebug))
2014 {
2015 // daehwan
2016 if (bDebug)
2017 {
2018 cout << "Warning: " << new_hit.insert_id() << " malformed closure: " << print_cigar(new_hit.cigar()) << endl;
2019 exit(1);
2020 }
2021
2022 fprintf(stderr, "Warning: %d malformed closure\n", new_hit.insert_id());
2023 return BowtieHit();
2024 }
2025
2026 return new_hit;
2027 }
2028
2029
2030 int multi_closure = 0;
2031 int anchor_too_short = 0;
2032 int gap_too_short = 0;
2033
2034 bool valid_hit(const BowtieHit& bh)
2035 {
2036 if (bh.insert_id())
2037 {
2038 /*
2039 * validate the cigar chain - no gaps shorter than an intron, etc.
2040 * also,
2041 * -Don't start or end with an indel or refskip
2042 * -Only a match operation is allowed is allowed
2043 * adjacent to an indel or refskip
2044 * -Indels should confrom to length restrictions
2045 */
2046 const CigarOp* prevCig = &(bh.cigar()[0]);
2047 const CigarOp* currCig = &(bh.cigar()[1]);
2048 for (size_t i = 1; i < bh.cigar().size(); ++i){
2049 currCig = &(bh.cigar()[i]);
2050 if(!(currCig->opcode == MATCH || currCig->opcode == mATCH) &&
2051 !(prevCig->opcode == MATCH || prevCig->opcode == mATCH)){
2052 return false;
2053 }
2054 if(currCig->opcode == INS || currCig->opcode == iNS){
2055 if(currCig->length > max_insertion_length){
2056 return false;
2057 }
2058 }
2059 if(currCig->opcode == DEL || currCig->opcode == dEL){
2060 if(currCig->length > max_deletion_length){
2061 return false;
2062 }
2063 }
2064 if(currCig->opcode == REF_SKIP || currCig->opcode == rEF_SKIP){
2065 if(currCig->length < (uint64_t)min_report_intron_length){
2066 gap_too_short++;
2067 return false;
2068 }
2069 }
2070 prevCig = currCig;
2071 }
2072 if (!(bh.cigar().front().opcode == MATCH || bh.cigar().front().opcode == mATCH) ||
2073 !(bh.cigar().back().opcode == MATCH || bh.cigar().back().opcode == mATCH)/* ||
2074 (int)bh.cigar().front().length < min_anchor_len||
2075 (int)bh.cigar().back().length < min_anchor_len*/ )
2076 {
2077 anchor_too_short++;
2078 return false;
2079 }
2080 }
2081 else
2082 {
2083 multi_closure++;
2084 return false;
2085 }
2086
2087 return true;
2088 }
2089
2090 void merge_segment_chain(RefSequenceTable& rt,
2091 const string& read_seq,
2092 const string& read_quals,
2093 std::set<Junction>& possible_juncs,
2094 std::set<Insertion>& possible_insertions,
2095 std::set<Fusion>& possible_fusions,
2096 vector<BowtieHit>& hits,
2097 vector<BowtieHit>& merged_hits,
2098 int fusion_dir = FUSION_NOTHING)
2099 {
2100 if (hits.size() == 0)
2101 return;
2102
2103 BowtieHit bh;
2104 if (hits.size() > 1)
2105 {
2106 list<BowtieHit> hit_chain;
2107 if (fusion_dir == FUSION_NOTHING || fusion_dir == FUSION_FF || fusion_dir == FUSION_RR)
2108 {
2109 if (hits.front().antisense_align())
2110 copy(hits.rbegin(), hits.rend(), back_inserter(hit_chain));
2111 else
2112 copy(hits.begin(), hits.end(), back_inserter(hit_chain));
2113 }
2114 else
2115 {
2116 bool bSawFusion = false;
2117 for (size_t i = 0; i < hits.size(); ++i)
2118 {
2119 bool pushed = false;
2120 if (!bSawFusion)
2121 {
2122 if (i > 0)
2123 {
2124 if (hits[i-1].ref_id() != hits[i].ref_id())
2125 bSawFusion = true;
2126 else if(hits[i-1].antisense_align() != hits[i].antisense_align())
2127 bSawFusion = true;
2128 else
2129 {
2130 int dist = 0;
2131 if (hits[i].antisense_align())
2132 dist = hits[i-1].left() - hits[i].right();
2133 else
2134 dist = hits[i].left() - hits[i-1].right();
2135
2136 if (dist >= max_report_intron_length || dist < -(int)max_insertion_length)
2137 bSawFusion = true;
2138 }
2139 }
2140 }
2141
2142 if (hits[i].fusion_opcode() == FUSION_NOTHING &&
2143 ((fusion_dir == FUSION_FR && bSawFusion) || (fusion_dir == FUSION_RF && !bSawFusion)) &&
2144 hits[i].left() < hits[i].right())
2145 {
2146 hit_chain.push_back(hits[i].reverse());
2147 pushed = true;
2148 }
2149
2150 if (i > 0 &&
2151 hits[i].fusion_opcode() != FUSION_NOTHING &&
2152 hits[i].ref_id() != hits[i-1].ref_id())
2153 {
2154 hit_chain.push_back(hits[i].reverse());
2155 pushed = true;
2156 }
2157
2158 if (!bSawFusion)
2159 {
2160 if (hits[i].fusion_opcode() != FUSION_NOTHING)
2161 bSawFusion = true;
2162 }
2163
2164 if (!pushed)
2165 hit_chain.push_back(hits[i]);
2166 }
2167 }
2168
2169 // todo: merge_chain_color needs to be merged into merge_chain fuction.
2170 if (color)
2171 bh = merge_chain_color(rt,
2172 read_seq,
2173 read_quals,
2174 possible_juncs,
2175 possible_insertions,
2176 hit_chain);
2177 else
2178 bh = merge_chain(rt,
2179 read_seq,
2180 read_quals,
2181 possible_juncs,
2182 possible_insertions,
2183 possible_fusions,
2184 hit_chain,
2185 fusion_dir);
2186 }
2187 else
2188 {
2189 bh = hits[0];
2190 }
2191
2192 if (valid_hit(bh))
2193 merged_hits.push_back(bh);
2194 }
2195
2196 bool dfs_seg_hits(RefSequenceTable& rt,
2197 const string& read_seq,
2198 const string& read_quals,
2199 std::set<Junction>& possible_juncs,
2200 std::set<Insertion>& possible_insertions,
2201 std::set<Fusion>& possible_fusions,
2202 vector<HitsForRead>& seg_hits_for_read,
2203 size_t curr,
2204 vector<BowtieHit>& seg_hit_stack,
2205 vector<BowtieHit>& joined_hits,
2206 int fusion_dir = FUSION_NOTHING)
2207 {
2208 assert (!seg_hit_stack.empty());
2209 bool join_success = false;
2210
2211 if (curr < seg_hits_for_read.size())
2212 {
2213 for (size_t i = 0; i < seg_hits_for_read[curr].hits.size(); ++i)
2214 {
2215 /*
2216 * As we reverse segments depending on directions like FR or RF,
2217 * it's necessary to recover the original segments.
2218 */
2219 BowtieHit bh = seg_hits_for_read[curr].hits[i];
2220 BowtieHit bh_prev = seg_hit_stack.back();
2221
2222 BowtieHit* prevHit = &bh_prev;
2223 BowtieHit* currHit = &bh;
2224
2225 // daehwan
2226 // if (bh.insert_id() == 11921733)
2227 // bDebug = true;
2228
2229 /*
2230 * Each segment has at most one fusion by assumption,
2231 */
2232 bool prevHit_fused = prevHit->fusion_opcode() != FUSION_NOTHING;
2233 bool currHit_fused = currHit->fusion_opcode() != FUSION_NOTHING;
2234
2235 /*
2236 * Count the number of fusions on prev and curr segments,
2237 * but this doesn't take into account the gap (if exists) between the two segments.
2238 */
2239 size_t num_fusions = prevHit_fused ? 1 : 0;
2240 num_fusions += currHit_fused ? 1 : 0;
2241
2242 int dir = prevHit_fused ? prevHit->fusion_opcode() : currHit->fusion_opcode();
2243
2244 /*
2245 * We don't allow reads that span more than two fusion points.
2246 */
2247 if (num_fusions >= 2)
2248 continue;
2249
2250 if (fusion_dir != FUSION_NOTHING && currHit_fused)
2251 continue;
2252
2253 if (fusion_dir == FUSION_FF || fusion_dir == FUSION_RR)
2254 {
2255 if ((currHit->antisense_align() && currHit->ref_id() != prevHit->ref_id()) ||
2256 (!currHit->antisense_align() && currHit->ref_id() != prevHit->ref_id2()))
2257 continue;
2258 }
2259
2260 if (bDebug)
2261 {
2262 cout << "daehwan - prev ref: " << prevHit->ref_id() << "-" << prevHit->ref_id2() << ": "
2263 << print_cigar(prevHit->cigar()) << endl;
2264 cout << "daehwan - prev sense: "
2265 << (prevHit->antisense_align() ? "-" : "+")
2266 << "\t"
2267 << (prevHit->antisense_align2() ? "-" : "+")
2268 << endl;
2269 cout << "daehwan - prev coords: "
2270 << prevHit->left()
2271 << "\t"
2272 << prevHit->right()
2273 << endl;
2274
2275 cout << "daehwan - curr ref: " << currHit->ref_id() << "-" << currHit->ref_id2() << ": "
2276 << print_cigar(currHit->cigar()) << endl;
2277 cout << "daehwan - curr sense: "
2278 << (currHit->antisense_align() ? "-" : "+")
2279 << "\t"
2280 << (currHit->antisense_align2() ? "-" : "+")
2281 << endl;
2282 cout << "daehwan - curr coords: "
2283 << currHit->left()
2284 << "\t"
2285 << currHit->right()
2286 << endl;
2287 }
2288
2289 if ((fusion_dir == FUSION_FR || fusion_dir == FUSION_RF) &&
2290 prevHit->ref_id2() != currHit->ref_id())
2291 continue;
2292
2293 if ((fusion_dir == FUSION_FR && !currHit->antisense_align()) ||
2294 (fusion_dir == FUSION_RF && currHit->antisense_align()))
2295 continue;
2296
2297 if (currHit_fused && dir == FUSION_RR)
2298 *currHit = currHit->reverse();
2299
2300 if (fusion_dir == FUSION_FR || fusion_dir == FUSION_RF ||
2301 (currHit_fused && currHit->ref_id() == currHit->ref_id2() && (dir == FUSION_FR || dir == FUSION_RF)))
2302 {
2303 if (currHit_fused)
2304 {
2305 if ((dir == FUSION_FR && currHit->antisense_align()) ||
2306 (dir == FUSION_RF && !currHit->antisense_align()))
2307 *currHit = currHit->reverse();
2308 }
2309 else
2310 {
2311 if (fusion_dir == FUSION_FR && currHit->antisense_align())
2312 *currHit = currHit->reverse();
2313 }
2314 }
2315 /*
2316 * Switch prevHit and currHit in FUSION_NOTHING and FUSION_FF cases
2317 * to make it easier to check the distance in the gap between the two segments.
2318 */
2319 else if ((num_fusions == 0 && prevHit->antisense_align() && currHit->antisense_align() && prevHit->ref_id() == currHit->ref_id() && prevHit->left() <= currHit->right() + max_report_intron_length && prevHit->left() + max_insertion_length >= currHit->right()) ||
2320 (num_fusions == 1 && (dir == FUSION_FF || dir == FUSION_RR)&&
2321 ((!prevHit_fused && prevHit->antisense_align()) || (!currHit_fused && currHit->antisense_align())))
2322 )
2323 {
2324 BowtieHit* tempHit = prevHit;
2325 prevHit = currHit;
2326 currHit = tempHit;
2327 }
2328 else if (num_fusions == 0)
2329 {
2330 if (prevHit->ref_id2() == currHit->ref_id() && prevHit->antisense_align() == currHit->antisense_align())
2331 {
2332 int dist = 0;
2333 if (prevHit->antisense_align())
2334 dist = prevHit->left() - currHit->right();
2335 else
2336 dist = currHit->left() - prevHit->right();
2337
2338 if (dist > max_report_intron_length || dist < -(int)max_insertion_length)
2339 {
2340 if ((prevHit->antisense_align() && prevHit->left() > currHit->left()) ||
2341 (!prevHit->antisense_align() && prevHit->left() < currHit->left()))
2342 dir = FUSION_FF;
2343 else
2344 dir = FUSION_RR;
2345 }
2346 }
2347 else
2348 {
2349 if (prevHit->antisense_align() == currHit->antisense_align())
2350 {
2351 if ((prevHit->antisense_align() && prevHit->ref_id() > currHit->ref_id()) ||
2352 (!prevHit->antisense_align() && prevHit->ref_id() < currHit->ref_id()))
2353 dir = FUSION_FF;
2354 else
2355 dir = FUSION_RR;
2356 }
2357 else if (!prevHit->antisense_align())
2358 dir = FUSION_FR;
2359 else
2360 dir = FUSION_RF;
2361
2362 if (dir == FUSION_FR)
2363 *currHit = currHit->reverse();
2364 else if(dir == FUSION_RF)
2365 *prevHit = prevHit->reverse();
2366 }
2367 }
2368
2369 // daehwan - test
2370 if (bDebug)
2371 {
2372 cout << "insert id: " << prevHit->insert_id() << endl;
2373 cout << "(" << curr - 1 << ") prev: " << prevHit->seq() << " : " << (prevHit->fusion_opcode() != FUSION_NOTHING ? "fused" : "no") << endl;
2374 cout << "(" << curr << ") curr: " << currHit->seq() << " : " << (currHit->fusion_opcode() != FUSION_NOTHING ? "fused" : "no") << endl;
2375 cout << "prev ref: " << prevHit->ref_id() << "-" << prevHit->ref_id2() << ": " << print_cigar(prevHit->cigar()) << endl;
2376 cout << "curr ref: " << currHit->ref_id() << "-" << currHit->ref_id2() << ": " << print_cigar(currHit->cigar()) << endl;
2377 cout << "prev coords: "
2378 << prevHit->left()
2379 << "\t"
2380 << prevHit->right()
2381 << endl;
2382 cout << "curr corrds: "
2383 << currHit->left()
2384 << "\t"
2385 << currHit->right()
2386 << endl;
2387 cout << "prev sense: "
2388 << (prevHit->antisense_align() ? "-" : "+")
2389 << "\t"
2390 << (prevHit->antisense_align2() ? "-" : "+")
2391 << endl;
2392 cout << "curr sense: "
2393 << (currHit->antisense_align() ? "-" : "+")
2394 << "\t"
2395 << (currHit->antisense_align2() ? "-" : "+")
2396 << endl;
2397 }
2398
2399 if (num_fusions == 1)
2400 {
2401 // daehwan
2402 if (bDebug)
2403 {
2404 cout << "direction: " << (int)dir << endl;
2405 }
2406
2407 /*
2408 * orient the fused segment, which depends on a fusion direction.
2409 */
2410 if (dir != FUSION_FF && dir != FUSION_RR)
2411 {
2412 bool prevHit_rep = false;
2413 bool currHit_rep = false;
2414
2415 if (prevHit_fused)
2416 {
2417 if ((dir == FUSION_FR && !currHit->antisense_align()) ||
2418 (dir == FUSION_RF && currHit->antisense_align()))
2419 continue;
2420
2421 if (prevHit->ref_id2() != currHit->ref_id())
2422 prevHit_rep = true;
2423 else if ((dir == FUSION_FR && prevHit->antisense_align()) ||
2424 (dir == FUSION_RF && !prevHit->antisense_align()))
2425 prevHit_rep = true;
2426 }
2427
2428 if (currHit_fused)
2429 {
2430 if ((dir == FUSION_FR && prevHit->antisense_align()) ||
2431 (dir == FUSION_RF && !prevHit->antisense_align()))
2432 continue;
2433
2434 if (currHit->ref_id() != prevHit->ref_id2())
2435 currHit_rep = true;
2436 }
2437
2438 if (bDebug)
2439 {
2440 if (prevHit_rep) cout << "1. reversed in prev" << endl;
2441 if (currHit_rep) cout << "1. reversed in curr" << endl;
2442 }
2443
2444 if (prevHit_rep)
2445 *prevHit = prevHit->reverse();
2446 if (currHit_rep)
2447 *currHit = currHit->reverse();
2448
2449 prevHit_rep = false;
2450 currHit_rep = false;
2451
2452 if (prevHit_fused)
2453 {
2454 if (prevHit->is_forwarding_right() != currHit->is_forwarding_left())
2455 currHit_rep = true;
2456 }
2457 else
2458 {
2459 if (prevHit->is_forwarding_right() != currHit->is_forwarding_left())
2460 prevHit_rep = true;
2461 }
2462
2463 if (prevHit_rep)
2464 *prevHit = prevHit->reverse();
2465 if (currHit_rep)
2466 *currHit = currHit->reverse();
2467
2468 // daehwan
2469 if (bDebug)
2470 {
2471 if (prevHit_rep) cout << "2. reversed in prev" << endl;
2472 if (currHit_rep) cout << "2. reversed in curr" << endl;
2473 }
2474 }
2475 }
2476
2477 bool same_contig = prevHit->ref_id2() == currHit->ref_id();
2478 if (!same_contig && num_fusions > 0)
2479 continue;
2480
2481 if (same_contig && num_fusions >= 1)
2482 {
2483 if (prevHit->antisense_align2() != currHit->antisense_align())
2484 continue;
2485 }
2486
2487 int bh_l = 0, back_right = 0, dist = 0;
2488 if (same_contig)
2489 {
2490 if ((fusion_dir == FUSION_FR || fusion_dir == FUSION_RF || dir == FUSION_FR || dir == FUSION_RF) &&
2491 prevHit->antisense_align2())
2492 {
2493 bh_l = prevHit->right() + 1;
2494 back_right = currHit->left() + 1;
2495 }
2496 else
2497 {
2498 bh_l = currHit->left();
2499 back_right = prevHit->right();
2500 }
2501
2502 dist = bh_l - back_right;
2503 }
2504
2505 // daehwan - pass
2506 if (bDebug)
2507 {
2508 cout << "daehwan - pass" << endl;
2509 cout << "prev coords: " << prevHit->left() << "\t" << prevHit->right() << endl;
2510 cout << "curr coords: " << currHit->left() << "\t" << currHit->right() << endl;
2511 }
2512
2513 if (!same_contig ||
2514 (same_contig && num_fusions == 0 && dir != FUSION_NOTHING && fusion_dir == FUSION_NOTHING) ||
2515 (same_contig && dist <= max_report_intron_length && dist >= -(int)max_insertion_length && prevHit->is_forwarding_right() == currHit->is_forwarding_left()))
2516 {
2517 // daehwan
2518 if (bDebug)
2519 {
2520 cout << "daehwan - really passed!!" << endl;
2521 }
2522
2523 BowtieHit tempHit = seg_hit_stack.back();
2524 seg_hit_stack.back() = bh_prev;
2525
2526 // these hits are compatible, so push bh onto the
2527 // stack, recurse, and pop it when done.
2528 seg_hit_stack.push_back(bh);
2529 bool success = dfs_seg_hits(rt,
2530 read_seq,
2531 read_quals,
2532 possible_juncs,
2533 possible_insertions,
2534 possible_fusions,
2535 seg_hits_for_read,
2536 curr + 1,
2537 seg_hit_stack,
2538 joined_hits,
2539 dir == FUSION_NOTHING ? fusion_dir : dir);
2540
2541 if (success)
2542 join_success = true;
2543
2544 seg_hit_stack.pop_back();
2545 seg_hit_stack.back() = tempHit;
2546 }
2547 }
2548 }
2549 else
2550 {
2551 merge_segment_chain(rt,
2552 read_seq,
2553 read_quals,
2554 possible_juncs,
2555 possible_insertions,
2556 possible_fusions,
2557 seg_hit_stack,
2558 joined_hits,
2559 fusion_dir);
2560
2561 return join_success = true;
2562 }
2563 return join_success;
2564 }
2565
2566 bool join_segments_for_read(RefSequenceTable& rt,
2567 const string& read_seq,
2568 const string& read_quals,
2569 std::set<Junction>& possible_juncs,
2570 std::set<Insertion>& possible_insertions,
2571 std::set<Fusion>& possible_fusions,
2572 vector<HitsForRead>& seg_hits_for_read,
2573 vector<BowtieHit>& joined_hits)
2574 {
2575 vector<BowtieHit> seg_hit_stack;
2576 bool join_success = false;
2577
2578 // ignore segments that map to more than this many places.
2579 if (bowtie2)
2580 {
2581 for (size_t s = 0; s < seg_hits_for_read.size(); ++s)
2582 {
2583 if (seg_hits_for_read[s].hits.size() > max_seg_multihits)
2584 return join_success;
2585 }
2586 }
2587
2588 for (size_t i = 0; i < seg_hits_for_read[0].hits.size(); ++i)
2589 {
2590 BowtieHit& bh = seg_hits_for_read[0].hits[i];
2591
2592 // daehwan - remove this
2593 //if (bh.insert_id() == 811745)
2594 //bDebug = true;
2595
2596 if (bh.fusion_opcode() == FUSION_RR)
2597 seg_hit_stack.push_back(bh.reverse());
2598 else
2599 seg_hit_stack.push_back(bh);
2600
2601 bool success = dfs_seg_hits(rt,
2602 read_seq,
2603 read_quals,
2604 possible_juncs,
2605 possible_insertions,
2606 possible_fusions,
2607 seg_hits_for_read,
2608 1,
2609 seg_hit_stack,
2610 joined_hits);
2611 if (success)
2612 join_success = true;
2613 seg_hit_stack.pop_back();
2614 }
2615
2616 return join_success;
2617 }
2618
2619 struct JoinSegmentsWorker
2620 {
2621 void operator()()
2622 {
2623 ReadTable it;
2624 GBamWriter bam_writer(bam_output_fname.c_str(), sam_header_fname.c_str());
2625 ReadStream readstream(reads_fname);
2626 if (readstream.file() == NULL)
2627 err_die("Error: cannot open %s for reading\n", reads_fname.c_str());
2628
2629 if (read_offset > 0)
2630 readstream.seek(read_offset);
2631
2632 bool need_seq = true;
2633 bool need_qual = true;
2634
2635 vector<HitStream> contig_hits;
2636 vector<HitStream> spliced_hits;
2637 vector<HitFactory*> factories;
2638 for (size_t i = 0; i < segmap_fnames.size(); ++i)
2639 {
2640 HitFactory* fac = new BAMHitFactory(it, *rt);
2641 factories.push_back(fac);
2642 HitStream hs(segmap_fnames[i], fac, false, false, false, need_seq, need_qual);
2643
2644 if (seg_offsets[i] > 0)
2645 hs.seek(seg_offsets[i]);
2646
2647 contig_hits.push_back(hs);
2648 }
2649
2650 for (size_t i = 0; i < spliced_segmap_fnames.size(); ++i)
2651 {
2652 int anchor_length = 0;
2653 HitFactory* fac = new SplicedBAMHitFactory(it, *rt, anchor_length);
2654 factories.push_back(fac);
2655
2656 HitStream hs(spliced_segmap_fnames[i], fac, true, false, false, need_seq, need_qual);
2657 if (spliced_seg_offsets[i] > 0)
2658 hs.seek(spliced_seg_offsets[i]);
2659
2660 spliced_hits.push_back(hs);
2661 }
2662
2663 uint32_t curr_contig_obs_order = VMAXINT32;
2664 HitStream* first_seg_contig_stream = NULL;
2665 uint64_t next_contig_id = 0;
2666
2667 if (contig_hits.size() > 0)
2668 {
2669 first_seg_contig_stream = &(contig_hits.front());
2670 next_contig_id = first_seg_contig_stream->next_group_id();
2671 curr_contig_obs_order = it.observation_order(next_contig_id);
2672 }
2673
2674 HitsForRead curr_hit_group;
2675
2676 uint32_t curr_spliced_obs_order = VMAXINT32;
2677 HitStream* first_seg_spliced_stream = NULL;
2678 uint64_t next_spliced_id = 0;
2679
2680 if (spliced_hits.size() > 0)
2681 {
2682 first_seg_spliced_stream = &(spliced_hits.front());
2683 next_spliced_id = first_seg_spliced_stream->next_group_id();
2684 curr_spliced_obs_order = it.observation_order(next_spliced_id);
2685 }
2686
2687 while((curr_contig_obs_order != VMAXINT32 || curr_spliced_obs_order != VMAXINT32) &&
2688 (curr_contig_obs_order < end_id || curr_spliced_obs_order < end_id))
2689 {
2690 uint32_t read_in_process;
2691 vector<HitsForRead> seg_hits_for_read;
2692 seg_hits_for_read.resize(contig_hits.size());
2693
2694 if (curr_contig_obs_order < curr_spliced_obs_order)
2695 {
2696 first_seg_contig_stream->next_read_hits(curr_hit_group);
2697 seg_hits_for_read.front() = curr_hit_group;
2698
2699 next_contig_id = first_seg_contig_stream->next_group_id();
2700 uint32_t next_order = it.observation_order(next_contig_id);
2701
2702 read_in_process = curr_contig_obs_order;
2703 curr_contig_obs_order = next_order;
2704 }
2705 else if (curr_spliced_obs_order < curr_contig_obs_order)
2706 {
2707 first_seg_spliced_stream->next_read_hits(curr_hit_group);
2708 seg_hits_for_read.front() = curr_hit_group;
2709
2710 next_spliced_id = first_seg_spliced_stream->next_group_id();
2711 uint32_t next_order = it.observation_order(next_spliced_id);
2712
2713 read_in_process = curr_spliced_obs_order;
2714 curr_spliced_obs_order = next_order;
2715
2716 if (read_in_process < begin_id)
2717 continue;
2718 }
2719 else if (curr_contig_obs_order == curr_spliced_obs_order &&
2720 curr_contig_obs_order != VMAXINT32 &&
2721 curr_spliced_obs_order != VMAXINT32)
2722 {
2723 first_seg_contig_stream->next_read_hits(curr_hit_group);
2724
2725 HitsForRead curr_spliced_group;
2726 first_seg_spliced_stream->next_read_hits(curr_spliced_group);
2727
2728 curr_hit_group.hits.insert(curr_hit_group.hits.end(),
2729 curr_spliced_group.hits.begin(),
2730 curr_spliced_group.hits.end());
2731 seg_hits_for_read.front() = curr_hit_group;
2732 read_in_process = curr_spliced_obs_order;
2733
2734 next_contig_id = first_seg_contig_stream->next_group_id();
2735 uint32_t next_order = it.observation_order(next_contig_id);
2736
2737 next_spliced_id = first_seg_spliced_stream->next_group_id();
2738 uint32_t next_spliced_order = it.observation_order(next_spliced_id);
2739
2740 curr_spliced_obs_order = next_spliced_order;
2741 curr_contig_obs_order = next_order;
2742 }
2743 else
2744 {
2745 break;
2746 }
2747
2748 if (contig_hits.size() > 1)
2749 {
2750 look_right_for_hit_group(it,
2751 contig_hits,
2752 0,
2753 spliced_hits,
2754 curr_hit_group,
2755 seg_hits_for_read);
2756 }
2757
2758 size_t last_non_empty = seg_hits_for_read.size() - 1;
2759 while(last_non_empty >= 0 && seg_hits_for_read[last_non_empty].hits.empty())
2760 {
2761 --last_non_empty;
2762 }
2763
2764 seg_hits_for_read.resize(last_non_empty + 1);
2765 if (!seg_hits_for_read[last_non_empty].hits[0].end())
2766 continue;
2767
2768 if (!seg_hits_for_read.empty() && !seg_hits_for_read[0].hits.empty())
2769 {
2770 uint64_t insert_id = seg_hits_for_read[0].hits[0].insert_id();
2771 if (insert_id >= begin_id && insert_id < end_id)
2772 {
2773 Read read;
2774 if (readstream.getRead(insert_id, read))
2775 {
2776 vector<BowtieHit> joined_hits;
2777 join_segments_for_read(*rt,
2778 read.seq.c_str(),
2779 read.qual.c_str(),
2780 *possible_juncs,
2781 *possible_insertions,
2782 *possible_fusions,
2783 seg_hits_for_read,
2784 joined_hits);
2785
2786 sort(joined_hits.begin(), joined_hits.end());
2787 vector<BowtieHit>::iterator new_end = unique(joined_hits.begin(), joined_hits.end());
2788 joined_hits.erase(new_end, joined_hits.end());
2789 for (size_t i = 0; i < joined_hits.size(); i++)
2790 {
2791 const char* ref_name = rt->get_name(joined_hits[i].ref_id());
2792 const char* ref_name2 = "";
2793 if (joined_hits[i].fusion_opcode() != FUSION_NOTHING)
2794 ref_name2 = rt->get_name(joined_hits[i].ref_id2());
2795
2796 vector<string> extra_fields;
2797
2798 if (!color)
2799 bowtie_sam_extra(joined_hits[i], *rt, extra_fields);
2800
2801 if (color)
2802 print_bamhit(bam_writer, read.name.c_str(), joined_hits[i], ref_name, ref_name2,
2803 joined_hits[i].seq().c_str(), joined_hits[i].qual().c_str(), true, &extra_fields);
2804 else
2805 print_bamhit(bam_writer, read.name.c_str(), joined_hits[i], ref_name, ref_name2,
2806 read.seq.c_str(), read.qual.c_str(), false, &extra_fields);
2807 }
2808 }
2809 else
2810 {
2811 err_die("Error: could not get read # %d from stream\n",
2812 read_in_process);
2813 }
2814 }
2815 }
2816 else
2817 {
2818 //fprintf(stderr, "Warning: couldn't join segments for read # %d\n", read_in_process);
2819 }
2820 }
2821
2822 for (size_t fac = 0; fac < factories.size(); ++fac)
2823 {
2824 delete factories[fac];
2825 }
2826 factories.clear();
2827 }
2828
2829 string bam_output_fname;
2830 string sam_header_fname;
2831 string reads_fname;
2832
2833 vector<string> segmap_fnames;
2834 vector<string> spliced_segmap_fnames;
2835
2836 std::set<Junction>* possible_juncs;
2837 std::set<Insertion>* possible_insertions;
2838 std::set<Fusion>* possible_fusions;
2839
2840 RefSequenceTable* rt;
2841
2842 uint64_t begin_id;
2843 uint64_t end_id;
2844 int64_t read_offset;
2845 vector<int64_t> seg_offsets;
2846 vector<int64_t> spliced_seg_offsets;
2847 };
2848
2849 void driver(const string& bam_output_fname,
2850 istream& ref_stream,
2851 vector<FILE*>& possible_juncs_files,
2852 vector<FILE*>& possible_insertions_files,
2853 vector<FILE*>& possible_deletions_files,
2854 vector<FILE*>& possible_fusions_files,
2855 vector<string>& spliced_segmap_fnames, //.bam files
2856 vector<string>& segmap_fnames, //.bam files
2857 const string& reads_fname)
2858 {
2859 if (!parallel)
2860 num_threads = 1;
2861
2862 if (segmap_fnames.size() == 0)
2863 {
2864 fprintf(stderr, "No hits to process, exiting\n");
2865 exit(0);
2866 }
2867
2868 RefSequenceTable rt(sam_header, true);
2869 fprintf (stderr, "Loading reference sequences...\n");
2870 get_seqs(ref_stream, rt, true);
2871 fprintf (stderr, " reference sequences loaded.\n");
2872
2873 fprintf(stderr, "Loading junctions...");
2874 std::set<Junction> possible_juncs;
2875
2876 for (size_t i = 0; i < possible_juncs_files.size(); ++i)
2877 {
2878 char buf[2048];
2879 while(!feof(possible_juncs_files[i]) &&
2880 fgets(buf, sizeof(buf), possible_juncs_files[i]))
2881 {
2882 char junc_ref_name[256];
2883 int left;
2884 int right;
2885 char orientation;
2886 int ret = sscanf(buf, "%s %d %d %c", junc_ref_name, &left, &right, &orientation);
2887 if (ret != 4)
2888 continue;
2889 uint32_t ref_id = rt.get_id(junc_ref_name, NULL, 0);
2890 possible_juncs.insert(Junction(ref_id, left, right, orientation == '-'));
2891 }
2892 }
2893 fprintf(stderr, "done\n");
2894
2895 fprintf(stderr, "Loading deletions...");
2896
2897 for (size_t i = 0; i < possible_deletions_files.size(); ++i)
2898 {
2899 char splice_buf[2048];
2900 FILE* deletions_file = possible_deletions_files[i];
2901 if(!deletions_file){
2902 continue;
2903 }
2904 while(fgets(splice_buf, 2048, deletions_file)){
2905 char* nl = strrchr(splice_buf, '\n');
2906 char* buf = splice_buf;
2907 if (nl) *nl = 0;
2908
2909 char* ref_name = get_token((char**)&buf, "\t");
2910 char* scan_left_coord = get_token((char**)&buf, "\t");
2911 char* scan_right_coord = get_token((char**)&buf, "\t");
2912
2913 if (!scan_left_coord || !scan_right_coord)
2914 {
2915 err_die("Error: malformed deletion coordinate record\n");
2916 }
2917
2918 uint32_t ref_id = rt.get_id(ref_name,NULL,0);
2919 uint32_t left_coord = atoi(scan_left_coord);
2920 uint32_t right_coord = atoi(scan_right_coord);
2921 possible_juncs.insert((Junction)Deletion(ref_id, left_coord - 1,right_coord, false));
2922 }
2923 }
2924 fprintf(stderr, "done\n");
2925
2926 /*
2927 * Read the insertions from the list of insertion
2928 * files into a set
2929 */
2930 fprintf(stderr, "Loading insertions...");
2931 std::set<Insertion> possible_insertions;
2932 for (size_t i=0; i < possible_insertions_files.size(); ++i)
2933 {
2934 char splice_buf[2048];
2935 FILE* insertions_file = possible_insertions_files[i];
2936 if(!insertions_file){
2937 continue;
2938 }
2939 while(fgets(splice_buf, 2048, insertions_file)){
2940 char* nl = strrchr(splice_buf, '\n');
2941 char* buf = splice_buf;
2942 if (nl) *nl = 0;
2943
2944 char* ref_name = get_token((char**)&buf, "\t");
2945 char* scan_left_coord = get_token((char**)&buf, "\t");
2946 char* scan_right_coord = get_token((char**)&buf, "\t");
2947 char* scan_sequence = get_token((char**)&buf, "\t");
2948
2949 if (!scan_left_coord || !scan_sequence || !scan_right_coord)
2950 {
2951 err_die("Error: malformed insertion coordinate record\n");
2952 }
2953
2954 uint32_t ref_id = rt.get_id(ref_name,NULL,0);
2955 uint32_t left_coord = atoi(scan_left_coord);
2956 std::string sequence(scan_sequence);
2957 possible_insertions.insert(Insertion(ref_id, left_coord, sequence));
2958 }
2959 }
2960 fprintf(stderr, "done\n");
2961
2962 vector<uint64_t> read_ids;
2963 vector<vector<int64_t> > offsets;
2964 if (num_threads > 1)
2965 {
2966 vector<string> fnames;
2967 fnames.push_back(reads_fname);
2968 fnames.insert(fnames.end(), spliced_segmap_fnames.rbegin(), spliced_segmap_fnames.rend());
2969 fnames.insert(fnames.end(), segmap_fnames.rbegin(), segmap_fnames.rend());
2970 bool enough_data = calculate_offsets(fnames, read_ids, offsets);
2971 if (!enough_data)
2972 num_threads = 1;
2973 }
2974
2975 std::set<Fusion> possible_fusions;
2976 if (fusion_search)
2977 {
2978 fprintf(stderr, "Loading fusions...");
2979 for (size_t i=0; i < possible_fusions_files.size(); ++i)
2980 {
2981 char splice_buf[2048];
2982 FILE* fusions_file = possible_fusions_files[i];
2983 if(!fusions_file){
2984 continue;
2985 }
2986 while(fgets(splice_buf, 2048, fusions_file)){
2987 char* nl = strrchr(splice_buf, '\n');
2988 char* buf = splice_buf;
2989 if (nl) *nl = 0;
2990
2991 char* ref_name1 = strsep((char**)&buf, "\t");
2992 char* scan_left_coord = strsep((char**)&buf, "\t");
2993 char* ref_name2 = strsep((char**)&buf, "\t");
2994 char* scan_right_coord = strsep((char**)&buf, "\t");
2995 char* scan_dir = strsep((char**)&buf, "\t");
2996
2997 if (!ref_name1 || !scan_left_coord || !ref_name2 || !scan_right_coord || !scan_dir)
2998 {
2999 fprintf(stderr,"Error: malformed insertion coordinate record\n");
3000 exit(1);
3001 }
3002
3003 uint32_t ref_id1 = rt.get_id(ref_name1,NULL,0);
3004 uint32_t ref_id2 = rt.get_id(ref_name2,NULL,0);
3005 uint32_t left_coord = atoi(scan_left_coord);
3006 uint32_t right_coord = atoi(scan_right_coord);
3007 uint32_t dir = FUSION_FF;
3008 if (strcmp(scan_dir, "fr") == 0)
3009 dir = FUSION_FR;
3010 else if(strcmp(scan_dir, "rf") == 0)
3011 dir = FUSION_RF;
3012 else if (strcmp(scan_dir, "rr") == 0)
3013 dir = FUSION_RR;
3014
3015 possible_fusions.insert(Fusion(ref_id1, ref_id2, left_coord, right_coord, dir));
3016 }
3017 }
3018 fprintf(stderr, "done\n");
3019 }
3020
3021 vector<boost::thread*> threads;
3022 for (int i = 0; i < num_threads; ++i)
3023 {
3024 JoinSegmentsWorker worker;
3025
3026 if (num_threads == 1)
3027 worker.bam_output_fname = bam_output_fname;
3028 else
3029 {
3030 string filename_base = bam_output_fname.substr(0, bam_output_fname.length() - 4);
3031 char filename[1024] = {0};
3032 sprintf(filename, "%s%d.bam", filename_base.c_str(), i);
3033 worker.bam_output_fname = filename;
3034 }
3035
3036 worker.sam_header_fname = sam_header;
3037 worker.reads_fname = reads_fname;
3038 worker.segmap_fnames = segmap_fnames;
3039 worker.spliced_segmap_fnames = spliced_segmap_fnames;
3040 worker.possible_juncs = &possible_juncs;
3041 worker.possible_insertions = &possible_insertions;
3042 worker.possible_fusions = &possible_fusions;
3043 worker.rt = &rt;
3044 if (i == 0)
3045 {
3046 worker.begin_id = 0;
3047 worker.seg_offsets = vector<int64_t>(segmap_fnames.size(), 0);
3048 worker.spliced_seg_offsets = vector<int64_t>(spliced_segmap_fnames.size(), 0);
3049 worker.read_offset = 0;
3050 }
3051 else
3052 {
3053 worker.begin_id = read_ids[i-1];
3054 worker.seg_offsets.insert(worker.seg_offsets.end(),
3055 offsets[i-1].rbegin(),
3056 offsets[i-1].rbegin() + segmap_fnames.size());
3057 worker.spliced_seg_offsets.insert(worker.spliced_seg_offsets.end(),
3058 offsets[i-1].rbegin() + segmap_fnames.size(),
3059 offsets[i-1].rend() - 1);
3060 worker.read_offset = offsets[i-1][0];
3061 }
3062
3063 worker.end_id = (i+1 < num_threads) ? read_ids[i] : std::numeric_limits<uint64_t>::max();
3064
3065 if (num_threads > 1)
3066 threads.push_back(new boost::thread(worker));
3067 else
3068 worker();
3069 }
3070
3071 for (size_t i = 0; i < threads.size(); ++i)
3072 {
3073 threads[i]->join();
3074 delete threads[i];
3075 threads[i] = NULL;
3076 }
3077 threads.clear();
3078
3079 } //driver
3080
3081 int main(int argc, char** argv)
3082 {
3083 fprintf(stderr, "long_spanning_reads v%s (%s)\n", PACKAGE_VERSION, SVN_REVISION);
3084 fprintf(stderr, "--------------------------------------------\n");
3085
3086 int parse_ret = parse_options(argc, argv, print_usage);
3087 if (parse_ret)
3088 return parse_ret;
3089
3090 if(optind >= argc)
3091 {
3092 print_usage();
3093 return 1;
3094 }
3095
3096 string ref_file_name = argv[optind++];
3097
3098 if(optind >= argc)
3099 {
3100 print_usage();
3101 return 1;
3102 }
3103
3104
3105 string reads_file_name = argv[optind++];
3106
3107 if(optind >= argc)
3108 {
3109 print_usage();
3110 return 1;
3111 }
3112
3113 string juncs_file_list = argv[optind++];
3114
3115 if(optind >= argc)
3116 {
3117 print_usage();
3118 return 1;
3119 }
3120
3121 string insertions_file_list = argv[optind++];
3122 if(optind >= argc)
3123 {
3124 print_usage();
3125 return 1;
3126 }
3127
3128 string deletions_file_list = argv[optind++];
3129
3130 if(optind >= argc)
3131 {
3132 print_usage();
3133 return 1;
3134 }
3135
3136 string fusions_file_list = argv[optind++];
3137
3138 if(optind >= argc)
3139 {
3140 print_usage();
3141 return 1;
3142 }
3143
3144 string bam_output_fname = argv[optind++];
3145
3146 if(optind >= argc)
3147 {
3148 print_usage();
3149 return 1;
3150 }
3151
3152 string segmap_file_list = argv[optind++];
3153 string spliced_segmap_flist;
3154 if(optind < argc)
3155 {
3156 spliced_segmap_flist = argv[optind++];
3157 }
3158
3159 ifstream ref_stream(ref_file_name.c_str(), ifstream::in);
3160 if (!ref_stream.good())
3161 err_die("Error: cannot open %s for reading\n",ref_file_name.c_str());
3162
3163 checkSamHeader();
3164
3165 //FILE* reads_file = fopen(reads_file_name.c_str(), "r");
3166 vector<string> segmap_file_names;
3167 tokenize(segmap_file_list, ",",segmap_file_names);
3168
3169 vector<string> juncs_file_names;
3170 vector<FILE*> juncs_files;
3171 tokenize(juncs_file_list, ",",juncs_file_names);
3172 for (size_t i = 0; i < juncs_file_names.size(); ++i) {
3173 //fprintf(stderr, "Opening %s for reading\n",
3174 // juncs_file_names[i].c_str());
3175 FILE* juncs_file = fopen(juncs_file_names[i].c_str(), "r");
3176 if (juncs_file == NULL) {
3177 fprintf(stderr, "Warning: cannot open %s for reading\n",
3178 juncs_file_names[i].c_str());
3179 continue;
3180 }
3181 juncs_files.push_back(juncs_file);
3182 }
3183
3184 /*
3185 * Read in the deletion file names
3186 */
3187 vector<string> deletions_file_names;
3188 vector<FILE*> deletions_files;
3189 tokenize(deletions_file_list, ",",deletions_file_names);
3190 for (size_t i = 0; i < deletions_file_names.size(); ++i)
3191 {
3192 //fprintf(stderr, "Opening %s for reading\n",
3193 // deletions_file_names[i].c_str());
3194 FILE* deletions_file = fopen(deletions_file_names[i].c_str(), "r");
3195 if (deletions_file == NULL) {
3196 fprintf(stderr, "Warning: cannot open %s for reading\n",
3197 deletions_file_names[i].c_str());
3198 continue;
3199 }
3200 deletions_files.push_back(deletions_file);
3201 }
3202
3203 /*
3204 * Read in the list of filenames that contain
3205 * insertion coordinates
3206 */
3207
3208 vector<string> insertions_file_names;
3209 vector<FILE*> insertions_files;
3210 tokenize(insertions_file_list, ",",insertions_file_names);
3211 for (size_t i = 0; i < insertions_file_names.size(); ++i)
3212 {
3213 //fprintf(stderr, "Opening %s for reading\n",
3214 // insertions_file_names[i].c_str());
3215 FILE* insertions_file = fopen(insertions_file_names[i].c_str(), "r");
3216 if (insertions_file == NULL)
3217 {
3218 fprintf(stderr, "Warning: cannot open %s for reading\n",
3219 insertions_file_names[i].c_str());
3220 continue;
3221 }
3222 insertions_files.push_back(insertions_file);
3223 }
3224
3225 vector<string> spliced_segmap_file_names;
3226 vector<FZPipe> spliced_segmap_files;
3227 string unzcmd;
3228 tokenize(spliced_segmap_flist, ",",spliced_segmap_file_names);
3229
3230 vector<string> fusions_file_names;
3231 vector<FILE*> fusions_files;
3232 tokenize(fusions_file_list, ",",fusions_file_names);
3233 for (size_t i = 0; i < fusions_file_names.size(); ++i)
3234 {
3235 fprintf(stderr, "Opening %s for reading\n",
3236 fusions_file_names[i].c_str());
3237 FILE* fusions_file = fopen(fusions_file_names[i].c_str(), "r");
3238 if (fusions_file == NULL)
3239 {
3240 fprintf(stderr, "Warning: cannot open %s for reading\n",
3241 fusions_file_names[i].c_str());
3242 continue;
3243 }
3244 fusions_files.push_back(fusions_file);
3245 }
3246
3247
3248 driver(bam_output_fname,
3249 ref_stream,
3250 juncs_files,
3251 insertions_files,
3252 deletions_files,
3253 fusions_files,
3254 spliced_segmap_file_names,
3255 segmap_file_names,
3256 reads_file_name);
3257
3258 return 0;
3259 }