ViewVC Help
View File | Revision Log | Show Annotations | View Changeset | Root Listing
root/gclib/tophat_cpp/fusions.cpp
Revision: 166
Committed: Fri Feb 10 02:45:06 2012 UTC (7 years, 8 months ago) by gpertea
File size: 27501 byte(s)
Log Message:
resync

Line File contents
1 /*
2 * fusions.cpp
3 * TopHat
4 */
5
6 #ifdef HAVE_CONFIG_H
7 #include <config.h>
8 #else
9 #define PACKAGE_VERSION "INTERNAL"
10 #define SVN_REVISION "XXX"
11 #endif
12
13
14 #include <cassert>
15 #include <cstdio>
16 #include <cstring>
17 #include <vector>
18 #include <string>
19 #include <map>
20 #include <algorithm>
21 #include <set>
22 #include <stdexcept>
23 #include <iostream>
24 #include <fstream>
25 #include <seqan/sequence.h>
26 #include <seqan/find.h>
27 #include <seqan/file.h>
28 #include <seqan/modifier.h>
29 // #include <seqan/align.h>
30 #include <getopt.h>
31
32 #include "common.h"
33 #include "bwt_map.h"
34 #include "junctions.h"
35 #include "fusions.h"
36 #include "fragments.h"
37 #include "wiggles.h"
38 #include "tokenize.h"
39 #include "reads.h"
40
41 #include "inserts.h"
42
43 // daehwan - this is redundancy, too!
44 int difference(const string& first, const string& second)
45 {
46 int len = seqan::length(first);
47 if (len != (int)seqan::length(second))
48 return 0;
49
50 int min_value = 10000;
51 short value1[1024] = {0,};
52 short value2[1024] = {0,};
53
54 short* curr = value1;
55 short* prev = value2;
56
57 for (int j = 0; j < len; ++j)
58 {
59 for (int i = 0; i < len; ++i)
60 {
61 int value = 10000;
62 int match = first[i] == second[j] ? 0 : 1;
63
64 // right
65 if (i == 0)
66 value = j * 2 + match;
67 else if (j > 0)
68 value = prev[i] + 2;
69
70 int temp_value = 10000;
71
72 // down
73 if (j == 0)
74 temp_value = i * 2 + match;
75 else if (i > 0)
76 temp_value = curr[i-1] + 2;
77
78 if (temp_value < value)
79 value = temp_value;
80
81 // match
82 if (i > 0 && j > 0)
83 temp_value = prev[i-1] + match;
84
85 if (temp_value < value)
86 value = temp_value;
87
88 curr[i] = value;
89
90 if ((i == len - 1 || j == len - 1) && value < min_value)
91 min_value = value;
92 }
93
94 short* temp = prev;
95 prev = curr;
96 curr = temp;
97 }
98
99 return min_value;
100 }
101
102
103 /**
104 * Add fusions from an alignment to an FusionSet.
105 * This will look for fusion in the alignment specified by bh.
106 * @param bh The bowtie hit to be used to specify alignment infromation.
107 * @param fusions The FusionSet that will be updated with the insertion information from teh alignment.
108 */
109 void fusions_from_alignment(const BowtieHit& bh,
110 FusionSet& fusions,
111 RefSequenceTable& rt,
112 bool update_stat)
113 {
114 RefSequenceTable::Sequence* ref_str1 = rt.get_seq(bh.ref_id());
115 RefSequenceTable::Sequence* ref_str2 = rt.get_seq(bh.ref_id2());
116
117 if (!ref_str1 || !ref_str2)
118 return;
119
120 vector<Fusion> new_fusions;
121 fusions_from_spliced_hit(bh, new_fusions);
122
123 for(size_t i = 0; i < new_fusions.size(); ++i)
124 {
125 Fusion fusion = new_fusions[i];
126 const vector<CigarOp>& cigars = bh.cigar();
127
128 /*
129 * Assume read is in the same direction as fusion.
130 */
131
132 // Find the position of Fusion.
133 size_t fusion_pos = 0;
134 for (; fusion_pos < cigars.size(); ++fusion_pos)
135 {
136 CigarOpCode opcode = cigars[fusion_pos].opcode;
137 if (opcode == FUSION_FF || opcode == FUSION_FR || opcode == FUSION_RF || opcode == FUSION_RR)
138 break;
139 }
140
141 if (fusion_pos <= 0 || fusion_pos + 1 >= cigars.size())
142 continue;
143
144 // For left bases,
145 size_t left_pos = 0;
146 for (int j = (int)fusion_pos - 1; j >= 0; --j)
147 {
148 const CigarOp& cigar = cigars[j];
149 switch (cigar.opcode)
150 {
151 case MATCH:
152 case mATCH:
153 case REF_SKIP:
154 case rEF_SKIP:
155 case DEL:
156 case dEL:
157 left_pos += cigar.length;
158 break;
159 default:
160 break;
161 }
162 }
163
164 // For right bases,
165 size_t right_pos = 0;
166 for (size_t j = fusion_pos + 1; j < cigars.size(); ++j)
167 {
168 const CigarOp& cigar = cigars[j];
169 switch (cigar.opcode)
170 {
171 case MATCH:
172 case mATCH:
173 case REF_SKIP:
174 case rEF_SKIP:
175 case DEL:
176 case dEL:
177 right_pos += cigar.length;
178 break;
179 default:
180 break;
181 }
182 }
183
184 if (left_pos < fusion_anchor_length || right_pos < fusion_anchor_length)
185 continue;
186
187 FusionSet::iterator itr = fusions.find(fusion);
188 if (itr != fusions.end())
189 {
190 itr->second.count += 1;
191 }
192 else
193 {
194 assert(fusion.refid1 != 0xFFFFFFFF);
195 FusionStat fusionStat;
196 fusionStat.count = 1;
197 fusions[fusion] = fusionStat;
198
199 itr = fusions.find(fusion);
200
201 if (!update_stat)
202 {
203 /*
204 * make a reversed fusion.
205 * this is necessary to detect reads that contradict the fusion.
206 */
207
208 FusionStat fusionStat_rev;
209 fusionStat_rev.count = 1;
210 fusionStat_rev.reversed = true;
211
212 Fusion fusion_rev(fusion.refid2, fusion.refid1, fusion.right, fusion.left, fusion.dir);
213 fusions[fusion_rev] = fusionStat_rev;
214 }
215 }
216
217 if (update_stat)
218 {
219 if (itr->second.chr1_seq.length() <= 0)
220 {
221 size_t len = 100;
222 size_t half_len = len / 2;
223 size_t increase = 20;
224
225 if (fusion.left >= half_len && fusion.left + half_len <= seqan::length(*ref_str1) &&
226 fusion.right >= half_len && fusion.right + half_len <= seqan::length(*ref_str2))
227 {
228 seqan::Dna5String left, right;
229
230 if (fusion.dir == FUSION_RF || fusion.dir == FUSION_RR)
231 {
232 left = seqan::infix(*ref_str1, fusion.left - half_len, fusion.left + half_len);
233 seqan::convertInPlace(left, seqan::FunctorComplement<seqan::Dna>());
234 seqan::reverseInPlace(left);
235 }
236 else
237 left = seqan::infix(*ref_str1, fusion.left - half_len + 1, fusion.left + half_len + 1);
238
239 if (fusion.dir == FUSION_FR || fusion.dir == FUSION_RR)
240 {
241 right = seqan::infix(*ref_str2, fusion.right - half_len + 1, fusion.right + half_len + 1);
242 seqan::convertInPlace(right, seqan::FunctorComplement<seqan::Dna>());
243 seqan::reverseInPlace(right);
244 }
245 else
246 right = seqan::infix(*ref_str2, fusion.right - half_len, fusion.right + half_len);
247
248 itr->second.chr1_seq = DnaString_to_string(left);
249 itr->second.chr2_seq = DnaString_to_string(right);
250
251 for (size_t j = 0; j < 5; ++j)
252 {
253 size_t pos = (5 - j - 1) * increase / 2;
254 const string& left_sub = itr->second.chr1_seq.substr(pos, (j+1) * increase);
255 const string& right_sub = itr->second.chr2_seq.substr(pos, (j+1) * increase);
256
257 itr->second.diffs.push_back(difference(left_sub, right_sub));
258 }
259 }
260 }
261
262 assert (bh.ref_id() == itr->first.refid1);
263
264 itr->second.left_ext = max((size_t)itr->second.left_ext, left_pos);
265 itr->second.right_ext = max((size_t)itr->second.right_ext, right_pos);
266
267 for (size_t k = 0; k < left_pos && k < itr->second.left_bases.size(); ++k)
268 {
269 ++(itr->second.left_bases[k]);
270 }
271
272 for (size_t k = 0; k < right_pos && k < itr->second.right_bases.size(); ++k)
273 {
274 ++(itr->second.right_bases[k]);
275 }
276 }
277 }
278 }
279
280 void unsupport_fusions(const BowtieHit& bh, FusionSet& fusions, FusionSet& fusions_ref)
281 {
282 if (bh.fusion_opcode() != FUSION_NOTHING || bh.is_spliced() || bh.read_len() < 40)
283 return;
284
285 FusionSet::iterator lb, ub;
286
287 uint32_t left = bh.left() + 20;
288 uint32_t right = bh.right() - 20;
289
290 lb = fusions_ref.upper_bound(Fusion(0u, 0u, left, 0));
291 ub = fusions_ref.lower_bound(Fusion(0xffffffffu, 0xffffffffu, right, 0xffffffffu));
292 while (lb != ub && lb != fusions_ref.end())
293 {
294 if (lb->first.refid1 == bh.ref_id())
295 {
296 // daehwan
297 #if 0
298 // MCF-7 RPS6KB1 17:57970443-58027925:1 TMEM49 17:57784863-57917950:1
299 if ((lb->first.left == 57992061 && lb->first.right == 57917126) ||
300 (lb->first.left == 57917126 && lb->first.right == 57992061))
301 {
302 const char* dir_str = "ff";
303 if (lb->first.dir == FUSION_FR)
304 dir_str = "fr";
305 else if (lb->first.dir == FUSION_RF)
306 dir_str = "rf";
307
308 cout << "fusion: " << lb->first.left << "-" << lb->first.right << endl;
309 cout << dir_str << endl;
310 cout << bh.insert_id() << ": " << bh.left() << "-" << bh.right() << "\t";
311 cout << print_cigar(bh.cigar()) << " " << (int)bh.edit_dist() << endl;
312 cout << bh.seq() << endl;
313 cout << bh.qual() << endl;
314 cout << endl;
315 }
316 #endif
317
318 FusionSet::iterator itr;
319 if (lb->second.reversed)
320 itr = fusions.find(Fusion(lb->first.refid2, lb->first.refid1, lb->first.right, lb->first.left, lb->first.dir));
321 else
322 itr = fusions.find(lb->first);
323
324 if (itr == fusions.end())
325 {
326 FusionStat fusionStat;
327 fusionStat.unsupport_count = 1;
328 fusions[lb->first] = fusionStat;
329 }
330 else
331 ++(itr->second.unsupport_count);
332 }
333
334 ++lb;
335 }
336 }
337
338 /**
339 */
340 void print_fusions(FILE* fusions_out, FusionSet& fusions, RefSequenceTable& ref_sequences)
341 {
342 // fprintf(fusions_out, "track name=fusions description=\"TopHat fusions\"\n");
343
344 vector<Fusion> vFusion;
345 for (FusionSet::iterator itr = fusions.begin(); itr != fusions.end(); ++itr)
346 {
347 vFusion.push_back(itr->first);
348 }
349 sort(vFusion.begin(), vFusion.end());
350
351 for (size_t i = 0; i < vFusion.size(); ++i)
352 {
353 FusionSet::iterator itr = fusions.find(vFusion[i]);
354 int counts = itr->second.count;
355 if (counts <= 0 || itr->second.reversed)
356 continue;
357
358 const char* dir = "";
359 if (itr->first.dir == FUSION_FF)
360 dir = "ff";
361 else if(itr->first.dir == FUSION_FR)
362 dir = "fr";
363 else if(itr->first.dir == FUSION_RF)
364 dir = "rf";
365 else
366 dir = "rr";
367
368 assert (itr->second.left_bases.size() == itr->second.right_bases.size());
369
370 float symm = 0.0f;
371 for (uint32_t i = 0; i < itr->second.left_bases.size(); ++i)
372 {
373 float term = ((int)itr->second.left_bases[i] - (int)itr->second.right_bases[i]) / (float)counts;
374 symm += (term * term);
375 }
376
377 fprintf(fusions_out, "%s-%s\t%d\t%d\t%s\t%d\t%d\t%d\t%d\t%d\t%d\t%.6f",
378 ref_sequences.get_name(itr->first.refid1),
379 ref_sequences.get_name(itr->first.refid2),
380 itr->first.left,
381 itr->first.right,
382 dir,
383 counts,
384 itr->second.pair_count,
385 itr->second.pair_count_fusion,
386 itr->second.unsupport_count,
387 itr->second.left_ext,
388 itr->second.right_ext,
389 symm);
390
391 fprintf(fusions_out, "\t@\t");
392
393 for (uint32_t i = 0; i < itr->second.diffs.size(); ++i)
394 {
395 fprintf(fusions_out, "%d ", itr->second.diffs[i]);
396 }
397
398 fprintf(fusions_out, "\t@\t");
399
400 uint32_t half_length = itr->second.chr1_seq.length() / 2;
401 fprintf(fusions_out, "%s %s\t@\t", itr->second.chr1_seq.substr(0, half_length).c_str(), itr->second.chr1_seq.substr(half_length).c_str());
402 fprintf(fusions_out, "%s %s\t@\t", itr->second.chr2_seq.substr(0, half_length).c_str(), itr->second.chr2_seq.substr(half_length).c_str());
403
404 for (uint32_t i = 0; i < itr->second.left_bases.size(); ++i)
405 {
406 fprintf(fusions_out, "%d ", itr->second.left_bases[i]);
407 }
408
409 fprintf(fusions_out, "\t@\t");
410
411 for (uint32_t i = 0; i < itr->second.right_bases.size(); ++i)
412 {
413 fprintf(fusions_out, "%d ", itr->second.right_bases[i]);
414 }
415
416 fprintf(fusions_out, "\t@\t");
417
418 sort(itr->second.vPairSupport.begin(), itr->second.vPairSupport.end());
419
420 for (uint32_t i = 0; i < min((size_t)200, itr->second.vPairSupport.size()); ++i)
421 {
422 fprintf(fusions_out, "%d:%d ", itr->second.vPairSupport[i].ldist, itr->second.vPairSupport[i].rdist);
423 }
424
425 fprintf(fusions_out, "\n");
426 }
427 }
428
429 /**
430 * Extract a list of fusions from a bowtie hit.
431 * Given a bowtie hit, extract a vector of insertions.
432 * @param bh The bowtie hit to use for alignment information.
433 * @param insertions Used to store the resultant vector of insertions.
434 */
435 void fusions_from_spliced_hit(const BowtieHit& bh, vector<Fusion>& fusions, bool auto_sort)
436 {
437 const vector<CigarOp>& cigar = bh.cigar();
438 unsigned int positionInGenome = bh.left();
439
440 for(size_t c = 0; c < cigar.size(); ++c)
441 {
442 Fusion fusion;
443 switch(cigar[c].opcode)
444 {
445 case REF_SKIP:
446 case MATCH:
447 case DEL:
448 positionInGenome += cigar[c].length;
449 break;
450 case rEF_SKIP:
451 case mATCH:
452 case dEL:
453 positionInGenome -= cigar[c].length;
454 break;
455 case FUSION_FF:
456 case FUSION_FR:
457 case FUSION_RF:
458 case FUSION_RR:
459 fusion.dir = cigar[c].opcode;
460 if (fusion.dir == FUSION_RF || fusion.dir == FUSION_RR)
461 positionInGenome = positionInGenome + 1;
462 else
463 positionInGenome = positionInGenome - 1;
464
465 if (bh.ref_id() < bh.ref_id2() ||
466 (bh.ref_id() == bh.ref_id2() && positionInGenome < cigar[c].length) ||
467 !auto_sort)
468 {
469 fusion.refid1 = bh.ref_id();
470 fusion.refid2 = bh.ref_id2();
471 fusion.left = positionInGenome;
472 fusion.right = cigar[c].length;
473 }
474 else
475 {
476 assert (auto_sort);
477 fusion.refid1 = bh.ref_id2();
478 fusion.refid2 = bh.ref_id();
479 fusion.left = cigar[c].length;
480 fusion.right = positionInGenome;
481 }
482
483 fusions.push_back(fusion);
484 break;
485 default:
486 break;
487 }
488 }
489 }
490
491 void pair_support(const vector<pair<BowtieHit, BowtieHit> >& best_hits, FusionSet& fusions, FusionSet& fusions_ref)
492 {
493 if (best_hits.size() > fusion_multipairs)
494 return;
495
496 for (size_t i = 0; i < best_hits.size(); ++i)
497 {
498 const BowtieHit& lh = best_hits[i].first;
499 const BowtieHit& rh = best_hits[i].second;
500
501 bool left_fusionSpanned = lh.fusion_opcode() != FUSION_NOTHING;
502 bool right_fusionSpanned = rh.fusion_opcode() != FUSION_NOTHING;
503 if (left_fusionSpanned && right_fusionSpanned)
504 continue;
505
506 bool fusionSpanned = left_fusionSpanned || right_fusionSpanned;
507 bool fusion_leftSide = false;
508
509 uint32_t ref_id1 = lh.ref_id2();
510 uint32_t ref_id2 = rh.ref_id();
511
512 int dir = FUSION_FF;
513 if (fusionSpanned)
514 {
515 if (left_fusionSpanned)
516 dir = lh.fusion_opcode();
517 else
518 dir = rh.fusion_opcode();
519 }
520 else
521 {
522 if (!lh.antisense_align() && !rh.antisense_align())
523 dir = FUSION_FR;
524 else if (lh.antisense_align() && rh.antisense_align())
525 dir = FUSION_RF;
526 else
527 {
528 if (lh.ref_id() == rh.ref_id())
529 {
530 if ((lh.antisense_align() && lh.left() > rh.left()) ||
531 (!lh.antisense_align() && lh.left() < rh.left()))
532 dir = FUSION_FF;
533 else
534 dir = FUSION_RR;
535 }
536 else
537 {
538 if ((lh.antisense_align() && lh.ref_id() > rh.ref_id()) ||
539 (!lh.antisense_align() && lh.ref_id() < rh.ref_id()))
540 dir = FUSION_FF;
541 else
542 dir = FUSION_RR;
543 }
544 }
545 }
546
547 FusionSet::iterator lb, ub;
548 bool unsupport = false;
549
550 // int inner_dist = max_report_intron_length * 2;
551
552 int inner_dist = 10000;
553 int outer_dist = 10000 * 2;
554 int max_dist = 10000 * 2;
555 int left1 = 0, left2 = 0, right1 = 0, right2 = 0;
556
557 if (fusionSpanned)
558 {
559 vector<Fusion> new_fusions;
560 if (left_fusionSpanned)
561 fusions_from_spliced_hit(lh, new_fusions);
562 else
563 fusions_from_spliced_hit(rh, new_fusions);
564
565 Fusion& fusion = new_fusions[0];
566 dir = fusion.dir;
567 ref_id1 = fusion.refid1;
568 ref_id2 = fusion.refid2;
569
570 if (left_fusionSpanned && lh.ref_id() != ref_id2 && lh.ref_id2() != ref_id2)
571 unsupport = true;
572
573 if (right_fusionSpanned && rh.ref_id() != ref_id1 && rh.ref_id2() != ref_id1)
574 unsupport = true;
575
576 int temp1, temp2;
577 const BowtieHit* fusionHit;
578 const BowtieHit* otherHit;
579
580 if (left_fusionSpanned)
581 {
582 fusionHit = &lh;
583 otherHit = &rh;
584 }
585 else
586 {
587 fusionHit = &rh;
588 otherHit = &lh;
589 }
590
591 // check the normal hit (otherHit) is on the left hand side of the fusion.
592 if (ref_id1 == ref_id2)
593 {
594 if (dir == FUSION_FF || dir == FUSION_FR)
595 fusion_leftSide = otherHit->left() < fusionHit->left();
596 else if (dir == FUSION_RF)
597 fusion_leftSide = otherHit->left() < fusionHit->right();
598 else
599 fusion_leftSide = otherHit->right() > fusionHit->left();
600 }
601 else
602 fusion_leftSide = fusionHit->ref_id() == otherHit->ref_id();
603
604 // *****
605 // daehwan - make sure what '+' and '-' really mean for FF, FR, RF, RR cases
606 // *****
607 if ((dir != FUSION_RF && dir != FUSION_RR) && fusionHit->antisense_align() && (!fusion_leftSide || otherHit->antisense_align()))
608 unsupport = true;
609
610 if ((dir != FUSION_FR && dir != FUSION_RR) && !fusionHit->antisense_align() && (fusion_leftSide || !otherHit->antisense_align()))
611 unsupport = true;
612
613 if ((dir == FUSION_FR || dir == FUSION_RR) && !fusionHit->antisense_align() && (fusion_leftSide || otherHit->antisense_align()))
614 unsupport = true;
615
616 if ((dir == FUSION_RF || dir == FUSION_RR) && fusionHit->antisense_align() && (!fusion_leftSide || !otherHit->antisense_align()))
617 unsupport = true;
618
619 temp1 = otherHit->left();
620 temp2 = otherHit->right();
621
622 if ((fusion_leftSide && dir == FUSION_RF) || (!fusion_leftSide && dir != FUSION_FR))
623 {
624 temp2 = temp1 + inner_dist;
625 if (temp1 > outer_dist)
626 temp1 = temp1 - outer_dist;
627 else
628 temp1 = 0;
629 }
630 else
631 {
632 if (temp2 >= inner_dist)
633 temp1 = temp2 - inner_dist;
634 else
635 temp1 = 0;
636
637 temp2 = temp2 + outer_dist;
638 }
639
640 if (fusion_leftSide)
641 {
642 left1 = temp1;
643 left2 = temp2;
644 }
645 else
646 {
647 right1 = temp1;
648 right2 = temp2;
649 }
650
651 lb = fusions_ref.find(fusion);
652 ub = fusions_ref.end();
653
654 // daehwan - debug
655 #if 0
656 if (fusion.left == 6994359 && fusion.right == 17581683)
657 {
658 cout << "daehwan - test - pair_with_fusion: " << lh.insert_id() << endl;
659 cout << "edit dist: " << (int)lh.edit_dist() << ", " << (int)rh.edit_dist() << endl;
660 const char* dir_str = "ff";
661 if (dir == FUSION_FR)
662 dir_str = "fr";
663 else if (dir == FUSION_RF)
664 dir_str = "rf";
665 else if (dir == FUSION_RR)
666 dir_str = "rr";
667
668 cout << dir_str << " : " << (lh.antisense_align() ? "-" : "+") << " " << (rh.antisense_align() ? "-" : "+") << endl;
669 cout << lh.ref_id() << ": " << lh.left() << "-" << lh.right() << endl;
670 cout << lh.ref_id2() << ": " << print_cigar(lh.cigar()) << endl;
671 cout << rh.ref_id() << ": " << rh.left() << "-" << rh.right() << endl;
672 cout << rh.ref_id2() << ": " << print_cigar(rh.cigar()) << endl;
673 cout << "found: " << (lb == ub ? "no" : "yes") << endl;
674 cout << "unsupport: " << (unsupport ? "yes" : "no") << endl;
675 cout << "fusion_left: " << (fusion_leftSide ? "yes" : "no") << endl;
676 cout << endl;
677 }
678 #endif
679 }
680 else
681 {
682 if (dir == FUSION_FF)
683 {
684 if (lh.antisense_align())
685 {
686 if (rh.right() >= inner_dist)
687 right1 = rh.right() - inner_dist;
688 right2 = right1 + outer_dist;
689
690 left2 = lh.left() + inner_dist;
691 if (left2 > outer_dist)
692 left1 = left2 - outer_dist;
693 }
694 else
695 {
696 if (lh.right() >= inner_dist)
697 left1 = lh.right() - inner_dist;
698 left2 = left1 + outer_dist;
699
700 right2 = rh.left() + inner_dist;
701 if (right2 > outer_dist)
702 right1 = right2 - outer_dist;
703 }
704 }
705 else if (dir == FUSION_FR)
706 {
707 if (lh.right() >= inner_dist)
708 left1 = lh.right() - inner_dist;
709 left2 = left1 + outer_dist;
710
711 if (rh.right() >= inner_dist)
712 right1 = rh.right() - inner_dist;
713 right2 = right1 + outer_dist;
714 }
715 else if (dir == FUSION_RF)
716 {
717 left2 = lh.left() + inner_dist;
718 right2 = rh.left() + inner_dist;
719
720 if (left2 > outer_dist)
721 left1 = left2 - outer_dist;
722
723 if (right2 > outer_dist)
724 right1 = right2 - outer_dist;
725 }
726 else // if (dir == FUSION_RR)
727 {
728 if (lh.antisense_align())
729 {
730 left2 = lh.left() + inner_dist;
731 if (left2 > outer_dist)
732 left1 = left2 - outer_dist;
733
734 if (rh.right() >= inner_dist)
735 right1 = rh.right() - inner_dist;
736 right2 = right1 + outer_dist;
737 }
738 else
739 {
740 if (lh.right() >= inner_dist)
741 left1 = lh.right() - inner_dist;
742 left2 = left1 + outer_dist;
743
744 right2 = rh.left() + inner_dist;
745 if (right2 > outer_dist)
746 right1 = right2 - outer_dist;
747 }
748 }
749
750 // daehwan - debug
751 #if 0
752 if (fusion.left == 6994359 && fusion.right == 17581683)
753 {
754 const char* dir_str = "ff";
755 if (dir == FUSION_FR)
756 dir_str = "fr";
757 else if (dir == FUSION_RF)
758 dir_str = "rf";
759 else if (dir == FUSION_RR)
760 dir_str = "rr";
761
762 cout << "paired-end from two chromosomes" << endl;
763 cout << "insert id: " << lh.insert_id() << endl;
764 cout << dir_str << " : " << (lh.antisense_align() ? "-" : "+") << " " << (rh.antisense_align() ? "-" : "+") << endl;
765 cout << lh.ref_id() << ": " << lh.left() << "-" << lh.right() << endl;
766 cout << lh.ref_id2() << " " << print_cigar(lh.cigar()) << endl;
767 cout << rh.ref_id() << ": " << rh.left() << "-" << rh.right() << endl;
768 cout << rh.ref_id2() << " " << print_cigar(rh.cigar()) << endl;
769 cout << "left: " << left1 << "-" << left2 << endl;
770 cout << "right: " << right1 << "-" << right2 << endl;
771 cout << endl;
772 }
773 #endif
774
775 if (ref_id1 > ref_id2 || (ref_id1 == ref_id2 && lh.left() > rh.left()))
776 {
777 uint32_t temp = ref_id1;
778 ref_id1 = ref_id2;
779 ref_id2 = temp;
780
781 temp = left1;
782 left1 = right1;
783 right1 = temp;
784
785 temp = left2;
786 left2 = right2;
787 right2 = temp;
788 }
789
790 lb = fusions_ref.upper_bound(Fusion(ref_id1, ref_id2, left1, right1));
791 ub = fusions_ref.lower_bound(Fusion(ref_id1, ref_id2, left2, right2));
792
793 // daehwan
794 #if 0
795 static const uint32_t chr_id1 = RefSequenceTable::hash_string("chr2");
796 static const uint32_t chr_id2 = RefSequenceTable::hash_string("chr3");
797
798 if ((lh.ref_id() == chr_id1 && rh.ref_id() == chr_id2) ||
799 (lh.ref_id() == chr_id2 && rh.ref_id() == chr_id1))
800 {
801 // KPL-4 BSG 19:571325-583492:1 NFIX 19:13106584-13209610:1
802 // const uint32_t left1 = 571325, right1 = 583492, left2 = 13106584, right2 = 13209610;
803
804 // SK-BR-3 DHX35 20:37590942-37668366:1 ITCH 20:32951041-33099198:1
805 // const uint32_t left1 = 37590942, right1 = 37668366, left2 = 32951041, right2 = 33099198;
806
807 // SK-BR-3 NFS1 20:34220262-34287287:-1 PREX1 20:47240790-47444420:-1
808 // const uint32_t left1 = 34220262, right1 = 34287287, left2 = 47240790, right2 = 47444420;
809
810 // VCaP TIA1 2:70436576-70475792:-1 DIRC2 3:122513642-122599986:1
811 uint32_t left1 = 70436576, right1 = 70475792, left2 = 122513642, right2 = 122599986;
812 if (lh.ref_id() != chr_id1)
813 {
814 uint32_t temp = left1;
815 left1 = left2;
816 left2 = temp;
817
818 temp = right1;
819 right1 = right2;
820 right2 = temp;
821 }
822
823 if ((lh.left() >= left1 && lh.left() <= right1 && rh.left() >= left2 && rh.left() <= right2) ||
824 (lh.left() >= left2 && lh.left() <= right2 && rh.left() >= left1 && rh.left() <= right1))
825 {
826 for (size_t t = 0; t < left.size(); ++t)
827 {
828 const BowtieHit& lh = left[t];
829 const BowtieHit& rh = right[t];
830
831 const char* dir_str = "ff";
832 if (dir == FUSION_FR)
833 dir_str = "fr";
834 else if (dir == FUSION_RF)
835 dir_str = "rf";
836 else if (dir == FUSION_RR)
837 dir_str = "rr";
838
839 cout << "paired-end from two chromosomes" << endl;
840 cout << "insert id: " << lh.insert_id() << endl;
841 cout << dir_str << " : " << (lh.antisense_align() ? "-" : "+") << " " << (rh.antisense_align() ? "-" : "+") << endl;
842 cout << lh.ref_id() << ": " << lh.left() << "-" << lh.right() << endl;
843 cout << lh.ref_id2() << " " << print_cigar(lh.cigar()) << endl;
844 cout << rh.ref_id() << ": " << rh.left() << "-" << rh.right() << endl;
845 cout << rh.ref_id2() << " " << print_cigar(rh.cigar()) << endl;
846 cout << "left: " << left1 << "-" << left2 << endl;
847 cout << "right: " << right1 << "-" << right2 << endl;
848 cout << endl;
849 }
850 }
851 }
852 #endif
853 }
854
855 while (lb != ub && lb != fusions_ref.end())
856 {
857 if (lb->first.dir == dir &&
858 lb->first.refid1 == ref_id1 &&
859 lb->first.refid2 == ref_id2 &&
860 ((!fusionSpanned && lb->first.right >= right1 && lb->first.right <= right2) || fusionSpanned) &&
861 !lb->second.reversed)
862 {
863 int dist = 0, left_dist = 0, right_dist = 0;
864
865 // daehwan - check this out
866 if (!fusionSpanned || fusion_leftSide)
867 {
868 if (dir == FUSION_RF || dir == FUSION_RR)
869 left_dist = (left2 - inner_dist) - (int)lb->first.left;
870 else
871 left_dist = (int)lb->first.left - (left1 + inner_dist);
872 }
873
874 if (!fusionSpanned || !fusion_leftSide)
875 {
876 if (dir == FUSION_FR || dir == FUSION_RR)
877 right_dist = (int)lb->first.right - (right1 + inner_dist);
878 else
879 right_dist = (right2 - inner_dist) - (int)lb->first.right;
880 }
881
882 dist = abs(left_dist) + abs(right_dist);
883
884 // daehwan - fix this later
885 if (dist < 0 && fusionSpanned)
886 unsupport = true;
887 bool pass = dist < max_dist;
888
889 // daehwan
890 #if 0
891 // if(!fusionSpanned)
892 // if (pass && !unsupport)
893 if (lb->first.left == 6994359 && lb->first.right == 17581683)
894 {
895 const char* dir_str = "ff";
896 if (dir == FUSION_FR)
897 dir_str = "fr";
898 else if (dir == FUSION_RF)
899 dir_str = "rf";
900 else if (dir == FUSION_RR)
901 dir_str = "rr";
902
903 cout << dir_str << endl;
904 cout << "dist: " << dist << endl;
905 cout << lb->first.refid1 << " " << lb->first.refid2 << endl;
906 cout << lb->first.left << "-" << lb->first.right << endl;
907 cout << "unsupport: " << (unsupport ? "yes" : "no") << endl;
908 cout << "pass: " << (pass ? "yes" : "no") << endl;
909 cout << "ids: " << lh.insert_id() << " : " << rh.insert_id() << endl;
910 cout << endl << endl;
911 }
912 #endif
913
914 FusionSet::iterator itr = fusions.find(lb->first);
915 if (itr != fusions.end())
916 {
917 if (unsupport)
918 ++(itr->second.unsupport_count_pair);
919 else if (pass)
920 {
921 if (fusionSpanned)
922 ++(itr->second.pair_count_fusion);
923 else
924 {
925 if (itr->second.vPairSupport.size() < 200)
926 itr->second.vPairSupport.push_back(FusionPairSupport(left_dist, right_dist));
927 ++(itr->second.pair_count);
928 }
929 }
930 }
931 else
932 {
933 FusionStat fusionStat;
934 if (unsupport)
935 fusionStat.unsupport_count_pair = 1;
936 else if (pass)
937 {
938 if (fusionSpanned)
939 fusionStat.pair_count_fusion = 1;
940 else
941 {
942 fusionStat.vPairSupport.push_back(FusionPairSupport(left_dist, right_dist));
943 fusionStat.pair_count = 1;
944 }
945 }
946
947 fusions[lb->first] = fusionStat;
948 }
949 }
950
951 if (fusionSpanned)
952 break;
953
954 ++lb;
955 }
956 }
957 }
958
959 void merge_with(FusionSimpleSet& fusions, const FusionSimpleSet& other_fusions)
960 {
961 for (FusionSimpleSet::const_iterator other_itr = other_fusions.begin(); other_itr != other_fusions.end(); ++other_itr)
962 {
963 FusionSimpleSet::iterator itr = fusions.find(other_itr->first);
964 if (itr != fusions.end())
965 {
966 FusionSimpleStat& curr = itr->second;
967 curr.merge_with(other_itr->second);
968 }
969 else
970 {
971 fusions[other_itr->first] = other_itr->second;
972 }
973 }
974 }
975
976 void merge_with(FusionSet& fusions, const FusionSet& other_fusions)
977 {
978 for (FusionSet::const_iterator other_itr = other_fusions.begin(); other_itr != other_fusions.end(); ++other_itr)
979 {
980 FusionSet::iterator itr = fusions.find(other_itr->first);
981 if (itr != fusions.end())
982 {
983 FusionStat& curr = itr->second;
984 curr.merge_with(other_itr->second);
985 }
986 else
987 {
988 fusions[other_itr->first] = other_itr->second;
989 }
990 }
991 }

Properties

Name Value
svn:executable *