ViewVC Help
View File | Revision Log | Show Annotations | View Changeset | Root Listing
root/gclib/tophat_cpp/inserts.cpp
Revision: 154
Committed: Tue Jan 24 02:29:21 2012 UTC (7 years, 8 months ago) by gpertea
File size: 8629 byte(s)
Log Message:
massive update with Daehwan's work

Line File contents
1 /*
2 * inserts.cpp
3 * TopHat
4 *
5 * Created by Cole Trapnell on 1/14/09.
6 * Copyright 2009 Cole Trapnell. All rights reserved.
7 *
8 */
9 #ifdef HAVE_CONFIG_H
10 #include <config.h>
11 #endif
12
13 #include <iostream>
14 #include <cassert>
15 #include <map>
16 #include <set>
17 #include "bwt_map.h"
18 #include "common.h"
19 #include "inserts.h"
20 using namespace std;
21
22 bool InsertAlignmentGrade::operator<(const InsertAlignmentGrade& rhs)
23 {
24 if (fusion && !rhs.fusion)
25 return true;
26
27 if (!fusion && rhs.fusion)
28 return false;
29
30 // We always prefer a insert alignment with both ends mapped than a
31 // singleton
32 if (num_mapped != rhs.num_mapped)
33 {
34 return num_mapped < rhs.num_mapped;
35 }
36 // daehwan - later we should do many things to report the best pair alignments.
37 else if (num_mapped == 2 && !fusion)
38 {
39 // if significant difference in their inner mate distances
40 if (abs(rhs.inner_dist - inner_dist) >= 30)
41 {
42 // Prefer a pair that is too close or perfect to one that is too far
43 if (too_far && !rhs.too_far)
44 return true;
45
46 // Prefer a pair that is perfect to one that is too close
47 if (too_close && !(rhs.too_close || rhs.too_far))
48 return true;
49
50 // Prefer closer mates
51 if (rhs.inner_dist < inner_dist)
52 return true;
53 }
54
55 if (edit_dist != rhs.edit_dist)
56 return rhs.edit_dist < edit_dist;
57
58 // daehwan - do somethings here
59 if (!bowtie2)
60 {
61 // Prefer shorter introns
62 if (longest_ref_skip != rhs.longest_ref_skip)
63 return rhs.longest_ref_skip < longest_ref_skip;
64 }
65
66 return false;
67 }
68 else
69 {
70 // We prefer a singleton mapping to an insert with neither end mapped
71 if (num_mapped != rhs.num_mapped)
72 {
73 return num_mapped < rhs.num_mapped; // if RHS has MORE READS, RHS is BETTER (lhs < rhs)
74 }
75 else
76 {
77 if (rhs.num_spliced != num_spliced)
78 return rhs.num_spliced < num_spliced;// if RHS is LESS SPLICED, RHS is BETTER (lhs < rhs)
79
80 // Prefer shorter introns
81 if (longest_ref_skip != rhs.longest_ref_skip)
82 return rhs.longest_ref_skip < longest_ref_skip; // if RHS intron is SHORTER, RHS is BETTER (lhs < rhs)
83
84 return rhs.edit_dist < edit_dist; // if RHS edit is LOWER, RHS is BETTER (lhs < rhs)
85 }
86 }
87 return false;
88 }
89
90 bool gap_lt(const pair<int, int>& lhs, const pair<int, int>& rhs)
91 {
92 return abs(lhs.second - lhs.first) < abs(rhs.second - rhs.first);
93 }
94
95 pair<int, int> pair_distances(const BowtieHit& h1, const BowtieHit& h2)
96 {
97 int minor_hit_start, major_hit_start;
98 int minor_hit_end, major_hit_end;
99 if (h1.left() < h2.left())
100 {
101 minor_hit_start = (int)h1.left();
102 minor_hit_end = (int)h1.right();
103 major_hit_start = (int)h2.left();
104 major_hit_end = (int)h2.right();
105 }
106 else
107 {
108 minor_hit_start = (int)h2.left();
109 minor_hit_end = (int)h2.right();
110 major_hit_start = (int)h1.left();
111 major_hit_end = (int)h1.right();
112 }
113
114 int inner_dist = major_hit_start - minor_hit_end;
115 int outer_dist = major_hit_end - minor_hit_start;
116 return make_pair(outer_dist, inner_dist);
117 }
118
119 void best_insert_mappings(uint64_t refid,
120 ReadTable& it,
121 HitList& hits1_in_ref,
122 HitList& hits2_in_ref,
123 BestInsertAlignmentTable& best_status_for_inserts,
124 bool prefer_shorter_pairs)
125 {
126 long chucked_for_shorter_pair = 0;
127 std::set<size_t> marked;
128 HitList::iterator last_good = hits2_in_ref.begin();
129
130 for (size_t i = 0; i < hits1_in_ref.size(); ++i)
131 {
132 BowtieHit& h1 = hits1_in_ref[i];
133 pair<HitList::iterator, HitList::iterator> range_pair;
134 range_pair = equal_range(last_good, hits2_in_ref.end(),
135 h1, hit_insert_id_lt);
136 bool found_hit = false;
137 if (range_pair.first != range_pair.second)
138 last_good = range_pair.first;
139
140 uint32_t obs_order = it.observation_order(h1.insert_id());
141
142 for (HitList::iterator f = range_pair.first;
143 f != range_pair.second;
144 ++f)
145 {
146 BowtieHit& h2 = *f;
147
148 if (h1.insert_id() == h2.insert_id())
149 {
150 // max mate inner distance (genomic)
151 int min_mate_inner_dist = inner_dist_mean - inner_dist_std_dev;
152 if (max_mate_inner_dist == -1)
153 {
154 max_mate_inner_dist = inner_dist_mean + inner_dist_std_dev;
155 }
156
157 InsertAlignmentGrade s(h1, h2, min_mate_inner_dist, max_mate_inner_dist);
158
159 pair<InsertAlignmentGrade, vector<InsertAlignment> >& insert_best
160 = best_status_for_inserts[obs_order];
161 InsertAlignmentGrade& current = insert_best.first;
162 // Is the new status better than the current best one?
163 if (current < s)
164 {
165 insert_best.second.clear();
166 current = s;
167 insert_best.second.push_back(InsertAlignment(refid, &h1, &h2));
168 }
169 else if (!(s < current))
170 {
171 if (prefer_shorter_pairs && current.num_mapped == 2)
172 {
173 pair<int, int> dc = pair_distances(*(insert_best.second[0].left_alignment), *(insert_best.second[0].right_alignment));
174 pair<int, int> ds = pair_distances(h1,h2);
175 if (ds.second < dc.second)
176 {
177 chucked_for_shorter_pair += insert_best.second.size();
178 insert_best.second.clear();
179 current = s;
180 insert_best.second.push_back(InsertAlignment(refid, &h1, &h2));
181 }
182 }
183 else
184 {
185 insert_best.second.push_back(InsertAlignment(refid, &h1, &h2));
186 }
187 }
188
189 marked.insert(f - hits2_in_ref.begin());
190 found_hit = true;
191 }
192
193 }
194 if (!found_hit)
195 {
196 pair<InsertAlignmentGrade, vector<InsertAlignment> >& insert_best
197 = best_status_for_inserts[obs_order];
198 InsertAlignmentGrade& current = insert_best.first;
199
200 InsertAlignmentGrade s(h1);
201
202 if (current < s)
203 {
204 insert_best.second.clear();
205 current = s;
206 insert_best.second.push_back(InsertAlignment(refid, &h1, NULL));
207 }
208 else if (! (s < current))
209 {
210 insert_best.second.push_back(InsertAlignment(refid, &h1, NULL));
211 }
212
213 }
214 }
215
216 for (size_t i = 0; i < hits2_in_ref.size(); ++i)
217 {
218 BowtieHit& h2 = hits2_in_ref[i];
219 uint32_t obs_order = it.observation_order(h2.insert_id());
220 pair<InsertAlignmentGrade, vector<InsertAlignment> >& insert_best
221 = best_status_for_inserts[obs_order];
222 InsertAlignmentGrade& current = insert_best.first;
223
224 InsertAlignmentGrade s(h2);
225 // Did we include h2 as part of a pairing already, or is this first time
226 // we've seen it? If so, it's a singleton.
227 if (marked.find(i) == marked.end())
228 {
229 if (current < s)
230 {
231 insert_best.second.clear();
232 current = s;
233 insert_best.second.push_back(InsertAlignment(refid, NULL, &h2));
234 }
235 else if (! (s < current))
236 {
237 insert_best.second.push_back(InsertAlignment(refid, NULL, &h2));
238 }
239 }
240 }
241 fprintf(stderr, "Chucked %ld pairs for shorter pairing of same mates\n", chucked_for_shorter_pair);
242 }
243
244 int long_spliced = 0;
245 int short_spliced = 0;
246 int singleton_splices = 0;
247
248 bool valid_insert_alignment(const InsertAlignmentGrade& g, const InsertAlignment& a)
249 {
250 if (!a.left_alignment || !a.right_alignment)
251 return false;
252 if (g.num_mapped == 2)
253 {
254 // Take all the contiguously mapped pairs
255 if (g.num_spliced == 0)
256 return true;
257
258 // Take the pairs that include one or more spliced reads as long as
259 // the inner dist isn't too big
260 // if (g.one_spliced || g.both_spliced)
261 // {
262 // if (g.too_far || g.too_close)
263 // return false;
264 // }
265 return true;
266 }
267 return false;
268 }
269
270
271 void insert_best_pairings(RefSequenceTable& rt,
272 ReadTable& it,
273 HitTable& hits1,
274 HitTable& hits2,
275 BestInsertAlignmentTable& best_pairings,
276 bool prefer_shorter_pairs)
277 {
278 for(RefSequenceTable::const_iterator ci = rt.begin();
279 ci != rt.end();
280 ++ci)
281 {
282
283 // Tracks the number of singleton ALIGNMENTS, not the number of singleton
284 // READS in each Bowtie map.
285 vector<size_t> map1_singletons;
286 vector<size_t> map2_singletons;
287 vector<pair<size_t, size_t> > happy_mates;
288
289 uint64_t ref_id = ci->second.observation_order;
290 HitList* hits1_in_ref = hits1.get_hits(ref_id);
291 HitList* hits2_in_ref = hits2.get_hits(ref_id);
292
293 if (!hits1_in_ref || !hits2_in_ref)
294 continue;
295
296 //if (verbose)
297 // fprintf(stderr, "Looking for best insert mappings in %s\n", name.c_str());
298
299 best_insert_mappings(ref_id,
300 it,
301 *hits1_in_ref,
302 *hits2_in_ref,
303 best_pairings,
304 prefer_shorter_pairs);
305 }
306 }