ViewVC Help
View File | Revision Log | Show Annotations | View Changeset | Root Listing
root/gclib/tophat_cpp/junctions.cpp
Revision: 154
Committed: Tue Jan 24 02:29:21 2012 UTC (7 years, 8 months ago) by gpertea
File size: 9864 byte(s)
Log Message:
massive update with Daehwan's work

Line File contents
1 /*
2 * junctions.cpp
3 * TopHat
4 *
5 * Created by Cole Trapnell on 12/12/08.
6 * Copyright 2008 Cole Trapnell. All rights reserved.
7 *
8 */
9
10 #ifdef HAVE_CONFIG_H
11 #include <config.h>
12 #endif
13
14 #include <cassert>
15 #include "common.h"
16 #include "junctions.h"
17
18 void junctions_from_spliced_hit(const BowtieHit& h,
19 vector<pair<Junction, JunctionStats> >& new_juncs)
20 {
21 const vector<CigarOp>& cigar = h.cigar();
22 int j = h.left();
23
24 bool bSawFusion = false;
25 for (size_t c = 0 ; c < cigar.size(); ++c)
26 {
27 Junction junc;
28 JunctionStats stats;
29 switch(cigar[c].opcode)
30 {
31 case REF_SKIP:
32 case rEF_SKIP:
33 if (bSawFusion)
34 junc.refid = h.ref_id2();
35 else
36 junc.refid = h.ref_id();
37
38 if (cigar[c].opcode == REF_SKIP)
39 {
40 junc.left = j;
41 junc.right = junc.left + cigar[c].length;
42 }
43 else
44 {
45 junc.right = j + 1;
46 junc.left = junc.right - cigar[c].length;
47 }
48
49 junc.antisense = h.antisense_splice();
50
51 assert (c > 0 && c < cigar.size() - 1);
52 assert (cigar[c - 1].length);
53 assert (cigar[c + 1].length);
54
55 /*
56 * Note that in valid_hit() in tophat_report.cpp
57 * we have tried to ensure that the REF_SKIP operator
58 * will only be surrounded by match operators
59 */
60 stats.left_extent = cigar[c - 1].length;
61 stats.right_extent = cigar[c + 1].length;
62 stats.min_splice_mms = h.splice_mms();
63 stats.supporting_hits++;
64 new_juncs.push_back(make_pair(junc, stats));
65 j += cigar[c].length;
66 break;
67 case MATCH:
68 case DEL:
69 j += cigar[c].length;
70 break;
71 case mATCH:
72 case dEL:
73 j -= cigar[c].length;
74 break;
75 case FUSION_FF:
76 case FUSION_FR:
77 case FUSION_RF:
78 j = cigar[c].length;
79 bSawFusion = true;
80 break;
81 default:
82 break;
83 }
84 }
85 }
86
87 void print_junction(FILE* junctions_out,
88 const char* name,
89 const Junction& j,
90 const JunctionStats& s,
91 uint64_t junc_id)
92 {
93 fprintf(junctions_out,
94 "%s\t%d\t%d\tJUNC%08d\t%d\t%c\t%d\t%d\t255,0,0\t2\t%d,%d\t0,%d\n",
95 name,
96 j.left - s.left_extent,
97 j.right + s.right_extent,
98 (int)junc_id,
99 s.supporting_hits,
100 j.antisense ? '-' : '+',
101 j.left - s.left_extent,
102 j.right + s.right_extent,
103 s.left_extent,
104 s.right_extent,
105 j.right - (j.left - s.left_extent));
106 }
107
108 void junctions_from_alignment(const BowtieHit& spliced_alignment,
109 JunctionSet& junctions)
110 {
111 vector<pair<Junction, JunctionStats> > juncs;
112 junctions_from_spliced_hit(spliced_alignment, juncs);
113
114 for (size_t i = 0; i < juncs.size(); ++i)
115 {
116 pair<Junction, JunctionStats>& junc = juncs[i];
117 JunctionSet::iterator itr = junctions.find(junc.first);
118
119 if (itr != junctions.end())
120 {
121 JunctionStats& j = itr->second;
122 j.merge_with(junc.second);
123 }
124 else
125 {
126 assert(junc.first.refid != VMAXINT32);
127 junctions[junc.first] = junc.second;
128 }
129 }
130 }
131
132 #if !NDEBUG
133 void validate_junctions(const JunctionSet& junctions)
134 {
135 uint32_t invalid_juncs = 0;
136 for (JunctionSet::const_iterator i = junctions.begin();
137 i != junctions.end();
138 ++i)
139 {
140 if (!i->first.valid())
141 invalid_juncs++;
142 }
143 fprintf(stderr, "Found %d invalid junctions\n", invalid_juncs);
144 }
145 #endif
146
147 int rejected = 0;
148 int rejected_spliced = 0;
149 int total_spliced = 0;
150 int total = 0;
151
152 /*
153 void junctions_from_alignments(HitTable& hits,
154 JunctionSet& junctions)
155 {
156 for (HitTable::iterator ci = hits.begin();
157 ci != hits.end();
158 ++ci)
159 {
160 HitList& rh = ci->second;
161 if (rh.size() == 0)
162 continue;
163 for (size_t i = 0; i < rh.size(); ++i)
164 {
165 BowtieHit& bh = rh[i];
166 AlignStatus s = status(&bh);
167 total++;
168 if (s == SPLICED)
169 total_spliced++;
170
171 if (s == SPLICED)
172 {
173 junctions_from_alignment(bh, junctions);
174 }
175 }
176 }
177 }
178 */
179
180 bool accept_if_valid(const Junction& j, JunctionStats& s)
181 {
182 if (min(s.left_extent, s.right_extent) < min_anchor_len)
183 {
184 s.accepted = false;
185 return false;
186 }
187
188 if (s.min_splice_mms > max_splice_mismatches)
189 {
190 s.accepted = false;
191 return false;
192 }
193
194 // uint32_t junc_doc = 0;
195 // uint8_t extent = 0;
196 // if (s.left_exon_doc > s.right_exon_doc)
197 // {
198 // junc_doc = s.left_exon_doc;
199 // extent = s.left_extent;
200 // }
201 // else
202 // {
203 // junc_doc = s.right_exon_doc;
204 // extent = s.right_extent;
205 // }
206 //
207 // double avg_junc_doc = junc_doc / (double)(extent);
208
209 //if (avg_junc_doc / (float) s.num_reads > 100.0)
210 // if (s.supporting_hits / avg_junc_doc < min_isoform_fraction)
211 // {
212 // s.accepted = false;
213 // }
214 // else
215 {
216 //fprintf (stderr, "Junction size = %d\n, support = %d", (int)j.right - (int)j.left, (int)s.supporting_hits.size() );
217 if ((int)j.right - (int)j.left > 50000)
218 {
219 s.accepted = (s.supporting_hits >= 2 &&
220 min(s.left_extent, s.right_extent) > 12);
221 }
222 else
223 {
224 s.accepted = true;
225 }
226 }
227 return s.accepted;
228 }
229
230 void knockout_shadow_junctions(JunctionSet& junctions)
231 {
232 vector<uint32_t> ref_ids;
233
234 for (JunctionSet::iterator i = junctions.begin(); i != junctions.end(); ++i)
235 {
236 ref_ids.push_back(i->first.refid);
237 }
238
239 sort(ref_ids.begin(), ref_ids.end());
240 vector<uint32_t>::iterator new_end = unique(ref_ids.begin(), ref_ids.end());
241 ref_ids.erase(new_end, ref_ids.end());
242
243 for(size_t i = 0; i < ref_ids.size(); ++i)
244 {
245 uint32_t refid = ref_ids[i];
246
247 Junction dummy_left(refid, 0, 0, true);
248 Junction dummy_right(refid, VMAXINT32, VMAXINT32, true);
249
250 pair<JunctionSet::iterator, JunctionSet::iterator> r;
251 r.first = junctions.lower_bound(dummy_left);
252 r.second = junctions.upper_bound(dummy_right);
253
254 JunctionSet::iterator itr = r.first;
255
256 while(itr != r.second && itr != junctions.end())
257 {
258 if (itr->second.accepted && !itr->second.gtf_match)
259 {
260 Junction fuzzy_left = itr->first;
261 Junction fuzzy_right = itr->first;
262 fuzzy_left.left -= min_anchor_len;
263 fuzzy_right.right += min_anchor_len;
264 fuzzy_left.antisense = !itr->first.antisense;
265 fuzzy_right.antisense = !itr->first.antisense;
266
267 pair<JunctionSet::iterator, JunctionSet::iterator> s;
268 s.first = junctions.lower_bound(fuzzy_left);
269 s.second = junctions.upper_bound(fuzzy_right);
270 JunctionSet::iterator itr2 = s.first;
271
272 int junc_support = itr->second.supporting_hits;
273
274 while(itr2 != s.second && itr2 != junctions.end())
275 {
276 int left_diff = itr->first.left - itr2->first.left;
277 int right_diff = itr->first.right - itr2->first.right;
278 if (itr != itr2 &&
279 itr->first.antisense != itr2->first.antisense &&
280 (left_diff < min_anchor_len || right_diff < min_anchor_len))
281 {
282 if (junc_support < itr2->second.supporting_hits)
283 itr->second.accepted = false;
284 }
285 ++itr2;
286 }
287 }
288 ++itr;
289 }
290 }
291 }
292
293 void filter_junctions(JunctionSet& junctions, const JunctionSet& gtf_junctions)
294 {
295 for (JunctionSet::iterator i = junctions.begin(); i != junctions.end(); ++i)
296 {
297 if (gtf_junctions.find(i->first) == gtf_junctions.end())
298 accept_if_valid(i->first, i->second);
299 else {//automatically accept junctions matching GTF
300 i->second.accepted = true;
301 i->second.gtf_match = true;
302 }
303 }
304
305 knockout_shadow_junctions(junctions);
306 }
307
308 void accept_all_junctions(JunctionSet& junctions,
309 const uint32_t refid)
310 {
311 fprintf(stderr, "Accepting all junctions\n");
312 for (JunctionSet::iterator itr = junctions.begin(); itr != junctions.end(); ++itr)
313 {
314 itr->second.accepted = true;
315 }
316 }
317
318
319 void print_junctions(FILE* junctions_out,
320 const JunctionSet& junctions,
321 RefSequenceTable& ref_sequences)
322 {
323 uint64_t junc_id = 1;
324 fprintf(junctions_out, "track name=junctions description=\"TopHat junctions\"\n");
325 for (JunctionSet::const_iterator i = junctions.begin();
326 i != junctions.end();
327 ++i)
328 {
329 const pair<Junction, JunctionStats>& j_itr = *i;
330 const Junction& j = j_itr.first;
331 const JunctionStats& s = j_itr.second;
332
333 assert(ref_sequences.get_name(j.refid));
334 //fprintf(stdout,"%d\t%d\t%d\t%c\n", j.refid, j.left, j.right, j.antisense ? '-' : '+');
335 print_junction(junctions_out,
336 ref_sequences.get_name(j.refid),
337 j,
338 s,
339 junc_id++);
340 }
341 //fprintf(stderr, "Rejected %d / %d alignments, %d / %d spliced\n", rejected, total, rejected_spliced, total_spliced);
342 }
343
344 // Extracts junctions from all the SAM hits (based on REF_SKIPs) in the hit file
345 // resets the stream when finished.
346 void get_junctions_from_hits(HitStream& hit_stream,
347 ReadTable& it,
348 JunctionSet& junctions)
349 {
350 HitsForRead curr_hit_group;
351 hit_stream.next_read_hits(curr_hit_group);
352
353 uint32_t curr_obs_order = it.observation_order(curr_hit_group.insert_id);
354
355 while(curr_obs_order != VMAXINT32)
356 {
357 for (size_t i = 0; i < curr_hit_group.hits.size(); ++i)
358 {
359 BowtieHit& bh = curr_hit_group.hits[i];
360 if (!bh.contiguous())
361 {
362 junctions_from_alignment(bh, junctions);
363 }
364 hit_stream.next_read_hits(curr_hit_group);
365 curr_obs_order = it.observation_order(curr_hit_group.insert_id);
366 }
367 }
368
369 hit_stream.reset();
370 }
371
372 void merge_with(JunctionSet& juncs, const JunctionSet& other_juncs)
373 {
374 for (JunctionSet::const_iterator junc = other_juncs.begin(); junc != other_juncs.end(); ++junc)
375 {
376 JunctionSet::iterator itr = juncs.find(junc->first);
377 if (itr != juncs.end())
378 {
379 JunctionStats& curr = itr->second;
380 curr.merge_with(junc->second);
381 }
382 else
383 {
384 juncs[junc->first] = junc->second;
385 }
386 }
387 }