ViewVC Help
View File | Revision Log | Show Annotations | View Changeset | Root Listing
root/gclib/tophat_cpp/closures.cpp
Revision: 135
Committed: Mon Dec 12 22:28:38 2011 UTC (8 years, 10 months ago) by gpertea
File size: 19442 byte(s)
Log Message:
wip - SplicedSAMHitFactory() still not implemented

Line File contents
1 /*
2 * closures.cpp
3 * TopHat
4 *
5 * Created by Cole Trapnell on 1/15/09.
6 * Copyright 2009 Cole Trapnell. All rights reserved.
7 *
8 */
9
10 #ifdef HAVE_CONFIG_H
11 #include <config.h>
12 #endif
13
14 #include <iostream>
15 #include <fstream>
16 #include <getopt.h>
17 #include "bwt_map.h"
18 #include "inserts.h"
19 #include "closures.h"
20 #include "reads.h"
21 #include "tokenize.h"
22 #include <seqan/sequence.h>
23 #include <seqan/file.h>
24
25 using namespace seqan;
26 using namespace std;
27
28
29 bool possible_cotranscript(const BowtieHit& h1, const BowtieHit& h2, bool check_strand = true)
30 {
31 if (h1.insert_id() != h2.insert_id())
32 return false;
33 int min_mate_inner_dist = inner_dist_mean - inner_dist_std_dev;
34 if (max_mate_inner_dist == -1)
35 {
36 max_mate_inner_dist = inner_dist_mean + inner_dist_std_dev;
37 }
38
39 InsertAlignmentGrade grade(h1,h2, min_mate_inner_dist, max_mate_inner_dist);
40 return (!grade.too_far && !grade.too_close && grade.opposite_strands);
41 }
42
43
44 void check_mates(const HitList& hits1_in_ref,
45 const HitList& hits2_in_ref,
46 vector<pair<size_t, size_t> >& happy_mates,
47 vector<size_t>& map1_singletons,
48 vector<size_t>& map2_singletons)
49 {
50 std::set<size_t> marked;
51 // TODO: if this shows up on the profile, replace it with a linear
52 // time algorithm. This one is 2*n*lg(n).
53 HitList::const_iterator last_good = hits2_in_ref.begin();
54
55 for (size_t i = 0; i < hits1_in_ref.size(); ++i)
56 {
57 pair<HitList::const_iterator, HitList::const_iterator> range_pair;
58 range_pair = equal_range(last_good, hits2_in_ref.end(),
59 hits1_in_ref[i], hit_insert_id_lt);
60 bool found_hit = false;
61 if (range_pair.first != range_pair.second)
62 last_good = range_pair.first;
63 for (HitList::const_iterator f = range_pair.first;
64 f != range_pair.second;
65 ++f)
66 {
67 if (possible_cotranscript(hits1_in_ref[i], *f))
68 {
69 happy_mates.push_back(make_pair(i,f - hits2_in_ref.begin()));
70 marked.insert(f - hits2_in_ref.begin());
71 found_hit = true;
72 }
73 }
74 if (!found_hit)
75 map1_singletons.push_back(i);
76 }
77
78 for (size_t i = 0; i < hits2_in_ref.size(); ++i)
79 {
80 if (marked.find(i) == marked.end())
81 {
82 map2_singletons.push_back(i);
83 }
84 }
85 }
86
87 bool prefer_shorter_pairs = true;
88
89 typedef pair<InsertAlignmentGrade, vector<InsertAlignment> > BestPairingHits;
90
91 template<typename Visitor>
92 void visit_best_pairing(HitsForRead& left_hit_group,
93 HitsForRead& right_hit_group,
94 Visitor& visitor)
95 {
96 BestPairingHits insert_best;
97
98 for (size_t i = 0; i < left_hit_group.hits.size(); ++i)
99 {
100 BowtieHit& h1 = left_hit_group.hits[i];
101
102 for (size_t j = 0; j < right_hit_group.hits.size(); j++)
103 {
104 BowtieHit& h2 = right_hit_group.hits[j];
105 if (h1.ref_id() != h2.ref_id())
106 continue;
107
108 uint32_t refid = h1.ref_id();
109
110 int min_mate_inner_dist = inner_dist_mean - inner_dist_std_dev;
111 if (max_mate_inner_dist == -1)
112 {
113 max_mate_inner_dist = inner_dist_mean + inner_dist_std_dev;
114 }
115 InsertAlignmentGrade s(h1, h2, min_mate_inner_dist, max_mate_inner_dist);
116
117 //pair<InsertAlignmentGrade, vector<InsertAlignment> >& insert_best
118 // = best_status_for_inserts[curr_left_obs_order];
119 InsertAlignmentGrade& current = insert_best.first;
120 // Is the new status better than the current best one?
121 if (current < s)
122 {
123 insert_best.second.clear();
124 current = s;
125 insert_best.second.push_back(InsertAlignment(refid, &h1, &h2));
126 }
127 else if (! (s < current))
128 {
129 if (prefer_shorter_pairs && current.num_mapped == 2)
130 {
131 pair<int, int> dc = pair_distances(*(insert_best.second[0].left_alignment), *(insert_best.second[0].right_alignment));
132 pair<int, int> ds = pair_distances(h1,h2);
133 if (ds.second < dc.second)
134 {
135 //chucked_for_shorter_pair += insert_best.second.size();
136 insert_best.second.clear();
137 current = s;
138 insert_best.second.push_back(InsertAlignment(refid, &h1, &h2));
139
140 }
141 }
142 else
143 {
144 insert_best.second.push_back(InsertAlignment(refid, &h1, &h2));
145 }
146 }
147 }
148 }
149
150 visitor.visit(insert_best);
151 }
152
153 class CovMapIntronFinder
154 {
155 std::set<SpliceMotif> _tmp_hits;
156 vector<SpliceMotif> _hits;
157 RefSequenceTable::Sequence* _ref_seq;
158 public:
159 CovMapIntronFinder() : _ref_seq(NULL) {}
160 CovMapIntronFinder(RefSequenceTable::Sequence* ref_seq) :
161 _ref_seq(ref_seq),
162 _lms(vector<bool>(length(*ref_seq), false)),
163 _rms(vector<bool>(length(*ref_seq), false)){}
164
165 void add_motifs_in_window(int left,
166 int right,
167 const string& left_motif,
168 const string& right_motif)
169 {
170 size_t l_start = max(0,left);
171 for (int i = l_start;
172 i < min(right, (int)length(*_ref_seq) - 2);
173 ++i)
174 {
175 seqan::Infix<RefSequenceTable::Sequence>::Type curr
176 = seqan::infix(*_ref_seq,i, i + 2);
177 if (curr == left_motif)
178 _lms[i] = true;
179 else if (curr == right_motif)
180 _rms[i] = true;
181 }
182 }
183
184 void finalize()
185 {
186 int pos = 0;
187 for (vector<bool>::iterator itr = _lms.begin();
188 itr != _lms.end();
189 ++itr, ++pos)
190 {
191 if (*itr)
192 _hits.push_back(SpliceMotif(pos, true));
193 }
194
195 pos = 0;
196 for (vector<bool>::iterator itr = _rms.begin();
197 itr != _rms.end();
198 ++itr, ++pos)
199 {
200 if (*itr)
201 _hits.push_back(SpliceMotif(pos, false));
202 }
203
204 sort(_hits.begin(), _hits.end());
205 }
206
207 size_t seq_len() const { return length(*_ref_seq); }
208 const vector<SpliceMotif>& hits() const { return _hits; }
209 vector<bool> _lms;
210 vector<bool> _rms;
211 };
212
213 typedef CovMapIntronFinder CIF;
214
215 struct RefCIF
216 {
217 RefCIF() : ref_seq(NULL) {}
218 RefCIF(const CIF& f, const CIF& r, RefSequenceTable::Sequence* s) :
219 fwd_cif(f), rev_cif(r), ref_seq(s) {}
220 CIF fwd_cif;
221 CIF rev_cif;
222 RefSequenceTable::Sequence* ref_seq;
223 };
224
225 class CoverageMapVisitor
226 {
227 public:
228 map<uint32_t, RefCIF> finders;
229
230 CoverageMapVisitor(istream& ref_stream,
231 RefSequenceTable& rt)
232 {
233
234 while(ref_stream.good() &&
235 !ref_stream.eof())
236 {
237 RefSequenceTable::Sequence* ref_str = new RefSequenceTable::Sequence;
238 string name;
239 readMeta(ref_stream, name, Fasta());
240 string::size_type space_pos = name.find_first_of(" \t\r");
241 if (space_pos != string::npos)
242 {
243 name.resize(space_pos);
244 }
245 read(ref_stream, *ref_str, Fasta());
246
247 uint32_t ref_id = rt.get_id(name, NULL, 0);
248 finders[ref_id] = RefCIF(CIF(ref_str), CIF(ref_str), ref_str);
249 }
250 }
251
252 void visit(BestPairingHits& pairings)
253 {
254 if (!pairings.first.num_mapped == 2)
255 return;
256
257 static string fwd_lm("GT");
258 static string fwd_rm("AG");
259 static string rev_lm("CT");
260 static string rev_rm("AC");
261
262 for (size_t i = 0;
263 i < pairings.second.size();
264 ++i)
265 {
266
267 InsertAlignment& al = pairings.second[i];
268
269 BowtieHit& bh_left = *(al.left_alignment);
270 BowtieHit& bh_right = *(al.right_alignment);
271
272 assert (bh_left.ref_id() == bh_right.ref_id());
273
274 map<uint32_t, RefCIF >::iterator if_itr = finders.find(bh_left.ref_id());
275 assert (if_itr != finders.end());
276
277 RefCIF& ref_finders = if_itr->second;
278
279 pair<int, int> ds = pair_distances(bh_left, bh_right);
280
281 int minor_hit_start, major_hit_start;
282 int minor_hit_end, major_hit_end;
283 if (bh_left.left() < bh_right.left())
284 {
285 minor_hit_start = (int)bh_left.left();
286 minor_hit_end = (int)bh_left.right();
287 major_hit_start = (int)bh_right.left();
288 major_hit_end = (int)bh_right.right();
289 }
290 else
291 {
292 minor_hit_start = (int)bh_right.left();
293 minor_hit_end = (int)bh_right.right();
294 major_hit_start = (int)bh_left.left();
295 major_hit_end = (int)bh_left.right();
296 }
297
298 int inner_dist = major_hit_start - minor_hit_end;
299
300 bool skip_fwd = false;
301 bool skip_rev = false;
302
303 if (library_type == FR_FIRSTSTRAND)
304 {
305 if (bh_left.antisense_align()) skip_rev = true;
306 else skip_fwd = true;
307 }
308
309 if (library_type == FR_SECONDSTRAND)
310 {
311 if (bh_left.antisense_align()) skip_fwd = true;
312 else skip_rev = true;
313 }
314
315 if (inner_dist > inner_dist_mean + 2 * inner_dist_std_dev)
316 {
317 // Forward strand
318 if (!skip_fwd)
319 {
320 ref_finders.fwd_cif.add_motifs_in_window((int)bh_left.left() - 10,
321 bh_left.right() + 10,
322 fwd_lm,
323 fwd_rm);
324
325 ref_finders.fwd_cif.add_motifs_in_window((int)bh_right.left() - 10,
326 bh_right.right() + 10,
327 fwd_lm,
328 fwd_rm);
329 }
330
331 // Reverse strand
332 if (!skip_rev)
333 {
334 ref_finders.rev_cif.add_motifs_in_window((int)bh_left.left() - 10,
335 bh_left.right() + 10,
336 rev_lm,
337 rev_rm);
338
339 ref_finders.rev_cif.add_motifs_in_window((int)bh_right.left() - 10,
340 bh_right.right() + 10,
341 rev_lm,
342 rev_rm);
343 }
344 }
345 }
346 }
347
348 void finalize()
349 {
350 for (map<uint32_t, RefCIF >::iterator itr = finders.begin();
351 itr != finders.end();
352 ++itr)
353 {
354 itr->second.fwd_cif.finalize();
355 itr->second.rev_cif.finalize();
356 delete itr->second.ref_seq;
357 itr->second.ref_seq = NULL;
358 }
359 }
360 };
361
362
363 class JunctionMapVisitor
364 {
365 public:
366 typedef JunctionFinder<RefSequenceTable::Sequence, CIF> JF;
367
368 struct JunctionTable
369 {
370 JunctionTable() : jf(NULL), possible_splices(NULL) {}
371 JunctionTable(ClosureJunctionSet* ps, JF* _jf, bool as)
372 : jf(_jf), possible_splices(ps), antisense(as) {}
373
374 JF* jf;
375 ClosureJunctionSet* possible_splices;
376 bool antisense;
377 };
378
379
380 map<uint32_t, pair<JunctionTable, JunctionTable> > _finders;
381
382 JunctionMapVisitor(ClosureJunctionSet& fwd_splices,
383 ClosureJunctionSet& rev_splices,
384 map<uint32_t, RefCIF >& finders)
385 {
386 static const uint32_t bowtie_padding = 5;
387 for (map<uint32_t, RefCIF >::iterator itr = finders.begin();
388 itr != finders.end();
389 ++itr)
390 {
391 JF* fwd_jf = new JF(itr->second.fwd_cif,
392 inner_dist_mean,
393 inner_dist_std_dev,
394 min_closure_intron_length,
395 min_closure_exon_length,
396 bowtie_padding,
397 island_extension);
398 JF* rev_jf = new JF(itr->second.rev_cif,
399 inner_dist_mean,
400 inner_dist_std_dev,
401 min_closure_intron_length,
402 min_closure_exon_length,
403 bowtie_padding,
404 island_extension);
405
406 _finders[itr->first] = make_pair(JunctionTable(&fwd_splices, fwd_jf,false),
407 JunctionTable(&rev_splices, rev_jf, true));
408 }
409 }
410
411
412 void visit(BestPairingHits& pairings)
413 {
414 if (!pairings.first.num_mapped == 2)
415 return;
416 for (size_t i = 0;
417 i < pairings.second.size();
418 ++i)
419 {
420
421 InsertAlignment& al = pairings.second[i];
422
423 BowtieHit& bh_left = *(al.left_alignment);
424 BowtieHit& bh_right = *(al.right_alignment);
425
426 assert (bh_left.ref_id() == bh_right.ref_id());
427
428 map<uint32_t, pair<JunctionTable, JunctionTable> >::iterator if_itr = _finders.find(bh_left.ref_id());
429 assert (if_itr != _finders.end());
430
431 JF& fwd_jf = *(if_itr->second.first.jf);
432 fwd_jf.possible_junctions(*(if_itr->second.first.possible_splices), bh_left, bh_right);
433
434 JF& rev_jf = *(if_itr->second.second.jf);
435 rev_jf.possible_junctions(*(if_itr->second.second.possible_splices), bh_left, bh_right);
436 }
437 }
438 };
439
440 void closure_driver(vector<FZPipe>& map1,
441 vector<FZPipe>& map2,
442 ifstream& ref_stream,
443 FILE* juncs_file)
444 {
445 typedef RefSequenceTable::Sequence Reference;
446
447 ReadTable it;
448 RefSequenceTable rt(true);
449
450 BowtieHitFactory hit_factory(it,rt);
451
452 fprintf (stderr, "Finding near-covered motifs...");
453 CoverageMapVisitor cov_map_visitor(ref_stream, rt);
454 uint32_t coverage_attempts = 0;
455
456 assert(map1.size() == map2.size());
457 for (size_t num = 0; num < map1.size(); ++num)
458 {
459 HitStream left_hs(map1[num].file, &hit_factory, false, true, false);
460 HitStream right_hs(map2[num].file, &hit_factory, false, true, false);
461
462 HitsForRead curr_left_hit_group;
463 HitsForRead curr_right_hit_group;
464
465 left_hs.next_read_hits(curr_left_hit_group);
466 right_hs.next_read_hits(curr_right_hit_group);
467
468 uint32_t curr_right_obs_order = it.observation_order(curr_left_hit_group.insert_id);
469 uint32_t curr_left_obs_order = it.observation_order(curr_right_hit_group.insert_id);
470
471 while(curr_left_obs_order != VMAXINT32 &&
472 curr_right_obs_order != VMAXINT32)
473 {
474 while (curr_left_obs_order < curr_right_obs_order&&
475 curr_left_obs_order != VMAXINT32 && curr_right_obs_order != VMAXINT32)
476 {
477 // Get hit group
478
479 left_hs.next_read_hits(curr_left_hit_group);
480 curr_left_obs_order = it.observation_order(curr_left_hit_group.insert_id);
481 }
482
483 while (curr_left_obs_order > curr_right_obs_order &&
484 curr_left_obs_order != VMAXINT32 && curr_right_obs_order != VMAXINT32)
485 {
486 // Get hit group
487
488 right_hs.next_read_hits(curr_right_hit_group);
489 curr_right_obs_order = it.observation_order(curr_right_hit_group.insert_id);
490 }
491
492 while (curr_left_obs_order == curr_right_obs_order &&
493 curr_left_obs_order != VMAXINT32 && curr_right_obs_order != VMAXINT32)
494 {
495 if (coverage_attempts++ % 1000 == 0)
496 fprintf (stderr, "Adding covered motifs from pair %d\n", coverage_attempts);
497 visit_best_pairing(curr_left_hit_group, curr_right_hit_group, cov_map_visitor);
498
499 left_hs.next_read_hits(curr_left_hit_group);
500 curr_left_obs_order = it.observation_order(curr_left_hit_group.insert_id);
501
502 right_hs.next_read_hits(curr_right_hit_group);
503 curr_right_obs_order = it.observation_order(curr_right_hit_group.insert_id);
504 }
505 }
506 }
507
508 cov_map_visitor.finalize();
509 fprintf (stderr, "done\n");
510
511 ClosureJunctionSet fwd_splices;
512 ClosureJunctionSet rev_splices;
513
514 JunctionMapVisitor junc_map_visitor(fwd_splices, rev_splices, cov_map_visitor.finders);
515 fprintf (stderr, "Searching for closures...");
516 uint32_t closure_attempts = 0;
517
518 for (size_t num = 0; num < map1.size(); ++num)
519 {
520 map1[num].rewind();
521 map2[num].rewind();
522
523 HitStream left_hs = HitStream(map1[num].file, &hit_factory, false, true, false);
524 HitStream right_hs = HitStream(map2[num].file, &hit_factory, false, true, false);
525
526 HitsForRead curr_left_hit_group;
527 HitsForRead curr_right_hit_group;
528
529 left_hs.next_read_hits(curr_left_hit_group);
530 right_hs.next_read_hits(curr_right_hit_group);
531
532 uint32_t curr_right_obs_order = it.observation_order(curr_left_hit_group.insert_id);
533 uint32_t curr_left_obs_order = it.observation_order(curr_right_hit_group.insert_id);
534
535 while(curr_left_obs_order != VMAXINT32 &&
536 curr_right_obs_order != VMAXINT32)
537 {
538 while (curr_left_obs_order < curr_right_obs_order &&
539 curr_left_obs_order != VMAXINT32 && curr_right_obs_order != VMAXINT32)
540 {
541 // Get hit group
542
543 left_hs.next_read_hits(curr_left_hit_group);
544 curr_left_obs_order = it.observation_order(curr_left_hit_group.insert_id);
545 }
546
547 while (curr_left_obs_order > curr_right_obs_order &&
548 curr_left_obs_order != VMAXINT32 && curr_right_obs_order != VMAXINT32)
549 {
550 // Get hit group
551
552 right_hs.next_read_hits(curr_right_hit_group);
553 curr_right_obs_order = it.observation_order(curr_right_hit_group.insert_id);
554 }
555
556 while (curr_left_obs_order == curr_right_obs_order &&
557 curr_left_obs_order != VMAXINT32 && curr_right_obs_order != VMAXINT32)
558 {
559 if (closure_attempts++ % 1000 == 0)
560 fprintf (stderr, "Trying to close pair %d\n", closure_attempts);
561 visit_best_pairing(curr_left_hit_group, curr_right_hit_group, junc_map_visitor);
562 left_hs.next_read_hits(curr_left_hit_group);
563 curr_left_obs_order = it.observation_order(curr_left_hit_group.insert_id);
564
565 right_hs.next_read_hits(curr_right_hit_group);
566 curr_right_obs_order = it.observation_order(curr_right_hit_group.insert_id);
567 }
568 }
569 }
570
571 for (size_t num = 0; num < map1.size(); ++num)
572 {
573 map1[num].close();
574 map2[num].close();
575 }
576
577 fprintf(stderr, "%lu Forward strand splices\n", fwd_splices.size());
578 fprintf(stderr, "%lu Reverse strand splices\n", rev_splices.size());
579
580 fprintf (stderr, "done\n");
581 uint32_t num_potential_splices = 0;
582 fprintf (stderr, "Reporting possible junctions...");
583 map<uint32_t, pair<JunctionMapVisitor::JunctionTable, JunctionMapVisitor::JunctionTable> >::iterator f_itr;
584 f_itr = junc_map_visitor._finders.begin();
585
586 ClosureJunctionSet::iterator j_itr;
587 j_itr = fwd_splices.begin();
588 while (j_itr != fwd_splices.end())
589 {
590 fprintf (juncs_file,"%s\t%u\t%u\t%c\n",
591 rt.get_name(j_itr->refid),
592 j_itr->left,j_itr->right,'+');
593 ++num_potential_splices;
594 ++j_itr;
595 }
596
597 j_itr = rev_splices.begin();
598 while (j_itr != rev_splices.end())
599 {
600 fprintf (juncs_file,"%s\t%u\t%u\t%c\n",
601 rt.get_name(j_itr->refid),
602 j_itr->left,j_itr->right,'-');
603 ++num_potential_splices;
604 ++j_itr;
605 }
606
607 //accept_all_best_hits(best_status_for_inserts);
608 fprintf(stderr, "done\n");
609 fprintf(stderr, "Searched for closures between %d pairs\n", searched);
610 fprintf(stderr, "Successfully closed %d pairs\n", closed);
611
612 fprintf(stderr, "Found %d total possible splices\n", num_potential_splices);
613 }
614
615 void print_usage()
616 {
617 fprintf(stderr, "Usage: closure_juncs <closure.juncs> <ref.fa> <left_map.bwtout> <right_map.bwtout>\n");
618 }
619
620 int main(int argc, char** argv)
621 {
622 fprintf(stderr, "closure_juncs v%s (%s)\n", PACKAGE_VERSION, SVN_REVISION);
623 fprintf(stderr, "---------------------------\n");
624
625 int parse_ret = parse_options(argc, argv, print_usage);
626 if (parse_ret)
627 return parse_ret;
628
629 if(optind >= argc)
630 {
631 print_usage();
632 return 1;
633 }
634
635 string junctions_file_name = argv[optind++];
636
637 if(optind >= argc)
638 {
639 print_usage();
640 return 1;
641 }
642
643 string ref_fasta = argv[optind++];
644
645 if(optind >= argc)
646 {
647 print_usage();
648 return 1;
649 }
650
651 string left_file_list = argv[optind++];
652 vector<string> left_file_names;
653 vector<FZPipe> left_files;
654 tokenize(left_file_list, ",", left_file_names);
655
656 string unzcmd = getUnpackCmd(left_file_names[0], false);
657 for (size_t i = 0; i < left_file_names.size(); ++i)
658 {
659 FZPipe seg_file(left_file_names[i], unzcmd);
660 if (seg_file.file == NULL)
661 {
662 fprintf(stderr, "Error: cannot open file %s for reading\n",
663 left_file_names[i].c_str());
664 exit(1);
665 }
666 left_files.push_back(seg_file);
667 }
668
669 if(optind >= argc)
670 {
671 print_usage();
672 return 1;
673 }
674
675 string right_file_list = argv[optind++];
676 vector<string> right_file_names;
677 vector<FZPipe> right_files;
678 tokenize(right_file_list, ",", right_file_names);
679 for (size_t i = 0; i < right_file_names.size(); ++i)
680 {
681 FZPipe seg_file(right_file_names[i], unzcmd);
682 if (seg_file.file == NULL)
683 {
684 fprintf(stderr, "Error: cannot open %s for reading\n",
685 right_file_names[i].c_str());
686 exit(1);
687 }
688 right_files.push_back(seg_file);
689 }
690
691 ifstream ref_stream(ref_fasta.c_str(), ifstream::in);
692
693 FILE* splice_db = fopen(junctions_file_name.c_str(), "w");
694 if (splice_db == NULL)
695 {
696 fprintf(stderr, "Error: cannot open junctions file %s for writing\n",
697 junctions_file_name.c_str());
698 exit(1);
699 }
700
701 closure_driver(left_files,
702 right_files,
703 ref_stream,
704 splice_db);
705
706 return 0;
707 }