ViewVC Help
View File | Revision Log | Show Annotations | View Changeset | Root Listing
root/gclib/tophat_cpp/tophat_reports.cpp
Revision: 166
Committed: Fri Feb 10 02:45:06 2012 UTC (7 years, 8 months ago) by gpertea
File size: 52075 byte(s)
Log Message:
resync

Line File contents
1 /*
2 * tophat_reports.cpp
3 * TopHat
4 *
5 * Created by Cole Trapnell on 11/20/08.
6 * Copyright 2008 Cole Trapnell. All rights reserved.
7 *
8 */
9
10 #ifdef HAVE_CONFIG_H
11 #include <config.h>
12 #else
13 #define PACKAGE_VERSION "INTERNAL"
14 #define SVN_REVISION "XXX"
15 #endif
16
17 #include <cassert>
18 #include <cstdio>
19 #include <cstring>
20 #include <vector>
21 #include <string>
22 #include <map>
23 #include <algorithm>
24 #include <set>
25 #include <stdexcept>
26 #include <iostream>
27 #include <fstream>
28 #include <seqan/sequence.h>
29 #include <seqan/find.h>
30 #include <seqan/file.h>
31 #include <getopt.h>
32
33 #include <boost/thread.hpp>
34
35 #include "common.h"
36 #include "utils.h"
37 #include "bwt_map.h"
38 #include "junctions.h"
39 #include "insertions.h"
40 #include "deletions.h"
41 #include "fusions.h"
42 #include "align_status.h"
43 #include "fragments.h"
44 #include "wiggles.h"
45 #include "tokenize.h"
46 #include "reads.h"
47
48
49 #include "inserts.h"
50
51 using namespace std;
52 using namespace seqan;
53 using std::set;
54
55 // daehwan - this is redundancy, which should be removed.
56 void get_seqs(istream& ref_stream,
57 RefSequenceTable& rt,
58 bool keep_seqs = true)
59 {
60 while(ref_stream.good() && !ref_stream.eof())
61 {
62 RefSequenceTable::Sequence* ref_str = new RefSequenceTable::Sequence();
63 string name;
64 readMeta(ref_stream, name, Fasta());
65 string::size_type space_pos = name.find_first_of(" \t\r");
66 if (space_pos != string::npos)
67 {
68 name.resize(space_pos);
69 }
70 seqan::read(ref_stream, *ref_str, Fasta());
71
72 rt.get_id(name, keep_seqs ? ref_str : NULL, 0);
73 }
74 }
75
76 struct cmp_read_alignment
77 {
78 cmp_read_alignment(const JunctionSet& gtf_juncs) :
79 _gtf_juncs(&gtf_juncs) {}
80
81 bool operator() (const BowtieHit& l, const BowtieHit& r) const
82 {
83 FragmentAlignmentGrade gl(l, *_gtf_juncs);
84 FragmentAlignmentGrade gr(r, *_gtf_juncs);
85 bool better = gr < gl;
86 bool worse = gl < gr;
87
88 if (better && !worse)
89 return true;
90 else
91 return false;
92 }
93
94 const JunctionSet* _gtf_juncs;
95 };
96
97 void read_best_alignments(const HitsForRead& hits_for_read,
98 FragmentAlignmentGrade& best_grade,
99 HitsForRead& best_hits,
100 const JunctionSet& gtf_junctions)
101 {
102 const vector<BowtieHit>& hits = hits_for_read.hits;
103 for (size_t i = 0; i < hits.size(); ++i)
104 {
105 if (hits[i].edit_dist() > max_read_mismatches)
106 continue;
107
108 if (report_secondary_alignments)
109 {
110 best_hits.hits.push_back(hits[i]);
111 }
112 else
113 {
114 FragmentAlignmentGrade g(hits[i], gtf_junctions);
115
116 // Is the new status better than the current best one?
117 if (best_grade < g)
118 {
119 best_hits.hits.clear();
120 best_grade = g;
121 best_hits.hits.push_back(hits[i]);
122 }
123 else if (!(g < best_grade)) // is it just as good?
124 {
125 best_grade.num_alignments++;
126 best_hits.hits.push_back(hits[i]);
127 }
128 }
129 }
130
131 if (report_secondary_alignments && best_hits.hits.size() > 0)
132 {
133 cmp_read_alignment cmp(gtf_junctions);
134 sort(best_hits.hits.begin(), best_hits.hits.end(), cmp);
135 best_grade = FragmentAlignmentGrade(best_hits.hits[0], gtf_junctions);
136 best_grade.num_alignments = best_hits.hits.size();
137 }
138 }
139
140 void set_insert_alignment_grade(const BowtieHit& lh, const BowtieHit& rh, InsertAlignmentGrade& grade)
141 {
142 // max mate inner distance (genomic)
143 int min_mate_inner_dist = inner_dist_mean - inner_dist_std_dev;
144 if (max_mate_inner_dist == -1)
145 max_mate_inner_dist = inner_dist_mean + inner_dist_std_dev;
146
147 bool fusion = false;
148 if (fusion_search)
149 {
150 bool left_fusion = lh.fusion_opcode() != FUSION_NOTHING;
151 bool right_fusion = rh.fusion_opcode() != FUSION_NOTHING;
152 if (left_fusion && right_fusion)
153 return;
154
155 fusion = left_fusion || right_fusion;
156 if (!fusion && lh.ref_id() != rh.ref_id())
157 fusion = true;
158
159 if (!fusion && lh.ref_id() == rh.ref_id())
160 {
161 if (lh.antisense_align() == rh.antisense_align())
162 fusion = true;
163 else
164 {
165 int inter_dist = 0;
166 if (lh.antisense_align())
167 inter_dist = lh.left() - rh.right();
168 else
169 inter_dist = rh.left() - lh.right();
170
171 if (inter_dist < -(int)max_insertion_length || inter_dist > (int)fusion_min_dist)
172 fusion = true;
173 }
174 }
175 }
176
177 grade = InsertAlignmentGrade(lh, rh, min_mate_inner_dist, max_mate_inner_dist, fusion);
178 }
179
180 struct cmp_pair_alignment
181 {
182 cmp_pair_alignment(const JunctionSet& gtf_juncs) :
183 _gtf_juncs(&gtf_juncs) {}
184
185 bool operator() (const pair<BowtieHit, BowtieHit>& l, const pair<BowtieHit, BowtieHit>& r) const
186 {
187 InsertAlignmentGrade gl; set_insert_alignment_grade(l.first, l.second, gl);
188 InsertAlignmentGrade gr; set_insert_alignment_grade(r.first, r.second, gr);
189
190 bool better = gr < gl;
191 bool worse = gl < gr;
192
193 if (better && !worse)
194 return true;
195 else
196 return false;
197 }
198
199 const JunctionSet* _gtf_juncs;
200 };
201
202 void pair_best_alignments(const HitsForRead& left_hits,
203 const HitsForRead& right_hits,
204 InsertAlignmentGrade& best_grade,
205 vector<pair<BowtieHit, BowtieHit> >& best_hits,
206 const JunctionSet& gtf_junctions)
207 {
208 const vector<BowtieHit>& left = left_hits.hits;
209 const vector<BowtieHit>& right = right_hits.hits;
210
211 for (size_t i = 0; i < left.size(); ++i)
212 {
213 const BowtieHit& lh = left[i];
214 if (lh.edit_dist() > max_read_mismatches) continue;
215 for (size_t j = 0; j < right.size(); ++j)
216 {
217 const BowtieHit& rh = right[j];
218 if (rh.edit_dist() > max_read_mismatches) continue;
219
220 InsertAlignmentGrade g; set_insert_alignment_grade(lh, rh, g);
221
222 if (report_secondary_alignments)
223 {
224 best_hits.push_back(make_pair(lh, rh));
225 }
226 else
227 {
228 // Is the new status better than the current best one?
229 if (best_grade < g)
230 {
231 best_hits.clear();
232 best_grade = g;
233 best_hits.push_back(make_pair(lh, rh));
234 }
235 else if (!(g < best_grade))
236 {
237 best_hits.push_back(make_pair(lh, rh));
238 best_grade.num_alignments++;
239 }
240 }
241 }
242 }
243
244 if (report_secondary_alignments && best_hits.size() > 0)
245 {
246 cmp_pair_alignment cmp(gtf_junctions);
247 sort(best_hits.begin(), best_hits.end(), cmp);
248 set_insert_alignment_grade(best_hits[0].first, best_hits[0].second, best_grade);
249 best_grade.num_alignments = best_hits.size();
250 }
251 }
252
253 enum FragmentType {FRAG_UNPAIRED, FRAG_LEFT, FRAG_RIGHT};
254
255 void add_auxData(vector<string>& auxdata, vector<string>& sam_toks,
256 const RefSequenceTable& rt,const BowtieHit& bh, FragmentType insert_side,
257 int num_hits, const BowtieHit* next_hit, int hitIndex) {
258 bool XS_found = false;
259 if (sam_toks.size()>11) {
260
261 for (size_t i=11;i<sam_toks.size();++i) {
262 if (sam_toks[i].find("NH:i:")==string::npos &&
263 sam_toks[i].find("XS:i:")==string::npos)
264 auxdata.push_back(sam_toks[i]);
265
266 if (sam_toks[i].find("XS:A:")!=string::npos)
267 XS_found = true;
268 }
269 }
270 string aux("NH:i:");
271 str_appendInt(aux, num_hits);
272 auxdata.push_back(aux);
273 if (next_hit) {
274 const char* nh_ref_name = "=";
275 nh_ref_name = rt.get_name(next_hit->ref_id());
276 assert (nh_ref_name != NULL);
277 bool same_contig=(next_hit->ref_id()==bh.ref_id());
278 aux="CC:Z:";
279 aux+= (same_contig ? "=" : nh_ref_name);
280 auxdata.push_back(aux);
281 aux="CP:i:";
282 int nh_gpos=next_hit->left() + 1;
283 str_appendInt(aux, nh_gpos);
284 auxdata.push_back(aux);
285 } //has next_hit
286 // FIXME: this code is still a bit brittle, because it contains no
287 // consistency check that the mates are on opposite strands - a current protocol
288 // requirement, and that the strand indicated by the alignment is consistent
289 // with the orientation of the splices (though that should be handled upstream).
290 if (!XS_found) {
291 const string xs_f("XS:A:+");
292 const string xs_r("XS:A:-");
293 if (library_type == FR_FIRSTSTRAND) {
294 if (insert_side == FRAG_LEFT || insert_side == FRAG_UNPAIRED) {
295 if (bh.antisense_align())
296 auxdata.push_back(xs_f);
297 else
298 auxdata.push_back(xs_r);
299 }
300 else {
301 if (bh.antisense_align())
302 auxdata.push_back(xs_r);
303 else
304 auxdata.push_back(xs_f);
305 }
306 }
307 else if (library_type == FR_SECONDSTRAND) {
308 if (insert_side == FRAG_LEFT || insert_side == FRAG_UNPAIRED){
309 if (bh.antisense_align())
310 auxdata.push_back(xs_r);
311 else
312 auxdata.push_back(xs_f);
313 }
314 else
315 {
316 if (bh.antisense_align())
317 auxdata.push_back(xs_f);
318 else
319 auxdata.push_back(xs_r);
320 }
321 }
322 }
323 if (hitIndex >= 0)
324 {
325 string aux("HI:i:");
326 str_appendInt(aux, hitIndex);
327 auxdata.push_back(aux);
328 }
329 }
330
331 bool rewrite_sam_record(GBamWriter& bam_writer,
332 const RefSequenceTable& rt,
333 const BowtieHit& bh,
334 const char* bwt_buf,
335 const char* read_alt_name,
336 const FragmentAlignmentGrade& grade,
337 FragmentType insert_side,
338 int num_hits,
339 const BowtieHit* next_hit,
340 bool primary,
341 int hitIndex)
342 {
343 // Rewrite this hit, filling in the alt name, mate mapping
344 // and setting the pair flag
345 vector<string> sam_toks;
346 tokenize(bwt_buf, "\t", sam_toks);
347
348 string ref_name = sam_toks[2], ref_name2 = "";
349 char rebuf2[2048] = {0};
350 char cigar1[256] = {0}, cigar2[256] = {0};
351 string left_seq, right_seq, left_qual, right_qual;
352 int left = -1, left1 = -1, left2 = -1;
353 bool fusion_alignment = false;
354 size_t XF_index = 0;
355 for (size_t i = 11; i < sam_toks.size(); ++i)
356 {
357 const string& tok = sam_toks[i];
358 if (strncmp(tok.c_str(), "XF", 2) == 0)
359 {
360 fusion_alignment = true;
361 XF_index = i;
362
363 vector<string> tuple_fields;
364 tokenize(tok.c_str(), " ", tuple_fields);
365 vector<string> contigs;
366 tokenize(tuple_fields[1].c_str(), "-", contigs);
367 if (contigs.size() >= 2)
368 {
369 ref_name = contigs[0];
370 ref_name2 = contigs[1];
371 }
372
373 extract_partial_hits(bh, tuple_fields[4].c_str(), tuple_fields[5].c_str(),
374 cigar1, cigar2, left_seq, right_seq,
375 left_qual, right_qual, left1, left2);
376
377 break;
378 }
379 }
380
381 string qname(read_alt_name);
382 size_t slash_pos=qname.rfind('/');
383 if (slash_pos!=string::npos)
384 qname.resize(slash_pos);
385 //read_alt_name as QNAME
386 int flag=atoi(sam_toks[1].c_str()); //FLAG
387 if (insert_side != FRAG_UNPAIRED) {
388 //flag = atoi(sam_toks[1].c_str());
389 // mark this as a singleton mate
390 flag |= 0x0001;
391 if (insert_side == FRAG_LEFT)
392 flag |= 0x0040;
393 else if (insert_side == FRAG_RIGHT)
394 flag |= 0x0080;
395 flag |= 0x0008;
396 //char flag_buf[64];
397 //sprintf(flag_buf, "%d", flag);
398 //sam_toks[t] = flag_buf;
399 }
400 if (!primary)
401 flag |= 0x100;
402
403 int gpos=isdigit(sam_toks[3][0]) ? atoi(sam_toks[3].c_str()) : 0;
404 int mapQ=255;
405 if (grade.num_alignments > 1) {
406 double err_prob = 1 - (1.0 / grade.num_alignments);
407 mapQ = (int)(-10.0 * log(err_prob) / log(10.0));
408 }
409 int tlen =atoi(sam_toks[8].c_str()); //TLEN
410 int mate_pos=atoi(sam_toks[7].c_str());
411
412 GBamRecord* bamrec=NULL;
413 if (fusion_alignment) {
414 vector<string> auxdata;
415 add_auxData(auxdata, sam_toks, rt, bh, insert_side, num_hits, next_hit, hitIndex);
416 bamrec=bam_writer.new_record(qname.c_str(), flag, ref_name.c_str(), left1 + 1, mapQ,
417 cigar1, sam_toks[6].c_str(), mate_pos,
418 tlen, left_seq.c_str(), left_qual.c_str(), &auxdata);
419 bam_writer.write(bamrec);
420 delete bamrec;
421
422 auxdata.clear();
423 sam_toks[XF_index][5] = '2';
424 add_auxData(auxdata, sam_toks, rt, bh, insert_side, num_hits, next_hit, hitIndex);
425 bamrec=bam_writer.new_record(qname.c_str(), flag, ref_name2.c_str(), left2 + 1, mapQ,
426 cigar2, sam_toks[6].c_str(), mate_pos,
427 tlen, right_seq.c_str(), right_qual.c_str(), &auxdata);
428 bam_writer.write(bamrec);
429 delete bamrec;
430 } else {
431 vector<string> auxdata;
432 add_auxData(auxdata, sam_toks, rt, bh, insert_side, num_hits, next_hit, hitIndex);
433 bamrec=bam_writer.new_record(qname.c_str(), flag, sam_toks[2].c_str(), gpos, mapQ,
434 sam_toks[5].c_str(), sam_toks[6].c_str(), mate_pos,
435 tlen, sam_toks[9].c_str(), sam_toks[10].c_str(), &auxdata);
436 bam_writer.write(bamrec);
437 delete bamrec;
438 }
439
440 return true;
441 }
442
443 bool rewrite_sam_record(GBamWriter& bam_writer,
444 const RefSequenceTable& rt,
445 const BowtieHit& bh,
446 const char* bwt_buf,
447 const char* read_alt_name,
448 const InsertAlignmentGrade& grade,
449 FragmentType insert_side,
450 const BowtieHit* partner,
451 int num_hits,
452 const BowtieHit* next_hit,
453 bool primary,
454 int hitIndex)
455 {
456 // Rewrite this hit, filling in the alt name, mate mapping
457 // and setting the pair flag
458 vector<string> sam_toks;
459 tokenize(bwt_buf, "\t", sam_toks);
460 string qname(read_alt_name);
461 size_t slash_pos=qname.rfind('/');
462 if (slash_pos!=string::npos)
463 qname.resize(slash_pos);
464 //read_alt_name as QNAME
465 int flag = atoi(sam_toks[1].c_str());
466 // 0x0010 (strand of query) is assumed to be set correctly
467 // to begin with
468 flag |= 0x0001; //it must be paired
469 if (insert_side == FRAG_LEFT)
470 flag |= 0x0040;
471 else if (insert_side == FRAG_RIGHT)
472 flag |= 0x0080;
473 if (!primary)
474 flag |= 0x100;
475
476 string ref_name = sam_toks[2], ref_name2 = "";
477 char rebuf2[2048] = {0};
478 char cigar1[256] = {0}, cigar2[256] = {0};
479 string left_seq, right_seq, left_qual, right_qual;
480 int left = -1, left1 = -1, left2 = -1;
481 bool fusion_alignment = false;
482 size_t XF_tok_idx = 11;
483 for (; XF_tok_idx < sam_toks.size(); ++XF_tok_idx)
484 {
485 const string& tok = sam_toks[XF_tok_idx];
486 if (strncmp(tok.c_str(), "XF", 2) == 0)
487 {
488 fusion_alignment = true;
489
490 vector<string> tuple_fields;
491 tokenize(tok.c_str(), " ", tuple_fields);
492 vector<string> contigs;
493 tokenize(tuple_fields[1].c_str(), "-", contigs);
494 if (contigs.size() >= 2)
495 {
496 ref_name = contigs[0];
497 ref_name2 = contigs[1];
498 }
499
500 extract_partial_hits(bh, tuple_fields[4].c_str(), tuple_fields[5].c_str(),
501 cigar1, cigar2, left_seq, right_seq,
502 left_qual, right_qual, left1, left2);
503
504 break;
505 }
506 }
507
508 int gpos=isdigit(sam_toks[3][0]) ? atoi(sam_toks[3].c_str()) : 0;
509 int mapQ=255;
510 if (grade.num_alignments > 1) {
511 double err_prob = 1 - (1.0 / grade.num_alignments);
512 mapQ = (int)(-10.0 * log(err_prob) / log(10.0));
513 }
514 int tlen=0; //TLEN
515 int mate_pos=atoi(sam_toks[7].c_str());
516 if (partner) {
517 if (partner->ref_id()==bh.ref_id()) {
518 sam_toks[6] = "="; //same chromosome
519 //TLEN:
520 tlen = bh.left() < partner->left() ? partner->right() - bh.left() :
521 partner->left() - bh.right();
522 }
523
524 else { //partner on different chromosome/contig
525 sam_toks[6] = rt.get_name(partner->ref_id());
526 }
527 mate_pos = partner->left() + 1;
528 if (grade.happy())
529 flag |= 0x0002;
530 if (partner->antisense_align())
531 flag |= 0x0020;
532
533 if (partner->fusion_opcode() != FUSION_NOTHING)
534 {
535 char partner_pos[1024];
536 sprintf(partner_pos, "XP:Z:%s-%s %d", rt.get_name(partner->ref_id()), rt.get_name(partner->ref_id2()), partner->left() + 1);
537 sam_toks.push_back(partner_pos);
538 }
539 }
540 else {
541 sam_toks[6] = "*";
542 mate_pos = 0;
543 flag |= 0x0008;
544 }
545
546 GBamRecord* bamrec=NULL;
547 if (fusion_alignment) {
548 vector<string> auxdata;
549 add_auxData(auxdata, sam_toks, rt, bh, insert_side, num_hits, next_hit, hitIndex);
550 bamrec=bam_writer.new_record(qname.c_str(), flag, ref_name.c_str(), left1 + 1, mapQ,
551 cigar1, sam_toks[6].c_str(), mate_pos,
552 tlen, left_seq.c_str(), left_qual.c_str(), &auxdata);
553 bam_writer.write(bamrec);
554 delete bamrec;
555
556 auxdata.clear();
557 sam_toks[XF_tok_idx][5] = '2';
558 add_auxData(auxdata, sam_toks, rt, bh, insert_side, num_hits, next_hit, hitIndex);
559 bamrec=bam_writer.new_record(qname.c_str(), flag, ref_name2.c_str(), left2 + 1, mapQ,
560 cigar2, sam_toks[6].c_str(), mate_pos,
561 tlen, right_seq.c_str(), right_qual.c_str(), &auxdata);
562 bam_writer.write(bamrec);
563 delete bamrec;
564 } else {
565 vector<string> auxdata;
566 add_auxData(auxdata, sam_toks, rt, bh, insert_side, num_hits, next_hit, hitIndex);
567 bamrec=bam_writer.new_record(qname.c_str(), flag, sam_toks[2].c_str(), gpos, mapQ,
568 sam_toks[5].c_str(), sam_toks[6].c_str(), mate_pos,
569 tlen, sam_toks[9].c_str(), sam_toks[10].c_str(), &auxdata);
570 bam_writer.write(bamrec);
571 delete bamrec;
572 }
573
574 return true;
575 }
576
577 struct lex_hit_sort
578 {
579 lex_hit_sort(const RefSequenceTable& rt, const HitsForRead& hits)
580 : _rt(rt), _hits(hits)
581 {}
582
583 bool operator()(const uint32_t& l, const uint32_t& r) const
584 {
585 const BowtieHit& lhs = _hits.hits[l];
586 const BowtieHit& rhs = _hits.hits[r];
587
588 uint32_t l_id = lhs.ref_id();
589 uint32_t r_id = rhs.ref_id();
590 if (l_id != r_id)
591 return l_id < r_id;
592
593 return lhs.left() < rhs.left();
594 }
595
596 const RefSequenceTable& _rt;
597 const HitsForRead& _hits;
598 };
599
600 void print_sam_for_single(const RefSequenceTable& rt,
601 const HitsForRead& hits,
602 const FragmentAlignmentGrade& grade,
603 FragmentType frag_type,
604 Read& read,
605 GBamWriter& bam_writer,
606 FILE* um_out //write unmapped reads here
607 )
608 {
609 assert(!read.alt_name.empty());
610 if (hits.hits.empty())
611 return;
612
613 lex_hit_sort s(rt, hits);
614 vector<uint32_t> index_vector;
615
616 size_t i;
617 for (i = 0; i < hits.hits.size(); ++i)
618 index_vector.push_back(i);
619
620 sort(index_vector.begin(), index_vector.end(), s);
621
622 size_t primaryHit = 0;
623 if (!report_secondary_alignments)
624 primaryHit = random() % hits.hits.size();
625
626 bool multipleHits = (hits.hits.size() > 1);
627 for (i = 0; i < hits.hits.size(); ++i)
628 {
629 size_t index = index_vector[i];
630 bool primary = (index == primaryHit);
631 const BowtieHit& bh = hits.hits[index];
632 rewrite_sam_record(bam_writer, rt,
633 bh,
634 bh.hitfile_rec().c_str(),
635 read.alt_name.c_str(),
636 grade,
637 frag_type,
638 hits.hits.size(),
639 (i < hits.hits.size()-1) ? &(hits.hits[index_vector[i+1]]) : NULL,
640 primary,
641 (multipleHits? i: -1));
642 }
643 }
644
645 void print_sam_for_pair(const RefSequenceTable& rt,
646 const vector<pair<BowtieHit, BowtieHit> >& best_hits,
647 const InsertAlignmentGrade& grade,
648 ReadStream& left_reads_file,
649 ReadStream& right_reads_file,
650 GBamWriter& bam_writer,
651 FILE* left_um_out,
652 FILE* right_um_out,
653 uint64_t begin_id = 0,
654 uint64_t end_id = std::numeric_limits<uint64_t>::max())
655 {
656 Read left_read;
657 Read right_read;
658 if (best_hits.empty())
659 return;
660
661 size_t i;
662 HitsForRead right_hits;
663 for (i = 0; i < best_hits.size(); ++i)
664 right_hits.hits.push_back(best_hits[i].second);
665
666 size_t primaryHit = 0;
667 vector<uint32_t> index_vector;
668 lex_hit_sort s(rt, right_hits);
669 for (i = 0; i < right_hits.hits.size(); ++i)
670 index_vector.push_back(i);
671
672 sort(index_vector.begin(), index_vector.end(), s);
673
674 if (!report_secondary_alignments)
675 primaryHit = random() % right_hits.hits.size();
676
677 bool got_left_read = left_reads_file.getRead(best_hits[0].first.insert_id(), left_read,
678 reads_format, false, begin_id, end_id,
679 left_um_out, false);
680
681 bool got_right_read = right_reads_file.getRead(best_hits[0].first.insert_id(), right_read,
682 reads_format, false, begin_id, end_id,
683 right_um_out, false);
684
685 assert (got_left_read && got_right_read);
686 bool multipleHits = (best_hits.size() > 1);
687 for (i = 0; i < best_hits.size(); ++i)
688 {
689 size_t index = index_vector[i];
690 bool primary = (index == primaryHit);
691 const BowtieHit& right_bh = best_hits[index].second;
692 const BowtieHit& left_bh = best_hits[index].first;
693
694 rewrite_sam_record(bam_writer, rt,
695 right_bh,
696 right_bh.hitfile_rec().c_str(),
697 right_read.alt_name.c_str(),
698 grade,
699 FRAG_RIGHT,
700 &left_bh,
701 best_hits.size(),
702 (i < best_hits.size() - 1) ? &(best_hits[index_vector[i+1]].second) : NULL,
703 primary,
704 (multipleHits? i: -1));
705 rewrite_sam_record(bam_writer, rt,
706 left_bh,
707 left_bh.hitfile_rec().c_str(),
708 left_read.alt_name.c_str(),
709 grade,
710 FRAG_LEFT,
711 &right_bh,
712 best_hits.size(),
713 (i < best_hits.size() - 1) ? &(best_hits[index_vector[i+1]].first) : NULL,
714 primary,
715 (multipleHits? i: -1));
716 }
717 }
718
719 /**
720 * Given all of the hits for a particular read, update the read counts for insertions and deletions.
721 * @param hits hits The alignments for a particular read
722 * @param insertions Maps from an insertion to the number of supporting reads for that insertion
723 * @param deletions Maps from a deletion to the number of supporting reads for that deletion
724 */
725 void update_insertions_and_deletions(const HitsForRead& hits,
726 InsertionSet& insertions,
727 DeletionSet& deletions)
728 {
729 for (size_t i = 0; i < hits.hits.size(); ++i)
730 {
731 const BowtieHit& bh = hits.hits[i];
732 insertions_from_alignment(bh, insertions);
733 deletions_from_alignment(bh, deletions);
734 }
735 }
736
737
738 FusionSet empty_fusions;
739 void update_fusions(const HitsForRead& hits,
740 RefSequenceTable& rt,
741 FusionSet& fusions,
742 FusionSet& fusions_ref = empty_fusions)
743 {
744 if (hits.hits.size() > fusion_multireads)
745 return;
746
747 bool update_stat = fusions_ref.size() > 0;
748 for (size_t i = 0; i < hits.hits.size(); ++i)
749 {
750 const BowtieHit& bh = hits.hits[i];
751
752 if (bh.edit_dist() > fusion_read_mismatches)
753 continue;
754
755 // daehwan
756 #if 0
757 vector<Fusion> new_fusions;
758 fusions_from_spliced_hit(bh, new_fusions);
759
760 for(size_t i = 0; i < new_fusions.size(); ++i)
761 {
762 Fusion fusion = new_fusions[i];
763 if (fusion.left == 110343414 && fusion.right == 80211173 && hits.hits.size() > 1)
764 {
765 for (size_t t = 0; t < hits.hits.size(); ++t)
766 {
767 const BowtieHit& lh = hits.hits[t];
768 cout << "insert id: " << lh.insert_id() << endl;
769 cout << (lh.antisense_align() ? "-" : "+") << endl;
770 cout << lh.ref_id() << ": " << lh.left() << "-" << lh.right() << endl;
771 cout << lh.ref_id2() << ": " << print_cigar(lh.cigar()) << endl;
772 }
773 }
774 }
775 #endif
776
777 fusions_from_alignment(bh, fusions, rt, update_stat);
778
779 if (update_stat)
780 unsupport_fusions(bh, fusions, fusions_ref);
781 }
782 }
783
784 void update_junctions(const HitsForRead& hits,
785 JunctionSet& junctions)
786 {
787 for (size_t i = 0; i < hits.hits.size(); ++i)
788 {
789 const BowtieHit& bh = hits.hits[i];
790 junctions_from_alignment(bh, junctions);
791 }
792 }
793
794 // Extracts junctions from all the SAM hits (based on REF_SKIPs) in the hit file
795 // resets the stream when finished.
796 void exclude_hits_on_filtered_junctions(const JunctionSet& junctions, HitsForRead& hits)
797 {
798 HitsForRead remaining;
799 remaining.insert_id = hits.insert_id;
800
801 for (size_t i = 0; i < hits.hits.size(); ++i)
802 {
803 BowtieHit& bh = hits.hits[i];
804 if (bh.edit_dist() > max_read_mismatches)
805 continue;
806
807 bool filter_hit = false;
808 if (!bh.contiguous())
809 {
810 JunctionSet bh_juncs;
811 junctions_from_alignment(bh, bh_juncs);
812 for (JunctionSet::iterator itr = bh_juncs.begin();
813 itr != bh_juncs.end();
814 itr++)
815 {
816 const Junction& j = itr->first;
817 JunctionSet::const_iterator target = junctions.find(j);
818 if (target == junctions.end() || !target->second.accepted)
819 {
820 filter_hit = true;
821 break;
822 }
823 }
824 }
825 if (!filter_hit)
826 remaining.hits.push_back(bh);
827 }
828 hits = remaining;
829 }
830
831 // events include splice junction, indels, and fusion points.
832 struct ConsensusEventsWorker
833 {
834 void operator()()
835 {
836 ReadTable it;
837 vector<BAMHitFactory*> hit_factories;
838 hit_factories.push_back(new BAMHitFactory(it, *rt));
839 HitStream l_hs(left_map_fname, hit_factories.back(), false, true, true, true);
840 if (left_map_offset > 0)
841 l_hs.seek(left_map_offset);
842
843 hit_factories.push_back(new BAMHitFactory(it, *rt));
844 HitStream r_hs(right_map_fname, hit_factories.back(), false, true, true, true);
845 if (right_map_offset > 0)
846 r_hs.seek(right_map_offset);
847
848 HitsForRead curr_left_hit_group;
849 HitsForRead curr_right_hit_group;
850
851 l_hs.next_read_hits(curr_left_hit_group);
852 r_hs.next_read_hits(curr_right_hit_group);
853
854 uint32_t curr_left_obs_order = it.observation_order(curr_left_hit_group.insert_id);
855 uint32_t curr_right_obs_order = it.observation_order(curr_right_hit_group.insert_id);
856
857 // While we still have unreported hits...
858 while((curr_left_obs_order != VMAXINT32 || curr_right_obs_order != VMAXINT32) &&
859 (curr_left_obs_order < end_id || curr_right_obs_order < end_id))
860 {
861 // Chew up left singletons
862 while (curr_left_obs_order < curr_right_obs_order &&
863 curr_left_obs_order < end_id &&
864 curr_left_obs_order != VMAXINT32)
865 {
866 HitsForRead best_hits;
867 best_hits.insert_id = curr_left_obs_order;
868 FragmentAlignmentGrade best_grade;
869
870 // Process hits for left singleton, select best alignments
871 read_best_alignments(curr_left_hit_group, best_grade, best_hits, *gtf_junctions);
872 update_junctions(best_hits, *junctions);
873 update_fusions(best_hits, *rt, *fusions);
874
875 // Get next hit group
876 l_hs.next_read_hits(curr_left_hit_group);
877 curr_left_obs_order = it.observation_order(curr_left_hit_group.insert_id);
878 }
879
880 // Chew up right singletons
881 while (curr_left_obs_order > curr_right_obs_order &&
882 curr_right_obs_order < end_id &&
883 curr_right_obs_order != VMAXINT32)
884 {
885 HitsForRead best_hits;
886 best_hits.insert_id = curr_right_obs_order;
887 FragmentAlignmentGrade best_grade;
888 if (curr_right_obs_order >= begin_id)
889 {
890 // Process hit for right singleton, select best alignments
891 read_best_alignments(curr_right_hit_group, best_grade, best_hits, *gtf_junctions);
892 update_junctions(best_hits, *junctions);
893 update_fusions(best_hits, *rt, *fusions);
894 }
895
896 // Get next hit group
897 r_hs.next_read_hits(curr_right_hit_group);
898 curr_right_obs_order = it.observation_order(curr_right_hit_group.insert_id);
899 }
900
901 // Since we have both left hits and right hits for this insert,
902 // Find the best pairing and print both
903 while (curr_left_obs_order == curr_right_obs_order &&
904 curr_left_obs_order < end_id &&
905 curr_left_obs_order != VMAXINT32)
906 {
907 vector<pair<BowtieHit, BowtieHit> > best_hits;
908
909 InsertAlignmentGrade grade;
910 pair_best_alignments(curr_left_hit_group,
911 curr_right_hit_group,
912 grade,
913 best_hits,
914 *gtf_junctions);
915
916 HitsForRead single_best_hits;
917 single_best_hits.insert_id = curr_left_obs_order;
918
919 // this is too much memory copy - we will make it efficient soon
920 for (size_t i = 0; i < best_hits.size(); ++i)
921 {
922 single_best_hits.hits.push_back(best_hits[i].first);
923 single_best_hits.hits.push_back(best_hits[i].second);
924 }
925
926 update_junctions(single_best_hits, *junctions);
927 update_fusions(single_best_hits, *rt, *fusions);
928
929 l_hs.next_read_hits(curr_left_hit_group);
930 curr_left_obs_order = it.observation_order(curr_left_hit_group.insert_id);
931
932 r_hs.next_read_hits(curr_right_hit_group);
933 curr_right_obs_order = it.observation_order(curr_right_hit_group.insert_id);
934 }
935 }
936
937 for (size_t i = 0; i < hit_factories.size(); ++i)
938 delete hit_factories[i];
939
940 hit_factories.clear();
941 }
942
943 string left_map_fname;
944 string right_map_fname;
945
946 JunctionSet* junctions;
947 JunctionSet* gtf_junctions;
948 FusionSet* fusions;
949 RefSequenceTable* rt;
950
951 uint64_t begin_id;
952 uint64_t end_id;
953 int64_t left_map_offset;
954 int64_t right_map_offset;
955 };
956
957 struct ReportWorker
958 {
959 void operator()()
960 {
961 ReadTable it;
962 GBamWriter bam_writer(bam_output_fname.c_str(), sam_header.c_str());
963
964 ReadStream left_reads_file(left_reads_fname);
965 if (left_reads_file.file() == NULL)
966 err_die("Error: cannot open %s for reading\n", left_reads_fname.c_str());
967
968 if (left_reads_offset > 0)
969 left_reads_file.seek(left_reads_offset);
970
971 if (!zpacker.empty()) left_um_fname+=".z";
972 FZPipe left_um_out;
973 if (left_um_out.openWrite(left_um_fname.c_str(), zpacker)==NULL)
974 err_die("Error: cannot open file %s for writing!\n",left_um_fname.c_str());
975
976 ReadStream right_reads_file(right_reads_fname);
977 if (right_reads_offset > 0)
978 right_reads_file.seek(right_reads_offset);
979
980 FZPipe right_um_out;
981 if (right_reads_fname != "")
982 {
983 if (!zpacker.empty()) right_um_fname+=".z";
984 if (right_um_out.openWrite(right_um_fname.c_str(), zpacker)==NULL)
985 err_die("Error: cannot open file %s for writing!\n",right_um_fname.c_str());
986 }
987
988 vector<BAMHitFactory*> hit_factories;
989 hit_factories.push_back(new BAMHitFactory(it, *rt));
990 HitStream left_hs(left_map_fname, hit_factories.back(), false, true, true, true);
991 if (left_map_offset > 0)
992 left_hs.seek(left_map_offset);
993
994 hit_factories.push_back(new BAMHitFactory(it, *rt));
995 HitStream right_hs(right_map_fname, hit_factories.back(), false, true, true, true);
996 if (right_map_offset > 0)
997 right_hs.seek(right_map_offset);
998
999 HitsForRead curr_left_hit_group;
1000 HitsForRead curr_right_hit_group;
1001
1002 left_hs.next_read_hits(curr_left_hit_group);
1003 right_hs.next_read_hits(curr_right_hit_group);
1004
1005 uint64_t curr_left_obs_order = it.observation_order(curr_left_hit_group.insert_id);
1006 uint64_t curr_right_obs_order = it.observation_order(curr_right_hit_group.insert_id);
1007
1008 // While we still have unreported hits...
1009 Read l_read;
1010 Read r_read;
1011 while((curr_left_obs_order != VMAXINT32 || curr_right_obs_order != VMAXINT32) &&
1012 (curr_left_obs_order < end_id || curr_right_obs_order < end_id))
1013 {
1014 // Chew up left singletons (pairs with right reads unmapped)
1015 while (curr_left_obs_order < curr_right_obs_order &&
1016 curr_left_obs_order < end_id &&
1017 curr_left_obs_order != VMAXINT32)
1018 {
1019 HitsForRead best_hits;
1020 best_hits.insert_id = curr_left_obs_order;
1021 FragmentAlignmentGrade grade;
1022 bool got_read=left_reads_file.getRead(curr_left_obs_order, l_read,
1023 reads_format, false, begin_id, end_id,
1024 left_um_out.file, false);
1025 //assert(got_read);
1026
1027 if (right_reads_file.file()) {
1028 /*
1029 fprintf(left_um_out.file, "@%s #MAPPED#\n%s\n+\n%s\n", l_read.alt_name.c_str(),
1030 l_read.seq.c_str(), l_read.qual.c_str());
1031 */
1032 got_read=right_reads_file.getRead(curr_left_obs_order, r_read,
1033 reads_format, false, begin_id, end_id,
1034 right_um_out.file, true);
1035 //assert(got_read);
1036 }
1037
1038 exclude_hits_on_filtered_junctions(*junctions, curr_left_hit_group);
1039
1040 // Process hits for left singleton, select best alignments
1041 read_best_alignments(curr_left_hit_group, grade, best_hits, *gtf_junctions);
1042
1043 if (best_hits.hits.size() > max_multihits)
1044 {
1045 best_hits.hits.erase(best_hits.hits.begin() + max_multihits, best_hits.hits.end());
1046 grade.num_alignments = best_hits.hits.size();
1047 }
1048
1049 if (best_hits.hits.size() > 0)
1050 {
1051 update_junctions(best_hits, *final_junctions);
1052 update_insertions_and_deletions(best_hits, *final_insertions, *final_deletions);
1053 update_fusions(best_hits, *rt, *final_fusions, *fusions);
1054
1055 print_sam_for_single(*rt,
1056 best_hits,
1057 grade,
1058 (right_map_fname.empty() ? FRAG_UNPAIRED : FRAG_LEFT),
1059 l_read,
1060 bam_writer,
1061 left_um_out.file);
1062 }
1063 // Get next hit group
1064 left_hs.next_read_hits(curr_left_hit_group);
1065 curr_left_obs_order = it.observation_order(curr_left_hit_group.insert_id);
1066 } //left singletons
1067
1068 // Chew up right singletons
1069 while (curr_left_obs_order > curr_right_obs_order &&
1070 curr_right_obs_order < end_id &&
1071 curr_right_obs_order != VMAXINT32)
1072 {
1073 HitsForRead best_hits;
1074 best_hits.insert_id = curr_right_obs_order;
1075 FragmentAlignmentGrade grade;
1076
1077 if (curr_right_obs_order >= begin_id)
1078 {
1079 bool got_read=right_reads_file.getRead(curr_right_obs_order, r_read,
1080 reads_format, false, begin_id, end_id,
1081 right_um_out.file, false);
1082
1083 //assert(got_read);
1084 /*
1085 fprintf(right_um_out.file, "@%s #MAPPED#\n%s\n+\n%s\n", r_read.alt_name.c_str(),
1086 r_read.seq.c_str(), r_read.qual.c_str());
1087 */
1088 got_read=left_reads_file.getRead(curr_right_obs_order, l_read,
1089 reads_format, false, begin_id, end_id,
1090 left_um_out.file, true);
1091 //assert(got_read);
1092
1093 exclude_hits_on_filtered_junctions(*junctions, curr_right_hit_group);
1094
1095 // Process hit for right singleton, select best alignments
1096 read_best_alignments(curr_right_hit_group, grade, best_hits, *gtf_junctions);
1097
1098 if (best_hits.hits.size() > max_multihits)
1099 {
1100 best_hits.hits.erase(best_hits.hits.begin() + max_multihits, best_hits.hits.end());
1101 grade.num_alignments = best_hits.hits.size();
1102 }
1103
1104 if (best_hits.hits.size() > 0)
1105 {
1106 update_junctions(best_hits, *final_junctions);
1107 update_insertions_and_deletions(best_hits, *final_insertions, *final_deletions);
1108 update_fusions(best_hits, *rt, *final_fusions, *fusions);
1109
1110 print_sam_for_single(*rt,
1111 best_hits,
1112 grade,
1113 FRAG_RIGHT,
1114 r_read,
1115 bam_writer,
1116 right_um_out.file);
1117 }
1118 }
1119
1120 // Get next hit group
1121 right_hs.next_read_hits(curr_right_hit_group);
1122 curr_right_obs_order = it.observation_order(curr_right_hit_group.insert_id);
1123 }
1124
1125 // Since we have both left hits and right hits for this insert,
1126 // Find the best pairing and print both
1127 while (curr_left_obs_order == curr_right_obs_order &&
1128 curr_left_obs_order < end_id &&
1129 curr_left_obs_order != VMAXINT32)
1130 {
1131 exclude_hits_on_filtered_junctions(*junctions, curr_left_hit_group);
1132 exclude_hits_on_filtered_junctions(*junctions, curr_right_hit_group);
1133 if (curr_left_hit_group.hits.empty())
1134 { //only right read mapped
1135 //write it in the mapped file with the #MAPPED# flag
1136
1137 bool got_read=right_reads_file.getRead(curr_left_obs_order, r_read,
1138 reads_format, false, begin_id, end_id,
1139 right_um_out.file, false);
1140 //assert(got_read);
1141 /*
1142 fprintf(right_um_out.file, "@%s #MAPPED#\n%s\n+\n%s\n", r_read.alt_name.c_str(),
1143 r_read.seq.c_str(), r_read.qual.c_str());
1144 */
1145 HitsForRead right_best_hits;
1146 right_best_hits.insert_id = curr_right_obs_order;
1147
1148 FragmentAlignmentGrade grade;
1149 read_best_alignments(curr_right_hit_group, grade, right_best_hits, *gtf_junctions);
1150
1151 if (right_best_hits.hits.size() > max_multihits)
1152 {
1153 right_best_hits.hits.erase(right_best_hits.hits.begin() + max_multihits, right_best_hits.hits.end());
1154 grade.num_alignments = right_best_hits.hits.size();
1155 }
1156
1157 if (right_best_hits.hits.size() > 0)
1158 {
1159 update_junctions(right_best_hits, *final_junctions);
1160 update_insertions_and_deletions(right_best_hits, *final_insertions, *final_deletions);
1161 update_fusions(right_best_hits, *rt, *final_fusions, *fusions);
1162
1163 print_sam_for_single(*rt,
1164 right_best_hits,
1165 grade,
1166 FRAG_RIGHT,
1167 r_read,
1168 bam_writer,
1169 right_um_out.file);
1170 }
1171 }
1172 else if (curr_right_hit_group.hits.empty())
1173 {
1174 HitsForRead left_best_hits;
1175 left_best_hits.insert_id = curr_left_obs_order;
1176 //only left read mapped
1177 bool got_read=left_reads_file.getRead(curr_left_obs_order, l_read,
1178 reads_format, false, begin_id, end_id,
1179 left_um_out.file, false);
1180 //assert(got_read);
1181 /*
1182 fprintf(left_um_out.file, "@%s #MAPPED#\n%s\n+\n%s\n", l_read.alt_name.c_str(),
1183 l_read.seq.c_str(), l_read.qual.c_str());
1184 */
1185 FragmentAlignmentGrade grade;
1186 // Process hits for left singleton, select best alignments
1187 read_best_alignments(curr_left_hit_group, grade, left_best_hits, *gtf_junctions);
1188
1189 if (left_best_hits.hits.size() > max_multihits)
1190 {
1191 left_best_hits.hits.erase(left_best_hits.hits.begin() + max_multihits, left_best_hits.hits.end());
1192 grade.num_alignments = left_best_hits.hits.size();
1193 }
1194
1195 if (left_best_hits.hits.size() > 0)
1196 {
1197 update_junctions(left_best_hits, *final_junctions);
1198 update_insertions_and_deletions(left_best_hits, *final_insertions, *final_deletions);
1199 update_fusions(left_best_hits, *rt, *final_fusions, *fusions);
1200
1201 print_sam_for_single(*rt,
1202 left_best_hits,
1203 grade,
1204 FRAG_LEFT,
1205 l_read,
1206 bam_writer,
1207 left_um_out.file);
1208 }
1209 }
1210 else
1211 {
1212 vector<pair<BowtieHit, BowtieHit> > best_hits;
1213
1214 InsertAlignmentGrade grade;
1215 pair_best_alignments(curr_left_hit_group,
1216 curr_right_hit_group,
1217 grade,
1218 best_hits,
1219 *gtf_junctions);
1220
1221 if (best_hits.size() > max_multihits)
1222 {
1223 best_hits.erase(best_hits.begin() + max_multihits, best_hits.end());
1224 grade.num_alignments = best_hits.size();
1225 }
1226
1227 //hits for both left and right reads
1228 HitsForRead single_best_hits;
1229 single_best_hits.insert_id = curr_left_obs_order;
1230 for (size_t i = 0; i < best_hits.size(); ++i)
1231 {
1232 single_best_hits.hits.push_back(best_hits[i].first);
1233 single_best_hits.hits.push_back(best_hits[i].second);
1234 }
1235
1236 if (single_best_hits.hits.size() > 0)
1237 {
1238 update_junctions(single_best_hits, *final_junctions);
1239 update_insertions_and_deletions(single_best_hits, *final_insertions, *final_deletions);
1240
1241 pair_support(best_hits, *final_fusions, *fusions);
1242 update_fusions(single_best_hits, *rt, *final_fusions, *fusions);
1243
1244 print_sam_for_pair(*rt,
1245 best_hits,
1246 grade,
1247 left_reads_file,
1248 right_reads_file,
1249 bam_writer,
1250 left_um_out.file,
1251 right_um_out.file,
1252 begin_id,
1253 end_id);
1254 }
1255 }
1256
1257 left_hs.next_read_hits(curr_left_hit_group);
1258 curr_left_obs_order = it.observation_order(curr_left_hit_group.insert_id);
1259
1260 right_hs.next_read_hits(curr_right_hit_group);
1261 curr_right_obs_order = it.observation_order(curr_right_hit_group.insert_id);
1262 }
1263 } //while we still have unreported hits..
1264 //print the remaining unmapped reads at the end of each reads' stream
1265 left_reads_file.getRead(VMAXINT32, l_read,
1266 reads_format, false, begin_id, end_id,
1267 left_um_out.file, false);
1268 if (right_reads_file.file())
1269 right_reads_file.getRead(VMAXINT32, r_read,
1270 reads_format, false, begin_id, end_id,
1271 right_um_out.file, false);
1272
1273 left_um_out.close();
1274 right_um_out.close();
1275
1276 for (size_t i = 0; i < hit_factories.size(); ++i)
1277 delete hit_factories[i];
1278
1279 hit_factories.clear();
1280 }
1281
1282 string bam_output_fname;
1283 string sam_header_fname;
1284
1285 string left_reads_fname;
1286 string left_map_fname;
1287 string right_reads_fname;
1288 string right_map_fname;
1289
1290 string left_um_fname;
1291 string right_um_fname;
1292
1293 JunctionSet* gtf_junctions;
1294 JunctionSet* junctions;
1295 FusionSet* fusions;
1296 JunctionSet* final_junctions;
1297 InsertionSet* final_insertions;
1298 DeletionSet* final_deletions;
1299 FusionSet* final_fusions;
1300
1301 RefSequenceTable* rt;
1302
1303 uint64_t begin_id;
1304 uint64_t end_id;
1305 int64_t left_reads_offset;
1306 int64_t left_map_offset;
1307 int64_t right_reads_offset;
1308 int64_t right_map_offset;
1309 };
1310
1311 void driver(const string& bam_output_fname,
1312 istream& ref_stream,
1313 const string& left_map_fname,
1314 const string& left_reads_fname,
1315 const string& right_map_fname,
1316 const string& right_reads_fname,
1317 FILE* junctions_out,
1318 FILE* insertions_out,
1319 FILE* deletions_out,
1320 FILE* fusions_out)
1321 {
1322 if (!parallel)
1323 num_threads = 1;
1324
1325 RefSequenceTable rt(sam_header, true);
1326
1327 if (fusion_search)
1328 get_seqs(ref_stream, rt, true);
1329
1330 srandom(1);
1331
1332 JunctionSet gtf_junctions;
1333 if (!gtf_juncs.empty())
1334 {
1335 char splice_buf[2048];
1336 FILE* splice_coords = fopen(gtf_juncs.c_str(), "r");
1337 if (splice_coords)
1338 {
1339 while (fgets(splice_buf, sizeof(splice_buf), splice_coords))
1340 {
1341 char* nl = strrchr(splice_buf, '\n');
1342 char* buf = splice_buf;
1343 if (nl) *nl = 0;
1344
1345 char* ref_name = get_token((char**)&buf, "\t");
1346 char* scan_left_coord = get_token((char**)&buf, "\t");
1347 char* scan_right_coord = get_token((char**)&buf, "\t");
1348 char* orientation = get_token((char**)&buf, "\t");
1349
1350 if (!scan_left_coord || !scan_right_coord || !orientation)
1351 {
1352 fprintf(stderr,"Error: malformed splice coordinate record in %s\n:%s\n",
1353 gtf_juncs.c_str(), buf);
1354 exit(1);
1355 }
1356
1357 uint32_t ref_id = rt.get_id(ref_name, NULL, 0);
1358 uint32_t left_coord = atoi(scan_left_coord);
1359 uint32_t right_coord = atoi(scan_right_coord);
1360 bool antisense = *orientation == '-';
1361
1362 // add 1 to left_coord to meet BED format
1363 gtf_junctions.insert(make_pair<Junction, JunctionStats>(Junction(ref_id, left_coord + 1, right_coord, antisense), JunctionStats()));
1364 }
1365 }
1366 fprintf(stderr, "Loaded %d GFF junctions from %s.\n", (int)(gtf_junctions.size()), gtf_juncs.c_str());
1367 }
1368
1369 vector<uint64_t> read_ids;
1370 vector<vector<int64_t> > offsets;
1371 if (num_threads > 1)
1372 {
1373 vector<string> fnames;
1374 if (right_map_fname != "")
1375 {
1376 fnames.push_back(right_reads_fname);
1377 fnames.push_back(right_map_fname);
1378 }
1379 fnames.push_back(left_reads_fname);
1380 fnames.push_back(left_map_fname);
1381 bool enough_data = calculate_offsets(fnames, read_ids, offsets);
1382 if (!enough_data)
1383 num_threads = 1;
1384 }
1385
1386 JunctionSet vjunctions[num_threads];
1387 FusionSet vfusions[num_threads];
1388 vector<boost::thread*> threads;
1389 for (int i = 0; i < num_threads; ++i)
1390 {
1391 ConsensusEventsWorker worker;
1392
1393 worker.left_map_fname = left_map_fname;
1394 worker.right_map_fname = right_map_fname;
1395 worker.junctions = &vjunctions[i];
1396 worker.fusions = &vfusions[i];
1397 worker.gtf_junctions = &gtf_junctions;
1398 worker.rt = &rt;
1399
1400 worker.right_map_offset = 0;
1401
1402 if (i == 0)
1403 {
1404 worker.begin_id = 0;
1405 worker.left_map_offset = 0;
1406 }
1407 else
1408 {
1409 size_t offsets_size = offsets[i-1].size();
1410
1411 worker.begin_id = read_ids[i-1];
1412 worker.left_map_offset = offsets[i-1].back();
1413 if (offsets_size == 4)
1414 worker.right_map_offset = offsets[i-1][1];
1415 }
1416
1417 worker.end_id = (i+1 < num_threads) ? read_ids[i] : std::numeric_limits<uint64_t>::max();
1418
1419 if (num_threads > 1)
1420 threads.push_back(new boost::thread(worker));
1421 else
1422 worker();
1423 }
1424
1425 for (size_t i = 0; i < threads.size(); ++i)
1426 {
1427 threads[i]->join();
1428 delete threads[i];
1429 threads[i] = NULL;
1430 }
1431 threads.clear();
1432
1433 JunctionSet junctions = vjunctions[0];
1434 FusionSet fusions = vfusions[0];
1435 for (size_t i = 1; i < num_threads; ++i)
1436 {
1437 merge_with(junctions, vjunctions[i]);
1438 vjunctions[i].clear();
1439
1440 merge_with(fusions, vfusions[i]);
1441 vfusions[i].clear();
1442 }
1443
1444 size_t num_unfiltered_juncs = junctions.size();
1445 fprintf(stderr, "Loaded %lu junctions\n", (long unsigned int) num_unfiltered_juncs);
1446
1447 // Read hits, extract junctions, and toss the ones that arent strongly enough supported.
1448
1449 filter_junctions(junctions, gtf_junctions);
1450
1451 //size_t num_juncs_after_filter = junctions.size();
1452 //fprintf(stderr, "Filtered %lu junctions\n",
1453 // num_unfiltered_juncs - num_juncs_after_filter);
1454
1455 /*
1456 size_t small_overhangs = 0;
1457 for (JunctionSet::iterator i = junctions.begin(); i != junctions.end(); ++i)
1458 {
1459 if (i->second.accepted &&
1460 (i->second.left_extent < min_anchor_len || i->second.right_extent < min_anchor_len))
1461 {
1462 small_overhangs++;
1463 }
1464 }
1465
1466 if (small_overhangs >0)
1467 fprintf(stderr, "Warning: %lu small overhang junctions!\n", (long unsigned int)small_overhangs);
1468 */
1469
1470 JunctionSet vfinal_junctions[num_threads];
1471 InsertionSet vfinal_insertions[num_threads];
1472 DeletionSet vfinal_deletions[num_threads];
1473 FusionSet vfinal_fusions[num_threads];
1474
1475 for (int i = 0; i < num_threads; ++i)
1476 {
1477 ReportWorker worker;
1478
1479 char filename[1024] = {0};
1480 sprintf(filename, "%s%d.bam", bam_output_fname.c_str(), i);
1481 worker.bam_output_fname = filename;
1482 worker.sam_header_fname = sam_header;
1483 sprintf(filename, "%s/unmapped_left%d.fq", output_dir.c_str(), i);
1484 worker.left_um_fname = filename;
1485
1486 if (right_reads_fname != "")
1487 {
1488 sprintf(filename, "%s/unmapped_right%d.fq", output_dir.c_str(), i);
1489 worker.right_um_fname = filename;
1490 }
1491
1492 worker.left_reads_fname = left_reads_fname;
1493 worker.left_map_fname = left_map_fname;
1494 worker.right_reads_fname = right_reads_fname;
1495 worker.right_map_fname = right_map_fname;
1496
1497 worker.gtf_junctions = &gtf_junctions;
1498 worker.junctions = &junctions;
1499 worker.fusions = &fusions;
1500 worker.final_junctions = &vfinal_junctions[i];
1501 worker.final_insertions = &vfinal_insertions[i];
1502 worker.final_deletions = &vfinal_deletions[i];
1503 worker.final_fusions = &vfinal_fusions[i];
1504 worker.rt = &rt;
1505
1506 worker.right_reads_offset = 0;
1507 worker.right_map_offset = 0;
1508
1509 if (i == 0)
1510 {
1511 worker.begin_id = 0;
1512 worker.left_reads_offset = 0;
1513 worker.left_map_offset = 0;
1514 }
1515 else
1516 {
1517 size_t offsets_size = offsets[i-1].size();
1518
1519 worker.begin_id = read_ids[i-1];
1520 worker.left_reads_offset = offsets[i-1][offsets_size - 2];
1521 worker.left_map_offset = offsets[i-1].back();
1522 if (offsets_size == 4)
1523 {
1524 worker.right_reads_offset = offsets[i-1][0];
1525 worker.right_map_offset = offsets[i-1][1];
1526 }
1527 }
1528
1529 worker.end_id = (i+1 < num_threads) ? read_ids[i] : std::numeric_limits<uint64_t>::max();
1530
1531 if (num_threads > 1)
1532 threads.push_back(new boost::thread(worker));
1533 else
1534 worker();
1535 }
1536
1537 for (size_t i = 0; i < threads.size(); ++i)
1538 {
1539 threads[i]->join();
1540 delete threads[i];
1541 threads[i] = NULL;
1542 }
1543 threads.clear();
1544
1545 JunctionSet final_junctions = vfinal_junctions[0];
1546 InsertionSet final_insertions = vfinal_insertions[0];
1547 DeletionSet final_deletions = vfinal_deletions[0];
1548 FusionSet final_fusions = vfinal_fusions[0];
1549 for (size_t i = 1; i < num_threads; ++i)
1550 {
1551 merge_with(final_junctions, vfinal_junctions[i]);
1552 vfinal_junctions[i].clear();
1553
1554 merge_with(final_insertions, vfinal_insertions[i]);
1555 vfinal_insertions[i].clear();
1556
1557 merge_with(final_deletions, vfinal_deletions[i]);
1558 vfinal_deletions[i].clear();
1559
1560 merge_with(final_fusions, vfinal_fusions[i]);
1561 vfinal_fusions[i].clear();
1562 }
1563
1564 fprintf (stderr, "Reporting final accepted alignments...");
1565 fprintf (stderr, "done.\n");
1566
1567 //small_overhangs = 0;
1568 for (JunctionSet::iterator i = final_junctions.begin(); i != final_junctions.end();)
1569 {
1570 if (i->second.supporting_hits == 0 || i->second.left_extent < 8 || i->second.right_extent < 8)
1571 {
1572 final_junctions.erase(i++);
1573 }
1574 else
1575 {
1576 ++i;
1577 }
1578 }
1579
1580 // if (small_overhangs > 0)
1581 // fprintf(stderr, "Warning: %lu small overhang junctions!\n", small_overhangs);
1582
1583 fprintf (stderr, "Printing junction BED track...");
1584 print_junctions(junctions_out, final_junctions, rt);
1585 fprintf (stderr, "done\n");
1586
1587 fprintf (stderr, "Printing insertions...");
1588 print_insertions(insertions_out, final_insertions,rt);
1589 fclose(insertions_out);
1590 fprintf (stderr, "done\n");
1591
1592 fprintf (stderr, "Printing deletions...");
1593 print_deletions(deletions_out, final_deletions, rt);
1594 fclose(deletions_out);
1595 fprintf (stderr, "done\n");
1596
1597 if (fusion_search)
1598 {
1599 fprintf (stderr, "Printing fusions...");
1600 print_fusions(fusions_out, final_fusions, rt);
1601 fclose(fusions_out);
1602 fprintf (stderr, "done\n");
1603 }
1604
1605 fprintf(stderr, "Found %lu junctions from happy spliced reads\n", (long unsigned int)final_junctions.size());
1606 }
1607
1608 void print_usage()
1609 {
1610 fprintf(stderr, "Usage: tophat_reports <junctions.bed> <insertions.vcf> <deletions.vcf> <accepted_hits.sam> <left_map1,...,left_mapN> <left_reads.fq> [right_map1,...,right_mapN] [right_reads.fq]\n");
1611
1612 // fprintf(stderr, "Usage: tophat_reports <coverage.wig> <junctions.bed> <accepted_hits.sam> <map1.bwtout> [splice_map1.sbwtout]\n");
1613 }
1614
1615 int main(int argc, char** argv)
1616 {
1617 fprintf(stderr, "tophat_reports v%s (%s)\n", PACKAGE_VERSION, SVN_REVISION);
1618 fprintf(stderr, "---------------------------------------\n");
1619
1620 reads_format = FASTQ;
1621
1622 int parse_ret = parse_options(argc, argv, print_usage);
1623 if (parse_ret)
1624 return parse_ret;
1625
1626 if(optind >= argc)
1627 {
1628 print_usage();
1629 return 1;
1630 }
1631
1632 string ref_file_name = argv[optind++];
1633
1634 if(optind >= argc)
1635 {
1636 print_usage();
1637 return 1;
1638 }
1639
1640 string junctions_file_name = argv[optind++];
1641
1642 if(optind >= argc)
1643 {
1644 print_usage();
1645 return 1;
1646 }
1647
1648 string insertions_file_name = argv[optind++];
1649 if(optind >= argc)
1650 {
1651 print_usage();
1652 return 1;
1653 }
1654
1655 string deletions_file_name = argv[optind++];
1656 if(optind >= argc)
1657 {
1658 print_usage();
1659 return 1;
1660 }
1661
1662 string fusions_file_name = argv[optind++];
1663 if(optind >= argc)
1664 {
1665 print_usage();
1666 return 1;
1667 }
1668
1669 string accepted_hits_file_name = argv[optind++];
1670 if(optind >= argc)
1671 {
1672 print_usage();
1673 return 1;
1674 }
1675
1676 string left_map_filename = argv[optind++];
1677 if(optind >= argc)
1678 {
1679 print_usage();
1680 return 1;
1681 }
1682
1683 string left_reads_filename = argv[optind++];
1684 string unzcmd=getUnpackCmd(left_reads_filename, false);
1685
1686 string right_map_filename;
1687 string right_reads_filename;
1688
1689 if (optind < argc)
1690 {
1691 right_map_filename = argv[optind++];
1692 if(optind >= argc) {
1693 print_usage();
1694 return 1;
1695 }
1696 right_reads_filename=argv[optind++];
1697 }
1698
1699 ifstream ref_stream(ref_file_name.c_str(), ifstream::in);
1700 if (!ref_stream.good())
1701 {
1702 fprintf(stderr, "Error: cannot open %s for reading\n",
1703 ref_file_name.c_str());
1704 exit(1);
1705 }
1706
1707 FILE* junctions_file = fopen(junctions_file_name.c_str(), "w");
1708 if (junctions_file == NULL)
1709 {
1710 fprintf(stderr, "Error: cannot open BED file %s for writing\n",
1711 junctions_file_name.c_str());
1712 exit(1);
1713 }
1714
1715 FILE* insertions_file = fopen(insertions_file_name.c_str(), "w");
1716 if (insertions_file == NULL)
1717 {
1718 fprintf(stderr, "Error: cannot open VCF file %s for writing\n",
1719 insertions_file_name.c_str());
1720 exit(1);
1721 }
1722
1723 FILE* deletions_file = fopen(deletions_file_name.c_str(), "w");
1724 if (deletions_file == NULL)
1725 {
1726 fprintf(stderr, "Error: cannot open VCF file %s for writing\n",
1727 deletions_file_name.c_str());
1728 exit(1);
1729 }
1730
1731 FILE* fusions_file = NULL;
1732 if (fusion_search)
1733 {
1734 fusions_file = fopen(fusions_file_name.c_str(), "w");
1735 if (fusions_file == NULL)
1736 {
1737 fprintf(stderr, "Error: cannot open VCF file %s for writing\n",
1738 fusions_file_name.c_str());
1739 exit(1);
1740 }
1741 }
1742
1743 driver(accepted_hits_file_name,
1744 ref_stream,
1745 left_map_filename,
1746 left_reads_filename,
1747 right_map_filename,
1748 right_reads_filename,
1749 junctions_file,
1750 insertions_file,
1751 deletions_file,
1752 fusions_file);
1753
1754 return 0;
1755 }