ViewVC Help
View File | Revision Log | Show Annotations | View Changeset | Root Listing
root/gclib/tophat_cpp/tophat_reports.cpp
Revision: 233
Committed: Wed Apr 11 21:05:01 2012 UTC (7 years, 7 months ago) by gpertea
File size: 55083 byte(s)
Log Message:
Line File contents
1 /*
2 * tophat_reports.cpp
3 * TopHat
4 *
5 * Created by Cole Trapnell on 11/20/08.
6 * Copyright 2008 Cole Trapnell. All rights reserved.
7 *
8 */
9
10 #ifdef HAVE_CONFIG_H
11 #include <config.h>
12 #else
13 #define PACKAGE_VERSION "INTERNAL"
14 #define SVN_REVISION "XXX"
15 #endif
16
17 #include <cassert>
18 #include <cstdio>
19 #include <cstring>
20 #include <vector>
21 #include <string>
22 #include <map>
23 #include <algorithm>
24 #include <set>
25 #include <stdexcept>
26 #include <iostream>
27 #include <fstream>
28 #include <seqan/sequence.h>
29 #include <seqan/find.h>
30 #include <seqan/file.h>
31 #include <getopt.h>
32
33 #include <boost/thread.hpp>
34
35 #include "common.h"
36 #include "utils.h"
37 #include "bwt_map.h"
38 #include "junctions.h"
39 #include "insertions.h"
40 #include "deletions.h"
41 #include "fusions.h"
42 #include "align_status.h"
43 #include "fragments.h"
44 #include "wiggles.h"
45 #include "tokenize.h"
46 #include "reads.h"
47 #include "coverage.h"
48
49
50 #include "inserts.h"
51
52 using namespace std;
53 using namespace seqan;
54 using std::set;
55
56 static const JunctionSet empty_junctions;
57 static const InsertionSet empty_insertions;
58 static const DeletionSet empty_deletions;
59 static const FusionSet empty_fusions;
60 static const Coverage empty_coverage;
61
62 // daehwan - this is redundancy, which should be removed.
63 void get_seqs(istream& ref_stream,
64 RefSequenceTable& rt,
65 bool keep_seqs = true)
66 {
67 while(ref_stream.good() && !ref_stream.eof())
68 {
69 RefSequenceTable::Sequence* ref_str = new RefSequenceTable::Sequence();
70 string name;
71 readMeta(ref_stream, name, Fasta());
72 string::size_type space_pos = name.find_first_of(" \t\r");
73 if (space_pos != string::npos)
74 {
75 name.resize(space_pos);
76 }
77 seqan::read(ref_stream, *ref_str, Fasta());
78
79 rt.get_id(name, keep_seqs ? ref_str : NULL, 0);
80 }
81 }
82
83 struct cmp_read_alignment
84 {
85 bool operator() (const BowtieHit& l, const BowtieHit& r) const
86 {
87 return l.alignment_score() > r.alignment_score();
88 }
89 };
90
91 void read_best_alignments(const HitsForRead& hits_for_read,
92 HitsForRead& best_hits,
93 const JunctionSet& gtf_junctions,
94 const JunctionSet& junctions = empty_junctions,
95 const InsertionSet& insertions = empty_insertions,
96 const DeletionSet& deletions = empty_deletions,
97 const FusionSet& fusions = empty_fusions,
98 const Coverage& coverage = empty_coverage,
99 bool final_report = false)
100 {
101 const vector<BowtieHit>& hits = hits_for_read.hits;
102
103 if (hits.size() >= max_multihits * 5)
104 return;
105
106 for (size_t i = 0; i < hits.size(); ++i)
107 {
108 if (hits[i].edit_dist() > max_read_mismatches)
109 continue;
110
111 BowtieHit hit = hits[i];
112 AlignStatus align_status(hit, gtf_junctions,
113 junctions, insertions, deletions, fusions, coverage);
114 hit.alignment_score(align_status._alignment_score);
115
116 if (report_secondary_alignments || !final_report)
117 {
118 best_hits.hits.push_back(hit);
119 }
120 else
121 {
122 // Is the new status better than the current best one?
123 if (best_hits.hits.size() == 0 || cmp_read_alignment()(hit, best_hits.hits[0]))
124 {
125 best_hits.hits.clear();
126 best_hits.hits.push_back(hit);
127 }
128 else if (!cmp_read_alignment()(best_hits.hits[0], hit)) // is it just as good?
129 {
130 best_hits.hits.push_back(hit);
131 }
132 }
133 }
134
135 if ((report_secondary_alignments || !final_report) && best_hits.hits.size() > 0)
136 {
137 sort(best_hits.hits.begin(), best_hits.hits.end(), cmp_read_alignment());
138 }
139
140 if (final_report)
141 {
142 if (best_hits.hits.size() > max_multihits)
143 best_hits.hits.erase(best_hits.hits.begin() + max_multihits, best_hits.hits.end());
144 }
145 }
146
147 bool set_insert_alignment_grade(const BowtieHit& lh, const BowtieHit& rh, InsertAlignmentGrade& grade)
148 {
149 bool fusion = false;
150 bool left_fusion = lh.fusion_opcode() != FUSION_NOTHING;
151 bool right_fusion = rh.fusion_opcode() != FUSION_NOTHING;
152 if (left_fusion && right_fusion)
153 return false;
154
155 fusion = left_fusion || right_fusion;
156 if (!fusion && lh.ref_id() != rh.ref_id())
157 fusion = true;
158
159 if (!fusion && lh.ref_id() == rh.ref_id())
160 {
161 if (lh.antisense_align() == rh.antisense_align())
162 fusion = true;
163 else
164 {
165 int inner_dist = 0;
166 if (lh.antisense_align())
167 // used rh.left() instead of rh.right() for the cases,
168 // where reads overlap with each other or reads span introns
169 inner_dist = lh.left() - rh.left();
170 else
171 inner_dist = rh.left() - lh.left();
172
173 if (inner_dist < 0 || inner_dist > (int)fusion_min_dist)
174 fusion = true;
175 }
176 }
177
178 grade = InsertAlignmentGrade(lh, rh, fusion);
179
180 return true;
181 }
182
183 struct cmp_pair_alignment
184 {
185 cmp_pair_alignment(const JunctionSet& gtf_juncs) :
186 _gtf_juncs(&gtf_juncs) {}
187
188 bool operator() (const pair<BowtieHit, BowtieHit>& l, const pair<BowtieHit, BowtieHit>& r) const
189 {
190 InsertAlignmentGrade gl; set_insert_alignment_grade(l.first, l.second, gl);
191 InsertAlignmentGrade gr; set_insert_alignment_grade(r.first, r.second, gr);
192
193 bool better = gr < gl;
194 bool worse = gl < gr;
195
196 if (better && !worse)
197 return true;
198 else
199 return false;
200 }
201
202 const JunctionSet* _gtf_juncs;
203 };
204
205 void pair_best_alignments(const HitsForRead& left_hits,
206 const HitsForRead& right_hits,
207 InsertAlignmentGrade& best_grade,
208 vector<pair<BowtieHit, BowtieHit> >& best_hits,
209 const JunctionSet& gtf_junctions,
210 const JunctionSet& junctions = empty_junctions,
211 const InsertionSet& insertions = empty_insertions,
212 const DeletionSet& deletions = empty_deletions,
213 const FusionSet& fusions = empty_fusions,
214 const Coverage& coverage = empty_coverage,
215 bool final_report = false)
216 {
217 const vector<BowtieHit>& left = left_hits.hits;
218 const vector<BowtieHit>& right = right_hits.hits;
219
220 if (left.size() >= max_multihits * 5 || right.size() >= max_multihits * 5)
221 return;
222
223 for (size_t i = 0; i < left.size(); ++i)
224 {
225 if (left[i].edit_dist() > max_read_mismatches) continue;
226
227 BowtieHit lh = left[i];
228 AlignStatus align_status(lh, gtf_junctions,
229 junctions, insertions, deletions, fusions, coverage);
230 lh.alignment_score(align_status._alignment_score);
231
232 for (size_t j = 0; j < right.size(); ++j)
233 {
234 if (right[j].edit_dist() > max_read_mismatches) continue;
235
236 BowtieHit rh = right[j];
237 AlignStatus align_status(rh, gtf_junctions,
238 junctions, insertions, deletions, fusions, coverage);
239 rh.alignment_score(align_status._alignment_score);
240 InsertAlignmentGrade g;
241 bool allowed = set_insert_alignment_grade(lh, rh, g);
242
243 if (!allowed) continue;
244 if (!fusion_search && !report_discordant_pair_alignments && g.is_fusion()) continue;
245
246 if (report_secondary_alignments || !final_report)
247 {
248 best_hits.push_back(make_pair(lh, rh));
249 }
250 else
251 {
252 // Is the new status better than the current best one?
253 if (best_grade < g)
254 {
255 best_hits.clear();
256 best_grade = g;
257 best_hits.push_back(make_pair(lh, rh));
258 }
259 else if (!(g < best_grade))
260 {
261 best_hits.push_back(make_pair(lh, rh));
262 }
263 }
264 }
265 }
266
267 if ((report_secondary_alignments || !final_report) && best_hits.size() > 0)
268 {
269 cmp_pair_alignment cmp(gtf_junctions);
270 sort(best_hits.begin(), best_hits.end(), cmp);
271 set_insert_alignment_grade(best_hits[0].first, best_hits[0].second, best_grade);
272 }
273
274 if (final_report)
275 {
276 if (best_hits.size() > max_multihits)
277 best_hits.erase(best_hits.begin() + max_multihits, best_hits.end());
278 }
279
280 best_grade.num_alignments = best_hits.size();
281 }
282
283 enum FragmentType {FRAG_UNPAIRED, FRAG_LEFT, FRAG_RIGHT};
284
285 void add_auxData(vector<string>& auxdata, vector<string>& sam_toks,
286 const RefSequenceTable& rt,const BowtieHit& bh, FragmentType insert_side,
287 int num_hits, const BowtieHit* next_hit, int hitIndex) {
288 bool XS_found = false;
289 if (sam_toks.size()>11) {
290
291 for (size_t i=11;i<sam_toks.size();++i) {
292 if (sam_toks[i].find("NH:i:")==string::npos &&
293 sam_toks[i].find("XS:i:")==string::npos)
294 auxdata.push_back(sam_toks[i]);
295
296 if (sam_toks[i].find("XS:A:")!=string::npos)
297 XS_found = true;
298 }
299 }
300 string aux("NH:i:");
301 str_appendInt(aux, num_hits);
302 auxdata.push_back(aux);
303 if (next_hit) {
304 const char* nh_ref_name = "=";
305 nh_ref_name = rt.get_name(next_hit->ref_id());
306 assert (nh_ref_name != NULL);
307 bool same_contig=(next_hit->ref_id()==bh.ref_id());
308 aux="CC:Z:";
309 aux+= (same_contig ? "=" : nh_ref_name);
310 auxdata.push_back(aux);
311 aux="CP:i:";
312 int nh_gpos=next_hit->left() + 1;
313 str_appendInt(aux, nh_gpos);
314 auxdata.push_back(aux);
315 } //has next_hit
316 // FIXME: this code is still a bit brittle, because it contains no
317 // consistency check that the mates are on opposite strands - a current protocol
318 // requirement, and that the strand indicated by the alignment is consistent
319 // with the orientation of the splices (though that should be handled upstream).
320 if (!XS_found) {
321 const string xs_f("XS:A:+");
322 const string xs_r("XS:A:-");
323 if (library_type == FR_FIRSTSTRAND) {
324 if (insert_side == FRAG_LEFT || insert_side == FRAG_UNPAIRED) {
325 if (bh.antisense_align())
326 auxdata.push_back(xs_f);
327 else
328 auxdata.push_back(xs_r);
329 }
330 else {
331 if (bh.antisense_align())
332 auxdata.push_back(xs_r);
333 else
334 auxdata.push_back(xs_f);
335 }
336 }
337 else if (library_type == FR_SECONDSTRAND) {
338 if (insert_side == FRAG_LEFT || insert_side == FRAG_UNPAIRED){
339 if (bh.antisense_align())
340 auxdata.push_back(xs_r);
341 else
342 auxdata.push_back(xs_f);
343 }
344 else
345 {
346 if (bh.antisense_align())
347 auxdata.push_back(xs_f);
348 else
349 auxdata.push_back(xs_r);
350 }
351 }
352 }
353 if (hitIndex >= 0)
354 {
355 string aux("HI:i:");
356 str_appendInt(aux, hitIndex);
357 auxdata.push_back(aux);
358 }
359 }
360
361 bool rewrite_sam_record(GBamWriter& bam_writer,
362 const RefSequenceTable& rt,
363 const BowtieHit& bh,
364 const char* bwt_buf,
365 const char* read_alt_name,
366 FragmentType insert_side,
367 int num_hits,
368 const BowtieHit* next_hit,
369 bool primary,
370 int hitIndex)
371 {
372 // Rewrite this hit, filling in the alt name, mate mapping
373 // and setting the pair flag
374 vector<string> sam_toks;
375 tokenize(bwt_buf, "\t", sam_toks);
376
377 string ref_name = sam_toks[2], ref_name2 = "";
378 char cigar1[1024] = {0}, cigar2[1024] = {0};
379 string left_seq, right_seq, left_qual, right_qual;
380 int left = -1, left1 = -1, left2 = -1;
381 bool fusion_alignment = false;
382 size_t XF_index = 0;
383 for (size_t i = 11; i < sam_toks.size(); ++i)
384 {
385 string& tok = sam_toks[i];
386 if (strncmp(tok.c_str(), "XF", 2) == 0)
387 {
388 fusion_alignment = true;
389 XF_index = i;
390
391 vector<string> tuple_fields;
392 tokenize(tok.c_str(), " ", tuple_fields);
393 vector<string> contigs;
394 tokenize(tuple_fields[1].c_str(), "-", contigs);
395 if (contigs.size() >= 2)
396 {
397 ref_name = contigs[0];
398 ref_name2 = contigs[1];
399 }
400
401 extract_partial_hits(bh, tuple_fields[4].c_str(), tuple_fields[5].c_str(),
402 cigar1, cigar2, left_seq, right_seq,
403 left_qual, right_qual, left1, left2);
404
405 break;
406 }
407
408 else if (strncmp(tok.c_str(), "AS", 2) == 0)
409 {
410 char AS_score[128] = {0};
411 sprintf(AS_score, "AS:i:%d", bh.alignment_score());
412 tok = AS_score;
413 }
414 }
415
416 string qname(read_alt_name);
417 size_t slash_pos=qname.rfind('/');
418 if (slash_pos!=string::npos)
419 qname.resize(slash_pos);
420 //read_alt_name as QNAME
421 int flag=atoi(sam_toks[1].c_str()); //FLAG
422 if (insert_side != FRAG_UNPAIRED) {
423 //flag = atoi(sam_toks[1].c_str());
424 // mark this as a singleton mate
425 flag |= 0x0001;
426 if (insert_side == FRAG_LEFT)
427 flag |= 0x0040;
428 else if (insert_side == FRAG_RIGHT)
429 flag |= 0x0080;
430 flag |= 0x0008;
431 //char flag_buf[64];
432 //sprintf(flag_buf, "%d", flag);
433 //sam_toks[t] = flag_buf;
434 }
435 if (!primary)
436 flag |= 0x100;
437
438 int gpos=isdigit(sam_toks[3][0]) ? atoi(sam_toks[3].c_str()) : 0;
439 int mapQ=255;
440 if (num_hits > 1) {
441 double err_prob = 1 - (1.0 / num_hits);
442 mapQ = (int)(-10.0 * log(err_prob) / log(10.0));
443 }
444 int tlen =atoi(sam_toks[8].c_str()); //TLEN
445 int mate_pos=atoi(sam_toks[7].c_str());
446
447 GBamRecord* bamrec=NULL;
448 if (fusion_alignment) {
449 vector<string> auxdata;
450 add_auxData(auxdata, sam_toks, rt, bh, insert_side, num_hits, next_hit, hitIndex);
451 bamrec=bam_writer.new_record(qname.c_str(), flag, ref_name.c_str(), left1 + 1, mapQ,
452 cigar1, sam_toks[6].c_str(), mate_pos,
453 tlen, left_seq.c_str(), left_qual.c_str(), &auxdata);
454 bam_writer.write(bamrec);
455 delete bamrec;
456
457 auxdata.clear();
458 sam_toks[XF_index][5] = '2';
459 add_auxData(auxdata, sam_toks, rt, bh, insert_side, num_hits, next_hit, hitIndex);
460 bamrec=bam_writer.new_record(qname.c_str(), flag, ref_name2.c_str(), left2 + 1, mapQ,
461 cigar2, sam_toks[6].c_str(), mate_pos,
462 tlen, right_seq.c_str(), right_qual.c_str(), &auxdata);
463 bam_writer.write(bamrec);
464 delete bamrec;
465 } else {
466 vector<string> auxdata;
467 add_auxData(auxdata, sam_toks, rt, bh, insert_side, num_hits, next_hit, hitIndex);
468 bamrec=bam_writer.new_record(qname.c_str(), flag, sam_toks[2].c_str(), gpos, mapQ,
469 sam_toks[5].c_str(), sam_toks[6].c_str(), mate_pos,
470 tlen, sam_toks[9].c_str(), sam_toks[10].c_str(), &auxdata);
471 bam_writer.write(bamrec);
472 delete bamrec;
473 }
474
475 return true;
476 }
477
478 bool rewrite_sam_record(GBamWriter& bam_writer,
479 const RefSequenceTable& rt,
480 const BowtieHit& bh,
481 const char* bwt_buf,
482 const char* read_alt_name,
483 const InsertAlignmentGrade& grade,
484 FragmentType insert_side,
485 const BowtieHit* partner,
486 int num_hits,
487 const BowtieHit* next_hit,
488 bool primary,
489 int hitIndex)
490 {
491 // Rewrite this hit, filling in the alt name, mate mapping
492 // and setting the pair flag
493 vector<string> sam_toks;
494 tokenize(bwt_buf, "\t", sam_toks);
495 string qname(read_alt_name);
496 size_t slash_pos=qname.rfind('/');
497 if (slash_pos!=string::npos)
498 qname.resize(slash_pos);
499 //read_alt_name as QNAME
500 int flag = atoi(sam_toks[1].c_str());
501 // 0x0010 (strand of query) is assumed to be set correctly
502 // to begin with
503 flag |= 0x0001; //it must be paired
504 if (insert_side == FRAG_LEFT)
505 flag |= 0x0040;
506 else if (insert_side == FRAG_RIGHT)
507 flag |= 0x0080;
508 if (!primary)
509 flag |= 0x100;
510
511 string ref_name = sam_toks[2], ref_name2 = "";
512 char cigar1[1024] = {0}, cigar2[1024] = {0};
513 string left_seq, right_seq, left_qual, right_qual;
514 int left = -1, left1 = -1, left2 = -1;
515 bool fusion_alignment = false;
516 size_t XF_tok_idx = 11;
517 for (; XF_tok_idx < sam_toks.size(); ++XF_tok_idx)
518 {
519 string& tok = sam_toks[XF_tok_idx];
520 if (strncmp(tok.c_str(), "XF", 2) == 0)
521 {
522 fusion_alignment = true;
523
524 vector<string> tuple_fields;
525 tokenize(tok.c_str(), " ", tuple_fields);
526 vector<string> contigs;
527 tokenize(tuple_fields[1].c_str(), "-", contigs);
528 if (contigs.size() >= 2)
529 {
530 ref_name = contigs[0];
531 ref_name2 = contigs[1];
532 }
533
534 extract_partial_hits(bh, tuple_fields[4].c_str(), tuple_fields[5].c_str(),
535 cigar1, cigar2, left_seq, right_seq,
536 left_qual, right_qual, left1, left2);
537
538 break;
539 }
540
541 else if (strncmp(tok.c_str(), "AS", 2) == 0)
542 {
543 char AS_score[128] = {0};
544 sprintf(AS_score, "AS:i:%d", bh.alignment_score());
545 tok = AS_score;
546 }
547 }
548
549 int gpos=isdigit(sam_toks[3][0]) ? atoi(sam_toks[3].c_str()) : 0;
550 int mapQ=255;
551 if (grade.num_alignments > 1) {
552 double err_prob = 1 - (1.0 / grade.num_alignments);
553 mapQ = (int)(-10.0 * log(err_prob) / log(10.0));
554 }
555 int tlen=0; //TLEN
556 int mate_pos=atoi(sam_toks[7].c_str());
557 if (partner) {
558 if (partner->ref_id()==bh.ref_id()) {
559 sam_toks[6] = "="; //same chromosome
560 //TLEN:
561 tlen = bh.left() < partner->left() ? partner->right() - bh.left() :
562 partner->left() - bh.right();
563 }
564
565 else { //partner on different chromosome/contig
566 sam_toks[6] = rt.get_name(partner->ref_id());
567 }
568 mate_pos = partner->left() + 1;
569 if (grade.happy())
570 flag |= 0x0002;
571 if (partner->antisense_align())
572 flag |= 0x0020;
573
574 if (partner->fusion_opcode() != FUSION_NOTHING)
575 {
576 char partner_pos[1024];
577 sprintf(partner_pos, "XP:Z:%s-%s %d", rt.get_name(partner->ref_id()), rt.get_name(partner->ref_id2()), partner->left() + 1);
578 sam_toks.push_back(partner_pos);
579 }
580 }
581 else {
582 sam_toks[6] = "*";
583 mate_pos = 0;
584 flag |= 0x0008;
585 }
586
587 GBamRecord* bamrec=NULL;
588 if (fusion_alignment) {
589 vector<string> auxdata;
590 add_auxData(auxdata, sam_toks, rt, bh, insert_side, num_hits, next_hit, hitIndex);
591 bamrec=bam_writer.new_record(qname.c_str(), flag, ref_name.c_str(), left1 + 1, mapQ,
592 cigar1, sam_toks[6].c_str(), mate_pos,
593 tlen, left_seq.c_str(), left_qual.c_str(), &auxdata);
594 bam_writer.write(bamrec);
595 delete bamrec;
596
597 auxdata.clear();
598 sam_toks[XF_tok_idx][5] = '2';
599 add_auxData(auxdata, sam_toks, rt, bh, insert_side, num_hits, next_hit, hitIndex);
600 bamrec=bam_writer.new_record(qname.c_str(), flag, ref_name2.c_str(), left2 + 1, mapQ,
601 cigar2, sam_toks[6].c_str(), mate_pos,
602 tlen, right_seq.c_str(), right_qual.c_str(), &auxdata);
603 bam_writer.write(bamrec);
604 delete bamrec;
605 } else {
606 vector<string> auxdata;
607 add_auxData(auxdata, sam_toks, rt, bh, insert_side, num_hits, next_hit, hitIndex);
608 bamrec=bam_writer.new_record(qname.c_str(), flag, sam_toks[2].c_str(), gpos, mapQ,
609 sam_toks[5].c_str(), sam_toks[6].c_str(), mate_pos,
610 tlen, sam_toks[9].c_str(), sam_toks[10].c_str(), &auxdata);
611 bam_writer.write(bamrec);
612 delete bamrec;
613 }
614
615 return true;
616 }
617
618 struct lex_hit_sort
619 {
620 lex_hit_sort(const RefSequenceTable& rt, const HitsForRead& hits)
621 : _rt(rt), _hits(hits)
622 {}
623
624 bool operator()(const uint32_t& l, const uint32_t& r) const
625 {
626 const BowtieHit& lhs = _hits.hits[l];
627 const BowtieHit& rhs = _hits.hits[r];
628
629 uint32_t l_id = lhs.ref_id();
630 uint32_t r_id = rhs.ref_id();
631 if (l_id != r_id)
632 return l_id < r_id;
633
634 return lhs.left() < rhs.left();
635 }
636
637 const RefSequenceTable& _rt;
638 const HitsForRead& _hits;
639 };
640
641 void print_sam_for_single(const RefSequenceTable& rt,
642 const HitsForRead& hits,
643 FragmentType frag_type,
644 Read& read,
645 GBamWriter& bam_writer,
646 GBamWriter* um_out //write unmapped reads here
647 )
648 {
649 assert(!read.alt_name.empty());
650 if (hits.hits.empty())
651 return;
652
653 lex_hit_sort s(rt, hits);
654 vector<uint32_t> index_vector;
655
656 size_t i;
657 for (i = 0; i < hits.hits.size(); ++i)
658 index_vector.push_back(i);
659
660 sort(index_vector.begin(), index_vector.end(), s);
661
662 size_t primaryHit = 0;
663 if (!report_secondary_alignments)
664 primaryHit = random() % hits.hits.size();
665
666 bool multipleHits = (hits.hits.size() > 1);
667 for (i = 0; i < hits.hits.size(); ++i)
668 {
669 size_t index = index_vector[i];
670 bool primary = (index == primaryHit);
671 const BowtieHit& bh = hits.hits[index];
672 rewrite_sam_record(bam_writer, rt,
673 bh,
674 bh.hitfile_rec().c_str(),
675 read.alt_name.c_str(),
676 frag_type,
677 hits.hits.size(),
678 (i < hits.hits.size()-1) ? &(hits.hits[index_vector[i+1]]) : NULL,
679 primary,
680 (multipleHits? i: -1));
681 }
682 }
683
684 void print_sam_for_pair(const RefSequenceTable& rt,
685 const vector<pair<BowtieHit, BowtieHit> >& best_hits,
686 const InsertAlignmentGrade& grade,
687 ReadStream& left_reads_file,
688 ReadStream& right_reads_file,
689 GBamWriter& bam_writer,
690 GBamWriter* left_um_out,
691 GBamWriter* right_um_out,
692 uint64_t begin_id = 0,
693 uint64_t end_id = std::numeric_limits<uint64_t>::max())
694 {
695 Read left_read;
696 Read right_read;
697 if (best_hits.empty())
698 return;
699
700 size_t i;
701 HitsForRead right_hits;
702 for (i = 0; i < best_hits.size(); ++i)
703 right_hits.hits.push_back(best_hits[i].second);
704
705 size_t primaryHit = 0;
706 vector<uint32_t> index_vector;
707 lex_hit_sort s(rt, right_hits);
708 for (i = 0; i < right_hits.hits.size(); ++i)
709 index_vector.push_back(i);
710
711 sort(index_vector.begin(), index_vector.end(), s);
712
713 if (!report_secondary_alignments)
714 primaryHit = random() % right_hits.hits.size();
715
716 bool got_left_read = left_reads_file.getRead(best_hits[0].first.insert_id(), left_read,
717 reads_format, false, begin_id, end_id,
718 left_um_out, false);
719
720 bool got_right_read = right_reads_file.getRead(best_hits[0].first.insert_id(), right_read,
721 reads_format, false, begin_id, end_id,
722 right_um_out, false);
723
724 assert (got_left_read && got_right_read);
725 bool multipleHits = (best_hits.size() > 1);
726 for (i = 0; i < best_hits.size(); ++i)
727 {
728 size_t index = index_vector[i];
729 bool primary = (index == primaryHit);
730 const BowtieHit& right_bh = best_hits[index].second;
731 const BowtieHit& left_bh = best_hits[index].first;
732
733 rewrite_sam_record(bam_writer, rt,
734 right_bh,
735 right_bh.hitfile_rec().c_str(),
736 right_read.alt_name.c_str(),
737 grade,
738 FRAG_RIGHT,
739 &left_bh,
740 best_hits.size(),
741 (i < best_hits.size() - 1) ? &(best_hits[index_vector[i+1]].second) : NULL,
742 primary,
743 (multipleHits? i: -1));
744 rewrite_sam_record(bam_writer, rt,
745 left_bh,
746 left_bh.hitfile_rec().c_str(),
747 left_read.alt_name.c_str(),
748 grade,
749 FRAG_LEFT,
750 &right_bh,
751 best_hits.size(),
752 (i < best_hits.size() - 1) ? &(best_hits[index_vector[i+1]].first) : NULL,
753 primary,
754 (multipleHits? i: -1));
755 }
756 }
757
758 /**
759 * Given all of the hits for a particular read, update the read counts for insertions and deletions.
760 * @param hits hits The alignments for a particular read
761 * @param insertions Maps from an insertion to the number of supporting reads for that insertion
762 * @param deletions Maps from a deletion to the number of supporting reads for that deletion
763 */
764 void update_insertions_and_deletions(const HitsForRead& hits,
765 InsertionSet& insertions,
766 DeletionSet& deletions)
767 {
768 for (size_t i = 0; i < hits.hits.size(); ++i)
769 {
770 const BowtieHit& bh = hits.hits[i];
771 insertions_from_alignment(bh, insertions);
772 deletions_from_alignment(bh, deletions);
773 }
774 }
775
776 void update_coverage(const HitsForRead& hits,
777 Coverage& coverage)
778 {
779 for (size_t i = 0; i < hits.hits.size(); ++i)
780 {
781 const BowtieHit& hit = hits.hits[i];
782 const vector<CigarOp>& cigar = hit.cigar();
783 unsigned int positionInGenome = hit.left();
784 RefID ref_id = hit.ref_id();
785
786 for(size_t c = 0; c < cigar.size(); ++c)
787 {
788 int opcode = cigar[c].opcode;
789 int length = cigar[c].length;
790 switch(opcode)
791 {
792 case REF_SKIP:
793 case MATCH:
794 case DEL:
795 if (opcode == MATCH)
796 coverage.add_coverage(ref_id, positionInGenome, length);
797
798 positionInGenome += length;
799 break;
800 case rEF_SKIP:
801 case mATCH:
802 case dEL:
803 positionInGenome -= length;
804
805 if (opcode == mATCH)
806 coverage.add_coverage(ref_id, positionInGenome + 1, length);
807 break;
808 case FUSION_FF:
809 case FUSION_FR:
810 case FUSION_RF:
811 case FUSION_RR:
812 positionInGenome = length;
813 ref_id = hit.ref_id2();
814 break;
815 default:
816 break;
817 }
818 }
819 }
820 }
821
822
823 void update_fusions(const HitsForRead& hits,
824 RefSequenceTable& rt,
825 FusionSet& fusions,
826 const FusionSet& fusions_ref = empty_fusions)
827 {
828 if (!fusion_search)
829 return;
830
831 if (hits.hits.size() > fusion_multireads)
832 return;
833
834 bool update_stat = fusions_ref.size() > 0;
835 for (size_t i = 0; i < hits.hits.size(); ++i)
836 {
837 const BowtieHit& bh = hits.hits[i];
838
839 if (bh.edit_dist() > fusion_read_mismatches)
840 continue;
841
842 fusions_from_alignment(bh, fusions, rt, update_stat);
843
844 if (update_stat)
845 unsupport_fusions(bh, fusions, fusions_ref);
846 }
847 }
848
849 void update_junctions(const HitsForRead& hits,
850 JunctionSet& junctions)
851 {
852 for (size_t i = 0; i < hits.hits.size(); ++i)
853 {
854 const BowtieHit& bh = hits.hits[i];
855 junctions_from_alignment(bh, junctions);
856 }
857 }
858
859 // Extracts junctions from all the SAM hits (based on REF_SKIPs) in the hit file
860 // resets the stream when finished.
861 void exclude_hits_on_filtered_junctions(const JunctionSet& junctions, HitsForRead& hits)
862 {
863 HitsForRead remaining;
864 remaining.insert_id = hits.insert_id;
865
866 for (size_t i = 0; i < hits.hits.size(); ++i)
867 {
868 BowtieHit& bh = hits.hits[i];
869 if (bh.edit_dist() > max_read_mismatches)
870 continue;
871
872 bool filter_hit = false;
873 if (!bh.contiguous())
874 {
875 JunctionSet bh_juncs;
876 junctions_from_alignment(bh, bh_juncs);
877 for (JunctionSet::iterator itr = bh_juncs.begin();
878 itr != bh_juncs.end();
879 itr++)
880 {
881 const Junction& j = itr->first;
882 JunctionSet::const_iterator target = junctions.find(j);
883 if (target == junctions.end() || !target->second.accepted)
884 {
885 filter_hit = true;
886 break;
887 }
888 }
889 }
890 if (!filter_hit)
891 remaining.hits.push_back(bh);
892 }
893 hits = remaining;
894 }
895
896 // events include splice junction, indels, and fusion points.
897 struct ConsensusEventsWorker
898 {
899 void operator()()
900 {
901 ReadTable it;
902 vector<BAMHitFactory*> hit_factories;
903 hit_factories.push_back(new BAMHitFactory(it, *rt));
904 HitStream l_hs(left_map_fname, hit_factories.back(), false, true, true, true);
905 if (left_map_offset > 0)
906 l_hs.seek(left_map_offset);
907
908 hit_factories.push_back(new BAMHitFactory(it, *rt));
909 HitStream r_hs(right_map_fname, hit_factories.back(), false, true, true, true);
910 if (right_map_offset > 0)
911 r_hs.seek(right_map_offset);
912
913 HitsForRead curr_left_hit_group;
914 HitsForRead curr_right_hit_group;
915
916 l_hs.next_read_hits(curr_left_hit_group);
917 r_hs.next_read_hits(curr_right_hit_group);
918
919 uint32_t curr_left_obs_order = it.observation_order(curr_left_hit_group.insert_id);
920 uint32_t curr_right_obs_order = it.observation_order(curr_right_hit_group.insert_id);
921
922 // While we still have unreported hits...
923 while((curr_left_obs_order != VMAXINT32 || curr_right_obs_order != VMAXINT32) &&
924 (curr_left_obs_order < end_id || curr_right_obs_order < end_id))
925 {
926 // Chew up left singletons
927 while (curr_left_obs_order < curr_right_obs_order &&
928 curr_left_obs_order < end_id &&
929 curr_left_obs_order != VMAXINT32)
930 {
931 HitsForRead best_hits;
932 best_hits.insert_id = curr_left_obs_order;
933
934 // Process hits for left singleton, select best alignments
935 read_best_alignments(curr_left_hit_group, best_hits, *gtf_junctions);
936 update_coverage(best_hits, *coverage);
937 update_junctions(best_hits, *junctions);
938 update_insertions_and_deletions(best_hits, *insertions, *deletions);
939 update_fusions(best_hits, *rt, *fusions);
940
941 // Get next hit group
942 l_hs.next_read_hits(curr_left_hit_group);
943 curr_left_obs_order = it.observation_order(curr_left_hit_group.insert_id);
944 }
945
946 // Chew up right singletons
947 while (curr_left_obs_order > curr_right_obs_order &&
948 curr_right_obs_order < end_id &&
949 curr_right_obs_order != VMAXINT32)
950 {
951 HitsForRead best_hits;
952 best_hits.insert_id = curr_right_obs_order;
953 if (curr_right_obs_order >= begin_id)
954 {
955 // Process hit for right singleton, select best alignments
956 read_best_alignments(curr_right_hit_group, best_hits, *gtf_junctions);
957 update_coverage(best_hits, *coverage);
958 update_junctions(best_hits, *junctions);
959 update_insertions_and_deletions(best_hits, *insertions, *deletions);
960 update_fusions(best_hits, *rt, *fusions);
961 }
962
963 // Get next hit group
964 r_hs.next_read_hits(curr_right_hit_group);
965 curr_right_obs_order = it.observation_order(curr_right_hit_group.insert_id);
966 }
967
968 // Since we have both left hits and right hits for this insert,
969 // Find the best pairing and print both
970 while (curr_left_obs_order == curr_right_obs_order &&
971 curr_left_obs_order < end_id &&
972 curr_left_obs_order != VMAXINT32)
973 {
974 vector<pair<BowtieHit, BowtieHit> > best_hits;
975
976 InsertAlignmentGrade grade;
977 pair_best_alignments(curr_left_hit_group,
978 curr_right_hit_group,
979 grade,
980 best_hits,
981 *gtf_junctions);
982
983 HitsForRead best_left_hit_group; best_left_hit_group.insert_id = curr_left_obs_order;
984 HitsForRead best_right_hit_group; best_right_hit_group.insert_id = curr_left_obs_order;
985
986 for (size_t i = 0; i < best_hits.size(); ++i)
987 {
988 best_left_hit_group.hits.push_back(best_hits[i].first);
989 best_right_hit_group.hits.push_back(best_hits[i].second);
990 }
991
992 update_coverage(best_left_hit_group, *coverage);
993 update_junctions(best_left_hit_group, *junctions);
994 update_insertions_and_deletions(best_left_hit_group, *insertions, *deletions);
995 update_fusions(best_left_hit_group, *rt, *fusions);
996
997 update_coverage(best_right_hit_group, *coverage);
998 update_junctions(best_right_hit_group, *junctions);
999 update_insertions_and_deletions(best_right_hit_group, *insertions, *deletions);
1000 update_fusions(best_right_hit_group, *rt, *fusions);
1001
1002 l_hs.next_read_hits(curr_left_hit_group);
1003 curr_left_obs_order = it.observation_order(curr_left_hit_group.insert_id);
1004
1005 r_hs.next_read_hits(curr_right_hit_group);
1006 curr_right_obs_order = it.observation_order(curr_right_hit_group.insert_id);
1007 }
1008 }
1009
1010 for (size_t i = 0; i < hit_factories.size(); ++i)
1011 delete hit_factories[i];
1012
1013 hit_factories.clear();
1014 }
1015
1016 string left_map_fname;
1017 string right_map_fname;
1018
1019 RefSequenceTable* rt;
1020
1021 JunctionSet* gtf_junctions;
1022
1023 uint64_t begin_id;
1024 uint64_t end_id;
1025 int64_t left_map_offset;
1026 int64_t right_map_offset;
1027
1028 JunctionSet* junctions;
1029 InsertionSet* insertions;
1030 DeletionSet* deletions;
1031 FusionSet* fusions;
1032
1033 Coverage* coverage;
1034 };
1035
1036 struct ReportWorker
1037 {
1038 void operator()()
1039 {
1040 ReadTable it;
1041 GBamWriter bam_writer(bam_output_fname.c_str(), sam_header.c_str());
1042
1043 ReadStream left_reads_file(left_reads_fname);
1044 if (left_reads_file.file() == NULL)
1045 err_die("Error: cannot open %s for reading\n", left_reads_fname.c_str());
1046 if (left_reads_file.isBam()) {
1047 left_reads_file.use_alt_name();
1048 left_reads_file.ignoreQC();
1049 }
1050 if (left_reads_offset > 0)
1051 left_reads_file.seek(left_reads_offset);
1052
1053 //if (!zpacker.empty()) left_um_fname+=".z";
1054 GBamWriter* left_um_out=new GBamWriter(left_um_fname.c_str(), sam_header.c_str());
1055 GBamWriter* right_um_out=NULL;
1056
1057 //if (left_um_out.openWrite(left_um_fname.c_str(), zpacker)==NULL)
1058 // err_die("Error: cannot open file %s for writing!\n",left_um_fname.c_str());
1059 ReadStream right_reads_file(right_reads_fname);
1060 if (right_reads_offset > 0)
1061 right_reads_file.seek(right_reads_offset);
1062
1063 //FZPipe right_um_out;
1064 if (!right_reads_fname.empty())
1065 {
1066 if (right_reads_file.isBam()) {
1067 right_reads_file.use_alt_name();
1068 right_reads_file.ignoreQC();
1069 right_um_out=new GBamWriter(right_um_fname.c_str(), sam_header.c_str());
1070 }
1071 //if (!zpacker.empty()) right_um_fname+=".z";
1072 //if (right_um_out.openWrite(right_um_fname.c_str(), zpacker)==NULL)
1073 // err_die("Error: cannot open file %s for writing!\n",right_um_fname.c_str());
1074 }
1075
1076 vector<BAMHitFactory*> hit_factories;
1077 hit_factories.push_back(new BAMHitFactory(it, *rt));
1078 HitStream left_hs(left_map_fname, hit_factories.back(), false, true, true, true);
1079 if (left_map_offset > 0)
1080 left_hs.seek(left_map_offset);
1081
1082 hit_factories.push_back(new BAMHitFactory(it, *rt));
1083 HitStream right_hs(right_map_fname, hit_factories.back(), false, true, true, true);
1084 if (right_map_offset > 0)
1085 right_hs.seek(right_map_offset);
1086
1087 HitsForRead curr_left_hit_group;
1088 HitsForRead curr_right_hit_group;
1089
1090 left_hs.next_read_hits(curr_left_hit_group);
1091 right_hs.next_read_hits(curr_right_hit_group);
1092
1093 uint64_t curr_left_obs_order = it.observation_order(curr_left_hit_group.insert_id);
1094 uint64_t curr_right_obs_order = it.observation_order(curr_right_hit_group.insert_id);
1095
1096 const bool final_report = true;
1097
1098 // While we still have unreported hits...
1099 Read l_read;
1100 Read r_read;
1101 while((curr_left_obs_order != VMAXINT32 || curr_right_obs_order != VMAXINT32) &&
1102 (curr_left_obs_order < end_id || curr_right_obs_order < end_id))
1103 {
1104 // Chew up left singletons (pairs with right reads unmapped)
1105 while (curr_left_obs_order < curr_right_obs_order &&
1106 curr_left_obs_order < end_id &&
1107 curr_left_obs_order != VMAXINT32)
1108 {
1109 HitsForRead best_hits;
1110 best_hits.insert_id = curr_left_obs_order;
1111 bool got_read=left_reads_file.getRead(curr_left_obs_order, l_read,
1112 reads_format, false, begin_id, end_id,
1113 left_um_out, false);
1114 //assert(got_read);
1115
1116 if (right_reads_file.file()) {
1117 got_read=right_reads_file.getRead(curr_left_obs_order, r_read,
1118 reads_format, false, begin_id, end_id,
1119 right_um_out, true);
1120 //assert(got_read);
1121 }
1122
1123 exclude_hits_on_filtered_junctions(*junctions, curr_left_hit_group);
1124
1125 // Process hits for left singleton, select best alignments
1126 read_best_alignments(curr_left_hit_group, best_hits, *gtf_junctions,
1127 *junctions, *insertions, *deletions, *fusions, *coverage,
1128 final_report);
1129
1130 if (best_hits.hits.size() > 0)
1131 {
1132 update_junctions(best_hits, *final_junctions);
1133 update_insertions_and_deletions(best_hits, *final_insertions, *final_deletions);
1134 update_fusions(best_hits, *rt, *final_fusions, *fusions);
1135
1136 print_sam_for_single(*rt,
1137 best_hits,
1138 (right_map_fname.empty() ? FRAG_UNPAIRED : FRAG_LEFT),
1139 l_read,
1140 bam_writer,
1141 left_um_out);
1142 }
1143 // Get next hit group
1144 left_hs.next_read_hits(curr_left_hit_group);
1145 curr_left_obs_order = it.observation_order(curr_left_hit_group.insert_id);
1146 } //left singletons
1147
1148 // Chew up right singletons
1149 while (curr_left_obs_order > curr_right_obs_order &&
1150 curr_right_obs_order < end_id &&
1151 curr_right_obs_order != VMAXINT32)
1152 {
1153 HitsForRead best_hits;
1154 best_hits.insert_id = curr_right_obs_order;
1155
1156 if (curr_right_obs_order >= begin_id)
1157 {
1158 bool got_read=right_reads_file.getRead(curr_right_obs_order, r_read,
1159 reads_format, false, begin_id, end_id,
1160 right_um_out, false);
1161
1162 //assert(got_read);
1163 /*
1164 fprintf(right_um_out.file, "@%s #MAPPED#\n%s\n+\n%s\n", r_read.alt_name.c_str(),
1165 r_read.seq.c_str(), r_read.qual.c_str());
1166 */
1167 got_read=left_reads_file.getRead(curr_right_obs_order, l_read,
1168 reads_format, false, begin_id, end_id,
1169 left_um_out, true);
1170 //assert(got_read);
1171
1172 exclude_hits_on_filtered_junctions(*junctions, curr_right_hit_group);
1173
1174 // Process hit for right singleton, select best alignments
1175 read_best_alignments(curr_right_hit_group, best_hits, *gtf_junctions,
1176 *junctions, *insertions, *deletions, *fusions, *coverage,
1177 final_report);
1178
1179 if (best_hits.hits.size() > 0)
1180 {
1181 update_junctions(best_hits, *final_junctions);
1182 update_insertions_and_deletions(best_hits, *final_insertions, *final_deletions);
1183 update_fusions(best_hits, *rt, *final_fusions, *fusions);
1184
1185 print_sam_for_single(*rt,
1186 best_hits,
1187 FRAG_RIGHT,
1188 r_read,
1189 bam_writer,
1190 right_um_out);
1191 }
1192 }
1193
1194 // Get next hit group
1195 right_hs.next_read_hits(curr_right_hit_group);
1196 curr_right_obs_order = it.observation_order(curr_right_hit_group.insert_id);
1197 }
1198
1199 // Since we have both left hits and right hits for this insert,
1200 // Find the best pairing and print both
1201 while (curr_left_obs_order == curr_right_obs_order &&
1202 curr_left_obs_order < end_id &&
1203 curr_left_obs_order != VMAXINT32)
1204 {
1205 exclude_hits_on_filtered_junctions(*junctions, curr_left_hit_group);
1206 exclude_hits_on_filtered_junctions(*junctions, curr_right_hit_group);
1207 if (curr_left_hit_group.hits.empty())
1208 { //only right read mapped
1209 //write it in the mapped file with the #MAPPED# flag
1210
1211 bool got_read=right_reads_file.getRead(curr_left_obs_order, r_read,
1212 reads_format, false, begin_id, end_id,
1213 right_um_out, false);
1214 //assert(got_read);
1215 /*
1216 fprintf(right_um_out.file, "@%s #MAPPED#\n%s\n+\n%s\n", r_read.alt_name.c_str(),
1217 r_read.seq.c_str(), r_read.qual.c_str());
1218 */
1219 HitsForRead right_best_hits;
1220 right_best_hits.insert_id = curr_right_obs_order;
1221
1222 read_best_alignments(curr_right_hit_group, right_best_hits, *gtf_junctions,
1223 *junctions, *insertions, *deletions, *fusions, *coverage,
1224 final_report);
1225
1226 if (right_best_hits.hits.size() > 0)
1227 {
1228 update_junctions(right_best_hits, *final_junctions);
1229 update_insertions_and_deletions(right_best_hits, *final_insertions, *final_deletions);
1230 update_fusions(right_best_hits, *rt, *final_fusions, *fusions);
1231
1232 print_sam_for_single(*rt,
1233 right_best_hits,
1234 FRAG_RIGHT,
1235 r_read,
1236 bam_writer,
1237 right_um_out);
1238 }
1239 }
1240 else if (curr_right_hit_group.hits.empty())
1241 {
1242 HitsForRead left_best_hits;
1243 left_best_hits.insert_id = curr_left_obs_order;
1244 //only left read mapped
1245 bool got_read=left_reads_file.getRead(curr_left_obs_order, l_read,
1246 reads_format, false, begin_id, end_id,
1247 left_um_out, false);
1248
1249 // Process hits for left singleton, select best alignments
1250 read_best_alignments(curr_left_hit_group, left_best_hits, *gtf_junctions,
1251 *junctions, *insertions, *deletions, *fusions, *coverage,
1252 final_report);
1253
1254 if (left_best_hits.hits.size() > 0)
1255 {
1256 update_junctions(left_best_hits, *final_junctions);
1257 update_insertions_and_deletions(left_best_hits, *final_insertions, *final_deletions);
1258 update_fusions(left_best_hits, *rt, *final_fusions, *fusions);
1259
1260 print_sam_for_single(*rt,
1261 left_best_hits,
1262 FRAG_LEFT,
1263 l_read,
1264 bam_writer,
1265 left_um_out);
1266 }
1267 }
1268 else
1269 {
1270 vector<pair<BowtieHit, BowtieHit> > best_hits;
1271
1272 InsertAlignmentGrade grade;
1273 pair_best_alignments(curr_left_hit_group,
1274 curr_right_hit_group,
1275 grade, best_hits, *gtf_junctions,
1276 *junctions, *insertions, *deletions, *fusions, *coverage,
1277 final_report);
1278
1279 HitsForRead best_left_hit_group; best_left_hit_group.insert_id = curr_left_obs_order;
1280 HitsForRead best_right_hit_group; best_right_hit_group.insert_id = curr_left_obs_order;
1281
1282 for (size_t i = 0; i < best_hits.size(); ++i)
1283 {
1284 best_left_hit_group.hits.push_back(best_hits[i].first);
1285 best_right_hit_group.hits.push_back(best_hits[i].second);
1286 }
1287
1288 if (best_hits.size() > 0)
1289 {
1290 update_junctions(best_left_hit_group, *final_junctions);
1291 update_insertions_and_deletions(best_left_hit_group, *final_insertions, *final_deletions);
1292 update_fusions(best_left_hit_group, *rt, *final_fusions, *fusions);
1293
1294 update_junctions(best_right_hit_group, *final_junctions);
1295 update_insertions_and_deletions(best_right_hit_group, *final_insertions, *final_deletions);
1296 update_fusions(best_right_hit_group, *rt, *final_fusions, *fusions);
1297
1298 pair_support(best_hits, *final_fusions, *fusions);
1299
1300 print_sam_for_pair(*rt,
1301 best_hits,
1302 grade,
1303 left_reads_file,
1304 right_reads_file,
1305 bam_writer,
1306 left_um_out,
1307 right_um_out,
1308 begin_id,
1309 end_id);
1310 }
1311 }
1312
1313 left_hs.next_read_hits(curr_left_hit_group);
1314 curr_left_obs_order = it.observation_order(curr_left_hit_group.insert_id);
1315
1316 right_hs.next_read_hits(curr_right_hit_group);
1317 curr_right_obs_order = it.observation_order(curr_right_hit_group.insert_id);
1318 }
1319 } //while we still have unreported hits..
1320 //print the remaining unmapped reads at the end of each reads' stream
1321
1322 left_reads_file.getRead(VMAXINT32, l_read,
1323 reads_format, false, begin_id, end_id,
1324 left_um_out, false);
1325 if (right_reads_file.file())
1326 right_reads_file.getRead(VMAXINT32, r_read,
1327 reads_format, false, begin_id, end_id,
1328 right_um_out, false);
1329
1330
1331 // pclose (pipe close), which waits for a process to end, seems to conflict with boost::thread::join somehow,
1332 // resulting in deadlock like behavior.
1333 delete left_um_out;
1334 delete right_um_out;
1335
1336
1337 for (size_t i = 0; i < hit_factories.size(); ++i)
1338 delete hit_factories[i];
1339
1340 hit_factories.clear();
1341 }
1342
1343 string bam_output_fname;
1344 string sam_header_fname;
1345
1346 string left_reads_fname;
1347 string left_map_fname;
1348 string right_reads_fname;
1349 string right_map_fname;
1350
1351 string left_um_fname;
1352 string right_um_fname;
1353
1354 JunctionSet* gtf_junctions;
1355
1356 JunctionSet* junctions;
1357 InsertionSet* insertions;
1358 DeletionSet* deletions;
1359 FusionSet* fusions;
1360 Coverage* coverage;
1361
1362 JunctionSet* final_junctions;
1363 InsertionSet* final_insertions;
1364 DeletionSet* final_deletions;
1365 FusionSet* final_fusions;
1366
1367 RefSequenceTable* rt;
1368
1369 uint64_t begin_id;
1370 uint64_t end_id;
1371 int64_t left_reads_offset;
1372 int64_t left_map_offset;
1373 int64_t right_reads_offset;
1374 int64_t right_map_offset;
1375 };
1376
1377 void driver(const string& bam_output_fname,
1378 istream& ref_stream,
1379 const string& left_map_fname,
1380 const string& left_reads_fname,
1381 const string& right_map_fname,
1382 const string& right_reads_fname,
1383 FILE* junctions_out,
1384 FILE* insertions_out,
1385 FILE* deletions_out,
1386 FILE* fusions_out)
1387 {
1388 if (!parallel)
1389 num_threads = 1;
1390
1391 RefSequenceTable rt(sam_header, true);
1392
1393 if (fusion_search)
1394 get_seqs(ref_stream, rt, true);
1395
1396 srandom(1);
1397
1398 JunctionSet gtf_junctions;
1399 if (!gtf_juncs.empty())
1400 {
1401 char splice_buf[4096];
1402 FILE* splice_coords = fopen(gtf_juncs.c_str(), "r");
1403 if (splice_coords)
1404 {
1405 while (fgets(splice_buf, sizeof(splice_buf), splice_coords))
1406 {
1407 char* nl = strrchr(splice_buf, '\n');
1408 char* buf = splice_buf;
1409 if (nl) *nl = 0;
1410
1411 char* ref_name = get_token((char**)&buf, "\t");
1412 char* scan_left_coord = get_token((char**)&buf, "\t");
1413 char* scan_right_coord = get_token((char**)&buf, "\t");
1414 char* orientation = get_token((char**)&buf, "\t");
1415
1416 if (!scan_left_coord || !scan_right_coord || !orientation)
1417 {
1418 fprintf(stderr,"Error: malformed splice coordinate record in %s\n:%s\n",
1419 gtf_juncs.c_str(), buf);
1420 exit(1);
1421 }
1422
1423 uint32_t ref_id = rt.get_id(ref_name, NULL, 0);
1424 uint32_t left_coord = atoi(scan_left_coord);
1425 uint32_t right_coord = atoi(scan_right_coord);
1426 bool antisense = *orientation == '-';
1427
1428 gtf_junctions.insert(make_pair<Junction, JunctionStats>(Junction(ref_id, left_coord, right_coord, antisense), JunctionStats()));
1429 }
1430 }
1431 fprintf(stderr, "Loaded %d GFF junctions from %s.\n", (int)(gtf_junctions.size()), gtf_juncs.c_str());
1432 }
1433
1434 vector<uint64_t> read_ids;
1435 vector<vector<int64_t> > offsets;
1436 if (num_threads > 1)
1437 {
1438 vector<string> fnames;
1439 if (right_map_fname != "")
1440 {
1441 fnames.push_back(right_reads_fname);
1442 fnames.push_back(right_map_fname);
1443 }
1444 fnames.push_back(left_reads_fname);
1445 fnames.push_back(left_map_fname);
1446 bool enough_data = calculate_offsets(fnames, read_ids, offsets);
1447 if (!enough_data)
1448 num_threads = 1;
1449 }
1450
1451 JunctionSet vjunctions[num_threads];
1452 InsertionSet vinsertions[num_threads];
1453 DeletionSet vdeletions[num_threads];
1454 FusionSet vfusions[num_threads];
1455 Coverage vcoverages[num_threads];
1456
1457 vector<boost::thread*> threads;
1458 for (int i = 0; i < num_threads; ++i)
1459 {
1460 ConsensusEventsWorker worker;
1461
1462 worker.left_map_fname = left_map_fname;
1463 worker.right_map_fname = right_map_fname;
1464 worker.rt = &rt;
1465 worker.gtf_junctions = &gtf_junctions;
1466
1467 worker.junctions = &vjunctions[i];
1468 worker.insertions = &vinsertions[i];
1469 worker.deletions = &vdeletions[i];
1470 worker.fusions = &vfusions[i];
1471 worker.coverage = &vcoverages[i];
1472
1473 worker.right_map_offset = 0;
1474
1475 if (i == 0)
1476 {
1477 worker.begin_id = 0;
1478 worker.left_map_offset = 0;
1479 }
1480 else
1481 {
1482 size_t offsets_size = offsets[i-1].size();
1483
1484 worker.begin_id = read_ids[i-1];
1485 worker.left_map_offset = offsets[i-1].back();
1486 if (offsets_size == 4)
1487 worker.right_map_offset = offsets[i-1][1];
1488 }
1489
1490 worker.end_id = (i+1 < num_threads) ? read_ids[i] : std::numeric_limits<uint64_t>::max();
1491
1492 if (num_threads > 1)
1493 threads.push_back(new boost::thread(worker));
1494 else
1495 worker();
1496 }
1497
1498 for (size_t i = 0; i < threads.size(); ++i)
1499 {
1500 threads[i]->join();
1501 delete threads[i];
1502 threads[i] = NULL;
1503 }
1504 threads.clear();
1505
1506 JunctionSet& junctions = vjunctions[0];
1507 InsertionSet& insertions = vinsertions[0];
1508 DeletionSet& deletions = vdeletions[0];
1509 FusionSet& fusions = vfusions[0];
1510 Coverage& coverage = vcoverages[0];
1511 for (size_t i = 1; i < num_threads; ++i)
1512 {
1513 merge_with(junctions, vjunctions[i]);
1514 vjunctions[i].clear();
1515
1516 merge_with(insertions, vinsertions[i]);
1517 vinsertions[i].clear();
1518
1519 merge_with(deletions, vdeletions[i]);
1520 vdeletions[i].clear();
1521
1522 merge_with(fusions, vfusions[i]);
1523 vfusions[i].clear();
1524
1525 coverage.merge_with(vcoverages[i]);
1526 vcoverages[i].clear();
1527 }
1528
1529 coverage.calculate_coverage();
1530
1531 size_t num_unfiltered_juncs = junctions.size();
1532 fprintf(stderr, "Loaded %lu junctions\n", (long unsigned int) num_unfiltered_juncs);
1533
1534 // Read hits, extract junctions, and toss the ones that arent strongly enough supported.
1535 filter_junctions(junctions, gtf_junctions);
1536
1537 //size_t num_juncs_after_filter = junctions.size();
1538 //fprintf(stderr, "Filtered %lu junctions\n",
1539 // num_unfiltered_juncs - num_juncs_after_filter);
1540
1541 /*
1542 size_t small_overhangs = 0;
1543 for (JunctionSet::iterator i = junctions.begin(); i != junctions.end(); ++i)
1544 {
1545 if (i->second.accepted &&
1546 (i->second.left_extent < min_anchor_len || i->second.right_extent < min_anchor_len))
1547 {
1548 small_overhangs++;
1549 }
1550 }
1551
1552 if (small_overhangs >0)
1553 fprintf(stderr, "Warning: %lu small overhang junctions!\n", (long unsigned int)small_overhangs);
1554 */
1555
1556 JunctionSet vfinal_junctions[num_threads];
1557 InsertionSet vfinal_insertions[num_threads];
1558 DeletionSet vfinal_deletions[num_threads];
1559 FusionSet vfinal_fusions[num_threads];
1560
1561 for (int i = 0; i < num_threads; ++i)
1562 {
1563 ReportWorker worker;
1564
1565 worker.sam_header_fname = sam_header;
1566 char filename[1024] = {0};
1567 sprintf(filename, "%s%d.bam", bam_output_fname.c_str(), i);
1568 worker.bam_output_fname = filename;
1569 string tmpoutdir=getFdir(worker.bam_output_fname);
1570 worker.left_um_fname = tmpoutdir;
1571 sprintf(filename, "unmapped_left_%d.bam", i);
1572 worker.left_um_fname+=filename;
1573
1574 if (right_reads_fname != "")
1575 {
1576 sprintf(filename, "unmapped_right_%d.bam", i);
1577 worker.right_um_fname = tmpoutdir;
1578 worker.right_um_fname += filename;
1579 }
1580
1581 worker.left_reads_fname = left_reads_fname;
1582 worker.left_map_fname = left_map_fname;
1583 worker.right_reads_fname = right_reads_fname;
1584 worker.right_map_fname = right_map_fname;
1585
1586 worker.gtf_junctions = &gtf_junctions;
1587 worker.junctions = &junctions;
1588 worker.insertions = &insertions;
1589 worker.deletions = &deletions;
1590 worker.fusions = &fusions;
1591 worker.coverage = &coverage;
1592
1593 worker.final_junctions = &vfinal_junctions[i];
1594 worker.final_insertions = &vfinal_insertions[i];
1595 worker.final_deletions = &vfinal_deletions[i];
1596 worker.final_fusions = &vfinal_fusions[i];
1597 worker.rt = &rt;
1598
1599 worker.right_reads_offset = 0;
1600 worker.right_map_offset = 0;
1601
1602 if (i == 0)
1603 {
1604 worker.begin_id = 0;
1605 worker.left_reads_offset = 0;
1606 worker.left_map_offset = 0;
1607 }
1608 else
1609 {
1610 size_t offsets_size = offsets[i-1].size();
1611
1612 worker.begin_id = read_ids[i-1];
1613 worker.left_reads_offset = offsets[i-1][offsets_size - 2];
1614 worker.left_map_offset = offsets[i-1].back();
1615 if (offsets_size == 4)
1616 {
1617 worker.right_reads_offset = offsets[i-1][0];
1618 worker.right_map_offset = offsets[i-1][1];
1619 }
1620 }
1621
1622 worker.end_id = (i+1 < num_threads) ? read_ids[i] : std::numeric_limits<uint64_t>::max();
1623
1624 if (num_threads > 1)
1625 threads.push_back(new boost::thread(worker));
1626 else
1627 worker();
1628 }
1629
1630 for (size_t i = 0; i < threads.size(); ++i)
1631 {
1632 threads[i]->join();
1633 delete threads[i];
1634 threads[i] = NULL;
1635 }
1636 threads.clear();
1637
1638 JunctionSet final_junctions = vfinal_junctions[0];
1639 InsertionSet final_insertions = vfinal_insertions[0];
1640 DeletionSet final_deletions = vfinal_deletions[0];
1641 FusionSet final_fusions = vfinal_fusions[0];
1642 for (size_t i = 1; i < num_threads; ++i)
1643 {
1644 merge_with(final_junctions, vfinal_junctions[i]);
1645 vfinal_junctions[i].clear();
1646
1647 merge_with(final_insertions, vfinal_insertions[i]);
1648 vfinal_insertions[i].clear();
1649
1650 merge_with(final_deletions, vfinal_deletions[i]);
1651 vfinal_deletions[i].clear();
1652
1653 merge_with(final_fusions, vfinal_fusions[i]);
1654 vfinal_fusions[i].clear();
1655 }
1656
1657 fprintf (stderr, "Reporting final accepted alignments...");
1658 fprintf (stderr, "done.\n");
1659
1660 //small_overhangs = 0;
1661 for (JunctionSet::iterator i = final_junctions.begin(); i != final_junctions.end();)
1662 {
1663 if (i->second.supporting_hits == 0 || i->second.left_extent < 8 || i->second.right_extent < 8)
1664 {
1665 final_junctions.erase(i++);
1666 }
1667 else
1668 {
1669 ++i;
1670 }
1671 }
1672
1673 // if (small_overhangs > 0)
1674 // fprintf(stderr, "Warning: %lu small overhang junctions!\n", small_overhangs);
1675
1676 fprintf (stderr, "Printing junction BED track...");
1677 print_junctions(junctions_out, final_junctions, rt);
1678 fprintf (stderr, "done\n");
1679
1680 fprintf (stderr, "Printing insertions...");
1681 print_insertions(insertions_out, final_insertions,rt);
1682 fclose(insertions_out);
1683 fprintf (stderr, "done\n");
1684
1685 fprintf (stderr, "Printing deletions...");
1686 print_deletions(deletions_out, final_deletions, rt);
1687 fclose(deletions_out);
1688 fprintf (stderr, "done\n");
1689
1690 if (fusion_search)
1691 {
1692 fprintf (stderr, "Printing fusions...");
1693 print_fusions(fusions_out, final_fusions, rt);
1694 fclose(fusions_out);
1695 fprintf (stderr, "done\n");
1696 }
1697
1698 fprintf(stderr, "Found %lu junctions from happy spliced reads\n", (long unsigned int)final_junctions.size());
1699 }
1700
1701 void print_usage()
1702 {
1703 fprintf(stderr, "Usage: tophat_reports <junctions.bed> <insertions.vcf> <deletions.vcf> <accepted_hits.sam> <left_map1,...,left_mapN> <left_reads.fq> [right_map1,...,right_mapN] [right_reads.fq]\n");
1704
1705 // fprintf(stderr, "Usage: tophat_reports <coverage.wig> <junctions.bed> <accepted_hits.sam> <map1.bwtout> [splice_map1.sbwtout]\n");
1706 }
1707
1708 int main(int argc, char** argv)
1709 {
1710 fprintf(stderr, "tophat_reports v%s (%s)\n", PACKAGE_VERSION, SVN_REVISION);
1711 fprintf(stderr, "---------------------------------------\n");
1712
1713 reads_format = FASTQ;
1714
1715 int parse_ret = parse_options(argc, argv, print_usage);
1716 if (parse_ret)
1717 return parse_ret;
1718
1719 if(optind >= argc)
1720 {
1721 print_usage();
1722 return 1;
1723 }
1724
1725 string ref_file_name = argv[optind++];
1726
1727 if(optind >= argc)
1728 {
1729 print_usage();
1730 return 1;
1731 }
1732
1733 string junctions_file_name = argv[optind++];
1734
1735 if(optind >= argc)
1736 {
1737 print_usage();
1738 return 1;
1739 }
1740
1741 string insertions_file_name = argv[optind++];
1742 if(optind >= argc)
1743 {
1744 print_usage();
1745 return 1;
1746 }
1747
1748 string deletions_file_name = argv[optind++];
1749 if(optind >= argc)
1750 {
1751 print_usage();
1752 return 1;
1753 }
1754
1755 string fusions_file_name = argv[optind++];
1756 if(optind >= argc)
1757 {
1758 print_usage();
1759 return 1;
1760 }
1761
1762 string accepted_hits_file_name = argv[optind++];
1763 if(optind >= argc)
1764 {
1765 print_usage();
1766 return 1;
1767 }
1768
1769 string left_map_filename = argv[optind++];
1770 if(optind >= argc)
1771 {
1772 print_usage();
1773 return 1;
1774 }
1775
1776 string left_reads_filename = argv[optind++];
1777 string unzcmd=getUnpackCmd(left_reads_filename, false);
1778
1779 string right_map_filename;
1780 string right_reads_filename;
1781
1782 if (optind < argc)
1783 {
1784 right_map_filename = argv[optind++];
1785 if(optind >= argc) {
1786 print_usage();
1787 return 1;
1788 }
1789 right_reads_filename=argv[optind++];
1790 }
1791
1792 ifstream ref_stream(ref_file_name.c_str(), ifstream::in);
1793 if (!ref_stream.good())
1794 {
1795 fprintf(stderr, "Error: cannot open %s for reading\n",
1796 ref_file_name.c_str());
1797 exit(1);
1798 }
1799
1800 FILE* junctions_file = fopen(junctions_file_name.c_str(), "w");
1801 if (junctions_file == NULL)
1802 {
1803 fprintf(stderr, "Error: cannot open BED file %s for writing\n",
1804 junctions_file_name.c_str());
1805 exit(1);
1806 }
1807
1808 FILE* insertions_file = fopen(insertions_file_name.c_str(), "w");
1809 if (insertions_file == NULL)
1810 {
1811 fprintf(stderr, "Error: cannot open VCF file %s for writing\n",
1812 insertions_file_name.c_str());
1813 exit(1);
1814 }
1815
1816 FILE* deletions_file = fopen(deletions_file_name.c_str(), "w");
1817 if (deletions_file == NULL)
1818 {
1819 fprintf(stderr, "Error: cannot open VCF file %s for writing\n",
1820 deletions_file_name.c_str());
1821 exit(1);
1822 }
1823
1824 FILE* fusions_file = NULL;
1825 if (fusion_search)
1826 {
1827 fusions_file = fopen(fusions_file_name.c_str(), "w");
1828 if (fusions_file == NULL)
1829 {
1830 fprintf(stderr, "Error: cannot open VCF file %s for writing\n",
1831 fusions_file_name.c_str());
1832 exit(1);
1833 }
1834 }
1835
1836 driver(accepted_hits_file_name,
1837 ref_stream,
1838 left_map_filename,
1839 left_reads_filename,
1840 right_map_filename,
1841 right_reads_filename,
1842 junctions_file,
1843 insertions_file,
1844 deletions_file,
1845 fusions_file);
1846
1847 return 0;
1848 }