ViewVC Help
View File | Revision Log | Show Annotations | View Changeset | Root Listing
root/gclib/tophat_cpp/tophat_reports.cpp
Revision: 159
Committed: Mon Jan 30 22:38:08 2012 UTC (7 years, 8 months ago) by gpertea
File size: 53630 byte(s)
Log Message:
wip

Line File contents
1 /*
2 * tophat_reports.cpp
3 * TopHat
4 *
5 * Created by Cole Trapnell on 11/20/08.
6 * Copyright 2008 Cole Trapnell. All rights reserved.
7 *
8 */
9
10 #ifdef HAVE_CONFIG_H
11 #include <config.h>
12 #else
13 #define PACKAGE_VERSION "INTERNAL"
14 #define SVN_REVISION "XXX"
15 #endif
16
17 #include <cassert>
18 #include <cstdio>
19 #include <cstring>
20 #include <vector>
21 #include <string>
22 #include <map>
23 #include <algorithm>
24 #include <set>
25 #include <stdexcept>
26 #include <iostream>
27 #include <fstream>
28 #include <seqan/sequence.h>
29 #include <seqan/find.h>
30 #include <seqan/file.h>
31 #include <getopt.h>
32
33 #include <boost/thread.hpp>
34
35 #include "common.h"
36 #include "utils.h"
37 #include "bwt_map.h"
38 #include "junctions.h"
39 #include "insertions.h"
40 #include "deletions.h"
41 #include "fusions.h"
42 #include "align_status.h"
43 #include "fragments.h"
44 #include "wiggles.h"
45 #include "tokenize.h"
46 #include "reads.h"
47
48
49 #include "inserts.h"
50
51 using namespace std;
52 using namespace seqan;
53 using std::set;
54
55 // daehwan - this is redundancy, which should be removed.
56 void get_seqs(istream& ref_stream,
57 RefSequenceTable& rt,
58 bool keep_seqs = true)
59 {
60 while(ref_stream.good() && !ref_stream.eof())
61 {
62 RefSequenceTable::Sequence* ref_str = new RefSequenceTable::Sequence();
63 string name;
64 readMeta(ref_stream, name, Fasta());
65 string::size_type space_pos = name.find_first_of(" \t\r");
66 if (space_pos != string::npos)
67 {
68 name.resize(space_pos);
69 }
70 seqan::read(ref_stream, *ref_str, Fasta());
71
72 rt.get_id(name, keep_seqs ? ref_str : NULL, 0);
73 }
74 }
75
76 void read_best_alignments(const HitsForRead& hits_for_read,
77 FragmentAlignmentGrade& best_grade,
78 HitsForRead& best_hits,
79 const JunctionSet& gtf_junctions)
80 {
81 const vector<BowtieHit>& hits = hits_for_read.hits;
82 for (size_t i = 0; i < hits.size(); ++i)
83 {
84 if (hits[i].edit_dist()>max_read_mismatches) continue;
85
86 FragmentAlignmentGrade g(hits[i], gtf_junctions);
87
88 // Is the new status better than the current best one?
89 if (best_grade < g)
90 {
91 if (!report_secondary_alignments)
92 best_hits.hits.clear();
93
94 best_grade = g;
95 best_hits.hits.push_back(hits[i]);
96 }
97 else if (!(g < best_grade)) // is it just as good?
98 {
99 best_grade.num_alignments++;
100 best_hits.hits.push_back(hits[i]);
101 }
102 }
103 }
104
105 void pair_best_alignments(const HitsForRead& left_hits,
106 const HitsForRead& right_hits,
107 InsertAlignmentGrade& best_grade,
108 HitsForRead& left_best_hits,
109 HitsForRead& right_best_hits)
110 {
111 // max mate inner distance (genomic)
112 int min_mate_inner_dist = inner_dist_mean - inner_dist_std_dev;
113 if (max_mate_inner_dist == -1)
114 max_mate_inner_dist = inner_dist_mean + inner_dist_std_dev;
115
116 const vector<BowtieHit>& left = left_hits.hits;
117 const vector<BowtieHit>& right = right_hits.hits;
118
119 for (size_t i = 0; i < left.size(); ++i)
120 {
121 const BowtieHit& lh = left[i];
122 if (lh.edit_dist() > max_read_mismatches) continue;
123 for (size_t j = 0; j < right.size(); ++j)
124 {
125 const BowtieHit& rh = right[j];
126 if (rh.edit_dist() > max_read_mismatches) continue;
127
128 bool fusion = false;
129 if (fusion_search)
130 {
131 bool left_fusion = lh.fusion_opcode() != FUSION_NOTHING;
132 bool right_fusion = rh.fusion_opcode() != FUSION_NOTHING;
133 if (left_fusion && right_fusion)
134 continue;
135
136 fusion = left_fusion || right_fusion;
137 if (!fusion && lh.ref_id() != rh.ref_id())
138 fusion = true;
139
140 if (!fusion && lh.ref_id() == rh.ref_id())
141 {
142 if (lh.antisense_align() == rh.antisense_align())
143 fusion = true;
144 else
145 {
146 int inter_dist = 0;
147 if (lh.antisense_align())
148 inter_dist = lh.left() - rh.right();
149 else
150 inter_dist = rh.left() - lh.right();
151
152 if (inter_dist < -(int)max_insertion_length || inter_dist > (int)fusion_min_dist)
153 fusion = true;
154 }
155 }
156 }
157
158 InsertAlignmentGrade g(lh, rh, min_mate_inner_dist, max_mate_inner_dist, fusion);
159
160 // Is the new status better than the current best one?
161 if (best_grade < g)
162 {
163 if (!report_secondary_alignments)
164 {
165 left_best_hits.hits.clear();
166 right_best_hits.hits.clear();
167 }
168
169 best_grade = g;
170 left_best_hits.hits.push_back(lh);
171 right_best_hits.hits.push_back(rh);
172 }
173 else if (!(g < best_grade))
174 {
175 left_best_hits.hits.push_back(lh);
176 right_best_hits.hits.push_back(rh);
177 best_grade.num_alignments++;
178 }
179 }
180 }
181 }
182
183 enum FragmentType {FRAG_UNPAIRED, FRAG_LEFT, FRAG_RIGHT};
184
185 void add_auxData(vector<string>& auxdata, vector<string>& sam_toks,
186 const RefSequenceTable& rt,const BowtieHit& bh, FragmentType insert_side,
187 int num_hits, const BowtieHit* next_hit, int hitIndex) {
188 bool XS_found = false;
189 if (sam_toks.size()>11) {
190
191 for (size_t i=11;i<sam_toks.size();++i) {
192 if (sam_toks[i].find("NH:i:")==string::npos &&
193 sam_toks[i].find("XS:i:")==string::npos)
194 auxdata.push_back(sam_toks[i]);
195
196 if (sam_toks[i].find("XS:A:")!=string::npos)
197 XS_found = true;
198 }
199 }
200 string aux("NH:i:");
201 str_appendInt(aux, num_hits);
202 auxdata.push_back(aux);
203 if (next_hit) {
204 const char* nh_ref_name = "=";
205 nh_ref_name = rt.get_name(next_hit->ref_id());
206 assert (nh_ref_name != NULL);
207 bool same_contig=(next_hit->ref_id()==bh.ref_id());
208 aux="CC:Z:";
209 aux+= (same_contig ? "=" : nh_ref_name);
210 auxdata.push_back(aux);
211 aux="CP:i:";
212 int nh_gpos=next_hit->left() + 1;
213 str_appendInt(aux, nh_gpos);
214 auxdata.push_back(aux);
215 } //has next_hit
216 // FIXME: this code is still a bit brittle, because it contains no
217 // consistency check that the mates are on opposite strands - a current protocol
218 // requirement, and that the strand indicated by the alignment is consistent
219 // with the orientation of the splices (though that should be handled upstream).
220 if (!XS_found) {
221 const string xs_f("XS:A:+");
222 const string xs_r("XS:A:-");
223 if (library_type == FR_FIRSTSTRAND) {
224 if (insert_side == FRAG_LEFT || insert_side == FRAG_UNPAIRED) {
225 if (bh.antisense_align())
226 auxdata.push_back(xs_f);
227 else
228 auxdata.push_back(xs_r);
229 }
230 else {
231 if (bh.antisense_align())
232 auxdata.push_back(xs_r);
233 else
234 auxdata.push_back(xs_f);
235 }
236 }
237 else if (library_type == FR_SECONDSTRAND) {
238 if (insert_side == FRAG_LEFT || insert_side == FRAG_UNPAIRED){
239 if (bh.antisense_align())
240 auxdata.push_back(xs_r);
241 else
242 auxdata.push_back(xs_f);
243 }
244 else
245 {
246 if (bh.antisense_align())
247 auxdata.push_back(xs_f);
248 else
249 auxdata.push_back(xs_r);
250 }
251 }
252 }
253 if (hitIndex >= 0)
254 {
255 string aux("HI:i:");
256 str_appendInt(aux, hitIndex);
257 auxdata.push_back(aux);
258 }
259 }
260
261 bool rewrite_sam_record(GBamWriter& bam_writer,
262 const RefSequenceTable& rt,
263 const BowtieHit& bh,
264 const char* bwt_buf,
265 const char* read_alt_name,
266 const FragmentAlignmentGrade& grade,
267 FragmentType insert_side,
268 int num_hits,
269 const BowtieHit* next_hit,
270 bool primary,
271 int hitIndex)
272 {
273 // Rewrite this hit, filling in the alt name, mate mapping
274 // and setting the pair flag
275 vector<string> sam_toks;
276 tokenize(bwt_buf, "\t", sam_toks);
277
278 string ref_name = sam_toks[2], ref_name2 = "";
279 char rebuf2[2048] = {0};
280 char cigar1[256] = {0}, cigar2[256] = {0};
281 string left_seq, right_seq, left_qual, right_qual;
282 int left = -1, left1 = -1, left2 = -1;
283 bool fusion_alignment = false;
284 size_t XF_index = 0;
285 for (size_t i = 11; i < sam_toks.size(); ++i)
286 {
287 const string& tok = sam_toks[i];
288 if (strncmp(tok.c_str(), "XF", 2) == 0)
289 {
290 fusion_alignment = true;
291 XF_index = i;
292
293 vector<string> tuple_fields;
294 tokenize(tok.c_str(), " ", tuple_fields);
295 vector<string> contigs;
296 tokenize(tuple_fields[1].c_str(), "-", contigs);
297 if (contigs.size() >= 2)
298 {
299 ref_name = contigs[0];
300 ref_name2 = contigs[1];
301 }
302
303 extract_partial_hits(bh, tuple_fields[4].c_str(), tuple_fields[5].c_str(),
304 cigar1, cigar2, left_seq, right_seq,
305 left_qual, right_qual, left1, left2);
306
307 break;
308 }
309 }
310
311 string qname(read_alt_name);
312 size_t slash_pos=qname.rfind('/');
313 if (slash_pos!=string::npos)
314 qname.resize(slash_pos);
315 //read_alt_name as QNAME
316 int flag=atoi(sam_toks[1].c_str()); //FLAG
317 if (insert_side != FRAG_UNPAIRED) {
318 //flag = atoi(sam_toks[1].c_str());
319 // mark this as a singleton mate
320 flag |= 0x0001;
321 if (insert_side == FRAG_LEFT)
322 flag |= 0x0040;
323 else if (insert_side == FRAG_RIGHT)
324 flag |= 0x0080;
325 flag |= 0x0008;
326 //char flag_buf[64];
327 //sprintf(flag_buf, "%d", flag);
328 //sam_toks[t] = flag_buf;
329 }
330 if (!primary)
331 flag |= 0x100;
332
333 int gpos=isdigit(sam_toks[3][0]) ? atoi(sam_toks[3].c_str()) : 0;
334 int mapQ=255;
335 if (grade.num_alignments > 1) {
336 double err_prob = 1 - (1.0 / grade.num_alignments);
337 mapQ = (int)(-10.0 * log(err_prob) / log(10.0));
338 }
339 int tlen =atoi(sam_toks[8].c_str()); //TLEN
340 int mate_pos=atoi(sam_toks[7].c_str());
341
342 GBamRecord* bamrec=NULL;
343 if (fusion_alignment) {
344 vector<string> auxdata;
345 add_auxData(auxdata, sam_toks, rt, bh, insert_side, num_hits, next_hit, hitIndex);
346 bamrec=bam_writer.new_record(qname.c_str(), flag, ref_name.c_str(), left1 + 1, mapQ,
347 cigar1, sam_toks[6].c_str(), mate_pos,
348 tlen, left_seq.c_str(), left_qual.c_str(), &auxdata);
349 bam_writer.write(bamrec);
350 delete bamrec;
351
352 auxdata.clear();
353 sam_toks[XF_index][5] = '2';
354 add_auxData(auxdata, sam_toks, rt, bh, insert_side, num_hits, next_hit, hitIndex);
355 bamrec=bam_writer.new_record(qname.c_str(), flag, ref_name2.c_str(), left2 + 1, mapQ,
356 cigar2, sam_toks[6].c_str(), mate_pos,
357 tlen, right_seq.c_str(), right_qual.c_str(), &auxdata);
358 bam_writer.write(bamrec);
359 delete bamrec;
360 } else {
361 vector<string> auxdata;
362 add_auxData(auxdata, sam_toks, rt, bh, insert_side, num_hits, next_hit, hitIndex);
363 bamrec=bam_writer.new_record(qname.c_str(), flag, sam_toks[2].c_str(), gpos, mapQ,
364 sam_toks[5].c_str(), sam_toks[6].c_str(), mate_pos,
365 tlen, sam_toks[9].c_str(), sam_toks[10].c_str(), &auxdata);
366 bam_writer.write(bamrec);
367 delete bamrec;
368 }
369
370 return true;
371 }
372
373 bool rewrite_sam_record(GBamWriter& bam_writer,
374 const RefSequenceTable& rt,
375 const BowtieHit& bh,
376 const char* bwt_buf,
377 const char* read_alt_name,
378 const InsertAlignmentGrade& grade,
379 FragmentType insert_side,
380 const BowtieHit* partner,
381 int num_hits,
382 const BowtieHit* next_hit,
383 bool primary,
384 int hitIndex)
385 {
386 // Rewrite this hit, filling in the alt name, mate mapping
387 // and setting the pair flag
388 vector<string> sam_toks;
389 tokenize(bwt_buf, "\t", sam_toks);
390 string qname(read_alt_name);
391 size_t slash_pos=qname.rfind('/');
392 if (slash_pos!=string::npos)
393 qname.resize(slash_pos);
394 //read_alt_name as QNAME
395 int flag = atoi(sam_toks[1].c_str());
396 // 0x0010 (strand of query) is assumed to be set correctly
397 // to begin with
398 flag |= 0x0001; //it must be paired
399 if (insert_side == FRAG_LEFT)
400 flag |= 0x0040;
401 else if (insert_side == FRAG_RIGHT)
402 flag |= 0x0080;
403 if (!primary)
404 flag |= 0x100;
405
406 string ref_name = sam_toks[2], ref_name2 = "";
407 char rebuf2[2048] = {0};
408 char cigar1[256] = {0}, cigar2[256] = {0};
409 string left_seq, right_seq, left_qual, right_qual;
410 int left = -1, left1 = -1, left2 = -1;
411 bool fusion_alignment = false;
412 size_t XF_tok_idx = 11;
413 for (; XF_tok_idx < sam_toks.size(); ++XF_tok_idx)
414 {
415 const string& tok = sam_toks[XF_tok_idx];
416 if (strncmp(tok.c_str(), "XF", 2) == 0)
417 {
418 fusion_alignment = true;
419
420 vector<string> tuple_fields;
421 tokenize(tok.c_str(), " ", tuple_fields);
422 vector<string> contigs;
423 tokenize(tuple_fields[1].c_str(), "-", contigs);
424 if (contigs.size() >= 2)
425 {
426 ref_name = contigs[0];
427 ref_name2 = contigs[1];
428 }
429
430 extract_partial_hits(bh, tuple_fields[4].c_str(), tuple_fields[5].c_str(),
431 cigar1, cigar2, left_seq, right_seq,
432 left_qual, right_qual, left1, left2);
433
434 break;
435 }
436 }
437
438 int gpos=isdigit(sam_toks[3][0]) ? atoi(sam_toks[3].c_str()) : 0;
439 int mapQ=255;
440 if (grade.num_alignments > 1) {
441 double err_prob = 1 - (1.0 / grade.num_alignments);
442 mapQ = (int)(-10.0 * log(err_prob) / log(10.0));
443 }
444 int tlen=0; //TLEN
445 int mate_pos=atoi(sam_toks[7].c_str());
446 if (partner) {
447 if (partner->ref_id()==bh.ref_id()) {
448 sam_toks[6] = "="; //same chromosome
449 //TLEN:
450 tlen = bh.left() < partner->left() ? partner->right() - bh.left() :
451 partner->left() - bh.right();
452 }
453
454 else { //partner on different chromosome/contig
455 sam_toks[6] = rt.get_name(partner->ref_id());
456 }
457 mate_pos = partner->left() + 1;
458 if (grade.happy())
459 flag |= 0x0002;
460 if (partner->antisense_align())
461 flag |= 0x0020;
462
463 if (partner->fusion_opcode() != FUSION_NOTHING)
464 {
465 char partner_pos[1024];
466 sprintf(partner_pos, "XP:Z:%s-%s %d", rt.get_name(partner->ref_id()), rt.get_name(partner->ref_id2()), partner->left() + 1);
467 sam_toks.push_back(partner_pos);
468 }
469 }
470 else {
471 sam_toks[6] = "*";
472 mate_pos = 0;
473 flag |= 0x0008;
474 }
475
476 GBamRecord* bamrec=NULL;
477 if (fusion_alignment) {
478 vector<string> auxdata;
479 add_auxData(auxdata, sam_toks, rt, bh, insert_side, num_hits, next_hit, hitIndex);
480 bamrec=bam_writer.new_record(qname.c_str(), flag, ref_name.c_str(), left1 + 1, mapQ,
481 cigar1, sam_toks[6].c_str(), mate_pos,
482 tlen, left_seq.c_str(), left_qual.c_str(), &auxdata);
483 bam_writer.write(bamrec);
484 delete bamrec;
485
486 auxdata.clear();
487 sam_toks[XF_tok_idx][5] = '2';
488 add_auxData(auxdata, sam_toks, rt, bh, insert_side, num_hits, next_hit, hitIndex);
489 bamrec=bam_writer.new_record(qname.c_str(), flag, ref_name2.c_str(), left2 + 1, mapQ,
490 cigar2, sam_toks[6].c_str(), mate_pos,
491 tlen, right_seq.c_str(), right_qual.c_str(), &auxdata);
492 bam_writer.write(bamrec);
493 delete bamrec;
494 } else {
495 vector<string> auxdata;
496 add_auxData(auxdata, sam_toks, rt, bh, insert_side, num_hits, next_hit, hitIndex);
497 bamrec=bam_writer.new_record(qname.c_str(), flag, sam_toks[2].c_str(), gpos, mapQ,
498 sam_toks[5].c_str(), sam_toks[6].c_str(), mate_pos,
499 tlen, sam_toks[9].c_str(), sam_toks[10].c_str(), &auxdata);
500 bam_writer.write(bamrec);
501 delete bamrec;
502 }
503
504 return true;
505 }
506
507 struct lex_hit_sort
508 {
509 lex_hit_sort(const RefSequenceTable& rt, const HitsForRead& hits)
510 : _rt(rt), _hits(hits)
511 {}
512
513 bool operator()(const uint32_t& l, const uint32_t& r) const
514 {
515 const BowtieHit& lhs = _hits.hits[l];
516 const BowtieHit& rhs = _hits.hits[r];
517
518 uint32_t l_id = lhs.ref_id();
519 uint32_t r_id = rhs.ref_id();
520 if (l_id != r_id)
521 return l_id < r_id;
522
523 return lhs.left() < rhs.left();
524 }
525
526 const RefSequenceTable& _rt;
527 const HitsForRead& _hits;
528 };
529
530 void print_sam_for_single(const RefSequenceTable& rt,
531 const HitsForRead& hits,
532 const FragmentAlignmentGrade& grade,
533 FragmentType frag_type,
534 Read& read,
535 GBamWriter& bam_writer,
536 FILE* um_out //write unmapped reads here
537 )
538 {
539 assert(!read.alt_name.empty());
540 lex_hit_sort s(rt, hits);
541 vector<uint32_t> index_vector;
542
543 size_t i;
544 for (i = 0; i < hits.hits.size(); ++i)
545 index_vector.push_back(i);
546
547 sort(index_vector.begin(), index_vector.end(), s);
548
549 size_t primaryHit = (hits.hits.empty()? 0: random() % hits.hits.size());
550 bool multipleHits = (hits.hits.size() > 1);
551 for (i = 0; i < hits.hits.size(); ++i)
552 {
553 bool primary = (i == primaryHit);
554 size_t index = index_vector[i];
555 const BowtieHit& bh = hits.hits[index];
556 rewrite_sam_record(bam_writer, rt,
557 bh,
558 bh.hitfile_rec().c_str(),
559 read.alt_name.c_str(),
560 grade,
561 frag_type,
562 hits.hits.size(),
563 (i < hits.hits.size()-1) ? &(hits.hits[index_vector[i+1]]) : NULL,
564 primary,
565 (multipleHits? i: -1));
566 }
567 }
568
569 void print_sam_for_pair(const RefSequenceTable& rt,
570 const HitsForRead& left_hits,
571 const HitsForRead& right_hits,
572 const InsertAlignmentGrade& grade,
573 ReadStream& left_reads_file,
574 ReadStream& right_reads_file,
575 GBamWriter& bam_writer,
576 FILE* left_um_out,
577 FILE* right_um_out,
578 uint64_t begin_id = 0,
579 uint64_t end_id = std::numeric_limits<uint64_t>::max())
580 {
581 assert (left_hits.insert_id == right_hits.insert_id);
582
583 Read left_read;
584 Read right_read;
585 assert (left_hits.hits.size() == right_hits.hits.size() ||
586 (left_hits.hits.empty() || right_hits.hits.empty()));
587
588 size_t primaryHit = 0;
589 vector<uint32_t> index_vector;
590 if(right_hits.hits.size() > 0)
591 {
592 lex_hit_sort s(rt, right_hits);
593 for (size_t i = 0; i < right_hits.hits.size(); ++i)
594 index_vector.push_back(i);
595
596 sort(index_vector.begin(), index_vector.end(), s);
597 primaryHit = random() % right_hits.hits.size();
598 }
599 else if (left_hits.hits.size() > 0)
600 {
601 lex_hit_sort s(rt, left_hits);
602 for (size_t i = 0; i < left_hits.hits.size(); ++i)
603 index_vector.push_back(i);
604
605 sort(index_vector.begin(), index_vector.end(), s);
606 primaryHit = random() % left_hits.hits.size();
607 }
608
609 bool got_left_read = left_reads_file.getRead(left_hits.insert_id, left_read,
610 reads_format, false, begin_id, end_id,
611 left_um_out, false);
612
613 bool got_right_read = right_reads_file.getRead(right_hits.insert_id, right_read,
614 reads_format, false, begin_id, end_id,
615 right_um_out, false);
616
617 if (left_hits.hits.size() == right_hits.hits.size())
618 {
619 assert (got_left_read && got_right_read);
620 bool multipleHits = (left_hits.hits.size() > 1);
621 for (size_t i = 0; i < right_hits.hits.size(); ++i)
622 {
623 bool primary = (i == primaryHit);
624 size_t index = index_vector[i];
625 const BowtieHit& right_bh = right_hits.hits[index];
626 const BowtieHit& left_bh = left_hits.hits[index];
627
628 rewrite_sam_record(bam_writer, rt,
629 right_bh,
630 right_bh.hitfile_rec().c_str(),
631 right_read.alt_name.c_str(),
632 grade,
633 FRAG_RIGHT,
634 &left_bh,
635 right_hits.hits.size(),
636 (i < right_hits.hits.size() - 1) ? &(right_hits.hits[index_vector[i+1]]) : NULL,
637 primary,
638 (multipleHits? i: -1));
639 rewrite_sam_record(bam_writer, rt,
640 left_bh,
641 left_bh.hitfile_rec().c_str(),
642 left_read.alt_name.c_str(),
643 grade,
644 FRAG_LEFT,
645 &right_bh,
646 left_hits.hits.size(),
647 (i < left_hits.hits.size() - 1) ? &(left_hits.hits[index_vector[i+1]]) : NULL,
648 primary,
649 (multipleHits? i: -1));
650 }
651 }
652 else if (left_hits.hits.empty())
653 { //only right read was mapped properly
654 if (right_um_out) {
655 //write it in the mapped file with the #MAPPED# flag
656 fprintf(right_um_out, "@%s #MAPPED#\n%s\n+\n%s\n", right_read.alt_name.c_str(),
657 right_read.seq.c_str(), right_read.qual.c_str());
658 }
659 for (size_t i = 0; i < right_hits.hits.size(); ++i)
660 {
661 bool primary = (i == primaryHit);
662 size_t index = index_vector[i];
663 const BowtieHit& bh = right_hits.hits[index];
664
665 rewrite_sam_record(bam_writer, rt,
666 bh,
667 bh.hitfile_rec().c_str(),
668 right_read.alt_name.c_str(),
669 grade,
670 FRAG_RIGHT,
671 NULL,
672 right_hits.hits.size(),
673 (i < right_hits.hits.size() - 1) ? &(right_hits.hits[index_vector[i+1]]) : NULL,
674 primary,
675 -1);
676
677 }
678 }
679 else if (right_hits.hits.empty())
680 { //only left read was mapped properly
681 if (left_um_out) {
682 //write it in the mapped file with the #MAPPED# flag
683 fprintf(left_um_out, "@%s #MAPPED#\n%s\n+\n%s\n", left_read.alt_name.c_str(), left_read.seq.c_str(),
684 left_read.qual.c_str());
685 }
686
687 for (size_t i = 0; i < left_hits.hits.size(); ++i)
688 {
689 bool primary = (i == primaryHit);
690 size_t index = index_vector[i];
691 const BowtieHit& bh = left_hits.hits[index];
692 rewrite_sam_record(bam_writer, rt,
693 bh,
694 bh.hitfile_rec().c_str(),
695 left_read.alt_name.c_str(),
696 grade,
697 FRAG_LEFT,
698 NULL,
699 left_hits.hits.size(),
700 (i < left_hits.hits.size() - 1) ? &(left_hits.hits[index_vector[i+1]]) : NULL,
701 primary,
702 -1);
703
704 }
705 }
706 else
707 {
708 assert (false);
709 }
710 }
711
712 /**
713 * Given all of the hits for a particular read, update the read counts for insertions and deletions.
714 * @param hits hits The alignments for a particular read
715 * @param insertions Maps from an insertion to the number of supporting reads for that insertion
716 * @param deletions Maps from a deletion to the number of supporting reads for that deletion
717 */
718 void update_insertions_and_deletions(const HitsForRead& hits,
719 InsertionSet& insertions,
720 DeletionSet& deletions)
721 {
722 for (size_t i = 0; i < hits.hits.size(); ++i)
723 {
724 const BowtieHit& bh = hits.hits[i];
725 insertions_from_alignment(bh, insertions);
726 deletions_from_alignment(bh, deletions);
727 }
728 }
729
730
731 FusionSet empty_fusions;
732 void update_fusions(const HitsForRead& hits,
733 RefSequenceTable& rt,
734 FusionSet& fusions,
735 FusionSet& fusions_ref = empty_fusions)
736 {
737 if (hits.hits.size() > fusion_multireads)
738 return;
739
740 bool update_stat = fusions_ref.size() > 0;
741 for (size_t i = 0; i < hits.hits.size(); ++i)
742 {
743 const BowtieHit& bh = hits.hits[i];
744
745 if (bh.edit_dist() > fusion_read_mismatches)
746 continue;
747
748 // daehwan
749 #if 0
750 vector<Fusion> new_fusions;
751 fusions_from_spliced_hit(bh, new_fusions);
752
753 for(size_t i = 0; i < new_fusions.size(); ++i)
754 {
755 Fusion fusion = new_fusions[i];
756 if (fusion.left == 110343414 && fusion.right == 80211173 && hits.hits.size() > 1)
757 {
758 for (size_t t = 0; t < hits.hits.size(); ++t)
759 {
760 const BowtieHit& lh = hits.hits[t];
761 cout << "insert id: " << lh.insert_id() << endl;
762 cout << (lh.antisense_align() ? "-" : "+") << endl;
763 cout << lh.ref_id() << ": " << lh.left() << "-" << lh.right() << endl;
764 cout << lh.ref_id2() << ": " << print_cigar(lh.cigar()) << endl;
765 }
766 }
767 }
768 #endif
769
770 fusions_from_alignment(bh, fusions, rt, update_stat);
771
772 if (update_stat)
773 unsupport_fusions(bh, fusions, fusions_ref);
774 }
775 }
776
777 void update_junctions(const HitsForRead& hits,
778 JunctionSet& junctions)
779 {
780 for (size_t i = 0; i < hits.hits.size(); ++i)
781 {
782 const BowtieHit& bh = hits.hits[i];
783 junctions_from_alignment(bh, junctions);
784 }
785 }
786
787 // Extracts junctions from all the SAM hits (based on REF_SKIPs) in the hit file
788 // resets the stream when finished.
789 void exclude_hits_on_filtered_junctions(const JunctionSet& junctions, HitsForRead& hits)
790 {
791 HitsForRead remaining;
792 remaining.insert_id = hits.insert_id;
793
794 for (size_t i = 0; i < hits.hits.size(); ++i)
795 {
796 BowtieHit& bh = hits.hits[i];
797 bool filter_hit = false;
798 if (!bh.contiguous())
799 {
800 JunctionSet bh_juncs;
801 junctions_from_alignment(bh, bh_juncs);
802 for (JunctionSet::iterator itr = bh_juncs.begin();
803 itr != bh_juncs.end();
804 itr++)
805 {
806 const Junction& j = itr->first;
807 JunctionSet::const_iterator target = junctions.find(j);
808 if (target == junctions.end() || !target->second.accepted)
809 {
810 filter_hit = true;
811 break;
812 }
813 }
814 }
815 if (!filter_hit)
816 remaining.hits.push_back(bh);
817 }
818 hits = remaining;
819 }
820
821 // events include splice junction, indels, and fusion points.
822 struct ConsensusEventsWorker
823 {
824 void operator()()
825 {
826 ReadTable it;
827 vector<BAMHitFactory*> hit_factories;
828 hit_factories.push_back(new BAMHitFactory(it, *rt));
829 HitStream l_hs(left_map_fname, hit_factories.back(), false, true, true, true);
830 if (left_map_offset > 0)
831 l_hs.seek(left_map_offset);
832
833 hit_factories.push_back(new BAMHitFactory(it, *rt));
834 HitStream r_hs(right_map_fname, hit_factories.back(), false, true, true, true);
835 if (right_map_offset > 0)
836 r_hs.seek(right_map_offset);
837
838 HitsForRead curr_left_hit_group;
839 HitsForRead curr_right_hit_group;
840
841 l_hs.next_read_hits(curr_left_hit_group);
842 r_hs.next_read_hits(curr_right_hit_group);
843
844 uint32_t curr_left_obs_order = it.observation_order(curr_left_hit_group.insert_id);
845 uint32_t curr_right_obs_order = it.observation_order(curr_right_hit_group.insert_id);
846
847 // While we still have unreported hits...
848 while((curr_left_obs_order != VMAXINT32 || curr_right_obs_order != VMAXINT32) &&
849 (curr_left_obs_order < end_id || curr_right_obs_order < end_id))
850 {
851 // Chew up left singletons
852 while (curr_left_obs_order < curr_right_obs_order &&
853 curr_left_obs_order < end_id &&
854 curr_left_obs_order != VMAXINT32)
855 {
856 HitsForRead best_hits;
857 best_hits.insert_id = curr_left_obs_order;
858 FragmentAlignmentGrade grade;
859
860 // Process hits for left singleton, select best alignments
861 read_best_alignments(curr_left_hit_group, grade, best_hits, *gtf_junctions);
862 update_junctions(best_hits, *junctions);
863 update_fusions(best_hits, *rt, *fusions);
864
865 // Get next hit group
866 l_hs.next_read_hits(curr_left_hit_group);
867 curr_left_obs_order = it.observation_order(curr_left_hit_group.insert_id);
868 }
869
870 // Chew up right singletons
871 while (curr_left_obs_order > curr_right_obs_order &&
872 curr_right_obs_order < end_id &&
873 curr_right_obs_order != VMAXINT32)
874 {
875 HitsForRead best_hits;
876 best_hits.insert_id = curr_right_obs_order;
877 if (curr_right_obs_order >= begin_id)
878 {
879 FragmentAlignmentGrade grade;
880 // Process hit for right singleton, select best alignments
881 read_best_alignments(curr_right_hit_group,grade, best_hits, *gtf_junctions);
882 update_junctions(best_hits, *junctions);
883 update_fusions(best_hits, *rt, *fusions);
884 }
885
886 // Get next hit group
887 r_hs.next_read_hits(curr_right_hit_group);
888 curr_right_obs_order = it.observation_order(curr_right_hit_group.insert_id);
889 }
890
891 // Since we have both left hits and right hits for this insert,
892 // Find the best pairing and print both
893 while (curr_left_obs_order == curr_right_obs_order &&
894 curr_left_obs_order < end_id &&
895 curr_left_obs_order != VMAXINT32)
896 {
897 if (curr_left_hit_group.hits.empty())
898 {
899 HitsForRead right_best_hits;
900 right_best_hits.insert_id = curr_right_obs_order;
901
902 FragmentAlignmentGrade grade;
903 read_best_alignments(curr_right_hit_group, grade, right_best_hits, *gtf_junctions);
904 update_junctions(right_best_hits, *junctions);
905 update_fusions(right_best_hits, *rt, *fusions);
906 }
907 else if (curr_right_hit_group.hits.empty())
908 {
909 HitsForRead left_best_hits;
910 left_best_hits.insert_id = curr_left_obs_order;
911
912 FragmentAlignmentGrade grade;
913 // Process hits for left singleton, select best alignments
914 read_best_alignments(curr_left_hit_group, grade, left_best_hits, *gtf_junctions);
915 update_junctions(left_best_hits, *junctions);
916 update_fusions(left_best_hits, *rt, *fusions);
917 }
918 else
919 {
920 HitsForRead left_best_hits;
921 HitsForRead right_best_hits;
922 left_best_hits.insert_id = curr_left_obs_order;
923 right_best_hits.insert_id = curr_right_obs_order;
924
925 // daehwan - apply gtf_junctions here, too!
926
927 InsertAlignmentGrade grade;
928 pair_best_alignments(curr_left_hit_group,
929 curr_right_hit_group,
930 grade,
931 left_best_hits,
932 right_best_hits);
933 update_junctions(left_best_hits, *junctions);
934 update_junctions(right_best_hits, *junctions);
935 update_fusions(left_best_hits, *rt, *fusions);
936 update_fusions(right_best_hits, *rt, *fusions);
937 }
938
939 l_hs.next_read_hits(curr_left_hit_group);
940 curr_left_obs_order = it.observation_order(curr_left_hit_group.insert_id);
941
942 r_hs.next_read_hits(curr_right_hit_group);
943 curr_right_obs_order = it.observation_order(curr_right_hit_group.insert_id);
944 }
945 }
946
947 for (size_t i = 0; i < hit_factories.size(); ++i)
948 delete hit_factories[i];
949
950 hit_factories.clear();
951 }
952
953 string left_map_fname;
954 string right_map_fname;
955
956 JunctionSet* junctions;
957 JunctionSet* gtf_junctions;
958 FusionSet* fusions;
959 RefSequenceTable* rt;
960
961 uint64_t begin_id;
962 uint64_t end_id;
963 int64_t left_map_offset;
964 int64_t right_map_offset;
965 };
966
967 struct ReportWorker
968 {
969 void operator()()
970 {
971 ReadTable it;
972 GBamWriter bam_writer(bam_output_fname.c_str(), sam_header.c_str());
973
974 ReadStream left_reads_file(left_reads_fname);
975 if (left_reads_file.file() == NULL)
976 err_die("Error: cannot open %s for reading\n", left_reads_fname.c_str());
977
978 if (left_reads_offset > 0)
979 left_reads_file.seek(left_reads_offset);
980
981 if (!zpacker.empty()) left_um_fname+=".z";
982 FZPipe left_um_out;
983 if (left_um_out.openWrite(left_um_fname.c_str(), zpacker)==NULL)
984 err_die("Error: cannot open file %s for writing!\n",left_um_fname.c_str());
985
986 ReadStream right_reads_file(right_reads_fname);
987 if (right_reads_offset > 0)
988 right_reads_file.seek(right_reads_offset);
989
990 FZPipe right_um_out;
991 if (right_reads_fname != "")
992 {
993 if (!zpacker.empty()) right_um_fname+=".z";
994 if (right_um_out.openWrite(right_um_fname.c_str(), zpacker)==NULL)
995 err_die("Error: cannot open file %s for writing!\n",right_um_fname.c_str());
996 }
997
998 vector<BAMHitFactory*> hit_factories;
999 hit_factories.push_back(new BAMHitFactory(it, *rt));
1000 HitStream left_hs(left_map_fname, hit_factories.back(), false, true, true, true);
1001 if (left_map_offset > 0)
1002 left_hs.seek(left_map_offset);
1003
1004 hit_factories.push_back(new BAMHitFactory(it, *rt));
1005 HitStream right_hs(right_map_fname, hit_factories.back(), false, true, true, true);
1006 if (right_map_offset > 0)
1007 right_hs.seek(right_map_offset);
1008
1009 HitsForRead curr_left_hit_group;
1010 HitsForRead curr_right_hit_group;
1011
1012 left_hs.next_read_hits(curr_left_hit_group);
1013 right_hs.next_read_hits(curr_right_hit_group);
1014
1015 uint64_t curr_left_obs_order = it.observation_order(curr_left_hit_group.insert_id);
1016 uint64_t curr_right_obs_order = it.observation_order(curr_right_hit_group.insert_id);
1017
1018 // While we still have unreported hits...
1019 Read l_read;
1020 Read r_read;
1021 while((curr_left_obs_order != VMAXINT32 || curr_right_obs_order != VMAXINT32) &&
1022 (curr_left_obs_order < end_id || curr_right_obs_order < end_id))
1023 {
1024 // Chew up left singletons (pairs with right reads unmapped)
1025 while (curr_left_obs_order < curr_right_obs_order &&
1026 curr_left_obs_order < end_id &&
1027 curr_left_obs_order != VMAXINT32)
1028 {
1029 HitsForRead best_hits;
1030 best_hits.insert_id = curr_left_obs_order;
1031 FragmentAlignmentGrade grade;
1032 bool got_read=left_reads_file.getRead(curr_left_obs_order, l_read,
1033 reads_format, false, begin_id, end_id,
1034 left_um_out.file, false);
1035 assert(got_read);
1036
1037 if (right_reads_file.file()) {
1038 fprintf(left_um_out.file, "@%s #MAPPED#\n%s\n+\n%s\n", l_read.alt_name.c_str(),
1039 l_read.seq.c_str(), l_read.qual.c_str());
1040 got_read=right_reads_file.getRead(curr_left_obs_order, r_read,
1041 reads_format, false, begin_id, end_id,
1042 right_um_out.file, true);
1043 assert(got_read);
1044 }
1045
1046 exclude_hits_on_filtered_junctions(*junctions, curr_left_hit_group);
1047
1048 // Process hits for left singleton, select best alignments
1049 read_best_alignments(curr_left_hit_group, grade, best_hits, *gtf_junctions);
1050
1051 if (best_hits.hits.size()>0 && best_hits.hits.size() <= max_multihits)
1052 {
1053 update_junctions(best_hits, *final_junctions);
1054 update_insertions_and_deletions(best_hits, *final_insertions, *final_deletions);
1055 update_fusions(best_hits, *rt, *final_fusions, *fusions);
1056
1057 print_sam_for_single(*rt,
1058 best_hits,
1059 grade,
1060 (right_map_fname.empty() ? FRAG_UNPAIRED : FRAG_LEFT),
1061 l_read,
1062 bam_writer,
1063 left_um_out.file);
1064 }
1065 // Get next hit group
1066 left_hs.next_read_hits(curr_left_hit_group);
1067 curr_left_obs_order = it.observation_order(curr_left_hit_group.insert_id);
1068 } //left singletons
1069
1070 // Chew up right singletons
1071 while (curr_left_obs_order > curr_right_obs_order &&
1072 curr_right_obs_order < end_id &&
1073 curr_right_obs_order != VMAXINT32)
1074 {
1075 HitsForRead best_hits;
1076 best_hits.insert_id = curr_right_obs_order;
1077 FragmentAlignmentGrade grade;
1078
1079 if (curr_right_obs_order >= begin_id)
1080 {
1081 bool got_read=right_reads_file.getRead(curr_right_obs_order, r_read,
1082 reads_format, false, begin_id, end_id,
1083 right_um_out.file, false);
1084
1085 assert(got_read);
1086
1087 fprintf(right_um_out.file, "@%s #MAPPED#\n%s\n+\n%s\n", r_read.alt_name.c_str(),
1088 r_read.seq.c_str(), r_read.qual.c_str());
1089 got_read=left_reads_file.getRead(curr_right_obs_order, l_read,
1090 reads_format, false, begin_id, end_id,
1091 left_um_out.file, true);
1092 assert(got_read);
1093
1094 exclude_hits_on_filtered_junctions(*junctions, curr_right_hit_group);
1095
1096 // Process hit for right singleton, select best alignments
1097 read_best_alignments(curr_right_hit_group, grade, best_hits, *gtf_junctions);
1098 if (best_hits.hits.size()>0 && best_hits.hits.size() <= max_multihits)
1099 {
1100 update_junctions(best_hits, *final_junctions);
1101 update_insertions_and_deletions(best_hits, *final_insertions, *final_deletions);
1102 update_fusions(best_hits, *rt, *final_fusions, *fusions);
1103
1104 print_sam_for_single(*rt,
1105 best_hits,
1106 grade,
1107 FRAG_RIGHT,
1108 r_read,
1109 bam_writer,
1110 right_um_out.file);
1111 }
1112 }
1113
1114 // Get next hit group
1115 right_hs.next_read_hits(curr_right_hit_group);
1116 curr_right_obs_order = it.observation_order(curr_right_hit_group.insert_id);
1117 }
1118
1119 // Since we have both left hits and right hits for this insert,
1120 // Find the best pairing and print both
1121 while (curr_left_obs_order == curr_right_obs_order &&
1122 curr_left_obs_order < end_id &&
1123 curr_left_obs_order != VMAXINT32)
1124 {
1125 exclude_hits_on_filtered_junctions(*junctions, curr_left_hit_group);
1126 exclude_hits_on_filtered_junctions(*junctions, curr_right_hit_group);
1127 if (curr_left_hit_group.hits.empty())
1128 { //only right read mapped
1129 //write it in the mapped file with the #MAPPED# flag
1130
1131 bool got_read=right_reads_file.getRead(curr_left_obs_order, r_read,
1132 reads_format, false, begin_id, end_id,
1133 right_um_out.file, false);
1134 assert(got_read);
1135 fprintf(right_um_out.file, "@%s #MAPPED#\n%s\n+\n%s\n", r_read.alt_name.c_str(),
1136 r_read.seq.c_str(), r_read.qual.c_str());
1137 HitsForRead right_best_hits;
1138 right_best_hits.insert_id = curr_right_obs_order;
1139
1140 FragmentAlignmentGrade grade;
1141 read_best_alignments(curr_right_hit_group, grade, right_best_hits, *gtf_junctions);
1142
1143 if (right_best_hits.hits.size()>0 && right_best_hits.hits.size() <= max_multihits)
1144 {
1145 update_junctions(right_best_hits, *final_junctions);
1146 update_insertions_and_deletions(right_best_hits, *final_insertions, *final_deletions);
1147 update_fusions(right_best_hits, *rt, *final_fusions, *fusions);
1148
1149 print_sam_for_single(*rt,
1150 right_best_hits,
1151 grade,
1152 FRAG_RIGHT,
1153 r_read,
1154 bam_writer,
1155 right_um_out.file);
1156 }
1157 }
1158 else if (curr_right_hit_group.hits.empty())
1159 {
1160 HitsForRead left_best_hits;
1161 left_best_hits.insert_id = curr_left_obs_order;
1162 //only left read mapped
1163 bool got_read=left_reads_file.getRead(curr_left_obs_order, l_read,
1164 reads_format, false, begin_id, end_id,
1165 left_um_out.file, false);
1166 assert(got_read);
1167 fprintf(left_um_out.file, "@%s #MAPPED#\n%s\n+\n%s\n", l_read.alt_name.c_str(),
1168 l_read.seq.c_str(), l_read.qual.c_str());
1169 FragmentAlignmentGrade grade;
1170 // Process hits for left singleton, select best alignments
1171 read_best_alignments(curr_left_hit_group, grade, left_best_hits, *gtf_junctions);
1172 if (left_best_hits.hits.size()>0 && left_best_hits.hits.size() <= max_multihits)
1173 {
1174 update_junctions(left_best_hits, *final_junctions);
1175 update_insertions_and_deletions(left_best_hits, *final_insertions, *final_deletions);
1176 update_fusions(left_best_hits, *rt, *final_fusions, *fusions);
1177
1178 print_sam_for_single(*rt,
1179 left_best_hits,
1180 grade,
1181 FRAG_LEFT,
1182 l_read,
1183 bam_writer,
1184 left_um_out.file);
1185 }
1186 }
1187 else
1188 { //hits for both left and right reads
1189 HitsForRead left_best_hits;
1190 HitsForRead right_best_hits;
1191 left_best_hits.insert_id = curr_left_obs_order;
1192 right_best_hits.insert_id = curr_right_obs_order;
1193
1194 InsertAlignmentGrade grade;
1195 pair_best_alignments(curr_left_hit_group,
1196 curr_right_hit_group,
1197 grade,
1198 left_best_hits,
1199 right_best_hits);
1200
1201 if (left_best_hits.hits.size()>0 && left_best_hits.hits.size() <= max_multihits &&
1202 right_best_hits.hits.size()>0 && right_best_hits.hits.size() <= max_multihits)
1203 {
1204 update_junctions(left_best_hits, *final_junctions);
1205 update_junctions(right_best_hits, *final_junctions);
1206 update_insertions_and_deletions(left_best_hits, *final_insertions, *final_deletions);
1207 update_insertions_and_deletions(right_best_hits, *final_insertions, *final_deletions);
1208
1209 pair_support(left_best_hits, right_best_hits, *final_fusions, *fusions);
1210 update_fusions(left_best_hits, *rt, *final_fusions, *fusions);
1211 update_fusions(right_best_hits, *rt, *final_fusions, *fusions);
1212
1213 print_sam_for_pair(*rt,
1214 left_best_hits,
1215 right_best_hits,
1216 grade,
1217 left_reads_file,
1218 right_reads_file,
1219 bam_writer,
1220 left_um_out.file,
1221 right_um_out.file,
1222 begin_id,
1223 end_id);
1224 }
1225 }
1226
1227 left_hs.next_read_hits(curr_left_hit_group);
1228 curr_left_obs_order = it.observation_order(curr_left_hit_group.insert_id);
1229
1230 right_hs.next_read_hits(curr_right_hit_group);
1231 curr_right_obs_order = it.observation_order(curr_right_hit_group.insert_id);
1232 }
1233 } //while we still have unreported hits..
1234 //print the remaining unmapped reads at the end of each reads' stream
1235 left_reads_file.getRead(VMAXINT32, l_read,
1236 reads_format, false, begin_id, end_id,
1237 left_um_out.file, false);
1238 if (right_reads_file.file())
1239 right_reads_file.getRead(VMAXINT32, r_read,
1240 reads_format, false, begin_id, end_id,
1241 right_um_out.file, false);
1242
1243 left_um_out.close();
1244 right_um_out.close();
1245
1246 for (size_t i = 0; i < hit_factories.size(); ++i)
1247 delete hit_factories[i];
1248
1249 hit_factories.clear();
1250 }
1251
1252 string bam_output_fname;
1253 string sam_header_fname;
1254
1255 string left_reads_fname;
1256 string left_map_fname;
1257 string right_reads_fname;
1258 string right_map_fname;
1259
1260 string left_um_fname;
1261 string right_um_fname;
1262
1263 JunctionSet* gtf_junctions;
1264 JunctionSet* junctions;
1265 FusionSet* fusions;
1266 JunctionSet* final_junctions;
1267 InsertionSet* final_insertions;
1268 DeletionSet* final_deletions;
1269 FusionSet* final_fusions;
1270
1271 RefSequenceTable* rt;
1272
1273 uint64_t begin_id;
1274 uint64_t end_id;
1275 int64_t left_reads_offset;
1276 int64_t left_map_offset;
1277 int64_t right_reads_offset;
1278 int64_t right_map_offset;
1279 };
1280
1281 void driver(const string& bam_output_fname,
1282 istream& ref_stream,
1283 const string& left_map_fname,
1284 const string& left_reads_fname,
1285 const string& right_map_fname,
1286 const string& right_reads_fname,
1287 FILE* junctions_out,
1288 FILE* insertions_out,
1289 FILE* deletions_out,
1290 FILE* fusions_out)
1291 {
1292 if (!parallel)
1293 num_threads = 1;
1294
1295 RefSequenceTable rt(sam_header, true);
1296
1297 if (fusion_search)
1298 get_seqs(ref_stream, rt, true);
1299
1300 srandom(1);
1301
1302 JunctionSet gtf_junctions;
1303 if (!gtf_juncs.empty())
1304 {
1305 char splice_buf[2048];
1306 FILE* splice_coords = fopen(gtf_juncs.c_str(), "r");
1307 if (splice_coords)
1308 {
1309 while (fgets(splice_buf, sizeof(splice_buf), splice_coords))
1310 {
1311 char* nl = strrchr(splice_buf, '\n');
1312 char* buf = splice_buf;
1313 if (nl) *nl = 0;
1314
1315 char* ref_name = get_token((char**)&buf, "\t");
1316 char* scan_left_coord = get_token((char**)&buf, "\t");
1317 char* scan_right_coord = get_token((char**)&buf, "\t");
1318 char* orientation = get_token((char**)&buf, "\t");
1319
1320 if (!scan_left_coord || !scan_right_coord || !orientation)
1321 {
1322 fprintf(stderr,"Error: malformed splice coordinate record in %s\n:%s\n",
1323 gtf_juncs.c_str(), buf);
1324 exit(1);
1325 }
1326
1327 uint32_t ref_id = rt.get_id(ref_name, NULL, 0);
1328 uint32_t left_coord = atoi(scan_left_coord);
1329 uint32_t right_coord = atoi(scan_right_coord);
1330 bool antisense = *orientation == '-';
1331
1332 // add 1 to left_coord to meet BED format
1333 gtf_junctions.insert(make_pair<Junction, JunctionStats>(Junction(ref_id, left_coord + 1, right_coord, antisense), JunctionStats()));
1334 }
1335 }
1336 fprintf(stderr, "Loaded %d GFF junctions from %s.\n", (int)(gtf_junctions.size()), gtf_juncs.c_str());
1337 }
1338
1339 vector<uint64_t> read_ids;
1340 vector<vector<int64_t> > offsets;
1341 if (num_threads > 1)
1342 {
1343 vector<string> fnames;
1344 if (right_map_fname != "")
1345 {
1346 fnames.push_back(right_reads_fname);
1347 fnames.push_back(right_map_fname);
1348 }
1349 fnames.push_back(left_reads_fname);
1350 fnames.push_back(left_map_fname);
1351 bool enough_data = calculate_offsets(fnames, read_ids, offsets);
1352 if (!enough_data)
1353 num_threads = 1;
1354 }
1355
1356 JunctionSet vjunctions[num_threads];
1357 FusionSet vfusions[num_threads];
1358 vector<boost::thread*> threads;
1359 for (int i = 0; i < num_threads; ++i)
1360 {
1361 ConsensusEventsWorker worker;
1362
1363 worker.left_map_fname = left_map_fname;
1364 worker.right_map_fname = right_map_fname;
1365 worker.junctions = &vjunctions[i];
1366 worker.fusions = &vfusions[i];
1367 worker.gtf_junctions = &gtf_junctions;
1368 worker.rt = &rt;
1369
1370 worker.right_map_offset = 0;
1371
1372 if (i == 0)
1373 {
1374 worker.begin_id = 0;
1375 worker.left_map_offset = 0;
1376 }
1377 else
1378 {
1379 size_t offsets_size = offsets[i-1].size();
1380
1381 worker.begin_id = read_ids[i-1];
1382 worker.left_map_offset = offsets[i-1].back();
1383 if (offsets_size == 4)
1384 worker.right_map_offset = offsets[i-1][1];
1385 }
1386
1387 worker.end_id = (i+1 < num_threads) ? read_ids[i] : std::numeric_limits<uint64_t>::max();
1388
1389 if (num_threads > 1)
1390 threads.push_back(new boost::thread(worker));
1391 else
1392 worker();
1393 }
1394
1395 for (size_t i = 0; i < threads.size(); ++i)
1396 {
1397 threads[i]->join();
1398 delete threads[i];
1399 threads[i] = NULL;
1400 }
1401 threads.clear();
1402
1403 JunctionSet junctions = vjunctions[0];
1404 FusionSet fusions = vfusions[0];
1405 for (size_t i = 1; i < num_threads; ++i)
1406 {
1407 merge_with(junctions, vjunctions[i]);
1408 vjunctions[i].clear();
1409
1410 merge_with(fusions, vfusions[i]);
1411 vfusions[i].clear();
1412 }
1413
1414 size_t num_unfiltered_juncs = junctions.size();
1415 fprintf(stderr, "Loaded %lu junctions\n", (long unsigned int) num_unfiltered_juncs);
1416
1417 // Read hits, extract junctions, and toss the ones that arent strongly enough supported.
1418
1419 filter_junctions(junctions, gtf_junctions);
1420
1421 //size_t num_juncs_after_filter = junctions.size();
1422 //fprintf(stderr, "Filtered %lu junctions\n",
1423 // num_unfiltered_juncs - num_juncs_after_filter);
1424
1425 /*
1426 size_t small_overhangs = 0;
1427 for (JunctionSet::iterator i = junctions.begin(); i != junctions.end(); ++i)
1428 {
1429 if (i->second.accepted &&
1430 (i->second.left_extent < min_anchor_len || i->second.right_extent < min_anchor_len))
1431 {
1432 small_overhangs++;
1433 }
1434 }
1435
1436 if (small_overhangs >0)
1437 fprintf(stderr, "Warning: %lu small overhang junctions!\n", (long unsigned int)small_overhangs);
1438 */
1439
1440 JunctionSet vfinal_junctions[num_threads];
1441 InsertionSet vfinal_insertions[num_threads];
1442 DeletionSet vfinal_deletions[num_threads];
1443 FusionSet vfinal_fusions[num_threads];
1444
1445 for (int i = 0; i < num_threads; ++i)
1446 {
1447 ReportWorker worker;
1448
1449 char filename[1024] = {0};
1450 sprintf(filename, "%s%d.bam", bam_output_fname.c_str(), i);
1451 worker.bam_output_fname = filename;
1452 worker.sam_header_fname = sam_header;
1453 sprintf(filename, "%s/unmapped_left%d.fq", output_dir.c_str(), i);
1454 worker.left_um_fname = filename;
1455
1456 if (right_reads_fname != "")
1457 {
1458 sprintf(filename, "%s/unmapped_right%d.fq", output_dir.c_str(), i);
1459 worker.right_um_fname = filename;
1460 }
1461
1462 worker.left_reads_fname = left_reads_fname;
1463 worker.left_map_fname = left_map_fname;
1464 worker.right_reads_fname = right_reads_fname;
1465 worker.right_map_fname = right_map_fname;
1466
1467 worker.gtf_junctions = &gtf_junctions;
1468 worker.junctions = &junctions;
1469 worker.fusions = &fusions;
1470 worker.final_junctions = &vfinal_junctions[i];
1471 worker.final_insertions = &vfinal_insertions[i];
1472 worker.final_deletions = &vfinal_deletions[i];
1473 worker.final_fusions = &vfinal_fusions[i];
1474 worker.rt = &rt;
1475
1476 worker.right_reads_offset = 0;
1477 worker.right_map_offset = 0;
1478
1479 if (i == 0)
1480 {
1481 worker.begin_id = 0;
1482 worker.left_reads_offset = 0;
1483 worker.left_map_offset = 0;
1484 }
1485 else
1486 {
1487 size_t offsets_size = offsets[i-1].size();
1488
1489 worker.begin_id = read_ids[i-1];
1490 worker.left_reads_offset = offsets[i-1][offsets_size - 2];
1491 worker.left_map_offset = offsets[i-1].back();
1492 if (offsets_size == 4)
1493 {
1494 worker.right_reads_offset = offsets[i-1][0];
1495 worker.right_map_offset = offsets[i-1][1];
1496 }
1497 }
1498
1499 worker.end_id = (i+1 < num_threads) ? read_ids[i] : std::numeric_limits<uint64_t>::max();
1500
1501 if (num_threads > 1)
1502 threads.push_back(new boost::thread(worker));
1503 else
1504 worker();
1505 }
1506
1507 for (size_t i = 0; i < threads.size(); ++i)
1508 {
1509 threads[i]->join();
1510 delete threads[i];
1511 threads[i] = NULL;
1512 }
1513 threads.clear();
1514
1515 JunctionSet final_junctions = vfinal_junctions[0];
1516 InsertionSet final_insertions = vfinal_insertions[0];
1517 DeletionSet final_deletions = vfinal_deletions[0];
1518 FusionSet final_fusions = vfinal_fusions[0];
1519 for (size_t i = 1; i < num_threads; ++i)
1520 {
1521 merge_with(final_junctions, vfinal_junctions[i]);
1522 vfinal_junctions[i].clear();
1523
1524 merge_with(final_insertions, vfinal_insertions[i]);
1525 vfinal_insertions[i].clear();
1526
1527 merge_with(final_deletions, vfinal_deletions[i]);
1528 vfinal_deletions[i].clear();
1529
1530 merge_with(final_fusions, vfinal_fusions[i]);
1531 vfinal_fusions[i].clear();
1532 }
1533
1534 fprintf (stderr, "Reporting final accepted alignments...");
1535 fprintf (stderr, "done.\n");
1536
1537 //small_overhangs = 0;
1538 for (JunctionSet::iterator i = final_junctions.begin(); i != final_junctions.end();)
1539 {
1540 if (i->second.supporting_hits == 0 || i->second.left_extent < 8 || i->second.right_extent < 8)
1541 {
1542 final_junctions.erase(i++);
1543 }
1544 else
1545 {
1546 ++i;
1547 }
1548 }
1549
1550 // if (small_overhangs > 0)
1551 // fprintf(stderr, "Warning: %lu small overhang junctions!\n", small_overhangs);
1552
1553 fprintf (stderr, "Printing junction BED track...");
1554 print_junctions(junctions_out, final_junctions, rt);
1555 fprintf (stderr, "done\n");
1556
1557 fprintf (stderr, "Printing insertions...");
1558 print_insertions(insertions_out, final_insertions,rt);
1559 fclose(insertions_out);
1560 fprintf (stderr, "done\n");
1561
1562 fprintf (stderr, "Printing deletions...");
1563 print_deletions(deletions_out, final_deletions, rt);
1564 fclose(deletions_out);
1565 fprintf (stderr, "done\n");
1566
1567 if (fusion_search)
1568 {
1569 fprintf (stderr, "Printing fusions...");
1570 print_fusions(fusions_out, final_fusions, rt);
1571 fclose(fusions_out);
1572 fprintf (stderr, "done\n");
1573 }
1574
1575 fprintf(stderr, "Found %lu junctions from happy spliced reads\n", (long unsigned int)final_junctions.size());
1576 }
1577
1578 void print_usage()
1579 {
1580 fprintf(stderr, "Usage: tophat_reports <junctions.bed> <insertions.vcf> <deletions.vcf> <accepted_hits.sam> <left_map1,...,left_mapN> <left_reads.fq> [right_map1,...,right_mapN] [right_reads.fq]\n");
1581
1582 // fprintf(stderr, "Usage: tophat_reports <coverage.wig> <junctions.bed> <accepted_hits.sam> <map1.bwtout> [splice_map1.sbwtout]\n");
1583 }
1584
1585 int main(int argc, char** argv)
1586 {
1587 fprintf(stderr, "tophat_reports v%s (%s)\n", PACKAGE_VERSION, SVN_REVISION);
1588 fprintf(stderr, "---------------------------------------\n");
1589
1590 reads_format = FASTQ;
1591
1592 int parse_ret = parse_options(argc, argv, print_usage);
1593 if (parse_ret)
1594 return parse_ret;
1595
1596 if(optind >= argc)
1597 {
1598 print_usage();
1599 return 1;
1600 }
1601
1602 string ref_file_name = argv[optind++];
1603
1604 if(optind >= argc)
1605 {
1606 print_usage();
1607 return 1;
1608 }
1609
1610 string junctions_file_name = argv[optind++];
1611
1612 if(optind >= argc)
1613 {
1614 print_usage();
1615 return 1;
1616 }
1617
1618 string insertions_file_name = argv[optind++];
1619 if(optind >= argc)
1620 {
1621 print_usage();
1622 return 1;
1623 }
1624
1625 string deletions_file_name = argv[optind++];
1626 if(optind >= argc)
1627 {
1628 print_usage();
1629 return 1;
1630 }
1631
1632 string fusions_file_name = argv[optind++];
1633 if(optind >= argc)
1634 {
1635 print_usage();
1636 return 1;
1637 }
1638
1639 string accepted_hits_file_name = argv[optind++];
1640 if(optind >= argc)
1641 {
1642 print_usage();
1643 return 1;
1644 }
1645
1646 string left_map_filename = argv[optind++];
1647 if(optind >= argc)
1648 {
1649 print_usage();
1650 return 1;
1651 }
1652
1653 string left_reads_filename = argv[optind++];
1654 string unzcmd=getUnpackCmd(left_reads_filename, false);
1655
1656 string right_map_filename;
1657 string right_reads_filename;
1658
1659 if (optind < argc)
1660 {
1661 right_map_filename = argv[optind++];
1662 if(optind >= argc) {
1663 print_usage();
1664 return 1;
1665 }
1666 right_reads_filename=argv[optind++];
1667 }
1668
1669 ifstream ref_stream(ref_file_name.c_str(), ifstream::in);
1670 if (!ref_stream.good())
1671 {
1672 fprintf(stderr, "Error: cannot open %s for reading\n",
1673 ref_file_name.c_str());
1674 exit(1);
1675 }
1676
1677 FILE* junctions_file = fopen(junctions_file_name.c_str(), "w");
1678 if (junctions_file == NULL)
1679 {
1680 fprintf(stderr, "Error: cannot open BED file %s for writing\n",
1681 junctions_file_name.c_str());
1682 exit(1);
1683 }
1684
1685 FILE* insertions_file = fopen(insertions_file_name.c_str(), "w");
1686 if (insertions_file == NULL)
1687 {
1688 fprintf(stderr, "Error: cannot open VCF file %s for writing\n",
1689 insertions_file_name.c_str());
1690 exit(1);
1691 }
1692
1693 FILE* deletions_file = fopen(deletions_file_name.c_str(), "w");
1694 if (deletions_file == NULL)
1695 {
1696 fprintf(stderr, "Error: cannot open VCF file %s for writing\n",
1697 deletions_file_name.c_str());
1698 exit(1);
1699 }
1700
1701 FILE* fusions_file = NULL;
1702 if (fusion_search)
1703 {
1704 fusions_file = fopen(fusions_file_name.c_str(), "w");
1705 if (fusions_file == NULL)
1706 {
1707 fprintf(stderr, "Error: cannot open VCF file %s for writing\n",
1708 fusions_file_name.c_str());
1709 exit(1);
1710 }
1711 }
1712
1713 driver(accepted_hits_file_name,
1714 ref_stream,
1715 left_map_filename,
1716 left_reads_filename,
1717 right_map_filename,
1718 right_reads_filename,
1719 junctions_file,
1720 insertions_file,
1721 deletions_file,
1722 fusions_file);
1723
1724 return 0;
1725 }