ViewVC Help
View File | Revision Log | Show Annotations | View Changeset | Root Listing
root/gclib/tophat_cpp/tophat_reports.cpp
Revision: 154
Committed: Tue Jan 24 02:29:21 2012 UTC (7 years, 9 months ago) by gpertea
File size: 54016 byte(s)
Log Message:
massive update with Daehwan's work

Line File contents
1 /*
2 * tophat_reports.cpp
3 * TopHat
4 *
5 * Created by Cole Trapnell on 11/20/08.
6 * Copyright 2008 Cole Trapnell. All rights reserved.
7 *
8 */
9
10 #ifdef HAVE_CONFIG_H
11 #include <config.h>
12 #else
13 #define PACKAGE_VERSION "INTERNAL"
14 #define SVN_REVISION "XXX"
15 #endif
16
17 #include <cassert>
18 #include <cstdio>
19 #include <cstring>
20 #include <vector>
21 #include <string>
22 #include <map>
23 #include <algorithm>
24 #include <set>
25 #include <stdexcept>
26 #include <iostream>
27 #include <fstream>
28 #include <seqan/sequence.h>
29 #include <seqan/find.h>
30 #include <seqan/file.h>
31 #include <getopt.h>
32
33 #include <boost/thread.hpp>
34
35 #include "common.h"
36 #include "utils.h"
37 #include "bwt_map.h"
38 #include "junctions.h"
39 #include "insertions.h"
40 #include "deletions.h"
41 #include "fusions.h"
42 #include "align_status.h"
43 #include "fragments.h"
44 #include "wiggles.h"
45 #include "tokenize.h"
46 #include "reads.h"
47
48
49 #include "inserts.h"
50
51 using namespace std;
52 using namespace seqan;
53 using std::set;
54
55 // daehwan - this is redundancy, which should be removed.
56 void get_seqs(istream& ref_stream,
57 RefSequenceTable& rt,
58 bool keep_seqs = true)
59 {
60 while(ref_stream.good() && !ref_stream.eof())
61 {
62 RefSequenceTable::Sequence* ref_str = new RefSequenceTable::Sequence();
63 string name;
64 readMeta(ref_stream, name, Fasta());
65 string::size_type space_pos = name.find_first_of(" \t\r");
66 if (space_pos != string::npos)
67 {
68 name.resize(space_pos);
69 }
70 seqan::read(ref_stream, *ref_str, Fasta());
71
72 rt.get_id(name, keep_seqs ? ref_str : NULL, 0);
73 }
74 }
75
76 void read_best_alignments(const HitsForRead& hits_for_read,
77 FragmentAlignmentGrade& best_grade,
78 HitsForRead& best_hits,
79 const JunctionSet& gtf_junctions)
80 {
81 const vector<BowtieHit>& hits = hits_for_read.hits;
82 for (size_t i = 0; i < hits.size(); ++i)
83 {
84 if (hits[i].edit_dist()>max_read_mismatches) continue;
85
86 FragmentAlignmentGrade g(hits[i], gtf_junctions);
87
88 // Is the new status better than the current best one?
89 if (best_grade < g)
90 {
91 best_hits.hits.clear();
92 best_grade = g;
93 best_hits.hits.push_back(hits[i]);
94 }
95 else if (!(g < best_grade)) // is it just as good?
96 {
97 best_grade.num_alignments++;
98 best_hits.hits.push_back(hits[i]);
99 }
100 }
101 }
102
103 void pair_best_alignments(const HitsForRead& left_hits,
104 const HitsForRead& right_hits,
105 InsertAlignmentGrade& best_grade,
106 HitsForRead& left_best_hits,
107 HitsForRead& right_best_hits)
108 {
109 // max mate inner distance (genomic)
110 int min_mate_inner_dist = inner_dist_mean - inner_dist_std_dev;
111 if (max_mate_inner_dist == -1)
112 max_mate_inner_dist = inner_dist_mean + inner_dist_std_dev;
113
114 const vector<BowtieHit>& left = left_hits.hits;
115 const vector<BowtieHit>& right = right_hits.hits;
116
117 for (size_t i = 0; i < left.size(); ++i)
118 {
119 const BowtieHit& lh = left[i];
120 if (lh.edit_dist() > max_read_mismatches) continue;
121 for (size_t j = 0; j < right.size(); ++j)
122 {
123 const BowtieHit& rh = right[j];
124 if (rh.edit_dist() > max_read_mismatches) continue;
125
126 bool fusion = false;
127 if (fusion_search)
128 {
129 bool left_fusion = lh.fusion_opcode() != FUSION_NOTHING;
130 bool right_fusion = rh.fusion_opcode() != FUSION_NOTHING;
131 if (left_fusion && right_fusion)
132 continue;
133
134 fusion = left_fusion || right_fusion;
135 if (!fusion && lh.ref_id() != rh.ref_id())
136 fusion = true;
137
138 if (!fusion && lh.ref_id() == rh.ref_id())
139 {
140 if (lh.antisense_align() == rh.antisense_align())
141 fusion = true;
142 else
143 {
144 int inter_dist = 0;
145 if (lh.antisense_align())
146 inter_dist = lh.left() - rh.right();
147 else
148 inter_dist = rh.left() - lh.right();
149
150 if (inter_dist < -(int)max_insertion_length || inter_dist > (int)fusion_min_dist)
151 fusion = true;
152 }
153 }
154 }
155
156 InsertAlignmentGrade g(lh, rh, min_mate_inner_dist, max_mate_inner_dist, fusion);
157
158 // Is the new status better than the current best one?
159 if (best_grade < g)
160 {
161 left_best_hits.hits.clear();
162 right_best_hits.hits.clear();
163
164 best_grade = g;
165 left_best_hits.hits.push_back(lh);
166 right_best_hits.hits.push_back(rh);
167 }
168 else if (!(g < best_grade))
169 {
170 left_best_hits.hits.push_back(lh);
171 right_best_hits.hits.push_back(rh);
172 best_grade.num_alignments++;
173 }
174 }
175 }
176 }
177
178 enum FragmentType {FRAG_UNPAIRED, FRAG_LEFT, FRAG_RIGHT};
179
180 void add_auxData(vector<string>& auxdata, vector<string>& sam_toks,
181 const RefSequenceTable& rt,const BowtieHit& bh, FragmentType insert_side,
182 int num_hits, const BowtieHit* next_hit, int hitIndex) {
183
184 if (sam_toks.size()>11) {
185 for (size_t i=11;i<sam_toks.size();++i) {
186 if (sam_toks[i].find("NH:i:")==string::npos)
187 auxdata.push_back(sam_toks[i]);
188 }
189 }
190 string aux("NH:i:");
191 str_appendInt(aux, num_hits);
192 auxdata.push_back(aux);
193 if (next_hit) {
194 const char* nh_ref_name = "=";
195 nh_ref_name = rt.get_name(next_hit->ref_id());
196 assert (nh_ref_name != NULL);
197 bool same_contig=(next_hit->ref_id()==bh.ref_id());
198 aux="CC:Z:";
199 aux+= (same_contig ? "=" : nh_ref_name);
200 auxdata.push_back(aux);
201 aux="CP:i:";
202 int nh_gpos=next_hit->left() + 1;
203 str_appendInt(aux, nh_gpos);
204 auxdata.push_back(aux);
205 } //has next_hit
206 // FIXME: this code is still a bit brittle, because it contains no
207 // consistency check that the mates are on opposite strands - a current protocol
208 // requirement, and that the strand indicated by the alignment is consistent
209 // with the orientation of the splices (though that should be handled upstream).
210 const string xs_f("XS:A:+");
211 const string xs_r("XS:A:-");
212 if (bh.contiguous()) {
213 if (library_type == FR_FIRSTSTRAND) {
214 if (insert_side == FRAG_LEFT || insert_side == FRAG_UNPAIRED) {
215 if (bh.antisense_align())
216 auxdata.push_back(xs_f);
217 else
218 auxdata.push_back(xs_r);
219 }
220 else {
221 if (bh.antisense_align())
222 auxdata.push_back(xs_r);
223 else
224 auxdata.push_back(xs_f);
225 }
226 }
227 else if (library_type == FR_SECONDSTRAND) {
228 if (insert_side == FRAG_LEFT || insert_side == FRAG_UNPAIRED){
229 if (bh.antisense_align())
230 auxdata.push_back(xs_r);
231 else
232 auxdata.push_back(xs_f);
233 }
234 else
235 {
236 if (bh.antisense_align())
237 auxdata.push_back(xs_f);
238 else
239 auxdata.push_back(xs_r);
240 }
241 }
242 } //bh.contiguous()
243 if (hitIndex >= 0)
244 {
245 string aux("HI:i:");
246 str_appendInt(aux, hitIndex);
247 auxdata.push_back(aux);
248 }
249 }
250
251 bool rewrite_sam_record(GBamWriter& bam_writer,
252 const RefSequenceTable& rt,
253 const BowtieHit& bh,
254 const char* bwt_buf,
255 const char* read_alt_name,
256 const FragmentAlignmentGrade& grade,
257 FragmentType insert_side,
258 int num_hits,
259 const BowtieHit* next_hit,
260 bool primary,
261 int hitIndex)
262 {
263 // Rewrite this hit, filling in the alt name, mate mapping
264 // and setting the pair flag
265 vector<string> sam_toks;
266 tokenize(bwt_buf, "\t", sam_toks);
267
268 string ref_name = sam_toks[2], ref_name2 = "";
269 char rebuf2[2048] = {0};
270 char cigar1[256] = {0}, cigar2[256] = {0};
271 string left_seq, right_seq, left_qual, right_qual;
272 int left = -1, left1 = -1, left2 = -1;
273 bool fusion_alignment = false;
274 size_t XF_index = 0;
275 for (size_t i = 11; i < sam_toks.size(); ++i)
276 {
277 const string& tok = sam_toks[i];
278 if (strncmp(tok.c_str(), "XF", 2) == 0)
279 {
280 fusion_alignment = true;
281 XF_index = i;
282
283 vector<string> tuple_fields;
284 tokenize(tok.c_str(), " ", tuple_fields);
285 vector<string> contigs;
286 tokenize(tuple_fields[1].c_str(), "-", contigs);
287 if (contigs.size() >= 2)
288 {
289 ref_name = contigs[0];
290 ref_name2 = contigs[1];
291 }
292
293 extract_partial_hits(bh, tuple_fields[4].c_str(), tuple_fields[5].c_str(),
294 cigar1, cigar2, left_seq, right_seq,
295 left_qual, right_qual, left1, left2);
296
297 break;
298 }
299 }
300
301 string qname(read_alt_name);
302 size_t slash_pos=qname.rfind('/');
303 if (slash_pos!=string::npos)
304 qname.resize(slash_pos);
305 //read_alt_name as QNAME
306 int flag=atoi(sam_toks[1].c_str()); //FLAG
307 if (insert_side != FRAG_UNPAIRED) {
308 //flag = atoi(sam_toks[1].c_str());
309 // mark this as a singleton mate
310 flag |= 0x0001;
311 if (insert_side == FRAG_LEFT)
312 flag |= 0x0040;
313 else if (insert_side == FRAG_RIGHT)
314 flag |= 0x0080;
315 flag |= 0x0008;
316 //char flag_buf[64];
317 //sprintf(flag_buf, "%d", flag);
318 //sam_toks[t] = flag_buf;
319 }
320 if (!primary)
321 flag |= 0x100;
322
323 int gpos=isdigit(sam_toks[3][0]) ? atoi(sam_toks[3].c_str()) : 0;
324 int mapQ=255;
325 if (grade.num_alignments > 1) {
326 double err_prob = 1 - (1.0 / grade.num_alignments);
327 mapQ = (int)(-10.0 * log(err_prob) / log(10.0));
328 }
329 int tlen =atoi(sam_toks[8].c_str()); //TLEN
330 int mate_pos=atoi(sam_toks[7].c_str());
331
332 GBamRecord* bamrec=NULL;
333 if (fusion_alignment) {
334 vector<string> auxdata;
335 add_auxData(auxdata, sam_toks, rt, bh, insert_side, num_hits, next_hit, hitIndex);
336 bamrec=bam_writer.new_record(qname.c_str(), flag, ref_name.c_str(), left1 + 1, mapQ,
337 cigar1, sam_toks[6].c_str(), mate_pos,
338 tlen, left_seq.c_str(), left_qual.c_str(), &auxdata);
339 bam_writer.write(bamrec);
340 delete bamrec;
341
342 auxdata.clear();
343 sam_toks[XF_index][5] = '2';
344 add_auxData(auxdata, sam_toks, rt, bh, insert_side, num_hits, next_hit, hitIndex);
345 bamrec=bam_writer.new_record(qname.c_str(), flag, ref_name2.c_str(), left2 + 1, mapQ,
346 cigar2, sam_toks[6].c_str(), mate_pos,
347 tlen, right_seq.c_str(), right_qual.c_str(), &auxdata);
348 bam_writer.write(bamrec);
349 delete bamrec;
350 } else {
351 vector<string> auxdata;
352 add_auxData(auxdata, sam_toks, rt, bh, insert_side, num_hits, next_hit, hitIndex);
353 bamrec=bam_writer.new_record(qname.c_str(), flag, sam_toks[2].c_str(), gpos, mapQ,
354 sam_toks[5].c_str(), sam_toks[6].c_str(), mate_pos,
355 tlen, sam_toks[9].c_str(), sam_toks[10].c_str(), &auxdata);
356 bam_writer.write(bamrec);
357 delete bamrec;
358 }
359
360 return true;
361 }
362
363 bool rewrite_sam_record(GBamWriter& bam_writer,
364 const RefSequenceTable& rt,
365 const BowtieHit& bh,
366 const char* bwt_buf,
367 const char* read_alt_name,
368 const InsertAlignmentGrade& grade,
369 FragmentType insert_side,
370 const BowtieHit* partner,
371 int num_hits,
372 const BowtieHit* next_hit,
373 bool primary,
374 int hitIndex)
375 {
376 // Rewrite this hit, filling in the alt name, mate mapping
377 // and setting the pair flag
378 vector<string> sam_toks;
379 tokenize(bwt_buf, "\t", sam_toks);
380 string qname(read_alt_name);
381 size_t slash_pos=qname.rfind('/');
382 if (slash_pos!=string::npos)
383 qname.resize(slash_pos);
384 //read_alt_name as QNAME
385 int flag = atoi(sam_toks[1].c_str());
386 // 0x0010 (strand of query) is assumed to be set correctly
387 // to begin with
388 flag |= 0x0001; //it must be paired
389 if (insert_side == FRAG_LEFT)
390 flag |= 0x0040;
391 else if (insert_side == FRAG_RIGHT)
392 flag |= 0x0080;
393 if (!primary)
394 flag |= 0x100;
395
396 string ref_name = sam_toks[2], ref_name2 = "";
397 char rebuf2[2048] = {0};
398 char cigar1[256] = {0}, cigar2[256] = {0};
399 string left_seq, right_seq, left_qual, right_qual;
400 int left = -1, left1 = -1, left2 = -1;
401 bool fusion_alignment = false;
402 size_t XF_tok_idx = 11;
403 for (; XF_tok_idx < sam_toks.size(); ++XF_tok_idx)
404 {
405 const string& tok = sam_toks[XF_tok_idx];
406 if (strncmp(tok.c_str(), "XF", 2) == 0)
407 {
408 fusion_alignment = true;
409
410 vector<string> tuple_fields;
411 tokenize(tok.c_str(), " ", tuple_fields);
412 vector<string> contigs;
413 tokenize(tuple_fields[1].c_str(), "-", contigs);
414 if (contigs.size() >= 2)
415 {
416 ref_name = contigs[0];
417 ref_name2 = contigs[1];
418 }
419
420 extract_partial_hits(bh, tuple_fields[4].c_str(), tuple_fields[5].c_str(),
421 cigar1, cigar2, left_seq, right_seq,
422 left_qual, right_qual, left1, left2);
423
424 break;
425 }
426 }
427
428 int gpos=isdigit(sam_toks[3][0]) ? atoi(sam_toks[3].c_str()) : 0;
429 int mapQ=255;
430 if (grade.num_alignments > 1) {
431 double err_prob = 1 - (1.0 / grade.num_alignments);
432 mapQ = (int)(-10.0 * log(err_prob) / log(10.0));
433 }
434 int tlen=0; //TLEN
435 int mate_pos=atoi(sam_toks[7].c_str());
436 if (partner) {
437 if (partner->ref_id()==bh.ref_id()) {
438 sam_toks[6] = "="; //same chromosome
439 //TLEN:
440 tlen = bh.left() < partner->left() ? partner->right() - bh.left() :
441 partner->left() - bh.right();
442 }
443
444 else { //partner on different chromosome/contig
445 sam_toks[6] = rt.get_name(partner->ref_id());
446 }
447 mate_pos = partner->left() + 1;
448 if (grade.happy())
449 flag |= 0x0002;
450 if (partner->antisense_align())
451 flag |= 0x0020;
452
453 if (partner->fusion_opcode() != FUSION_NOTHING)
454 {
455 char partner_pos[1024];
456 sprintf(partner_pos, "XP:Z:%s-%s %d", rt.get_name(partner->ref_id()), rt.get_name(partner->ref_id2()), partner->left() + 1);
457 sam_toks.push_back(partner_pos);
458 }
459 }
460 else {
461 sam_toks[6] = "*";
462 mate_pos = 0;
463 flag |= 0x0008;
464 }
465
466 GBamRecord* bamrec=NULL;
467 if (fusion_alignment) {
468 vector<string> auxdata;
469 add_auxData(auxdata, sam_toks, rt, bh, insert_side, num_hits, next_hit, hitIndex);
470 bamrec=bam_writer.new_record(qname.c_str(), flag, ref_name.c_str(), left1 + 1, mapQ,
471 cigar1, sam_toks[6].c_str(), mate_pos,
472 tlen, left_seq.c_str(), left_qual.c_str(), &auxdata);
473 bam_writer.write(bamrec);
474 delete bamrec;
475
476 auxdata.clear();
477 sam_toks[XF_tok_idx][5] = '2';
478 add_auxData(auxdata, sam_toks, rt, bh, insert_side, num_hits, next_hit, hitIndex);
479 bamrec=bam_writer.new_record(qname.c_str(), flag, ref_name2.c_str(), left2 + 1, mapQ,
480 cigar2, sam_toks[6].c_str(), mate_pos,
481 tlen, right_seq.c_str(), right_qual.c_str(), &auxdata);
482 bam_writer.write(bamrec);
483 delete bamrec;
484 } else {
485 vector<string> auxdata;
486 add_auxData(auxdata, sam_toks, rt, bh, insert_side, num_hits, next_hit, hitIndex);
487 bamrec=bam_writer.new_record(qname.c_str(), flag, sam_toks[2].c_str(), gpos, mapQ,
488 sam_toks[5].c_str(), sam_toks[6].c_str(), mate_pos,
489 tlen, sam_toks[9].c_str(), sam_toks[10].c_str(), &auxdata);
490 bam_writer.write(bamrec);
491 delete bamrec;
492 }
493
494 return true;
495 }
496
497 struct lex_hit_sort
498 {
499 lex_hit_sort(const RefSequenceTable& rt, const HitsForRead& hits)
500 : _rt(rt), _hits(hits)
501 {}
502
503 bool operator()(const uint32_t& l, const uint32_t& r) const
504 {
505 const BowtieHit& lhs = _hits.hits[l];
506 const BowtieHit& rhs = _hits.hits[r];
507
508 uint32_t l_id = lhs.ref_id();
509 uint32_t r_id = rhs.ref_id();
510 if (l_id != r_id)
511 return l_id < r_id;
512
513 return lhs.left() < rhs.left();
514 }
515
516 const RefSequenceTable& _rt;
517 const HitsForRead& _hits;
518 };
519
520 void print_sam_for_single(const RefSequenceTable& rt,
521 const HitsForRead& hits,
522 const FragmentAlignmentGrade& grade,
523 FragmentType frag_type,
524 //FILE* reads_file,
525 Read& read,
526 GBamWriter& bam_writer,
527 FILE* um_out//write unmapped reads here
528 )
529 {
530 assert(!read.alt_name.empty());
531 lex_hit_sort s(rt, hits);
532 vector<uint32_t> index_vector;
533
534 size_t i;
535 for (i = 0; i < hits.hits.size(); ++i)
536 index_vector.push_back(i);
537
538 sort(index_vector.begin(), index_vector.end(), s);
539
540 size_t primaryHit = (hits.hits.empty()? 0: random() % hits.hits.size());
541 bool multipleHits = (hits.hits.size() > 1);
542 for (i = 0; i < hits.hits.size(); ++i)
543 {
544 bool primary = (i == primaryHit);
545 size_t index = index_vector[i];
546 const BowtieHit& bh = hits.hits[index];
547 rewrite_sam_record(bam_writer, rt,
548 bh,
549 bh.hitfile_rec().c_str(),
550 read.alt_name.c_str(),
551 grade,
552 frag_type,
553 hits.hits.size(),
554 (i < hits.hits.size()-1) ? &(hits.hits[index_vector[i+1]]) : NULL,
555 primary,
556 (multipleHits? i: -1));
557 }
558 }
559
560 void print_sam_for_pair(const RefSequenceTable& rt,
561 const HitsForRead& left_hits,
562 const HitsForRead& right_hits,
563 const InsertAlignmentGrade& grade,
564 ReadStream& left_reads_file,
565 ReadStream& right_reads_file,
566 GBamWriter& bam_writer,
567 FILE* left_um_out,
568 FILE* right_um_out,
569 uint64_t begin_id = 0,
570 uint64_t end_id = std::numeric_limits<uint64_t>::max())
571 {
572 assert (left_hits.insert_id == right_hits.insert_id);
573
574 Read left_read;
575 Read right_read;
576 assert (left_hits.hits.size() == right_hits.hits.size() ||
577 (left_hits.hits.empty() || right_hits.hits.empty()));
578
579 size_t primaryHit = 0;
580 vector<uint32_t> index_vector;
581 if(right_hits.hits.size() > 0)
582 {
583 lex_hit_sort s(rt, right_hits);
584 for (size_t i = 0; i < right_hits.hits.size(); ++i)
585 index_vector.push_back(i);
586
587 sort(index_vector.begin(), index_vector.end(), s);
588 primaryHit = random() % right_hits.hits.size();
589 }
590 else if (left_hits.hits.size() > 0)
591 {
592 lex_hit_sort s(rt, left_hits);
593 for (size_t i = 0; i < left_hits.hits.size(); ++i)
594 index_vector.push_back(i);
595
596 sort(index_vector.begin(), index_vector.end(), s);
597 primaryHit = random() % left_hits.hits.size();
598 }
599
600 bool got_left_read = left_reads_file.getRead(left_hits.insert_id, left_read,
601 reads_format, false, left_um_out,
602 false, begin_id, end_id);
603
604 bool got_right_read = right_reads_file.getRead(right_hits.insert_id, right_read,
605 reads_format, false, right_um_out,
606 false, begin_id, end_id);
607
608 if (left_hits.hits.size() == right_hits.hits.size())
609 {
610 assert (got_left_read && got_right_read);
611 bool multipleHits = (left_hits.hits.size() > 1);
612 for (size_t i = 0; i < right_hits.hits.size(); ++i)
613 {
614 bool primary = (i == primaryHit);
615 size_t index = index_vector[i];
616 const BowtieHit& right_bh = right_hits.hits[index];
617 const BowtieHit& left_bh = left_hits.hits[index];
618
619 rewrite_sam_record(bam_writer, rt,
620 right_bh,
621 right_bh.hitfile_rec().c_str(),
622 right_read.alt_name.c_str(),
623 grade,
624 FRAG_RIGHT,
625 &left_bh,
626 right_hits.hits.size(),
627 (i < right_hits.hits.size() - 1) ? &(right_hits.hits[index_vector[i+1]]) : NULL,
628 primary,
629 (multipleHits? i: -1));
630 rewrite_sam_record(bam_writer, rt,
631 left_bh,
632 left_bh.hitfile_rec().c_str(),
633 left_read.alt_name.c_str(),
634 grade,
635 FRAG_LEFT,
636 &right_bh,
637 left_hits.hits.size(),
638 (i < left_hits.hits.size() - 1) ? &(left_hits.hits[index_vector[i+1]]) : NULL,
639 primary,
640 (multipleHits? i: -1));
641 }
642 }
643 else if (left_hits.hits.empty())
644 { //only right read was mapped properly
645 if (right_um_out) {
646 //write it in the mapped file with the #MAPPED# flag
647 fprintf(right_um_out, "@%s #MAPPED#\n%s\n+\n%s\n", right_read.alt_name.c_str(),
648 right_read.seq.c_str(), right_read.qual.c_str());
649 }
650 for (size_t i = 0; i < right_hits.hits.size(); ++i)
651 {
652 bool primary = (i == primaryHit);
653 size_t index = index_vector[i];
654 const BowtieHit& bh = right_hits.hits[index];
655
656 rewrite_sam_record(bam_writer, rt,
657 bh,
658 bh.hitfile_rec().c_str(),
659 right_read.alt_name.c_str(),
660 grade,
661 FRAG_RIGHT,
662 NULL,
663 right_hits.hits.size(),
664 (i < right_hits.hits.size() - 1) ? &(right_hits.hits[index_vector[i+1]]) : NULL,
665 primary,
666 -1);
667
668 }
669 }
670 else if (right_hits.hits.empty())
671 { //only left read was mapped properly
672 if (left_um_out) {
673 //write it in the mapped file with the #MAPPED# flag
674 fprintf(left_um_out, "@%s #MAPPED#\n%s\n+\n%s\n", left_read.alt_name.c_str(), left_read.seq.c_str(),
675 left_read.qual.c_str());
676 }
677
678 for (size_t i = 0; i < left_hits.hits.size(); ++i)
679 {
680 bool primary = (i == primaryHit);
681 size_t index = index_vector[i];
682 const BowtieHit& bh = left_hits.hits[index];
683 rewrite_sam_record(bam_writer, rt,
684 bh,
685 bh.hitfile_rec().c_str(),
686 left_read.alt_name.c_str(),
687 grade,
688 FRAG_LEFT,
689 NULL,
690 left_hits.hits.size(),
691 (i < left_hits.hits.size() - 1) ? &(left_hits.hits[index_vector[i+1]]) : NULL,
692 primary,
693 -1);
694
695 }
696 }
697 else
698 {
699 assert (false);
700 }
701 }
702
703 /**
704 * Given all of the hits for a particular read, update the read counts for insertions and deletions.
705 * @param hits hits The alignments for a particular read
706 * @param insertions Maps from an insertion to the number of supporting reads for that insertion
707 * @param deletions Maps from a deletion to the number of supporting reads for that deletion
708 */
709 void update_insertions_and_deletions(const HitsForRead& hits,
710 InsertionSet& insertions,
711 DeletionSet& deletions)
712 {
713 for (size_t i = 0; i < hits.hits.size(); ++i)
714 {
715 const BowtieHit& bh = hits.hits[i];
716 insertions_from_alignment(bh, insertions);
717 deletions_from_alignment(bh, deletions);
718 }
719 }
720
721
722 FusionSet empty_fusions;
723 void update_fusions(const HitsForRead& hits,
724 RefSequenceTable& rt,
725 FusionSet& fusions,
726 FusionSet& fusions_ref = empty_fusions)
727 {
728 if (hits.hits.size() > fusion_multireads)
729 return;
730
731 bool update_stat = fusions_ref.size() > 0;
732 for (size_t i = 0; i < hits.hits.size(); ++i)
733 {
734 const BowtieHit& bh = hits.hits[i];
735
736 if (bh.edit_dist() > fusion_read_mismatches)
737 continue;
738
739 // daehwan
740 #if 0
741 vector<Fusion> new_fusions;
742 fusions_from_spliced_hit(bh, new_fusions);
743
744 for(size_t i = 0; i < new_fusions.size(); ++i)
745 {
746 Fusion fusion = new_fusions[i];
747 if (fusion.left == 110343414 && fusion.right == 80211173 && hits.hits.size() > 1)
748 {
749 for (size_t t = 0; t < hits.hits.size(); ++t)
750 {
751 const BowtieHit& lh = hits.hits[t];
752 cout << "insert id: " << lh.insert_id() << endl;
753 cout << (lh.antisense_align() ? "-" : "+") << endl;
754 cout << lh.ref_id() << ": " << lh.left() << "-" << lh.right() << endl;
755 cout << lh.ref_id2() << ": " << print_cigar(lh.cigar()) << endl;
756 }
757 }
758 }
759 #endif
760
761 fusions_from_alignment(bh, fusions, rt, update_stat);
762
763 if (update_stat)
764 unsupport_fusions(bh, fusions, fusions_ref);
765 }
766 }
767
768 void update_junctions(const HitsForRead& hits,
769 JunctionSet& junctions)
770 {
771 for (size_t i = 0; i < hits.hits.size(); ++i)
772 {
773 const BowtieHit& bh = hits.hits[i];
774 junctions_from_alignment(bh, junctions);
775 }
776 }
777
778 // Extracts junctions from all the SAM hits (based on REF_SKIPs) in the hit file
779 // resets the stream when finished.
780 void exclude_hits_on_filtered_junctions(const JunctionSet& junctions, HitsForRead& hits)
781 {
782 HitsForRead remaining;
783 remaining.insert_id = hits.insert_id;
784
785 for (size_t i = 0; i < hits.hits.size(); ++i)
786 {
787 BowtieHit& bh = hits.hits[i];
788 bool filter_hit = false;
789 if (!bh.contiguous())
790 {
791 JunctionSet bh_juncs;
792 junctions_from_alignment(bh, bh_juncs);
793 for (JunctionSet::iterator itr = bh_juncs.begin();
794 itr != bh_juncs.end();
795 itr++)
796 {
797 const Junction& j = itr->first;
798 JunctionSet::const_iterator target = junctions.find(j);
799 if (target == junctions.end() || !target->second.accepted)
800 {
801 filter_hit = true;
802 break;
803 }
804 }
805 }
806 if (!filter_hit)
807 remaining.hits.push_back(bh);
808 }
809 hits = remaining;
810 }
811
812 // events include splice junction, indels, and fusion points.
813 struct ConsensusEventsWorker
814 {
815 void operator()()
816 {
817 ReadTable it;
818 vector<BAMHitFactory*> hit_factories;
819 hit_factories.push_back(new BAMHitFactory(it, *rt));
820 HitStream l_hs(left_map_fname, hit_factories.back(), false, true, true, true);
821 if (left_map_offset > 0)
822 l_hs.seek(left_map_offset);
823
824 hit_factories.push_back(new BAMHitFactory(it, *rt));
825 HitStream r_hs(right_map_fname, hit_factories.back(), false, true, true, true);
826 if (right_map_offset > 0)
827 r_hs.seek(right_map_offset);
828
829 HitsForRead curr_left_hit_group;
830 HitsForRead curr_right_hit_group;
831
832 l_hs.next_read_hits(curr_left_hit_group);
833 r_hs.next_read_hits(curr_right_hit_group);
834
835 uint32_t curr_left_obs_order = it.observation_order(curr_left_hit_group.insert_id);
836 uint32_t curr_right_obs_order = it.observation_order(curr_right_hit_group.insert_id);
837
838 // While we still have unreported hits...
839 while((curr_left_obs_order != VMAXINT32 || curr_right_obs_order != VMAXINT32) &&
840 (curr_left_obs_order < end_id || curr_right_obs_order < end_id))
841 {
842 // Chew up left singletons
843 while (curr_left_obs_order < curr_right_obs_order &&
844 curr_left_obs_order < end_id &&
845 curr_left_obs_order != VMAXINT32)
846 {
847 HitsForRead best_hits;
848 best_hits.insert_id = curr_left_obs_order;
849 FragmentAlignmentGrade grade;
850
851 // Process hits for left singleton, select best alignments
852 read_best_alignments(curr_left_hit_group, grade, best_hits, *gtf_junctions);
853 update_junctions(best_hits, *junctions);
854 update_fusions(best_hits, *rt, *fusions);
855
856 // Get next hit group
857 l_hs.next_read_hits(curr_left_hit_group);
858 curr_left_obs_order = it.observation_order(curr_left_hit_group.insert_id);
859 }
860
861 // Chew up right singletons
862 while (curr_left_obs_order > curr_right_obs_order &&
863 curr_right_obs_order < end_id &&
864 curr_right_obs_order != VMAXINT32)
865 {
866 HitsForRead best_hits;
867 best_hits.insert_id = curr_right_obs_order;
868 if (curr_right_obs_order >= begin_id)
869 {
870 FragmentAlignmentGrade grade;
871 // Process hit for right singleton, select best alignments
872 read_best_alignments(curr_right_hit_group,grade, best_hits, *gtf_junctions);
873 update_junctions(best_hits, *junctions);
874 update_fusions(best_hits, *rt, *fusions);
875 }
876
877 // Get next hit group
878 r_hs.next_read_hits(curr_right_hit_group);
879 curr_right_obs_order = it.observation_order(curr_right_hit_group.insert_id);
880 }
881
882 // Since we have both left hits and right hits for this insert,
883 // Find the best pairing and print both
884 while (curr_left_obs_order == curr_right_obs_order &&
885 curr_left_obs_order < end_id &&
886 curr_left_obs_order != VMAXINT32)
887 {
888 if (curr_left_hit_group.hits.empty())
889 {
890 HitsForRead right_best_hits;
891 right_best_hits.insert_id = curr_right_obs_order;
892
893 FragmentAlignmentGrade grade;
894 read_best_alignments(curr_right_hit_group, grade, right_best_hits, *gtf_junctions);
895 update_junctions(right_best_hits, *junctions);
896 update_fusions(right_best_hits, *rt, *fusions);
897 }
898 else if (curr_right_hit_group.hits.empty())
899 {
900 HitsForRead left_best_hits;
901 left_best_hits.insert_id = curr_left_obs_order;
902
903 FragmentAlignmentGrade grade;
904 // Process hits for left singleton, select best alignments
905 read_best_alignments(curr_left_hit_group, grade, left_best_hits, *gtf_junctions);
906 update_junctions(left_best_hits, *junctions);
907 update_fusions(left_best_hits, *rt, *fusions);
908 }
909 else
910 {
911 HitsForRead left_best_hits;
912 HitsForRead right_best_hits;
913 left_best_hits.insert_id = curr_left_obs_order;
914 right_best_hits.insert_id = curr_right_obs_order;
915
916 // daehwan - apply gtf_junctions here, too!
917
918 InsertAlignmentGrade grade;
919 pair_best_alignments(curr_left_hit_group,
920 curr_right_hit_group,
921 grade,
922 left_best_hits,
923 right_best_hits);
924 update_junctions(left_best_hits, *junctions);
925 update_junctions(right_best_hits, *junctions);
926 update_fusions(left_best_hits, *rt, *fusions);
927 update_fusions(right_best_hits, *rt, *fusions);
928 }
929
930 l_hs.next_read_hits(curr_left_hit_group);
931 curr_left_obs_order = it.observation_order(curr_left_hit_group.insert_id);
932
933 r_hs.next_read_hits(curr_right_hit_group);
934 curr_right_obs_order = it.observation_order(curr_right_hit_group.insert_id);
935 }
936 }
937
938 for (size_t i = 0; i < hit_factories.size(); ++i)
939 delete hit_factories[i];
940
941 hit_factories.clear();
942 }
943
944 string left_map_fname;
945 string right_map_fname;
946
947 JunctionSet* junctions;
948 JunctionSet* gtf_junctions;
949 FusionSet* fusions;
950 RefSequenceTable* rt;
951
952 uint64_t begin_id;
953 uint64_t end_id;
954 int64_t left_map_offset;
955 int64_t right_map_offset;
956 };
957
958 struct ReportWorker
959 {
960 void operator()()
961 {
962 ReadTable it;
963 GBamWriter bam_writer(bam_output_fname.c_str(), sam_header.c_str());
964
965 ReadStream left_reads_file(left_reads_fname);
966 if (left_reads_file.file() == NULL)
967 err_die("Error: cannot open %s for reading\n", left_reads_fname.c_str());
968
969 if (left_reads_offset > 0)
970 left_reads_file.seek(left_reads_offset);
971
972 if (!zpacker.empty()) left_um_fname+=".z";
973 FZPipe left_um_out;
974 if (left_um_out.openWrite(left_um_fname.c_str(), zpacker)==NULL)
975 err_die("Error: cannot open file %s for writing!\n",left_um_fname.c_str());
976
977 ReadStream right_reads_file(right_reads_fname);
978 if (right_reads_offset > 0)
979 right_reads_file.seek(right_reads_offset);
980
981 FZPipe right_um_out;
982 if (right_reads_fname != "")
983 {
984 if (!zpacker.empty()) right_um_fname+=".z";
985 if (right_um_out.openWrite(right_um_fname.c_str(), zpacker)==NULL)
986 err_die("Error: cannot open file %s for writing!\n",right_um_fname.c_str());
987 }
988
989 vector<BAMHitFactory*> hit_factories;
990 hit_factories.push_back(new BAMHitFactory(it, *rt));
991 HitStream left_hs(left_map_fname, hit_factories.back(), false, true, true, true);
992 if (left_map_offset > 0)
993 left_hs.seek(left_map_offset);
994
995 hit_factories.push_back(new BAMHitFactory(it, *rt));
996 HitStream right_hs(right_map_fname, hit_factories.back(), false, true, true, true);
997 if (right_map_offset > 0)
998 right_hs.seek(right_map_offset);
999
1000 HitsForRead curr_left_hit_group;
1001 HitsForRead curr_right_hit_group;
1002
1003 left_hs.next_read_hits(curr_left_hit_group);
1004 right_hs.next_read_hits(curr_right_hit_group);
1005
1006 uint64_t curr_left_obs_order = it.observation_order(curr_left_hit_group.insert_id);
1007 uint64_t curr_right_obs_order = it.observation_order(curr_right_hit_group.insert_id);
1008
1009 // While we still have unreported hits...
1010 Read l_read;
1011 Read r_read;
1012 while((curr_left_obs_order != VMAXINT32 || curr_right_obs_order != VMAXINT32) &&
1013 (curr_left_obs_order < end_id || curr_right_obs_order < end_id))
1014 {
1015 // Chew up left singletons (pairs with right reads unmapped)
1016 while (curr_left_obs_order < curr_right_obs_order &&
1017 curr_left_obs_order < end_id &&
1018 curr_left_obs_order != VMAXINT32)
1019 {
1020 HitsForRead best_hits;
1021 best_hits.insert_id = curr_left_obs_order;
1022 FragmentAlignmentGrade grade;
1023 bool got_read=left_reads_file.getRead(curr_left_obs_order, l_read,
1024 reads_format, false, left_um_out.file,
1025 false, begin_id, end_id);
1026 assert(got_read);
1027
1028 if (right_reads_file.file()) {
1029 fprintf(left_um_out.file, "@%s #MAPPED#\n%s\n+\n%s\n", l_read.alt_name.c_str(),
1030 l_read.seq.c_str(), l_read.qual.c_str());
1031 got_read=right_reads_file.getRead(curr_left_obs_order, r_read,
1032 reads_format, false, right_um_out.file,
1033 true, begin_id, end_id);
1034 assert(got_read);
1035 }
1036
1037 exclude_hits_on_filtered_junctions(*junctions, curr_left_hit_group);
1038
1039 // Process hits for left singleton, select best alignments
1040 read_best_alignments(curr_left_hit_group, grade, best_hits, *gtf_junctions);
1041
1042 if (best_hits.hits.size()>0 && best_hits.hits.size() <= max_multihits)
1043 {
1044 update_junctions(best_hits, *final_junctions);
1045 update_insertions_and_deletions(best_hits, *final_insertions, *final_deletions);
1046 update_fusions(best_hits, *rt, *final_fusions, *fusions);
1047
1048 print_sam_for_single(*rt,
1049 best_hits,
1050 grade,
1051 (right_map_fname.empty() ? FRAG_UNPAIRED : FRAG_LEFT),
1052 l_read,
1053 bam_writer,
1054 left_um_out.file);
1055 }
1056 // Get next hit group
1057 left_hs.next_read_hits(curr_left_hit_group);
1058 curr_left_obs_order = it.observation_order(curr_left_hit_group.insert_id);
1059 } //left singletons
1060
1061 // Chew up right singletons
1062 while (curr_left_obs_order > curr_right_obs_order &&
1063 curr_right_obs_order < end_id &&
1064 curr_right_obs_order != VMAXINT32)
1065 {
1066 HitsForRead best_hits;
1067 best_hits.insert_id = curr_right_obs_order;
1068 FragmentAlignmentGrade grade;
1069
1070 if (curr_right_obs_order >= begin_id)
1071 {
1072 bool got_read=right_reads_file.getRead(curr_right_obs_order, r_read,
1073 reads_format, false, right_um_out.file,
1074 false, begin_id, end_id);
1075
1076 assert(got_read);
1077
1078 fprintf(right_um_out.file, "@%s #MAPPED#\n%s\n+\n%s\n", r_read.alt_name.c_str(),
1079 r_read.seq.c_str(), r_read.qual.c_str());
1080 got_read=left_reads_file.getRead(curr_right_obs_order, l_read,
1081 reads_format, false, left_um_out.file,
1082 true, begin_id, end_id);
1083 assert(got_read);
1084
1085 exclude_hits_on_filtered_junctions(*junctions, curr_right_hit_group);
1086
1087 // Process hit for right singleton, select best alignments
1088 read_best_alignments(curr_right_hit_group, grade, best_hits, *gtf_junctions);
1089 if (best_hits.hits.size()>0 && best_hits.hits.size() <= max_multihits)
1090 {
1091 update_junctions(best_hits, *final_junctions);
1092 update_insertions_and_deletions(best_hits, *final_insertions, *final_deletions);
1093 update_fusions(best_hits, *rt, *final_fusions, *fusions);
1094
1095 print_sam_for_single(*rt,
1096 best_hits,
1097 grade,
1098 FRAG_RIGHT,
1099 r_read,
1100 bam_writer,
1101 right_um_out.file);
1102 }
1103 }
1104
1105 // Get next hit group
1106 right_hs.next_read_hits(curr_right_hit_group);
1107 curr_right_obs_order = it.observation_order(curr_right_hit_group.insert_id);
1108 }
1109
1110 // Since we have both left hits and right hits for this insert,
1111 // Find the best pairing and print both
1112 while (curr_left_obs_order == curr_right_obs_order &&
1113 curr_left_obs_order < end_id &&
1114 curr_left_obs_order != VMAXINT32)
1115 {
1116 exclude_hits_on_filtered_junctions(*junctions, curr_left_hit_group);
1117 exclude_hits_on_filtered_junctions(*junctions, curr_right_hit_group);
1118 if (curr_left_hit_group.hits.empty())
1119 { //only right read mapped
1120 //write it in the mapped file with the #MAPPED# flag
1121
1122 bool got_read=right_reads_file.getRead(curr_left_obs_order, r_read,
1123 reads_format, false, right_um_out.file,
1124 false, begin_id, end_id);
1125 assert(got_read);
1126 fprintf(right_um_out.file, "@%s #MAPPED#\n%s\n+\n%s\n", r_read.alt_name.c_str(),
1127 r_read.seq.c_str(), r_read.qual.c_str());
1128 HitsForRead right_best_hits;
1129 right_best_hits.insert_id = curr_right_obs_order;
1130
1131 FragmentAlignmentGrade grade;
1132 read_best_alignments(curr_right_hit_group, grade, right_best_hits, *gtf_junctions);
1133
1134 if (right_best_hits.hits.size()>0 && right_best_hits.hits.size() <= max_multihits)
1135 {
1136 update_junctions(right_best_hits, *final_junctions);
1137 update_insertions_and_deletions(right_best_hits, *final_insertions, *final_deletions);
1138 update_fusions(right_best_hits, *rt, *final_fusions, *fusions);
1139
1140 print_sam_for_single(*rt,
1141 right_best_hits,
1142 grade,
1143 FRAG_RIGHT,
1144 r_read,
1145 bam_writer,
1146 right_um_out.file);
1147 }
1148 }
1149 else if (curr_right_hit_group.hits.empty())
1150 {
1151 HitsForRead left_best_hits;
1152 left_best_hits.insert_id = curr_left_obs_order;
1153 //only left read mapped
1154 bool got_read=left_reads_file.getRead(curr_left_obs_order, l_read,
1155 reads_format, false, left_um_out.file,
1156 false, begin_id, end_id);
1157 assert(got_read);
1158 fprintf(left_um_out.file, "@%s #MAPPED#\n%s\n+\n%s\n", l_read.alt_name.c_str(),
1159 l_read.seq.c_str(), l_read.qual.c_str());
1160 FragmentAlignmentGrade grade;
1161 // Process hits for left singleton, select best alignments
1162 read_best_alignments(curr_left_hit_group, grade, left_best_hits, *gtf_junctions);
1163 if (left_best_hits.hits.size()>0 && left_best_hits.hits.size() <= max_multihits)
1164 {
1165 update_junctions(left_best_hits, *final_junctions);
1166 update_insertions_and_deletions(left_best_hits, *final_insertions, *final_deletions);
1167 update_fusions(left_best_hits, *rt, *final_fusions, *fusions);
1168
1169 print_sam_for_single(*rt,
1170 left_best_hits,
1171 grade,
1172 FRAG_LEFT,
1173 l_read,
1174 bam_writer,
1175 left_um_out.file);
1176 }
1177 }
1178 else
1179 { //hits for both left and right reads
1180 HitsForRead left_best_hits;
1181 HitsForRead right_best_hits;
1182 left_best_hits.insert_id = curr_left_obs_order;
1183 right_best_hits.insert_id = curr_right_obs_order;
1184
1185 InsertAlignmentGrade grade;
1186 pair_best_alignments(curr_left_hit_group,
1187 curr_right_hit_group,
1188 grade,
1189 left_best_hits,
1190 right_best_hits);
1191
1192 if (left_best_hits.hits.size()>0 && left_best_hits.hits.size() <= max_multihits &&
1193 right_best_hits.hits.size()>0 && right_best_hits.hits.size() <= max_multihits)
1194 {
1195 update_junctions(left_best_hits, *final_junctions);
1196 update_junctions(right_best_hits, *final_junctions);
1197 update_insertions_and_deletions(left_best_hits, *final_insertions, *final_deletions);
1198 update_insertions_and_deletions(right_best_hits, *final_insertions, *final_deletions);
1199
1200 pair_support(left_best_hits, right_best_hits, *final_fusions, *fusions);
1201 update_fusions(left_best_hits, *rt, *final_fusions, *fusions);
1202 update_fusions(right_best_hits, *rt, *final_fusions, *fusions);
1203
1204 print_sam_for_pair(*rt,
1205 left_best_hits,
1206 right_best_hits,
1207 grade,
1208 left_reads_file,
1209 right_reads_file,
1210 bam_writer,
1211 left_um_out.file,
1212 right_um_out.file,
1213 begin_id,
1214 end_id);
1215 }
1216 }
1217
1218 left_hs.next_read_hits(curr_left_hit_group);
1219 curr_left_obs_order = it.observation_order(curr_left_hit_group.insert_id);
1220
1221 right_hs.next_read_hits(curr_right_hit_group);
1222 curr_right_obs_order = it.observation_order(curr_right_hit_group.insert_id);
1223 }
1224 } //while we still have unreported hits..
1225 //print the remaining unmapped reads at the end of each reads' stream
1226 left_reads_file.getRead(VMAXINT32, l_read,
1227 reads_format, false, left_um_out.file,
1228 false, begin_id, end_id);
1229 if (right_reads_file.file())
1230 right_reads_file.getRead(VMAXINT32, r_read,
1231 reads_format, false, right_um_out.file,
1232 false, begin_id, end_id);
1233
1234 left_um_out.close();
1235 right_um_out.close();
1236
1237 for (size_t i = 0; i < hit_factories.size(); ++i)
1238 delete hit_factories[i];
1239
1240 hit_factories.clear();
1241 }
1242
1243 string bam_output_fname;
1244 string sam_header_fname;
1245
1246 string left_reads_fname;
1247 string left_map_fname;
1248 string right_reads_fname;
1249 string right_map_fname;
1250
1251 string left_um_fname;
1252 string right_um_fname;
1253
1254 JunctionSet* gtf_junctions;
1255 JunctionSet* junctions;
1256 FusionSet* fusions;
1257 JunctionSet* final_junctions;
1258 InsertionSet* final_insertions;
1259 DeletionSet* final_deletions;
1260 FusionSet* final_fusions;
1261
1262 RefSequenceTable* rt;
1263
1264 uint64_t begin_id;
1265 uint64_t end_id;
1266 int64_t left_reads_offset;
1267 int64_t left_map_offset;
1268 int64_t right_reads_offset;
1269 int64_t right_map_offset;
1270 };
1271
1272 void driver(const string& bam_output_fname,
1273 istream& ref_stream,
1274 const string& left_map_fname,
1275 const string& left_reads_fname,
1276 const string& right_map_fname,
1277 const string& right_reads_fname,
1278 FILE* junctions_out,
1279 FILE* insertions_out,
1280 FILE* deletions_out,
1281 FILE* fusions_out)
1282 {
1283 if (!parallel)
1284 num_threads = 1;
1285
1286 RefSequenceTable rt(sam_header, true);
1287
1288 if (fusion_search)
1289 get_seqs(ref_stream, rt, true);
1290
1291 srandom(1);
1292
1293 JunctionSet gtf_junctions;
1294 if (!gtf_juncs.empty())
1295 {
1296 char splice_buf[2048];
1297 FILE* splice_coords = fopen(gtf_juncs.c_str(), "r");
1298 if (splice_coords)
1299 {
1300 while (fgets(splice_buf, sizeof(splice_buf), splice_coords))
1301 {
1302 char* nl = strrchr(splice_buf, '\n');
1303 char* buf = splice_buf;
1304 if (nl) *nl = 0;
1305
1306 char* ref_name = get_token((char**)&buf, "\t");
1307 char* scan_left_coord = get_token((char**)&buf, "\t");
1308 char* scan_right_coord = get_token((char**)&buf, "\t");
1309 char* orientation = get_token((char**)&buf, "\t");
1310
1311 if (!scan_left_coord || !scan_right_coord || !orientation)
1312 {
1313 fprintf(stderr,"Error: malformed splice coordinate record in %s\n:%s\n",
1314 gtf_juncs.c_str(), buf);
1315 exit(1);
1316 }
1317
1318 uint32_t ref_id = rt.get_id(ref_name, NULL, 0);
1319 uint32_t left_coord = atoi(scan_left_coord);
1320 uint32_t right_coord = atoi(scan_right_coord);
1321 bool antisense = *orientation == '-';
1322
1323 // add 1 to left_coord to meet BED format
1324 gtf_junctions.insert(make_pair<Junction, JunctionStats>(Junction(ref_id, left_coord + 1, right_coord, antisense), JunctionStats()));
1325 }
1326 }
1327 fprintf(stderr, "Loaded %d GFF junctions from %s.\n", (int)(gtf_junctions.size()), gtf_juncs.c_str());
1328 }
1329
1330 vector<uint64_t> read_ids;
1331 vector<vector<int64_t> > offsets;
1332 if (num_threads > 1)
1333 {
1334 vector<string> fnames;
1335 if (right_map_fname != "")
1336 {
1337 fnames.push_back(right_reads_fname);
1338 fnames.push_back(right_map_fname);
1339 }
1340 fnames.push_back(left_reads_fname);
1341 fnames.push_back(left_map_fname);
1342 bool enough_data = calculate_offsets(fnames, read_ids, offsets);
1343 if (!enough_data)
1344 num_threads = 1;
1345 }
1346
1347 JunctionSet vjunctions[num_threads];
1348 FusionSet vfusions[num_threads];
1349 vector<boost::thread*> threads;
1350 for (int i = 0; i < num_threads; ++i)
1351 {
1352 ConsensusEventsWorker worker;
1353
1354 worker.left_map_fname = left_map_fname;
1355 worker.right_map_fname = right_map_fname;
1356 worker.junctions = &vjunctions[i];
1357 worker.fusions = &vfusions[i];
1358 worker.gtf_junctions = &gtf_junctions;
1359 worker.rt = &rt;
1360
1361 worker.right_map_offset = 0;
1362
1363 if (i == 0)
1364 {
1365 worker.begin_id = 0;
1366 worker.left_map_offset = 0;
1367 }
1368 else
1369 {
1370 size_t offsets_size = offsets[i-1].size();
1371
1372 worker.begin_id = read_ids[i-1];
1373 worker.left_map_offset = offsets[i-1].back();
1374 if (offsets_size == 4)
1375 worker.right_map_offset = offsets[i-1][1];
1376 }
1377
1378 worker.end_id = (i+1 < num_threads) ? read_ids[i] : std::numeric_limits<uint64_t>::max();
1379
1380 if (num_threads > 1)
1381 threads.push_back(new boost::thread(worker));
1382 else
1383 worker();
1384 }
1385
1386 for (size_t i = 0; i < threads.size(); ++i)
1387 {
1388 threads[i]->join();
1389 delete threads[i];
1390 threads[i] = NULL;
1391 }
1392 threads.clear();
1393
1394 JunctionSet junctions = vjunctions[0];
1395 FusionSet fusions = vfusions[0];
1396 for (size_t i = 1; i < num_threads; ++i)
1397 {
1398 merge_with(junctions, vjunctions[i]);
1399 vjunctions[i].clear();
1400
1401 merge_with(fusions, vfusions[i]);
1402 vfusions[i].clear();
1403 }
1404
1405 size_t num_unfiltered_juncs = junctions.size();
1406 fprintf(stderr, "Loaded %lu junctions\n", (long unsigned int) num_unfiltered_juncs);
1407
1408 // Read hits, extract junctions, and toss the ones that arent strongly enough supported.
1409
1410 filter_junctions(junctions, gtf_junctions);
1411
1412 //size_t num_juncs_after_filter = junctions.size();
1413 //fprintf(stderr, "Filtered %lu junctions\n",
1414 // num_unfiltered_juncs - num_juncs_after_filter);
1415
1416 /*
1417 size_t small_overhangs = 0;
1418 for (JunctionSet::iterator i = junctions.begin(); i != junctions.end(); ++i)
1419 {
1420 if (i->second.accepted &&
1421 (i->second.left_extent < min_anchor_len || i->second.right_extent < min_anchor_len))
1422 {
1423 small_overhangs++;
1424 }
1425 }
1426
1427 if (small_overhangs >0)
1428 fprintf(stderr, "Warning: %lu small overhang junctions!\n", (long unsigned int)small_overhangs);
1429 */
1430
1431 JunctionSet vfinal_junctions[num_threads];
1432 InsertionSet vfinal_insertions[num_threads];
1433 DeletionSet vfinal_deletions[num_threads];
1434 FusionSet vfinal_fusions[num_threads];
1435
1436 for (int i = 0; i < num_threads; ++i)
1437 {
1438 ReportWorker worker;
1439
1440 char filename[1024] = {0};
1441 sprintf(filename, "%s%d.bam", bam_output_fname.c_str(), i);
1442 worker.bam_output_fname = filename;
1443 worker.sam_header_fname = sam_header;
1444 sprintf(filename, "%s/unmapped_left%d.fq", output_dir.c_str(), i);
1445 worker.left_um_fname = filename;
1446
1447 if (right_reads_fname != "")
1448 {
1449 sprintf(filename, "%s/unmapped_right%d.fq", output_dir.c_str(), i);
1450 worker.right_um_fname = filename;
1451 }
1452
1453 worker.left_reads_fname = left_reads_fname;
1454 worker.left_map_fname = left_map_fname;
1455 worker.right_reads_fname = right_reads_fname;
1456 worker.right_map_fname = right_map_fname;
1457
1458 worker.gtf_junctions = &gtf_junctions;
1459 worker.junctions = &junctions;
1460 worker.fusions = &fusions;
1461 worker.final_junctions = &vfinal_junctions[i];
1462 worker.final_insertions = &vfinal_insertions[i];
1463 worker.final_deletions = &vfinal_deletions[i];
1464 worker.final_fusions = &vfinal_fusions[i];
1465 worker.rt = &rt;
1466
1467 worker.right_reads_offset = 0;
1468 worker.right_map_offset = 0;
1469
1470 if (i == 0)
1471 {
1472 worker.begin_id = 0;
1473 worker.left_reads_offset = 0;
1474 worker.left_map_offset = 0;
1475 }
1476 else
1477 {
1478 size_t offsets_size = offsets[i-1].size();
1479
1480 worker.begin_id = read_ids[i-1];
1481 worker.left_reads_offset = offsets[i-1][offsets_size - 2];
1482 worker.left_map_offset = offsets[i-1].back();
1483 if (offsets_size == 4)
1484 {
1485 worker.right_reads_offset = offsets[i-1][0];
1486 worker.right_map_offset = offsets[i-1][1];
1487 }
1488 }
1489
1490 worker.end_id = (i+1 < num_threads) ? read_ids[i] : std::numeric_limits<uint64_t>::max();
1491
1492 if (num_threads > 1)
1493 threads.push_back(new boost::thread(worker));
1494 else
1495 worker();
1496 }
1497
1498 for (size_t i = 0; i < threads.size(); ++i)
1499 {
1500 threads[i]->join();
1501 delete threads[i];
1502 threads[i] = NULL;
1503 }
1504 threads.clear();
1505
1506 JunctionSet final_junctions = vfinal_junctions[0];
1507 InsertionSet final_insertions = vfinal_insertions[0];
1508 DeletionSet final_deletions = vfinal_deletions[0];
1509 FusionSet final_fusions = vfinal_fusions[0];
1510 for (size_t i = 1; i < num_threads; ++i)
1511 {
1512 merge_with(final_junctions, vfinal_junctions[i]);
1513 vfinal_junctions[i].clear();
1514
1515 merge_with(final_insertions, vfinal_insertions[i]);
1516 vfinal_insertions[i].clear();
1517
1518 merge_with(final_deletions, vfinal_deletions[i]);
1519 vfinal_deletions[i].clear();
1520
1521 merge_with(final_fusions, vfinal_fusions[i]);
1522 vfinal_fusions[i].clear();
1523 }
1524
1525 fprintf (stderr, "Reporting final accepted alignments...");
1526 fprintf (stderr, "done.\n");
1527
1528 //small_overhangs = 0;
1529 for (JunctionSet::iterator i = final_junctions.begin(); i != final_junctions.end();)
1530 {
1531 if (i->second.supporting_hits == 0 || i->second.left_extent < 8 || i->second.right_extent < 8)
1532 {
1533 final_junctions.erase(i++);
1534 }
1535 else
1536 {
1537 ++i;
1538 }
1539 }
1540
1541 // if (small_overhangs > 0)
1542 // fprintf(stderr, "Warning: %lu small overhang junctions!\n", small_overhangs);
1543
1544 fprintf (stderr, "Printing junction BED track...");
1545 print_junctions(junctions_out, final_junctions, rt);
1546 fprintf (stderr, "done\n");
1547
1548 fprintf (stderr, "Printing insertions...");
1549 print_insertions(insertions_out, final_insertions,rt);
1550 fclose(insertions_out);
1551 fprintf (stderr, "done\n");
1552
1553 fprintf (stderr, "Printing deletions...");
1554 print_deletions(deletions_out, final_deletions, rt);
1555 fclose(deletions_out);
1556 fprintf (stderr, "done\n");
1557
1558 if (fusion_search)
1559 {
1560 fprintf (stderr, "Printing fusions...");
1561 print_fusions(fusions_out, final_fusions, rt);
1562 fclose(fusions_out);
1563 fprintf (stderr, "done\n");
1564 }
1565
1566 fprintf(stderr, "Found %lu junctions from happy spliced reads\n", (long unsigned int)final_junctions.size());
1567 }
1568
1569 void print_usage()
1570 {
1571 fprintf(stderr, "Usage: tophat_reports <junctions.bed> <insertions.vcf> <deletions.vcf> <accepted_hits.sam> <left_map1,...,left_mapN> <left_reads.fq> [right_map1,...,right_mapN] [right_reads.fq]\n");
1572
1573 // fprintf(stderr, "Usage: tophat_reports <coverage.wig> <junctions.bed> <accepted_hits.sam> <map1.bwtout> [splice_map1.sbwtout]\n");
1574 }
1575
1576 int main(int argc, char** argv)
1577 {
1578 fprintf(stderr, "tophat_reports v%s (%s)\n", PACKAGE_VERSION, SVN_REVISION);
1579 fprintf(stderr, "---------------------------------------\n");
1580
1581 reads_format = FASTQ;
1582
1583 int parse_ret = parse_options(argc, argv, print_usage);
1584 if (parse_ret)
1585 return parse_ret;
1586
1587 if(optind >= argc)
1588 {
1589 print_usage();
1590 return 1;
1591 }
1592
1593 string ref_file_name = argv[optind++];
1594
1595 if(optind >= argc)
1596 {
1597 print_usage();
1598 return 1;
1599 }
1600
1601 string junctions_file_name = argv[optind++];
1602
1603 if(optind >= argc)
1604 {
1605 print_usage();
1606 return 1;
1607 }
1608
1609 string insertions_file_name = argv[optind++];
1610 if(optind >= argc)
1611 {
1612 print_usage();
1613 return 1;
1614 }
1615
1616 string deletions_file_name = argv[optind++];
1617 if(optind >= argc)
1618 {
1619 print_usage();
1620 return 1;
1621 }
1622
1623 string fusions_file_name = argv[optind++];
1624 if(optind >= argc)
1625 {
1626 print_usage();
1627 return 1;
1628 }
1629
1630 string accepted_hits_file_name = argv[optind++];
1631 if(optind >= argc)
1632 {
1633 print_usage();
1634 return 1;
1635 }
1636
1637 string left_map_filename = argv[optind++];
1638 if(optind >= argc)
1639 {
1640 print_usage();
1641 return 1;
1642 }
1643
1644 string left_reads_filename = argv[optind++];
1645 string unzcmd=getUnpackCmd(left_reads_filename, false);
1646
1647 string right_map_filename;
1648 string right_reads_filename;
1649
1650 if (optind < argc)
1651 {
1652 right_map_filename = argv[optind++];
1653 if(optind >= argc) {
1654 print_usage();
1655 return 1;
1656 }
1657 right_reads_filename=argv[optind++];
1658 }
1659
1660 ifstream ref_stream(ref_file_name.c_str(), ifstream::in);
1661 if (!ref_stream.good())
1662 {
1663 fprintf(stderr, "Error: cannot open %s for reading\n",
1664 ref_file_name.c_str());
1665 exit(1);
1666 }
1667
1668 FILE* junctions_file = fopen(junctions_file_name.c_str(), "w");
1669 if (junctions_file == NULL)
1670 {
1671 fprintf(stderr, "Error: cannot open BED file %s for writing\n",
1672 junctions_file_name.c_str());
1673 exit(1);
1674 }
1675
1676 FILE* insertions_file = fopen(insertions_file_name.c_str(), "w");
1677 if (insertions_file == NULL)
1678 {
1679 fprintf(stderr, "Error: cannot open VCF file %s for writing\n",
1680 insertions_file_name.c_str());
1681 exit(1);
1682 }
1683
1684 FILE* deletions_file = fopen(deletions_file_name.c_str(), "w");
1685 if (deletions_file == NULL)
1686 {
1687 fprintf(stderr, "Error: cannot open VCF file %s for writing\n",
1688 deletions_file_name.c_str());
1689 exit(1);
1690 }
1691
1692 FILE* fusions_file = NULL;
1693 if (fusion_search)
1694 {
1695 fusions_file = fopen(fusions_file_name.c_str(), "w");
1696 if (fusions_file == NULL)
1697 {
1698 fprintf(stderr, "Error: cannot open VCF file %s for writing\n",
1699 fusions_file_name.c_str());
1700 exit(1);
1701 }
1702 }
1703
1704 driver(accepted_hits_file_name,
1705 ref_stream,
1706 left_map_filename,
1707 left_reads_filename,
1708 right_map_filename,
1709 right_reads_filename,
1710 junctions_file,
1711 insertions_file,
1712 deletions_file,
1713 fusions_file);
1714
1715 return 0;
1716 }