ViewVC Help
View File | Revision Log | Show Annotations | View Changeset | Root Listing
root/gclib/tophat_cpp/tophat_reports.cpp
Revision: 166
Committed: Fri Feb 10 02:45:06 2012 UTC (8 years, 8 months ago) by gpertea
File size: 52075 byte(s)
Log Message:
resync

Line User Rev File contents
1 gpertea 29 /*
2     * tophat_reports.cpp
3     * TopHat
4     *
5     * Created by Cole Trapnell on 11/20/08.
6     * Copyright 2008 Cole Trapnell. All rights reserved.
7     *
8     */
9    
10     #ifdef HAVE_CONFIG_H
11     #include <config.h>
12     #else
13     #define PACKAGE_VERSION "INTERNAL"
14     #define SVN_REVISION "XXX"
15     #endif
16    
17     #include <cassert>
18     #include <cstdio>
19     #include <cstring>
20     #include <vector>
21     #include <string>
22     #include <map>
23     #include <algorithm>
24     #include <set>
25     #include <stdexcept>
26     #include <iostream>
27     #include <fstream>
28     #include <seqan/sequence.h>
29     #include <seqan/find.h>
30     #include <seqan/file.h>
31     #include <getopt.h>
32    
33 gpertea 154 #include <boost/thread.hpp>
34    
35 gpertea 29 #include "common.h"
36 gpertea 154 #include "utils.h"
37 gpertea 29 #include "bwt_map.h"
38     #include "junctions.h"
39     #include "insertions.h"
40     #include "deletions.h"
41 gpertea 154 #include "fusions.h"
42 gpertea 29 #include "align_status.h"
43     #include "fragments.h"
44     #include "wiggles.h"
45     #include "tokenize.h"
46     #include "reads.h"
47    
48 gpertea 154
49 gpertea 29 #include "inserts.h"
50    
51     using namespace std;
52     using namespace seqan;
53     using std::set;
54    
55 gpertea 154 // daehwan - this is redundancy, which should be removed.
56     void get_seqs(istream& ref_stream,
57     RefSequenceTable& rt,
58     bool keep_seqs = true)
59 gpertea 166 {
60 gpertea 154 while(ref_stream.good() && !ref_stream.eof())
61     {
62     RefSequenceTable::Sequence* ref_str = new RefSequenceTable::Sequence();
63     string name;
64     readMeta(ref_stream, name, Fasta());
65     string::size_type space_pos = name.find_first_of(" \t\r");
66     if (space_pos != string::npos)
67     {
68     name.resize(space_pos);
69     }
70     seqan::read(ref_stream, *ref_str, Fasta());
71 gpertea 166
72 gpertea 154 rt.get_id(name, keep_seqs ? ref_str : NULL, 0);
73     }
74     }
75    
76 gpertea 166 struct cmp_read_alignment
77     {
78     cmp_read_alignment(const JunctionSet& gtf_juncs) :
79     _gtf_juncs(&gtf_juncs) {}
80    
81     bool operator() (const BowtieHit& l, const BowtieHit& r) const
82     {
83     FragmentAlignmentGrade gl(l, *_gtf_juncs);
84     FragmentAlignmentGrade gr(r, *_gtf_juncs);
85     bool better = gr < gl;
86     bool worse = gl < gr;
87    
88     if (better && !worse)
89     return true;
90     else
91     return false;
92     }
93    
94     const JunctionSet* _gtf_juncs;
95     };
96    
97 gpertea 135 void read_best_alignments(const HitsForRead& hits_for_read,
98 gpertea 166 FragmentAlignmentGrade& best_grade,
99     HitsForRead& best_hits,
100     const JunctionSet& gtf_junctions)
101 gpertea 29 {
102     const vector<BowtieHit>& hits = hits_for_read.hits;
103     for (size_t i = 0; i < hits.size(); ++i)
104     {
105 gpertea 166 if (hits[i].edit_dist() > max_read_mismatches)
106     continue;
107 gpertea 163
108 gpertea 166 if (report_secondary_alignments)
109     {
110     best_hits.hits.push_back(hits[i]);
111     }
112     else
113     {
114     FragmentAlignmentGrade g(hits[i], gtf_junctions);
115    
116     // Is the new status better than the current best one?
117     if (best_grade < g)
118     {
119     best_hits.hits.clear();
120     best_grade = g;
121     best_hits.hits.push_back(hits[i]);
122     }
123     else if (!(g < best_grade)) // is it just as good?
124     {
125     best_grade.num_alignments++;
126     best_hits.hits.push_back(hits[i]);
127     }
128     }
129     }
130 gpertea 163
131 gpertea 166 if (report_secondary_alignments && best_hits.hits.size() > 0)
132     {
133     cmp_read_alignment cmp(gtf_junctions);
134     sort(best_hits.hits.begin(), best_hits.hits.end(), cmp);
135     best_grade = FragmentAlignmentGrade(best_hits.hits[0], gtf_junctions);
136     best_grade.num_alignments = best_hits.hits.size();
137 gpertea 29 }
138     }
139    
140 gpertea 166 void set_insert_alignment_grade(const BowtieHit& lh, const BowtieHit& rh, InsertAlignmentGrade& grade)
141 gpertea 29 {
142 gpertea 154 // max mate inner distance (genomic)
143     int min_mate_inner_dist = inner_dist_mean - inner_dist_std_dev;
144     if (max_mate_inner_dist == -1)
145     max_mate_inner_dist = inner_dist_mean + inner_dist_std_dev;
146 gpertea 166
147     bool fusion = false;
148     if (fusion_search)
149     {
150     bool left_fusion = lh.fusion_opcode() != FUSION_NOTHING;
151     bool right_fusion = rh.fusion_opcode() != FUSION_NOTHING;
152     if (left_fusion && right_fusion)
153     return;
154    
155     fusion = left_fusion || right_fusion;
156     if (!fusion && lh.ref_id() != rh.ref_id())
157     fusion = true;
158    
159     if (!fusion && lh.ref_id() == rh.ref_id())
160     {
161     if (lh.antisense_align() == rh.antisense_align())
162     fusion = true;
163     else
164     {
165     int inter_dist = 0;
166     if (lh.antisense_align())
167     inter_dist = lh.left() - rh.right();
168     else
169     inter_dist = rh.left() - lh.right();
170    
171     if (inter_dist < -(int)max_insertion_length || inter_dist > (int)fusion_min_dist)
172     fusion = true;
173     }
174     }
175     }
176 gpertea 163
177 gpertea 166 grade = InsertAlignmentGrade(lh, rh, min_mate_inner_dist, max_mate_inner_dist, fusion);
178     }
179    
180     struct cmp_pair_alignment
181     {
182     cmp_pair_alignment(const JunctionSet& gtf_juncs) :
183     _gtf_juncs(&gtf_juncs) {}
184    
185     bool operator() (const pair<BowtieHit, BowtieHit>& l, const pair<BowtieHit, BowtieHit>& r) const
186     {
187     InsertAlignmentGrade gl; set_insert_alignment_grade(l.first, l.second, gl);
188     InsertAlignmentGrade gr; set_insert_alignment_grade(r.first, r.second, gr);
189    
190     bool better = gr < gl;
191     bool worse = gl < gr;
192    
193     if (better && !worse)
194     return true;
195     else
196     return false;
197     }
198    
199     const JunctionSet* _gtf_juncs;
200     };
201    
202     void pair_best_alignments(const HitsForRead& left_hits,
203     const HitsForRead& right_hits,
204     InsertAlignmentGrade& best_grade,
205     vector<pair<BowtieHit, BowtieHit> >& best_hits,
206     const JunctionSet& gtf_junctions)
207     {
208 gpertea 154 const vector<BowtieHit>& left = left_hits.hits;
209     const vector<BowtieHit>& right = right_hits.hits;
210 gpertea 166
211 gpertea 154 for (size_t i = 0; i < left.size(); ++i)
212     {
213     const BowtieHit& lh = left[i];
214     if (lh.edit_dist() > max_read_mismatches) continue;
215     for (size_t j = 0; j < right.size(); ++j)
216 gpertea 29 {
217 gpertea 154 const BowtieHit& rh = right[j];
218     if (rh.edit_dist() > max_read_mismatches) continue;
219 gpertea 135
220 gpertea 166 InsertAlignmentGrade g; set_insert_alignment_grade(lh, rh, g);
221    
222     if (report_secondary_alignments)
223 gpertea 154 {
224 gpertea 166 best_hits.push_back(make_pair(lh, rh));
225 gpertea 154 }
226 gpertea 166 else
227 gpertea 154 {
228 gpertea 166 // Is the new status better than the current best one?
229     if (best_grade < g)
230 gpertea 159 {
231 gpertea 166 best_hits.clear();
232     best_grade = g;
233     best_hits.push_back(make_pair(lh, rh));
234 gpertea 159 }
235 gpertea 166 else if (!(g < best_grade))
236     {
237     best_hits.push_back(make_pair(lh, rh));
238     best_grade.num_alignments++;
239     }
240 gpertea 154 }
241 gpertea 29 }
242 gpertea 154 }
243 gpertea 166
244     if (report_secondary_alignments && best_hits.size() > 0)
245     {
246     cmp_pair_alignment cmp(gtf_junctions);
247     sort(best_hits.begin(), best_hits.end(), cmp);
248     set_insert_alignment_grade(best_hits[0].first, best_hits[0].second, best_grade);
249     best_grade.num_alignments = best_hits.size();
250     }
251 gpertea 29 }
252    
253     enum FragmentType {FRAG_UNPAIRED, FRAG_LEFT, FRAG_RIGHT};
254    
255     void add_auxData(vector<string>& auxdata, vector<string>& sam_toks,
256     const RefSequenceTable& rt,const BowtieHit& bh, FragmentType insert_side,
257     int num_hits, const BowtieHit* next_hit, int hitIndex) {
258 gpertea 159 bool XS_found = false;
259     if (sam_toks.size()>11) {
260 gpertea 166
261 gpertea 159 for (size_t i=11;i<sam_toks.size();++i) {
262     if (sam_toks[i].find("NH:i:")==string::npos &&
263     sam_toks[i].find("XS:i:")==string::npos)
264     auxdata.push_back(sam_toks[i]);
265    
266     if (sam_toks[i].find("XS:A:")!=string::npos)
267     XS_found = true;
268     }
269     }
270     string aux("NH:i:");
271     str_appendInt(aux, num_hits);
272     auxdata.push_back(aux);
273     if (next_hit) {
274     const char* nh_ref_name = "=";
275     nh_ref_name = rt.get_name(next_hit->ref_id());
276     assert (nh_ref_name != NULL);
277     bool same_contig=(next_hit->ref_id()==bh.ref_id());
278     aux="CC:Z:";
279     aux+= (same_contig ? "=" : nh_ref_name);
280 gpertea 29 auxdata.push_back(aux);
281 gpertea 159 aux="CP:i:";
282     int nh_gpos=next_hit->left() + 1;
283     str_appendInt(aux, nh_gpos);
284     auxdata.push_back(aux);
285     } //has next_hit
286 gpertea 29 // FIXME: this code is still a bit brittle, because it contains no
287 gpertea 135 // consistency check that the mates are on opposite strands - a current protocol
288 gpertea 29 // requirement, and that the strand indicated by the alignment is consistent
289     // with the orientation of the splices (though that should be handled upstream).
290 gpertea 159 if (!XS_found) {
291 gpertea 29 const string xs_f("XS:A:+");
292     const string xs_r("XS:A:-");
293 gpertea 159 if (library_type == FR_FIRSTSTRAND) {
294     if (insert_side == FRAG_LEFT || insert_side == FRAG_UNPAIRED) {
295     if (bh.antisense_align())
296     auxdata.push_back(xs_f);
297     else
298     auxdata.push_back(xs_r);
299     }
300     else {
301     if (bh.antisense_align())
302     auxdata.push_back(xs_r);
303     else
304     auxdata.push_back(xs_f);
305     }
306     }
307     else if (library_type == FR_SECONDSTRAND) {
308     if (insert_side == FRAG_LEFT || insert_side == FRAG_UNPAIRED){
309     if (bh.antisense_align())
310     auxdata.push_back(xs_r);
311     else
312     auxdata.push_back(xs_f);
313     }
314     else
315     {
316     if (bh.antisense_align())
317     auxdata.push_back(xs_f);
318     else
319     auxdata.push_back(xs_r);
320     }
321     }
322     }
323     if (hitIndex >= 0)
324 gpertea 29 {
325 gpertea 159 string aux("HI:i:");
326     str_appendInt(aux, hitIndex);
327     auxdata.push_back(aux);
328 gpertea 29 }
329     }
330    
331 gpertea 154 bool rewrite_sam_record(GBamWriter& bam_writer,
332     const RefSequenceTable& rt,
333 gpertea 29 const BowtieHit& bh,
334     const char* bwt_buf,
335 gpertea 135 const char* read_alt_name,
336 gpertea 29 const FragmentAlignmentGrade& grade,
337     FragmentType insert_side,
338     int num_hits,
339     const BowtieHit* next_hit,
340     bool primary,
341     int hitIndex)
342     {
343 gpertea 154 // Rewrite this hit, filling in the alt name, mate mapping
344     // and setting the pair flag
345     vector<string> sam_toks;
346     tokenize(bwt_buf, "\t", sam_toks);
347    
348     string ref_name = sam_toks[2], ref_name2 = "";
349     char rebuf2[2048] = {0};
350     char cigar1[256] = {0}, cigar2[256] = {0};
351     string left_seq, right_seq, left_qual, right_qual;
352     int left = -1, left1 = -1, left2 = -1;
353     bool fusion_alignment = false;
354     size_t XF_index = 0;
355     for (size_t i = 11; i < sam_toks.size(); ++i)
356     {
357     const string& tok = sam_toks[i];
358     if (strncmp(tok.c_str(), "XF", 2) == 0)
359     {
360     fusion_alignment = true;
361     XF_index = i;
362 gpertea 166
363 gpertea 154 vector<string> tuple_fields;
364     tokenize(tok.c_str(), " ", tuple_fields);
365     vector<string> contigs;
366     tokenize(tuple_fields[1].c_str(), "-", contigs);
367     if (contigs.size() >= 2)
368     {
369     ref_name = contigs[0];
370     ref_name2 = contigs[1];
371 gpertea 166 }
372    
373 gpertea 154 extract_partial_hits(bh, tuple_fields[4].c_str(), tuple_fields[5].c_str(),
374     cigar1, cigar2, left_seq, right_seq,
375     left_qual, right_qual, left1, left2);
376 gpertea 166
377 gpertea 154 break;
378     }
379     }
380 gpertea 166
381 gpertea 135 string qname(read_alt_name);
382     size_t slash_pos=qname.rfind('/');
383     if (slash_pos!=string::npos)
384 gpertea 154 qname.resize(slash_pos);
385     //read_alt_name as QNAME
386     int flag=atoi(sam_toks[1].c_str()); //FLAG
387     if (insert_side != FRAG_UNPAIRED) {
388     //flag = atoi(sam_toks[1].c_str());
389     // mark this as a singleton mate
390     flag |= 0x0001;
391     if (insert_side == FRAG_LEFT)
392     flag |= 0x0040;
393     else if (insert_side == FRAG_RIGHT)
394     flag |= 0x0080;
395     flag |= 0x0008;
396     //char flag_buf[64];
397     //sprintf(flag_buf, "%d", flag);
398     //sam_toks[t] = flag_buf;
399     }
400     if (!primary)
401     flag |= 0x100;
402 gpertea 166
403 gpertea 154 int gpos=isdigit(sam_toks[3][0]) ? atoi(sam_toks[3].c_str()) : 0;
404     int mapQ=255;
405     if (grade.num_alignments > 1) {
406     double err_prob = 1 - (1.0 / grade.num_alignments);
407     mapQ = (int)(-10.0 * log(err_prob) / log(10.0));
408     }
409     int tlen =atoi(sam_toks[8].c_str()); //TLEN
410     int mate_pos=atoi(sam_toks[7].c_str());
411    
412     GBamRecord* bamrec=NULL;
413     if (fusion_alignment) {
414     vector<string> auxdata;
415     add_auxData(auxdata, sam_toks, rt, bh, insert_side, num_hits, next_hit, hitIndex);
416     bamrec=bam_writer.new_record(qname.c_str(), flag, ref_name.c_str(), left1 + 1, mapQ,
417     cigar1, sam_toks[6].c_str(), mate_pos,
418     tlen, left_seq.c_str(), left_qual.c_str(), &auxdata);
419     bam_writer.write(bamrec);
420     delete bamrec;
421    
422     auxdata.clear();
423     sam_toks[XF_index][5] = '2';
424     add_auxData(auxdata, sam_toks, rt, bh, insert_side, num_hits, next_hit, hitIndex);
425     bamrec=bam_writer.new_record(qname.c_str(), flag, ref_name2.c_str(), left2 + 1, mapQ,
426     cigar2, sam_toks[6].c_str(), mate_pos,
427     tlen, right_seq.c_str(), right_qual.c_str(), &auxdata);
428     bam_writer.write(bamrec);
429     delete bamrec;
430     } else {
431     vector<string> auxdata;
432     add_auxData(auxdata, sam_toks, rt, bh, insert_side, num_hits, next_hit, hitIndex);
433     bamrec=bam_writer.new_record(qname.c_str(), flag, sam_toks[2].c_str(), gpos, mapQ,
434     sam_toks[5].c_str(), sam_toks[6].c_str(), mate_pos,
435     tlen, sam_toks[9].c_str(), sam_toks[10].c_str(), &auxdata);
436     bam_writer.write(bamrec);
437     delete bamrec;
438     }
439 gpertea 166
440 gpertea 154 return true;
441 gpertea 29 }
442    
443 gpertea 154 bool rewrite_sam_record(GBamWriter& bam_writer,
444     const RefSequenceTable& rt,
445 gpertea 29 const BowtieHit& bh,
446     const char* bwt_buf,
447 gpertea 135 const char* read_alt_name,
448 gpertea 29 const InsertAlignmentGrade& grade,
449     FragmentType insert_side,
450     const BowtieHit* partner,
451     int num_hits,
452     const BowtieHit* next_hit,
453     bool primary,
454     int hitIndex)
455     {
456 gpertea 154 // Rewrite this hit, filling in the alt name, mate mapping
457     // and setting the pair flag
458     vector<string> sam_toks;
459     tokenize(bwt_buf, "\t", sam_toks);
460 gpertea 135 string qname(read_alt_name);
461     size_t slash_pos=qname.rfind('/');
462     if (slash_pos!=string::npos)
463 gpertea 154 qname.resize(slash_pos);
464     //read_alt_name as QNAME
465     int flag = atoi(sam_toks[1].c_str());
466     // 0x0010 (strand of query) is assumed to be set correctly
467     // to begin with
468     flag |= 0x0001; //it must be paired
469     if (insert_side == FRAG_LEFT)
470     flag |= 0x0040;
471     else if (insert_side == FRAG_RIGHT)
472     flag |= 0x0080;
473     if (!primary)
474     flag |= 0x100;
475    
476     string ref_name = sam_toks[2], ref_name2 = "";
477     char rebuf2[2048] = {0};
478     char cigar1[256] = {0}, cigar2[256] = {0};
479     string left_seq, right_seq, left_qual, right_qual;
480     int left = -1, left1 = -1, left2 = -1;
481     bool fusion_alignment = false;
482     size_t XF_tok_idx = 11;
483     for (; XF_tok_idx < sam_toks.size(); ++XF_tok_idx)
484     {
485     const string& tok = sam_toks[XF_tok_idx];
486     if (strncmp(tok.c_str(), "XF", 2) == 0)
487     {
488     fusion_alignment = true;
489 gpertea 166
490 gpertea 154 vector<string> tuple_fields;
491     tokenize(tok.c_str(), " ", tuple_fields);
492     vector<string> contigs;
493     tokenize(tuple_fields[1].c_str(), "-", contigs);
494     if (contigs.size() >= 2)
495     {
496     ref_name = contigs[0];
497     ref_name2 = contigs[1];
498 gpertea 166 }
499    
500 gpertea 154 extract_partial_hits(bh, tuple_fields[4].c_str(), tuple_fields[5].c_str(),
501     cigar1, cigar2, left_seq, right_seq,
502     left_qual, right_qual, left1, left2);
503 gpertea 166
504 gpertea 154 break;
505     }
506     }
507 gpertea 166
508 gpertea 154 int gpos=isdigit(sam_toks[3][0]) ? atoi(sam_toks[3].c_str()) : 0;
509     int mapQ=255;
510     if (grade.num_alignments > 1) {
511     double err_prob = 1 - (1.0 / grade.num_alignments);
512     mapQ = (int)(-10.0 * log(err_prob) / log(10.0));
513     }
514     int tlen=0; //TLEN
515     int mate_pos=atoi(sam_toks[7].c_str());
516     if (partner) {
517     if (partner->ref_id()==bh.ref_id()) {
518     sam_toks[6] = "="; //same chromosome
519     //TLEN:
520     tlen = bh.left() < partner->left() ? partner->right() - bh.left() :
521     partner->left() - bh.right();
522     }
523 gpertea 166
524 gpertea 154 else { //partner on different chromosome/contig
525     sam_toks[6] = rt.get_name(partner->ref_id());
526 gpertea 29 }
527 gpertea 154 mate_pos = partner->left() + 1;
528     if (grade.happy())
529     flag |= 0x0002;
530     if (partner->antisense_align())
531     flag |= 0x0020;
532 gpertea 29
533 gpertea 154 if (partner->fusion_opcode() != FUSION_NOTHING)
534     {
535     char partner_pos[1024];
536     sprintf(partner_pos, "XP:Z:%s-%s %d", rt.get_name(partner->ref_id()), rt.get_name(partner->ref_id2()), partner->left() + 1);
537     sam_toks.push_back(partner_pos);
538     }
539     }
540     else {
541     sam_toks[6] = "*";
542     mate_pos = 0;
543     flag |= 0x0008;
544     }
545    
546     GBamRecord* bamrec=NULL;
547     if (fusion_alignment) {
548     vector<string> auxdata;
549     add_auxData(auxdata, sam_toks, rt, bh, insert_side, num_hits, next_hit, hitIndex);
550     bamrec=bam_writer.new_record(qname.c_str(), flag, ref_name.c_str(), left1 + 1, mapQ,
551     cigar1, sam_toks[6].c_str(), mate_pos,
552     tlen, left_seq.c_str(), left_qual.c_str(), &auxdata);
553     bam_writer.write(bamrec);
554     delete bamrec;
555    
556     auxdata.clear();
557     sam_toks[XF_tok_idx][5] = '2';
558     add_auxData(auxdata, sam_toks, rt, bh, insert_side, num_hits, next_hit, hitIndex);
559     bamrec=bam_writer.new_record(qname.c_str(), flag, ref_name2.c_str(), left2 + 1, mapQ,
560     cigar2, sam_toks[6].c_str(), mate_pos,
561     tlen, right_seq.c_str(), right_qual.c_str(), &auxdata);
562     bam_writer.write(bamrec);
563     delete bamrec;
564     } else {
565     vector<string> auxdata;
566     add_auxData(auxdata, sam_toks, rt, bh, insert_side, num_hits, next_hit, hitIndex);
567     bamrec=bam_writer.new_record(qname.c_str(), flag, sam_toks[2].c_str(), gpos, mapQ,
568     sam_toks[5].c_str(), sam_toks[6].c_str(), mate_pos,
569     tlen, sam_toks[9].c_str(), sam_toks[10].c_str(), &auxdata);
570     bam_writer.write(bamrec);
571     delete bamrec;
572     }
573    
574     return true;
575 gpertea 29 }
576    
577     struct lex_hit_sort
578     {
579     lex_hit_sort(const RefSequenceTable& rt, const HitsForRead& hits)
580     : _rt(rt), _hits(hits)
581     {}
582 gpertea 166
583 gpertea 29 bool operator()(const uint32_t& l, const uint32_t& r) const
584     {
585     const BowtieHit& lhs = _hits.hits[l];
586     const BowtieHit& rhs = _hits.hits[r];
587 gpertea 154
588 gpertea 29 uint32_t l_id = lhs.ref_id();
589     uint32_t r_id = rhs.ref_id();
590     if (l_id != r_id)
591 gpertea 154 return l_id < r_id;
592    
593 gpertea 29 return lhs.left() < rhs.left();
594     }
595 gpertea 166
596 gpertea 29 const RefSequenceTable& _rt;
597     const HitsForRead& _hits;
598     };
599    
600 gpertea 135 void print_sam_for_single(const RefSequenceTable& rt,
601 gpertea 154 const HitsForRead& hits,
602 gpertea 159 const FragmentAlignmentGrade& grade,
603     FragmentType frag_type,
604     Read& read,
605     GBamWriter& bam_writer,
606     FILE* um_out //write unmapped reads here
607     )
608 gpertea 29 {
609 gpertea 135 assert(!read.alt_name.empty());
610 gpertea 166 if (hits.hits.empty())
611     return;
612    
613 gpertea 29 lex_hit_sort s(rt, hits);
614     vector<uint32_t> index_vector;
615 gpertea 154
616     size_t i;
617     for (i = 0; i < hits.hits.size(); ++i)
618 gpertea 29 index_vector.push_back(i);
619 gpertea 166
620 gpertea 29 sort(index_vector.begin(), index_vector.end(), s);
621 gpertea 154
622 gpertea 166 size_t primaryHit = 0;
623     if (!report_secondary_alignments)
624     primaryHit = random() % hits.hits.size();
625    
626 gpertea 29 bool multipleHits = (hits.hits.size() > 1);
627 gpertea 154 for (i = 0; i < hits.hits.size(); ++i)
628     {
629     size_t index = index_vector[i];
630 gpertea 166 bool primary = (index == primaryHit);
631 gpertea 154 const BowtieHit& bh = hits.hits[index];
632     rewrite_sam_record(bam_writer, rt,
633     bh,
634     bh.hitfile_rec().c_str(),
635     read.alt_name.c_str(),
636     grade,
637     frag_type,
638     hits.hits.size(),
639     (i < hits.hits.size()-1) ? &(hits.hits[index_vector[i+1]]) : NULL,
640     primary,
641     (multipleHits? i: -1));
642     }
643 gpertea 29 }
644    
645 gpertea 135 void print_sam_for_pair(const RefSequenceTable& rt,
646 gpertea 166 const vector<pair<BowtieHit, BowtieHit> >& best_hits,
647 gpertea 29 const InsertAlignmentGrade& grade,
648 gpertea 154 ReadStream& left_reads_file,
649     ReadStream& right_reads_file,
650 gpertea 135 GBamWriter& bam_writer,
651     FILE* left_um_out,
652 gpertea 154 FILE* right_um_out,
653     uint64_t begin_id = 0,
654     uint64_t end_id = std::numeric_limits<uint64_t>::max())
655 gpertea 29 {
656 gpertea 135 Read left_read;
657     Read right_read;
658 gpertea 166 if (best_hits.empty())
659     return;
660 gpertea 163
661 gpertea 166 size_t i;
662     HitsForRead right_hits;
663     for (i = 0; i < best_hits.size(); ++i)
664     right_hits.hits.push_back(best_hits[i].second);
665    
666 gpertea 29 size_t primaryHit = 0;
667     vector<uint32_t> index_vector;
668 gpertea 166 lex_hit_sort s(rt, right_hits);
669     for (i = 0; i < right_hits.hits.size(); ++i)
670     index_vector.push_back(i);
671    
672     sort(index_vector.begin(), index_vector.end(), s);
673 gpertea 135
674 gpertea 166 if (!report_secondary_alignments)
675     primaryHit = random() % right_hits.hits.size();
676    
677     bool got_left_read = left_reads_file.getRead(best_hits[0].first.insert_id(), left_read,
678 gpertea 159 reads_format, false, begin_id, end_id,
679     left_um_out, false);
680 gpertea 166
681     bool got_right_read = right_reads_file.getRead(best_hits[0].first.insert_id(), right_read,
682 gpertea 159 reads_format, false, begin_id, end_id,
683     right_um_out, false);
684 gpertea 166
685     assert (got_left_read && got_right_read);
686     bool multipleHits = (best_hits.size() > 1);
687     for (i = 0; i < best_hits.size(); ++i)
688     {
689     size_t index = index_vector[i];
690     bool primary = (index == primaryHit);
691     const BowtieHit& right_bh = best_hits[index].second;
692     const BowtieHit& left_bh = best_hits[index].first;
693    
694     rewrite_sam_record(bam_writer, rt,
695     right_bh,
696     right_bh.hitfile_rec().c_str(),
697     right_read.alt_name.c_str(),
698     grade,
699     FRAG_RIGHT,
700     &left_bh,
701     best_hits.size(),
702     (i < best_hits.size() - 1) ? &(best_hits[index_vector[i+1]].second) : NULL,
703     primary,
704     (multipleHits? i: -1));
705     rewrite_sam_record(bam_writer, rt,
706     left_bh,
707     left_bh.hitfile_rec().c_str(),
708     left_read.alt_name.c_str(),
709     grade,
710     FRAG_LEFT,
711     &right_bh,
712     best_hits.size(),
713     (i < best_hits.size() - 1) ? &(best_hits[index_vector[i+1]].first) : NULL,
714     primary,
715     (multipleHits? i: -1));
716     }
717 gpertea 29 }
718    
719     /**
720     * Given all of the hits for a particular read, update the read counts for insertions and deletions.
721     * @param hits hits The alignments for a particular read
722     * @param insertions Maps from an insertion to the number of supporting reads for that insertion
723     * @param deletions Maps from a deletion to the number of supporting reads for that deletion
724     */
725     void update_insertions_and_deletions(const HitsForRead& hits,
726     InsertionSet& insertions,
727     DeletionSet& deletions)
728     {
729     for (size_t i = 0; i < hits.hits.size(); ++i)
730     {
731     const BowtieHit& bh = hits.hits[i];
732     insertions_from_alignment(bh, insertions);
733     deletions_from_alignment(bh, deletions);
734     }
735     }
736    
737    
738 gpertea 154 FusionSet empty_fusions;
739     void update_fusions(const HitsForRead& hits,
740     RefSequenceTable& rt,
741     FusionSet& fusions,
742     FusionSet& fusions_ref = empty_fusions)
743     {
744     if (hits.hits.size() > fusion_multireads)
745     return;
746 gpertea 29
747 gpertea 154 bool update_stat = fusions_ref.size() > 0;
748     for (size_t i = 0; i < hits.hits.size(); ++i)
749     {
750     const BowtieHit& bh = hits.hits[i];
751    
752     if (bh.edit_dist() > fusion_read_mismatches)
753     continue;
754    
755     // daehwan
756     #if 0
757     vector<Fusion> new_fusions;
758     fusions_from_spliced_hit(bh, new_fusions);
759 gpertea 166
760 gpertea 154 for(size_t i = 0; i < new_fusions.size(); ++i)
761     {
762     Fusion fusion = new_fusions[i];
763     if (fusion.left == 110343414 && fusion.right == 80211173 && hits.hits.size() > 1)
764     {
765     for (size_t t = 0; t < hits.hits.size(); ++t)
766     {
767     const BowtieHit& lh = hits.hits[t];
768     cout << "insert id: " << lh.insert_id() << endl;
769     cout << (lh.antisense_align() ? "-" : "+") << endl;
770     cout << lh.ref_id() << ": " << lh.left() << "-" << lh.right() << endl;
771     cout << lh.ref_id2() << ": " << print_cigar(lh.cigar()) << endl;
772     }
773     }
774     }
775     #endif
776    
777     fusions_from_alignment(bh, fusions, rt, update_stat);
778    
779     if (update_stat)
780     unsupport_fusions(bh, fusions, fusions_ref);
781     }
782     }
783    
784 gpertea 29 void update_junctions(const HitsForRead& hits,
785 gpertea 154 JunctionSet& junctions)
786 gpertea 29 {
787     for (size_t i = 0; i < hits.hits.size(); ++i)
788     {
789     const BowtieHit& bh = hits.hits[i];
790     junctions_from_alignment(bh, junctions);
791     }
792     }
793    
794     // Extracts junctions from all the SAM hits (based on REF_SKIPs) in the hit file
795     // resets the stream when finished.
796 gpertea 154 void exclude_hits_on_filtered_junctions(const JunctionSet& junctions, HitsForRead& hits)
797 gpertea 29 {
798 gpertea 154 HitsForRead remaining;
799     remaining.insert_id = hits.insert_id;
800 gpertea 166
801 gpertea 154 for (size_t i = 0; i < hits.hits.size(); ++i)
802     {
803     BowtieHit& bh = hits.hits[i];
804 gpertea 166 if (bh.edit_dist() > max_read_mismatches)
805     continue;
806    
807 gpertea 154 bool filter_hit = false;
808     if (!bh.contiguous())
809 gpertea 29 {
810 gpertea 154 JunctionSet bh_juncs;
811     junctions_from_alignment(bh, bh_juncs);
812     for (JunctionSet::iterator itr = bh_juncs.begin();
813     itr != bh_juncs.end();
814     itr++)
815     {
816     const Junction& j = itr->first;
817     JunctionSet::const_iterator target = junctions.find(j);
818     if (target == junctions.end() || !target->second.accepted)
819 gpertea 29 {
820 gpertea 154 filter_hit = true;
821     break;
822 gpertea 29 }
823 gpertea 154 }
824 gpertea 29 }
825 gpertea 154 if (!filter_hit)
826     remaining.hits.push_back(bh);
827     }
828     hits = remaining;
829 gpertea 29 }
830    
831 gpertea 154 // events include splice junction, indels, and fusion points.
832     struct ConsensusEventsWorker
833 gpertea 29 {
834 gpertea 154 void operator()()
835     {
836     ReadTable it;
837     vector<BAMHitFactory*> hit_factories;
838     hit_factories.push_back(new BAMHitFactory(it, *rt));
839     HitStream l_hs(left_map_fname, hit_factories.back(), false, true, true, true);
840     if (left_map_offset > 0)
841     l_hs.seek(left_map_offset);
842    
843     hit_factories.push_back(new BAMHitFactory(it, *rt));
844     HitStream r_hs(right_map_fname, hit_factories.back(), false, true, true, true);
845     if (right_map_offset > 0)
846     r_hs.seek(right_map_offset);
847    
848     HitsForRead curr_left_hit_group;
849     HitsForRead curr_right_hit_group;
850 gpertea 166
851 gpertea 154 l_hs.next_read_hits(curr_left_hit_group);
852     r_hs.next_read_hits(curr_right_hit_group);
853 gpertea 166
854 gpertea 154 uint32_t curr_left_obs_order = it.observation_order(curr_left_hit_group.insert_id);
855     uint32_t curr_right_obs_order = it.observation_order(curr_right_hit_group.insert_id);
856    
857     // While we still have unreported hits...
858     while((curr_left_obs_order != VMAXINT32 || curr_right_obs_order != VMAXINT32) &&
859     (curr_left_obs_order < end_id || curr_right_obs_order < end_id))
860     {
861     // Chew up left singletons
862     while (curr_left_obs_order < curr_right_obs_order &&
863     curr_left_obs_order < end_id &&
864     curr_left_obs_order != VMAXINT32)
865     {
866     HitsForRead best_hits;
867     best_hits.insert_id = curr_left_obs_order;
868 gpertea 166 FragmentAlignmentGrade best_grade;
869    
870 gpertea 154 // Process hits for left singleton, select best alignments
871 gpertea 166 read_best_alignments(curr_left_hit_group, best_grade, best_hits, *gtf_junctions);
872 gpertea 154 update_junctions(best_hits, *junctions);
873     update_fusions(best_hits, *rt, *fusions);
874 gpertea 166
875 gpertea 154 // Get next hit group
876     l_hs.next_read_hits(curr_left_hit_group);
877     curr_left_obs_order = it.observation_order(curr_left_hit_group.insert_id);
878     }
879 gpertea 166
880 gpertea 154 // Chew up right singletons
881     while (curr_left_obs_order > curr_right_obs_order &&
882     curr_right_obs_order < end_id &&
883     curr_right_obs_order != VMAXINT32)
884     {
885     HitsForRead best_hits;
886     best_hits.insert_id = curr_right_obs_order;
887 gpertea 166 FragmentAlignmentGrade best_grade;
888 gpertea 154 if (curr_right_obs_order >= begin_id)
889     {
890     // Process hit for right singleton, select best alignments
891 gpertea 166 read_best_alignments(curr_right_hit_group, best_grade, best_hits, *gtf_junctions);
892 gpertea 154 update_junctions(best_hits, *junctions);
893     update_fusions(best_hits, *rt, *fusions);
894     }
895 gpertea 166
896 gpertea 154 // Get next hit group
897     r_hs.next_read_hits(curr_right_hit_group);
898     curr_right_obs_order = it.observation_order(curr_right_hit_group.insert_id);
899     }
900 gpertea 166
901 gpertea 154 // Since we have both left hits and right hits for this insert,
902     // Find the best pairing and print both
903     while (curr_left_obs_order == curr_right_obs_order &&
904     curr_left_obs_order < end_id &&
905     curr_left_obs_order != VMAXINT32)
906     {
907 gpertea 166 vector<pair<BowtieHit, BowtieHit> > best_hits;
908 gpertea 163
909 gpertea 166 InsertAlignmentGrade grade;
910     pair_best_alignments(curr_left_hit_group,
911     curr_right_hit_group,
912     grade,
913     best_hits,
914     *gtf_junctions);
915 gpertea 163
916 gpertea 166 HitsForRead single_best_hits;
917     single_best_hits.insert_id = curr_left_obs_order;
918    
919     // this is too much memory copy - we will make it efficient soon
920     for (size_t i = 0; i < best_hits.size(); ++i)
921 gpertea 154 {
922 gpertea 166 single_best_hits.hits.push_back(best_hits[i].first);
923     single_best_hits.hits.push_back(best_hits[i].second);
924 gpertea 154 }
925 gpertea 163
926 gpertea 166 update_junctions(single_best_hits, *junctions);
927     update_fusions(single_best_hits, *rt, *fusions);
928    
929 gpertea 154 l_hs.next_read_hits(curr_left_hit_group);
930     curr_left_obs_order = it.observation_order(curr_left_hit_group.insert_id);
931 gpertea 166
932 gpertea 154 r_hs.next_read_hits(curr_right_hit_group);
933     curr_right_obs_order = it.observation_order(curr_right_hit_group.insert_id);
934     }
935     }
936 gpertea 29
937 gpertea 154 for (size_t i = 0; i < hit_factories.size(); ++i)
938     delete hit_factories[i];
939 gpertea 166
940 gpertea 154 hit_factories.clear();
941     }
942 gpertea 29
943 gpertea 154 string left_map_fname;
944     string right_map_fname;
945    
946     JunctionSet* junctions;
947     JunctionSet* gtf_junctions;
948     FusionSet* fusions;
949     RefSequenceTable* rt;
950    
951     uint64_t begin_id;
952     uint64_t end_id;
953     int64_t left_map_offset;
954     int64_t right_map_offset;
955     };
956    
957     struct ReportWorker
958     {
959     void operator()()
960     {
961     ReadTable it;
962     GBamWriter bam_writer(bam_output_fname.c_str(), sam_header.c_str());
963    
964     ReadStream left_reads_file(left_reads_fname);
965     if (left_reads_file.file() == NULL)
966     err_die("Error: cannot open %s for reading\n", left_reads_fname.c_str());
967    
968     if (left_reads_offset > 0)
969     left_reads_file.seek(left_reads_offset);
970 gpertea 166
971 gpertea 154 if (!zpacker.empty()) left_um_fname+=".z";
972     FZPipe left_um_out;
973     if (left_um_out.openWrite(left_um_fname.c_str(), zpacker)==NULL)
974     err_die("Error: cannot open file %s for writing!\n",left_um_fname.c_str());
975 gpertea 166
976 gpertea 154 ReadStream right_reads_file(right_reads_fname);
977     if (right_reads_offset > 0)
978     right_reads_file.seek(right_reads_offset);
979 gpertea 166
980 gpertea 154 FZPipe right_um_out;
981     if (right_reads_fname != "")
982     {
983     if (!zpacker.empty()) right_um_fname+=".z";
984     if (right_um_out.openWrite(right_um_fname.c_str(), zpacker)==NULL)
985     err_die("Error: cannot open file %s for writing!\n",right_um_fname.c_str());
986     }
987    
988     vector<BAMHitFactory*> hit_factories;
989     hit_factories.push_back(new BAMHitFactory(it, *rt));
990     HitStream left_hs(left_map_fname, hit_factories.back(), false, true, true, true);
991     if (left_map_offset > 0)
992     left_hs.seek(left_map_offset);
993    
994     hit_factories.push_back(new BAMHitFactory(it, *rt));
995     HitStream right_hs(right_map_fname, hit_factories.back(), false, true, true, true);
996     if (right_map_offset > 0)
997     right_hs.seek(right_map_offset);
998 gpertea 166
999 gpertea 154 HitsForRead curr_left_hit_group;
1000     HitsForRead curr_right_hit_group;
1001 gpertea 166
1002 gpertea 154 left_hs.next_read_hits(curr_left_hit_group);
1003     right_hs.next_read_hits(curr_right_hit_group);
1004 gpertea 166
1005 gpertea 154 uint64_t curr_left_obs_order = it.observation_order(curr_left_hit_group.insert_id);
1006     uint64_t curr_right_obs_order = it.observation_order(curr_right_hit_group.insert_id);
1007    
1008     // While we still have unreported hits...
1009     Read l_read;
1010     Read r_read;
1011     while((curr_left_obs_order != VMAXINT32 || curr_right_obs_order != VMAXINT32) &&
1012     (curr_left_obs_order < end_id || curr_right_obs_order < end_id))
1013     {
1014     // Chew up left singletons (pairs with right reads unmapped)
1015     while (curr_left_obs_order < curr_right_obs_order &&
1016     curr_left_obs_order < end_id &&
1017     curr_left_obs_order != VMAXINT32)
1018     {
1019     HitsForRead best_hits;
1020     best_hits.insert_id = curr_left_obs_order;
1021     FragmentAlignmentGrade grade;
1022     bool got_read=left_reads_file.getRead(curr_left_obs_order, l_read,
1023 gpertea 159 reads_format, false, begin_id, end_id,
1024     left_um_out.file, false);
1025 gpertea 163 //assert(got_read);
1026 gpertea 154
1027     if (right_reads_file.file()) {
1028 gpertea 163 /*
1029 gpertea 154 fprintf(left_um_out.file, "@%s #MAPPED#\n%s\n+\n%s\n", l_read.alt_name.c_str(),
1030     l_read.seq.c_str(), l_read.qual.c_str());
1031 gpertea 163 */
1032 gpertea 154 got_read=right_reads_file.getRead(curr_left_obs_order, r_read,
1033 gpertea 159 reads_format, false, begin_id, end_id,
1034     right_um_out.file, true);
1035 gpertea 163 //assert(got_read);
1036 gpertea 154 }
1037 gpertea 166
1038 gpertea 154 exclude_hits_on_filtered_junctions(*junctions, curr_left_hit_group);
1039    
1040     // Process hits for left singleton, select best alignments
1041     read_best_alignments(curr_left_hit_group, grade, best_hits, *gtf_junctions);
1042    
1043 gpertea 166 if (best_hits.hits.size() > max_multihits)
1044 gpertea 154 {
1045 gpertea 166 best_hits.hits.erase(best_hits.hits.begin() + max_multihits, best_hits.hits.end());
1046     grade.num_alignments = best_hits.hits.size();
1047     }
1048    
1049     if (best_hits.hits.size() > 0)
1050     {
1051 gpertea 154 update_junctions(best_hits, *final_junctions);
1052     update_insertions_and_deletions(best_hits, *final_insertions, *final_deletions);
1053     update_fusions(best_hits, *rt, *final_fusions, *fusions);
1054    
1055     print_sam_for_single(*rt,
1056     best_hits,
1057     grade,
1058     (right_map_fname.empty() ? FRAG_UNPAIRED : FRAG_LEFT),
1059     l_read,
1060     bam_writer,
1061     left_um_out.file);
1062     }
1063     // Get next hit group
1064     left_hs.next_read_hits(curr_left_hit_group);
1065     curr_left_obs_order = it.observation_order(curr_left_hit_group.insert_id);
1066 gpertea 166 } //left singletons
1067    
1068 gpertea 154 // Chew up right singletons
1069     while (curr_left_obs_order > curr_right_obs_order &&
1070     curr_right_obs_order < end_id &&
1071     curr_right_obs_order != VMAXINT32)
1072     {
1073     HitsForRead best_hits;
1074     best_hits.insert_id = curr_right_obs_order;
1075     FragmentAlignmentGrade grade;
1076    
1077     if (curr_right_obs_order >= begin_id)
1078     {
1079     bool got_read=right_reads_file.getRead(curr_right_obs_order, r_read,
1080 gpertea 159 reads_format, false, begin_id, end_id,
1081     right_um_out.file, false);
1082 gpertea 154
1083 gpertea 163 //assert(got_read);
1084     /*
1085 gpertea 154 fprintf(right_um_out.file, "@%s #MAPPED#\n%s\n+\n%s\n", r_read.alt_name.c_str(),
1086     r_read.seq.c_str(), r_read.qual.c_str());
1087 gpertea 163 */
1088 gpertea 154 got_read=left_reads_file.getRead(curr_right_obs_order, l_read,
1089 gpertea 159 reads_format, false, begin_id, end_id,
1090     left_um_out.file, true);
1091 gpertea 163 //assert(got_read);
1092 gpertea 154
1093     exclude_hits_on_filtered_junctions(*junctions, curr_right_hit_group);
1094    
1095     // Process hit for right singleton, select best alignments
1096     read_best_alignments(curr_right_hit_group, grade, best_hits, *gtf_junctions);
1097 gpertea 166
1098     if (best_hits.hits.size() > max_multihits)
1099 gpertea 154 {
1100 gpertea 166 best_hits.hits.erase(best_hits.hits.begin() + max_multihits, best_hits.hits.end());
1101     grade.num_alignments = best_hits.hits.size();
1102     }
1103    
1104     if (best_hits.hits.size() > 0)
1105     {
1106 gpertea 154 update_junctions(best_hits, *final_junctions);
1107     update_insertions_and_deletions(best_hits, *final_insertions, *final_deletions);
1108     update_fusions(best_hits, *rt, *final_fusions, *fusions);
1109 gpertea 166
1110 gpertea 154 print_sam_for_single(*rt,
1111     best_hits,
1112     grade,
1113     FRAG_RIGHT,
1114     r_read,
1115     bam_writer,
1116     right_um_out.file);
1117     }
1118     }
1119    
1120     // Get next hit group
1121     right_hs.next_read_hits(curr_right_hit_group);
1122     curr_right_obs_order = it.observation_order(curr_right_hit_group.insert_id);
1123     }
1124 gpertea 166
1125 gpertea 154 // Since we have both left hits and right hits for this insert,
1126     // Find the best pairing and print both
1127     while (curr_left_obs_order == curr_right_obs_order &&
1128     curr_left_obs_order < end_id &&
1129     curr_left_obs_order != VMAXINT32)
1130     {
1131     exclude_hits_on_filtered_junctions(*junctions, curr_left_hit_group);
1132     exclude_hits_on_filtered_junctions(*junctions, curr_right_hit_group);
1133     if (curr_left_hit_group.hits.empty())
1134     { //only right read mapped
1135     //write it in the mapped file with the #MAPPED# flag
1136 gpertea 166
1137 gpertea 154 bool got_read=right_reads_file.getRead(curr_left_obs_order, r_read,
1138 gpertea 159 reads_format, false, begin_id, end_id,
1139     right_um_out.file, false);
1140 gpertea 163 //assert(got_read);
1141     /*
1142 gpertea 154 fprintf(right_um_out.file, "@%s #MAPPED#\n%s\n+\n%s\n", r_read.alt_name.c_str(),
1143     r_read.seq.c_str(), r_read.qual.c_str());
1144 gpertea 163 */
1145 gpertea 154 HitsForRead right_best_hits;
1146     right_best_hits.insert_id = curr_right_obs_order;
1147 gpertea 166
1148 gpertea 154 FragmentAlignmentGrade grade;
1149     read_best_alignments(curr_right_hit_group, grade, right_best_hits, *gtf_junctions);
1150 gpertea 163
1151 gpertea 166 if (right_best_hits.hits.size() > max_multihits)
1152 gpertea 154 {
1153 gpertea 166 right_best_hits.hits.erase(right_best_hits.hits.begin() + max_multihits, right_best_hits.hits.end());
1154     grade.num_alignments = right_best_hits.hits.size();
1155     }
1156    
1157     if (right_best_hits.hits.size() > 0)
1158     {
1159 gpertea 154 update_junctions(right_best_hits, *final_junctions);
1160     update_insertions_and_deletions(right_best_hits, *final_insertions, *final_deletions);
1161     update_fusions(right_best_hits, *rt, *final_fusions, *fusions);
1162 gpertea 166
1163 gpertea 154 print_sam_for_single(*rt,
1164     right_best_hits,
1165     grade,
1166     FRAG_RIGHT,
1167     r_read,
1168     bam_writer,
1169     right_um_out.file);
1170     }
1171     }
1172     else if (curr_right_hit_group.hits.empty())
1173     {
1174     HitsForRead left_best_hits;
1175     left_best_hits.insert_id = curr_left_obs_order;
1176     //only left read mapped
1177     bool got_read=left_reads_file.getRead(curr_left_obs_order, l_read,
1178 gpertea 159 reads_format, false, begin_id, end_id,
1179     left_um_out.file, false);
1180 gpertea 163 //assert(got_read);
1181     /*
1182 gpertea 154 fprintf(left_um_out.file, "@%s #MAPPED#\n%s\n+\n%s\n", l_read.alt_name.c_str(),
1183     l_read.seq.c_str(), l_read.qual.c_str());
1184 gpertea 163 */
1185 gpertea 154 FragmentAlignmentGrade grade;
1186     // Process hits for left singleton, select best alignments
1187     read_best_alignments(curr_left_hit_group, grade, left_best_hits, *gtf_junctions);
1188 gpertea 166
1189     if (left_best_hits.hits.size() > max_multihits)
1190 gpertea 154 {
1191 gpertea 166 left_best_hits.hits.erase(left_best_hits.hits.begin() + max_multihits, left_best_hits.hits.end());
1192     grade.num_alignments = left_best_hits.hits.size();
1193     }
1194    
1195     if (left_best_hits.hits.size() > 0)
1196     {
1197 gpertea 154 update_junctions(left_best_hits, *final_junctions);
1198     update_insertions_and_deletions(left_best_hits, *final_insertions, *final_deletions);
1199     update_fusions(left_best_hits, *rt, *final_fusions, *fusions);
1200 gpertea 166
1201 gpertea 154 print_sam_for_single(*rt,
1202     left_best_hits,
1203     grade,
1204     FRAG_LEFT,
1205     l_read,
1206     bam_writer,
1207     left_um_out.file);
1208     }
1209     }
1210     else
1211 gpertea 166 {
1212     vector<pair<BowtieHit, BowtieHit> > best_hits;
1213 gpertea 154
1214     InsertAlignmentGrade grade;
1215     pair_best_alignments(curr_left_hit_group,
1216     curr_right_hit_group,
1217     grade,
1218 gpertea 166 best_hits,
1219     *gtf_junctions);
1220 gpertea 163
1221 gpertea 166 if (best_hits.size() > max_multihits)
1222 gpertea 154 {
1223 gpertea 166 best_hits.erase(best_hits.begin() + max_multihits, best_hits.end());
1224     grade.num_alignments = best_hits.size();
1225     }
1226 gpertea 154
1227 gpertea 166 //hits for both left and right reads
1228     HitsForRead single_best_hits;
1229     single_best_hits.insert_id = curr_left_obs_order;
1230     for (size_t i = 0; i < best_hits.size(); ++i)
1231     {
1232     single_best_hits.hits.push_back(best_hits[i].first);
1233     single_best_hits.hits.push_back(best_hits[i].second);
1234     }
1235    
1236     if (single_best_hits.hits.size() > 0)
1237     {
1238     update_junctions(single_best_hits, *final_junctions);
1239     update_insertions_and_deletions(single_best_hits, *final_insertions, *final_deletions);
1240 gpertea 154
1241 gpertea 166 pair_support(best_hits, *final_fusions, *fusions);
1242     update_fusions(single_best_hits, *rt, *final_fusions, *fusions);
1243    
1244 gpertea 154 print_sam_for_pair(*rt,
1245 gpertea 166 best_hits,
1246 gpertea 154 grade,
1247     left_reads_file,
1248     right_reads_file,
1249     bam_writer,
1250     left_um_out.file,
1251     right_um_out.file,
1252     begin_id,
1253     end_id);
1254     }
1255     }
1256 gpertea 166
1257 gpertea 154 left_hs.next_read_hits(curr_left_hit_group);
1258     curr_left_obs_order = it.observation_order(curr_left_hit_group.insert_id);
1259 gpertea 166
1260 gpertea 154 right_hs.next_read_hits(curr_right_hit_group);
1261     curr_right_obs_order = it.observation_order(curr_right_hit_group.insert_id);
1262     }
1263     } //while we still have unreported hits..
1264     //print the remaining unmapped reads at the end of each reads' stream
1265     left_reads_file.getRead(VMAXINT32, l_read,
1266 gpertea 159 reads_format, false, begin_id, end_id,
1267     left_um_out.file, false);
1268 gpertea 154 if (right_reads_file.file())
1269     right_reads_file.getRead(VMAXINT32, r_read,
1270 gpertea 159 reads_format, false, begin_id, end_id,
1271     right_um_out.file, false);
1272 gpertea 154
1273     left_um_out.close();
1274     right_um_out.close();
1275    
1276     for (size_t i = 0; i < hit_factories.size(); ++i)
1277     delete hit_factories[i];
1278    
1279     hit_factories.clear();
1280     }
1281    
1282     string bam_output_fname;
1283     string sam_header_fname;
1284    
1285     string left_reads_fname;
1286     string left_map_fname;
1287     string right_reads_fname;
1288     string right_map_fname;
1289    
1290     string left_um_fname;
1291     string right_um_fname;
1292    
1293     JunctionSet* gtf_junctions;
1294     JunctionSet* junctions;
1295     FusionSet* fusions;
1296     JunctionSet* final_junctions;
1297     InsertionSet* final_insertions;
1298     DeletionSet* final_deletions;
1299     FusionSet* final_fusions;
1300    
1301     RefSequenceTable* rt;
1302    
1303     uint64_t begin_id;
1304     uint64_t end_id;
1305     int64_t left_reads_offset;
1306     int64_t left_map_offset;
1307     int64_t right_reads_offset;
1308     int64_t right_map_offset;
1309     };
1310    
1311     void driver(const string& bam_output_fname,
1312     istream& ref_stream,
1313     const string& left_map_fname,
1314     const string& left_reads_fname,
1315     const string& right_map_fname,
1316     const string& right_reads_fname,
1317 gpertea 29 FILE* junctions_out,
1318     FILE* insertions_out,
1319 gpertea 135 FILE* deletions_out,
1320 gpertea 154 FILE* fusions_out)
1321 gpertea 29 {
1322 gpertea 154 if (!parallel)
1323     num_threads = 1;
1324    
1325 gpertea 29 RefSequenceTable rt(sam_header, true);
1326 gpertea 154
1327     if (fusion_search)
1328     get_seqs(ref_stream, rt, true);
1329    
1330     srandom(1);
1331 gpertea 166
1332 gpertea 29 JunctionSet gtf_junctions;
1333 gpertea 135 if (!gtf_juncs.empty())
1334 gpertea 29 {
1335     char splice_buf[2048];
1336     FILE* splice_coords = fopen(gtf_juncs.c_str(), "r");
1337     if (splice_coords)
1338 gpertea 135 {
1339     while (fgets(splice_buf, sizeof(splice_buf), splice_coords))
1340     {
1341     char* nl = strrchr(splice_buf, '\n');
1342     char* buf = splice_buf;
1343     if (nl) *nl = 0;
1344 gpertea 29
1345 gpertea 135 char* ref_name = get_token((char**)&buf, "\t");
1346     char* scan_left_coord = get_token((char**)&buf, "\t");
1347     char* scan_right_coord = get_token((char**)&buf, "\t");
1348     char* orientation = get_token((char**)&buf, "\t");
1349    
1350     if (!scan_left_coord || !scan_right_coord || !orientation)
1351     {
1352     fprintf(stderr,"Error: malformed splice coordinate record in %s\n:%s\n",
1353     gtf_juncs.c_str(), buf);
1354     exit(1);
1355     }
1356    
1357     uint32_t ref_id = rt.get_id(ref_name, NULL, 0);
1358     uint32_t left_coord = atoi(scan_left_coord);
1359     uint32_t right_coord = atoi(scan_right_coord);
1360     bool antisense = *orientation == '-';
1361    
1362     // add 1 to left_coord to meet BED format
1363     gtf_junctions.insert(make_pair<Junction, JunctionStats>(Junction(ref_id, left_coord + 1, right_coord, antisense), JunctionStats()));
1364     }
1365     }
1366     fprintf(stderr, "Loaded %d GFF junctions from %s.\n", (int)(gtf_junctions.size()), gtf_juncs.c_str());
1367 gpertea 29 }
1368    
1369 gpertea 154 vector<uint64_t> read_ids;
1370     vector<vector<int64_t> > offsets;
1371     if (num_threads > 1)
1372     {
1373     vector<string> fnames;
1374     if (right_map_fname != "")
1375 gpertea 29 {
1376 gpertea 154 fnames.push_back(right_reads_fname);
1377     fnames.push_back(right_map_fname);
1378     }
1379     fnames.push_back(left_reads_fname);
1380     fnames.push_back(left_map_fname);
1381     bool enough_data = calculate_offsets(fnames, read_ids, offsets);
1382     if (!enough_data)
1383     num_threads = 1;
1384     }
1385 gpertea 29
1386 gpertea 154 JunctionSet vjunctions[num_threads];
1387     FusionSet vfusions[num_threads];
1388     vector<boost::thread*> threads;
1389     for (int i = 0; i < num_threads; ++i)
1390     {
1391     ConsensusEventsWorker worker;
1392 gpertea 29
1393 gpertea 154 worker.left_map_fname = left_map_fname;
1394     worker.right_map_fname = right_map_fname;
1395     worker.junctions = &vjunctions[i];
1396     worker.fusions = &vfusions[i];
1397     worker.gtf_junctions = &gtf_junctions;
1398     worker.rt = &rt;
1399 gpertea 29
1400 gpertea 154 worker.right_map_offset = 0;
1401 gpertea 166
1402 gpertea 154 if (i == 0)
1403     {
1404     worker.begin_id = 0;
1405     worker.left_map_offset = 0;
1406     }
1407     else
1408     {
1409     size_t offsets_size = offsets[i-1].size();
1410 gpertea 29
1411 gpertea 154 worker.begin_id = read_ids[i-1];
1412     worker.left_map_offset = offsets[i-1].back();
1413     if (offsets_size == 4)
1414     worker.right_map_offset = offsets[i-1][1];
1415     }
1416 gpertea 166
1417 gpertea 154 worker.end_id = (i+1 < num_threads) ? read_ids[i] : std::numeric_limits<uint64_t>::max();
1418    
1419     if (num_threads > 1)
1420     threads.push_back(new boost::thread(worker));
1421     else
1422     worker();
1423     }
1424    
1425     for (size_t i = 0; i < threads.size(); ++i)
1426     {
1427     threads[i]->join();
1428     delete threads[i];
1429     threads[i] = NULL;
1430     }
1431     threads.clear();
1432    
1433     JunctionSet junctions = vjunctions[0];
1434     FusionSet fusions = vfusions[0];
1435     for (size_t i = 1; i < num_threads; ++i)
1436     {
1437     merge_with(junctions, vjunctions[i]);
1438     vjunctions[i].clear();
1439    
1440     merge_with(fusions, vfusions[i]);
1441     vfusions[i].clear();
1442     }
1443    
1444     size_t num_unfiltered_juncs = junctions.size();
1445     fprintf(stderr, "Loaded %lu junctions\n", (long unsigned int) num_unfiltered_juncs);
1446    
1447     // Read hits, extract junctions, and toss the ones that arent strongly enough supported.
1448    
1449     filter_junctions(junctions, gtf_junctions);
1450 gpertea 166
1451 gpertea 154 //size_t num_juncs_after_filter = junctions.size();
1452     //fprintf(stderr, "Filtered %lu junctions\n",
1453     // num_unfiltered_juncs - num_juncs_after_filter);
1454    
1455     /*
1456 gpertea 29 size_t small_overhangs = 0;
1457     for (JunctionSet::iterator i = junctions.begin(); i != junctions.end(); ++i)
1458     {
1459     if (i->second.accepted &&
1460     (i->second.left_extent < min_anchor_len || i->second.right_extent < min_anchor_len))
1461     {
1462     small_overhangs++;
1463     }
1464     }
1465 gpertea 166
1466 gpertea 29 if (small_overhangs >0)
1467     fprintf(stderr, "Warning: %lu small overhang junctions!\n", (long unsigned int)small_overhangs);
1468 gpertea 135 */
1469 gpertea 154
1470     JunctionSet vfinal_junctions[num_threads];
1471     InsertionSet vfinal_insertions[num_threads];
1472     DeletionSet vfinal_deletions[num_threads];
1473     FusionSet vfinal_fusions[num_threads];
1474    
1475     for (int i = 0; i < num_threads; ++i)
1476 gpertea 29 {
1477 gpertea 154 ReportWorker worker;
1478 gpertea 135
1479 gpertea 154 char filename[1024] = {0};
1480     sprintf(filename, "%s%d.bam", bam_output_fname.c_str(), i);
1481     worker.bam_output_fname = filename;
1482     worker.sam_header_fname = sam_header;
1483     sprintf(filename, "%s/unmapped_left%d.fq", output_dir.c_str(), i);
1484     worker.left_um_fname = filename;
1485 gpertea 166
1486 gpertea 154 if (right_reads_fname != "")
1487     {
1488     sprintf(filename, "%s/unmapped_right%d.fq", output_dir.c_str(), i);
1489     worker.right_um_fname = filename;
1490     }
1491 gpertea 166
1492 gpertea 154 worker.left_reads_fname = left_reads_fname;
1493     worker.left_map_fname = left_map_fname;
1494     worker.right_reads_fname = right_reads_fname;
1495     worker.right_map_fname = right_map_fname;
1496 gpertea 135
1497 gpertea 154 worker.gtf_junctions = &gtf_junctions;
1498     worker.junctions = &junctions;
1499     worker.fusions = &fusions;
1500     worker.final_junctions = &vfinal_junctions[i];
1501     worker.final_insertions = &vfinal_insertions[i];
1502     worker.final_deletions = &vfinal_deletions[i];
1503     worker.final_fusions = &vfinal_fusions[i];
1504     worker.rt = &rt;
1505 gpertea 29
1506 gpertea 154 worker.right_reads_offset = 0;
1507     worker.right_map_offset = 0;
1508 gpertea 166
1509 gpertea 154 if (i == 0)
1510     {
1511     worker.begin_id = 0;
1512     worker.left_reads_offset = 0;
1513     worker.left_map_offset = 0;
1514     }
1515     else
1516     {
1517     size_t offsets_size = offsets[i-1].size();
1518 gpertea 166
1519 gpertea 154 worker.begin_id = read_ids[i-1];
1520     worker.left_reads_offset = offsets[i-1][offsets_size - 2];
1521     worker.left_map_offset = offsets[i-1].back();
1522     if (offsets_size == 4)
1523     {
1524     worker.right_reads_offset = offsets[i-1][0];
1525     worker.right_map_offset = offsets[i-1][1];
1526     }
1527     }
1528 gpertea 166
1529 gpertea 154 worker.end_id = (i+1 < num_threads) ? read_ids[i] : std::numeric_limits<uint64_t>::max();
1530 gpertea 29
1531 gpertea 154 if (num_threads > 1)
1532     threads.push_back(new boost::thread(worker));
1533     else
1534     worker();
1535     }
1536    
1537     for (size_t i = 0; i < threads.size(); ++i)
1538     {
1539     threads[i]->join();
1540     delete threads[i];
1541     threads[i] = NULL;
1542     }
1543     threads.clear();
1544    
1545     JunctionSet final_junctions = vfinal_junctions[0];
1546     InsertionSet final_insertions = vfinal_insertions[0];
1547     DeletionSet final_deletions = vfinal_deletions[0];
1548     FusionSet final_fusions = vfinal_fusions[0];
1549     for (size_t i = 1; i < num_threads; ++i)
1550     {
1551     merge_with(final_junctions, vfinal_junctions[i]);
1552     vfinal_junctions[i].clear();
1553    
1554     merge_with(final_insertions, vfinal_insertions[i]);
1555     vfinal_insertions[i].clear();
1556 gpertea 166
1557 gpertea 154 merge_with(final_deletions, vfinal_deletions[i]);
1558     vfinal_deletions[i].clear();
1559    
1560     merge_with(final_fusions, vfinal_fusions[i]);
1561     vfinal_fusions[i].clear();
1562     }
1563    
1564     fprintf (stderr, "Reporting final accepted alignments...");
1565     fprintf (stderr, "done.\n");
1566 gpertea 166
1567 gpertea 154 //small_overhangs = 0;
1568     for (JunctionSet::iterator i = final_junctions.begin(); i != final_junctions.end();)
1569     {
1570     if (i->second.supporting_hits == 0 || i->second.left_extent < 8 || i->second.right_extent < 8)
1571     {
1572     final_junctions.erase(i++);
1573     }
1574     else
1575     {
1576     ++i;
1577     }
1578     }
1579 gpertea 166
1580 gpertea 154 // if (small_overhangs > 0)
1581     // fprintf(stderr, "Warning: %lu small overhang junctions!\n", small_overhangs);
1582    
1583     fprintf (stderr, "Printing junction BED track...");
1584     print_junctions(junctions_out, final_junctions, rt);
1585     fprintf (stderr, "done\n");
1586 gpertea 166
1587 gpertea 154 fprintf (stderr, "Printing insertions...");
1588     print_insertions(insertions_out, final_insertions,rt);
1589     fclose(insertions_out);
1590     fprintf (stderr, "done\n");
1591 gpertea 166
1592 gpertea 154 fprintf (stderr, "Printing deletions...");
1593     print_deletions(deletions_out, final_deletions, rt);
1594     fclose(deletions_out);
1595     fprintf (stderr, "done\n");
1596    
1597     if (fusion_search)
1598     {
1599     fprintf (stderr, "Printing fusions...");
1600     print_fusions(fusions_out, final_fusions, rt);
1601     fclose(fusions_out);
1602     fprintf (stderr, "done\n");
1603     }
1604 gpertea 166
1605 gpertea 154 fprintf(stderr, "Found %lu junctions from happy spliced reads\n", (long unsigned int)final_junctions.size());
1606 gpertea 29 }
1607    
1608     void print_usage()
1609     {
1610     fprintf(stderr, "Usage: tophat_reports <junctions.bed> <insertions.vcf> <deletions.vcf> <accepted_hits.sam> <left_map1,...,left_mapN> <left_reads.fq> [right_map1,...,right_mapN] [right_reads.fq]\n");
1611 gpertea 166
1612 gpertea 29 // fprintf(stderr, "Usage: tophat_reports <coverage.wig> <junctions.bed> <accepted_hits.sam> <map1.bwtout> [splice_map1.sbwtout]\n");
1613     }
1614    
1615     int main(int argc, char** argv)
1616     {
1617 gpertea 154 fprintf(stderr, "tophat_reports v%s (%s)\n", PACKAGE_VERSION, SVN_REVISION);
1618     fprintf(stderr, "---------------------------------------\n");
1619 gpertea 166
1620 gpertea 154 reads_format = FASTQ;
1621 gpertea 166
1622 gpertea 154 int parse_ret = parse_options(argc, argv, print_usage);
1623     if (parse_ret)
1624     return parse_ret;
1625 gpertea 166
1626 gpertea 154 if(optind >= argc)
1627     {
1628     print_usage();
1629     return 1;
1630     }
1631 gpertea 166
1632 gpertea 154 string ref_file_name = argv[optind++];
1633 gpertea 166
1634 gpertea 154 if(optind >= argc)
1635     {
1636     print_usage();
1637     return 1;
1638     }
1639 gpertea 166
1640 gpertea 154 string junctions_file_name = argv[optind++];
1641 gpertea 166
1642 gpertea 154 if(optind >= argc)
1643     {
1644     print_usage();
1645     return 1;
1646     }
1647 gpertea 166
1648 gpertea 154 string insertions_file_name = argv[optind++];
1649     if(optind >= argc)
1650     {
1651     print_usage();
1652     return 1;
1653     }
1654 gpertea 166
1655 gpertea 154 string deletions_file_name = argv[optind++];
1656     if(optind >= argc)
1657     {
1658     print_usage();
1659     return 1;
1660     }
1661 gpertea 166
1662 gpertea 154 string fusions_file_name = argv[optind++];
1663     if(optind >= argc)
1664     {
1665     print_usage();
1666     return 1;
1667     }
1668 gpertea 166
1669 gpertea 154 string accepted_hits_file_name = argv[optind++];
1670     if(optind >= argc)
1671     {
1672     print_usage();
1673     return 1;
1674     }
1675 gpertea 166
1676 gpertea 154 string left_map_filename = argv[optind++];
1677     if(optind >= argc)
1678     {
1679     print_usage();
1680     return 1;
1681     }
1682 gpertea 166
1683 gpertea 154 string left_reads_filename = argv[optind++];
1684     string unzcmd=getUnpackCmd(left_reads_filename, false);
1685 gpertea 166
1686 gpertea 154 string right_map_filename;
1687     string right_reads_filename;
1688 gpertea 166
1689 gpertea 154 if (optind < argc)
1690     {
1691     right_map_filename = argv[optind++];
1692     if(optind >= argc) {
1693     print_usage();
1694     return 1;
1695     }
1696     right_reads_filename=argv[optind++];
1697     }
1698 gpertea 135
1699 gpertea 154 ifstream ref_stream(ref_file_name.c_str(), ifstream::in);
1700     if (!ref_stream.good())
1701     {
1702     fprintf(stderr, "Error: cannot open %s for reading\n",
1703     ref_file_name.c_str());
1704     exit(1);
1705     }
1706 gpertea 166
1707 gpertea 154 FILE* junctions_file = fopen(junctions_file_name.c_str(), "w");
1708     if (junctions_file == NULL)
1709     {
1710     fprintf(stderr, "Error: cannot open BED file %s for writing\n",
1711     junctions_file_name.c_str());
1712     exit(1);
1713     }
1714 gpertea 166
1715 gpertea 154 FILE* insertions_file = fopen(insertions_file_name.c_str(), "w");
1716     if (insertions_file == NULL)
1717     {
1718     fprintf(stderr, "Error: cannot open VCF file %s for writing\n",
1719     insertions_file_name.c_str());
1720     exit(1);
1721     }
1722 gpertea 166
1723 gpertea 154 FILE* deletions_file = fopen(deletions_file_name.c_str(), "w");
1724     if (deletions_file == NULL)
1725     {
1726     fprintf(stderr, "Error: cannot open VCF file %s for writing\n",
1727     deletions_file_name.c_str());
1728     exit(1);
1729     }
1730 gpertea 166
1731 gpertea 154 FILE* fusions_file = NULL;
1732     if (fusion_search)
1733     {
1734     fusions_file = fopen(fusions_file_name.c_str(), "w");
1735     if (fusions_file == NULL)
1736 gpertea 29 {
1737 gpertea 154 fprintf(stderr, "Error: cannot open VCF file %s for writing\n",
1738     fusions_file_name.c_str());
1739     exit(1);
1740 gpertea 29 }
1741 gpertea 154 }
1742 gpertea 166
1743 gpertea 154 driver(accepted_hits_file_name,
1744     ref_stream,
1745     left_map_filename,
1746     left_reads_filename,
1747     right_map_filename,
1748     right_reads_filename,
1749     junctions_file,
1750     insertions_file,
1751     deletions_file,
1752     fusions_file);
1753 gpertea 166
1754 gpertea 154 return 0;
1755 gpertea 29 }