ViewVC Help
View File | Revision Log | Show Annotations | View Changeset | Root Listing
root/gclib/tophat_cpp/bam_merge.cpp
Revision: 168
Committed: Fri Feb 10 16:40:38 2012 UTC (8 years, 2 months ago) by gpertea
File size: 5957 byte(s)
Log Message:
finally fixed bam_merge

Line File contents
1 #include <cstdlib>
2 #include <cstdio>
3 #include <cstring>
4
5 #include "common.h"
6 #include <queue>
7 #include <algorithm>
8
9 using namespace std;
10
11 #define USAGE "Usage: bam_merge <out.bam> <in1.bam> <in2.bam> [...]\n"
12
13 #define ERR_BAM_OPEN "Error: bam_merge failed to open BAM file %s\n"
14
15 void print_usage()
16 {
17 fprintf(stderr, USAGE);
18 }
19
20 vector <samfile_t*> srcfiles; //array of SAM file handles
21
22 class CBamLine {
23 public:
24 int fileno;
25 uint64_t read_id;
26 bam1_t* b;
27
28 CBamLine(int fno=-1, bam1_t* br=NULL) {
29 fileno=fno;
30 read_id=0;
31 b=br;
32 b_init();
33 }
34
35 void b_init() {
36 if (b) {
37 char *name = bam1_qname(b);
38 read_id=(uint64_t)atol(name);
39 if (read_id<1) {
40 char* samline=bam_format1(srcfiles[0]->header, b);
41 err_die("Error: invalid read Id (must be numeric) for BAM record:\n%s\n",
42 samline);
43 }
44 }
45 }
46
47 void b_free() {
48 if (b!=NULL) {
49 bam_destroy1(b);
50 b=NULL;
51 }
52 }
53 };
54
55 struct equal_bam {
56 bool operator() (const CBamLine& first, const CBamLine& second) const {
57 if (first.read_id != second.read_id)
58 return false;
59
60 if (first.b->core.tid != second.b->core.tid)
61 return false;
62
63 if (first.b->core.pos != second.b->core.pos)
64 return false;
65
66 if (first.b->core.n_cigar != second.b->core.n_cigar)
67 return false;
68
69 for (int i = 0; i < first.b->core.n_cigar; ++i){
70 if (bam1_cigar(first.b)[i] != bam1_cigar(second.b)[i])
71 return false;
72 }
73
74 return true;
75 }
76 };
77
78 struct less_bam {
79 bool rev_cmp; //reverse the comparison
80 less_bam(bool reverse_cmp=false) {
81 rev_cmp=reverse_cmp;
82 }
83 bool operator() (const CBamLine& f, const CBamLine& s) const {
84 const CBamLine* first = &f;
85 const CBamLine* second = &s;
86 if (rev_cmp) {
87 first=&s;
88 second=&f;
89 }
90 if (first->read_id != second->read_id)
91 return first->read_id < second->read_id;
92
93 if (first->b->core.tid != second->b->core.tid)
94 return first->b->core.tid < second->b->core.tid;
95
96 if (first->b->core.pos != second->b->core.pos)
97 return first->b->core.pos < second->b->core.pos;
98
99 if (first->b->core.n_cigar != second->b->core.n_cigar)
100 return first->b->core.n_cigar < second->b->core.n_cigar;
101
102 for (int i = 0; i < first->b->core.n_cigar; ++i){
103 if (bam1_cigar(first->b)[i] != bam1_cigar(second->b)[i])
104 return bam1_cigar(first->b)[i] < bam1_cigar(second->b)[i];
105 }
106
107 // prefer a record with XS attribute
108 char strand1 = 0, strand2 = 0;
109 uint8_t* ptr = bam_aux_get(first->b, "XS");
110 if (ptr) strand1 = bam_aux2A(ptr);
111
112 ptr = bam_aux_get(second->b, "XS");
113 if (ptr) strand2 = bam_aux2A(ptr);
114
115 if (strand1 == strand2)
116 return false;
117
118 if (strand1 == '+' || strand2 == 0)
119 return true;
120 else
121 return false;
122 }
123 };
124
125 void write_bam_lines(GBamWriter& bw, vector<CBamLine>& bam_lines) {
126 static size_t count = 0;
127 std::sort (bam_lines.begin(), bam_lines.end(), less_bam());
128
129 bool sense_strand = false, antisense_strand = false;
130 vector<CBamLine>::size_type i = 0;
131 for (; i < bam_lines.size(); ++i) {
132 bool do_write = true;
133 const CBamLine& bam_line = bam_lines[i];
134
135 uint8_t* ptr = bam_aux_get(bam_line.b, "XS");
136 char strand = 0;
137 if (ptr) strand = bam_aux2A(ptr);
138
139 if (i > 0) {
140 if (equal_bam()(bam_lines[i-1], bam_line)) {
141 if (strand == 0) {
142 do_write = false;
143 } else {
144 if (strand == '+' && sense_strand) do_write = false;
145 else sense_strand = true;
146
147 if (strand == '-' && antisense_strand) do_write = false;
148 else antisense_strand = true;
149 }
150 } else {
151 sense_strand = false;
152 antisense_strand = false;
153 }
154 }
155
156 if (strand == '+')
157 sense_strand = true;
158 else if (strand == '-')
159 antisense_strand = true;
160
161 if (do_write) {
162 //samwrite(fw, bam_line.b);
163 bw.write(bam_line.b, bam_line.read_id);
164 ++count;
165 }
166 }
167
168 for (i = 0; i < bam_lines.size(); ++i)
169 bam_lines[i].b_free();
170
171 bam_lines.clear();
172 }
173
174
175 //GList<CBamLine> lines(true,true,true); //sorted, free items, unique items
176
177 priority_queue< CBamLine , vector<CBamLine>, less_bam > lines(less_bam(true));
178
179 int main(int argc, char *argv[])
180 {
181 int parse_ret = parse_options(argc, argv, print_usage);
182 if (parse_ret)
183 return parse_ret;
184
185 char* outfname=NULL;
186 if (argc-optind<3) {
187 print_usage();
188 if (argc>1)
189 warn_msg("Error: only %d arguments given.\n", argc-1);
190 return -1;
191 }
192 outfname=argv[optind];
193 int num_src=argc-optind-1;
194 srcfiles.resize(num_src, NULL);
195 for (int i=optind+1;i<argc;i++) {
196 int fno=i-optind-1;
197 samfile_t* fp=samopen(argv[i], "rb", 0);
198 if (fp==0) {
199 warn_msg(ERR_BAM_OPEN, argv[i]);
200 return 1;
201 }
202 bam1_t *b = bam_init1();
203 if (samread(fp, b) > 0) {
204 srcfiles[fno]=fp;
205 CBamLine brec(fno, b);
206 lines.push(brec);
207 }
208 else { bam_destroy1(b); }
209 }
210 if (lines.size()==0) {
211 warn_msg("Warning: no input BAM records found.\n");
212 }
213
214 vector<CBamLine> bam_lines;
215 GBamWriter bamwriter(outfname, srcfiles[0]->header, index_outfile);
216
217 uint64_t last_id = 0;
218 while (lines.size()>0) {
219 CBamLine brec(lines.top()); //should have the smallest read_id
220 lines.pop();
221 assert (brec.fileno>=0 && brec.b!=NULL);
222 // we need to eliminate duplicate records, which can happen when using Bowtie2
223 // as we may map the same read against transcriptome and genome.
224 if (last_id != brec.read_id && bam_lines.size() > 0)
225 write_bam_lines(bamwriter, bam_lines);
226
227 last_id = brec.read_id;
228 bam_lines.push_back(brec);
229 //reuse brec
230 brec.b = bam_init1();
231 if (samread(srcfiles[brec.fileno], brec.b)>0) {
232 brec.b_init();
233 lines.push(brec);
234 }
235 else { //no more BAM records
236 brec.b_free();
237 }
238 }
239
240 if (bam_lines.size() > 0)
241 write_bam_lines(bamwriter, bam_lines);
242 for (int i=0;i<num_src;i++)
243 samclose(srcfiles[i]);
244 srcfiles.clear();
245 return 0;
246 }