ViewVC Help
View File | Revision Log | Show Annotations | View Changeset | Root Listing
root/gclib/tophat_cpp/fix_map_ordering.cpp
Revision: 167
Committed: Fri Feb 10 05:51:56 2012 UTC (8 years, 1 month ago) by gpertea
File size: 8167 byte(s)
Log Message:
bam_merge needs fixing!

Line File contents
1 /*
2 * fix_map_ordering.cpp
3 * TopHat
4 *
5 * Created by Cole Trapnell on 2/28/09.
6 * Copyright 2009 Cole Trapnell. All rights reserved.
7 *
8 */
9
10
11 #include <queue>
12 #include <cstring>
13 #include <seqan/sequence.h>
14 #include <seqan/file.h>
15 #include <getopt.h>
16 #include "common.h"
17 #include "reads.h"
18 #include "bwt_map.h"
19
20 using namespace seqan;
21 using namespace std;
22
23 struct TabSplitLine {
24 //split a text line into an array of strings t[]
25 //holds a copy of the text line
26 int tcount;
27 char **t;
28 char *str; //text line, with \0s instead of tabs
29 int tcap;
30 TabSplitLine(const char* line) {
31 tcount=0;
32 t=NULL;
33 tcap=0;
34 str=NULL;
35 if (line==NULL) return;
36 str=strdup(line);
37 //Notes: * destructive operation for s (replaces every \t with \0)
38 // * user must free t when no longer needed
39 tcap=14;
40 t=(char**)malloc(tcap*sizeof(char*));
41 char prevch=0;
42 for (char* p = str; *p!=0 ;p++) {
43 if (*p=='\t') *p=0; //break the string here
44 if (prevch==0) { //field start
45 if (tcount==tcap) {
46 tcap+=4;
47 t = (char**)realloc(t,tcap*sizeof(char*));
48 }
49 t[tcount]=p;
50 tcount++;
51 } //field start
52 prevch=*p;
53 if (*p=='\n' || *p=='\r') {
54 *p=0;
55 break;
56 }
57 }//for each character on the line
58 }
59 ~TabSplitLine() {
60 if (str!=NULL) {
61 free(str);
62 free(t);
63 }
64 }
65 };
66
67 struct MapOrdering {
68 bool operator()(pair<uint64_t, TabSplitLine*>& lhs, pair<uint64_t, TabSplitLine*>& rhs)
69 {
70 uint64_t lhs_id = lhs.first;
71 uint64_t rhs_id = rhs.first;
72 return lhs_id > rhs_id;
73 }
74 };
75
76 // "AS:i" (alignment score) is considered.
77 struct BamMapOrdering {
78 bool operator()(pair<uint64_t, bam1_t*>& lhs, pair<uint64_t, bam1_t*>& rhs) {
79 uint64_t lhs_id = lhs.first;
80 uint64_t rhs_id = rhs.first;
81
82 if (lhs_id != rhs_id || !bowtie2)
83 return lhs_id > rhs_id;
84
85 int lhs_score, rhs_score;
86 lhs_score = rhs_score = numeric_limits<int>::min();
87 uint8_t* ptr = bam_aux_get(lhs.second, "AS");
88 if (ptr)
89 lhs_score = bam_aux2i(ptr);
90
91 ptr = bam_aux_get(rhs.second, "AS");
92 if (ptr)
93 rhs_score = bam_aux2i(ptr);
94
95 if (lhs_score != rhs_score)
96 return lhs_score < rhs_score;
97
98 return lhs.second->core.flag & BAM_FSECONDARY;
99 }
100 };
101
102 void writeSamLine(TabSplitLine& l, FILE* f) {
103 if (l.tcount<10) {
104 //fprintf(stderr, "Warning: skipping malformed SAM line %s\n",samline);
105 return;
106 }
107 int flag=atoi(l.t[1]); //FLAG
108 if ((flag & BAM_FUNMAP) != 0)
109 return;
110 fprintf(f, "%s", l.t[0]);
111 for (int i=1;i<l.tcount;i++)
112 fprintf(f,"\t%s",l.t[i]);
113 fprintf(f, "\n");
114 }
115
116 void writeSam2Bam(TabSplitLine& l, GBamWriter& wbam) {
117 //let SAM lines through, except for unmapped reads
118 if (l.tcount<10) {
119 //fprintf(stderr, "Warning: skipping malformed SAM line %s\n",samline);
120 return;
121 }
122 int flag=atoi(l.t[1]); //FLAG
123 if ((flag & BAM_FUNMAP) != 0)
124 return;
125 int gpos=isdigit(l.t[3][0]) ? atoi(l.t[3]) : 0;
126 int insert_size=atoi(l.t[8]); //TLEN
127 int mate_pos=atoi(l.t[7]);
128 int mapq=atoi(l.t[4]);
129 GBamRecord *brec=wbam.new_record(l.t[0], flag, l.t[2], gpos, mapq,
130 l.t[5], l.t[6], mate_pos, insert_size, l.t[9], l.t[10]);
131 //now parse and add aux data:
132 if (l.tcount>11) {
133 for (int i=11;i<l.tcount;i++) {
134 brec->add_aux(l.t[i]);
135 }//for each aux field
136 }
137 wbam.write(brec);
138 delete brec;
139 }
140
141 void driver_bam(string& fname, GBamWriter& bam_writer, GBamWriter* umbam) {
142 tamFile fh=sam_open(sam_header.c_str());
143 bam_header_t* header=sam_header_read(fh);
144 sam_close(fh);
145 priority_queue< pair<uint64_t,bam1_t*>,
146 vector<pair<uint64_t, bam1_t*> >,
147 BamMapOrdering > map_pq;
148
149 tamFile fp=sam_open(fname.c_str());
150 bam1_t *b = bam_init1();
151 uint64_t last_id = 0;
152 do {
153 bool new_record = (sam_read1(fp, header, b) >= 0);
154 if (new_record) {
155 char *qname = bam1_qname(b);
156 uint64_t qid=(uint64_t)atol(qname);
157 bam1_t* bamrec=bam_dup1(b);
158 map_pq.push(make_pair(qid, bamrec));
159 }
160
161 while (map_pq.size() > 1000000 || (!new_record && map_pq.size() > 0)) {
162 const pair<uint64_t, bam1_t*>& t = map_pq.top();
163 bam1_t* tb=t.second;
164
165 bool unmapped = (tb->core.flag & BAM_FUNMAP) != 0;
166 if (!unmapped) {
167 // mapped reads, write with index
168 bam_writer.write(tb, t.first);
169
170 // In case of Bowtie2, some of the mapped reads against either transcriptome or genome
171 // may have low alignment scores due to gaps, in which case we will remap those.
172 // Later, we may have better alignments that usually involve splice junctions.
173 if (bowtie2 && t.first != last_id) {
174 uint8_t* ptr = bam_aux_get(tb, "AS");
175 if (ptr) {
176 int score = bam_aux2i(ptr);
177 if (score < bowtie2_min_score)
178 {
179 unmapped = true;
180 }
181 }
182 }
183 last_id = t.first;
184 } // mapped reads
185
186 if (unmapped) { //unmapped reads or those we might want to map later
187 if (umbam!=NULL)
188 umbam->write(tb);
189 }
190 bam_destroy1(tb);
191 map_pq.pop();
192 }
193 } while (map_pq.size() > 0); //while SAM records
194
195 bam_destroy1(b);
196 bam_header_destroy(header);
197
198 }
199
200 void driver_headerless(FILE* map_file, FILE* f_out)
201 {
202
203 char bwt_buf[4096];
204
205 priority_queue< pair<uint64_t,TabSplitLine*>,
206 vector<pair<uint64_t, TabSplitLine*> >,
207 MapOrdering > map_pq;
208
209 while (fgets(bwt_buf, sizeof(bwt_buf), map_file))
210 {
211 // Chomp the newline
212 char* nl = strrchr(bwt_buf, '\n');
213 if (nl) *nl = 0;
214 if (*bwt_buf == 0)
215 continue;
216 TabSplitLine* l=new TabSplitLine(bwt_buf);
217 if (l->tcount<10 || l->t[0][0]=='@') {
218 delete l;
219 continue;
220 }
221 //char* hitline = strdup(bwt_buf);
222 uint64_t qid = (uint64_t)atol(l->t[0]);
223 map_pq.push(make_pair(qid, l));
224
225 if (map_pq.size() > 1000000)
226 {
227 const pair<uint64_t, TabSplitLine*>& t = map_pq.top();
228 writeSamLine(*t.second, f_out);
229 delete t.second;
230 map_pq.pop();
231 }
232 }
233
234 while (map_pq.size())
235 {
236 const pair<uint64_t, TabSplitLine*>& t = map_pq.top();
237 writeSamLine(*t.second, f_out);
238 delete t.second;
239 map_pq.pop();
240 }
241 }
242
243 void print_usage()
244 {
245 // fprintf(stderr, "Usage: fix_map_ordering <map.bwtout> [<reads.fa/.fq>]\n");
246 fprintf(stderr,
247 "Usage: \nfix_map_ordering [--sam-header <sam_header_file>] <in_SAM_file> <out_BAM_file> [<out_unmapped_BAM>]\n");
248 }
249
250 int main(int argc, char** argv)
251 {
252 int parse_ret = parse_options(argc, argv, print_usage);
253 if (parse_ret)
254 return parse_ret;
255
256 //if --sam_header option was given write BAM and expects (headerless) SAM input;
257 // else simply lets SAM lines through
258 if(optind >= argc)
259 {
260 print_usage();
261 return 1;
262 }
263 string map_file_name = argv[optind++];
264 string out_file_name("-");
265 string out_unmapped_fname;
266 if (optind<argc) {
267 out_file_name = argv[optind++];
268 }
269 if (optind<argc) {
270 out_unmapped_fname = argv[optind++];
271 }
272 //fix_map_ordering gets bowtie's SAM output at stdin, uncompressed
273 //if the SAM input is headerless and no unmapped files
274 if (sam_header.empty()) {
275 //write only mapped reads as SAM lines
276 FILE* f_out=NULL;
277 if (out_file_name=="-") f_out=stdout;
278 else {
279 if ((f_out=fopen(out_file_name.c_str(),"w"))==NULL)
280 err_die("Error: cannot create output file %s\n",
281 out_file_name.c_str());
282 }
283 FILE* f_in=NULL;
284 if (map_file_name=="-") f_in=stdin;
285 else {
286 if ((f_in=fopen(map_file_name.c_str(),"r"))==NULL)
287 err_die("Error: cannot open input file %s\n",
288 map_file_name.c_str());
289 }
290 driver_headerless(f_in, f_out);
291 }
292 else {
293 //BAM output, we have SAM header
294 GBamWriter bam_writer(out_file_name.c_str(),sam_header.c_str(), index_outfile);
295 GBamWriter *unmapped_bam_writer=NULL;
296 if (!out_unmapped_fname.empty())
297 unmapped_bam_writer=new GBamWriter(out_unmapped_fname.c_str(),
298 sam_header.c_str());
299 driver_bam(map_file_name, bam_writer, unmapped_bam_writer);
300 delete unmapped_bam_writer;
301 }
302
303 return 0;
304 }