ViewVC Help
View File | Revision Log | Show Annotations | View Changeset | Root Listing
root/gclib/tophat_cpp/fix_map_ordering.cpp
Revision: 154
Committed: Tue Jan 24 02:29:21 2012 UTC (7 years, 9 months ago) by gpertea
File size: 10011 byte(s)
Log Message:
massive update with Daehwan's work

Line File contents
1 /*
2 * fix_map_ordering.cpp
3 * TopHat
4 *
5 * Created by Cole Trapnell on 2/28/09.
6 * Copyright 2009 Cole Trapnell. All rights reserved.
7 *
8 */
9
10
11 #include <queue>
12 #include <cstring>
13 #include <seqan/sequence.h>
14 #include <seqan/file.h>
15 #include <getopt.h>
16 #include "common.h"
17 #include "reads.h"
18 #include "bwt_map.h"
19
20 using namespace seqan;
21 using namespace std;
22
23 struct TabSplitLine {
24 //split a text line into an array of strings t[]
25 //holds a copy of the text line
26 int tcount;
27 char **t;
28 char *str; //text line, with \0s instead of tabs
29 int tcap;
30 TabSplitLine(const char* line) {
31 tcount=0;
32 t=NULL;
33 tcap=0;
34 str=NULL;
35 if (line==NULL) return;
36 str=strdup(line);
37 //Notes: * destructive operation for s (replaces every \t with \0)
38 // * user must free t when no longer needed
39 tcap=14;
40 t=(char**)malloc(tcap*sizeof(char*));
41 char prevch=0;
42 for (char* p = str; *p!=0 ;p++) {
43 if (*p=='\t') *p=0; //break the string here
44 if (prevch==0) { //field start
45 if (tcount==tcap) {
46 tcap+=4;
47 t = (char**)realloc(t,tcap*sizeof(char*));
48 }
49 t[tcount]=p;
50 tcount++;
51 } //field start
52 prevch=*p;
53 if (*p=='\n' || *p=='\r') {
54 *p=0;
55 break;
56 }
57 }//for each character on the line
58 }
59 ~TabSplitLine() {
60 if (str!=NULL) {
61 free(str);
62 free(t);
63 }
64 }
65 };
66
67 struct MapOrdering {
68 bool operator()(pair<uint64_t, TabSplitLine*>& lhs, pair<uint64_t, TabSplitLine*>& rhs)
69 {
70 uint64_t lhs_id = lhs.first;
71 uint64_t rhs_id = rhs.first;
72 return lhs_id > rhs_id;
73 }
74 };
75
76 // "AS:i" (alignment score) is considered.
77 struct BamMapOrdering {
78 bool operator()(pair<uint64_t, bam1_t*>& lhs, pair<uint64_t, bam1_t*>& rhs) {
79 uint64_t lhs_id = lhs.first;
80 uint64_t rhs_id = rhs.first;
81
82 if (lhs_id != rhs_id || !bowtie2)
83 return lhs_id > rhs_id;
84
85 assert (lhs.second->core.flag & BAM_FUNMAP == 0);
86 assert (rhs.second->core.flag & BAM_FUNMAP == 0);
87
88 int lhs_score, rhs_score;
89 lhs_score = rhs_score = numeric_limits<int>::min();
90 uint8_t* ptr = bam_aux_get(lhs.second, "AS");
91 if (ptr)
92 lhs_score = bam_aux2i(ptr);
93
94 ptr = bam_aux_get(rhs.second, "AS");
95 if (ptr)
96 rhs_score = bam_aux2i(ptr);
97
98 if (lhs_score != rhs_score)
99 return lhs_score < rhs_score;
100
101 return lhs.second->core.flag & BAM_FSECONDARY;
102 }
103 };
104
105 void writeSamLine(TabSplitLine& l, FILE* f) {
106 if (l.tcount<10) {
107 //fprintf(stderr, "Warning: skipping malformed SAM line %s\n",samline);
108 return;
109 }
110 int flag=atoi(l.t[1]); //FLAG
111 if ((flag & BAM_FUNMAP) != 0)
112 return;
113 fprintf(f, "%s", l.t[0]);
114 for (int i=1;i<l.tcount;i++)
115 fprintf(f,"\t%s",l.t[i]);
116 fprintf(f, "\n");
117 }
118
119 void writeSam2Bam(TabSplitLine& l, GBamWriter& wbam) {
120 //let SAM lines through, except for unmapped reads
121 if (l.tcount<10) {
122 //fprintf(stderr, "Warning: skipping malformed SAM line %s\n",samline);
123 return;
124 }
125 int flag=atoi(l.t[1]); //FLAG
126 if ((flag & BAM_FUNMAP) != 0)
127 return;
128 int gpos=isdigit(l.t[3][0]) ? atoi(l.t[3]) : 0;
129 int insert_size=atoi(l.t[8]); //TLEN
130 int mate_pos=atoi(l.t[7]);
131 int mapq=atoi(l.t[4]);
132 GBamRecord *brec=wbam.new_record(l.t[0], flag, l.t[2], gpos, mapq,
133 l.t[5], l.t[6], mate_pos, insert_size, l.t[9], l.t[10]);
134 //now parse and add aux data:
135 if (l.tcount>11) {
136 for (int i=11;i<l.tcount;i++) {
137 brec->add_aux(l.t[i]);
138 }//for each aux field
139 }
140 wbam.write(brec);
141 delete brec;
142 }
143
144 void driver_bam(string& fname, GBamWriter& bam_writer, GBamWriter* umbam) {
145 tamFile fh=sam_open(sam_header.c_str());
146 bam_header_t* header=sam_header_read(fh);
147 sam_close(fh);
148 priority_queue< pair<uint64_t,bam1_t*>,
149 vector<pair<uint64_t, bam1_t*> >,
150 BamMapOrdering > map_pq;
151
152 FILE* findex = NULL;
153 if (!index_outfile.empty()) {
154 findex = fopen(index_outfile.c_str(), "w");
155 if (findex == NULL)
156 err_die("Error: cannot create file %s\n", index_outfile.c_str());
157 }
158
159 tamFile fp=sam_open(fname.c_str());
160 bam1_t *b = bam_init1();
161 uint64_t last_id = 0;
162 uint64_t count = 0;
163 do {
164 bool new_record = (sam_read1(fp, header, b) >= 0);
165 if (new_record) {
166 char *qname = bam1_qname(b);
167 uint64_t qid=(uint64_t)atol(qname);
168 bam1_t* bamrec=bam_dup1(b);
169 map_pq.push(make_pair(qid, bamrec));
170 }
171
172 while (map_pq.size() > 1000000 || (!new_record && map_pq.size() > 0)) {
173 const pair<uint64_t, bam1_t*>& t = map_pq.top();
174 bam1_t* tb=t.second;
175
176 bool unmapped = (tb->core.flag & BAM_FUNMAP) != 0;
177 if (!unmapped) {
178 if (findex != NULL) {
179 if (count >= 1000 && t.first != last_id) {
180 int64_t offset = bam_writer.tell();
181 int block_offset = offset & 0xFFFF;
182
183 // daehwan - this is a bug in bgzf.c in samtools
184 // I'll report this bug with a temporary solution, soon.
185 if (block_offset <= 0xF000) {
186 fprintf(findex, "%lu\t%ld\n", t.first, offset);
187 count = 0;
188 }
189 }
190
191 ++count;
192 }
193
194 bam_writer.write(tb); //mapped
195 }
196
197 // In case of Bowtie2, some of the mapped reads against either transcriptome or genome
198 // may have low alignment scores due to gaps, in which case we will remap those.
199 // Later, we may have better alignments that usually involve splice junctions.
200 if (!unmapped) {
201 if (bowtie2 && t.first != last_id) {
202 uint8_t* ptr = bam_aux_get(tb, "AS");
203 if (ptr) {
204 int score = bam_aux2i(ptr);
205 if (score < bowtie2_min_score)
206 {
207 unmapped = true;
208 }
209 }
210 }
211
212 last_id = t.first;
213 }
214
215 if (unmapped) { //unmapped
216 if (umbam!=NULL)
217 umbam->write(tb);
218 }
219 bam_destroy1(tb);
220 map_pq.pop();
221 }
222 } while (map_pq.size() > 0); //while SAM records
223
224 bam_destroy1(b);
225 bam_header_destroy(header);
226
227 if (findex != NULL)
228 fclose(findex);
229 }
230
231 void driver_headerless(FILE* map_file, FILE* f_out)
232 {
233
234 char bwt_buf[4096];
235
236 priority_queue< pair<uint64_t,TabSplitLine*>,
237 vector<pair<uint64_t, TabSplitLine*> >,
238 MapOrdering > map_pq;
239
240 while (fgets(bwt_buf, sizeof(bwt_buf), map_file))
241 {
242 // Chomp the newline
243 char* nl = strrchr(bwt_buf, '\n');
244 if (nl) *nl = 0;
245 if (*bwt_buf == 0)
246 continue;
247 /*
248 const char* bwt_fmt_str = "%s %c %s %d %s %s %d %s";
249 static const int buf_size = 2048;
250 char qname[buf_size];
251 char flagstr[buf_size];
252 int bwtf_ret = 0;
253 char refname[buf_size];
254 unsigned int text_offset;
255 char orientation;
256 char sequence[buf_size];
257 char qualities[buf_size];
258 unsigned int other_occs;
259 char mismatches[buf_size];
260 memset(mismatches, 0, sizeof(mismatches));
261 // Get a new record from the tab-delimited Bowtie map
262 bwtf_ret = sscanf(bwt_buf,
263 bwt_fmt_str,
264 qname,
265 &orientation,
266 refname, // name of reference sequence
267 &text_offset,
268 sequence,
269 qualities,
270 &other_occs,
271 mismatches);
272
273 // If we didn't get enough fields, this record is bad, so skip it
274 if (bwtf_ret > 0 && bwtf_ret < 6)
275 {
276 //fprintf(stderr, "Warning: found malformed record, skipping\n");
277 continue;
278 }
279 */
280 TabSplitLine* l=new TabSplitLine(bwt_buf);
281 /* bwtf_ret = sscanf(bwt_buf, sam_fmt_str, qname, flagstr, refname);
282 if (qname[0]=='@' && qname[3]==0) { //header line, skip it
283 continue;
284 }
285 if (bwtf_ret>9 && bwtf_ret<3) {
286 continue; //skip this line
287 }
288 */
289 if (l->tcount<10 || l->t[0][0]=='@') {
290 delete l;
291 continue;
292 }
293 //char* hitline = strdup(bwt_buf);
294 uint64_t qid = (uint64_t)atol(l->t[0]);
295 map_pq.push(make_pair(qid, l));
296
297 if (map_pq.size() > 1000000)
298 {
299 const pair<uint64_t, TabSplitLine*>& t = map_pq.top();
300 writeSamLine(*t.second, f_out);
301 delete t.second;
302 map_pq.pop();
303 }
304 }
305
306 while (map_pq.size())
307 {
308 const pair<uint64_t, TabSplitLine*>& t = map_pq.top();
309 writeSamLine(*t.second, f_out);
310 delete t.second;
311 map_pq.pop();
312 }
313 }
314
315 void print_usage()
316 {
317 // fprintf(stderr, "Usage: fix_map_ordering <map.bwtout> [<reads.fa/.fq>]\n");
318 fprintf(stderr,
319 "Usage: \nfix_map_ordering [--sam-header <sam_header_file>] <in_SAM_file> <out_BAM_file> [<out_unmapped_BAM>]\n");
320 }
321
322 int main(int argc, char** argv)
323 {
324 int parse_ret = parse_options(argc, argv, print_usage);
325 if (parse_ret)
326 return parse_ret;
327
328 //if --sam_header option was given write BAM and expects (headerless) SAM input;
329 // else simply lets SAM lines through
330 if(optind >= argc)
331 {
332 print_usage();
333 return 1;
334 }
335 string map_file_name = argv[optind++];
336 string out_file_name("-");
337 string out_unmapped_fname;
338 if (optind<argc) {
339 out_file_name = argv[optind++];
340 }
341 if (optind<argc) {
342 out_unmapped_fname = argv[optind++];
343 }
344 //fix_map_ordering gets bowtie's SAM output at stdin, uncompressed
345 //if the SAM input is headerless and no unmapped files
346 if (sam_header.empty()) {
347 //write only mapped reads as SAM lines
348 FILE* f_out=NULL;
349 if (out_file_name=="-") f_out=stdout;
350 else {
351 if ((f_out=fopen(out_file_name.c_str(),"w"))==NULL)
352 err_die("Error: cannot create output file %s\n",
353 out_file_name.c_str());
354 }
355 FILE* f_in=NULL;
356 if (map_file_name=="-") f_in=stdin;
357 else {
358 if ((f_in=fopen(map_file_name.c_str(),"r"))==NULL)
359 err_die("Error: cannot open input file %s\n",
360 map_file_name.c_str());
361 }
362 driver_headerless(f_in, f_out);
363 }
364 else {
365 //BAM output, we have SAM header
366 GBamWriter bam_writer(out_file_name.c_str(),sam_header.c_str());
367 GBamWriter *unmapped_bam_writer=NULL;
368 if (!out_unmapped_fname.empty())
369 unmapped_bam_writer=new GBamWriter(out_unmapped_fname.c_str(),
370 sam_header.c_str());
371 driver_bam(map_file_name, bam_writer, unmapped_bam_writer);
372 delete unmapped_bam_writer;
373 }
374
375 return 0;
376 }