ViewVC Help
View File | Revision Log | Show Annotations | View Changeset | Root Listing
root/gclib/tophat_cpp/fix_map_ordering.cpp
Revision: 159
Committed: Mon Jan 30 22:38:08 2012 UTC (7 years, 8 months ago) by gpertea
File size: 8278 byte(s)
Log Message:
wip

Line File contents
1 /*
2 * fix_map_ordering.cpp
3 * TopHat
4 *
5 * Created by Cole Trapnell on 2/28/09.
6 * Copyright 2009 Cole Trapnell. All rights reserved.
7 *
8 */
9
10
11 #include <queue>
12 #include <cstring>
13 #include <seqan/sequence.h>
14 #include <seqan/file.h>
15 #include <getopt.h>
16 #include "common.h"
17 #include "reads.h"
18 #include "bwt_map.h"
19
20 using namespace seqan;
21 using namespace std;
22
23 struct TabSplitLine {
24 //split a text line into an array of strings t[]
25 //holds a copy of the text line
26 int tcount;
27 char **t;
28 char *str; //text line, with \0s instead of tabs
29 int tcap;
30 TabSplitLine(const char* line) {
31 tcount=0;
32 t=NULL;
33 tcap=0;
34 str=NULL;
35 if (line==NULL) return;
36 str=strdup(line);
37 //Notes: * destructive operation for s (replaces every \t with \0)
38 // * user must free t when no longer needed
39 tcap=14;
40 t=(char**)malloc(tcap*sizeof(char*));
41 char prevch=0;
42 for (char* p = str; *p!=0 ;p++) {
43 if (*p=='\t') *p=0; //break the string here
44 if (prevch==0) { //field start
45 if (tcount==tcap) {
46 tcap+=4;
47 t = (char**)realloc(t,tcap*sizeof(char*));
48 }
49 t[tcount]=p;
50 tcount++;
51 } //field start
52 prevch=*p;
53 if (*p=='\n' || *p=='\r') {
54 *p=0;
55 break;
56 }
57 }//for each character on the line
58 }
59 ~TabSplitLine() {
60 if (str!=NULL) {
61 free(str);
62 free(t);
63 }
64 }
65 };
66
67 struct MapOrdering {
68 bool operator()(pair<uint64_t, TabSplitLine*>& lhs, pair<uint64_t, TabSplitLine*>& rhs)
69 {
70 uint64_t lhs_id = lhs.first;
71 uint64_t rhs_id = rhs.first;
72 return lhs_id > rhs_id;
73 }
74 };
75
76 // "AS:i" (alignment score) is considered.
77 struct BamMapOrdering {
78 bool operator()(pair<uint64_t, bam1_t*>& lhs, pair<uint64_t, bam1_t*>& rhs) {
79 uint64_t lhs_id = lhs.first;
80 uint64_t rhs_id = rhs.first;
81
82 if (lhs_id != rhs_id || !bowtie2)
83 return lhs_id > rhs_id;
84
85 assert (lhs.second->core.flag & BAM_FUNMAP == 0);
86 assert (rhs.second->core.flag & BAM_FUNMAP == 0);
87
88 int lhs_score, rhs_score;
89 lhs_score = rhs_score = numeric_limits<int>::min();
90 uint8_t* ptr = bam_aux_get(lhs.second, "AS");
91 if (ptr)
92 lhs_score = bam_aux2i(ptr);
93
94 ptr = bam_aux_get(rhs.second, "AS");
95 if (ptr)
96 rhs_score = bam_aux2i(ptr);
97
98 if (lhs_score != rhs_score)
99 return lhs_score < rhs_score;
100
101 return lhs.second->core.flag & BAM_FSECONDARY;
102 }
103 };
104
105 void writeSamLine(TabSplitLine& l, FILE* f) {
106 if (l.tcount<10) {
107 //fprintf(stderr, "Warning: skipping malformed SAM line %s\n",samline);
108 return;
109 }
110 int flag=atoi(l.t[1]); //FLAG
111 if ((flag & BAM_FUNMAP) != 0)
112 return;
113 fprintf(f, "%s", l.t[0]);
114 for (int i=1;i<l.tcount;i++)
115 fprintf(f,"\t%s",l.t[i]);
116 fprintf(f, "\n");
117 }
118
119 void writeSam2Bam(TabSplitLine& l, GBamWriter& wbam) {
120 //let SAM lines through, except for unmapped reads
121 if (l.tcount<10) {
122 //fprintf(stderr, "Warning: skipping malformed SAM line %s\n",samline);
123 return;
124 }
125 int flag=atoi(l.t[1]); //FLAG
126 if ((flag & BAM_FUNMAP) != 0)
127 return;
128 int gpos=isdigit(l.t[3][0]) ? atoi(l.t[3]) : 0;
129 int insert_size=atoi(l.t[8]); //TLEN
130 int mate_pos=atoi(l.t[7]);
131 int mapq=atoi(l.t[4]);
132 GBamRecord *brec=wbam.new_record(l.t[0], flag, l.t[2], gpos, mapq,
133 l.t[5], l.t[6], mate_pos, insert_size, l.t[9], l.t[10]);
134 //now parse and add aux data:
135 if (l.tcount>11) {
136 for (int i=11;i<l.tcount;i++) {
137 brec->add_aux(l.t[i]);
138 }//for each aux field
139 }
140 wbam.write(brec);
141 delete brec;
142 }
143
144 void driver_bam(string& fname, GBamWriter& bam_writer, GBamWriter* umbam) {
145 tamFile fh=sam_open(sam_header.c_str());
146 bam_header_t* header=sam_header_read(fh);
147 sam_close(fh);
148 priority_queue< pair<uint64_t,bam1_t*>,
149 vector<pair<uint64_t, bam1_t*> >,
150 BamMapOrdering > map_pq;
151
152 tamFile fp=sam_open(fname.c_str());
153 bam1_t *b = bam_init1();
154 uint64_t last_id = 0;
155 do {
156 bool new_record = (sam_read1(fp, header, b) >= 0);
157 if (new_record) {
158 char *qname = bam1_qname(b);
159 uint64_t qid=(uint64_t)atol(qname);
160 bam1_t* bamrec=bam_dup1(b);
161 map_pq.push(make_pair(qid, bamrec));
162 }
163
164 while (map_pq.size() > 1000000 || (!new_record && map_pq.size() > 0)) {
165 const pair<uint64_t, bam1_t*>& t = map_pq.top();
166 bam1_t* tb=t.second;
167
168 bool unmapped = (tb->core.flag & BAM_FUNMAP) != 0;
169 if (!unmapped) {
170 // mapped reads, write with index
171 bam_writer.write(tb, t.first);
172
173 // In case of Bowtie2, some of the mapped reads against either transcriptome or genome
174 // may have low alignment scores due to gaps, in which case we will remap those.
175 // Later, we may have better alignments that usually involve splice junctions.
176 if (bowtie2 && t.first != last_id) {
177 uint8_t* ptr = bam_aux_get(tb, "AS");
178 if (ptr) {
179 int score = bam_aux2i(ptr);
180 if (score < bowtie2_min_score)
181 {
182 unmapped = true;
183 }
184 }
185 }
186 last_id = t.first;
187 } // mapped reads
188
189 if (unmapped) { //unmapped reads or those we might want to map later
190 if (umbam!=NULL)
191 umbam->write(tb);
192 }
193 bam_destroy1(tb);
194 map_pq.pop();
195 }
196 } while (map_pq.size() > 0); //while SAM records
197
198 bam_destroy1(b);
199 bam_header_destroy(header);
200
201 }
202
203 void driver_headerless(FILE* map_file, FILE* f_out)
204 {
205
206 char bwt_buf[4096];
207
208 priority_queue< pair<uint64_t,TabSplitLine*>,
209 vector<pair<uint64_t, TabSplitLine*> >,
210 MapOrdering > map_pq;
211
212 while (fgets(bwt_buf, sizeof(bwt_buf), map_file))
213 {
214 // Chomp the newline
215 char* nl = strrchr(bwt_buf, '\n');
216 if (nl) *nl = 0;
217 if (*bwt_buf == 0)
218 continue;
219 TabSplitLine* l=new TabSplitLine(bwt_buf);
220 if (l->tcount<10 || l->t[0][0]=='@') {
221 delete l;
222 continue;
223 }
224 //char* hitline = strdup(bwt_buf);
225 uint64_t qid = (uint64_t)atol(l->t[0]);
226 map_pq.push(make_pair(qid, l));
227
228 if (map_pq.size() > 1000000)
229 {
230 const pair<uint64_t, TabSplitLine*>& t = map_pq.top();
231 writeSamLine(*t.second, f_out);
232 delete t.second;
233 map_pq.pop();
234 }
235 }
236
237 while (map_pq.size())
238 {
239 const pair<uint64_t, TabSplitLine*>& t = map_pq.top();
240 writeSamLine(*t.second, f_out);
241 delete t.second;
242 map_pq.pop();
243 }
244 }
245
246 void print_usage()
247 {
248 // fprintf(stderr, "Usage: fix_map_ordering <map.bwtout> [<reads.fa/.fq>]\n");
249 fprintf(stderr,
250 "Usage: \nfix_map_ordering [--sam-header <sam_header_file>] <in_SAM_file> <out_BAM_file> [<out_unmapped_BAM>]\n");
251 }
252
253 int main(int argc, char** argv)
254 {
255 int parse_ret = parse_options(argc, argv, print_usage);
256 if (parse_ret)
257 return parse_ret;
258
259 //if --sam_header option was given write BAM and expects (headerless) SAM input;
260 // else simply lets SAM lines through
261 if(optind >= argc)
262 {
263 print_usage();
264 return 1;
265 }
266 string map_file_name = argv[optind++];
267 string out_file_name("-");
268 string out_unmapped_fname;
269 if (optind<argc) {
270 out_file_name = argv[optind++];
271 }
272 if (optind<argc) {
273 out_unmapped_fname = argv[optind++];
274 }
275 //fix_map_ordering gets bowtie's SAM output at stdin, uncompressed
276 //if the SAM input is headerless and no unmapped files
277 if (sam_header.empty()) {
278 //write only mapped reads as SAM lines
279 FILE* f_out=NULL;
280 if (out_file_name=="-") f_out=stdout;
281 else {
282 if ((f_out=fopen(out_file_name.c_str(),"w"))==NULL)
283 err_die("Error: cannot create output file %s\n",
284 out_file_name.c_str());
285 }
286 FILE* f_in=NULL;
287 if (map_file_name=="-") f_in=stdin;
288 else {
289 if ((f_in=fopen(map_file_name.c_str(),"r"))==NULL)
290 err_die("Error: cannot open input file %s\n",
291 map_file_name.c_str());
292 }
293 driver_headerless(f_in, f_out);
294 }
295 else {
296 //BAM output, we have SAM header
297 GBamWriter bam_writer(out_file_name.c_str(),sam_header.c_str(), index_outfile);
298 GBamWriter *unmapped_bam_writer=NULL;
299 if (!out_unmapped_fname.empty())
300 unmapped_bam_writer=new GBamWriter(out_unmapped_fname.c_str(),
301 sam_header.c_str());
302 driver_bam(map_file_name, bam_writer, unmapped_bam_writer);
303 delete unmapped_bam_writer;
304 }
305
306 return 0;
307 }