ViewVC Help
View File | Revision Log | Show Annotations | View Changeset | Root Listing
root/gclib/tophat_cpp/map2gtf.cpp
Revision: 165
Committed: Fri Feb 10 01:04:05 2012 UTC (8 years, 1 month ago) by gpertea
File size: 10212 byte(s)
Log Message:
sync back with Daehwan. Test bam_merge for memory leaks!

Line File contents
1 /*
2 * Author: Harold Pimentel
3 * Contact: http://cs.berkeley.edu/~pimentel
4 * Date: June 10, 2011
5 */
6 #include "map2gtf.h"
7 #include "tokenize.h"
8
9 void m2g_print_usage()
10 {
11 std::cerr << "Usage: map2gtf\tannotation.gtf "
12 << " alignments.bwtout out_file.sam" << std::endl;
13 }
14
15 Map2GTF::Map2GTF(std::string gtf_fname, std::string reads_fname) :
16 gtf_fname_(gtf_fname), reads_fname_(reads_fname), refSeqTable_(true)
17 {
18 gtf_fhandle_ = fopen(gtf_fname_.c_str(), "r");
19 if (gtf_fhandle_ == NULL)
20 {
21 std::cerr << "FATAL: Couldn't open annotation: " << gtf_fname_
22 << std::endl;
23 exit(1);
24 }
25 std::cout << "Reading the annotation file: " << gtf_fname_ << std::endl;
26 gtfReader_.init(gtf_fhandle_, true); //only recognizable transcripts will be loaded
27 gtfReader_.readAll();
28
29 std::cout << "Initializing the BAMHitFactory." << std::endl;
30 hitFactory_ = new BAMHitFactory(readTable_, refSeqTable_);
31 std::cout << "Done with init samfactory" << std::endl;
32
33 // read bam file
34 std::cout << "Initializing the HitStream." << std::endl;
35 hitStream_ = new HitStream(reads_fname, hitFactory_, false, true,
36 true, true, true, true);
37
38 std::cout << "Done with initializtion. " << std::endl;
39 }
40
41 Map2GTF::~Map2GTF()
42 {
43 std::cout << "map2gtf has completed. Cleaning up." << std::endl;
44 if (gtf_fhandle_ != NULL && fclose(gtf_fhandle_))
45 {
46 std::cerr << "Warning: Error closing annotation: " << gtf_fname_
47 << std::endl;
48 }
49
50 if (zpacker != "")
51 {
52 std::cout << "Closing the zpacker. " << std::endl;
53 reads_pipe_.close();
54 }
55 else
56 {
57 if (reads_fhandle_ != NULL && fclose(reads_fhandle_))
58 {
59 std::cerr << "Warning: Error closing reads: " << reads_fname_
60 << std::endl;
61 }
62 }
63
64 delete hitFactory_;
65 delete hitStream_;
66 std::cout << "Done. Thanks!" << std::endl;
67 }
68
69 void Map2GTF::convert_coords(std::string out_fname, std::string sam_header)
70 {
71 // FIXME: come up with a more efficient layout for doing this
72 GBamWriter bam_writer(out_fname.c_str(), sam_header.c_str());
73
74 std::vector<TranscriptomeHit> read_list;
75
76 GffObj* p_trans = NULL;
77
78 HitsForRead hit_group;
79 char read_name[MAX_READ_NAME_LEN]; // FIXME: use the macro size
80
81 std::vector<TranscriptomeHit>::iterator bh_it;
82 std::vector<TranscriptomeHit>::iterator bh_unique_it;
83 BowtieHit bwt_hit;
84 // a hit group is a set of reads with the same name
85 while (hitStream_->next_read_hits(hit_group))
86 {
87 TranscriptomeHit primary_hit;
88 for (size_t i = 0; i < hit_group.hits.size(); ++i)
89 {
90 // TODO: check if read is unmapped.
91 // if so, figure out what to do with it
92 bwt_hit = hit_group.hits[i];
93 sprintf(read_name, "%u", bwt_hit.insert_id());
94
95 size_t trans_idx = atoi(refSeqTable_.get_name(bwt_hit.ref_id()));
96 p_trans = gtfReader_.gflst.Get(trans_idx);
97 TranscriptomeHit converted_out(p_trans);
98 trans_to_genomic_coords(read_name, hitFactory_, bwt_hit,
99 converted_out);
100
101 vector<string> aux_fields;
102 tokenize(bwt_hit.hitfile_rec().c_str(), "\t", aux_fields);
103 for (size_t j = 11; j < aux_fields.size(); ++j)
104 {
105 const string& aux = aux_fields[j];
106 if (aux.substr(0, 2) == "XS" || aux.substr(0,2) == "NM")
107 continue;
108
109 converted_out.aux_fields.push_back(aux);
110 }
111
112 string strand = "XS:A:";
113 strand.push_back(converted_out.trans->strand);
114 converted_out.aux_fields.push_back(strand);
115
116 // since Bowtie2 outputs quality values only for the primary hit,
117 // we need to make sure that the primary hit is output.
118 if (bowtie2 && bwt_hit.qual().length() > 0 && bwt_hit.qual()[0] != ' ')
119 primary_hit = converted_out;
120 else
121 read_list.push_back(converted_out);
122 }
123
124 // XXX: Fine for now... should come up with a more efficient way though
125 // FIXME: Take frag length into consideration when filtering
126 std::sort(read_list.begin(), read_list.end());
127 bh_unique_it = std::unique(read_list.begin(), read_list.end());
128
129 if (bowtie2)
130 print_bamhit(bam_writer, read_name, primary_hit.hit, primary_hit.trans->getRefName(), primary_hit.trans->getRefName(),
131 primary_hit.hit.seq().c_str(), primary_hit.hit.qual().c_str(), true, &(primary_hit.aux_fields));
132
133
134 for (bh_it = read_list.begin(); bh_it != bh_unique_it; ++bh_it)
135 {
136 if (bowtie2 && primary_hit.hit == bh_it->hit)
137 continue;
138
139 print_bamhit(bam_writer, read_name, bh_it->hit, bh_it->trans->getRefName(), bh_it->trans->getRefName(),
140 bh_it->hit.seq().c_str(), bh_it->hit.qual().c_str(), true, &(bh_it->aux_fields));
141 }
142 read_list.clear();
143 }
144
145 // fclose(out_sam);
146 }
147
148 void trans_to_genomic_coords(const char* read_name, HitFactory* hitFactory,
149 const BowtieHit& in, TranscriptomeHit& out)
150 //out.trans must already have the corresponding GffObj*
151 {
152 std::string read_id(read_name);
153 std::string ref_name(out.trans->getRefName());
154 bool spliced = false;
155 std::vector<CigarOp> cig_list;
156
157 // read start in genomic coords
158 size_t read_start = 0;
159
160 GList<GffExon>& exon_list = out.trans->exons;
161 GffExon* cur_exon;
162 GffExon* next_exon;
163 size_t cur_pos;
164 size_t match_length;
165 size_t miss_length;
166 size_t cur_intron_len = 0;
167 int i = 0;
168
169 // TODO: Check this return value
170 bool ret_val = get_read_start(&exon_list, in.left(), read_start, i);
171 cur_pos = read_start;
172
173 const std::vector<CigarOp>& cigars = in.cigar();
174 for (size_t c = 0; c < cigars.size(); ++c)
175 {
176 const CigarOp& cigar = cigars[c];
177
178 if (cigar.opcode == INS)
179 cig_list.push_back(cigar);
180
181 if (cigar.opcode != MATCH && cigar.opcode != DEL)
182 continue;
183
184 size_t remaining_length = cigar.length;
185 for (; i < exon_list.Count(); ++i)
186 {
187 cur_exon = exon_list.Get(i);
188 if (cur_pos >= cur_exon->start &&
189 cur_pos + remaining_length - 1 <= cur_exon->end) // read ends in this exon
190 {
191 CigarOp cig(cigar.opcode, remaining_length);
192 cig_list.push_back(cig);
193
194 cur_pos += remaining_length;
195 break;
196 }
197
198 // shouldn't need the check... can switch to a regular "else"
199 else if (cur_pos >= cur_exon->start &&
200 cur_pos + remaining_length - 1 > cur_exon->end)// read is spliced and overlaps this exon
201 {
202 spliced = true;
203 // XXX: This should _never_ go out of range.
204 // get the max length that fits in this exon, go to next exon
205 // cur_pos should be the next exon start
206 // set assertion to check this
207
208 // TODO: check this
209 match_length = cur_exon->end - cur_pos + 1;
210 if (match_length > 0)
211 {
212 CigarOp cig(cigar.opcode, match_length);
213 cig_list.push_back(cig);
214 }
215
216 // XXX: DEBUG
217 if (i + 1 >= exon_list.Count())
218 {
219 std::cerr << "trying to access: " << i + 1 << " when size is: "
220 << exon_list.Count() << std::endl;
221 print_trans(out.trans, in, remaining_length, match_length, cur_pos,
222 read_start);
223 exit(1);
224 }
225
226 else
227 next_exon = exon_list.Get(i + 1);
228
229 // and this
230 miss_length = next_exon->start - cur_exon->end - 1;
231 cur_intron_len += miss_length;
232 CigarOp miss_cig(REF_SKIP, miss_length);
233 cig_list.push_back(miss_cig);
234
235 cur_pos += match_length + miss_length;
236 remaining_length -= match_length;
237 assert(cur_pos == next_exon->start);
238 }
239 }
240 }
241
242 bool antisense_splice = (spliced && out.trans->strand=='-'); //transcript strand <=> splice strand (if spliced)
243 read_start -= 1; // handle the off-by-one problem
244 out.hit = hitFactory->create_hit(read_id, ref_name, "",
245 static_cast<int> (read_start), cig_list, in.antisense_align(),
246 antisense_splice, in.edit_dist(), in.splice_mms(), in.end());
247 out.hit.seq(in.seq());
248 out.hit.qual(in.qual());
249 }
250
251 void print_trans(GffObj* trans, const BowtieHit& in, size_t rem_len,
252 size_t match_len, size_t cur_pos, size_t start_pos)
253 {
254 GffExon* p_exon;
255 std::cerr << "\tCur_pos: " << cur_pos << " remaining: " << rem_len
256 << " match_len: " << match_len << std::endl;
257 std::cerr << "\tTranscript:\t" << trans->start << "\t" << trans->end
258 << std::endl;
259 for (int i = 0; i < trans->exons.Count(); ++i)
260 {
261 p_exon = trans->exons.Get(i);
262 std::cerr << "\t\t" << p_exon->start << "\t" << p_exon->end
263 << std::endl;
264 }
265 std::cerr << std::endl;
266
267 std::cerr << "Read_id: " << in.insert_id() << std::endl;
268 std::cerr << "\tgff_start: " << in.left() << "\tgenome_start: "
269 << start_pos << std::endl;
270 }
271
272 // Returns false if not in this exon list
273 bool get_read_start(GList<GffExon>* exon_list, size_t gtf_start,
274 size_t& genome_start, int& exon_idx)
275 {
276 GffExon* cur_exon;
277 size_t cur_intron_dist = 0;
278 size_t trans_start = exon_list->First()->start;
279 int trans_offset = 0;
280 for (int i = 0; i < exon_list->Count(); ++i)
281 {
282 cur_exon = exon_list->Get(i);
283 trans_offset = trans_start + cur_intron_dist;
284
285 if (gtf_start >= cur_exon->start - trans_offset && gtf_start
286 <= cur_exon->end - trans_offset)
287 {
288 genome_start = gtf_start + trans_start + cur_intron_dist;
289 exon_idx = i;
290 return true;
291 }
292 else
293 {
294 if (i + 1 < exon_list->Count())
295 cur_intron_dist += exon_list->Get(i + 1)->start - cur_exon->end
296 - 1;
297 else
298 return false;
299 }
300 }
301
302 return false;
303 }
304
305 int main(int argc, char *argv[])
306 {
307 int parse_ret = parse_options(argc, argv, m2g_print_usage);
308 if (parse_ret)
309 return parse_ret;
310
311 if (optind >= argc)
312 {
313 m2g_print_usage();
314 return 1;
315 }
316
317 std::string gtf_file(argv[optind++]);
318 std::string reads_file(argv[optind++]);
319 std::string out_fname(argv[optind++]);
320
321 if (gtf_file == "" || reads_file == "" || out_fname == "")
322 {
323 m2g_print_usage();
324 exit(1);
325 }
326
327 Map2GTF gtfMapper(gtf_file, reads_file);
328 // gtfMapper.convert_coords(out_fname);
329 gtfMapper.convert_coords(out_fname, sam_header);
330
331 return 0;
332 }