ViewVC Help
View File | Revision Log | Show Annotations | View Changeset | Root Listing
root/gclib/tophat_cpp/GTFToFasta.cpp
Revision: 165
Committed: Fri Feb 10 01:04:05 2012 UTC (8 years, 9 months ago) by gpertea
File size: 4885 byte(s)
Log Message:
sync back with Daehwan. Test bam_merge for memory leaks!

Line File contents
1 //
2 // gtfToFasta.cpp
3 // TopHat
4 //
5 // Created by Harold Pimentel on 10/26/11.
6 //
7 #include "GTFToFasta.h"
8
9
10 std::string get_exonic_sequence(GffObj *p_trans,
11 FastaRecord *rec)
12 {
13 // TODO: Write me.
14
15 // TODO: Should I check if the transcript name matches the record name?
16 // For now, leave this up to the caller.
17 GList<GffExon>& exon_list = p_trans->exons;
18 GffExon* cur_exon;
19 std::string exon_seq("");
20 size_t length;
21
22 for (int i = 0; i < exon_list.Count(); ++i) {
23 cur_exon = exon_list.Get(i);
24 length = cur_exon->end - cur_exon->start + 1;
25 exon_seq += rec->seq_.substr(cur_exon->start - 1, length);
26 }
27
28
29 return exon_seq;
30 }
31
32
33 GTFToFasta::GTFToFasta(std::string gtf_fname, std::string genome_fname)
34 : genome_fhandle_(genome_fname.c_str(), false)
35 {
36 gtf_fname_ = gtf_fname;
37 gtf_fhandle_ = fopen(gtf_fname_.c_str(), "r");
38 if (gtf_fhandle_ == NULL)
39 {
40 std::cerr << "FATAL: Couldn't open annotation: " << gtf_fname_
41 << std::endl;
42 exit(1);
43 }
44 std::cout << "Reading the annotation file: " << gtf_fname_ << std::endl;
45 gtfReader_.init(gtf_fhandle_, true); //load recognizable transcript features only
46 gtfReader_.readAll();
47
48 genome_fname_ = genome_fname;
49
50 // Make a map from the GffObj
51 transcript_map();
52 }
53
54 GTFToFasta::~GTFToFasta()
55 {
56 ContigTransMap::iterator it;
57
58 for (it = contigTransMap_.begin(); it != contigTransMap_.end(); ++it) {
59 delete it->second;
60 }
61 }
62
63 void GTFToFasta::make_transcriptome(std::string out_fname)
64 {
65 GffObj *p_trans;
66
67 std::vector<size_t> *p_contig_vec;
68
69 FastaReader fastaReader(genome_fname_);
70 FastaWriter fastaWriter(out_fname);
71
72 FastaRecord *cur_contig;
73
74 while (fastaReader.good()) {
75 cur_contig = new FastaRecord();
76 fastaReader.next(cur_contig);
77
78 // If this contig isn't in the map, then there are no transcripts
79 // associated with it. Skip it.
80 if (contigTransMap_.find(cur_contig->id_) ==
81 contigTransMap_.end())
82 {
83 delete cur_contig;
84 continue;
85 }
86
87 p_contig_vec = contigTransMap_[cur_contig->id_.c_str()];
88 std::string exon_seq;
89
90 FastaRecord out_rec;
91 for (size_t i = 0; i < p_contig_vec->size(); ++i) {
92 size_t trans_idx = (*p_contig_vec)[i];
93 p_trans = gtfReader_.gflst.Get(trans_idx);
94 exon_seq = get_exonic_sequence(p_trans, cur_contig);
95 if (exon_seq.empty()) continue;
96 std::stringstream ss;
97 ss << trans_idx;
98 out_rec.id_ = ss.str();
99 ss.str(std::string()); //clear ss
100 ss << p_trans->getGSeqName() << ':' << p_trans->start << '-' << p_trans->end << ' ' << p_trans->getID();
101 //out_rec.desc_ = "";
102 out_rec.desc_ = ss.str();
103 out_rec.seq_ = exon_seq;
104 fastaWriter.write(&out_rec);
105 }
106 delete cur_contig;
107 }
108 }
109
110 void GTFToFasta::transcript_map()
111 {
112 GffObj *p_gffObj;
113 const char *p_contig_name;
114 std::vector<size_t> *p_contig_vec;
115
116 for (int i = 0; i < gtfReader_.gflst.Count(); ++i)
117 {
118 p_gffObj = gtfReader_.gflst.Get(i);
119 p_contig_name = p_gffObj->getRefName();
120 std::string contig_name(p_contig_name);
121
122 // Check if the current contig exists in the map
123 // If it doesn't, add it
124 if (contigTransMap_.find(contig_name) == contigTransMap_.end())
125 {
126 p_contig_vec = new std::vector<size_t>;
127 contigTransMap_[contig_name] = p_contig_vec;
128 }
129 else
130 {
131 p_contig_vec = contigTransMap_[contig_name];
132 }
133
134 p_contig_vec->push_back(i);
135 }
136 }
137
138 void GTFToFasta::print_mapping()
139 {
140 std::ofstream out_file("out.names");
141 GffObj *p_gffObj;
142
143 for (int i = 0; i < gtfReader_.gflst.Count(); ++i)
144 {
145 p_gffObj = gtfReader_.gflst.Get(i);
146 out_file << i << "\t" << p_gffObj->getID() << std::endl;
147 }
148
149 out_file.close();
150 }
151
152 void gtf2fasta_print_usage()
153 {
154 std::cerr << "Usage: gtf_to_fasta transcripts.gtf genome.fa out_file" << std::endl;
155 }
156
157 int main(int argc, char *argv[])
158 {
159 int parse_ret = parse_options(argc, argv, gtf2fasta_print_usage);
160 if (parse_ret)
161 return parse_ret;
162
163 if (optind >= argc)
164 {
165 gtf2fasta_print_usage();
166 return 1;
167 }
168
169 std::string gtf_fname(argv[optind++]);
170 std::string genome_fname(argv[optind++]);
171 std::string out_fname(argv[optind++]);
172
173 GTFToFasta gtfToFasta(gtf_fname, genome_fname);
174 gtfToFasta.make_transcriptome(out_fname);
175 //gtfToFasta.print_mapping();
176
177 return 0;
178 }

Properties

Name Value
svn:executable *