ViewVC Help
View File | Revision Log | Show Annotations | View Changeset | Root Listing
root/gclib/tophat_cpp/GTFToFasta.cpp
Revision: 165
Committed: Fri Feb 10 01:04:05 2012 UTC (8 years, 3 months ago) by gpertea
File size: 4885 byte(s)
Log Message:
sync back with Daehwan. Test bam_merge for memory leaks!

Line User Rev File contents
1 gpertea 165 //
2     // gtfToFasta.cpp
3     // TopHat
4     //
5     // Created by Harold Pimentel on 10/26/11.
6     //
7     #include "GTFToFasta.h"
8    
9    
10     std::string get_exonic_sequence(GffObj *p_trans,
11     FastaRecord *rec)
12     {
13     // TODO: Write me.
14    
15     // TODO: Should I check if the transcript name matches the record name?
16     // For now, leave this up to the caller.
17     GList<GffExon>& exon_list = p_trans->exons;
18     GffExon* cur_exon;
19     std::string exon_seq("");
20     size_t length;
21    
22     for (int i = 0; i < exon_list.Count(); ++i) {
23     cur_exon = exon_list.Get(i);
24     length = cur_exon->end - cur_exon->start + 1;
25     exon_seq += rec->seq_.substr(cur_exon->start - 1, length);
26     }
27    
28    
29     return exon_seq;
30     }
31    
32    
33     GTFToFasta::GTFToFasta(std::string gtf_fname, std::string genome_fname)
34     : genome_fhandle_(genome_fname.c_str(), false)
35     {
36     gtf_fname_ = gtf_fname;
37     gtf_fhandle_ = fopen(gtf_fname_.c_str(), "r");
38     if (gtf_fhandle_ == NULL)
39     {
40     std::cerr << "FATAL: Couldn't open annotation: " << gtf_fname_
41     << std::endl;
42     exit(1);
43     }
44     std::cout << "Reading the annotation file: " << gtf_fname_ << std::endl;
45     gtfReader_.init(gtf_fhandle_, true); //load recognizable transcript features only
46     gtfReader_.readAll();
47    
48     genome_fname_ = genome_fname;
49    
50     // Make a map from the GffObj
51     transcript_map();
52     }
53    
54     GTFToFasta::~GTFToFasta()
55     {
56     ContigTransMap::iterator it;
57    
58     for (it = contigTransMap_.begin(); it != contigTransMap_.end(); ++it) {
59     delete it->second;
60     }
61     }
62    
63     void GTFToFasta::make_transcriptome(std::string out_fname)
64     {
65     GffObj *p_trans;
66    
67     std::vector<size_t> *p_contig_vec;
68    
69     FastaReader fastaReader(genome_fname_);
70     FastaWriter fastaWriter(out_fname);
71    
72     FastaRecord *cur_contig;
73    
74     while (fastaReader.good()) {
75     cur_contig = new FastaRecord();
76     fastaReader.next(cur_contig);
77    
78     // If this contig isn't in the map, then there are no transcripts
79     // associated with it. Skip it.
80     if (contigTransMap_.find(cur_contig->id_) ==
81     contigTransMap_.end())
82     {
83     delete cur_contig;
84     continue;
85     }
86    
87     p_contig_vec = contigTransMap_[cur_contig->id_.c_str()];
88     std::string exon_seq;
89    
90     FastaRecord out_rec;
91     for (size_t i = 0; i < p_contig_vec->size(); ++i) {
92     size_t trans_idx = (*p_contig_vec)[i];
93     p_trans = gtfReader_.gflst.Get(trans_idx);
94     exon_seq = get_exonic_sequence(p_trans, cur_contig);
95     if (exon_seq.empty()) continue;
96     std::stringstream ss;
97     ss << trans_idx;
98     out_rec.id_ = ss.str();
99     ss.str(std::string()); //clear ss
100     ss << p_trans->getGSeqName() << ':' << p_trans->start << '-' << p_trans->end << ' ' << p_trans->getID();
101     //out_rec.desc_ = "";
102     out_rec.desc_ = ss.str();
103     out_rec.seq_ = exon_seq;
104     fastaWriter.write(&out_rec);
105     }
106     delete cur_contig;
107     }
108     }
109    
110     void GTFToFasta::transcript_map()
111     {
112     GffObj *p_gffObj;
113     const char *p_contig_name;
114     std::vector<size_t> *p_contig_vec;
115    
116     for (int i = 0; i < gtfReader_.gflst.Count(); ++i)
117     {
118     p_gffObj = gtfReader_.gflst.Get(i);
119     p_contig_name = p_gffObj->getRefName();
120     std::string contig_name(p_contig_name);
121    
122     // Check if the current contig exists in the map
123     // If it doesn't, add it
124     if (contigTransMap_.find(contig_name) == contigTransMap_.end())
125     {
126     p_contig_vec = new std::vector<size_t>;
127     contigTransMap_[contig_name] = p_contig_vec;
128     }
129     else
130     {
131     p_contig_vec = contigTransMap_[contig_name];
132     }
133    
134     p_contig_vec->push_back(i);
135     }
136     }
137    
138     void GTFToFasta::print_mapping()
139     {
140     std::ofstream out_file("out.names");
141     GffObj *p_gffObj;
142    
143     for (int i = 0; i < gtfReader_.gflst.Count(); ++i)
144     {
145     p_gffObj = gtfReader_.gflst.Get(i);
146     out_file << i << "\t" << p_gffObj->getID() << std::endl;
147     }
148    
149     out_file.close();
150     }
151    
152     void gtf2fasta_print_usage()
153     {
154     std::cerr << "Usage: gtf_to_fasta transcripts.gtf genome.fa out_file" << std::endl;
155     }
156    
157     int main(int argc, char *argv[])
158     {
159     int parse_ret = parse_options(argc, argv, gtf2fasta_print_usage);
160     if (parse_ret)
161     return parse_ret;
162    
163     if (optind >= argc)
164     {
165     gtf2fasta_print_usage();
166     return 1;
167     }
168    
169     std::string gtf_fname(argv[optind++]);
170     std::string genome_fname(argv[optind++]);
171     std::string out_fname(argv[optind++]);
172    
173     GTFToFasta gtfToFasta(gtf_fname, genome_fname);
174     gtfToFasta.make_transcriptome(out_fname);
175     //gtfToFasta.print_mapping();
176    
177     return 0;
178     }

Properties

Name Value
svn:executable *