ViewVC Help
View File | Revision Log | Show Annotations | View Changeset | Root Listing
root/gclib/tophat_cpp/gtf_juncs.cpp
Revision: 154
Committed: Tue Jan 24 02:29:21 2012 UTC (7 years, 8 months ago) by gpertea
File size: 3574 byte(s)
Log Message:
massive update with Daehwan's work

Line File contents
1 /*
2 * gff_juncs.cpp
3 * TopHat
4 *
5 * Created by Cole Trapnell on 1/15/09.
6 * Copyright 2009 Cole Trapnell. All rights reserved.
7 *
8 */
9
10 #ifdef HAVE_CONFIG_H
11 #include <config.h>
12 #else
13 #define PACKAGE_VERSION "INTERNAL"
14 #define SVN_REVISION "XXX"
15 #endif
16
17 #include <getopt.h>
18 #include <string>
19 #include <cstdio>
20 #include <set>
21 #include "gff.h"
22
23 #include "common.h"
24 #include "bwt_map.h"
25
26 using namespace std;
27
28
29 void print_usage()
30 {
31 fprintf(stderr, "Usage: gtf_juncs <transcripts.gtf>\n");
32 }
33
34 void read_transcripts(FILE* f, GffReader& gffr) {
35 //assume gffr was just created but not initialized
36 gffr.init(f, true, true); //(gffile, mRNA-only, sortByLoc)
37 gffr.showWarnings(verbose);
38 //(keepAttr, mergeCloseExons, noExonAttr)
39 gffr.readAll(false, true, true);
40 //now all parsed GffObjs are in gffr.gflst, grouped by genomic sequence
41 }
42
43 uint32_t get_junctions_from_gff(FILE* ref_mRNA_file,
44 RefSequenceTable& rt)
45 {
46 GffReader gff_reader(ref_mRNA_file, true); //only recognizable transcript features, sort them by locus
47 if (ref_mRNA_file)
48 {
49 read_transcripts(ref_mRNA_file, gff_reader);
50 }
51
52 set<pair<string, pair<int, int> > > uniq_juncs;
53
54 //if any ref data was loaded
55 int last_gseqid=-1;
56 const char* gseqname=NULL;
57 for (int i=0;i<gff_reader.gflst.Count();i++) {
58 //ref data is grouped by genomic sequence
59 GffObj& rna = *(gff_reader.gflst[i]);
60 uint tlen=rna.len();
61 if (rna.hasErrors() || (tlen+500>GFF_MAX_LOCUS)) {
62 //if (verbose)
63 GMessage("Warning: transcript %s discarded (structural errors found, length=%d).\n", rna.getID(), tlen);
64 continue;
65 }
66 if (rna.isDiscarded()) {
67 //discarded generic "gene" or "locus" features with no other detailed subfeatures
68 continue;
69 }
70 if (rna.exons.Count()==0) {
71 //if (verbose)
72 // GMessage("Warning: %s %s found without exon segments (adding default exon).\n",rna.getFeatureName(), rna.getID());
73 rna.addExon(rna.start,rna.end);
74 }
75
76 if (rna.gseq_id!=last_gseqid) {
77 gseqname=rna.getGSeqName();
78 rt.get_id(gseqname, NULL, 0);
79 last_gseqid=rna.gseq_id;
80 }
81 for (int e = 1; e < rna.exons.Count(); ++e) {
82 GffExon& ex = *(rna.exons[e]);
83 GffExon& prex = *(rna.exons[e-1]);
84 fprintf(stdout, "%s\t%d\t%d\t%c\n",
85 gseqname,
86 prex.end-1, ex.start-1, rna.strand);
87 uniq_juncs.insert(make_pair(gseqname, make_pair(prex.end - 1, ex.start - 1)));
88 }
89 } //for each loaded GFF record
90 return uniq_juncs.size();
91 }
92
93 int main(int argc, char** argv)
94 {
95 int parse_ret = parse_options(argc, argv, print_usage);
96 if (parse_ret)
97 return parse_ret;
98
99
100 if(optind >= argc)
101 {
102 print_usage();
103 return 2;
104 }
105
106 string gtf_filename = argv[optind++];
107
108 //GFF_database gff_db;
109 if (gtf_filename == "")
110 {
111 print_usage();
112 exit(2);
113 }
114
115 FILE* ref_gtf = fopen(gtf_filename.c_str(), "r");
116 if (!ref_gtf)
117 {
118 fprintf (stderr, "Error: could not open GTF file %s for reading\n", gtf_filename.c_str());
119 exit(1);
120 }
121
122 fprintf(stderr, "gtf_juncs v%s (%s)\n", PACKAGE_VERSION, SVN_REVISION);
123 fprintf(stderr, "---------------------------\n");
124
125 //gff_db.from_file(gff_filename);
126 // gff_db.sort_entries();
127 //
128
129 RefSequenceTable rt(true);
130 uint32_t num_juncs_reported = get_junctions_from_gff(ref_gtf, rt);
131
132
133 //uint32_t num_juncs_reported = 0;
134
135 fprintf(stderr, "Extracted %u junctions from %s\n",
136 num_juncs_reported, gtf_filename.c_str());
137 if (!num_juncs_reported)
138 return 1;
139 return 0;
140 }