ViewVC Help
View File | Revision Log | Show Annotations | View Changeset | Root Listing
root/gclib/tophat_cpp/gtf_juncs.cpp
Revision: 154
Committed: Tue Jan 24 02:29:21 2012 UTC (8 years, 4 months ago) by gpertea
File size: 3574 byte(s)
Log Message:
massive update with Daehwan's work

Line User Rev File contents
1 gpertea 29 /*
2     * gff_juncs.cpp
3     * TopHat
4     *
5     * Created by Cole Trapnell on 1/15/09.
6     * Copyright 2009 Cole Trapnell. All rights reserved.
7     *
8     */
9    
10     #ifdef HAVE_CONFIG_H
11     #include <config.h>
12     #else
13     #define PACKAGE_VERSION "INTERNAL"
14     #define SVN_REVISION "XXX"
15     #endif
16    
17     #include <getopt.h>
18     #include <string>
19     #include <cstdio>
20     #include <set>
21     #include "gff.h"
22    
23     #include "common.h"
24     #include "bwt_map.h"
25    
26     using namespace std;
27    
28    
29     void print_usage()
30     {
31     fprintf(stderr, "Usage: gtf_juncs <transcripts.gtf>\n");
32     }
33    
34     void read_transcripts(FILE* f, GffReader& gffr) {
35     //assume gffr was just created but not initialized
36     gffr.init(f, true, true); //(gffile, mRNA-only, sortByLoc)
37     gffr.showWarnings(verbose);
38     //(keepAttr, mergeCloseExons, noExonAttr)
39     gffr.readAll(false, true, true);
40     //now all parsed GffObjs are in gffr.gflst, grouped by genomic sequence
41     }
42    
43     uint32_t get_junctions_from_gff(FILE* ref_mRNA_file,
44     RefSequenceTable& rt)
45     {
46     GffReader gff_reader(ref_mRNA_file, true); //only recognizable transcript features, sort them by locus
47     if (ref_mRNA_file)
48     {
49     read_transcripts(ref_mRNA_file, gff_reader);
50     }
51    
52     set<pair<string, pair<int, int> > > uniq_juncs;
53    
54     //if any ref data was loaded
55     int last_gseqid=-1;
56     const char* gseqname=NULL;
57     for (int i=0;i<gff_reader.gflst.Count();i++) {
58     //ref data is grouped by genomic sequence
59     GffObj& rna = *(gff_reader.gflst[i]);
60     uint tlen=rna.len();
61     if (rna.hasErrors() || (tlen+500>GFF_MAX_LOCUS)) {
62     //if (verbose)
63     GMessage("Warning: transcript %s discarded (structural errors found, length=%d).\n", rna.getID(), tlen);
64     continue;
65     }
66     if (rna.isDiscarded()) {
67     //discarded generic "gene" or "locus" features with no other detailed subfeatures
68     continue;
69     }
70     if (rna.exons.Count()==0) {
71     //if (verbose)
72     // GMessage("Warning: %s %s found without exon segments (adding default exon).\n",rna.getFeatureName(), rna.getID());
73     rna.addExon(rna.start,rna.end);
74     }
75    
76     if (rna.gseq_id!=last_gseqid) {
77     gseqname=rna.getGSeqName();
78     rt.get_id(gseqname, NULL, 0);
79     last_gseqid=rna.gseq_id;
80     }
81     for (int e = 1; e < rna.exons.Count(); ++e) {
82     GffExon& ex = *(rna.exons[e]);
83     GffExon& prex = *(rna.exons[e-1]);
84     fprintf(stdout, "%s\t%d\t%d\t%c\n",
85     gseqname,
86     prex.end-1, ex.start-1, rna.strand);
87     uniq_juncs.insert(make_pair(gseqname, make_pair(prex.end - 1, ex.start - 1)));
88     }
89     } //for each loaded GFF record
90     return uniq_juncs.size();
91     }
92    
93     int main(int argc, char** argv)
94     {
95     int parse_ret = parse_options(argc, argv, print_usage);
96     if (parse_ret)
97     return parse_ret;
98    
99    
100     if(optind >= argc)
101     {
102     print_usage();
103     return 2;
104     }
105    
106     string gtf_filename = argv[optind++];
107    
108     //GFF_database gff_db;
109     if (gtf_filename == "")
110     {
111     print_usage();
112     exit(2);
113     }
114    
115     FILE* ref_gtf = fopen(gtf_filename.c_str(), "r");
116     if (!ref_gtf)
117     {
118     fprintf (stderr, "Error: could not open GTF file %s for reading\n", gtf_filename.c_str());
119     exit(1);
120     }
121    
122     fprintf(stderr, "gtf_juncs v%s (%s)\n", PACKAGE_VERSION, SVN_REVISION);
123     fprintf(stderr, "---------------------------\n");
124    
125     //gff_db.from_file(gff_filename);
126     // gff_db.sort_entries();
127     //
128    
129     RefSequenceTable rt(true);
130     uint32_t num_juncs_reported = get_junctions_from_gff(ref_gtf, rt);
131    
132    
133     //uint32_t num_juncs_reported = 0;
134    
135     fprintf(stderr, "Extracted %u junctions from %s\n",
136     num_juncs_reported, gtf_filename.c_str());
137     if (!num_juncs_reported)
138     return 1;
139     return 0;
140     }