ViewVC Help
View File | Revision Log | Show Annotations | View Changeset | Root Listing
root/gclib/scripts/gff3_simplify.pl
Revision: 24
Committed: Tue Jul 26 21:46:39 2011 UTC (8 years, 2 months ago) by gpertea
File size: 3037 byte(s)
Log Message:
Line User Rev File contents
1 gpertea 23 #!/usr/bin/env perl
2     #
3     #converts newer,complex GFF3 file (e.g. from TAIR)
4     # to the simpler format expected by Cufflinks/Tophat
5     #
6     # Usage:
7     # gff3_simplify.pl < input.gff3 > output.gff3
8    
9     #http://www.sequenceontology.org/resources/gff3.html
10     # Unsupported features by Cufflinks/Tophat GFF3 parser:
11     # * multiple parents (e.g. exons shared by multiple isoforms)
12     # * gene name can only be found in the parent "gene" feature
13     # * exon and CDS can be parented directly by a "gene" feature
14     # * spliced non-coding transcripts the parent feature is "noncoding_transcript"
15     # with child "exon" features (this is actually supported)
16    
17     #corrections made:
18     # *create dummy RNA IDs for orphan CDS/exons or those owned directly by a gene feature
19     # *discard gene features but keep their name in the gene_name attribute of their transcripts
20     # *discard any non-transcript or redudant features (i.e. keep only ?RNA, exon and CDS features)
21     # *copy shared exons/CDS to their parent transcripts as needed (to avoid multiple parents for a feature)
22    
23    
24     use strict;
25     my %genes; # id => name
26     my %rnas;
27     # t_id=>[gseq, src, "ftype fstart fend score strand phase", [exons], [cds], name, gene_name]
28     # 0 1 2 3 4 5 6
29     my @tlist;
30     my $tdummy; #dummy transcripts counter
31     while (<>) {
32     chomp;
33     my ($ctg, $src, $f, $fstart, $fend, $score, $strand, $phase, $attrs)=split(/\t/);
34     next unless $attrs;
35     my ($id)=($attrs=~m/ID=([^;]+)/);
36     my ($name)=($attrs=~m/Name=([^;]+)/);
37     if ($f eq 'gene') {
38     my $gname=$name || $id;
39     $genes{$id}=[$gname, join("\t", $fstart, $fend, $score, $strand, $phase)];
40     next;
41     }
42     my ($parent)=($attrs=~m/Parent=([^;]+)/);
43     if ($f=~m/RNA|transcript/) {
44     my $gname=$genes{$parent}->[0] if exists($genes{$parent});
45     $rnas{$id}=[$ctg, $src, join("\t", $f, $fstart, $fend, $score, $strand, $phase),
46     [], [], $name, $gname];
47     push(@tlist, $id);
48     next;
49     }
50     next unless $f eq 'exon' || $f eq 'CDS';
51     my @parents=split(/\,/,$parent);
52     my $x= ($f eq 'CDS') ? 4 : 3;
53     foreach my $p (@parents) {
54     my $td=$rnas{$p};
55     if ($td) {
56     push(@{$$td[$x]}, join("\t",$fstart, $fend, $score, $strand, $phase));
57     }
58     elsif (exists($genes{$p})) {
59     my $grna=$ctg.'|'.$genes{$p}->[0].'|RNA';
60     if (!exists($rnas{$grna})) {
61     $rnas{$grna}=[$ctg, $src, $genes{$p}->[1], [], [], $genes{$p}->[0]];
62     push(@tlist, $grna);
63     }
64     push(@{$rnas{$grna}->[$x]}, join("\t",$fstart, $fend, $score, $strand, $phase));
65     }
66     }
67     }
68    
69     foreach my $tid (@tlist) {
70     my $td=$rnas{$tid};
71     my $tattrs="ID=$tid";
72     $tattrs.=";Name=".$$td[6].';gene_name='.$$td[6] if ($$td[6]);
73     print join("\t", @$td[0..2], $tattrs)."\n";
74     foreach my $xd (@{$$td[3]}) {
75     print join("\t", $$td[0], $$td[1], 'exon', $xd, "Parent=$tid")."\n";
76     }
77     foreach my $xd (@{$$td[4]}) {
78     print join("\t", $$td[0], $$td[1], 'CDS', $xd, "Parent=$tid")."\n";
79     }
80     }

Properties

Name Value
svn:executable *