ViewVC Help
View File | Revision Log | Show Annotations | View Changeset | Root Listing
root/gclib/scripts/gff3_simplify.pl
Revision: 24
Committed: Tue Jul 26 21:46:39 2011 UTC (8 years, 1 month ago) by gpertea
File size: 3037 byte(s)
Log Message:
Line File contents
1 #!/usr/bin/env perl
2 #
3 #converts newer,complex GFF3 file (e.g. from TAIR)
4 # to the simpler format expected by Cufflinks/Tophat
5 #
6 # Usage:
7 # gff3_simplify.pl < input.gff3 > output.gff3
8
9 #http://www.sequenceontology.org/resources/gff3.html
10 # Unsupported features by Cufflinks/Tophat GFF3 parser:
11 # * multiple parents (e.g. exons shared by multiple isoforms)
12 # * gene name can only be found in the parent "gene" feature
13 # * exon and CDS can be parented directly by a "gene" feature
14 # * spliced non-coding transcripts the parent feature is "noncoding_transcript"
15 # with child "exon" features (this is actually supported)
16
17 #corrections made:
18 # *create dummy RNA IDs for orphan CDS/exons or those owned directly by a gene feature
19 # *discard gene features but keep their name in the gene_name attribute of their transcripts
20 # *discard any non-transcript or redudant features (i.e. keep only ?RNA, exon and CDS features)
21 # *copy shared exons/CDS to their parent transcripts as needed (to avoid multiple parents for a feature)
22
23
24 use strict;
25 my %genes; # id => name
26 my %rnas;
27 # t_id=>[gseq, src, "ftype fstart fend score strand phase", [exons], [cds], name, gene_name]
28 # 0 1 2 3 4 5 6
29 my @tlist;
30 my $tdummy; #dummy transcripts counter
31 while (<>) {
32 chomp;
33 my ($ctg, $src, $f, $fstart, $fend, $score, $strand, $phase, $attrs)=split(/\t/);
34 next unless $attrs;
35 my ($id)=($attrs=~m/ID=([^;]+)/);
36 my ($name)=($attrs=~m/Name=([^;]+)/);
37 if ($f eq 'gene') {
38 my $gname=$name || $id;
39 $genes{$id}=[$gname, join("\t", $fstart, $fend, $score, $strand, $phase)];
40 next;
41 }
42 my ($parent)=($attrs=~m/Parent=([^;]+)/);
43 if ($f=~m/RNA|transcript/) {
44 my $gname=$genes{$parent}->[0] if exists($genes{$parent});
45 $rnas{$id}=[$ctg, $src, join("\t", $f, $fstart, $fend, $score, $strand, $phase),
46 [], [], $name, $gname];
47 push(@tlist, $id);
48 next;
49 }
50 next unless $f eq 'exon' || $f eq 'CDS';
51 my @parents=split(/\,/,$parent);
52 my $x= ($f eq 'CDS') ? 4 : 3;
53 foreach my $p (@parents) {
54 my $td=$rnas{$p};
55 if ($td) {
56 push(@{$$td[$x]}, join("\t",$fstart, $fend, $score, $strand, $phase));
57 }
58 elsif (exists($genes{$p})) {
59 my $grna=$ctg.'|'.$genes{$p}->[0].'|RNA';
60 if (!exists($rnas{$grna})) {
61 $rnas{$grna}=[$ctg, $src, $genes{$p}->[1], [], [], $genes{$p}->[0]];
62 push(@tlist, $grna);
63 }
64 push(@{$rnas{$grna}->[$x]}, join("\t",$fstart, $fend, $score, $strand, $phase));
65 }
66 }
67 }
68
69 foreach my $tid (@tlist) {
70 my $td=$rnas{$tid};
71 my $tattrs="ID=$tid";
72 $tattrs.=";Name=".$$td[6].';gene_name='.$$td[6] if ($$td[6]);
73 print join("\t", @$td[0..2], $tattrs)."\n";
74 foreach my $xd (@{$$td[3]}) {
75 print join("\t", $$td[0], $$td[1], 'exon', $xd, "Parent=$tid")."\n";
76 }
77 foreach my $xd (@{$$td[4]}) {
78 print join("\t", $$td[0], $$td[1], 'CDS', $xd, "Parent=$tid")."\n";
79 }
80 }

Properties

Name Value
svn:executable *