ViewVC Help
View File | Revision Log | Show Annotations | View Changeset | Root Listing
root/gclib/scripts/gff2bed
Revision: 120
Committed: Sun Dec 4 01:46:51 2011 UTC (7 years, 10 months ago) by gpertea
File size: 6169 byte(s)
Log Message:
Line User Rev File contents
1 gpertea 23 #!/usr/bin/perl
2     use strict;
3     use Getopt::Std;
4     use FindBin;use lib $FindBin::Bin;
5    
6     my $usage = q/Usage:
7     gff2bed [-c:<rgb_color>] <input.gtf>
8     Convert GFF annotation to BED12 format
9     /;
10     umask 0002;
11     getopts('c:o:') || die($usage."\n");
12     my $outfile=$Getopt::Std::opt_o;
13     if ($outfile) {
14     open(OUTF, '>'.$outfile) || die("Error creating output file $outfile\n");
15     select(OUTF);
16     }
17     my $tcolor= $Getopt::Std::opt_c || 0;
18 gpertea 120 my $tmaxscore = 0;
19 gpertea 23 #$tcolor=hex($tcolor) if ($tcolor=~m/^0x/ || $tcolor=~m/[a-f,A-F]/);
20     # --------------------- 0 1 2 3 4 5 6 7 8
21     my %recs; # recID => [ chr, strand, feat_type, gname, tdescr, fstart, fend, [@exons], [@cds] ]
22     #
23     my ($gname, $tdescr);
24     while (<>) {
25     next if m/^\s*#/;
26     chomp;
27     my ($chr, $track, $f, $fstart, $fend, $fscore, $strand, $frame, $lnum)=split(/\t/);
28     next unless $fstart>1 && $lnum;
29     next if $f eq 'gene' || $f eq 'locus'; # Warning: skipping any 'gene' or 'locus' features, unconditionally
30     my $gff3_ID;
31     my $gff3_Parent;
32 gpertea 120 my $score;
33 gpertea 23 ($fstart, $fend)=($fend, $fstart) if $fend<$fstart;
34 gpertea 120 ($score)=($lnum=~m/\bcov[= ]\"?([\d\.]+)/i); #or we can use fpkm of transcripts
35     if ($score>$tmaxscore) {
36     $tmaxscore=$score;
37     }
38 gpertea 23 ($gff3_ID)=($lnum=~m/\bID=([^;]+)/);
39     ($gff3_Parent)=($lnum=~m/\bParent=([^;]+)/);
40     if ($gff3_ID || $gff3_Parent) { # GFF format
41     $gff3_ID=~tr/"//d; #"
42     $gff3_Parent=~tr/"//d; #"
43     $gff3_Parent='' if ($f eq 'mRNA');
44     if ($gff3_ID && !$gff3_Parent) { #top level feature
45     if ($f=~m/RNA/i || $f=~/gene/) {
46     # try to parse the description, if any
47     ($gname,$tdescr)=();
48     if ($lnum=~m/\b(?:descr|tophit|info|product)\s*=\s*"?([^;"]+)/i) {
49     $tdescr=$1;
50     }
51     elsif ($lnum=~m/Name\s*=\s*"?([^;"]+)/) {
52     $tdescr=$1;
53     }
54     if ($lnum=~m/\bgene_name[\s=]+"?([^;"]+)/i) {
55     $gname=$1;
56     }
57     elsif ($lnum=~m/Name\s*=\s*"?([^;"]+)/) {
58     $gname=$1;
59     }
60     $tdescr='' if ($tdescr eq $gname);
61     $gname='' if $gname eq $gff3_ID;
62     }
63     die("Error: duplicate feature $gff3_ID on $chr\n") if (exists($recs{"$chr|$gff3_ID"}));
64     my $recID="$chr|$gff3_ID";
65 gpertea 120 $recs{$recID} = [$chr, $strand, $f, $gname, $tdescr, $fstart, $fend, [], [], $score ];
66 gpertea 23 next;
67     } # parent/top-level feature
68     } #GFF
69     else { #GTF format
70     next if ($f eq 'transcript'); #exception: GTF with parent 'transcript' feature
71     }
72     # -------------- exon/CDS line here:
73     my $recID;
74     ($gname, $tdescr)=();
75     if ($track=~/^jigsaw/ && $lnum=~m/^\d+$/) {
76     $recID=$chr.'.jsm.'.$lnum;
77     }
78     elsif ($lnum=~m/Parent=(['"\:\w\|\-\.]+)/) {
79     $recID=$1;
80     $recID=~tr/"//d; #"
81     }
82     elsif ($lnum=~m/transcript_id[= ]+(['"\:\w\.\|\-]+)/) {
83     $recID=$1;
84     $recID=~tr/"//d; #"
85     }
86     else {
87     die("Error: cannot parse locus/transcript name from input line:\n$_\n");
88     }
89     if (!$gname && $lnum=~m/gene_id[= ]+(['"\:\w\.\|\-]+)/) {
90     $gname=$1;
91     $gname=~tr/"//d; #"
92     }
93     $tdescr='' if index($recID, $tdescr)>=0;
94     $gname='' if index($recID, $gname)>=0;
95     $recID=$chr.'|'.$recID;
96     my $ld = $recs{$recID};
97     if ($ld) { #existing entry
98     my $i=($f eq 'CDS') ? 8 : 7;
99     my ($lstart, $lend)=($$ld[5], $$ld[6]);
100     $$ld[5]=$fstart if $fstart<$lstart;
101     $$ld[6]=$fend if $fend>$lend;
102     push(@{$$ld[$i]}, [$fstart, $fend, $fscore]);
103     }
104 gpertea 120 else { # first time seeing this transcript
105 gpertea 23 $recs{$recID} = ($f eq 'CDS') ?
106 gpertea 120 [$chr, $strand, $f, $gname, $tdescr, $fstart, $fend, [], [[$fstart, $fend, $fscore]], $score ] :
107     [$chr, $strand, $f, $gname, $tdescr, $fstart, $fend, [[$fstart, $fend, $fscore]], [], $score ] ;
108 gpertea 23 # 0 1 2 3 4 5 6 7 (exons) 8 (CDS)
109     ($gname,$tdescr)=();
110     }
111     } #while <>
112    
113    
114     # -- sort features by chromosome:
115     print STDERR "GFF data loaded. Sorting by chromosome location..\n";
116     my @sorted_features=sort sortByLoc keys(%recs);
117     print STDERR "Writing BED file..\n";
118     writeModels(\@sorted_features);
119     print STDERR "Done.\n";
120     # --
121     if ($outfile) {
122     select(STDOUT);
123     close(OUTF);
124     }
125    
126     #************ Subroutines **************
127     sub sortByLoc {
128     my $da=$recs{$a};
129     my $db=$recs{$b};
130     if ($$da[0] eq $$db[0]) {
131     return ($$da[5]==$$db[5]) ? $$da[6] <=> $$db[6] : $$da[5] <=> $$db[5] ;
132     }
133     else { return $$da[0] cmp $$db[0] ; }
134     }
135    
136     sub writeModels {
137     #return if keys(%recs)==0;
138     my $rlist=shift(@_);
139     my @recs_keys;
140     unless ($rlist) {
141     @recs_keys=keys(%recs);
142     $rlist=\@recs_keys;
143     }
144 gpertea 120
145 gpertea 23 foreach my $gffid (@$rlist) {
146     my $ld=$recs{$gffid};
147     # my $ld=$recs{$l} || die ("Error: locus $l found in list but not in hash!\n");
148 gpertea 120 # 0 1 2 3 4 5 6 7 8 9
149     my ($chr, $strand, $ftype, $gname, $descr, $lstart, $lend, $er, $cr, $score) = @$ld;
150 gpertea 23 my ($mstart,$mend)=($lstart, $lend);
151     my $CDexons=0;
152     my @ex;
153     my @cds;
154     if (@$er<1 && @$cr>0) {
155     @ex = sort { $a->[0] <=> $b->[0] } @$cr;
156     $CDexons=1;
157     }
158     else {
159     @ex = sort { $a->[0] <=> $b->[0] } @$er;
160     if (@$cr>0) {
161     @cds= sort { $a->[0] <=> $b->[0] } @$cr;
162     }
163     }
164     ($mstart, $mend) = ($ex[0]->[0], $ex[-1]->[1]); # get the more accurate version of the start-end coords for the feature
165     my ($cds_start, $cds_end) = ($CDexons || @cds==0) ? ($mstart, $mend) : ($cds[0]->[0], $cds[-1]->[1]) ;
166     my @bstarts;
167     my @blens;
168     foreach my $ed (@ex) {
169     push(@bstarts, $$ed[0]-$mstart);
170     push(@blens, $$ed[1]-$$ed[0]+1);
171     }
172 gpertea 120 if ($score) {
173     $tcolor=score2color($score);
174    
175     }
176     else {
177     $score=666;
178     }
179     print join("\t", $chr, $mstart-1, $mend, "$gffid|$gname", $score, $strand, $cds_start-1, $cds_end,
180     join(',',$tcolor,$tcolor,$tcolor),
181 gpertea 23 scalar(@bstarts), join(',', @blens), join(',', @bstarts))."\n";
182     } #for each stored transcript
183     }
184 gpertea 120
185     sub score2color {
186     my $c=int(255-(255.0 * ($_[0]/$tmaxscore)));
187     $c=0 if $c<0;
188     $c=128 if $c>=128;
189     return $c;
190     }

Properties

Name Value
svn:executable *