1 |
#!/usr/bin/perl |
2 |
use strict; |
3 |
use Getopt::Std; |
4 |
use FindBin;use lib $FindBin::Bin; |
5 |
|
6 |
my $usage = q/Usage: |
7 |
gff2bed [-c:<rgb_color>] <input.gtf> |
8 |
Convert GFF annotation to BED12 format |
9 |
/; |
10 |
umask 0002; |
11 |
getopts('c:o:') || die($usage."\n"); |
12 |
my $outfile=$Getopt::Std::opt_o; |
13 |
if ($outfile) { |
14 |
open(OUTF, '>'.$outfile) || die("Error creating output file $outfile\n"); |
15 |
select(OUTF); |
16 |
} |
17 |
my $tcolor= $Getopt::Std::opt_c || 0; |
18 |
my $tmaxscore = 0; |
19 |
#$tcolor=hex($tcolor) if ($tcolor=~m/^0x/ || $tcolor=~m/[a-f,A-F]/); |
20 |
# --------------------- 0 1 2 3 4 5 6 7 8 |
21 |
my %recs; # recID => [ chr, strand, feat_type, gname, tdescr, fstart, fend, [@exons], [@cds] ] |
22 |
# |
23 |
my ($gname, $tdescr); |
24 |
while (<>) { |
25 |
next if m/^\s*#/; |
26 |
chomp; |
27 |
my ($chr, $track, $f, $fstart, $fend, $fscore, $strand, $frame, $lnum)=split(/\t/); |
28 |
next unless $fstart>1 && $lnum; |
29 |
next if $f eq 'gene' || $f eq 'locus'; # Warning: skipping any 'gene' or 'locus' features, unconditionally |
30 |
my $gff3_ID; |
31 |
my $gff3_Parent; |
32 |
my $score; |
33 |
($fstart, $fend)=($fend, $fstart) if $fend<$fstart; |
34 |
($score)=($lnum=~m/\bcov[= ]\"?([\d\.]+)/i); #or we can use fpkm of transcripts |
35 |
if ($score>$tmaxscore) { |
36 |
$tmaxscore=$score; |
37 |
} |
38 |
($gff3_ID)=($lnum=~m/\bID=([^;]+)/); |
39 |
($gff3_Parent)=($lnum=~m/\bParent=([^;]+)/); |
40 |
if ($gff3_ID || $gff3_Parent) { # GFF format |
41 |
$gff3_ID=~tr/"//d; #" |
42 |
$gff3_Parent=~tr/"//d; #" |
43 |
$gff3_Parent='' if ($f eq 'mRNA'); |
44 |
if ($gff3_ID && !$gff3_Parent) { #top level feature |
45 |
if ($f=~m/RNA/i || $f=~/gene/) { |
46 |
# try to parse the description, if any |
47 |
($gname,$tdescr)=(); |
48 |
if ($lnum=~m/\b(?:descr|tophit|info|product)\s*=\s*"?([^;"]+)/i) { |
49 |
$tdescr=$1; |
50 |
} |
51 |
elsif ($lnum=~m/Name\s*=\s*"?([^;"]+)/) { |
52 |
$tdescr=$1; |
53 |
} |
54 |
if ($lnum=~m/\bgene_name[\s=]+"?([^;"]+)/i) { |
55 |
$gname=$1; |
56 |
} |
57 |
elsif ($lnum=~m/Name\s*=\s*"?([^;"]+)/) { |
58 |
$gname=$1; |
59 |
} |
60 |
$tdescr='' if ($tdescr eq $gname); |
61 |
$gname='' if $gname eq $gff3_ID; |
62 |
} |
63 |
die("Error: duplicate feature $gff3_ID on $chr\n") if (exists($recs{"$chr|$gff3_ID"})); |
64 |
my $recID="$chr|$gff3_ID"; |
65 |
$recs{$recID} = [$chr, $strand, $f, $gname, $tdescr, $fstart, $fend, [], [], $score ]; |
66 |
next; |
67 |
} # parent/top-level feature |
68 |
} #GFF |
69 |
else { #GTF format |
70 |
next if ($f eq 'transcript'); #exception: GTF with parent 'transcript' feature |
71 |
} |
72 |
# -------------- exon/CDS line here: |
73 |
my $recID; |
74 |
($gname, $tdescr)=(); |
75 |
if ($track=~/^jigsaw/ && $lnum=~m/^\d+$/) { |
76 |
$recID=$chr.'.jsm.'.$lnum; |
77 |
} |
78 |
elsif ($lnum=~m/Parent=(['"\:\w\|\-\.]+)/) { |
79 |
$recID=$1; |
80 |
$recID=~tr/"//d; #" |
81 |
} |
82 |
elsif ($lnum=~m/transcript_id[= ]+(['"\:\w\.\|\-]+)/) { |
83 |
$recID=$1; |
84 |
$recID=~tr/"//d; #" |
85 |
} |
86 |
else { |
87 |
die("Error: cannot parse locus/transcript name from input line:\n$_\n"); |
88 |
} |
89 |
if (!$gname && $lnum=~m/gene_id[= ]+(['"\:\w\.\|\-]+)/) { |
90 |
$gname=$1; |
91 |
$gname=~tr/"//d; #" |
92 |
} |
93 |
$tdescr='' if index($recID, $tdescr)>=0; |
94 |
$gname='' if index($recID, $gname)>=0; |
95 |
$recID=$chr.'|'.$recID; |
96 |
my $ld = $recs{$recID}; |
97 |
if ($ld) { #existing entry |
98 |
my $i=($f eq 'CDS') ? 8 : 7; |
99 |
my ($lstart, $lend)=($$ld[5], $$ld[6]); |
100 |
$$ld[5]=$fstart if $fstart<$lstart; |
101 |
$$ld[6]=$fend if $fend>$lend; |
102 |
push(@{$$ld[$i]}, [$fstart, $fend, $fscore]); |
103 |
} |
104 |
else { # first time seeing this transcript |
105 |
$recs{$recID} = ($f eq 'CDS') ? |
106 |
[$chr, $strand, $f, $gname, $tdescr, $fstart, $fend, [], [[$fstart, $fend, $fscore]], $score ] : |
107 |
[$chr, $strand, $f, $gname, $tdescr, $fstart, $fend, [[$fstart, $fend, $fscore]], [], $score ] ; |
108 |
# 0 1 2 3 4 5 6 7 (exons) 8 (CDS) |
109 |
($gname,$tdescr)=(); |
110 |
} |
111 |
} #while <> |
112 |
|
113 |
|
114 |
# -- sort features by chromosome: |
115 |
print STDERR "GFF data loaded. Sorting by chromosome location..\n"; |
116 |
my @sorted_features=sort sortByLoc keys(%recs); |
117 |
print STDERR "Writing BED file..\n"; |
118 |
writeModels(\@sorted_features); |
119 |
print STDERR "Done.\n"; |
120 |
# -- |
121 |
if ($outfile) { |
122 |
select(STDOUT); |
123 |
close(OUTF); |
124 |
} |
125 |
|
126 |
#************ Subroutines ************** |
127 |
sub sortByLoc { |
128 |
my $da=$recs{$a}; |
129 |
my $db=$recs{$b}; |
130 |
if ($$da[0] eq $$db[0]) { |
131 |
return ($$da[5]==$$db[5]) ? $$da[6] <=> $$db[6] : $$da[5] <=> $$db[5] ; |
132 |
} |
133 |
else { return $$da[0] cmp $$db[0] ; } |
134 |
} |
135 |
|
136 |
sub writeModels { |
137 |
#return if keys(%recs)==0; |
138 |
my $rlist=shift(@_); |
139 |
my @recs_keys; |
140 |
unless ($rlist) { |
141 |
@recs_keys=keys(%recs); |
142 |
$rlist=\@recs_keys; |
143 |
} |
144 |
|
145 |
foreach my $gffid (@$rlist) { |
146 |
my $ld=$recs{$gffid}; |
147 |
# my $ld=$recs{$l} || die ("Error: locus $l found in list but not in hash!\n"); |
148 |
# 0 1 2 3 4 5 6 7 8 9 |
149 |
my ($chr, $strand, $ftype, $gname, $descr, $lstart, $lend, $er, $cr, $score) = @$ld; |
150 |
my ($mstart,$mend)=($lstart, $lend); |
151 |
my $CDexons=0; |
152 |
my @ex; |
153 |
my @cds; |
154 |
if (@$er<1 && @$cr>0) { |
155 |
@ex = sort { $a->[0] <=> $b->[0] } @$cr; |
156 |
$CDexons=1; |
157 |
} |
158 |
else { |
159 |
@ex = sort { $a->[0] <=> $b->[0] } @$er; |
160 |
if (@$cr>0) { |
161 |
@cds= sort { $a->[0] <=> $b->[0] } @$cr; |
162 |
} |
163 |
} |
164 |
($mstart, $mend) = ($ex[0]->[0], $ex[-1]->[1]); # get the more accurate version of the start-end coords for the feature |
165 |
my ($cds_start, $cds_end) = ($CDexons || @cds==0) ? ($mstart, $mend) : ($cds[0]->[0], $cds[-1]->[1]) ; |
166 |
my @bstarts; |
167 |
my @blens; |
168 |
foreach my $ed (@ex) { |
169 |
push(@bstarts, $$ed[0]-$mstart); |
170 |
push(@blens, $$ed[1]-$$ed[0]+1); |
171 |
} |
172 |
if ($score) { |
173 |
$tcolor=score2color($score); |
174 |
|
175 |
} |
176 |
else { |
177 |
$score=666; |
178 |
} |
179 |
print join("\t", $chr, $mstart-1, $mend, "$gffid|$gname", $score, $strand, $cds_start-1, $cds_end, |
180 |
join(',',$tcolor,$tcolor,$tcolor), |
181 |
scalar(@bstarts), join(',', @blens), join(',', @bstarts))."\n"; |
182 |
} #for each stored transcript |
183 |
} |
184 |
|
185 |
sub score2color { |
186 |
my $c=int(255-(255.0 * ($_[0]/$tmaxscore))); |
187 |
$c=0 if $c<0; |
188 |
$c=128 if $c>=128; |
189 |
return $c; |
190 |
} |