ViewVC Help
View File | Revision Log | Show Annotations | View Changeset | Root Listing
root/gclib/scripts/gff2bed
Revision: 120
Committed: Sun Dec 4 01:46:51 2011 UTC (7 years, 10 months ago) by gpertea
File size: 6169 byte(s)
Log Message:
Line File contents
1 #!/usr/bin/perl
2 use strict;
3 use Getopt::Std;
4 use FindBin;use lib $FindBin::Bin;
5
6 my $usage = q/Usage:
7 gff2bed [-c:<rgb_color>] <input.gtf>
8 Convert GFF annotation to BED12 format
9 /;
10 umask 0002;
11 getopts('c:o:') || die($usage."\n");
12 my $outfile=$Getopt::Std::opt_o;
13 if ($outfile) {
14 open(OUTF, '>'.$outfile) || die("Error creating output file $outfile\n");
15 select(OUTF);
16 }
17 my $tcolor= $Getopt::Std::opt_c || 0;
18 my $tmaxscore = 0;
19 #$tcolor=hex($tcolor) if ($tcolor=~m/^0x/ || $tcolor=~m/[a-f,A-F]/);
20 # --------------------- 0 1 2 3 4 5 6 7 8
21 my %recs; # recID => [ chr, strand, feat_type, gname, tdescr, fstart, fend, [@exons], [@cds] ]
22 #
23 my ($gname, $tdescr);
24 while (<>) {
25 next if m/^\s*#/;
26 chomp;
27 my ($chr, $track, $f, $fstart, $fend, $fscore, $strand, $frame, $lnum)=split(/\t/);
28 next unless $fstart>1 && $lnum;
29 next if $f eq 'gene' || $f eq 'locus'; # Warning: skipping any 'gene' or 'locus' features, unconditionally
30 my $gff3_ID;
31 my $gff3_Parent;
32 my $score;
33 ($fstart, $fend)=($fend, $fstart) if $fend<$fstart;
34 ($score)=($lnum=~m/\bcov[= ]\"?([\d\.]+)/i); #or we can use fpkm of transcripts
35 if ($score>$tmaxscore) {
36 $tmaxscore=$score;
37 }
38 ($gff3_ID)=($lnum=~m/\bID=([^;]+)/);
39 ($gff3_Parent)=($lnum=~m/\bParent=([^;]+)/);
40 if ($gff3_ID || $gff3_Parent) { # GFF format
41 $gff3_ID=~tr/"//d; #"
42 $gff3_Parent=~tr/"//d; #"
43 $gff3_Parent='' if ($f eq 'mRNA');
44 if ($gff3_ID && !$gff3_Parent) { #top level feature
45 if ($f=~m/RNA/i || $f=~/gene/) {
46 # try to parse the description, if any
47 ($gname,$tdescr)=();
48 if ($lnum=~m/\b(?:descr|tophit|info|product)\s*=\s*"?([^;"]+)/i) {
49 $tdescr=$1;
50 }
51 elsif ($lnum=~m/Name\s*=\s*"?([^;"]+)/) {
52 $tdescr=$1;
53 }
54 if ($lnum=~m/\bgene_name[\s=]+"?([^;"]+)/i) {
55 $gname=$1;
56 }
57 elsif ($lnum=~m/Name\s*=\s*"?([^;"]+)/) {
58 $gname=$1;
59 }
60 $tdescr='' if ($tdescr eq $gname);
61 $gname='' if $gname eq $gff3_ID;
62 }
63 die("Error: duplicate feature $gff3_ID on $chr\n") if (exists($recs{"$chr|$gff3_ID"}));
64 my $recID="$chr|$gff3_ID";
65 $recs{$recID} = [$chr, $strand, $f, $gname, $tdescr, $fstart, $fend, [], [], $score ];
66 next;
67 } # parent/top-level feature
68 } #GFF
69 else { #GTF format
70 next if ($f eq 'transcript'); #exception: GTF with parent 'transcript' feature
71 }
72 # -------------- exon/CDS line here:
73 my $recID;
74 ($gname, $tdescr)=();
75 if ($track=~/^jigsaw/ && $lnum=~m/^\d+$/) {
76 $recID=$chr.'.jsm.'.$lnum;
77 }
78 elsif ($lnum=~m/Parent=(['"\:\w\|\-\.]+)/) {
79 $recID=$1;
80 $recID=~tr/"//d; #"
81 }
82 elsif ($lnum=~m/transcript_id[= ]+(['"\:\w\.\|\-]+)/) {
83 $recID=$1;
84 $recID=~tr/"//d; #"
85 }
86 else {
87 die("Error: cannot parse locus/transcript name from input line:\n$_\n");
88 }
89 if (!$gname && $lnum=~m/gene_id[= ]+(['"\:\w\.\|\-]+)/) {
90 $gname=$1;
91 $gname=~tr/"//d; #"
92 }
93 $tdescr='' if index($recID, $tdescr)>=0;
94 $gname='' if index($recID, $gname)>=0;
95 $recID=$chr.'|'.$recID;
96 my $ld = $recs{$recID};
97 if ($ld) { #existing entry
98 my $i=($f eq 'CDS') ? 8 : 7;
99 my ($lstart, $lend)=($$ld[5], $$ld[6]);
100 $$ld[5]=$fstart if $fstart<$lstart;
101 $$ld[6]=$fend if $fend>$lend;
102 push(@{$$ld[$i]}, [$fstart, $fend, $fscore]);
103 }
104 else { # first time seeing this transcript
105 $recs{$recID} = ($f eq 'CDS') ?
106 [$chr, $strand, $f, $gname, $tdescr, $fstart, $fend, [], [[$fstart, $fend, $fscore]], $score ] :
107 [$chr, $strand, $f, $gname, $tdescr, $fstart, $fend, [[$fstart, $fend, $fscore]], [], $score ] ;
108 # 0 1 2 3 4 5 6 7 (exons) 8 (CDS)
109 ($gname,$tdescr)=();
110 }
111 } #while <>
112
113
114 # -- sort features by chromosome:
115 print STDERR "GFF data loaded. Sorting by chromosome location..\n";
116 my @sorted_features=sort sortByLoc keys(%recs);
117 print STDERR "Writing BED file..\n";
118 writeModels(\@sorted_features);
119 print STDERR "Done.\n";
120 # --
121 if ($outfile) {
122 select(STDOUT);
123 close(OUTF);
124 }
125
126 #************ Subroutines **************
127 sub sortByLoc {
128 my $da=$recs{$a};
129 my $db=$recs{$b};
130 if ($$da[0] eq $$db[0]) {
131 return ($$da[5]==$$db[5]) ? $$da[6] <=> $$db[6] : $$da[5] <=> $$db[5] ;
132 }
133 else { return $$da[0] cmp $$db[0] ; }
134 }
135
136 sub writeModels {
137 #return if keys(%recs)==0;
138 my $rlist=shift(@_);
139 my @recs_keys;
140 unless ($rlist) {
141 @recs_keys=keys(%recs);
142 $rlist=\@recs_keys;
143 }
144
145 foreach my $gffid (@$rlist) {
146 my $ld=$recs{$gffid};
147 # my $ld=$recs{$l} || die ("Error: locus $l found in list but not in hash!\n");
148 # 0 1 2 3 4 5 6 7 8 9
149 my ($chr, $strand, $ftype, $gname, $descr, $lstart, $lend, $er, $cr, $score) = @$ld;
150 my ($mstart,$mend)=($lstart, $lend);
151 my $CDexons=0;
152 my @ex;
153 my @cds;
154 if (@$er<1 && @$cr>0) {
155 @ex = sort { $a->[0] <=> $b->[0] } @$cr;
156 $CDexons=1;
157 }
158 else {
159 @ex = sort { $a->[0] <=> $b->[0] } @$er;
160 if (@$cr>0) {
161 @cds= sort { $a->[0] <=> $b->[0] } @$cr;
162 }
163 }
164 ($mstart, $mend) = ($ex[0]->[0], $ex[-1]->[1]); # get the more accurate version of the start-end coords for the feature
165 my ($cds_start, $cds_end) = ($CDexons || @cds==0) ? ($mstart, $mend) : ($cds[0]->[0], $cds[-1]->[1]) ;
166 my @bstarts;
167 my @blens;
168 foreach my $ed (@ex) {
169 push(@bstarts, $$ed[0]-$mstart);
170 push(@blens, $$ed[1]-$$ed[0]+1);
171 }
172 if ($score) {
173 $tcolor=score2color($score);
174
175 }
176 else {
177 $score=666;
178 }
179 print join("\t", $chr, $mstart-1, $mend, "$gffid|$gname", $score, $strand, $cds_start-1, $cds_end,
180 join(',',$tcolor,$tcolor,$tcolor),
181 scalar(@bstarts), join(',', @blens), join(',', @bstarts))."\n";
182 } #for each stored transcript
183 }
184
185 sub score2color {
186 my $c=int(255-(255.0 * ($_[0]/$tmaxscore)));
187 $c=0 if $c<0;
188 $c=128 if $c>=128;
189 return $c;
190 }

Properties

Name Value
svn:executable *