ViewVC Help
View File | Revision Log | Show Annotations | View Changeset | Root Listing
root/gclib/scripts/gff2bed
Revision: 23
Committed: Tue Jul 26 21:44:38 2011 UTC (8 years, 2 months ago) by gpertea
Original Path: ann_bin/gff2bed
File size: 5703 byte(s)
Log Message:
adding misc scripts

Line File contents
1 #!/usr/bin/perl
2 use strict;
3 use Getopt::Std;
4 use FindBin;use lib $FindBin::Bin;
5
6 my $usage = q/Usage:
7 gff2bed [-c:<rgb_color>] <input.gtf>
8 Convert GFF annotation to BED12 format
9 /;
10 umask 0002;
11 getopts('c:o:') || die($usage."\n");
12 my $outfile=$Getopt::Std::opt_o;
13 if ($outfile) {
14 open(OUTF, '>'.$outfile) || die("Error creating output file $outfile\n");
15 select(OUTF);
16 }
17 my $tcolor= $Getopt::Std::opt_c || 0;
18 #$tcolor=hex($tcolor) if ($tcolor=~m/^0x/ || $tcolor=~m/[a-f,A-F]/);
19 # --------------------- 0 1 2 3 4 5 6 7 8
20 my %recs; # recID => [ chr, strand, feat_type, gname, tdescr, fstart, fend, [@exons], [@cds] ]
21 #
22 my ($gname, $tdescr);
23 while (<>) {
24 next if m/^\s*#/;
25 chomp;
26 my ($chr, $track, $f, $fstart, $fend, $fscore, $strand, $frame, $lnum)=split(/\t/);
27 next unless $fstart>1 && $lnum;
28 next if $f eq 'gene' || $f eq 'locus'; # Warning: skipping any 'gene' or 'locus' features, unconditionally
29 my $gff3_ID;
30 my $gff3_Parent;
31 ($fstart, $fend)=($fend, $fstart) if $fend<$fstart;
32 ($gff3_ID)=($lnum=~m/\bID=([^;]+)/);
33 ($gff3_Parent)=($lnum=~m/\bParent=([^;]+)/);
34 if ($gff3_ID || $gff3_Parent) { # GFF format
35 $gff3_ID=~tr/"//d; #"
36 $gff3_Parent=~tr/"//d; #"
37 $gff3_Parent='' if ($f eq 'mRNA');
38 if ($gff3_ID && !$gff3_Parent) { #top level feature
39 if ($f=~m/RNA/i || $f=~/gene/) {
40 # try to parse the description, if any
41 ($gname,$tdescr)=();
42 if ($lnum=~m/\b(?:descr|tophit|info|product)\s*=\s*"?([^;"]+)/i) {
43 $tdescr=$1;
44 }
45 elsif ($lnum=~m/Name\s*=\s*"?([^;"]+)/) {
46 $tdescr=$1;
47 }
48 if ($lnum=~m/\bgene_name[\s=]+"?([^;"]+)/i) {
49 $gname=$1;
50 }
51 elsif ($lnum=~m/Name\s*=\s*"?([^;"]+)/) {
52 $gname=$1;
53 }
54 $tdescr='' if ($tdescr eq $gname);
55 $gname='' if $gname eq $gff3_ID;
56 }
57 die("Error: duplicate feature $gff3_ID on $chr\n") if (exists($recs{"$chr|$gff3_ID"}));
58 my $recID="$chr|$gff3_ID";
59 $recs{$recID} = [$chr, $strand, $f, $gname, $tdescr, $fstart, $fend, [], [] ];
60 next;
61 } # parent/top-level feature
62 } #GFF
63 else { #GTF format
64 next if ($f eq 'transcript'); #exception: GTF with parent 'transcript' feature
65 }
66 # -------------- exon/CDS line here:
67 my $recID;
68 ($gname, $tdescr)=();
69 if ($track=~/^jigsaw/ && $lnum=~m/^\d+$/) {
70 $recID=$chr.'.jsm.'.$lnum;
71 }
72 elsif ($lnum=~m/Parent=(['"\:\w\|\-\.]+)/) {
73 $recID=$1;
74 $recID=~tr/"//d; #"
75 }
76 elsif ($lnum=~m/transcript_id[= ]+(['"\:\w\.\|\-]+)/) {
77 $recID=$1;
78 $recID=~tr/"//d; #"
79 }
80 else {
81 die("Error: cannot parse locus/transcript name from input line:\n$_\n");
82 }
83 if (!$gname && $lnum=~m/gene_id[= ]+(['"\:\w\.\|\-]+)/) {
84 $gname=$1;
85 $gname=~tr/"//d; #"
86 }
87 $tdescr='' if index($recID, $tdescr)>=0;
88 $gname='' if index($recID, $gname)>=0;
89 $recID=$chr.'|'.$recID;
90 my $ld = $recs{$recID};
91 if ($ld) { #existing entry
92 my $i=($f eq 'CDS') ? 8 : 7;
93 my ($lstart, $lend)=($$ld[5], $$ld[6]);
94 $$ld[5]=$fstart if $fstart<$lstart;
95 $$ld[6]=$fend if $fend>$lend;
96 push(@{$$ld[$i]}, [$fstart, $fend, $fscore]);
97 }
98 else { # first time seeing this locus/gene
99 $recs{$recID} = ($f eq 'CDS') ?
100 [$chr, $strand, $f, $gname, $tdescr, $fstart, $fend, [], [[$fstart, $fend, $fscore]] ] :
101 [$chr, $strand, $f, $gname, $tdescr, $fstart, $fend, [[$fstart, $fend, $fscore]], [] ] ;
102 # 0 1 2 3 4 5 6 7 (exons) 8 (CDS)
103 ($gname,$tdescr)=();
104 }
105 } #while <>
106
107
108 # -- sort features by chromosome:
109 print STDERR "GFF data loaded. Sorting by chromosome location..\n";
110 my @sorted_features=sort sortByLoc keys(%recs);
111 print STDERR "Writing BED file..\n";
112 writeModels(\@sorted_features);
113 print STDERR "Done.\n";
114 # --
115 if ($outfile) {
116 select(STDOUT);
117 close(OUTF);
118 }
119
120 #************ Subroutines **************
121 sub sortByLoc {
122 my $da=$recs{$a};
123 my $db=$recs{$b};
124 if ($$da[0] eq $$db[0]) {
125 return ($$da[5]==$$db[5]) ? $$da[6] <=> $$db[6] : $$da[5] <=> $$db[5] ;
126 }
127 else { return $$da[0] cmp $$db[0] ; }
128 }
129
130 sub writeModels {
131 #return if keys(%recs)==0;
132 my $rlist=shift(@_);
133 my @recs_keys;
134 unless ($rlist) {
135 @recs_keys=keys(%recs);
136 $rlist=\@recs_keys;
137 }
138
139 foreach my $gffid (@$rlist) {
140 my $ld=$recs{$gffid};
141 # my $ld=$recs{$l} || die ("Error: locus $l found in list but not in hash!\n");
142 # 0 1 2 3 4 5 6 7 8
143 my ($chr, $strand, $ftype, $gname, $descr, $lstart, $lend, $er, $cr) = @$ld;
144 my ($mstart,$mend)=($lstart, $lend);
145 my $CDexons=0;
146 my @ex;
147 my @cds;
148 if (@$er<1 && @$cr>0) {
149 @ex = sort { $a->[0] <=> $b->[0] } @$cr;
150 $CDexons=1;
151 }
152 else {
153 @ex = sort { $a->[0] <=> $b->[0] } @$er;
154 if (@$cr>0) {
155 @cds= sort { $a->[0] <=> $b->[0] } @$cr;
156 }
157 }
158 ($mstart, $mend) = ($ex[0]->[0], $ex[-1]->[1]); # get the more accurate version of the start-end coords for the feature
159 my ($cds_start, $cds_end) = ($CDexons || @cds==0) ? ($mstart, $mend) : ($cds[0]->[0], $cds[-1]->[1]) ;
160 my @bstarts;
161 my @blens;
162 foreach my $ed (@ex) {
163 push(@bstarts, $$ed[0]-$mstart);
164 push(@blens, $$ed[1]-$$ed[0]+1);
165 }
166 print join("\t", $chr, $mstart-1, $mend, "$gffid|$gname", 666, $strand, $cds_start-1, $cds_end, $tcolor,
167 scalar(@bstarts), join(',', @blens), join(',', @bstarts))."\n";
168 } #for each stored transcript
169 }

Properties

Name Value
svn:executable *