ViewVC Help
View File | Revision Log | Show Annotations | View Changeset | Root Listing
root/gclib/scripts/hg19_getcons.pl
Revision: 24
Committed: Tue Jul 26 21:46:39 2011 UTC (8 years, 2 months ago) by gpertea
File size: 9149 byte(s)
Log Message:
Line User Rev File contents
1 gpertea 23 #!/usr/bin/perl
2     use strict;
3     use Getopt::Std;
4     use FindBin;use lib $FindBin::Bin;
5     my $wigbindir='/fs/szannotation/human_rnaseq/hg19_phastCons';
6     #my $fmid='phyloP46way'; #middle part of the file name
7    
8     my $usage = qq/Usage:
9     hg19_getcons.pl {-r <chr>:<interval_list> | -g <gff>} \
10     [-d <wigfixbin_dir>]
11     Returns conservation average and sample standard deviation across
12     bases in given intervals (exons)
13    
14     Requires a directory with .wigfixbin files for each chromosome.
15     (default: $wigbindir)
16     /;
17    
18     umask 0002;
19     my %ch; # file access cache: chr->[filepath, start_cpos]
20     # the value for start_cpos is always at offset 9 in the file!
21    
22     getopts('g:r:o:') || die($usage."\n");
23     my $outfile=$Getopt::Std::opt_o;
24     if ($outfile) {
25     open(OUTF, '>'.$outfile) || die("Error creating output file $outfile\n");
26     select(OUTF);
27     }
28    
29     my $intv=$Getopt::Std::opt_r;
30     if ($intv) {
31     my ($chr,$rlst)=split(/\:/,$intv);
32     die("$usage Incorrect format for the interval list!\n") unless $chr && $rlst;
33     my $strand=chop($chr);
34     if ($strand ne '-' && $strand ne '+') {
35     $chr.=$strand;
36     $strand=undef;
37     }
38     my @rdata=map { [split(/[\-\.]+/)] } (split(/[\,\;\s]+/,$rlst));
39     foreach my $d (@rdata) {
40     ($$d[0], $$d[1])=($$d[1], $$d[0]) if $$d[0]>$$d[1];
41     }
42     my @ex = sort { $a->[0] <=> $b->[0] } @rdata;
43     my ($avg, $std)=fetchCons($chr, \@ex);
44     print STDOUT "$intv\t$avg\t$std\n";
45     exit;
46     }
47     #else {
48     # GFF/GTF input
49     my $fgff=$Getopt::Std::opt_g;
50     my %gffrecs;
51     # --------------------- 0 1 2 3 4 5 6 7 8
52     # recID => [ chr, strand, feat_type, gname, tdescr, fstart, fend, [@exons], [@cds] ]
53    
54     # --
55     die("$usage Error: -r or -g required!\n") unless $fgff;
56     my $gffh;
57     if ($fgff eq '-') {
58     open($gffh, "<&=STDIN") || die("Error: couldn't alias STDIN. $!\n");
59     }
60     else {
61     open($gffh, $fgff) || die("Error: failed to open $fgff. $!\n");
62     }
63    
64     loadGff($gffh, \%gffrecs);
65     # -- sort features by chromosome:
66     #print STDERR "GFF data loaded. Sorting by chromosome location..\n";
67     my @sorted_features=sort sortByLoc keys(%gffrecs);
68     # print STDERR "Writing BED file..\n";
69     processGffRecs(\%gffrecs, \@sorted_features);
70     # --
71     if ($outfile) {
72     select(STDOUT);
73     close(OUTF);
74     }
75    
76     #************ Subroutines **************
77     sub loadGff {
78     my ($fh, $recs)=@_; # $hr=\%gffrecs
79     # --------------------- 0 1 2 3 4 5 6 7 8
80     # recID => [ chr, strand, feat_type, gname, tdescr, fstart, fend, [@exons], [@cds] ]
81     while (<$fh>) {
82     next if m/^\s*#/;
83     chomp;
84     my ($chr, $track, $f, $fstart, $fend, $fscore, $strand, $frame, $lnum)=split(/\t/);
85     next unless $fstart>1 && $lnum;
86     next if $f eq 'gene' || $f eq 'locus'; # Warning: skipping any 'gene' or 'locus' features, unconditionally
87     my $gff3_ID;
88     my $gff3_Parent;
89     my ($gname, $tdescr);
90     ($fstart, $fend)=($fend, $fstart) if $fend<$fstart;
91     ($gff3_ID)=($lnum=~m/\bID=([^;]+)/);
92     ($gff3_Parent)=($lnum=~m/\bParent=([^;]+)/);
93     if ($gff3_ID || $gff3_Parent) { # GFF format
94     $gff3_ID=~tr/"//d; #"
95     $gff3_Parent=~tr/"//d; #"
96     $gff3_Parent='' if ($f eq 'mRNA');
97     if ($gff3_ID && !$gff3_Parent) { #top level feature
98     if ($f=~m/RNA/i || $f=~/gene/) {
99     # try to parse the description, if any
100     $tdescr='';
101     $gname='';
102     if ($lnum=~m/\b(?:descr|tophit|info|product)\s*=\s*"?([^;"]+)/i) {
103     $tdescr=$1;
104     }
105     elsif ($lnum=~m/Name\s*=\s*"?([^;"]+)/) {
106     $tdescr=$1;
107     }
108     if ($lnum=~m/\bgene_name[\s=]+"?([^;"]+)/i) {
109     $gname=$1;
110     }
111     elsif ($lnum=~m/Name\s*=\s*"?([^;"]+)/) {
112     $gname=$1;
113     }
114     $tdescr='' if ($tdescr eq $gname);
115     $gname='' if $gname eq $gff3_ID;
116     }
117     die("Error: duplicate feature $gff3_ID on $chr\n") if (exists($$recs{"$chr|$gff3_ID"}));
118     my $recID="$chr|$gff3_ID";
119     $$recs{$recID} = [$chr, $strand, $f, $gname, $tdescr, $fstart, $fend, [], [] ];
120     next;
121     } # parent/top-level feature
122     } #GFF
123     else { #GTF format
124     next if ($f eq 'transcript'); #exception: GTF with parent 'transcript' feature
125     }
126     # -------------- exon/CDS line here:
127     my $recID;
128     if ($track=~/^jigsaw/ && $lnum=~m/^\d+$/) {
129     $recID=$chr.'.jsm.'.$lnum;
130     }
131     elsif ($lnum=~m/Parent=(['"\:\w\|\-\.]+)/) {
132     $recID=$1;
133     $recID=~tr/"//d; #"
134     }
135     elsif ($lnum=~m/transcript_id[= ]+(['"\:\w\.\|\-]+)/) {
136     $recID=$1;
137     $recID=~tr/"//d; #"
138     }
139     else {
140     die("Error: cannot parse locus/transcript name from input line:\n$_\n");
141     }
142     if (!$gname && $lnum=~m/gene_id[= ]+(['"\:\w\.\|\-]+)/) {
143     $gname=$1;
144     $gname=~tr/"//d; #"
145     }
146     $tdescr='' if index($recID, $tdescr)>=0;
147     $gname='' if index($recID, $gname)>=0;
148     $recID=$chr.'|'.$recID;
149     my $ld = $$recs{$recID};
150     if ($ld) { #existing entry
151     my $i=($f eq 'CDS') ? 8 : 7;
152     my ($lstart, $lend)=($$ld[5], $$ld[6]);
153     $$ld[5]=$fstart if $fstart<$lstart;
154     $$ld[6]=$fend if $fend>$lend;
155     push(@{$$ld[$i]}, [$fstart, $fend, $fscore]);
156     }
157     else { # first time seeing this locus/gene
158     $$recs{$recID} = ($f eq 'CDS') ?
159     [$chr, $strand, $f, $gname, $tdescr, $fstart, $fend, [], [[$fstart, $fend, $fscore]] ] :
160     [$chr, $strand, $f, $gname, $tdescr, $fstart, $fend, [[$fstart, $fend, $fscore]], [] ] ;
161     } # 0 1 2 3 4 5 6 7 (exons) 8 (CDS)
162     } #while <$fh>
163     } # loadGff
164    
165     sub sortByLoc {
166     my $da=$gffrecs{$a};
167     my $db=$gffrecs{$b};
168     if ($$da[0] eq $$db[0]) {
169     return ($$da[5]==$$db[5]) ? $$da[6] <=> $$db[6] : $$da[5] <=> $$db[5] ;
170     }
171     else { return $$da[0] cmp $$db[1] ; }
172     }
173    
174     sub processGffRecs {
175     #return if keys(%recs)==0;
176     my ($recs, $rlist)=@_;
177     my @recs_keys;
178     unless ($rlist) {
179     @recs_keys=keys(%$recs);
180     $rlist=\@recs_keys;
181     }
182     foreach my $gffid (@$rlist) {
183     my $ld=$$recs{$gffid};
184     # my $ld=$recs{$l} || die ("Error: locus $l found in list but not in hash!\n");
185     # 0 1 2 3 4 5 6 7 8
186     my ($chr, $strand, $ftype, $gname, $descr, $lstart, $lend, $er, $cr) = @$ld;
187     # my ($mstart,$mend)=($lstart, $lend);
188     my $CDexons=0;
189     my @ex;
190     my @cds;
191     #some records might lack exons, but have only CDS segments (e.g. mitochondrial genes)
192     if (@$er<1 && @$cr>0) {
193     @ex = sort { $a->[0] <=> $b->[0] } @$cr;
194     $CDexons=1;
195     }
196     else {
197     @ex = sort { $a->[0] <=> $b->[0] } @$er;
198     #if (@$cr>0) { # sort cds segments too
199     # @cds= sort { $a->[0] <=> $b->[0] } @$cr;
200     # }
201     }
202     # get the more accurate version of the start-end coords for the feature
203     #($mstart, $mend) = ($ex[0]->[0], $ex[-1]->[1]);
204     my @exonlst = map { $_->[0].'-'.$_->[1] } @ex;
205     my ($avg, $std)=fetchCons($chr, \@ex);
206     substr($gffid, 0, length($chr)+1)='';
207     print join("\t", $gffid, $chr.$strand.':'.join(',',@exonlst),$avg, $std)."\n";
208     } #for each stored transcript
209     }
210    
211    
212     # -- fetch conservation data for a sorted set of exons
213     sub fetchCons {
214     my ($chr, $exr)=@_; # $exr is \@exons (sorted)
215     my ($tstart, $tend)=($$exr[0]->[0], $$exr[-1]->[1]);
216     my $chd=$ch{$chr};
217     my ($cfname, $cfstart);
218     my $fh;
219     if ($chd) {
220     ($cfname, $cfstart)=@$chd;
221     open($fh, $cfname) || die("Error opening file $cfname!\n");
222     binmode($fh);
223     }
224     else {
225     $cfname="$wigbindir/$chr.wigfixbin";
226     open($fh, $cfname) || die("Error opening file $cfname!\n");
227     binmode($fh);
228     my ($tag, $r);
229     read($fh, $tag, 4);
230     die("Error: $cfname is not a wigbin file (tag=$tag)!\n") unless $tag eq 'WIGB';
231     read($fh, $r, 4);
232     my @v=unpack('I',$r);
233     $cfstart=$v[0];
234     #print STDERR "start=$cfstart\n";
235     $ch{$chr}=[$cfname, $cfstart];
236     }
237     my $tspan=($tend-$tstart+1);
238     my @rdata = (0) x $tspan;
239     if ($tend>=$cfstart) {
240     my ($rstart, $rspan)=($tstart, $tspan); # real start and read values
241     if ($rstart<$cfstart) {
242     $rstart=$cfstart;
243     $rspan=($tend-$rstart+1);
244     }
245     #print STDERR "..seeking to ".(8+2*($rstart-$cfstart))."\n";
246     seek($fh, 8+2*($rstart-$cfstart), 0);
247     my $rdata;
248     # my $rb;
249     # my $nr=0;
250     # my @v;
251     # for (my $i=0;$i<$rspan;$i++) {
252     # $rb=read($fh,$rdata,2);
253     # last if $rb<2;
254     # my @cv=unpack('s',$rdata);
255     # push(@v,$cv[0]);
256     # $nr++;
257     # }
258     my $rb=read($fh, $rdata, 2*$rspan);
259     $rb>>=1; # number of values read (each value is 2 bytes)
260     my @v=unpack('s'.$rb, $rdata);
261     #
262     # print STDERR "rspan=$rspan; unpacking s$rb values: (".join(',',@v).")\n";
263     @rdata[$rstart-$tstart .. $rstart-$tstart+$#v]=@v;
264     }
265     #now compute average&std for exons only
266     my ($sum, $vcount);
267     foreach my $e (@$exr) {
268     $vcount+=$$e[1]-$$e[0]+1;
269     map { $sum+=($_/1000) } @rdata[$$e[0]-$tstart .. $$e[1]-$tstart];
270     }
271     my $avg=$sum/$vcount;
272     $sum=0;
273     foreach my $e (@$exr) {
274     map { $sum+=((($_/1000)-$avg)**2) } @rdata[$$e[0]-$tstart .. $$e[1]-$tstart];
275     }
276     my $std=sqrt($sum/($vcount-1));
277     return(sprintf('%.3f',$avg), sprintf('%.3f',$std));
278     }

Properties

Name Value
svn:executable *