ViewVC Help
View File | Revision Log | Show Annotations | View Changeset | Root Listing
root/gclib/scripts/cuff_loci_stats.pl
Revision: 24
Committed: Tue Jul 26 21:46:39 2011 UTC (8 years, 2 months ago) by gpertea
File size: 16300 byte(s)
Log Message:
Line User Rev File contents
1 gpertea 23 #!/usr/bin/perl
2     use strict;
3     use Getopt::Std;
4     use FindBin;use lib $FindBin::Bin;
5    
6     my $usage = q/Usage:
7     cuff_loci_stats.pl [-o <out_prefix>] \
8     <cuffcmp.combined.gtf> <isoforms.fpkm_tracking> tcons.orflen [<tcons_phastCons.tab>]
9    
10     <cuff_loci.tab> is the [filtered] output of cuffdiff_loc_check.pl
11     <tcons_phastCons.tab> can be generated by hg19_getcons.pl from the <cuffcmp.combined.gtf> file
12     /;
13     umask 0002;
14     getopts('o:') || die($usage."\n");
15     #my ($f_loci, $f_tracking, $f_gtf, $f_fpkm)=@ARGV;
16     my ($f_gtf, $f_fpkm, $f_orflen, $f_cons)=@ARGV;
17     die("$usage\n") unless $f_fpkm;
18     foreach my $f (@ARGV) {
19     die("Error: file $f not found!\n") unless -f $f;
20     }
21     my $outprefix=$Getopt::Std::opt_o || 'cuff_stats';
22     # --
23     my %samples=(
24     'BJAB'=>['GCB',250],
25     'Dogum'=>['GCB', 250],
26     'DOHH2'=>['GCB', 250],
27     'Gumbus'=>['GCB',250],
28     'HT'=>['GCB',250],
29     'Karpas422'=>['GCB',250],
30     'Ly1'=>['GCB',150],
31     'Ly18'=>['GCB',250],
32     'Ly19'=>['GCB',250],
33     'Ly2'=>['GCB',250],
34     'Ly4'=>['GCB',250],
35     'Ly7'=>['GCB',250],
36     'Ly8'=>['GCB',150],
37     'Mieu'=>['GCB',250],
38     'Pfieffer'=>['GCB',250],
39     'ULA'=>['GCB',250],
40     'HBL1'=>['ABC',150], # <====== ABC from here on
41     'TMD8'=>['ABC',150],
42     'Ly10'=>['ABC',150],
43     'Ly3'=>['ABC',250],
44     'RIVA'=>['ABC',250],
45     'SUDHL2'=>['ABC',150],
46     'U2932'=>['ABC',150]
47     );
48     # ABC : ( HBL1 TMD8 Ly10 Ly3 RIVA SUDHL2 U2932 ) # 7 samples
49     # GCB : ( BJAB Dogum DOHH2 Gumbus HT Karpas422 Ly1 Ly18 Ly19 Ly2 Ly4 Ly7 Ly8 Mieu Pfieffer ULA ) # 16 samples
50     #----
51     # ABC merged sample: TMD8 + HBL1 + Ly3 + Ly10
52     # GCB merged sample: BJAB + DOHH2 + Ly1
53    
54     # model samples:
55     my @ABCmodels=qw(HBL1 TMD8 Ly3 Ly10);
56     my @GCBmodels=qw(BJAB DOHH2 Dogum HT Karpas422 Ly1 Ly18 Ly19 Ly4 Ly2 Ly7 Ly8 Mieu Pfieffer ULA);
57     my %mABC; @mABC{@ABCmodels}=();
58     my %mGCB; @mGCB{@GCBmodels}=();
59     my %tcons; # conservation data for each transfrag (optional)
60     my %loc_codes; # keep track of class codes for all transfrags in a loc
61     my %bad_codes;
62     my %bad_loci; #for novel loci, if any transfrags have 'x', 's' or 'o' codes
63     @bad_codes{qw(x s o p)}=();
64     print STDERR "Loading conservation data from $f_cons..\n";
65     if ($f_cons) {
66     open(CONS, $f_cons) || die("Error opening $f_cons!\n");
67     while (<CONS>) {
68     chomp;
69     my @t=split(/\t/);
70     next unless $t[3];
71     $tcons{$t[0]}=[$t[2],$t[3]];
72     }
73     close(CONS);
74     }
75     my %torf; # conservation data for each transfrag (optional)
76     if ($f_orflen) {
77     print STDERR "Loading ORF data from $f_orflen..\n";
78     open(TORF, $f_orflen) || die("Error opening $f_orflen!\n");
79     while (<TORF>) {
80     chomp;
81     my @t=split(/\t/);
82     next unless $t[1]>3;
83     $torf{$t[0]}=$t[1];
84     }
85     close(TORF);
86     }
87     my %tfpkm; # tid=>[@fpkmsmp_values] values are in the same order as @fpkmsmp
88     my @fpkmsmp; # array of sample names, in the same order as found in the <isoforms.fpkm_tracking> header
89     my %s2idx; # sample_name => index in @fpkmsmp
90     print STDERR "Loading FPKM data from $f_fpkm..\n";
91     open(FPKM, $f_fpkm) || die("Error opening $f_fpkm!\n");
92     my $fheader=<FPKM>;
93     chomp($fheader);
94     #TODO: load here all the fpkm data, with header names in fpkmsmp, and write subroutines to retrieve sums for ABC and GCB model samples
95     my @colidx; #index of columns in the isoforms.fpkm_tracking file
96     my $i=0;
97     foreach my $fld (split(/\t/,$fheader)) {
98     my $sn=$fld;
99     if ($sn=~s/_FPKM$//) {
100     push(@fpkmsmp, $sn);
101     $s2idx{$sn}=$#fpkmsmp;
102     push(@colidx, $i);
103     }
104     $i++;
105     }
106     while (<FPKM>) {
107     chomp;
108     my @t=split(/\t/);
109     $tfpkm{$t[0]}=[@t[@colidx]];
110     }
111     #now to get FPKM of a transcript for a specific sample name: $tfpkm{$tid}->[$s2idx{$sample_name}]
112     close(FPKM);
113    
114     # --> model ABC vs. model GCB samples:
115     # allABC/HBL1.bam allABC/TMD8.bam allABC/Ly3.bam allABC/Ly10.bam
116     # allGCB/BJAB.bam allGCB/DOHH2.bam allGCB/Dogum.bam allGCB/HT.bam allGCB/Karpas422.bam allGCB/Ly1.bam allGCB/Ly18.bam allGCB/Ly19.bam allGCB/Ly4.bam allGCB/Ly2.bam allGCB/Ly7.bam allGCB/Ly8.bam allGCB/Mieu.bam allGCB/Pfieffer.bam allGCB/ULA.bam
117     #
118     # --> all ABC vs all GCB samples:
119     # allABC/HBL1.bam allABC/Ly10.bam allABC/Ly3.bam allABC/RIVA.bam allABC/SUDHL2.bam allABC/TMD8.bam allABC/U2932.bam
120     # allGCB/BJAB.bam allGCB/Dogum.bam allGCB/DOHH2.bam allGCB/Gumbus.bam allGCB/HT.bam allGCB/Karpas422.bam allGCB/Ly18.bam allGCB/Ly19.bam allGCB/Ly1.bam allGCB/Ly2.bam allGCB/Ly4.bam allGCB/Ly7.bam allGCB/Ly8.bam allGCB/Mieu.bam allGCB/Pfieffer.bam allGCB/ULA.bam
121     # 0 1 2 3 4 5 6 7 8 9
122     my %ldat; # locus => [ ref_genes, <strand>chr, chrstart, chrstop, [@tcons], max_tlen, maxorf_len, max_numexons, cons_avg, cons_stdev,
123     # modelABC_fpkm, modelGCB_fpkm, sample_fpkm_sums ]
124     # 10 11 12..
125     #
126     # 0 1 2 3 4 5 6 7 8 9 10 11 12...
127     #my %tdat; # tcons => [locus, <strand>chr:start-stop, ref_code, ref_id, ref_gene, [@exons], covlen, maxorf_len, cons_avg, cons_stdev, modelABC_fpkm, modelGCB_fpkm, per_sample_fpkm]
128     my %gffrecs;
129     # --------------------- 0 1 2 3 4 5 6 7 8 9 10
130     # recID => [ chr, strand, xloc, gene_name, tdescr, fstart, fend, [@exons], [@cds], class_code, ref_id ]
131    
132     my $gffh;
133     open($gffh, $f_gtf) || die("Error: failed to open $f_gtf. $!\n");
134     print STDERR "Loading GTF data from $f_gtf..\n";
135     loadGff($gffh, \%gffrecs);
136     #my @sorted_features=sort sortByLoc keys(%gffrecs);
137     print STDERR "Sorting by genomic location..\n";
138     my @sorted_recs=sort sortByLoc keys(%gffrecs);
139     print STDERR "Processing GTF records and writing transfrag stats..\n";
140    
141     ## transcript data:
142     my $tfh;
143     open($tfh, ">$outprefix.t_stats.tab") || die("Error creating $outprefix.t_stats.tab!\n");
144     print $tfh join("\t", qw(transfrag ref_info location xlocus tlen num_exons max_ORF cons_mean cons_stdev ABC_fpkm GCB_fpkm),
145     @fpkmsmp)."\n";
146     # - set up some loci exclusion flags:
147     foreach my $xloc (keys(%loc_codes)) {
148     my $lcodes=$loc_codes{$xloc};
149     if (index($lcodes,'j')<0 && $lcodes=~tr/xso//) {
150     $bad_loci{$xloc}=1;
151     #print STDERR "..declaring $xloc bad..\n";
152     }
153     }
154     my $xloc_list=[];
155     processGffRecs(\%gffrecs, $tfh, \@sorted_recs, $xloc_list); #populates %ldat and writes t_stats, uses %tcons too
156     close($tfh);
157    
158     #now print %ldat and %tdat
159     print STDERR "Writing XLOCi stats..\n";
160     open(LOCSTATS, ">$outprefix.xloc_stats.tab") || die("Error creating $outprefix.xloc_stats.tab!\n");
161     #print locus table header:
162     print LOCSTATS join("\t", qw(xlocus ref_genes location tlist max_tlen max_exons max_ORF cons_mean cons_stdev ABC_fpkm GCB_fpkm),
163     @fpkmsmp)."\n";
164     #foreach my $xloc (keys(%ldat)) {
165     foreach my $xloc (@$xloc_list) {
166     # 0 1 2 3 4 5 6 7 8 9
167     my ($reflst, $schr, $cstart, $cend, $tlst, $max_tlen, $max_orf, $max_exons, $cons_avg, $cons_stdev,
168     # 10 11 12..
169     $modelABC_fpkm, $modelGCB_fpkm, @sample_fpkms)=@{$ldat{$xloc}};
170     print LOCSTATS join("\t", $xloc, ((@$reflst>0) ? join(',',@$reflst) : '-'), $schr.":$cstart-$cend", join(',', @$tlst),
171     $max_tlen, $max_exons, $max_orf, sprintf('%.5f', $cons_avg), sprintf('%.5f', $cons_stdev),
172     sprintf0($modelABC_fpkm), sprintf0($modelGCB_fpkm), (map { sprintf0($_) } @sample_fpkms) )."\n";
173     }
174     close(LOCSTATS);
175    
176     ## transcript data:
177     #open(TSTATS, ">$outprefix.t_stats.tab") || die("Error creating $outprefix.t_stats.tab!\n");
178     #print TSTATS join("\t", qw(xloc refcode:refId|refGene location exons len num_exons cons_mean const_stdev ABC_fpkm GCB_fpkm),
179     # @fpkmsmp)."\n";
180     #foreach my $t (keys(%tdat)) {
181     # # 0 1 2 3 4 5 6 7 8 9 10 11
182     # my ($xloc, $chrloc, $ref_code, $ref_id, $ref_gene, $exonlst, $covlen, $maxorf_len, $cons_avg, $cons_stdev, $modelABC_fpkm, $modelGCB_fpkm,
183     # @sample_fpkms)=@{$tdat{$t}};
184     # print TSTATS join("\t", $t, "$ref_code:$ref_id|$ref_gene", $chrloc, join(',', (map { $_->[0].'-'.$_->[1] } @$exonlst)), $covlen, $maxorf_len, scalar(@$exonlst),
185     # $cons_avg, $cons_stdev, $modelABC_fpkm, $modelGCB_fpkm, @sample_fpkms)."\n";
186     # }
187     #close(TSTATS);
188    
189     print STDERR "Done.\n";
190    
191     # --
192     #if ($outfile) {
193     # select(STDOUT);
194     # close(OUTF);
195     # }
196    
197     #************ Subroutines **************
198    
199     sub sprintf0 {
200     return ($_[0]==0.0) ? '0' : sprintf('%.5f',$_[0]);
201     }
202    
203     sub loadGff {
204     my ($fh, $recs)=@_; # $hr=\%gffrecs
205     # --------------------- 0 1 2 3 4 5 6 7 8 9 10
206     # recID => [ chr, strand, xloc, gene_name, tdescr, fstart, fend, [@exons], [@cds], class_code, nearest_ref_id ]
207     while (<$fh>) {
208     next if m/^\s*#/;
209     chomp;
210     my ($chr, $track, $f, $fstart, $fend, $fscore, $strand, $frame, $lnum)=split(/\t/);
211     next unless $fstart>1 && $lnum;
212     next if $f eq 'gene' || $f eq 'locus'; # Warning: skipping any 'gene' or 'locus' features, unconditionally
213     my ($gname, $tdescr, $xloc, $classcode, $refid);
214     if ($lnum=~m/nearest_ref[= ]+"?([^;"]+)/) {
215     $refid=$1;
216     }
217     if ($lnum=~m/class_code[\.= ]+"?([\w=])/) {
218     $classcode=$1;
219     }
220     my $gff3_ID;
221     my $gff3_Parent;
222     ($fstart, $fend)=($fend, $fstart) if $fend<$fstart;
223     ($gff3_ID)=($lnum=~m/\bID=([^;]+)/);
224     ($gff3_Parent)=($lnum=~m/\bParent=([^;]+)/);
225     if ($gff3_ID || $gff3_Parent) { # GFF format
226     $gff3_ID=~tr/"//d; #"
227     $gff3_Parent=~tr/"//d; #"
228     $gff3_Parent='' if ($f eq 'mRNA');
229     if ($gff3_ID && !$gff3_Parent) { #top level feature
230     if ($f=~m/RNA/i || $f=~/gene/) {
231     # try to parse the description, if any
232     if ($lnum=~m/\b(?:descr|tophit|info|product)\s*=\s*"?([^;"]+)/i) {
233     $tdescr=$1;
234     }
235     #elsif ($lnum=~m/Name[= ]\s*"?([^;"]+)/) {
236     # $tdescr=$1;
237     # }
238     if ($lnum=~m/\bgene_name[ =]+"?([^;"]+)/i) {
239     $gname=$1;
240     }
241     elsif ($lnum=~m/Name[= ]+"?([^;"]+)/) {
242     $gname=$1;
243     }
244     if ($lnum=~m/locus="?([^;"]+)/i || $lnum=~m/loc="?([^;"]+)/i) {
245     $xloc=$1;
246     }
247     $tdescr='' if ($tdescr eq $gname);
248     $gname='' if $gname eq $gff3_ID;
249     }
250     die("Error: duplicate feature $gff3_ID on $chr\n") if (exists($$recs{"$chr|$gff3_ID"}));
251     #my $recID="$chr|$gff3_ID";
252     my $recID=$gff3_ID;
253     $gname=$refid if $refid;
254     $$recs{$recID} = [$chr, $strand, $xloc, $gname, $tdescr, $fstart, $fend, [], [], $classcode, $refid ];
255     $loc_codes{$xloc}.=$classcode if $classcode;
256     next;
257     } # parent/top-level feature
258     } #GFF
259     else { #GTF format
260     next if ($f eq 'transcript'); #exception: GTF with parent 'transcript' feature
261     }
262     # -------------- exon/CDS line here:
263     my $recID;
264     if ($track=~/^jigsaw/ && $lnum=~m/^\d+$/) {
265     $recID=$chr.'.jsm.'.$lnum;
266     }
267     elsif ($lnum=~m/Parent="?(['\:\w\|\-\.]+)/) {
268     $recID=$1;
269     }
270     elsif ($lnum=~m/transcript_id[= ]+"?(['\:\w\.\|\-]+)/) {
271     $recID=$1;
272     }
273     else {
274     die("Error: cannot parse locus/transcript name from input line:\n$_\n");
275     }
276     if ($lnum=~m/gene_id[= ]+"?(['\:\w\.\|\-]+)/) {
277     $xloc=$1; # cuffcompare writes XLOC as gene_id
278     }
279     if ($lnum=~m/\bgene_name[ =]+"?([^;"]+)/i) {
280     $gname=$1;
281     }
282     $tdescr='' if index($recID, $tdescr)>=0;
283     $gname='' if index($recID, $gname)>=0;
284     #$recID=$chr.'|'.$recID;
285     my $ld = $$recs{$recID};
286     if ($ld) { #existing entry
287     my $i=($f eq 'CDS') ? 8 : 7;
288     my ($lstart, $lend)=($$ld[5], $$ld[6]);
289     $$ld[5]=$fstart if $fstart<$lstart;
290     $$ld[6]=$fend if $fend>$lend;
291     push(@{$$ld[$i]}, [$fstart, $fend, $fscore]);
292     }
293     else { # first time seeing this locus/gene
294     $$recs{$recID} = ($f eq 'CDS') ?
295     [$chr, $strand, $xloc, $gname, $tdescr, $fstart, $fend, [], [[$fstart, $fend, $fscore]], $classcode, $refid ] :
296     [$chr, $strand, $xloc, $gname, $tdescr, $fstart, $fend, [[$fstart, $fend, $fscore]], [], $classcode, $refid ] ;
297     # 0 1 2 3 4 5 6 7 (exons) 8 (CDS) 9 10
298     $loc_codes{$xloc}.=$classcode if $classcode;
299     }
300     } #while <$fh>
301     } # loadGff
302    
303     sub sortByLoc {
304     my $da=$gffrecs{$a};
305     my $db=$gffrecs{$b};
306     if ($$da[0] eq $$db[0]) {
307     return ($$da[5]==$$db[5]) ? $$da[6] <=> $$db[6] : $$da[5] <=> $$db[5] ;
308     }
309     else { return $$da[0] cmp $$db[0] ; }
310     }
311    
312     sub processGffRecs {
313     #return if keys(%recs)==0;
314     my ($recs, $tstats, $rlist, $loclist)=@_;
315     my @recs_keys;
316     my %lseen; #seen xloci
317     unless ($rlist) {
318     @recs_keys=keys(%$recs);
319     $rlist=\@recs_keys;
320     }
321     foreach my $gffid (@$rlist) {
322     my $gffdata=$$recs{$gffid};
323     # my $ld=$recs{$l} || die ("Error: locus $l found in list but not in hash!\n");
324     # 0 1 2 3 4 5 6 7 8 9 10
325     my ($chr, $strand, $xloc, $gname, $descr, $lstart, $lend, $er, $cr, $classcode, $refid) = @$gffdata;
326     # my ($mstart,$mend)=($lstart, $lend);
327     next if exists($bad_codes{$classcode}) || exists($bad_loci{$xloc});
328     my $CDexons=0;
329     my @ex;
330     my @cds;
331     #some records might lack exons, but have only CDS segments (e.g. mitochondrial genes)
332     if (@$er<1 && @$cr>0) {
333     @ex = sort { $a->[0] <=> $b->[0] } @$cr;
334     $CDexons=1;
335     }
336     else {
337     @ex = sort { $a->[0] <=> $b->[0] } @$er;
338     }
339     # --------------
340     # get the more accurate version of the start-end coords for the feature
341     my $covlen=0;
342     map { $covlen+=$_->[1]-$_->[0]+1 } @ex;
343     my ($mstart, $mend) = ($ex[0]->[0], $ex[-1]->[1]);
344     my ($cavg, $cstd)=('-','-');
345     my $cd=$tcons{$gffid};
346     ($cavg, $cstd)=@$cd if ($cd);
347     my $orfmax=$torf{$gffid} || '0';
348     my $numexons=scalar(@ex);
349     my ($ABC_fpkm, $GCB_fpkm)=(0,0);
350     map { $ABC_fpkm+=$tfpkm{$gffid}->[$s2idx{$_}] } @ABCmodels;
351     map { $GCB_fpkm+=$tfpkm{$gffid}->[$s2idx{$_}] } @GCBmodels;
352     if ($GCB_fpkm == 0.0 && $ABC_fpkm == 0.0) {
353     print STDERR "Warning: transfrag $gffid has 0 FPKM in all ABC and GCB model samples!\n";
354     next;
355     }
356     #$tdat{$gffid}=[$xloc, $strand.$chr.":$mstart-$mend", $classcode, $refid, $gname, [@ex], $covlen, $orfmax, $cavg, $cstd, $ABC_fpkm, $GCB_fpkm, @{$tfpkm{$gffid}}];
357     # 0 1 2 3 4 5 6 7 8 9 10 11 12..
358     my $refstatus = $refid ? "$classcode:$refid|$gname" : '-';
359    
360     print $tstats join("\t", $gffid, $refstatus, $strand.$chr.":$mstart-$mend",
361     $xloc, $covlen, scalar(@ex), $orfmax,
362     sprintf('%.5f',$cavg), sprintf('%.5f',$cstd), sprintf0($ABC_fpkm),
363     sprintf0($GCB_fpkm), (map { sprintf0($_) } @{$tfpkm{$gffid}}) )."\n";
364    
365     #my $td=$tdat{$gffid};
366     unless (exists($lseen{$xloc})) {
367     $lseen{$xloc}=1;
368     push(@$loclist, $xloc);
369     }
370     my $ld=$ldat{$xloc};
371     if ($ld) { #update existing $ldat
372     #$$ld[0].=$classcode;
373     push(@{$$ld[0]},"$refid|$gname") if ($classcode eq 'j');
374     push(@{$$ld[4]}, $gffid);
375     $$ld[2]=$mstart if $mstart<$$ld[2];
376     $$ld[3]=$mend if $mend>$$ld[3];
377     $$ld[5]=$covlen if $covlen>$$ld[5];
378     $$ld[6]=$orfmax if int($orfmax)>$$ld[6];
379     $$ld[7]=$numexons if $numexons>$$ld[7];
380     $$ld[8]+=$cavg; $$ld[8]/=2;
381     $$ld[9]+=$cstd; $$ld[9]/=2;
382     $$ld[10]+=$ABC_fpkm;
383     $$ld[11]+=$GCB_fpkm;
384     for (my $i=0;$i<@fpkmsmp; $i++) {
385     #$$ld[10+$i]+=$$td[10+$i];
386     $$ld[12+$i]+=$tfpkm{$gffid}->[$i];
387     }
388     }
389     else { #new xloc
390     # 0 1 2 3 4 5 6 7 8 9 10 11 12 ..
391     $ldat{$xloc}=[[], $strand.$chr, $mstart, $mend, [$gffid], $covlen, $orfmax, $numexons, $cavg, $cstd, $ABC_fpkm, $GCB_fpkm, @{$tfpkm{$gffid}}];
392     }
393     #my @exonlst = map { $_->[0].'-'.$_->[1] } @ex;
394     #my ($avg, $std)=fetchCons($chr, \@ex);
395     #substr($gffid, 0, length($chr)+1)='';
396     #print join("\t", $gffid, $chr.$strand.':'.join(',',@exonlst),$avg, $std)."\n";
397     } #for each stored transcript
398     }

Properties

Name Value
svn:executable *