ViewVC Help
View File | Revision Log | Show Annotations | View Changeset | Root Listing
root/gclib/scripts/cuff_loci_stats.pl
Revision: 24
Committed: Tue Jul 26 21:46:39 2011 UTC (8 years, 4 months ago) by gpertea
File size: 16300 byte(s)
Log Message:
Line File contents
1 #!/usr/bin/perl
2 use strict;
3 use Getopt::Std;
4 use FindBin;use lib $FindBin::Bin;
5
6 my $usage = q/Usage:
7 cuff_loci_stats.pl [-o <out_prefix>] \
8 <cuffcmp.combined.gtf> <isoforms.fpkm_tracking> tcons.orflen [<tcons_phastCons.tab>]
9
10 <cuff_loci.tab> is the [filtered] output of cuffdiff_loc_check.pl
11 <tcons_phastCons.tab> can be generated by hg19_getcons.pl from the <cuffcmp.combined.gtf> file
12 /;
13 umask 0002;
14 getopts('o:') || die($usage."\n");
15 #my ($f_loci, $f_tracking, $f_gtf, $f_fpkm)=@ARGV;
16 my ($f_gtf, $f_fpkm, $f_orflen, $f_cons)=@ARGV;
17 die("$usage\n") unless $f_fpkm;
18 foreach my $f (@ARGV) {
19 die("Error: file $f not found!\n") unless -f $f;
20 }
21 my $outprefix=$Getopt::Std::opt_o || 'cuff_stats';
22 # --
23 my %samples=(
24 'BJAB'=>['GCB',250],
25 'Dogum'=>['GCB', 250],
26 'DOHH2'=>['GCB', 250],
27 'Gumbus'=>['GCB',250],
28 'HT'=>['GCB',250],
29 'Karpas422'=>['GCB',250],
30 'Ly1'=>['GCB',150],
31 'Ly18'=>['GCB',250],
32 'Ly19'=>['GCB',250],
33 'Ly2'=>['GCB',250],
34 'Ly4'=>['GCB',250],
35 'Ly7'=>['GCB',250],
36 'Ly8'=>['GCB',150],
37 'Mieu'=>['GCB',250],
38 'Pfieffer'=>['GCB',250],
39 'ULA'=>['GCB',250],
40 'HBL1'=>['ABC',150], # <====== ABC from here on
41 'TMD8'=>['ABC',150],
42 'Ly10'=>['ABC',150],
43 'Ly3'=>['ABC',250],
44 'RIVA'=>['ABC',250],
45 'SUDHL2'=>['ABC',150],
46 'U2932'=>['ABC',150]
47 );
48 # ABC : ( HBL1 TMD8 Ly10 Ly3 RIVA SUDHL2 U2932 ) # 7 samples
49 # GCB : ( BJAB Dogum DOHH2 Gumbus HT Karpas422 Ly1 Ly18 Ly19 Ly2 Ly4 Ly7 Ly8 Mieu Pfieffer ULA ) # 16 samples
50 #----
51 # ABC merged sample: TMD8 + HBL1 + Ly3 + Ly10
52 # GCB merged sample: BJAB + DOHH2 + Ly1
53
54 # model samples:
55 my @ABCmodels=qw(HBL1 TMD8 Ly3 Ly10);
56 my @GCBmodels=qw(BJAB DOHH2 Dogum HT Karpas422 Ly1 Ly18 Ly19 Ly4 Ly2 Ly7 Ly8 Mieu Pfieffer ULA);
57 my %mABC; @mABC{@ABCmodels}=();
58 my %mGCB; @mGCB{@GCBmodels}=();
59 my %tcons; # conservation data for each transfrag (optional)
60 my %loc_codes; # keep track of class codes for all transfrags in a loc
61 my %bad_codes;
62 my %bad_loci; #for novel loci, if any transfrags have 'x', 's' or 'o' codes
63 @bad_codes{qw(x s o p)}=();
64 print STDERR "Loading conservation data from $f_cons..\n";
65 if ($f_cons) {
66 open(CONS, $f_cons) || die("Error opening $f_cons!\n");
67 while (<CONS>) {
68 chomp;
69 my @t=split(/\t/);
70 next unless $t[3];
71 $tcons{$t[0]}=[$t[2],$t[3]];
72 }
73 close(CONS);
74 }
75 my %torf; # conservation data for each transfrag (optional)
76 if ($f_orflen) {
77 print STDERR "Loading ORF data from $f_orflen..\n";
78 open(TORF, $f_orflen) || die("Error opening $f_orflen!\n");
79 while (<TORF>) {
80 chomp;
81 my @t=split(/\t/);
82 next unless $t[1]>3;
83 $torf{$t[0]}=$t[1];
84 }
85 close(TORF);
86 }
87 my %tfpkm; # tid=>[@fpkmsmp_values] values are in the same order as @fpkmsmp
88 my @fpkmsmp; # array of sample names, in the same order as found in the <isoforms.fpkm_tracking> header
89 my %s2idx; # sample_name => index in @fpkmsmp
90 print STDERR "Loading FPKM data from $f_fpkm..\n";
91 open(FPKM, $f_fpkm) || die("Error opening $f_fpkm!\n");
92 my $fheader=<FPKM>;
93 chomp($fheader);
94 #TODO: load here all the fpkm data, with header names in fpkmsmp, and write subroutines to retrieve sums for ABC and GCB model samples
95 my @colidx; #index of columns in the isoforms.fpkm_tracking file
96 my $i=0;
97 foreach my $fld (split(/\t/,$fheader)) {
98 my $sn=$fld;
99 if ($sn=~s/_FPKM$//) {
100 push(@fpkmsmp, $sn);
101 $s2idx{$sn}=$#fpkmsmp;
102 push(@colidx, $i);
103 }
104 $i++;
105 }
106 while (<FPKM>) {
107 chomp;
108 my @t=split(/\t/);
109 $tfpkm{$t[0]}=[@t[@colidx]];
110 }
111 #now to get FPKM of a transcript for a specific sample name: $tfpkm{$tid}->[$s2idx{$sample_name}]
112 close(FPKM);
113
114 # --> model ABC vs. model GCB samples:
115 # allABC/HBL1.bam allABC/TMD8.bam allABC/Ly3.bam allABC/Ly10.bam
116 # allGCB/BJAB.bam allGCB/DOHH2.bam allGCB/Dogum.bam allGCB/HT.bam allGCB/Karpas422.bam allGCB/Ly1.bam allGCB/Ly18.bam allGCB/Ly19.bam allGCB/Ly4.bam allGCB/Ly2.bam allGCB/Ly7.bam allGCB/Ly8.bam allGCB/Mieu.bam allGCB/Pfieffer.bam allGCB/ULA.bam
117 #
118 # --> all ABC vs all GCB samples:
119 # allABC/HBL1.bam allABC/Ly10.bam allABC/Ly3.bam allABC/RIVA.bam allABC/SUDHL2.bam allABC/TMD8.bam allABC/U2932.bam
120 # allGCB/BJAB.bam allGCB/Dogum.bam allGCB/DOHH2.bam allGCB/Gumbus.bam allGCB/HT.bam allGCB/Karpas422.bam allGCB/Ly18.bam allGCB/Ly19.bam allGCB/Ly1.bam allGCB/Ly2.bam allGCB/Ly4.bam allGCB/Ly7.bam allGCB/Ly8.bam allGCB/Mieu.bam allGCB/Pfieffer.bam allGCB/ULA.bam
121 # 0 1 2 3 4 5 6 7 8 9
122 my %ldat; # locus => [ ref_genes, <strand>chr, chrstart, chrstop, [@tcons], max_tlen, maxorf_len, max_numexons, cons_avg, cons_stdev,
123 # modelABC_fpkm, modelGCB_fpkm, sample_fpkm_sums ]
124 # 10 11 12..
125 #
126 # 0 1 2 3 4 5 6 7 8 9 10 11 12...
127 #my %tdat; # tcons => [locus, <strand>chr:start-stop, ref_code, ref_id, ref_gene, [@exons], covlen, maxorf_len, cons_avg, cons_stdev, modelABC_fpkm, modelGCB_fpkm, per_sample_fpkm]
128 my %gffrecs;
129 # --------------------- 0 1 2 3 4 5 6 7 8 9 10
130 # recID => [ chr, strand, xloc, gene_name, tdescr, fstart, fend, [@exons], [@cds], class_code, ref_id ]
131
132 my $gffh;
133 open($gffh, $f_gtf) || die("Error: failed to open $f_gtf. $!\n");
134 print STDERR "Loading GTF data from $f_gtf..\n";
135 loadGff($gffh, \%gffrecs);
136 #my @sorted_features=sort sortByLoc keys(%gffrecs);
137 print STDERR "Sorting by genomic location..\n";
138 my @sorted_recs=sort sortByLoc keys(%gffrecs);
139 print STDERR "Processing GTF records and writing transfrag stats..\n";
140
141 ## transcript data:
142 my $tfh;
143 open($tfh, ">$outprefix.t_stats.tab") || die("Error creating $outprefix.t_stats.tab!\n");
144 print $tfh join("\t", qw(transfrag ref_info location xlocus tlen num_exons max_ORF cons_mean cons_stdev ABC_fpkm GCB_fpkm),
145 @fpkmsmp)."\n";
146 # - set up some loci exclusion flags:
147 foreach my $xloc (keys(%loc_codes)) {
148 my $lcodes=$loc_codes{$xloc};
149 if (index($lcodes,'j')<0 && $lcodes=~tr/xso//) {
150 $bad_loci{$xloc}=1;
151 #print STDERR "..declaring $xloc bad..\n";
152 }
153 }
154 my $xloc_list=[];
155 processGffRecs(\%gffrecs, $tfh, \@sorted_recs, $xloc_list); #populates %ldat and writes t_stats, uses %tcons too
156 close($tfh);
157
158 #now print %ldat and %tdat
159 print STDERR "Writing XLOCi stats..\n";
160 open(LOCSTATS, ">$outprefix.xloc_stats.tab") || die("Error creating $outprefix.xloc_stats.tab!\n");
161 #print locus table header:
162 print LOCSTATS join("\t", qw(xlocus ref_genes location tlist max_tlen max_exons max_ORF cons_mean cons_stdev ABC_fpkm GCB_fpkm),
163 @fpkmsmp)."\n";
164 #foreach my $xloc (keys(%ldat)) {
165 foreach my $xloc (@$xloc_list) {
166 # 0 1 2 3 4 5 6 7 8 9
167 my ($reflst, $schr, $cstart, $cend, $tlst, $max_tlen, $max_orf, $max_exons, $cons_avg, $cons_stdev,
168 # 10 11 12..
169 $modelABC_fpkm, $modelGCB_fpkm, @sample_fpkms)=@{$ldat{$xloc}};
170 print LOCSTATS join("\t", $xloc, ((@$reflst>0) ? join(',',@$reflst) : '-'), $schr.":$cstart-$cend", join(',', @$tlst),
171 $max_tlen, $max_exons, $max_orf, sprintf('%.5f', $cons_avg), sprintf('%.5f', $cons_stdev),
172 sprintf0($modelABC_fpkm), sprintf0($modelGCB_fpkm), (map { sprintf0($_) } @sample_fpkms) )."\n";
173 }
174 close(LOCSTATS);
175
176 ## transcript data:
177 #open(TSTATS, ">$outprefix.t_stats.tab") || die("Error creating $outprefix.t_stats.tab!\n");
178 #print TSTATS join("\t", qw(xloc refcode:refId|refGene location exons len num_exons cons_mean const_stdev ABC_fpkm GCB_fpkm),
179 # @fpkmsmp)."\n";
180 #foreach my $t (keys(%tdat)) {
181 # # 0 1 2 3 4 5 6 7 8 9 10 11
182 # my ($xloc, $chrloc, $ref_code, $ref_id, $ref_gene, $exonlst, $covlen, $maxorf_len, $cons_avg, $cons_stdev, $modelABC_fpkm, $modelGCB_fpkm,
183 # @sample_fpkms)=@{$tdat{$t}};
184 # print TSTATS join("\t", $t, "$ref_code:$ref_id|$ref_gene", $chrloc, join(',', (map { $_->[0].'-'.$_->[1] } @$exonlst)), $covlen, $maxorf_len, scalar(@$exonlst),
185 # $cons_avg, $cons_stdev, $modelABC_fpkm, $modelGCB_fpkm, @sample_fpkms)."\n";
186 # }
187 #close(TSTATS);
188
189 print STDERR "Done.\n";
190
191 # --
192 #if ($outfile) {
193 # select(STDOUT);
194 # close(OUTF);
195 # }
196
197 #************ Subroutines **************
198
199 sub sprintf0 {
200 return ($_[0]==0.0) ? '0' : sprintf('%.5f',$_[0]);
201 }
202
203 sub loadGff {
204 my ($fh, $recs)=@_; # $hr=\%gffrecs
205 # --------------------- 0 1 2 3 4 5 6 7 8 9 10
206 # recID => [ chr, strand, xloc, gene_name, tdescr, fstart, fend, [@exons], [@cds], class_code, nearest_ref_id ]
207 while (<$fh>) {
208 next if m/^\s*#/;
209 chomp;
210 my ($chr, $track, $f, $fstart, $fend, $fscore, $strand, $frame, $lnum)=split(/\t/);
211 next unless $fstart>1 && $lnum;
212 next if $f eq 'gene' || $f eq 'locus'; # Warning: skipping any 'gene' or 'locus' features, unconditionally
213 my ($gname, $tdescr, $xloc, $classcode, $refid);
214 if ($lnum=~m/nearest_ref[= ]+"?([^;"]+)/) {
215 $refid=$1;
216 }
217 if ($lnum=~m/class_code[\.= ]+"?([\w=])/) {
218 $classcode=$1;
219 }
220 my $gff3_ID;
221 my $gff3_Parent;
222 ($fstart, $fend)=($fend, $fstart) if $fend<$fstart;
223 ($gff3_ID)=($lnum=~m/\bID=([^;]+)/);
224 ($gff3_Parent)=($lnum=~m/\bParent=([^;]+)/);
225 if ($gff3_ID || $gff3_Parent) { # GFF format
226 $gff3_ID=~tr/"//d; #"
227 $gff3_Parent=~tr/"//d; #"
228 $gff3_Parent='' if ($f eq 'mRNA');
229 if ($gff3_ID && !$gff3_Parent) { #top level feature
230 if ($f=~m/RNA/i || $f=~/gene/) {
231 # try to parse the description, if any
232 if ($lnum=~m/\b(?:descr|tophit|info|product)\s*=\s*"?([^;"]+)/i) {
233 $tdescr=$1;
234 }
235 #elsif ($lnum=~m/Name[= ]\s*"?([^;"]+)/) {
236 # $tdescr=$1;
237 # }
238 if ($lnum=~m/\bgene_name[ =]+"?([^;"]+)/i) {
239 $gname=$1;
240 }
241 elsif ($lnum=~m/Name[= ]+"?([^;"]+)/) {
242 $gname=$1;
243 }
244 if ($lnum=~m/locus="?([^;"]+)/i || $lnum=~m/loc="?([^;"]+)/i) {
245 $xloc=$1;
246 }
247 $tdescr='' if ($tdescr eq $gname);
248 $gname='' if $gname eq $gff3_ID;
249 }
250 die("Error: duplicate feature $gff3_ID on $chr\n") if (exists($$recs{"$chr|$gff3_ID"}));
251 #my $recID="$chr|$gff3_ID";
252 my $recID=$gff3_ID;
253 $gname=$refid if $refid;
254 $$recs{$recID} = [$chr, $strand, $xloc, $gname, $tdescr, $fstart, $fend, [], [], $classcode, $refid ];
255 $loc_codes{$xloc}.=$classcode if $classcode;
256 next;
257 } # parent/top-level feature
258 } #GFF
259 else { #GTF format
260 next if ($f eq 'transcript'); #exception: GTF with parent 'transcript' feature
261 }
262 # -------------- exon/CDS line here:
263 my $recID;
264 if ($track=~/^jigsaw/ && $lnum=~m/^\d+$/) {
265 $recID=$chr.'.jsm.'.$lnum;
266 }
267 elsif ($lnum=~m/Parent="?(['\:\w\|\-\.]+)/) {
268 $recID=$1;
269 }
270 elsif ($lnum=~m/transcript_id[= ]+"?(['\:\w\.\|\-]+)/) {
271 $recID=$1;
272 }
273 else {
274 die("Error: cannot parse locus/transcript name from input line:\n$_\n");
275 }
276 if ($lnum=~m/gene_id[= ]+"?(['\:\w\.\|\-]+)/) {
277 $xloc=$1; # cuffcompare writes XLOC as gene_id
278 }
279 if ($lnum=~m/\bgene_name[ =]+"?([^;"]+)/i) {
280 $gname=$1;
281 }
282 $tdescr='' if index($recID, $tdescr)>=0;
283 $gname='' if index($recID, $gname)>=0;
284 #$recID=$chr.'|'.$recID;
285 my $ld = $$recs{$recID};
286 if ($ld) { #existing entry
287 my $i=($f eq 'CDS') ? 8 : 7;
288 my ($lstart, $lend)=($$ld[5], $$ld[6]);
289 $$ld[5]=$fstart if $fstart<$lstart;
290 $$ld[6]=$fend if $fend>$lend;
291 push(@{$$ld[$i]}, [$fstart, $fend, $fscore]);
292 }
293 else { # first time seeing this locus/gene
294 $$recs{$recID} = ($f eq 'CDS') ?
295 [$chr, $strand, $xloc, $gname, $tdescr, $fstart, $fend, [], [[$fstart, $fend, $fscore]], $classcode, $refid ] :
296 [$chr, $strand, $xloc, $gname, $tdescr, $fstart, $fend, [[$fstart, $fend, $fscore]], [], $classcode, $refid ] ;
297 # 0 1 2 3 4 5 6 7 (exons) 8 (CDS) 9 10
298 $loc_codes{$xloc}.=$classcode if $classcode;
299 }
300 } #while <$fh>
301 } # loadGff
302
303 sub sortByLoc {
304 my $da=$gffrecs{$a};
305 my $db=$gffrecs{$b};
306 if ($$da[0] eq $$db[0]) {
307 return ($$da[5]==$$db[5]) ? $$da[6] <=> $$db[6] : $$da[5] <=> $$db[5] ;
308 }
309 else { return $$da[0] cmp $$db[0] ; }
310 }
311
312 sub processGffRecs {
313 #return if keys(%recs)==0;
314 my ($recs, $tstats, $rlist, $loclist)=@_;
315 my @recs_keys;
316 my %lseen; #seen xloci
317 unless ($rlist) {
318 @recs_keys=keys(%$recs);
319 $rlist=\@recs_keys;
320 }
321 foreach my $gffid (@$rlist) {
322 my $gffdata=$$recs{$gffid};
323 # my $ld=$recs{$l} || die ("Error: locus $l found in list but not in hash!\n");
324 # 0 1 2 3 4 5 6 7 8 9 10
325 my ($chr, $strand, $xloc, $gname, $descr, $lstart, $lend, $er, $cr, $classcode, $refid) = @$gffdata;
326 # my ($mstart,$mend)=($lstart, $lend);
327 next if exists($bad_codes{$classcode}) || exists($bad_loci{$xloc});
328 my $CDexons=0;
329 my @ex;
330 my @cds;
331 #some records might lack exons, but have only CDS segments (e.g. mitochondrial genes)
332 if (@$er<1 && @$cr>0) {
333 @ex = sort { $a->[0] <=> $b->[0] } @$cr;
334 $CDexons=1;
335 }
336 else {
337 @ex = sort { $a->[0] <=> $b->[0] } @$er;
338 }
339 # --------------
340 # get the more accurate version of the start-end coords for the feature
341 my $covlen=0;
342 map { $covlen+=$_->[1]-$_->[0]+1 } @ex;
343 my ($mstart, $mend) = ($ex[0]->[0], $ex[-1]->[1]);
344 my ($cavg, $cstd)=('-','-');
345 my $cd=$tcons{$gffid};
346 ($cavg, $cstd)=@$cd if ($cd);
347 my $orfmax=$torf{$gffid} || '0';
348 my $numexons=scalar(@ex);
349 my ($ABC_fpkm, $GCB_fpkm)=(0,0);
350 map { $ABC_fpkm+=$tfpkm{$gffid}->[$s2idx{$_}] } @ABCmodels;
351 map { $GCB_fpkm+=$tfpkm{$gffid}->[$s2idx{$_}] } @GCBmodels;
352 if ($GCB_fpkm == 0.0 && $ABC_fpkm == 0.0) {
353 print STDERR "Warning: transfrag $gffid has 0 FPKM in all ABC and GCB model samples!\n";
354 next;
355 }
356 #$tdat{$gffid}=[$xloc, $strand.$chr.":$mstart-$mend", $classcode, $refid, $gname, [@ex], $covlen, $orfmax, $cavg, $cstd, $ABC_fpkm, $GCB_fpkm, @{$tfpkm{$gffid}}];
357 # 0 1 2 3 4 5 6 7 8 9 10 11 12..
358 my $refstatus = $refid ? "$classcode:$refid|$gname" : '-';
359
360 print $tstats join("\t", $gffid, $refstatus, $strand.$chr.":$mstart-$mend",
361 $xloc, $covlen, scalar(@ex), $orfmax,
362 sprintf('%.5f',$cavg), sprintf('%.5f',$cstd), sprintf0($ABC_fpkm),
363 sprintf0($GCB_fpkm), (map { sprintf0($_) } @{$tfpkm{$gffid}}) )."\n";
364
365 #my $td=$tdat{$gffid};
366 unless (exists($lseen{$xloc})) {
367 $lseen{$xloc}=1;
368 push(@$loclist, $xloc);
369 }
370 my $ld=$ldat{$xloc};
371 if ($ld) { #update existing $ldat
372 #$$ld[0].=$classcode;
373 push(@{$$ld[0]},"$refid|$gname") if ($classcode eq 'j');
374 push(@{$$ld[4]}, $gffid);
375 $$ld[2]=$mstart if $mstart<$$ld[2];
376 $$ld[3]=$mend if $mend>$$ld[3];
377 $$ld[5]=$covlen if $covlen>$$ld[5];
378 $$ld[6]=$orfmax if int($orfmax)>$$ld[6];
379 $$ld[7]=$numexons if $numexons>$$ld[7];
380 $$ld[8]+=$cavg; $$ld[8]/=2;
381 $$ld[9]+=$cstd; $$ld[9]/=2;
382 $$ld[10]+=$ABC_fpkm;
383 $$ld[11]+=$GCB_fpkm;
384 for (my $i=0;$i<@fpkmsmp; $i++) {
385 #$$ld[10+$i]+=$$td[10+$i];
386 $$ld[12+$i]+=$tfpkm{$gffid}->[$i];
387 }
388 }
389 else { #new xloc
390 # 0 1 2 3 4 5 6 7 8 9 10 11 12 ..
391 $ldat{$xloc}=[[], $strand.$chr, $mstart, $mend, [$gffid], $covlen, $orfmax, $numexons, $cavg, $cstd, $ABC_fpkm, $GCB_fpkm, @{$tfpkm{$gffid}}];
392 }
393 #my @exonlst = map { $_->[0].'-'.$_->[1] } @ex;
394 #my ($avg, $std)=fetchCons($chr, \@ex);
395 #substr($gffid, 0, length($chr)+1)='';
396 #print join("\t", $gffid, $chr.$strand.':'.join(',',@exonlst),$avg, $std)."\n";
397 } #for each stored transcript
398 }

Properties

Name Value
svn:executable *