ViewVC Help
View File | Revision Log | Show Annotations | View Changeset | Root Listing
root/gclib/scripts/fisher_exact_test.pl
Revision: 24
Committed: Tue Jul 26 21:46:39 2011 UTC (8 years, 1 month ago) by gpertea
File size: 11212 byte(s)
Log Message:
Line File contents
1 #!/usr/bin/perl
2 use strict;
3 use Getopt::Std;
4 use FindBin;use lib $FindBin::Bin;
5 use Fisher qw(fishers_exact log_factorial);
6
7
8 my $usage = q/Usage:
9 fisher_exact_test.pl [-T] a1 b1 a2 b2
10 Computes Fisher's exact test p-values for 2x2 matrix:
11
12 a1 b1
13
14 a2 b2
15
16 -T option will make the input values to be taken in terms of totals:
17
18 t1 t2 n1 n2
19
20 ..where n1 is a subset of t1 and n2 a subset of t2, i.e.
21 t1=a1+b1 and t2=a2+b2
22
23 /;
24 umask 0002;
25 getopts('To:') || die($usage."\n");
26 die($usage." Error: exactly 4 values are needed (totals first)!\n") unless @ARGV==4;
27 my $totals=$Getopt::Std::opt_T;
28 my @fncache=(0,0);
29
30 # -- needed by Audic-Calverie: calc_audic()
31 my $MAXIT = 500;
32 my $EPS = 3.0E-30;
33 my $FPMIN = 1.0E-30;
34 my @COF = ( 76.18009172947146, -86.50532032941677,
35 24.01409824083091, -1.231739572450155,
36 0.1208650973866179E-2,-0.5395239384953E-5 );
37
38
39
40 {
41 my ($a, $b , $c, $d)=@ARGV;
42 if ($totals) {
43 my ($t1, $t2, $n1,$n2)=@ARGV;
44 die("Error: totals should be larger than parts!\n")
45 unless $n1<$t1 && $n2<$t2;
46 ($a, $b , $c, $d)=($n1, $t1-$n1, $n2, $t2-$n2);
47 print STDERR " a b c d = $a $b $c $d\n";
48 #my $ap=audic($t1, $t2, $n1, $n2);
49 my ($ap, $apdir)=calc_audic($n1, $n2, $t1, $t2, 1);
50 print ">>>> Audic & Claverie : $ap ($apdir)\n";
51 }
52 my $FisherProb = ProbCTable($a, $b , $c, $d);
53 print "-----> Fisher Probability: $FisherProb\n";
54 my $fet1=fishers_exact($a,$b,$c,$d);
55 my $fet2=fishers_exact($a,$b,$c,$d, 1);
56 print "Fisher.pm : one-tailed: $fet1\n";
57 print "Fisher.pm : two-tailed: $fet2\n";
58 exit;
59 #--- rest is optional:
60 my $a_b = $a + $b;
61 my $c_d = $c + $d;
62 my $a_c = $a + $c;
63 my $b_d = $b + $d;
64 my $total = $a + $b + $c + $d;
65 my $p1 = $a/$a_b;
66 my $p2 = $c/$c_d;
67 my $deltap = abs($p1 - $p2);
68 my $p = $a_c / $total;
69 my $sd = sqrt($p * (1-$p) * (1/$a_b + 1/$c_d));
70 my $z = abs($p1 - $p2) / $sd;
71 my $zp = abs(ltqnorm($FisherProb));
72 my $z95 = abs(ltqnorm(0.95));
73
74 my $z_ratio = $z / $zp;
75 my $deltap95 = $deltap * ($z95 / $zp);
76 my $ratio95 = $z95 / $zp;
77 print "Deltap:$deltap\tProjectedDeltap95:$deltap95\tRatio:$ratio95\n\n";
78
79 print "$a\t$b\t$a_b\n$c\t$d\t$c_d\n".
80 "$a_c\t$b_d\t$total\nFisherProb: $FisherProb <----\n" .
81 "z:$z\nzp:$zp\nratio:$z_ratio\n$p1\t$p2\t$p\n\n";
82
83 my $p_this_table = ProbOneTable($a , $b , $c , $d);
84 print "This one table p:$p_this_table\n";
85
86 my $mid_p = $FisherProb - $p_this_table/2;
87 print "Mid-p:$mid_p\n\n";
88 }
89
90 #================ SUBROUTINES ============
91
92 sub LnFactorial {
93 my $n = shift;
94 return $fncache[$n] if defined $fncache[$n];
95 return undef if $n < 0;
96 for (my $i = scalar(@fncache); $i <= $n; $i++) {
97 $fncache[$i] = $fncache[$i-1] + log($i);
98 }
99
100 return $fncache[$n];
101 }
102 sub factorial {
103 my $r=1;
104 $r *= $_ foreach 2..$_[0];
105 return $r;
106 }
107
108 #compute Audic & Claverie probability
109 sub audic {
110 my ($n1, $n2, $x, $y)=@_;
111 my $v=$y*log($n2/$n1)+log_factorial($x+$y)-log_factorial($x)-log_factorial($y)-
112 ($x+$y+1)*log(1+$n2/$n1);
113 return exp($v);
114 #my $nratio=$n2/$n1;
115 #my $v=($nratio ** $y)*factorial($x+$y)/(factorial($x)*factorial($y)*((1+$nratio)**($x+$y+1)));
116
117 }
118
119 sub calc_audic {
120 =pod
121
122 =head2 calc_audic $x, $y, $Nx, $Ny, <$signedValue>
123
124 Determines the statistical significance of the difference
125 in tag count (expression) between two libraries. This
126 function uses the method described by Audic and
127 Claverie (1997). This method can be called on an
128 instantiated object, as well as statically.
129
130 B<Arguments>
131
132 I<$x,$y>
133
134 The number of tags in the x- and y-axis
135 libraries, respectively.
136
137 I<$Nx,$Ny>
138
139 The total number of tags in the x- and y-axis
140 libraries, respectively.
141
142 I<$signedValue> (optional)
143
144 A boolean value (>=1 is FALSE). If this flag is
145 set to TRUE, downregulated comparisons will return
146 both a p-value and either +1, -1, or 0 to indicate
147 up/down/same-regulation (i.e. -1 if the expression
148 ratio of tags in the x-axis library(s) is greater
149 than that of the y-axis library(s)). This flag
150 is FALSE by default.
151
152 B<Returns>
153
154 The p-value for the observation. A lower number is
155 more significant. Typically, observations with
156 p <= 0.05 are considered statistically significant.
157
158 If $signedValue is set to TRUE, the function also
159 returns a 0, -1 or +1 to indicate same/down/up-regulation.
160
161 B<Usage>
162
163 # the function is static, so it can be accessed directly
164 my $p = calc_audic( 3, 10, 50000, 60000 );
165
166 # or:
167 my ( $p, $sign ) = calc_audic( 3, 10, 50000, 60000, 1 );
168 if( $p <= 0.05 ) {
169 if( $sign == +1 ) { print "Significantly upregulated.\n"; }
170 if( $sign == -1 ) { print "Significantly downregulated.\n"; }
171 if( $sign == 0 ) { die( "Same expression should never be significant!" ); }
172 }
173
174 =cut
175 my $x = shift;
176 if( !defined( $x ) ) { die( " calc_audic no arguments provided." ); }
177 my $y = shift;
178 my $M = shift; # cf n1
179 my $N = shift; # cf n2
180 my $bSign = shift || 0;
181
182 my $p = $M / ( $M+$N );
183
184 my $thisproba = __betai( $x+1, $y+1, $p );
185 my $thisproba2 = __betai( $y+1, $x+1, 1.0-$p );
186
187 if( $bSign >= 1 ) {
188 my $ratio1 = $x / $M;
189 my $ratio2 = $y / $N;
190 my $sign = 0;
191 if( $ratio1 > $ratio2 ) { $sign = -1; }
192 if( $ratio1 < $ratio2 ) { $sign = +1; }
193 #return ( ( $thisproba < $thisproba2 ? ( 2*$thisproba ) : ( 2*$thisproba2 ) ), $sign );
194 return ( ($thisproba < $thisproba2) ? $thisproba : $thisproba2, $sign );
195 }
196
197 # return ( $thisproba < $thisproba2 ? ( 2*$thisproba ) : ( 2*$thisproba2 ) );
198 return ( $thisproba < $thisproba2) ? $thisproba : $thisproba2 ;
199
200 }
201
202 ###################################
203 # Audic and Claverie C->Perl Port #
204 ###################################
205
206 sub __gammln {
207
208 my $xx = shift;
209
210 my $x = $xx;
211 my $y = $xx;
212
213 my $tmp = $x + 5.5;
214 $tmp -= ( $x + 0.5 ) * log( $tmp );
215 my $ser = 1.000000000190015;
216 for( my $j = 0; $j <= 5; $j++ ) { $ser += $COF[$j] / ++$y; }
217
218 return -$tmp + log( 2.5066282746310005 * $ser / $x );
219
220 }
221
222 sub __betai {
223 my $a = shift;
224 my $b = shift;
225 my $x = shift;
226
227 if( $x < 0.0 || $x > 1.0 ) {
228 die( "Bad x in routine betai." );
229 }
230
231 my $bt;
232
233 if( $x == 0.0 || $x == 1.0 ) {
234 $bt = 0.0;
235 } else {
236 $bt = exp( __gammln( $a+$b ) - __gammln( $a ) - __gammln( $b ) + $a*log( $x ) + $b*log( 1.0-$x ) );
237 }
238
239 if( $x < ( $a+1.0 )/( $a+$b+2.0 ) ) {
240 return $bt * __betacf( $a, $b, $x ) / $a;
241 }
242
243 return 1.0 - $bt * __betacf( $b, $a, 1.0-$x ) / $b;
244
245 }
246
247 sub __fabs {
248 my $x = shift;
249 return ( $x < 0 ? -$x : $x );
250 }
251
252 sub __betacf {
253
254 my $a = shift;
255 my $b = shift;
256 my $x = shift;
257
258 my $qab = $a + $b;
259 my $qap = $a + 1.0;
260 my $qam = $a - 1.0;
261 my $c = 1.0;
262 my $d = 1.0 - $qab * $x / $qap;
263
264 if( __fabs( $d ) < $FPMIN ) { $d = $FPMIN; }
265
266 $d = 1.0 / $d; # inverse d
267 my $h = $d;
268 my $m;
269 for( $m = 1; $m <= $MAXIT; $m++ ) {
270 my $m2 = 2 * $m;
271 my $aa = $m * ( $b-$m ) * $x / ( ( $qam + $m2 ) * ( $a + $m2 ) );
272
273 $d = 1.0 + $aa*$d;
274 if( __fabs( $d ) < $FPMIN ) { $d = $FPMIN; }
275
276 $c = 1.0 + $aa/$c;
277 if( __fabs( $c ) < $FPMIN ) { $c = $FPMIN; }
278
279 $d = 1.0 / $d; # inverse d
280
281 $h *= $d*$c;
282
283 $aa = -($a+$m)*($qab+$m)*$x/(($a+$m2)*($qap+$m2));
284
285 $d = 1.0 + $aa * $d;
286 if( __fabs( $d ) < $FPMIN ) { $d = $FPMIN; }
287
288 $c = 1.0 + $aa / $c;
289 if( __fabs( $c ) < $FPMIN ) { $c = $FPMIN; }
290
291 $d = 1.0 / $d; # inverse d;
292
293 my $del = $d*$c;
294 $h *= $del;
295 if( __fabs( $del-1.0 ) < $EPS ) { last; }
296 }
297
298 if( $m > $MAXIT ) {
299 # die( "a or b too big, or MAXIT too small in __betacf" );
300 print STDERR "a($a) or b($b) too big, or MAXIT($MAXIT) too small in __betacf!\n";
301 }
302 return $h;
303 }
304
305
306
307 # Compute the probability of getting this exact table
308 # using the hypergeometric distribution
309 sub ProbOneTable {
310 my ($a , $b , $c, $d) = @_;
311 my $n = $a + $b + $c + $d;
312 my $LnNumerator = LnFactorial($a+$b)+
313 LnFactorial($c+$d)+
314 LnFactorial($a+$c)+
315 LnFactorial($b+$d);
316
317 my $LnDenominator = LnFactorial($a) +
318 LnFactorial($b) +
319 LnFactorial($c) +
320 LnFactorial($d) +
321 LnFactorial($n);
322
323 my $LnP = $LnNumerator - $LnDenominator;
324 return exp($LnP);
325 }
326
327 # Compute the cumulative probability by adding up individual
328 # probabilities
329 sub ProbCTable {
330 my ($a, $b, $c, $d) = @_;
331
332 my $min;
333
334 my $n = $a + $b + $c + $d;
335
336 my $p = 0;
337 $p += ProbOneTable($a, $b, $c, $d);
338 if( ($a * $d) >= ($b * $c) ) {
339 $min = ($c < $b) ? $c : $b;
340 for(my $i = 0; $i < $min; $i++) {
341 $p += ProbOneTable(++$a, --$b, --$c, ++$d);
342 }
343 }
344
345 if ( ($a * $d) < ($b * $c) ) {
346 $min = ($a < $d) ? $a : $d;
347 for(my $i = 0; $i < $min; $i++) {
348 $p += ProbOneTable(--$a, ++$b, ++$c, --$d);
349 }
350 }
351 return $p;
352 }
353
354 # Lower tail quantile for standard normal distribution function.
355 #
356 # This function returns an approximation of the inverse cumulative
357 # standard normal distribution function. I.e., given P, it returns
358 # an approximation to the X satisfying P = Pr{Z <= X} where Z is a
359 # random variable from the standard normal distribution.
360 #
361 # The algorithm uses a minimax approximation by rational functions
362 # and the result has a relative error whose absolute value is less
363 # than 1.15e-9.
364 #
365 # Author: Peter J. Acklam
366 # Time-stamp: 2000-07-19 18:26:14
367 # E-mail: pjacklam@online.no
368 # WWW URL: http://home.online.no/~pjacklam
369 sub ltqnorm {
370 my $p = shift;
371 die "input argument must be in (0,1)\n" unless 0 < $p && $p < 1;
372
373 # Coefficients in rational approximations.
374 my @a = (-3.969683028665376e+01, 2.209460984245205e+02,
375 -2.759285104469687e+02, 1.383577518672690e+02,
376 -3.066479806614716e+01, 2.506628277459239e+00);
377 my @b = (-5.447609879822406e+01, 1.615858368580409e+02,
378 -1.556989798598866e+02, 6.680131188771972e+01,
379 -1.328068155288572e+01 );
380 my @c = (-7.784894002430293e-03, -3.223964580411365e-01,
381 -2.400758277161838e+00, -2.549732539343734e+00,
382 4.374664141464968e+00, 2.938163982698783e+00);
383 my @d = ( 7.784695709041462e-03, 3.224671290700398e-01,
384 2.445134137142996e+00, 3.754408661907416e+00);
385
386 # Define break-points.
387 my $plow = 0.02425;
388 my $phigh = 1 - $plow;
389
390 # Rational approximation for lower region:
391 if ( $p < $plow ) {
392 my $q = sqrt(-2*log($p));
393 return ((((($c[0]*$q+$c[1])*$q+$c[2])*$q+$c[3])*$q+$c[4])*$q+$c[5]) /
394 (((($d[0]*$q+$d[1])*$q+$d[2])*$q+$d[3])*$q+1);
395 }
396
397 # Rational approximation for upper region:
398 if ( $phigh < $p ) {
399 my $q = sqrt(-2*log(1-$p));
400 return -((((($c[0]*$q+$c[1])*$q+$c[2])*$q+$c[3])*$q+$c[4])*$q+$c[5]) /
401 (((($d[0]*$q+$d[1])*$q+$d[2])*$q+$d[3])*$q+1);
402 }
403
404 # Rational approximation for central region:
405 my $q = $p - 0.5;
406 my $r = $q*$q;
407 return ((((($a[0]*$r+$a[1])*$r+$a[2])*$r+$a[3])*$r+$a[4])*$r+$a[5])*$q /
408 ((((($b[0]*$r+$b[1])*$r+$b[2])*$r+$b[3])*$r+$b[4])*$r+1);
409 }

Properties

Name Value
svn:executable *