ViewVC Help
View File | Revision Log | Show Annotations | View Changeset | Root Listing
root/gclib/scripts/Fisher.pm
Revision: 24
Committed: Tue Jul 26 21:46:39 2011 UTC (8 years, 1 month ago) by gpertea
File size: 5092 byte(s)
Log Message:
Line File contents
1 =head1 NAME
2
3 Fisher - Fisher's Exact Test statistic (2x2 case)
4
5 =head1 SYNOPSIS
6
7 use Fisher qw(fishers_exact log_factorial);
8
9 =head1 DESCRIPTION
10
11 This module exports only one function, C<fishers_exact>, which
12 computes the one- or two-sided Fisher's Exact Test statistic for the 2
13 x 2 case. In the following documentation I will be referring to the
14 following family of 2 x 2 contingency tables
15
16
17 a | b | r1 = a+b
18 --------+-------+----------
19 c | d | r2 = c+d
20 --------+-------+----------
21 c1 | c2 | N
22 = a+c | = b+d | = a+b+c+d
23
24 The *'s mark the cells, N is the total number of points represented by
25 the table, and r1, r2, c1, c2 are the marginals. As suggested by the
26 equalities, the letters a, b, c, d refer to the various cells (reading
27 them left-to-right, top-to-bottom). Depending on context, the letter
28 c (for example) will refer *either* to the lower left cell of the
29 table *or* to a specific value in that cell. Same for a, b, and d.
30
31 In what follows, H(x) (or more precisely, H(x; r1, r2, c1)) refers
32 to the hypergeometric expression
33
34 r1!*r2!*c1!*c2!
35 -------------------------------------
36 (r1+r2)!*x!*(r1-x)!*(c1-x)!*(r2-c1+x)!
37
38 (I omit c2 from the parametrization of H because c2 = r1 + r2 - c1.)
39
40 =head1 FUNCTION
41
42 =over 4
43
44 =item fishers_exact( $a, $b, $c, $d, $two_sided )
45
46 The paramater C<$two_sided> is optional. If missing or false
47 C<fishers_exact> computes the one-sided FET p-value. More
48 specifically, it computes the sum of H(x; a+b, c+d, a+c) for x = a to
49 x = min(a+b, a+c) - "the right side". (If you want "the left side", i.e. the sum of H(x; a+b, c+d, a+c) for x
50 = max(0, a-d) to x = a, then compute C<fishers_exact( $b, $a, $d, $c )
51 +> or
52 C<fishers_exact( $c, $d, $a, $b )> (these two are equivalent).)
53
54 If C<$two_sided> is true, the returned p-value will be for the two-sid
55 +ed
56 FET.
57
58 =back
59
60 =cut
61
62 ## let the code begin...
63 package Fisher;
64
65 use strict;
66 use warnings;
67 use Exporter;
68
69 our ($VERSION, @ISA, @EXPORT);
70 @ISA = qw(Exporter);
71 @EXPORT = qw( fishers_exact log_factorial );
72
73 # @Fishers::EXPORT_OK = ('fishers_exact', 'log_factorial');
74
75 my $Tolerance = 1;
76 $Tolerance /= 2 while 1 + $Tolerance/2 > 1;
77
78 sub fishers_exact {
79 my ( $a, $b, $c, $d, $ts ) = @_;
80 TEST:
81 ## simplified test equation
82 my $test = $a*$d - $b*$c;
83 ## introduced switching around of input pairs
84 #if ($test < 0 && $ts){
85 if ($test < 0 ) {
86 ($a, $b, $c, $d) = ($b, $a, $d, $c);
87 goto TEST;
88 }
89
90 if ($test < 0 and $ts){
91 die "Cannot compute two tailed FET for input values given \( ".(join ", ", @_)."\)\n";
92 }
93 # below here, $test < 0 implies !$ts;
94
95 my $p_val;
96 if ( $test < 0 ) {
97 if ( $d < $a ) {
98 $p_val = _fishers_exact( $d, $c, $b, $a, 0, 1 );
99 }
100 else {
101 $p_val = _fishers_exact( $a, $b, $c, $d, 0, 1 );
102 }
103 }
104 else {
105 if ( $b < $c ) {
106 $p_val = _fishers_exact( $b, $a, $d, $c, $ts, 0 );
107 }
108 else {
109 $p_val = _fishers_exact( $c, $d, $a, $b, $ts, 0 );
110 }
111 }
112
113 return $p_val;
114 }
115
116 sub _fishers_exact {
117 my ( $a, $b, $c, $d, $ts, $complement ) = @_;
118 die "Bad args\n" if $ts && $complement;
119
120 my ( $aa, $bb, $cc, $dd ) = ( $a, $b, $c, $d );
121 my $first = my $delta = _single_term( $aa, $bb, $cc, $dd );
122 my $p_val = 0;
123
124 {
125 $p_val += $delta;
126 last if $aa < 1;
127 $delta *= ( ( $aa-- * $dd-- )/( ++$bb * ++$cc ) );
128 redo;
129 }
130
131 if ( $ts ) {
132 my $m = $b < $c ? $b : $c;
133 ($aa, $bb, $cc, $dd ) = ( $a + $m, $b - $m, $c - $m, $d + $m );
134 $delta = _single_term( $aa, $bb, $cc, $dd );
135 my $bound = -$Tolerance;
136 while ( $bound <= ( $first - $delta )/$first && $aa > $a ) {
137
138 $p_val += $delta;
139 $delta *= ( ( $aa-- * $dd-- )/( ++$bb * ++$cc ) );
140 }
141 }
142 elsif ( $complement ) {
143 $p_val = 1 - $p_val + $first;
144 }
145
146 return $p_val;
147 }
148
149 sub _single_term {
150 my ( $a, $b, $c, $d ) = @_;
151 my ( $r1, $r2 ) = ($a + $b, $c + $d);
152 my ( $c1, $c2 ) = ($a + $c, $b + $d);
153 my $N = $r1 + $r2;
154
155 return exp( log_factorial( $r1 ) + log_factorial( $r2 ) +
156 log_factorial( $c1 ) + log_factorial( $c2 ) -
157 log_factorial( $N ) -
158 ( log_factorial( $a ) + log_factorial( $b ) +
159 log_factorial( $c ) + log_factorial( $d ) ) );
160 }
161
162 {
163 my $PI=atan2(0,-1);
164 my $twoPI=2*$PI;
165 my $pi_3=$PI/3;
166 my @lnfcache=(0,0);
167
168 sub log_factorial {
169 #my $n = Math::Pari::PARI( shift() );
170 my $n=shift;
171 die "Bad args to log_factorial: $n" if $n < 0;
172 my $ln_fact;
173 if ($n<900) {
174 return $lnfcache[$n] if defined $lnfcache[$n];
175 for (my $i = scalar(@lnfcache); $i <= $n; $i++) {
176 $lnfcache[$i] = $lnfcache[$i-1] + log($i);
177 }
178 return $lnfcache[$n];
179 }
180 else {
181 # Gosper's approximation; from
182 # http://mathworld.wolfram.com/StirlingsApproximation.html
183 $ln_fact = 0.5 * log($twoPI*$n + $pi_3) + $n*log($n) - $n;
184 }
185 return $ln_fact;
186 }
187 }
188
189 1;
190
191 __END__

Properties

Name Value
svn:executable *