Revision: 24 Committed: Tue Jul 26 21:46:39 2011 UTC (9 years ago) by gpertea File size: 5092 byte(s) Log Message:
Line File contents
2
3 Fisher - Fisher's Exact Test statistic (2x2 case)
4
6
7 use Fisher qw(fishers_exact log_factorial);
8
10
11 This module exports only one function, C<fishers_exact>, which
12 computes the one- or two-sided Fisher's Exact Test statistic for the 2
13 x 2 case. In the following documentation I will be referring to the
14 following family of 2 x 2 contingency tables
15
16
17 a | b | r1 = a+b
18 --------+-------+----------
19 c | d | r2 = c+d
20 --------+-------+----------
21 c1 | c2 | N
22 = a+c | = b+d | = a+b+c+d
23
24 The *'s mark the cells, N is the total number of points represented by
25 the table, and r1, r2, c1, c2 are the marginals. As suggested by the
26 equalities, the letters a, b, c, d refer to the various cells (reading
27 them left-to-right, top-to-bottom). Depending on context, the letter
28 c (for example) will refer *either* to the lower left cell of the
29 table *or* to a specific value in that cell. Same for a, b, and d.
30
31 In what follows, H(x) (or more precisely, H(x; r1, r2, c1)) refers
32 to the hypergeometric expression
33
34 r1!*r2!*c1!*c2!
35 -------------------------------------
36 (r1+r2)!*x!*(r1-x)!*(c1-x)!*(r2-c1+x)!
37
38 (I omit c2 from the parametrization of H because c2 = r1 + r2 - c1.)
39
41
42 =over 4
43
44 =item fishers_exact( \$a, \$b, \$c, \$d, \$two_sided )
45
46 The paramater C<\$two_sided> is optional. If missing or false
47 C<fishers_exact> computes the one-sided FET p-value. More
48 specifically, it computes the sum of H(x; a+b, c+d, a+c) for x = a to
49 x = min(a+b, a+c) - "the right side". (If you want "the left side", i.e. the sum of H(x; a+b, c+d, a+c) for x
50 = max(0, a-d) to x = a, then compute C<fishers_exact( \$b, \$a, \$d, \$c )
51 +> or
52 C<fishers_exact( \$c, \$d, \$a, \$b )> (these two are equivalent).)
53
54 If C<\$two_sided> is true, the returned p-value will be for the two-sid
55 +ed
56 FET.
57
58 =back
59
60 =cut
61
62 ## let the code begin...
63 package Fisher;
64
65 use strict;
66 use warnings;
67 use Exporter;
68
69 our (\$VERSION, @ISA, @EXPORT);
70 @ISA = qw(Exporter);
71 @EXPORT = qw( fishers_exact log_factorial );
72
73 # @Fishers::EXPORT_OK = ('fishers_exact', 'log_factorial');
74
75 my \$Tolerance = 1;
76 \$Tolerance /= 2 while 1 + \$Tolerance/2 > 1;
77
78 sub fishers_exact {
79 my ( \$a, \$b, \$c, \$d, \$ts ) = @_;
80 TEST:
81 ## simplified test equation
82 my \$test = \$a*\$d - \$b*\$c;
83 ## introduced switching around of input pairs
84 #if (\$test < 0 && \$ts){
85 if (\$test < 0 ) {
86 (\$a, \$b, \$c, \$d) = (\$b, \$a, \$d, \$c);
87 goto TEST;
88 }
89
90 if (\$test < 0 and \$ts){
91 die "Cannot compute two tailed FET for input values given \( ".(join ", ", @_)."\)\n";
92 }
93 # below here, \$test < 0 implies !\$ts;
94
95 my \$p_val;
96 if ( \$test < 0 ) {
97 if ( \$d < \$a ) {
98 \$p_val = _fishers_exact( \$d, \$c, \$b, \$a, 0, 1 );
99 }
100 else {
101 \$p_val = _fishers_exact( \$a, \$b, \$c, \$d, 0, 1 );
102 }
103 }
104 else {
105 if ( \$b < \$c ) {
106 \$p_val = _fishers_exact( \$b, \$a, \$d, \$c, \$ts, 0 );
107 }
108 else {
109 \$p_val = _fishers_exact( \$c, \$d, \$a, \$b, \$ts, 0 );
110 }
111 }
112
113 return \$p_val;
114 }
115
116 sub _fishers_exact {
117 my ( \$a, \$b, \$c, \$d, \$ts, \$complement ) = @_;
118 die "Bad args\n" if \$ts && \$complement;
119
120 my ( \$aa, \$bb, \$cc, \$dd ) = ( \$a, \$b, \$c, \$d );
121 my \$first = my \$delta = _single_term( \$aa, \$bb, \$cc, \$dd );
122 my \$p_val = 0;
123
124 {
125 \$p_val += \$delta;
126 last if \$aa < 1;
127 \$delta *= ( ( \$aa-- * \$dd-- )/( ++\$bb * ++\$cc ) );
128 redo;
129 }
130
131 if ( \$ts ) {
132 my \$m = \$b < \$c ? \$b : \$c;
133 (\$aa, \$bb, \$cc, \$dd ) = ( \$a + \$m, \$b - \$m, \$c - \$m, \$d + \$m );
134 \$delta = _single_term( \$aa, \$bb, \$cc, \$dd );
135 my \$bound = -\$Tolerance;
136 while ( \$bound <= ( \$first - \$delta )/\$first && \$aa > \$a ) {
137
138 \$p_val += \$delta;
139 \$delta *= ( ( \$aa-- * \$dd-- )/( ++\$bb * ++\$cc ) );
140 }
141 }
142 elsif ( \$complement ) {
143 \$p_val = 1 - \$p_val + \$first;
144 }
145
146 return \$p_val;
147 }
148
149 sub _single_term {
150 my ( \$a, \$b, \$c, \$d ) = @_;
151 my ( \$r1, \$r2 ) = (\$a + \$b, \$c + \$d);
152 my ( \$c1, \$c2 ) = (\$a + \$c, \$b + \$d);
153 my \$N = \$r1 + \$r2;
154
155 return exp( log_factorial( \$r1 ) + log_factorial( \$r2 ) +
156 log_factorial( \$c1 ) + log_factorial( \$c2 ) -
157 log_factorial( \$N ) -
158 ( log_factorial( \$a ) + log_factorial( \$b ) +
159 log_factorial( \$c ) + log_factorial( \$d ) ) );
160 }
161
162 {
163 my \$PI=atan2(0,-1);
164 my \$twoPI=2*\$PI;
165 my \$pi_3=\$PI/3;
166 my @lnfcache=(0,0);
167
168 sub log_factorial {
169 #my \$n = Math::Pari::PARI( shift() );
170 my \$n=shift;
171 die "Bad args to log_factorial: \$n" if \$n < 0;
172 my \$ln_fact;
173 if (\$n<900) {
174 return \$lnfcache[\$n] if defined \$lnfcache[\$n];
175 for (my \$i = scalar(@lnfcache); \$i <= \$n; \$i++) {
176 \$lnfcache[\$i] = \$lnfcache[\$i-1] + log(\$i);
177 }
178 return \$lnfcache[\$n];
179 }
180 else {
181 # Gosper's approximation; from
182 # http://mathworld.wolfram.com/StirlingsApproximation.html
183 \$ln_fact = 0.5 * log(\$twoPI*\$n + \$pi_3) + \$n*log(\$n) - \$n;
184 }
185 return \$ln_fact;
186 }
187 }
188
189 1;
190
191 __END__

Name Value
svn:executable *