From 7ee053dc803e15e96a0581851629bdb25e045b9e Mon Sep 17 00:00:00 2001 From: Heng Li Date: Fri, 6 Aug 2010 03:46:53 +0000 Subject: [PATCH] Fisher's exact test --- bcftools/fet.c | 112 +++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 112 insertions(+) create mode 100644 bcftools/fet.c diff --git a/bcftools/fet.c b/bcftools/fet.c new file mode 100644 index 0000000..ed81a81 --- /dev/null +++ b/bcftools/fet.c @@ -0,0 +1,112 @@ +#include +#include + +// log\binom{n}{k} +static double lbinom(int n, int k) +{ + if (k == 0 || n == k) return 0; + return lgamma(n+1) - lgamma(k+1) - lgamma(n-k+1); +} + +// n11 n12 | n1_ +// n21 n22 | n2_ +//-----------+---- +// n_1 n_2 | n + +// hypergeometric distribution +static double hypergeo(int n11, int n1_, int n_1, int n) +{ + return exp(lbinom(n1_, n11) + lbinom(n-n1_, n_1-n11) - lbinom(n, n_1)); +} + +typedef struct { + int n11, n1_, n_1, n; + double p; +} hgacc_t; + +// incremental version of hypergenometric distribution +static double hypergeo_acc(int n11, int n1_, int n_1, int n, hgacc_t *aux) +{ + if (n1_ || n_1 || n) { + aux->n11 = n11; aux->n1_ = n1_; aux->n_1 = n_1; aux->n = n; + } else { // then only n11 changed; the rest fixed + if (n11%11 && n11 + aux->n - aux->n1_ - aux->n_1) { + if (n11 == aux->n11 + 1) { + aux->p *= (double)(aux->n1_ - aux->n11) / n11 + * (aux->n_1 - aux->n11) / (n11 + aux->n - aux->n1_ - aux->n_1); + aux->n11 = n11; + return aux->p; + } + if (n11 == aux->n11 - 1) { + aux->p *= (double)aux->n11 / (aux->n1_ - n11) + * (aux->n11 + aux->n - aux->n1_ - aux->n_1) / (aux->n_1 - n11); + aux->n11 = n11; + return aux->p; + } + } + aux->n11 = n11; + } + aux->p = hypergeo(aux->n11, aux->n1_, aux->n_1, aux->n); + return aux->p; +} + +double kt_fisher_exact(int n11, int n12, int n21, int n22, double *_left, double *_right, double *two) +{ + int i, j, max, min; + double p, q, left, right; + hgacc_t aux; + int n1_, n_1, n; + + n1_ = n11 + n12; n_1 = n11 + n21; n = n11 + n12 + n21 + n22; // calculate n1_, n_1 and n + max = (n_1 < n1_) ? n_1 : n1_; // for right tail + min = (n1_ + n_1 - n < 0) ? 0 : (n1_ + n_1 - n < 0); // for left tail + if (min == max) { // no need to do test + *two = *_left = *_right = 1.; + return 1.; + } + // the probability of the current table + q = hypergeo_acc(n11, n1_, n_1, n, &aux); + // left tail + p = hypergeo_acc(min, 0, 0, 0, &aux); + for (left = 0., i = min + 1; p < 0.99999999 * q; ++i) { + left += p; + p = hypergeo_acc(i, 0, 0, 0, &aux); + } + --i; + if (p < 1.00000001 * q) left += p; + else --i; + // right tail + p = hypergeo_acc(max, 0, 0, 0, &aux); + for (right = 0., j = max - 1; p < 0.99999999 * q; --j) { + right += p; + p = hypergeo_acc(j, 0, 0, 0, &aux); + } + if (p < 1.00000001 * q) right += p; + else ++j; + // two-tail + *two = left + right; + if (*two > 1.) *two = 1.; + // adjust left and right + if (abs(i - n11) < abs(j - n11)) right = 1. - left + q; + else left = 1.0 - right + q; + *_left = left; *_right = right; + return q; +} + +#ifdef FET_MAIN +#include + +int main(int argc, char *argv[]) +{ + char id[1024]; + int n11, n12, n21, n22; + double left, right, twotail, prob; + + while (scanf("%s%d%d%d%d", id, &n11, &n12, &n21, &n22) == 5) { + prob = kt_fisher_exact(n11, n12, n21, n22, &left, &right, &twotail); + printf("%s\t%d\t%d\t%d\t%d\t%.6g\t%.6g\t%.6g\t%.6g\n", id, n11, n12, n21, n22, + prob, left, right, twotail); + } + return 0; +} +#endif -- 2.39.2