+ double x, z;
+ x = (double)(a+b) * (c+d) * (b+d) * (a+c);
+ if (x == 0.) return 1;
+ z = a * d - b * c;
+ return kf_gammaq(.5, .5 * z * z * (a+b+c+d) / x);
+}
+
// chi2_test() above computes the 2x2 contingency chi2 statistic:
// chi2 = (a+b+c+d)(ad-bc)^2 / [(a+b)(c+d)(a+c)(b+d)]
+static inline double contrast2_aux(const bcf_p1aux_t *p1, double sum, int n1, int n2, int k1, int k2, double x[3])
+{
+ double p = p1->phi[k1+k2] * p1->z1[k1] * p1->z2[k2] / sum * p1->hg[k1][k2];
+ if (p < CONTRAST_TINY) return -1;
+ if (.5*k1/n1 < .5*k2/n2) x[1] += p;
+ else if (.5*k1/n1 > .5*k2/n2) x[2] += p;
+ else x[0] += p;
+ return p * chi2_test(k1, k2, (n1<<1) - k1, (n2<<1) - k2);
+}
+
/* Two-group contrast test between the first n1 samples and the remaining
 * n2 = n - n1 samples.  On return, ret[0]/ret[1]/ret[2] hold the accumulated
 * posterior mass of configurations where group 1's allele frequency is equal
 * to / smaller than / larger than group 2's (as classified by contrast2_aux).
 * Returns the posterior-weighted sum of the per-configuration chi2 tail
 * values (presumably a posterior-averaged P-value — see chi2_test), or 0 if
 * either group is empty. */
static double contrast2(bcf_p1aux_t *p1, double ret[3])
{
	int k, k1, k2, k10, k20, n1, n2;
	double sum;
	// get n1 and n2
	n1 = p1->n1; n2 = p1->n - p1->n1;
	if (n1 <= 0 || n2 <= 0) return 0.;
	if (p1->hg == 0) { // initialize the hypergeometric distribution
		/* NB: the hg matrix may take a lot of memory when there are many samples. There is a way
		   to avoid precomputing this matrix, but it is slower and quite intricate. The following
		   computation in this block can be accelerated with a similar strategy, but perhaps this
		   is not a serious concern for now. */
		// log of the constant binomial term C(2n1+2n2, 2n1), shared by all cells
		double tmp = lgamma(2*(n1+n2)+1) - (lgamma(2*n1+1) + lgamma(2*n2+1));
		// NOTE(review): calloc results are unchecked (OOM would crash below), and
		// sizeof(void*) is used where sizeof(double*) is meant — same size in
		// practice, but sizeof *p1->hg would be the safer idiom.
		p1->hg = calloc(2*n1+1, sizeof(void*));
		for (k1 = 0; k1 <= 2*n1; ++k1) {
			p1->hg[k1] = calloc(2*n2+1, sizeof(double));
			for (k2 = 0; k2 <= 2*n2; ++k2)
				// hypergeometric weight C(k1+k2, k1) * C(M-k1-k2, 2n1-k1) / C(M, 2n1), computed in log space
				p1->hg[k1][k2] = exp(lgamma(k1+k2+1) + lgamma(p1->M-k1-k2+1) - (lgamma(k1+1) + lgamma(k2+1) + lgamma(2*n1-k1+1) + lgamma(2*n2-k2+1) + tmp));
		}
	}
	{ // compute sum1 and sum2
		// normalizer for the posterior weights used in contrast2_aux;
		// accumulated in long double to limit rounding error
		long double suml = 0;
		for (k = 0; k <= p1->M; ++k) suml += p1->phi[k] * p1->z[k];
		sum = suml;
	}
	{ // get the mean k1 and k2
		// (actually the posterior modes: argmax of phi1[k]*z1[k] and phi2[k]*z2[k])
		double max;
		int max_k;
		for (k = 0, max = 0, max_k = -1; k <= 2*n1; ++k) {
			double x = p1->phi1[k] * p1->z1[k];
			if (x > max) max = x, max_k = k;
		}
		k10 = max_k;
		for (k = 0, max = 0, max_k = -1; k <= 2*n2; ++k) {
			double x = p1->phi2[k] * p1->z2[k];
			if (x > max) max = x, max_k = k;
		}
		k20 = max_k;
	}
	{ // We can do the following with one nested loop, but that is an O(N^2) thing. The following code block is much faster for large N.
		// Walk outward from the joint mode (k10,k20) in each of the four
		// quadrants; contrast2_aux returns -1 once a term falls below
		// CONTRAST_TINY, which cuts each inner scan short.
		double x[3], y;
		long double z = 0.;
		x[0] = x[1] = x[2] = 0;
		for (k1 = k10; k1 >= 0; --k1) { // k1 at or below the mode
			for (k2 = k20; k2 >= 0; --k2) { // k2 decreasing from the mode
				if ((y = contrast2_aux(p1, sum, n1, n2, k1, k2, x)) < 0) break;
				else z += y;
			}
			for (k2 = k20 + 1; k2 <= 2*n2; ++k2) { // k2 increasing past the mode
				if ((y = contrast2_aux(p1, sum, n1, n2, k1, k2, x)) < 0) break;
				else z += y;
			}
		}
		ret[0] = x[0]; ret[1] = x[1]; ret[2] = x[2];
		x[0] = x[1] = x[2] = 0;
		for (k1 = k10 + 1; k1 <= 2*n1; ++k1) { // k1 above the mode
			for (k2 = k20; k2 >= 0; --k2) {
				if ((y = contrast2_aux(p1, sum, n1, n2, k1, k2, x)) < 0) break;
				else z += y;
			}
			for (k2 = k20 + 1; k2 <= 2*n2; ++k2) {
				if ((y = contrast2_aux(p1, sum, n1, n2, k1, k2, x)) < 0) break;
				else z += y;
			}
		}
		ret[0] += x[0]; ret[1] += x[1]; ret[2] += x[2];
		if (ret[0] + ret[1] + ret[2] < 0.99) { // in case of bad things happened
			// the early-termination walk missed too much mass; redo the
			// exhaustive O(N^2) scan, accumulating directly into ret[]
			ret[0] = ret[1] = ret[2] = 0;
			for (k1 = 0, z = 0.; k1 <= 2*n1; ++k1)
				for (k2 = 0; k2 <= 2*n2; ++k2)
					if ((y = contrast2_aux(p1, sum, n1, n2, k1, k2, ret)) >= 0) z += y;
			if (ret[0] + ret[1] + ret[2] < 0.99) // It seems that this may be caused by floating point errors. I do not really understand why...
				z = 1.0, ret[0] = ret[1] = ret[2] = 1./3;
		}
		return (double)z;