+/*
+ * Fill ma->z by running mc_cal_y_core(); when the samples are split into two
+ * groups, also produce the per-group buffers ma->z1 / ma->z2.
+ *
+ * The two-group path is taken only if 0 < n1 < n and M == 2*n. In every other
+ * case a single mc_cal_y_core(ma, 0) pass is performed and nothing else is
+ * touched here.
+ */
+static void mc_cal_y(bcf_p1aux_t *ma)
+{
+    int i;
+    long double scale;
+
+    // NB: ma->n1 is ineffective when there are haploid samples
+    if (ma->n1 <= 0 || ma->n1 >= ma->n || ma->M != ma->n * 2) {
+        mc_cal_y_core(ma, 0);
+        return;
+    }
+
+    // start from clean per-group buffers and zeroed t1/t2
+    memset(ma->z1, 0, sizeof(double) * (2 * ma->n1 + 1));
+    memset(ma->z2, 0, sizeof(double) * (2 * (ma->n - ma->n1) + 1));
+    ma->t2 = 0.;
+    ma->t1 = 0.;
+
+    // pass starting at n1: keep the resulting t and z as the second group's t2 / z2
+    // (presumably the second argument is the index of the first sample processed -- confirm)
+    mc_cal_y_core(ma, ma->n1);
+    ma->t2 = ma->t;
+    memcpy(ma->z2, ma->z, sizeof(double) * (2 * (ma->n - ma->n1) + 1));
+
+    // pass starting at 0
+    // (t1 / z1 are not written again in this function; presumably mc_cal_y_core fills them -- confirm)
+    mc_cal_y_core(ma, 0);
+
+    // rescale z
+    scale = expl(ma->t - (ma->t1 + ma->t2));
+    i = 0;
+    while (i <= ma->M) {
+        ma->z[i] *= scale;
+        ++i;
+    }
+}
+
+// Threshold used by contrast2_aux(): a (k1,k2) term smaller than this is not
+// accumulated and is reported to the caller with a return value of -1.
+#define CONTRAST_TINY 1e-30
+
+// Defined in another translation unit; chi2_test() calls it as kf_gammaq(.5, chi2/2).
+extern double kf_gammaq(double s, double z); // incomplete gamma function for chi^2 test
+
+/*
+ * Chi-square test on the 2x2 contingency table
+ *
+ *        a  b
+ *        c  d
+ *
+ * with chi2 = (a+b+c+d)(ad-bc)^2 / [(a+b)(c+d)(a+c)(b+d)].
+ *
+ * Returns kf_gammaq(.5, chi2/2) (presumably the upper-tail P-value for one
+ * degree of freedom -- confirm against kf_gammaq), or 1 when the product of
+ * the four marginal totals is zero and the statistic is undefined.
+ */
+static inline double chi2_test(int a, int b, int c, int d)
+{
+    // Promote before doing any arithmetic: products such as a*d, and the sums of
+    // counts, overflow int (undefined behaviour) for large tables when evaluated
+    // in integer arithmetic.
+    const double da = a, db = b, dc = c, dd = d;
+    double x, z;
+    x = (da + db) * (dc + dd) * (db + dd) * (da + dc);
+    if (x == 0.) return 1;
+    z = da * dd - db * dc;
+    return kf_gammaq(.5, .5 * z * z * (da + db + dc + dd) / x);
+}
+
+/*
+ * Evaluate a single (k1, k2) term of the two-group contrast.
+ *
+ * The term's weight is phi[k1+k2] * z1[k1] * z2[k2] / sum * hg[k1][k2].
+ * If the weight is below CONTRAST_TINY nothing is accumulated and -1 is
+ * returned. Otherwise the weight is added to one slot of x[]:
+ *   x[1] when k1/(2*n1) <  k2/(2*n2)
+ *   x[2] when k1/(2*n1) >  k2/(2*n2)
+ *   x[0] otherwise
+ * and the function returns the weight multiplied by
+ * chi2_test(k1, k2, 2*n1 - k1, 2*n2 - k2), where
+ * chi2=(a+b+c+d)(ad-bc)^2/[(a+b)(c+d)(a+c)(b+d)]
+ */
+static inline double contrast2_aux(const bcf_p1aux_t *p1, double sum, int k1, int k2, double x[3])
+{
+    const double w = p1->phi[k1+k2] * p1->z1[k1] * p1->z2[k2] / sum * p1->hg[k1][k2];
+    const int n1 = p1->n1;
+    const int n2 = p1->n - p1->n1;
+    double f1, f2;
+    int slot;
+
+    if (w < CONTRAST_TINY)
+        return -1;
+
+    // per-group fractions k/(2n)
+    f1 = .5*k1/n1;
+    f2 = .5*k2/n2;
+    if (f1 < f2)
+        slot = 1;
+    else if (f1 > f2)
+        slot = 2;
+    else
+        slot = 0;
+    x[slot] += w;
+
+    return w * chi2_test(k1, k2, 2*n1 - k1, 2*n2 - k2);
+}
+
+/*
+ * Two-group contrast between the first p1->n1 samples and the remaining
+ * p1->n - p1->n1 samples.
+ *
+ * Sums contrast2_aux() over pairs (k1, k2) with 0 <= k1 <= 2*n1 and
+ * 0 <= k2 <= 2*n2. Rather than visiting the whole grid, the scan starts at the
+ * most likely (k10, k20) and walks outwards, abandoning a k2 direction as soon
+ * as contrast2_aux() reports a negligible term (negative return).
+ *
+ * p1   reads n, n1, M, phi, phi1, phi2, z, z1, z2; p1->hg is allocated and
+ *      filled on the first call that finds it null and is reused afterwards.
+ * ret  output, three doubles accumulated by contrast2_aux(): ret[0] for terms
+ *      with k1/(2*n1) == k2/(2*n2), ret[1] for "<", ret[2] for ">".
+ *
+ * Returns the accumulated sum of contrast2_aux() values as a double. Returns
+ * 0 without writing ret[] when either group is empty. If the accumulated
+ * ret[] total stays below 0.95 even after a full-grid recomputation, ret[]
+ * is set to 1/3 each and 1.0 is returned.
+ *
+ * NOTE(review): the calloc() results below are used without a NULL check.
+ */
+static double contrast2(bcf_p1aux_t *p1, double ret[3])
+{
+ int k, k1, k2, k10, k20, n1, n2;
+ double sum;
+ // get n1 and n2
+ n1 = p1->n1; n2 = p1->n - p1->n1;
+ // an empty group leaves nothing to contrast; ret[] is not touched on this path
+ if (n1 <= 0 || n2 <= 0) return 0.;
+ if (p1->hg == 0) { // initialize the hypergeometric distribution
+ /* NB: the hg matrix may take a lot of memory when there are many samples. There is a way
+ to avoid precomputing this matrix, but it is slower and quite intricate. The following
+ computation in this block can be accelerated with a similar strategy, but perhaps this
+ is not a serious concern for now. */
+ // tmp = log of (2*(n1+n2))! / ((2*n1)! * (2*n2)!)
+ double tmp = lgamma(2*(n1+n2)+1) - (lgamma(2*n1+1) + lgamma(2*n2+1));
+ // (2*n1+1) rows of (2*n2+1) doubles each
+ p1->hg = calloc(2*n1+1, sizeof(void*));
+ for (k1 = 0; k1 <= 2*n1; ++k1) {
+ p1->hg[k1] = calloc(2*n2+1, sizeof(double));
+ // evaluated in log space via lgamma; when M == 2*(n1+n2) this equals
+ // C(2*n1,k1) * C(2*n2,k2) / C(M,k1+k2)
+ for (k2 = 0; k2 <= 2*n2; ++k2)
+ p1->hg[k1][k2] = exp(lgamma(k1+k2+1) + lgamma(p1->M-k1-k2+1) - (lgamma(k1+1) + lgamma(k2+1) + lgamma(2*n1-k1+1) + lgamma(2*n2-k2+1) + tmp));
+ }
+ }
+ { // compute
+ // normalising constant sum = sum_k phi[k]*z[k], accumulated in long double
+ long double suml = 0;
+ for (k = 0; k <= p1->M; ++k) suml += p1->phi[k] * p1->z[k];
+ sum = suml;
+ }
+ { // get the max k1 and k2
+ // k10 / k20 are the indices maximising phi1[k]*z1[k] and phi2[k]*z2[k];
+ // either stays -1 if no product is strictly positive
+ double max;
+ int max_k;
+ for (k = 0, max = 0, max_k = -1; k <= 2*n1; ++k) {
+ double x = p1->phi1[k] * p1->z1[k];
+ if (x > max) max = x, max_k = k;
+ }
+ k10 = max_k;
+ for (k = 0, max = 0, max_k = -1; k <= 2*n2; ++k) {
+ double x = p1->phi2[k] * p1->z2[k];
+ if (x > max) max = x, max_k = k;
+ }
+ k20 = max_k;
+ }
+ { // We can do the following with one nested loop, but that is an O(N^2) thing. The following code block is much faster for large N.
+ // x[] collects the three-way split for the current half of the scan, z the running total.
+ // L is initialised here and reset below but never read in this function.
+ double x[3], y;
+ long double z = 0., L[2];
+ x[0] = x[1] = x[2] = 0; L[0] = L[1] = 0;
+ // first half: k1 from k10 down to 0; for each k1 scan k2 downwards from k20,
+ // then upwards from k20+1, stopping each direction at the first negligible term
+ for (k1 = k10; k1 >= 0; --k1) {
+ for (k2 = k20; k2 >= 0; --k2) {
+ if ((y = contrast2_aux(p1, sum, k1, k2, x)) < 0) break;
+ else z += y;
+ }
+ for (k2 = k20 + 1; k2 <= 2*n2; ++k2) {
+ if ((y = contrast2_aux(p1, sum, k1, k2, x)) < 0) break;
+ else z += y;
+ }
+ }
+ ret[0] = x[0]; ret[1] = x[1]; ret[2] = x[2];
+ x[0] = x[1] = x[2] = 0;
+ // second half: k1 from k10+1 up to 2*n1, same two-directional k2 scan
+ for (k1 = k10 + 1; k1 <= 2*n1; ++k1) {
+ for (k2 = k20; k2 >= 0; --k2) {
+ if ((y = contrast2_aux(p1, sum, k1, k2, x)) < 0) break;
+ else z += y;
+ }
+ for (k2 = k20 + 1; k2 <= 2*n2; ++k2) {
+ if ((y = contrast2_aux(p1, sum, k1, k2, x)) < 0) break;
+ else z += y;
+ }
+ }
+ ret[0] += x[0]; ret[1] += x[1]; ret[2] += x[2];
+ // the three accumulated weights should total close to 1; if the truncated scan
+ // captured less than 0.95, redo the sum over the full grid with no early exit
+ if (ret[0] + ret[1] + ret[2] < 0.95) { // in case bad things happened
+ ret[0] = ret[1] = ret[2] = 0; L[0] = L[1] = 0;
+ for (k1 = 0, z = 0.; k1 <= 2*n1; ++k1)
+ for (k2 = 0; k2 <= 2*n2; ++k2)
+ if ((y = contrast2_aux(p1, sum, k1, k2, ret)) >= 0) z += y;
+ // still short after the exhaustive pass: fall back to an uninformative result
+ if (ret[0] + ret[1] + ret[2] < 0.95) // It seems that this may be caused by floating point errors. I do not really understand why...
+ z = 1.0, ret[0] = ret[1] = ret[2] = 1./3;
+ }
+ return (double)z;
+ }
+}
+
+static double mc_cal_afs(bcf_p1aux_t *ma, double *p_ref_folded, double *p_var_folded)