CC= gcc
CFLAGS= -g -Wall -O2 #-m64 #-arch ppc
DFLAGS= -D_FILE_OFFSET_BITS=64 -D_USE_KNETFILE
-LOBJS= bcf.o vcf.o bcfutils.o prob1.o ld.o kfunc.o index.o fet.o bcf2qcall.o
+LOBJS= bcf.o vcf.o bcfutils.o prob1.o em.o kfunc.o kmin.o index.o fet.o bcf2qcall.o
OMISC= ..
AOBJS= call1.o main.o $(OMISC)/kstring.o $(OMISC)/bgzf.o $(OMISC)/knetfile.o $(OMISC)/bedidx.o
PROG= bcftools
#define VC_NO_INDEL 8192
#define VC_ANNO_MAX 16384
#define VC_FIX_PL 32768
+#define VC_EM 0x10000
typedef struct {
int flag, prior_type, n1, n_sub, *sublist, n_perm;
char *prior_file, **subsam, *fn_dict;
uint8_t *ploidy;
- double theta, pref, indel_frac, min_perm_p, min_smpl_frac;
+ double theta, pref, indel_frac, min_perm_p, min_smpl_frac, min_lrt;
void *bed;
} viewconf_t;
void bed_destroy(void *_h);
int bed_overlap(const void *_h, const char *chr, int beg, int end);
-static double test_hwe(const double g[3])
-{
- extern double kf_gammaq(double p, double x);
- double fexp, chi2, f[3], n;
- int i;
- n = g[0] + g[1] + g[2];
- fexp = (2. * g[2] + g[1]) / (2. * n);
- if (fexp > 1. - 1e-10) fexp = 1. - 1e-10;
- if (fexp < 1e-10) fexp = 1e-10;
- f[0] = n * (1. - fexp) * (1. - fexp);
- f[1] = n * 2. * fexp * (1. - fexp);
- f[2] = n * fexp * fexp;
- for (i = 0, chi2 = 0.; i < 3; ++i)
- chi2 += (g[i] - f[i]) * (g[i] - f[i]) / f[i];
- return kf_gammaq(.5, chi2 / 2.);
-}
-
typedef struct {
double p[4];
int mq, depth, is_tested, d[4];
bcf_sync(b);
}
-static int update_bcf1(int n_smpl, bcf1_t *b, const bcf_p1aux_t *pa, const bcf_p1rst_t *pr, double pref, int flag)
+static int update_bcf1(bcf1_t *b, const bcf_p1aux_t *pa, const bcf_p1rst_t *pr, double pref, int flag, double em[9])
{
kstring_t s;
- int has_I16, is_var = (pr->p_ref < pref);
- double fq, r = is_var? pr->p_ref : pr->p_var;
+ int has_I16, is_var;
+ double fq, r;
anno16_t a;
has_I16 = test16(b, &a) >= 0? 1 : 0;
- rm_info(b, "I16=");
+ rm_info(b, "I16="); // FIXME: probably this function has a bug. If I move it below, I16 will not be removed!
memset(&s, 0, sizeof(kstring_t));
kputc('\0', &s); kputs(b->ref, &s); kputc('\0', &s);
kputs(b->alt, &s); kputc('\0', &s); kputc('\0', &s);
kputs(b->info, &s);
if (b->info[0]) kputc(';', &s);
-// ksprintf(&s, "AF1=%.4lg;AFE=%.4lg;CI95=%.4lg,%.4lg", 1.-pr->f_em, 1.-pr->f_exp, pr->cil, pr->cih);
- ksprintf(&s, "AF1=%.4g;CI95=%.4g,%.4g;G3=%.4g,%.4g,%.4g", 1.-pr->f_em, pr->cil, pr->cih, pr->g[2], pr->g[1], pr->g[0]);
- if (n_smpl > 5) {
- double hwe = test_hwe(pr->g);
- if (hwe < 0.1) ksprintf(&s, ";HWE=%.4g", hwe);
+ { // print EM
+ if (em[0] >= 0) ksprintf(&s, "AF1=%.4g", 1 - em[0]);
+ if (em[4] >= 0 && em[4] <= 0.05) ksprintf(&s, ";G3=%.4g,%.4g,%.4g;HWE=%.3g", em[3], em[2], em[1], em[4]);
+ if (em[5] >= 0 && em[6] >= 0) ksprintf(&s, ";AF2=%.4g,%.4g", 1 - em[5], 1 - em[6]);
+ if (em[7] >= 0) ksprintf(&s, ";LRT=%.3g", em[7]);
}
+ if (pr == 0) { // if pr is unset, return
+ kputc('\0', &s); kputs(b->fmt, &s); kputc('\0', &s);
+ free(b->str);
+ b->m_str = s.m; b->l_str = s.l; b->str = s.s;
+ bcf_sync(b);
+ return 1;
+ }
+
+ is_var = (pr->p_ref < pref);
+ r = is_var? pr->p_ref : pr->p_var;
+
+ ksprintf(&s, ";CI95=%.4g,%.4g", pr->cil, pr->cih); // FIXME: when EM is not used, ";" should be omitted!
if (has_I16) ksprintf(&s, ";DP4=%d,%d,%d,%d;MQ=%d", a.d[0], a.d[1], a.d[2], a.d[3], a.mq);
fq = pr->p_ref_folded < 0.5? -4.343 * log(pr->p_ref_folded) : 4.343 * log(pr->p_var_folded);
if (fq < -999) fq = -999;
if (fq > 999) fq = 999;
ksprintf(&s, ";FQ=%.3g", fq);
if (pr->cmp[0] >= 0.) { // two sample groups
- int i, q[3], pq;
+ int i, q[3];
for (i = 1; i < 3; ++i) {
double x = pr->cmp[i] + pr->cmp[0]/2.;
q[i] = x == 0? 255 : (int)(-4.343 * log(x) + .499);
if (q[i] > 255) q[i] = 255;
}
- pq = (int)(-4.343 * log(pr->p_chi2) + .499);
if (pr->perm_rank >= 0) ksprintf(&s, ";PR=%d", pr->perm_rank);
- ksprintf(&s, ";QCHI2=%d;PCHI2=%.3g;PC2=%d,%d", pq, q[1], q[2], pr->p_chi2);
- ksprintf(&s, ";AF2=%.4g,%.4g", 1.-pr->f_em2[0], 1.-pr->f_em2[1]);
-// ksprintf(&s, ",%g,%g,%g", pr->cmp[0], pr->cmp[1], pr->cmp[2]);
+ ksprintf(&s, ";PCHI2=%.3g;PC2=%d,%d", q[1], q[2], pr->p_chi2);
}
if (has_I16 && a.is_tested) ksprintf(&s, ";PV4=%.2g,%.2g,%.2g,%.2g", a.p[0], a.p[1], a.p[2], a.p[3]);
kputc('\0', &s);
b->m_str = s.m; b->l_str = s.l; b->str = s.s;
bcf_sync(b);
for (i = 0; i < b->n_smpl; ++i) {
- x = bcf_p1_call_gt(pa, pr->f_em, i);
+ x = bcf_p1_call_gt(pa, pr->f_exp, i);
((uint8_t*)b->gi[old_n_gi].data)[i] = (x&3) == 0? 1<<3|1 : (x&3) == 1? 1 : 0;
((uint8_t*)b->gi[old_n_gi+1].data)[i] = x>>2;
}
h->l_txt = str.l + 1; h->txt = str.s;
}
-double bcf_ld_freq(const bcf1_t *b0, const bcf1_t *b1, double f[4]);
+double bcf_pair_freq(const bcf1_t *b0, const bcf1_t *b1, double f[4]);
int bcfview(int argc, char *argv[])
{
tid = begin = end = -1;
memset(&vc, 0, sizeof(viewconf_t));
- vc.prior_type = vc.n1 = -1; vc.theta = 1e-3; vc.pref = 0.5; vc.indel_frac = -1.; vc.n_perm = 0; vc.min_perm_p = 0.01; vc.min_smpl_frac = 0;
- while ((c = getopt(argc, argv, "FN1:l:cHAGvbSuP:t:p:QgLi:IMs:D:U:X:d:")) >= 0) {
+ vc.prior_type = vc.n1 = -1; vc.theta = 1e-3; vc.pref = 0.5; vc.indel_frac = -1.; vc.n_perm = 0; vc.min_perm_p = 0.01; vc.min_smpl_frac = 0; vc.min_lrt = 0.01;
+ while ((c = getopt(argc, argv, "FN1:l:cC:eHAGvbSuP:t:p:QgLi:IMs:D:U:X:d:")) >= 0) {
switch (c) {
case '1': vc.n1 = atoi(optarg); break;
case 'l': vc.bed = bed_read(optarg); break;
case 'b': vc.flag |= VC_BCFOUT; break;
case 'S': vc.flag |= VC_VCFIN; break;
case 'c': vc.flag |= VC_CALL; break;
+ case 'e': vc.flag |= VC_EM; break;
case 'v': vc.flag |= VC_VARONLY | VC_CALL; break;
case 'u': vc.flag |= VC_UNCOMP | VC_BCFOUT; break;
case 'g': vc.flag |= VC_CALL_GT | VC_CALL; break;
case 'Q': vc.flag |= VC_QCALL; break;
case 'L': vc.flag |= VC_ADJLD; break;
case 'U': vc.n_perm = atoi(optarg); break;
+ case 'C': vc.min_lrt = atof(optarg); break;
case 'X': vc.min_perm_p = atof(optarg); break;
case 'd': vc.min_smpl_frac = atof(optarg); break;
case 's': vc.subsam = read_samples(optarg, &vc.n_sub);
fprintf(stderr, " -S input is VCF\n");
fprintf(stderr, " -u uncompressed BCF output (force -b)\n");
fprintf(stderr, "\nConsensus/variant calling options:\n\n");
- fprintf(stderr, " -c SNP calling\n");
+ fprintf(stderr, " -c SNP calling (force -e)\n");
fprintf(stderr, " -d FLOAT skip loci where less than FLOAT fraction of samples covered [0]\n");
+ fprintf(stderr, " -e likelihood based analyses\n");
fprintf(stderr, " -g call genotypes at variant sites (force -c)\n");
fprintf(stderr, " -i FLOAT indel-to-substitution ratio [%.4g]\n", vc.indel_frac);
fprintf(stderr, " -I skip indels\n");
fprintf(stderr, " -v output potential variant sites only (force -c)\n");
fprintf(stderr, "\nContrast calling and association test options:\n\n");
fprintf(stderr, " -1 INT number of group-1 samples [0]\n");
+ fprintf(stderr, " -C FLOAT posterior constrast for LRT<FLOAT and P(ref|D)<0.5 [%g]\n", vc.min_lrt);
fprintf(stderr, " -U INT number of permutations for association testing (effective with -1) [0]\n");
fprintf(stderr, " -X FLOAT only perform permutations for P(chi^2)<FLOAT [%g]\n", vc.min_perm_p);
fprintf(stderr, "\n");
return 1;
}
+ if (vc.flag & VC_CALL) vc.flag |= VC_EM;
if ((vc.flag & VC_VCFIN) && (vc.flag & VC_BCFOUT) && vc.fn_dict == 0) {
fprintf(stderr, "[%s] For VCF->BCF conversion please specify the sequence dictionary with -D\n", __func__);
return 1;
return 1;
}
} else bcf_p1_init_prior(p1, vc.prior_type, vc.theta);
- if (vc.n1 > 0) {
+ if (vc.n1 > 0 && vc.min_lrt > 0.) { // set n1
bcf_p1_set_n1(p1, vc.n1);
bcf_p1_init_subprior(p1, vc.prior_type, vc.theta);
}
}
while (vcf_read(bp, hin, b) > 0) {
int is_indel;
+ double em[9];
if ((vc.flag & VC_VARONLY) && strcmp(b->alt, "X") == 0) continue;
if ((vc.flag & VC_VARONLY) && vc.min_smpl_frac > 0.) {
extern int bcf_smpl_covered(const bcf1_t *b);
bcf_2qcall(hout, b);
continue;
}
+ if (vc.flag & VC_EM) bcf_em1(b, vc.n1, 0xff, em);
+ else {
+ int i;
+ for (i = 0; i < 9; ++i) em[i] = -1.;
+ }
if (vc.flag & (VC_CALL|VC_ADJLD)) bcf_gl2pl(b);
if (vc.flag & VC_CALL) { // call variants
bcf_p1rst_t pr;
- bcf_p1_cal(b, p1, &pr); // pr.g[3] is not calculated here
+ bcf_p1_cal(b, (em[7] >= 0 && em[7] < vc.min_lrt), p1, &pr);
if (n_processed % 100000 == 0) {
fprintf(stderr, "[%s] %ld sites processed.\n", __func__, (long)n_processed);
bcf_p1_dump_afs(p1);
bcf_p1rst_t r;
int i, n = 0;
for (i = 0; i < vc.n_perm; ++i) {
+#ifdef BCF_PERM_LRT // LRT based permutation is much faster but less robust to artifacts
+ double x[9];
+ bcf_shuffle(b, seeds[i]);
+ bcf_em1(b, vc.n1, 1<<7, x);
+ if (x[7] < em[7]) ++n;
+#else
bcf_shuffle(b, seeds[i]);
- bcf_p1_cal(b, p1, &r);
+ bcf_p1_cal(b, 1, p1, &r);
if (pr.p_chi2 >= r.p_chi2) ++n;
+#endif
}
pr.perm_rank = n;
}
- update_bcf1(hout->n_smpl, b, p1, &pr, vc.pref, vc.flag);
- }
+ update_bcf1(b, p1, &pr, vc.pref, vc.flag, em);
+ } else if (vc.flag & VC_EM) update_bcf1(b, 0, 0, 0, vc.flag, em);
if (vc.flag & VC_ADJLD) { // compute LD
double f[4], r2;
- if ((r2 = bcf_ld_freq(blast, b, f)) >= 0) {
+ if ((r2 = bcf_pair_freq(blast, b, f)) >= 0) {
kstring_t s;
s.m = s.l = 0; s.s = 0;
if (*b->info) kputc(';', &s);
--- /dev/null
+#include <stdlib.h>
+#include <string.h>
+#include <math.h>
+#include "bcf.h"
+#include "kmin.h"
+
+static double g_q2p[256];
+
+#define ITER_MAX 50
+#define ITER_TRY 10
+#define EPS 1e-5
+
+extern double kf_gammaq(double, double);
+
+/*
+ Generic routines
+ */
+// get the 3 genotype likelihoods
+static double *get_pdg3(const bcf1_t *b)
+{
+ double *pdg;
+ const uint8_t *PL = 0;
+ int i, PL_len = 0;
+ // initialize g_q2p if necessary
+ if (g_q2p[0] == 0.)
+ for (i = 0; i < 256; ++i)
+ g_q2p[i] = pow(10., -i / 10.);
+ // set PL and PL_len
+ for (i = 0; i < b->n_gi; ++i) {
+ if (b->gi[i].fmt == bcf_str2int("PL", 2)) {
+ PL = (const uint8_t*)b->gi[i].data;
+ PL_len = b->gi[i].len;
+ break;
+ }
+ }
+ if (i == b->n_gi) return 0; // no PL
+ // fill pdg
+ pdg = malloc(3 * b->n_smpl * sizeof(double));
+ for (i = 0; i < b->n_smpl; ++i) {
+ const uint8_t *pi = PL + i * PL_len;
+ double *p = pdg + i * 3;
+ p[0] = g_q2p[pi[2]]; p[1] = g_q2p[pi[1]]; p[2] = g_q2p[pi[0]];
+ }
+ return pdg;
+}
+
+// estimate site allele frequency in a very naive and inaccurate way
+static double est_freq(int n, const double *pdg)
+{
+ int i, gcnt[3], tmp1;
+ // get a rough estimate of the genotype frequency
+ gcnt[0] = gcnt[1] = gcnt[2] = 0;
+ for (i = 0; i < n; ++i) {
+ const double *p = pdg + i * 3;
+ if (p[0] != 1. || p[1] != 1. || p[2] != 1.) {
+ int which = p[0] > p[1]? 0 : 1;
+ which = p[which] > p[2]? which : 2;
+ ++gcnt[which];
+ }
+ }
+ tmp1 = gcnt[0] + gcnt[1] + gcnt[2];
+ return (tmp1 == 0)? -1.0 : (.5 * gcnt[1] + gcnt[2]) / tmp1;
+}
+
+/*
+ Single-locus EM
+ */
+
+typedef struct {
+ int beg, end;
+ const double *pdg;
+} minaux1_t;
+
+static double prob1(double f, void *data)
+{
+ minaux1_t *a = (minaux1_t*)data;
+ double p = 1., l = 0., f3[3];
+ int i;
+// printf("%lg\n", f);
+ if (f < 0 || f > 1) return 1e300;
+ f3[0] = (1.-f)*(1.-f); f3[1] = 2.*f*(1.-f); f3[2] = f*f;
+ for (i = a->beg; i < a->end; ++i) {
+ const double *pdg = a->pdg + i * 3;
+ p *= pdg[0] * f3[0] + pdg[1] * f3[1] + pdg[2] * f3[2];
+ if (p < 1e-200) l -= log(p), p = 1.;
+ }
+ return l - log(p);
+}
+
+// one EM iteration for allele frequency estimate
+static double freq_iter(double *f, const double *_pdg, int beg, int end)
+{
+ double f0 = *f, f3[3], err;
+ int i;
+ f3[0] = (1.-f0)*(1.-f0); f3[1] = 2.*f0*(1.-f0); f3[2] = f0*f0;
+ for (i = beg, f0 = 0.; i < end; ++i) {
+ const double *pdg = _pdg + i * 3;
+ f0 += (pdg[1] * f3[1] + 2. * pdg[2] * f3[2])
+ / (pdg[0] * f3[0] + pdg[1] * f3[1] + pdg[2] * f3[2]);
+ }
+ f0 /= (end - beg) * 2;
+ err = fabs(f0 - *f);
+ *f = f0;
+ return err;
+}
+
+/* The following function combines EM and Brent's method. When the signal from
+ * the data is strong, EM is faster but sometimes, EM may converge very slowly.
+ * When this happens, we switch to Brent's method. The idea is learned from
+ * Rasmus Nielsen.
+ */
+static double freqml(double f0, int beg, int end, const double *pdg)
+{
+ int i;
+ double f;
+ for (i = 0, f = f0; i < ITER_TRY; ++i)
+ if (freq_iter(&f, pdg, beg, end) < EPS) break;
+ if (i == ITER_TRY) { // haven't converged yet; try Brent's method
+ minaux1_t a;
+ a.beg = beg; a.end = end; a.pdg = pdg;
+ kmin_brent(prob1, f0 == f? .5*f0 : f0, f, (void*)&a, EPS, &f);
+ }
+ return f;
+}
+
+// one EM iteration for genotype frequency estimate
+static double g3_iter(double g[3], const double *_pdg, int beg, int end)
+{
+ double err, gg[3];
+ int i;
+ gg[0] = gg[1] = gg[2] = 0.;
+// printf("%lg,%lg,%lg\n", g[0], g[1], g[2]);
+ for (i = beg; i < end; ++i) {
+ double sum, tmp[3];
+ const double *pdg = _pdg + i * 3;
+ tmp[0] = pdg[0] * g[0]; tmp[1] = pdg[1] * g[1]; tmp[2] = pdg[2] * g[2];
+ sum = (tmp[0] + tmp[1] + tmp[2]) * (end - beg);
+ gg[0] += tmp[0] / sum; gg[1] += tmp[1] / sum; gg[2] += tmp[2] / sum;
+ }
+ err = fabs(gg[0] - g[0]) > fabs(gg[1] - g[1])? fabs(gg[0] - g[0]) : fabs(gg[1] - g[1]);
+ err = err > fabs(gg[2] - g[2])? err : fabs(gg[2] - g[2]);
+ g[0] = gg[0]; g[1] = gg[1]; g[2] = gg[2];
+ return err;
+}
+
+// perform likelihood ratio test
+static double lk_ratio_test(int n, int n1, const double *pdg, double f3[3][3])
+{
+ double r;
+ int i;
+ for (i = 0, r = 1.; i < n1; ++i) {
+ const double *p = pdg + i * 3;
+ r *= (p[0] * f3[1][0] + p[1] * f3[1][1] + p[2] * f3[1][2])
+ / (p[0] * f3[0][0] + p[1] * f3[0][1] + p[2] * f3[0][2]);
+ }
+ for (; i < n; ++i) {
+ const double *p = pdg + i * 3;
+ r *= (p[0] * f3[2][0] + p[1] * f3[2][1] + p[2] * f3[2][2])
+ / (p[0] * f3[0][0] + p[1] * f3[0][1] + p[2] * f3[0][2]);
+ }
+ return r;
+}
+
+// x[0]: ref frequency
+// x[1..3]: alt-alt, alt-ref, ref-ref frequenc
+// x[4]: HWE P-value
+// x[5..6]: group1 freq, group2 freq
+// x[7]: 1-degree P-value
+// x[8]: 2-degree P-value
+int bcf_em1(const bcf1_t *b, int n1, int flag, double x[9])
+{
+ double *pdg;
+ int i, n, n2;
+ if (b->n_alleles < 2) return -1; // one allele only
+ // initialization
+ if (n1 < 0 || n1 > b->n_smpl) n1 = 0;
+ if (flag & 1<<7) flag |= 7<<5; // compute group freq if LRT is required
+ if (flag & 0xf<<1) flag |= 0xf<<1;
+ n = b->n_smpl; n2 = n - n1;
+ pdg = get_pdg3(b);
+ for (i = 0; i < 9; ++i) x[i] = -1.;
+ {
+ if ((x[0] = est_freq(n, pdg)) < 0.) {
+ free(pdg);
+ return -1; // no data
+ }
+ x[0] = freqml(x[0], 0, n, pdg);
+ }
+ if (flag & (0xf<<1|1<<8)) { // estimate the genotype frequency and test HWE
+ double *g = x + 1, f3[3], r;
+ f3[0] = g[0] = (1 - x[0]) * (1 - x[0]);
+ f3[1] = g[1] = 2 * x[0] * (1 - x[0]);
+ f3[2] = g[2] = x[0] * x[0];
+ for (i = 0; i < ITER_MAX; ++i)
+ if (g3_iter(g, pdg, 0, n) < EPS) break;
+ // Hardy-Weinberg equilibrium (HWE)
+ for (i = 0, r = 1.; i < n; ++i) {
+ double *p = pdg + i * 3;
+ r *= (p[0] * g[0] + p[1] * g[1] + p[2] * g[2]) / (p[0] * f3[0] + p[1] * f3[1] + p[2] * f3[2]);
+ }
+ x[4] = kf_gammaq(.5, log(r));
+ }
+ if ((flag & 7<<5) && n1 > 0 && n1 < n) { // group frequency
+ x[5] = freqml(x[0], 0, n1, pdg);
+ x[6] = freqml(x[0], n1, n, pdg);
+ }
+ if ((flag & 1<<7) && n1 > 0 && n1 < n) { // 1-degree P-value
+ double f[3], f3[3][3];
+ f[0] = x[0]; f[1] = x[5]; f[2] = x[6];
+ for (i = 0; i < 3; ++i)
+ f3[i][0] = (1-f[i])*(1-f[i]), f3[i][1] = 2*f[i]*(1-f[i]), f3[i][2] = f[i]*f[i];
+ x[7] = kf_gammaq(.5, log(lk_ratio_test(n, n1, pdg, f3)));
+ }
+ if ((flag & 1<<8) && n1 > 0 && n1 < n) { // 2-degree P-value
+ double g[3][3];
+ for (i = 0; i < 3; ++i) memcpy(g[i], x + 1, 3 * sizeof(double));
+ for (i = 0; i < ITER_MAX; ++i)
+ if (g3_iter(g[1], pdg, 0, n1) < EPS) break;
+ for (i = 0; i < ITER_MAX; ++i)
+ if (g3_iter(g[2], pdg, n1, n) < EPS) break;
+ x[8] = kf_gammaq(1., log(lk_ratio_test(n, n1, pdg, g)));
+ }
+ // free
+ free(pdg);
+ return 0;
+}
+
+/*
+ Two-locus EM (LD)
+ */
+
+#define _G1(h, k) ((h>>1&1) + (k>>1&1))
+#define _G2(h, k) ((h&1) + (k&1))
+
+// 0: the previous site; 1: the current site
+static int pair_freq_iter(int n, double *pdg[2], double f[4])
+{
+ double ff[4];
+ int i, k, h;
+// printf("%lf,%lf,%lf,%lf\n", f[0], f[1], f[2], f[3]);
+ memset(ff, 0, 4 * sizeof(double));
+ for (i = 0; i < n; ++i) {
+ double *p[2], sum, tmp;
+ p[0] = pdg[0] + i * 3; p[1] = pdg[1] + i * 3;
+ for (k = 0, sum = 0.; k < 4; ++k)
+ for (h = 0; h < 4; ++h)
+ sum += f[k] * f[h] * p[0][_G1(k,h)] * p[1][_G2(k,h)];
+ for (k = 0; k < 4; ++k) {
+ tmp = f[0] * (p[0][_G1(0,k)] * p[1][_G2(0,k)] + p[0][_G1(k,0)] * p[1][_G2(k,0)])
+ + f[1] * (p[0][_G1(1,k)] * p[1][_G2(1,k)] + p[0][_G1(k,1)] * p[1][_G2(k,1)])
+ + f[2] * (p[0][_G1(2,k)] * p[1][_G2(2,k)] + p[0][_G1(k,2)] * p[1][_G2(k,2)])
+ + f[3] * (p[0][_G1(3,k)] * p[1][_G2(3,k)] + p[0][_G1(k,3)] * p[1][_G2(k,3)]);
+ ff[k] += f[k] * tmp / sum;
+ }
+ }
+ for (k = 0; k < 4; ++k) f[k] = ff[k] / (2 * n);
+ return 0;
+}
+
+double bcf_pair_freq(const bcf1_t *b0, const bcf1_t *b1, double f[4])
+{
+ const bcf1_t *b[2];
+ int i, j, n_smpl;
+ double *pdg[2], flast[4], r, f0[2];
+ // initialize others
+ if (b0->n_smpl != b1->n_smpl) return -1; // different number of samples
+ n_smpl = b0->n_smpl;
+ b[0] = b0; b[1] = b1;
+ f[0] = f[1] = f[2] = f[3] = -1.;
+ if (b[0]->n_alleles < 2 || b[1]->n_alleles < 2) return -1; // one allele only
+ pdg[0] = get_pdg3(b0); pdg[1] = get_pdg3(b1);
+ if (pdg[0] == 0 || pdg[1] == 0) {
+ free(pdg[0]); free(pdg[1]);
+ return -1;
+ }
+ // set the initial value
+ f0[0] = est_freq(n_smpl, pdg[0]);
+ f0[1] = est_freq(n_smpl, pdg[1]);
+ f[0] = (1 - f0[0]) * (1 - f0[1]); f[3] = f0[0] * f0[1];
+ f[1] = (1 - f0[0]) * f0[1]; f[2] = f0[0] * (1 - f0[1]);
+ // iteration
+ for (j = 0; j < ITER_MAX; ++j) {
+ double eps = 0;
+ memcpy(flast, f, 4 * sizeof(double));
+ pair_freq_iter(n_smpl, pdg, f);
+ for (i = 0; i < 4; ++i) {
+ double x = fabs(f[i] - flast[i]);
+ if (x > eps) eps = x;
+ }
+ if (eps < EPS) break;
+ }
+ // free
+ free(pdg[0]); free(pdg[1]);
+ { // calculate r^2
+ double p[2], q[2], D;
+ p[0] = f[0] + f[1]; q[0] = 1 - p[0];
+ p[1] = f[0] + f[2]; q[1] = 1 - p[1];
+ D = f[0] * f[3] - f[1] * f[2];
+ r = sqrt(D * D / (p[0] * p[1] * q[0] * q[1]));
+// printf("R(%lf,%lf,%lf,%lf)=%lf\n", f[0], f[1], f[2], f[3], r);
+ if (isnan(r)) r = -1.;
+ }
+ return r;
+}
--- /dev/null
+/* The MIT License
+
+ Copyright (c) 2008, 2010 by Attractive Chaos <attractor@live.co.uk>
+
+ Permission is hereby granted, free of charge, to any person obtaining
+ a copy of this software and associated documentation files (the
+ "Software"), to deal in the Software without restriction, including
+ without limitation the rights to use, copy, modify, merge, publish,
+ distribute, sublicense, and/or sell copies of the Software, and to
+ permit persons to whom the Software is furnished to do so, subject to
+ the following conditions:
+
+ The above copyright notice and this permission notice shall be
+ included in all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ SOFTWARE.
+*/
+
+/* Hooke-Jeeves algorithm for nonlinear minimization
+
+ Based on the pseudocodes by Bell and Pike (CACM 9(9):684-685), and
+ the revision by Tomlin and Smith (CACM 12(11):637-638). Both of the
+ papers are comments on Kaupe's Algorithm 178 "Direct Search" (ACM
+ 6(6):313-314). The original algorithm was designed by Hooke and
+ Jeeves (ACM 8:212-229). This program is further revised according to
+ Johnson's implementation at Netlib (opt/hooke.c).
+
+ Hooke-Jeeves algorithm is very simple and it works quite well on a
+ few examples. However, it might fail to converge due to its heuristic
+ nature. A possible improvement, as is suggested by Johnson, may be to
+ choose a small r at the beginning to quickly approach to the minimum
+ and a large r at later step to hit the minimum.
+ */
+
+#include <stdlib.h>
+#include <string.h>
+#include <math.h>
+#include "kmin.h"
+
+static double __kmin_hj_aux(kmin_f func, int n, double *x1, void *data, double fx1, double *dx, int *n_calls)
+{
+ int k, j = *n_calls;
+ double ftmp;
+ for (k = 0; k != n; ++k) {
+ x1[k] += dx[k];
+ ftmp = func(n, x1, data); ++j;
+ if (ftmp < fx1) fx1 = ftmp;
+ else { /* search the opposite direction */
+ dx[k] = 0.0 - dx[k];
+ x1[k] += dx[k] + dx[k];
+ ftmp = func(n, x1, data); ++j;
+ if (ftmp < fx1) fx1 = ftmp;
+ else x1[k] -= dx[k]; /* back to the original x[k] */
+ }
+ }
+ *n_calls = j;
+ return fx1; /* here: fx1=f(n,x1) */
+}
+
+double kmin_hj(kmin_f func, int n, double *x, void *data, double r, double eps, int max_calls)
+{
+ double fx, fx1, *x1, *dx, radius;
+ int k, n_calls = 0;
+ x1 = (double*)calloc(n, sizeof(double));
+ dx = (double*)calloc(n, sizeof(double));
+ for (k = 0; k != n; ++k) { /* initial directions, based on MGJ */
+ dx[k] = fabs(x[k]) * r;
+ if (dx[k] == 0) dx[k] = r;
+ }
+ radius = r;
+ fx1 = fx = func(n, x, data); ++n_calls;
+ for (;;) {
+ memcpy(x1, x, n * sizeof(double)); /* x1 = x */
+ fx1 = __kmin_hj_aux(func, n, x1, data, fx, dx, &n_calls);
+ while (fx1 < fx) {
+ for (k = 0; k != n; ++k) {
+ double t = x[k];
+ dx[k] = x1[k] > x[k]? fabs(dx[k]) : 0.0 - fabs(dx[k]);
+ x[k] = x1[k];
+ x1[k] = x1[k] + x1[k] - t;
+ }
+ fx = fx1;
+ if (n_calls >= max_calls) break;
+ fx1 = func(n, x1, data); ++n_calls;
+ fx1 = __kmin_hj_aux(func, n, x1, data, fx1, dx, &n_calls);
+ if (fx1 >= fx) break;
+ for (k = 0; k != n; ++k)
+ if (fabs(x1[k] - x[k]) > .5 * fabs(dx[k])) break;
+ if (k == n) break;
+ }
+ if (radius >= eps) {
+ if (n_calls >= max_calls) break;
+ radius *= r;
+ for (k = 0; k != n; ++k) dx[k] *= r;
+ } else break; /* converge */
+ }
+ free(x1); free(dx);
+ return fx1;
+}
+
+// I copied this function somewhere several years ago with some of my modifications, but I forgot the source.
+double kmin_brent(kmin1_f func, double a, double b, void *data, double tol, double *xmin)
+{
+ double bound, u, r, q, fu, tmp, fa, fb, fc, c;
+ const double gold1 = 1.6180339887;
+ const double gold2 = 0.3819660113;
+ const double tiny = 1e-20;
+ const int max_iter = 100;
+
+ double e, d, w, v, mid, tol1, tol2, p, eold, fv, fw;
+ int iter;
+
+ fa = func(a, data); fb = func(b, data);
+ if (fb > fa) { // swap, such that f(a) > f(b)
+ tmp = a; a = b; b = tmp;
+ tmp = fa; fa = fb; fb = tmp;
+ }
+ c = b + gold1 * (b - a), fc = func(c, data); // golden section extrapolation
+ while (fb > fc) {
+ bound = b + 100.0 * (c - b); // the farthest point where we want to go
+ r = (b - a) * (fb - fc);
+ q = (b - c) * (fb - fa);
+ if (fabs(q - r) < tiny) { // avoid 0 denominator
+ tmp = q > r? tiny : 0.0 - tiny;
+ } else tmp = q - r;
+ u = b - ((b - c) * q - (b - a) * r) / (2.0 * tmp); // u is the parabolic extrapolation point
+ if ((b > u && u > c) || (b < u && u < c)) { // u lies between b and c
+ fu = func(u, data);
+ if (fu < fc) { // (b,u,c) bracket the minimum
+ a = b; b = u; fa = fb; fb = fu;
+ break;
+ } else if (fu > fb) { // (a,b,u) bracket the minimum
+ c = u; fc = fu;
+ break;
+ }
+ u = c + gold1 * (c - b); fu = func(u, data); // golden section extrapolation
+ } else if ((c > u && u > bound) || (c < u && u < bound)) { // u lies between c and bound
+ fu = func(u, data);
+ if (fu < fc) { // fb > fc > fu
+ b = c; c = u; u = c + gold1 * (c - b);
+ fb = fc; fc = fu; fu = func(u, data);
+ } else { // (b,c,u) bracket the minimum
+ a = b; b = c; c = u;
+ fa = fb; fb = fc; fc = fu;
+ break;
+ }
+ } else if ((u > bound && bound > c) || (u < bound && bound < c)) { // u goes beyond the bound
+ u = bound; fu = func(u, data);
+ } else { // u goes the other way around, use golden section extrapolation
+ u = c + gold1 * (c - b); fu = func(u, data);
+ }
+ a = b; b = c; c = u;
+ fa = fb; fb = fc; fc = fu;
+ }
+ if (a > c) u = a, a = c, c = u; // swap
+
+ // now, a<b<c, fa>fb and fb<fc, move on to Brent's algorithm
+ e = d = 0.0;
+ w = v = b; fv = fw = fb;
+ for (iter = 0; iter != max_iter; ++iter) {
+ mid = 0.5 * (a + c);
+ tol2 = 2.0 * (tol1 = tol * fabs(b) + tiny);
+ if (fabs(b - mid) <= (tol2 - 0.5 * (c - a))) {
+ *xmin = b; return fb; // found
+ }
+ if (fabs(e) > tol1) {
+ // related to parabolic interpolation
+ r = (b - w) * (fb - fv);
+ q = (b - v) * (fb - fw);
+ p = (b - v) * q - (b - w) * r;
+ q = 2.0 * (q - r);
+ if (q > 0.0) p = 0.0 - p;
+ else q = 0.0 - q;
+ eold = e; e = d;
+ if (fabs(p) >= fabs(0.5 * q * eold) || p <= q * (a - b) || p >= q * (c - b)) {
+ d = gold2 * (e = (b >= mid ? a - b : c - b));
+ } else {
+ d = p / q; u = b + d; // actual parabolic interpolation happens here
+ if (u - a < tol2 || c - u < tol2)
+ d = (mid > b)? tol1 : 0.0 - tol1;
+ }
+ } else d = gold2 * (e = (b >= mid ? a - b : c - b)); // golden section interpolation
+ u = fabs(d) >= tol1 ? b + d : b + (d > 0.0? tol1 : -tol1);
+ fu = func(u, data);
+ if (fu <= fb) { // u is the minimum point so far
+ if (u >= b) a = b;
+ else c = b;
+ v = w; w = b; b = u; fv = fw; fw = fb; fb = fu;
+ } else { // adjust (a,c) and (u,v,w)
+ if (u < b) a = u;
+ else c = u;
+ if (fu <= fw || w == b) {
+ v = w; w = u;
+ fv = fw; fw = fu;
+ } else if (fu <= fv || v == b || v == w) {
+ v = u; fv = fu;
+ }
+ }
+ }
+ *xmin = b;
+ return fb;
+}
--- /dev/null
+/*
+ Copyright (c) 2008, 2010 by Attractive Chaos <attractor@live.co.uk>
+
+ Permission is hereby granted, free of charge, to any person obtaining
+ a copy of this software and associated documentation files (the
+ "Software"), to deal in the Software without restriction, including
+ without limitation the rights to use, copy, modify, merge, publish,
+ distribute, sublicense, and/or sell copies of the Software, and to
+ permit persons to whom the Software is furnished to do so, subject to
+ the following conditions:
+
+ The above copyright notice and this permission notice shall be
+ included in all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ SOFTWARE.
+*/
+
+#ifndef KMIN_H
+#define KMIN_H
+
+#define KMIN_RADIUS 0.5
+#define KMIN_EPS 1e-7
+#define KMIN_MAXCALL 50000
+
+typedef double (*kmin_f)(int, double*, void*);
+typedef double (*kmin1_f)(double, void*);
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+ double kmin_hj(kmin_f func, int n, double *x, void *data, double r, double eps, int max_calls);
+ double kmin_brent(kmin1_f func, double a, double b, void *data, double tol, double *xmin);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
+++ /dev/null
-#include <stdlib.h>
-#include <string.h>
-#include <math.h>
-#include "bcf.h"
-
-static double g_q2p[256];
-
-#define LD_ITER_MAX 50
-#define LD_ITER_EPS 1e-4
-
-#define _G1(h, k) ((h>>1&1) + (k>>1&1))
-#define _G2(h, k) ((h&1) + (k&1))
-
-// 0: the previous site; 1: the current site
-static int freq_iter(int n, double *pdg[2], double f[4])
-{
- double ff[4];
- int i, k, h;
- memset(ff, 0, 4 * sizeof(double));
- for (i = 0; i < n; ++i) {
- double *p[2], sum, tmp;
- p[0] = pdg[0] + i * 3; p[1] = pdg[1] + i * 3;
- for (k = 0, sum = 0.; k < 4; ++k)
- for (h = 0; h < 4; ++h)
- sum += f[k] * f[h] * p[0][_G1(k,h)] * p[1][_G2(k,h)];
- for (k = 0; k < 4; ++k) {
- tmp = f[0] * (p[0][_G1(0,k)] * p[1][_G2(0,k)] + p[0][_G1(k,0)] * p[1][_G2(k,0)])
- + f[1] * (p[0][_G1(1,k)] * p[1][_G2(1,k)] + p[0][_G1(k,1)] * p[1][_G2(k,1)])
- + f[2] * (p[0][_G1(2,k)] * p[1][_G2(2,k)] + p[0][_G1(k,2)] * p[1][_G2(k,2)])
- + f[3] * (p[0][_G1(3,k)] * p[1][_G2(3,k)] + p[0][_G1(k,3)] * p[1][_G2(k,3)]);
- ff[k] += f[k] * tmp / sum;
- }
- }
- for (k = 0; k < 4; ++k) f[k] = ff[k] / (2 * n);
- return 0;
-}
-
-double bcf_ld_freq(const bcf1_t *b0, const bcf1_t *b1, double f[4])
-{
- const bcf1_t *b[2];
- uint8_t *PL[2];
- int i, j, PL_len[2], n_smpl;
- double *pdg[2], flast[4], r;
- // initialize g_q2p if necessary
- if (g_q2p[0] == 0.)
- for (i = 0; i < 256; ++i)
- g_q2p[i] = pow(10., -i / 10.);
- // initialize others
- if (b0->n_smpl != b1->n_smpl) return -1; // different number of samples
- n_smpl = b0->n_smpl;
- b[0] = b0; b[1] = b1;
- f[0] = f[1] = f[2] = f[3] = -1.;
- if (b[0]->n_alleles < 2 || b[1]->n_alleles < 2) return -1; // one allele only
- // set PL and PL_len
- for (j = 0; j < 2; ++j) {
- const bcf1_t *bj = b[j];
- for (i = 0; i < bj->n_gi; ++i) {
- if (bj->gi[i].fmt == bcf_str2int("PL", 2)) {
- PL[j] = (uint8_t*)bj->gi[i].data;
- PL_len[j] = bj->gi[i].len;
- break;
- }
- }
- if (i == bj->n_gi) return -1; // no PL
- }
- // fill pdg[2]
- pdg[0] = malloc(3 * n_smpl * sizeof(double));
- pdg[1] = malloc(3 * n_smpl * sizeof(double));
- for (j = 0; j < 2; ++j) {
- for (i = 0; i < n_smpl; ++i) {
- const uint8_t *pi = PL[j] + i * PL_len[j];
- double *p = pdg[j] + i * 3;
- p[0] = g_q2p[pi[2]]; p[1] = g_q2p[pi[1]]; p[2] = g_q2p[pi[0]];
- }
- }
- // iteration
- f[0] = f[1] = f[2] = f[3] = 0.25; // this is a really bad guess...
- for (j = 0; j < LD_ITER_MAX; ++j) {
- double eps = 0;
- memcpy(flast, f, 4 * sizeof(double));
- freq_iter(n_smpl, pdg, f);
- for (i = 0; i < 4; ++i) {
- double x = fabs(f[i] - flast[i]);
- if (x > eps) eps = x;
- }
- if (eps < LD_ITER_EPS) break;
- }
- // free
- free(pdg[0]); free(pdg[1]);
- { // calculate r^2
- double p[2], q[2], D;
- p[0] = f[0] + f[1]; q[0] = 1 - p[0];
- p[1] = f[0] + f[2]; q[1] = 1 - p[1];
- D = f[0] * f[3] - f[1] * f[2];
- r = sqrt(D * D / (p[0] * p[1] * q[0] * q[1]));
- // fprintf(stderr, "R(%lf,%lf,%lf,%lf)=%lf\n", f[0], f[1], f[2], f[3], r2);
- if (isnan(r)) r = -1.;
- }
- return r;
-}
-
-int bcf_main_pwld(int argc, char *argv[])
-{
- bcf_t *fp;
- bcf_hdr_t *h;
- bcf1_t **b, *b0;
- int i, j, m, n;
- double f[4];
- if (argc == 1) {
- fprintf(stderr, "Usage: bcftools pwld <in.bcf>\n");
- return 1;
- }
- fp = bcf_open(argv[1], "rb");
- h = bcf_hdr_read(fp);
- // read the entire BCF
- m = n = 0; b = 0;
- b0 = calloc(1, sizeof(bcf1_t));
- while (bcf_read(fp, h, b0) >= 0) {
- if (m == n) {
- m = m? m<<1 : 16;
- b = realloc(b, sizeof(void*) * m);
- }
- b[n] = calloc(1, sizeof(bcf1_t));
- bcf_cpy(b[n++], b0);
- }
- bcf_destroy(b0);
- // compute pair-wise r^2
- printf("%d\n", n); // the number of loci
- for (i = 0; i < n; ++i) {
- printf("%s:%d", h->ns[b[i]->tid], b[i]->pos + 1);
- for (j = 0; j < i; ++j) {
- double r = bcf_ld_freq(b[i], b[j], f);
- printf("\t%.3f", r*r);
- }
- printf("\t1.000\n");
- }
- // free
- for (i = 0; i < n; ++i) bcf_destroy(b[i]);
- free(b);
- bcf_hdr_destroy(h);
- bcf_close(fp);
- return 0;
-}
int bcfview(int argc, char *argv[]);
int bcf_main_index(int argc, char *argv[]);
-int bcf_main_pwld(int argc, char *argv[]);
#define BUF_SIZE 0x10000
return 0;
}
+int bcf_main_pwld(int argc, char *argv[])
+{
+ extern double bcf_pair_freq(const bcf1_t *b0, const bcf1_t *b1, double f[4]);
+ bcf_t *fp;
+ bcf_hdr_t *h;
+ bcf1_t **b, *b0;
+ int i, j, m, n;
+ double f[4];
+ if (argc == 1) {
+ fprintf(stderr, "Usage: bcftools pwld <in.bcf>\n");
+ return 1;
+ }
+ fp = bcf_open(argv[1], "rb");
+ h = bcf_hdr_read(fp);
+ // read the entire BCF
+ m = n = 0; b = 0;
+ b0 = calloc(1, sizeof(bcf1_t));
+ while (bcf_read(fp, h, b0) >= 0) {
+ if (m == n) {
+ m = m? m<<1 : 16;
+ b = realloc(b, sizeof(void*) * m);
+ }
+ b[n] = calloc(1, sizeof(bcf1_t));
+ bcf_cpy(b[n++], b0);
+ }
+ bcf_destroy(b0);
+ // compute pair-wise r^2
+ printf("%d\n", n); // the number of loci
+ for (i = 0; i < n; ++i) {
+ printf("%s:%d", h->ns[b[i]->tid], b[i]->pos + 1);
+ for (j = 0; j < i; ++j) {
+ double r = bcf_pair_freq(b[i], b[j], f);
+ printf("\t%.3f", r*r);
+ }
+ printf("\t1.000\n");
+ }
+ // free
+ for (i = 0; i < n; ++i) bcf_destroy(b[i]);
+ free(b);
+ bcf_hdr_destroy(h);
+ bcf_close(fp);
+ return 0;
+}
+
int main(int argc, char *argv[])
{
if (argc == 1) {
double *z, *zswap; // aux for afs
double *z1, *z2, *phi1, *phi2; // only calculated when n1 is set
double **hg; // hypergeometric distribution
+ double *lf; // log factorial
double t, t1, t2;
double *afs, *afs1; // afs: accumulative AFS; afs1: site posterior distribution
const uint8_t *PL; // point to PL
ma->z2 = calloc(ma->M + 1, sizeof(double));
ma->afs = calloc(ma->M + 1, sizeof(double));
ma->afs1 = calloc(ma->M + 1, sizeof(double));
+ ma->lf = calloc(ma->M + 1, sizeof(double));
for (i = 0; i < 256; ++i)
ma->q2p[i] = pow(10., -i / 10.);
+ for (i = 0; i <= ma->M; ++i) ma->lf[i] = lgamma(i + 1);
bcf_p1_init_prior(ma, MC_PTYPE_FULL, 1e-3); // the simplest prior
return ma;
}
{
if (ma) {
int k;
+ free(ma->lf);
if (ma->hg && ma->n1 > 0) {
for (k = 0; k <= 2*ma->n1; ++k) free(ma->hg[k]);
free(ma->hg);
if ((p[i]&0xf) == 0) break;
return i;
}
-// f0 is the reference allele frequency
-static double mc_freq_iter(double f0, const bcf_p1aux_t *ma, int beg, int end)
-{
- double f, f3[3];
- int i;
- f3[0] = (1.-f0)*(1.-f0); f3[1] = 2.*f0*(1.-f0); f3[2] = f0*f0;
- for (i = beg, f = 0.; i < end; ++i) {
- double *pdg;
- pdg = ma->pdg + i * 3;
- f += (pdg[1] * f3[1] + 2. * pdg[2] * f3[2])
- / (pdg[0] * f3[0] + pdg[1] * f3[1] + pdg[2] * f3[2]);
- }
- f /= (end - beg) * 2.;
- return f;
-}
-
-static double mc_gtfreq_iter(double g[3], const bcf_p1aux_t *ma, int beg, int end)
-{
- double err, gg[3];
- int i;
- gg[0] = gg[1] = gg[2] = 0.;
- for (i = beg; i < end; ++i) {
- double *pdg, sum, tmp[3];
- pdg = ma->pdg + i * 3;
- tmp[0] = pdg[0] * g[0]; tmp[1] = pdg[1] * g[1]; tmp[2] = pdg[2] * g[2];
- sum = (tmp[0] + tmp[1] + tmp[2]) * (end - beg);
- gg[0] += tmp[0] / sum; gg[1] += tmp[1] / sum; gg[2] += tmp[2] / sum;
- }
- err = fabs(gg[0] - g[0]) > fabs(gg[1] - g[1])? fabs(gg[0] - g[0]) : fabs(gg[1] - g[1]);
- err = err > fabs(gg[2] - g[2])? err : fabs(gg[2] - g[2]);
- g[0] = gg[0]; g[1] = gg[1]; g[2] = gg[2];
- return err;
-}
int bcf_p1_call_gt(const bcf_p1aux_t *ma, double f0, int k)
{
}
// chi2=(a+b+c+d)(ad-bc)^2/[(a+b)(c+d)(a+c)(b+d)]
-static inline double contrast2_aux(const bcf_p1aux_t *p1, double sum, int n1, int n2, int k1, int k2, double x[3])
+static inline double contrast2_aux(const bcf_p1aux_t *p1, double sum, int k1, int k2, double x[3])
{
double p = p1->phi[k1+k2] * p1->z1[k1] * p1->z2[k2] / sum * p1->hg[k1][k2];
+ int n1 = p1->n1, n2 = p1->n - p1->n1;
if (p < CONTRAST_TINY) return -1;
if (.5*k1/n1 < .5*k2/n2) x[1] += p;
else if (.5*k1/n1 > .5*k2/n2) x[2] += p;
p1->hg[k1][k2] = exp(lgamma(k1+k2+1) + lgamma(p1->M-k1-k2+1) - (lgamma(k1+1) + lgamma(k2+1) + lgamma(2*n1-k1+1) + lgamma(2*n2-k2+1) + tmp));
}
}
- { // compute sum1 and sum2
+ { // compute
long double suml = 0;
for (k = 0; k <= p1->M; ++k) suml += p1->phi[k] * p1->z[k];
sum = suml;
}
- { // get the mean k1 and k2
+ { // get the max k1 and k2
double max;
int max_k;
for (k = 0, max = 0, max_k = -1; k <= 2*n1; ++k) {
}
{ // We can do the following with one nested loop, but that is an O(N^2) thing. The following code block is much faster for large N.
double x[3], y;
- long double z = 0.;
- x[0] = x[1] = x[2] = 0;
+ long double z = 0., L[2];
+ x[0] = x[1] = x[2] = 0; L[0] = L[1] = 0;
for (k1 = k10; k1 >= 0; --k1) {
for (k2 = k20; k2 >= 0; --k2) {
- if ((y = contrast2_aux(p1, sum, n1, n2, k1, k2, x)) < 0) break;
+ if ((y = contrast2_aux(p1, sum, k1, k2, x)) < 0) break;
else z += y;
}
for (k2 = k20 + 1; k2 <= 2*n2; ++k2) {
- if ((y = contrast2_aux(p1, sum, n1, n2, k1, k2, x)) < 0) break;
+ if ((y = contrast2_aux(p1, sum, k1, k2, x)) < 0) break;
else z += y;
}
}
x[0] = x[1] = x[2] = 0;
for (k1 = k10 + 1; k1 <= 2*n1; ++k1) {
for (k2 = k20; k2 >= 0; --k2) {
- if ((y = contrast2_aux(p1, sum, n1, n2, k1, k2, x)) < 0) break;
+ if ((y = contrast2_aux(p1, sum, k1, k2, x)) < 0) break;
else z += y;
}
for (k2 = k20 + 1; k2 <= 2*n2; ++k2) {
- if ((y = contrast2_aux(p1, sum, n1, n2, k1, k2, x)) < 0) break;
+ if ((y = contrast2_aux(p1, sum, k1, k2, x)) < 0) break;
else z += y;
}
}
ret[0] += x[0]; ret[1] += x[1]; ret[2] += x[2];
- if (ret[0] + ret[1] + ret[2] < 0.99) { // in case of bad things happened
- ret[0] = ret[1] = ret[2] = 0;
+ if (ret[0] + ret[1] + ret[2] < 0.95) { // in case of bad things happened
+ ret[0] = ret[1] = ret[2] = 0; L[0] = L[1] = 0;
for (k1 = 0, z = 0.; k1 <= 2*n1; ++k1)
for (k2 = 0; k2 <= 2*n2; ++k2)
- if ((y = contrast2_aux(p1, sum, n1, n2, k1, k2, ret)) >= 0) z += y;
- if (ret[0] + ret[1] + ret[2] < 0.99) // It seems that this may be caused by floating point errors. I do not really understand why...
+ if ((y = contrast2_aux(p1, sum, k1, k2, ret)) >= 0) z += y;
+ if (ret[0] + ret[1] + ret[2] < 0.95) // It seems that this may be caused by floating point errors. I do not really understand why...
z = 1.0, ret[0] = ret[1] = ret[2] = 1./3;
}
return (double)z;
return sum / ma->M;
}
-int bcf_p1_cal(const bcf1_t *b, bcf_p1aux_t *ma, bcf_p1rst_t *rst)
+int bcf_p1_cal(const bcf1_t *b, int do_contrast, bcf_p1aux_t *ma, bcf_p1rst_t *rst)
{
int i, k;
long double sum = 0.;
rst->f_flat += k * p;
}
rst->f_flat /= ma->M;
- { // calculate f_em
- double flast = rst->f_flat;
- for (i = 0; i < MC_MAX_EM_ITER; ++i) {
- rst->f_em = mc_freq_iter(flast, ma, 0, ma->n);
- if (fabs(rst->f_em - flast) < MC_EM_EPS) break;
- flast = rst->f_em;
- }
- if (ma->n1 > 0 && ma->n1 < ma->n) {
- for (k = 0; k < 2; ++k) {
- flast = rst->f_em;
- for (i = 0; i < MC_MAX_EM_ITER; ++i) {
- rst->f_em2[k] = k? mc_freq_iter(flast, ma, ma->n1, ma->n) : mc_freq_iter(flast, ma, 0, ma->n1);
- if (fabs(rst->f_em2[k] - flast) < MC_EM_EPS) break;
- flast = rst->f_em2[k];
- }
- }
- }
- }
- { // compute g[3]
- rst->g[0] = (1. - rst->f_em) * (1. - rst->f_em);
- rst->g[1] = 2. * rst->f_em * (1. - rst->f_em);
- rst->g[2] = rst->f_em * rst->f_em;
- for (i = 0; i < MC_MAX_EM_ITER; ++i)
- if (mc_gtfreq_iter(rst->g, ma, 0, ma->n) < MC_EM_EPS) break;
- }
{ // estimate equal-tail credible interval (95% level)
int l, h;
double p;
rst->cil = (double)(ma->M - h) / ma->M; rst->cih = (double)(ma->M - l) / ma->M;
}
rst->cmp[0] = rst->cmp[1] = rst->cmp[2] = rst->p_chi2 = -1.0;
- if (rst->p_var > 0.1) // skip contrast2() if the locus is a strong non-variant
+ if (do_contrast && rst->p_var > 0.5) // skip contrast2() if the locus is a strong non-variant
rst->p_chi2 = contrast2(ma, rst->cmp);
return 0;
}
typedef struct {
int rank0, perm_rank; // NB: perm_rank is always set to -1 by bcf_p1_cal()
- double f_em, f_exp, f_flat, p_ref_folded, p_ref, p_var_folded, p_var;
+ double f_exp, f_flat, p_ref_folded, p_ref, p_var_folded, p_var;
double cil, cih;
- double cmp[3], p_chi2, f_em2[2]; // used by contrast2()
- double g[3];
+ double cmp[3], p_chi2; // used by contrast2()
} bcf_p1rst_t;
#define MC_PTYPE_FULL 1
void bcf_p1_init_prior(bcf_p1aux_t *ma, int type, double theta);
void bcf_p1_init_subprior(bcf_p1aux_t *ma, int type, double theta);
void bcf_p1_destroy(bcf_p1aux_t *ma);
- int bcf_p1_cal(const bcf1_t *b, bcf_p1aux_t *ma, bcf_p1rst_t *rst);
+ int bcf_p1_cal(const bcf1_t *b, int do_contrast, bcf_p1aux_t *ma, bcf_p1rst_t *rst);
int bcf_p1_call_gt(const bcf_p1aux_t *ma, double f0, int k);
void bcf_p1_dump_afs(bcf_p1aux_t *ma);
int bcf_p1_read_prior(bcf_p1aux_t *ma, const char *fn);
int bcf_p1_set_n1(bcf_p1aux_t *b, int n1);
void bcf_p1_set_folded(bcf_p1aux_t *p1a); // only effective when set_n1() is not called
+ int bcf_em1(const bcf1_t *b, int n1, int flag, double x[9]);
+
#ifdef __cplusplus
}
#endif