#include <stdlib.h>
#include "bam_mcns.h"
-#define MC_MIN_QUAL 20
+#define MC_MIN_QUAL 13
#define MC_MAX_SUMQ 3000
#define MC_MAX_SUMQP 1e-300
int n, N;
int ref, alt;
double *q2p, *pdg; // pdg -> P(D|g)
+ double *alpha, *beta;
int *qsum, *bcnt;
};
+void mc_init_prior(mc_aux_t *ma, int type, double theta)
+{
+ int i;
+ if (type == MC_PTYPE_COND2) {
+ for (i = 0; i <= 2 * ma->n; ++i)
+ ma->alpha[i] = 2. * (i + 1) / (2 * ma->n + 1) / (2 * ma->n + 2);
+ } else {
+ double sum;
+ for (i = 0, sum = 0.; i < 2 * ma->n; ++i)
+ sum += (ma->alpha[i] = theta / (2 * ma->n - i));
+ ma->alpha[2 * ma->n] = 1. - sum;
+ }
+}
+
mc_aux_t *mc_init(int n) // FIXME: assuming diploid
{
mc_aux_t *ma;
ma->qsum = calloc(4 * ma->n, sizeof(int));
ma->bcnt = calloc(4 * ma->n, sizeof(int));
ma->pdg = calloc(3 * ma->n, sizeof(double));
+ ma->alpha = calloc(2 * ma->n + 1, sizeof(double));
+ ma->beta = calloc((2 * ma->n + 1) * 3, sizeof(double));
for (i = 0; i <= MC_MAX_SUMQ; ++i)
ma->q2p[i] = pow(10., -i / 10.);
+ for (i = 0; i <= 2 * ma->n; ++i) { // beta[k][g]=P(g|k/M)
+ double *bi = ma->beta + 3 * i;
+ double f = (double)i / (2 * ma->n);
+ bi[0] = (1. - f) * (1. - f);
+ bi[1] = 2 * f * (1. - f);
+ bi[2] = f * f;
+ }
+ mc_init_prior(ma, MC_PTYPE_FULL, 1e-3); // the simplest prior
return ma;
}
if (ma) {
free(ma->qsum); free(ma->bcnt);
free(ma->q2p); free(ma->pdg);
+ free(ma->alpha); free(ma->beta);
free(ma);
}
}
memset(ma->qsum, 0, sizeof(int) * 4 * ma->n);
memset(ma->bcnt, 0, sizeof(int) * 4 * ma->n);
for (j = 0; j < ma->n; ++j) {
- int *qsum = ma->qsum + j * 4;
+ int tmp, *qsum = ma->qsum + j * 4;
int *bcnt = ma->bcnt + j * 4;
- for (i = 0; i < n[j]; ++i) {
+ for (i = tmp = 0; i < n[j]; ++i) {
const bam_pileup1_t *p = plp[j] + i;
int q, b;
if (p->is_del || (p->b->core.flag&BAM_FUNMAP)) continue;
if (b > 3) continue; // N
qsum[b] += q;
++bcnt[b];
+ ++tmp;
}
}
}
qsum = ma->qsum + j * 4;
bcnt = ma->bcnt + j * 4;
pi[1] = 3 * (bcnt[ma->ref] + bcnt[ma->alt]);
- pi[0] = qsum[ma->alt];
- pi[2] = qsum[ma->ref];
+ pi[0] = qsum[ma->ref];
+ pi[2] = qsum[ma->alt];
for (i = 0; i < 3; ++i)
- pdg[i] = pi[i] < MC_MAX_SUMQ? MC_MAX_SUMQP : ma->q2p[pi[i]];
+ pdg[i] = pi[i] > MC_MAX_SUMQ? MC_MAX_SUMQP : ma->q2p[pi[i]];
}
}
-
+// return the reference allele frequency
double mc_freq0(int ref, int *n, const bam_pileup1_t **plp, mc_aux_t *ma, int *_ref, int *alt)
{
- int i, acnt[4], j;
- double f0;
+ int i, cnt;
+ double f;
sum_err(n, plp, ma);
set_allele(ref, ma);
cal_pdg(ma);
- acnt[0] = acnt[1] = acnt[2] = acnt[3] = 0;
- for (i = 0; i < ma->n; ++i)
- for (j = 0; j < 4; ++j)
- acnt[j] += ma->bcnt[i * 4 + j];
- f0 = acnt[ma->ref] + acnt[ma->alt] == 0? -1.
- : (double)acnt[ref] / (acnt[ma->ref] + acnt[ma->alt]);
*_ref = ma->ref; *alt = ma->alt;
- return f0;
+ for (i = cnt = 0, f = 0.; i < ma->n; ++i) {
+ int *bcnt = ma->bcnt + i * 4;
+ int x = bcnt[ma->ref] + bcnt[ma->alt];
+ if (x) {
+ ++cnt;
+ f += (double)bcnt[ma->ref] / x;
+ }
+ }
+ return f / cnt;
}
-
+// f0 is the reference allele frequency
double mc_freq_iter(double f0, mc_aux_t *ma)
{
double f, f3[3];
int i, j;
- f3[0] = f0*f0; f3[1] = 2.*f0*(1.-f0); f3[2] = (1.-f0)*(1.-f0);
+ f3[0] = (1.-f0)*(1.-f0); f3[1] = 2.*f0*(1.-f0); f3[2] = f0*f0;
for (i = 0, f = 0.; i < ma->n; ++i) {
double up, dn, *pdg;
pdg = ma->pdg + i * 3;
dn += pdg[j] * f3[j];
f += up / dn;
}
+ f /= ma->n * 2.;
return f;
}
+
+double mc_freq_post(mc_aux_t *ma)
+{
+ int i, k;
+ long double f = 0.;
+ for (i = 0; i < ma->n; ++i) {
+ double *pdg = ma->pdg + i * 3;
+ long double y = 0., z = 0.;
+ for (k = 0; k <= ma->n * 2; ++k) {
+ double *bk = ma->beta + k * 3;
+/*
+ int g;
+ double yk = 0., zk = 0.;
+ for (g = 0; g < 3; ++g) {
+ yk += g * pdg[g] * bk[g];
+ zk += pdg[g] * bk[g];
+ }
+ y += yk * ma->alpha[k];
+ z += zk * ma->alpha[k];
+*/
+ y += (pdg[1] * bk[1] + 2. * pdg[2] * bk[2]) * ma->alpha[k];
+ z += (pdg[0] * bk[0] + pdg[1] * bk[1] + pdg[2] * bk[2]) * ma->alpha[k];
+ }
+ f += y / z;
+ }
+ return f / ma->n / 2;
+}
+
+double mc_ref_prob(mc_aux_t *ma)
+{
+ int k, i;
+ long double PD = 0., Pref = 0.;
+ for (k = 0; k <= ma->n * 2; ++k) {
+ long double x = 1.;
+ double *bk = ma->beta + k * 3;
+ for (i = 0; i < ma->n; ++i) {
+ double *pdg = ma->pdg + i * 3;
+// int g; double y=0.; for (g = 0; g < 3; ++g) y += pdg[g] * bk[g];
+ x *= pdg[0] * bk[0] + pdg[1] * bk[1] + pdg[2] * bk[2];
+ }
+ PD += x * ma->alpha[k];
+ }
+ for (k = 0; k <= ma->n * 2; ++k) {
+ long double x = 1.0;
+ for (i = 0; i < ma->n; ++i)
+ x *= ma->pdg[i * 3 + 2] * ma->beta[k * 3 + 2];
+ Pref += x * ma->alpha[k];
+ }
+ return Pref / PD;
+}
+
+int mc_call_gt(const mc_aux_t *ma, double f0, int k)
+{
+ double sum, g[3];
+ double max, f3[3], *pdg = ma->pdg + k * 3;
+ int q, i, max_i;
+ f3[0] = (1.-f0)*(1.-f0); f3[1] = 2.*f0*(1.-f0); f3[2] = f0*f0;
+ for (i = 0, sum = 0.; i < 3; ++i)
+ sum += (g[i] = pdg[i] * f3[i]);
+ for (i = 0, max = -1., max_i = 0; i < 3; ++i) {
+ g[i] /= sum;
+ if (g[i] > max) max = g[i], max_i = i;
+ }
+// printf("***%lg,%lg,%lg,%lg,%lg,%lg\n", g[0], g[1], g[2], pdg[0], pdg[1], pdg[2]);
+ max = 1. - max;
+ if (max < 1e-308) max = 1e-308;
+ q = (int)(-3.434 * log(max) + .499);
+ if (q > 99) q = 99;
+ return q<<2|max_i;
+}