From a11820a502707a04d5218a6a1cc32f8858b76a32 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Fri, 23 Jul 2010 15:48:51 +0000 Subject: [PATCH] calculate posterior allele frequency --- bam_mcns.c | 39 +++++++++++++++++++++++++++++++++------ bam_mcns.h | 1 + bam_plcmd.c | 28 ++++++++++++++++++---------- 3 files changed, 52 insertions(+), 16 deletions(-) diff --git a/bam_mcns.c b/bam_mcns.c index dee2016..c1cdb90 100644 --- a/bam_mcns.c +++ b/bam_mcns.c @@ -39,7 +39,7 @@ mc_aux_t *mc_init(int n) // FIXME: assuming diploid ma->beta = calloc((2 * ma->n + 1) * 3, sizeof(double)); for (i = 0; i <= MC_MAX_SUMQ; ++i) ma->q2p[i] = pow(10., -i / 10.); - for (i = 0; i <= 2 * ma->n; ++i) { + for (i = 0; i <= 2 * ma->n; ++i) { // beta[k][g]=P(g|k/M) double *bi = ma->beta + 3 * i; double f = (double)i / (2 * ma->n); bi[0] = (1. - f) * (1. - f); @@ -153,18 +153,44 @@ double mc_freq_iter(double f0, mc_aux_t *ma) return f; } +double mc_freq_post(mc_aux_t *ma) +{ + int i, k; + long double f = 0.; + for (i = 0; i < ma->n; ++i) { + double *pdg = ma->pdg + i * 3; + long double y = 0., z = 0.; + for (k = 0; k <= ma->n * 2; ++k) { + double *bk = ma->beta + k * 3; +/* + int g; + double yk = 0., zk = 0.; + for (g = 0; g < 3; ++g) { + yk += g * pdg[g] * bk[g]; + zk += pdg[g] * bk[g]; + } + y += yk * ma->alpha[k]; + z += zk * ma->alpha[k]; +*/ + y += (pdg[1] * bk[1] + 2. * pdg[2] * bk[2]) * ma->alpha[k]; + z += (pdg[0] * bk[0] + pdg[1] * bk[1] + pdg[2] * bk[2]) * ma->alpha[k]; + } + f += y / z; + } + return f / ma->n / 2; +} + double mc_ref_prob(mc_aux_t *ma) { - int k, i, g; + int k, i; long double PD = 0., Pref = 0.; for (k = 0; k <= ma->n * 2; ++k) { long double x = 1.; double *bk = ma->beta + k * 3; for (i = 0; i < ma->n; ++i) { - double y = 0., *pdg = ma->pdg + i * 3; - for (g = 0; g < 3; ++g) - y += pdg[g] * bk[g]; - x *= y; + double *pdg = ma->pdg + i * 3; +// int g; double y=0.; for (g = 0; g < 3; ++g) y += pdg[g] * bk[g]; + x *= pdg[0] * bk[0] + pdg[1] * bk[1] + pdg[2] * bk[2]; } PD += x * ma->alpha[k]; } @@ -189,6 +215,7 @@ int mc_call_gt(const mc_aux_t *ma, double f0, int k) g[i] /= sum; if (g[i] > max) max = g[i], max_i = i; } +// printf("***%lg,%lg,%lg,%lg,%lg,%lg\n", g[0], g[1], g[2], pdg[0], pdg[1], pdg[2]); max = 1. - max; if (max < 1e-308) max = 1e-308; q = (int)(-3.434 * log(max) + .499); diff --git a/bam_mcns.h b/bam_mcns.h index 6d98de1..89879fb 100644 --- a/bam_mcns.h +++ b/bam_mcns.h @@ -14,6 +14,7 @@ extern "C" { void mc_destroy(mc_aux_t *ma); double mc_freq0(int ref, int *n, const bam_pileup1_t **plp, mc_aux_t *ma, int *_ref, int *alt); double mc_freq_iter(double f0, mc_aux_t *ma); + double mc_freq_post(mc_aux_t *ma); double mc_ref_prob(mc_aux_t *ma); int mc_call_gt(const mc_aux_t *ma, double f0, int k); diff --git a/bam_plcmd.c b/bam_plcmd.c index 2afebea..258b83f 100644 --- a/bam_plcmd.c +++ b/bam_plcmd.c @@ -521,6 +521,10 @@ static int mpileup(mplp_conf_t *conf, int n, char **fn) kstring_t s; s.l = s.m = 0; s.s = 0; puts("##fileformat=VCFv4.0"); + puts("##INFO="); + puts("##INFO="); + puts("##INFO="); + puts("##FILTER="); kputs("#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT", &s); for (i = 0; i < n; ++i) { const char *p; @@ -549,12 +553,12 @@ static int mpileup(mplp_conf_t *conf, int n, char **fn) ref_tid = tid; } if (conf->vcf) { - double f0, f, pref = -1.0; // the reference allele frequency + double f0, f, fpost, pref = -1.0; // the reference allele frequency int j, _ref, _alt, _ref0, depth, rms_q, _ref0b, is_var = 0, qref = 0; uint64_t sqr_sum; _ref0 = _ref0b = (ref && pos < ref_len)? ref[pos] : 'N'; _ref0 = bam_nt16_nt4_table[bam_nt16_table[_ref0]]; - f = f0 = mc_freq0(_ref0, n_plp, plp, ma, &_ref, &_alt); + f = f0 = fpost = mc_freq0(_ref0, n_plp, plp, ma, &_ref, &_alt); if (f >= 0.0) { double q, flast = f; for (j = 0; j < 10; ++j) { @@ -563,8 +567,10 @@ static int mpileup(mplp_conf_t *conf, int n, char **fn) if (fabs(f - flast) < 1e-3) break; flast = f; } - if (1. - f < 1e-4) f = 1.; pref = mc_ref_prob(ma); + fpost = mc_freq_post(ma); + if (1. - f < 1e-4) f = 1.; + if (1. - fpost < 1e-4) fpost = 1.; is_var = (pref < .5); q = is_var? pref : 1. - pref; if (q < 1e-308) q = 1e-308; @@ -589,13 +595,15 @@ static int mpileup(mplp_conf_t *conf, int n, char **fn) } } rms_q = (int)(sqrt((double)sqr_sum / depth) + .499); - printf("DP=%d;MQ=%d;AF=%.3lg", depth, rms_q, f0<0.?0.:1.-f); - printf("\tGT:GQ:DP"); - // output genotype information; FIXME: to be implmented... - for (i = 0; i < n; ++i) { - int x = mc_call_gt(ma, f, i); - printf("\t%c/%c:%d:%d", "10"[((x&3)==2)], "10"[((x&3)>0)], x>>2, n_plp[i]); - } + printf("DP=%d;MQ=%d", depth, rms_q); + if (fpost >= 0. && fpost <= 1.) printf(";AF=%.3lg", 1. - fpost); + if (f >= 0. && f <= 1.) printf(";AFEM=%.3lg", 1. - f); + if (fpost >= 0. && fpost <= 1.) { + for (i = 0; i < n; ++i) { + int x = mc_call_gt(ma, fpost, i); + printf("\t%c/%c:%d:%d", "10"[((x&3)==2)], "10"[((x&3)>0)], x>>2, n_plp[i]); + } + } else for (i = 0; i < n; ++i) printf("\t./.:0:0"); putchar('\n'); } else { printf("%s\t%d\t%c", h->target_name[tid], pos + 1, (ref && pos < ref_len)? ref[pos] : 'N'); -- 2.39.2