From f27c5d351576e00cece3708adcb386da61fe7b7f Mon Sep 17 00:00:00 2001 From: Heng Li Date: Sat, 7 Aug 2010 02:42:54 +0000 Subject: [PATCH] * bcftools: add HWE (no testing for now) * record end dist in a 2x2 table, not avg, std any more --- bam2bcf.c | 32 ++++++++++---------------------- bam2bcf.h | 6 ++---- bcftools/fet.c | 34 ++++++++++++++++------------------ bcftools/prob1.c | 27 +++++++++++++++++++++++++++ bcftools/prob1.h | 2 +- bcftools/vcfout.c | 2 +- 6 files changed, 57 insertions(+), 46 deletions(-) diff --git a/bam2bcf.c b/bam2bcf.c index 5ad81c0..1f7a0c0 100644 --- a/bam2bcf.c +++ b/bam2bcf.c @@ -5,7 +5,7 @@ #include "bam2bcf.h" #include "bcftools/bcf.h" -#define MAX_END_DIST 30 +#define END_DIST_THRES 11 extern void ks_introsort_uint32_t(size_t n, uint32_t a[]); @@ -108,10 +108,8 @@ int bcf_call_glfgen(int _n, const bam_pileup1_t *pl, int ref_base /*4-bit*/, bcf // calculate min_dist min_dist = p->b->core.l_qseq - 1 - p->qpos; if (min_dist > p->qpos) min_dist = p->qpos; - if (min_dist > MAX_END_DIST) min_dist = MAX_END_DIST; - k >>= 1; - r->dsum[k] += min_dist; - r->d2sum[k] += min_dist * min_dist; + k = (k&2) | (min_dist <= END_DIST_THRES); + ++r->ed[k]; } ks_introsort_uint32_t(n, bca->info); r->depth = n; @@ -218,25 +216,12 @@ int bcf_call_combine(int n, const bcf_callret1_t *calls, int ref_base /*4-bit*/, call->shift = (int)(sum_min + .499); } memset(call->d, 0, 4 * sizeof(int)); - call->davg[0] = call->davg[1] = call->dstd[0] = call->dstd[1] = 0.; + memset(call->ed, 0, 4 * sizeof(int)); for (i = call->depth = 0, tmp = 0; i < n; ++i) { call->depth += calls[i].depth; - for (j = 0; j < 4; ++j) call->d[j] += calls[i].d[j]; - for (j = 0; j < 2; ++j) { - call->davg[j] += calls[i].dsum[j]; - call->dstd[j] += calls[i].d2sum[j]; - } + for (j = 0; j < 4; ++j) call->d[j] += calls[i].d[j], call->ed[j] += calls[i].ed[j]; tmp += calls[i].sum_Q2; } - for (j = 0; j < 2; ++j) { - int x = call->d[j*2] + call->d[j*2+1]; - if (x == 0) { - call->davg[j] = call->dstd[j] = 0.; - } else { - call->davg[j] /= x; - call->dstd[j] = sqrt(call->dstd[j] / x - call->davg[j] * call->davg[j]); - } - } call->rmsQ = (int)(sqrt((double)tmp / call->depth) + .499); return 0; } @@ -264,8 +249,11 @@ int bcf_call2bcf(int tid, int pos, bcf_call_t *bc, bcf1_t *b) if (i) kputc(',', &s); kputw(bc->d[i], &s); } - if (bc->d[2] + bc->d[3] > 1) - ksprintf(&s, ";MED=%.1lf,%.1lf,%.1lf,%.1lf", bc->davg[0], bc->dstd[0], bc->davg[1], bc->dstd[1]); + kputs(";ED4=", &s); + for (i = 0; i < 4; ++i) { + if (i) kputc(',', &s); + kputw(bc->ed[i], &s); + } kputc('\0', &s); // FMT kputs("PL", &s); kputc('\0', &s); diff --git a/bam2bcf.h b/bam2bcf.h index 73624c0..4038f39 100644 --- a/bam2bcf.h +++ b/bam2bcf.h @@ -8,8 +8,7 @@ struct __bcf_callaux_t; typedef struct __bcf_callaux_t bcf_callaux_t; typedef struct { - int depth, d[4]; - int dsum[2], d2sum[2]; + int depth, d[4], ed[4]; uint64_t sum_Q2; float p[25], esum[5]; } bcf_callret1_t; @@ -17,8 +16,7 @@ typedef struct { typedef struct { int a[5]; // alleles: ref, alt, alt2, alt3 int n, n_alleles, shift, ori_ref, unseen; - int d[4], depth, rmsQ; - double davg[2], dstd[2]; + int d[4], ed[4], depth, rmsQ; uint8_t *PL; } bcf_call_t; diff --git a/bcftools/fet.c b/bcftools/fet.c index ed81a81..845f8c2 100644 --- a/bcftools/fet.c +++ b/bcftools/fet.c @@ -1,6 +1,11 @@ #include #include +/* This program is implemented with ideas from this web page: + * + * http://www.langsrud.com/fisher.htm + */ + // log\binom{n}{k} static double lbinom(int n, int k) { @@ -31,13 +36,13 @@ static double hypergeo_acc(int n11, int n1_, int n_1, int n, hgacc_t *aux) aux->n11 = n11; aux->n1_ = n1_; aux->n_1 = n_1; aux->n = n; } else { // then only n11 changed; the rest fixed if (n11%11 && n11 + aux->n - aux->n1_ - aux->n_1) { - if (n11 == aux->n11 + 1) { + if (n11 == aux->n11 + 1) { // incremental aux->p *= (double)(aux->n1_ - aux->n11) / n11 * (aux->n_1 - aux->n11) / (n11 + aux->n - aux->n1_ - aux->n_1); aux->n11 = n11; return aux->p; } - if (n11 == aux->n11 - 1) { + if (n11 == aux->n11 - 1) { // incremental aux->p *= (double)aux->n11 / (aux->n1_ - n11) * (aux->n11 + aux->n - aux->n1_ - aux->n_1) / (aux->n_1 - n11); aux->n11 = n11; @@ -58,29 +63,22 @@ double kt_fisher_exact(int n11, int n12, int n21, int n22, double *_left, double int n1_, n_1, n; n1_ = n11 + n12; n_1 = n11 + n21; n = n11 + n12 + n21 + n22; // calculate n1_, n_1 and n - max = (n_1 < n1_) ? n_1 : n1_; // for right tail - min = (n1_ + n_1 - n < 0) ? 0 : (n1_ + n_1 - n < 0); // for left tail - if (min == max) { // no need to do test - *two = *_left = *_right = 1.; - return 1.; - } - // the probability of the current table - q = hypergeo_acc(n11, n1_, n_1, n, &aux); + max = (n_1 < n1_) ? n_1 : n1_; // max n11, for right tail + min = (n1_ + n_1 - n < 0) ? 0 : (n1_ + n_1 - n < 0); // min n11, for left tail + *two = *_left = *_right = 1.; + if (min == max) return 1.; // no need to do test + q = hypergeo_acc(n11, n1_, n_1, n, &aux); // the probability of the current table // left tail p = hypergeo_acc(min, 0, 0, 0, &aux); - for (left = 0., i = min + 1; p < 0.99999999 * q; ++i) { - left += p; - p = hypergeo_acc(i, 0, 0, 0, &aux); - } + for (left = 0., i = min + 1; p < 0.99999999 * q; ++i) // loop until underflow + left += p, p = hypergeo_acc(i, 0, 0, 0, &aux); --i; if (p < 1.00000001 * q) left += p; else --i; // right tail p = hypergeo_acc(max, 0, 0, 0, &aux); - for (right = 0., j = max - 1; p < 0.99999999 * q; --j) { - right += p; - p = hypergeo_acc(j, 0, 0, 0, &aux); - } + for (right = 0., j = max - 1; p < 0.99999999 * q; --j) // loop until underflow + right += p, p = hypergeo_acc(j, 0, 0, 0, &aux); if (p < 1.00000001 * q) right += p; else ++j; // two-tail diff --git a/bcftools/prob1.c b/bcftools/prob1.c index c5896f7..6afd664 100644 --- a/bcftools/prob1.c +++ b/bcftools/prob1.c @@ -189,6 +189,32 @@ static double mc_cal_afs(bcf_p1aux_t *ma) return sum / ma->M; } +static long double p1_cal_g3(bcf_p1aux_t *p1a, double g[3]) +{ + long double pd = 0., g2[3]; + int i, k; + memset(g2, 0, sizeof(long double) * 3); + for (k = 0; k < p1a->M; ++k) { + double f = (double)k / p1a->M, f3[3], g1[3]; + long double z = 1.; + g1[0] = g1[1] = g1[2] = 0.; + f3[0] = (1. - f) * (1. - f); f3[1] = 2. * f * (1. - f); f3[2] = f * f; + for (i = 0; i < p1a->n; ++i) { + double *pdg = p1a->pdg + i * 3; + double x = pdg[0] * f3[0] + pdg[1] * f3[1] + pdg[2] * f3[2]; + z *= x; + g1[0] += pdg[0] * f3[0] / x; + g1[1] += pdg[1] * f3[1] / x; + g1[2] += pdg[2] * f3[2] / x; + } + pd += p1a->phi[k] * z; + for (i = 0; i < 3; ++i) + g2[i] += p1a->phi[k] * z * g1[i]; + } + for (i = 0; i < 3; ++i) g[i] = g2[i] / pd; + return pd; +} + int bcf_p1_cal(bcf1_t *b, bcf_p1aux_t *ma, bcf_p1rst_t *rst) { int i, k; @@ -223,6 +249,7 @@ int bcf_p1_cal(bcf1_t *b, bcf_p1aux_t *ma, bcf_p1rst_t *rst) flast = rst->f_em; } } + p1_cal_g3(ma, rst->g); return 0; } diff --git a/bcftools/prob1.h b/bcftools/prob1.h index d34a75f..96beb04 100644 --- a/bcftools/prob1.h +++ b/bcftools/prob1.h @@ -8,7 +8,7 @@ typedef struct __bcf_p1aux_t bcf_p1aux_t; typedef struct { int rank0; - double f_em, f_exp, f_flat, p_ref; + double f_em, f_exp, f_flat, p_ref, g[3]; } bcf_p1rst_t; #define MC_PTYPE_FULL 1 diff --git a/bcftools/vcfout.c b/bcftools/vcfout.c index 075fb61..af9b0c9 100644 --- a/bcftools/vcfout.c +++ b/bcftools/vcfout.c @@ -70,7 +70,7 @@ static int update_bcf1(bcf1_t *b, const bcf_p1aux_t *pa, const bcf_p1rst_t *pr, kputc('\0', &s); kputc('\0', &s); kputs(b->info, &s); if (b->info[0]) kputc(';', &s); - ksprintf(&s, "AF1=%.3lf;AFE=%.3lf", 1.-pr->f_em, 1.-pr->f_exp); + ksprintf(&s, "AF1=%.3lf;AFE=%.3lf;HWE=%.3lf,%.3lf,%.3lf", 1.-pr->f_em, 1.-pr->f_exp, pr->g[0], pr->g[1], pr->g[2]); kputc('\0', &s); kputs(b->fmt, &s); kputc('\0', &s); free(b->str); -- 2.39.2