From 765ae9ea96a18b109c405871ebbfe9172888fdcb Mon Sep 17 00:00:00 2001 From: Heng Li Date: Fri, 3 Dec 2010 22:12:30 +0000 Subject: [PATCH] * remove "-f". Instead always compute consensus quality * increase the upper limit of quality --- bcftools/call1.c | 16 ++++++++-------- bcftools/prob1.c | 39 +++++++++++++++++---------------------- bcftools/prob1.h | 2 +- 3 files changed, 26 insertions(+), 31 deletions(-) diff --git a/bcftools/call1.c b/bcftools/call1.c index f293a6c..8ba7828 100644 --- a/bcftools/call1.c +++ b/bcftools/call1.c @@ -26,7 +26,6 @@ KSTREAM_INIT(gzFile, gzread, 16384) #define VC_CALL_GT 2048 #define VC_ADJLD 4096 #define VC_NO_INDEL 8192 -#define VC_FOLDED 16384 typedef struct { int flag, prior_type, n1; @@ -151,7 +150,7 @@ static int update_bcf1(int n_smpl, bcf1_t *b, const bcf_p1aux_t *pa, const bcf_p { kstring_t s; int is_var = (pr->p_ref < pref); - double p_hwe, r = is_var? pr->p_ref : 1. - pr->p_ref; + double p_hwe, r = is_var? pr->p_ref : pr->p_var, fq; anno16_t a; p_hwe = pr->g[0] >= 0.? test_hwe(pr->g) : 1.0; // only do HWE g[] is calculated @@ -166,6 +165,10 @@ static int update_bcf1(int n_smpl, bcf1_t *b, const bcf_p1aux_t *pa, const bcf_p // ksprintf(&s, "AF1=%.4lg;AFE=%.4lg;CI95=%.4lg,%.4lg", 1.-pr->f_em, 1.-pr->f_exp, pr->cil, pr->cih); ksprintf(&s, "AF1=%.4lg;CI95=%.4lg,%.4lg", 1.-pr->f_em, pr->cil, pr->cih); ksprintf(&s, ";DP4=%d,%d,%d,%d;MQ=%d", a.d[0], a.d[1], a.d[2], a.d[3], a.mq); + fq = pr->p_ref_folded < 0.5? -4.343 * log(pr->p_ref_folded) : 4.343 * log(pr->p_var_folded); + if (fq < -999) fq = -999; + if (fq > 999) fq = 999; + ksprintf(&s, ";FQ=%.3lg", fq); if (a.is_tested) { if (pr->pc[0] >= 0.) ksprintf(&s, ";PC4=%lg,%lg,%lg,%lg", pr->pc[0], pr->pc[1], pr->pc[2], pr->pc[3]); ksprintf(&s, ";PV4=%.2lg,%.2lg,%.2lg,%.2lg", a.p[0], a.p[1], a.p[2], a.p[3]); @@ -176,8 +179,8 @@ static int update_bcf1(int n_smpl, bcf1_t *b, const bcf_p1aux_t *pa, const bcf_p kputs(b->fmt, &s); kputc('\0', &s); free(b->str); b->m_str = s.m; b->l_str = s.l; b->str = s.s; - b->qual = r < 1e-100? 99 : -4.343 * log(r); - if (b->qual > 99) b->qual = 99; + b->qual = r < 1e-100? 999 : -4.343 * log(r); + if (b->qual > 999) b->qual = 999; bcf_sync(b); if (!is_var) bcf_shrink_alt(b, 1); else if (!(flag&VC_KEEPALT)) @@ -217,9 +220,8 @@ int bcfview(int argc, char *argv[]) tid = begin = end = -1; memset(&vc, 0, sizeof(viewconf_t)); vc.prior_type = vc.n1 = -1; vc.theta = 1e-3; vc.pref = 0.5; vc.indel_frac = -1.; - while ((c = getopt(argc, argv, "fN1:l:cHAGvbSuP:t:p:QgLi:I")) >= 0) { + while ((c = getopt(argc, argv, "N1:l:cHAGvbSuP:t:p:QgLi:I")) >= 0) { switch (c) { - case 'f': vc.flag |= VC_FOLDED; break; case '1': vc.n1 = atoi(optarg); break; case 'l': vc.fn_list = strdup(optarg); break; case 'N': vc.flag |= VC_ACGT_ONLY; break; @@ -262,7 +264,6 @@ int bcfview(int argc, char *argv[]) fprintf(stderr, " -Q output the QCALL likelihood format\n"); fprintf(stderr, " -L calculate LD for adjacent sites\n"); fprintf(stderr, " -I skip indels\n"); - fprintf(stderr, " -f reference-free variant calling\n"); fprintf(stderr, " -1 INT number of group-1 samples [0]\n"); fprintf(stderr, " -l FILE list of sites to output [all sites]\n"); fprintf(stderr, " -t FLOAT scaled substitution mutation rate [%.4lg]\n", vc.theta); @@ -297,7 +298,6 @@ int bcfview(int argc, char *argv[]) bcf_p1_init_subprior(p1, vc.prior_type, vc.theta); } if (vc.indel_frac > 0.) bcf_p1_indel_prior(p1, vc.indel_frac); // otherwise use the default indel_frac - if (vc.flag & VC_FOLDED) bcf_p1_set_folded(p1); } if (vc.fn_list) hash = bcf_load_pos(vc.fn_list, h); if (optind + 1 < argc && !(vc.flag&VC_VCFIN)) { diff --git a/bcftools/prob1.c b/bcftools/prob1.c index 8bf968f..c820f71 100644 --- a/bcftools/prob1.c +++ b/bcftools/prob1.c @@ -32,7 +32,7 @@ unsigned char seq_nt4_table[256] = { }; struct __bcf_p1aux_t { - int n, M, n1, is_indel, is_folded; + int n, M, n1, is_indel; double *q2p, *pdg; // pdg -> P(D|g) double *phi, *phi_indel; double *z, *zswap; // aux for afs @@ -43,13 +43,6 @@ struct __bcf_p1aux_t { int PL_len; }; -static void fold_array(int M, double *x) -{ - int k; - for (k = 0; k < M/2; ++k) - x[k] = x[M-k] = (x[k] + x[M-k]) / 2.; -} - void bcf_p1_indel_prior(bcf_p1aux_t *ma, double x) { int i; @@ -162,15 +155,6 @@ int bcf_p1_set_n1(bcf_p1aux_t *b, int n1) return 0; } -void bcf_p1_set_folded(bcf_p1aux_t *p1a) -{ - if (p1a->n1 < 0) { - p1a->is_folded = 1; - fold_array(p1a->M, p1a->phi); - fold_array(p1a->M, p1a->phi_indel); - } -} - void bcf_p1_destroy(bcf_p1aux_t *ma) { if (ma) { @@ -326,19 +310,28 @@ static void contrast(bcf_p1aux_t *ma, double pc[4]) // mc_cal_y() must be called pc[1] = pc[1] == 1.? 99 : (int)(-4.343 * log(1. - pc[1]) + .499); } -static double mc_cal_afs(bcf_p1aux_t *ma) +static double mc_cal_afs(bcf_p1aux_t *ma, double *p_ref_folded, double *p_var_folded) { int k; - long double sum = 0.; + long double sum = 0., sum2; double *phi = ma->is_indel? ma->phi_indel : ma->phi; memset(ma->afs1, 0, sizeof(double) * (ma->M + 1)); mc_cal_y(ma); + // compute AFS for (k = 0, sum = 0.; k <= ma->M; ++k) sum += (long double)phi[k] * ma->z[k]; for (k = 0; k <= ma->M; ++k) { ma->afs1[k] = phi[k] * ma->z[k] / sum; if (isnan(ma->afs1[k]) || isinf(ma->afs1[k])) return -1.; } + // compute folded variant probability + for (k = 0, sum = 0.; k <= ma->M; ++k) + sum += (long double)(phi[k] + phi[ma->M - k]) / 2. * ma->z[k]; + for (k = 1, sum2 = 0.; k < ma->M; ++k) + sum2 += (long double)(phi[k] + phi[ma->M - k]) / 2. * ma->z[k]; + *p_var_folded = sum2 / sum; + *p_ref_folded = (phi[k] + phi[ma->M - k]) / 2. * (ma->z[ma->M] + ma->z[0]) / sum; + // the expected frequency for (k = 0, sum = 0.; k <= ma->M; ++k) { ma->afs[k] += ma->afs1[k]; sum += k * ma->afs1[k]; @@ -388,8 +381,11 @@ int bcf_p1_cal(bcf1_t *b, bcf_p1aux_t *ma, bcf_p1rst_t *rst) if (b->n_alleles < 2) return -1; // FIXME: find a better solution // rst->rank0 = cal_pdg(b, ma); - rst->f_exp = mc_cal_afs(ma); - rst->p_ref = ma->is_folded? ma->afs1[ma->M] + ma->afs1[0] : ma->afs1[ma->M]; + rst->f_exp = mc_cal_afs(ma, &rst->p_ref_folded, &rst->p_var_folded); + rst->p_ref = ma->afs1[ma->M]; + for (k = 0, sum = 0.; k < ma->M; ++k) + sum += ma->afs1[k]; + rst->p_var = (double)sum; // calculate f_flat and f_em for (k = 0, sum = 0.; k <= ma->M; ++k) sum += (long double)ma->z[k]; @@ -428,7 +424,6 @@ int bcf_p1_cal(bcf1_t *b, bcf_p1aux_t *ma, bcf_p1rst_t *rst) void bcf_p1_dump_afs(bcf_p1aux_t *ma) { int k; - if (ma->is_folded) fold_array(ma->M, ma->afs); fprintf(stderr, "[afs]"); for (k = 0; k <= ma->M; ++k) fprintf(stderr, " %d:%.3lf", k, ma->afs[ma->M - k]); diff --git a/bcftools/prob1.h b/bcftools/prob1.h index 3827534..157ba46 100644 --- a/bcftools/prob1.h +++ b/bcftools/prob1.h @@ -8,7 +8,7 @@ typedef struct __bcf_p1aux_t bcf_p1aux_t; typedef struct { int rank0; - double f_em, f_exp, f_flat, p_ref; + double f_em, f_exp, f_flat, p_ref_folded, p_ref, p_var_folded, p_var; double cil, cih; double pc[4]; double g[3]; -- 2.39.2