From 765ae9ea96a18b109c405871ebbfe9172888fdcb Mon Sep 17 00:00:00 2001
From: Heng Li <lh3@live.co.uk>
Date: Fri, 3 Dec 2010 22:12:30 +0000
Subject: [PATCH]  * remove "-f". Instead always compute consensus quality  *
 increase the upper limit of quality

---
 bcftools/call1.c | 16 ++++++++--------
 bcftools/prob1.c | 39 +++++++++++++++++----------------------
 bcftools/prob1.h |  2 +-
 3 files changed, 26 insertions(+), 31 deletions(-)

diff --git a/bcftools/call1.c b/bcftools/call1.c
index f293a6c..8ba7828 100644
--- a/bcftools/call1.c
+++ b/bcftools/call1.c
@@ -26,7 +26,6 @@ KSTREAM_INIT(gzFile, gzread, 16384)
 #define VC_CALL_GT 2048
 #define VC_ADJLD   4096
 #define VC_NO_INDEL 8192
-#define VC_FOLDED 16384
 
 typedef struct {
 	int flag, prior_type, n1;
@@ -151,7 +150,7 @@ static int update_bcf1(int n_smpl, bcf1_t *b, const bcf_p1aux_t *pa, const bcf_p
 {
 	kstring_t s;
 	int is_var = (pr->p_ref < pref);
-	double p_hwe, r = is_var? pr->p_ref : 1. - pr->p_ref;
+	double p_hwe, r = is_var? pr->p_ref : pr->p_var, fq;
 	anno16_t a;
 
 	p_hwe = pr->g[0] >= 0.? test_hwe(pr->g) : 1.0; // only do HWE g[] is calculated
@@ -166,6 +165,10 @@ static int update_bcf1(int n_smpl, bcf1_t *b, const bcf_p1aux_t *pa, const bcf_p
 //	ksprintf(&s, "AF1=%.4lg;AFE=%.4lg;CI95=%.4lg,%.4lg", 1.-pr->f_em, 1.-pr->f_exp, pr->cil, pr->cih);
 	ksprintf(&s, "AF1=%.4lg;CI95=%.4lg,%.4lg", 1.-pr->f_em, pr->cil, pr->cih);
 	ksprintf(&s, ";DP4=%d,%d,%d,%d;MQ=%d", a.d[0], a.d[1], a.d[2], a.d[3], a.mq);
+	fq = pr->p_ref_folded < 0.5? -4.343 * log(pr->p_ref_folded) : 4.343 * log(pr->p_var_folded);
+	if (fq < -999) fq = -999;
+	if (fq > 999) fq = 999;
+	ksprintf(&s, ";FQ=%.3lg", fq);
 	if (a.is_tested) {
 		if (pr->pc[0] >= 0.) ksprintf(&s, ";PC4=%lg,%lg,%lg,%lg", pr->pc[0], pr->pc[1], pr->pc[2], pr->pc[3]);
 		ksprintf(&s, ";PV4=%.2lg,%.2lg,%.2lg,%.2lg", a.p[0], a.p[1], a.p[2], a.p[3]);
@@ -176,8 +179,8 @@ static int update_bcf1(int n_smpl, bcf1_t *b, const bcf_p1aux_t *pa, const bcf_p
 	kputs(b->fmt, &s); kputc('\0', &s);
 	free(b->str);
 	b->m_str = s.m; b->l_str = s.l; b->str = s.s;
-	b->qual = r < 1e-100? 99 : -4.343 * log(r);
-	if (b->qual > 99) b->qual = 99;
+	b->qual = r < 1e-100? 999 : -4.343 * log(r);
+	if (b->qual > 999) b->qual = 999;
 	bcf_sync(b);
 	if (!is_var) bcf_shrink_alt(b, 1);
 	else if (!(flag&VC_KEEPALT))
@@ -217,9 +220,8 @@ int bcfview(int argc, char *argv[])
 	tid = begin = end = -1;
 	memset(&vc, 0, sizeof(viewconf_t));
 	vc.prior_type = vc.n1 = -1; vc.theta = 1e-3; vc.pref = 0.5; vc.indel_frac = -1.;
-	while ((c = getopt(argc, argv, "fN1:l:cHAGvbSuP:t:p:QgLi:I")) >= 0) {
+	while ((c = getopt(argc, argv, "N1:l:cHAGvbSuP:t:p:QgLi:I")) >= 0) {
 		switch (c) {
-		case 'f': vc.flag |= VC_FOLDED; break;
 		case '1': vc.n1 = atoi(optarg); break;
 		case 'l': vc.fn_list = strdup(optarg); break;
 		case 'N': vc.flag |= VC_ACGT_ONLY; break;
@@ -262,7 +264,6 @@ int bcfview(int argc, char *argv[])
 		fprintf(stderr, "         -Q        output the QCALL likelihood format\n");
 		fprintf(stderr, "         -L        calculate LD for adjacent sites\n");
 		fprintf(stderr, "         -I        skip indels\n");
-		fprintf(stderr, "         -f        reference-free variant calling\n");
 		fprintf(stderr, "         -1 INT    number of group-1 samples [0]\n");
 		fprintf(stderr, "         -l FILE   list of sites to output [all sites]\n");
 		fprintf(stderr, "         -t FLOAT  scaled substitution mutation rate [%.4lg]\n", vc.theta);
@@ -297,7 +298,6 @@ int bcfview(int argc, char *argv[])
 			bcf_p1_init_subprior(p1, vc.prior_type, vc.theta);
 		}
 		if (vc.indel_frac > 0.) bcf_p1_indel_prior(p1, vc.indel_frac); // otherwise use the default indel_frac
-		if (vc.flag & VC_FOLDED) bcf_p1_set_folded(p1);
 	}
 	if (vc.fn_list) hash = bcf_load_pos(vc.fn_list, h);
 	if (optind + 1 < argc && !(vc.flag&VC_VCFIN)) {
diff --git a/bcftools/prob1.c b/bcftools/prob1.c
index 8bf968f..c820f71 100644
--- a/bcftools/prob1.c
+++ b/bcftools/prob1.c
@@ -32,7 +32,7 @@ unsigned char seq_nt4_table[256] = {
 };
 
 struct __bcf_p1aux_t {
-	int n, M, n1, is_indel, is_folded;
+	int n, M, n1, is_indel;
 	double *q2p, *pdg; // pdg -> P(D|g)
 	double *phi, *phi_indel;
 	double *z, *zswap; // aux for afs
@@ -43,13 +43,6 @@ struct __bcf_p1aux_t {
 	int PL_len;
 };
 
-static void fold_array(int M, double *x)
-{
-	int k;
-	for (k = 0; k < M/2; ++k)
-		x[k] = x[M-k] = (x[k] + x[M-k]) / 2.;
-}
-
 void bcf_p1_indel_prior(bcf_p1aux_t *ma, double x)
 {
 	int i;
@@ -162,15 +155,6 @@ int bcf_p1_set_n1(bcf_p1aux_t *b, int n1)
 	return 0;
 }
 
-void bcf_p1_set_folded(bcf_p1aux_t *p1a)
-{
-	if (p1a->n1 < 0) {
-		p1a->is_folded = 1;
-		fold_array(p1a->M, p1a->phi);
-		fold_array(p1a->M, p1a->phi_indel);
-	}
-}
-
 void bcf_p1_destroy(bcf_p1aux_t *ma)
 {
 	if (ma) {
@@ -326,19 +310,28 @@ static void contrast(bcf_p1aux_t *ma, double pc[4]) // mc_cal_y() must be called
 	pc[1] = pc[1] == 1.? 99 : (int)(-4.343 * log(1. - pc[1]) + .499);
 }
 
-static double mc_cal_afs(bcf_p1aux_t *ma)
+static double mc_cal_afs(bcf_p1aux_t *ma, double *p_ref_folded, double *p_var_folded)
 {
 	int k;
-	long double sum = 0.;
+	long double sum = 0., sum2;
 	double *phi = ma->is_indel? ma->phi_indel : ma->phi;
 	memset(ma->afs1, 0, sizeof(double) * (ma->M + 1));
 	mc_cal_y(ma);
+	// compute AFS
 	for (k = 0, sum = 0.; k <= ma->M; ++k)
 		sum += (long double)phi[k] * ma->z[k];
 	for (k = 0; k <= ma->M; ++k) {
 		ma->afs1[k] = phi[k] * ma->z[k] / sum;
 		if (isnan(ma->afs1[k]) || isinf(ma->afs1[k])) return -1.;
 	}
+	// compute folded variant probability
+	for (k = 0, sum = 0.; k <= ma->M; ++k)
+		sum += (long double)(phi[k] + phi[ma->M - k]) / 2. * ma->z[k];
+	for (k = 1, sum2 = 0.; k < ma->M; ++k)
+		sum2 += (long double)(phi[k] + phi[ma->M - k]) / 2. * ma->z[k];
+	*p_var_folded = sum2 / sum;
+	*p_ref_folded = (phi[k] + phi[ma->M - k]) / 2. * (ma->z[ma->M] + ma->z[0]) / sum;
+	// the expected frequency
 	for (k = 0, sum = 0.; k <= ma->M; ++k) {
 		ma->afs[k] += ma->afs1[k];
 		sum += k * ma->afs1[k];
@@ -388,8 +381,11 @@ int bcf_p1_cal(bcf1_t *b, bcf_p1aux_t *ma, bcf_p1rst_t *rst)
 	if (b->n_alleles < 2) return -1; // FIXME: find a better solution
 	// 
 	rst->rank0 = cal_pdg(b, ma);
-	rst->f_exp = mc_cal_afs(ma);
-	rst->p_ref = ma->is_folded? ma->afs1[ma->M] + ma->afs1[0] : ma->afs1[ma->M];
+	rst->f_exp = mc_cal_afs(ma, &rst->p_ref_folded, &rst->p_var_folded);
+	rst->p_ref = ma->afs1[ma->M];
+	for (k = 0, sum = 0.; k < ma->M; ++k)
+		sum += ma->afs1[k];
+	rst->p_var = (double)sum;
 	// calculate f_flat and f_em
 	for (k = 0, sum = 0.; k <= ma->M; ++k)
 		sum += (long double)ma->z[k];
@@ -428,7 +424,6 @@ int bcf_p1_cal(bcf1_t *b, bcf_p1aux_t *ma, bcf_p1rst_t *rst)
 void bcf_p1_dump_afs(bcf_p1aux_t *ma)
 {
 	int k;
-	if (ma->is_folded) fold_array(ma->M, ma->afs);
 	fprintf(stderr, "[afs]");
 	for (k = 0; k <= ma->M; ++k)
 		fprintf(stderr, " %d:%.3lf", k, ma->afs[ma->M - k]);
diff --git a/bcftools/prob1.h b/bcftools/prob1.h
index 3827534..157ba46 100644
--- a/bcftools/prob1.h
+++ b/bcftools/prob1.h
@@ -8,7 +8,7 @@ typedef struct __bcf_p1aux_t bcf_p1aux_t;
 
 typedef struct {
 	int rank0;
-	double f_em, f_exp, f_flat, p_ref;
+	double f_em, f_exp, f_flat, p_ref_folded, p_ref, p_var_folded, p_var;
 	double cil, cih;
 	double pc[4];
 	double g[3];
-- 
2.39.5