From e561d6e1a3c58df4de0e2856c2cfb9b2b68e7f11 Mon Sep 17 00:00:00 2001
From: Heng Li <lh3@live.co.uk>
Date: Tue, 9 Nov 2010 15:45:33 +0000
Subject: [PATCH]  * samtools-0.1.9-6 (r803)  * mpileup: apply homopolymer
 correction when calculating GL, instead of before  * bcftools: apply a
 different prior to indels

---
 bam2bcf.c        |  6 ++++--
 bam2bcf_indel.c  | 17 ++++++++---------
 bamtk.c          |  2 +-
 bcftools/bcf.c   | 10 ++++++++++
 bcftools/bcf.h   |  2 ++
 bcftools/prob1.c | 25 +++++++++++++++++++------
 6 files changed, 44 insertions(+), 18 deletions(-)

diff --git a/bam2bcf.c b/bam2bcf.c
index f7c48aa..1ba288c 100644
--- a/bam2bcf.c
+++ b/bam2bcf.c
@@ -53,11 +53,13 @@ int bcf_call_glfgen(int _n, const bam_pileup1_t *pl, int ref_base, bcf_callaux_t
 	memset(r, 0, sizeof(bcf_callret1_t));
 	for (i = n = 0; i < _n; ++i) {
 		const bam_pileup1_t *p = pl + i;
-		int q, b, mapQ, baseQ, is_diff, min_dist;
+		int q, b, mapQ, baseQ, is_diff, min_dist, seqQ;
 		// set base
 		if (p->is_del || p->is_refskip || (p->b->core.flag&BAM_FUNMAP)) continue;
 		baseQ = q = is_indel? p->aux&0xff : (int)bam1_qual(p->b)[p->qpos]; // base/indel quality
+		seqQ = is_indel? (p->aux>>8&0xff) : 99;
 		if (q < bca->min_baseQ) continue;
+		if (q > seqQ) q = seqQ;
 		mapQ = p->b->core.qual < bca->capQ? p->b->core.qual : bca->capQ;
 		if (q > mapQ) q = mapQ;
 		if (q > 63) q = 63;
@@ -67,7 +69,7 @@ int bcf_call_glfgen(int _n, const bam_pileup1_t *pl, int ref_base, bcf_callaux_t
 			b = bam_nt16_nt4_table[b? b : ref_base]; // b is the 2-bit base
 			is_diff = (ref4 < 4 && b == ref4)? 0 : 1;
 		} else {
-			b = p->aux>>8&0x3f;
+			b = p->aux>>16&0x3f;
 			is_diff = (b != 0);
 		}
 		bca->bases[n++] = q<<5 | (int)bam1_strand(p->b)<<4 | b;
diff --git a/bam2bcf_indel.c b/bam2bcf_indel.c
index 0632eef..2254cf0 100644
--- a/bam2bcf_indel.c
+++ b/bam2bcf_indel.c
@@ -228,7 +228,7 @@ int bcf_call_gap_prep(int n, int *n_plp, bam_pileup1_t **plp, int pos, bcf_calla
 		for (s = K = 0; s < n; ++s) {
 			for (i = 0; i < n_plp[s]; ++i, ++K) {
 				bam_pileup1_t *p = plp[s] + i;
-				int *sct = &score[K*n_types], indelQ;
+				int *sct = &score[K*n_types], indelQ, seqQ;
 				for (t = 0; t < n_types; ++t) sc[t] = sct[t]<<6 | t;
 				for (t = 1; t < n_types; ++t) // insertion sort
 					for (j = t; j > 0 && sc[j] < sc[j-1]; --j)
@@ -241,16 +241,15 @@ int bcf_call_gap_prep(int n, int *n_plp, bam_pileup1_t **plp, int pos, bcf_calla
 				 */
 				if ((sc[0]&0x3f) == ref_type) {
 					indelQ = (sc[1]>>6) - (sc[0]>>6);
-					tmp = est_seqQ(bca, types[sc[1]&0x3f], l_run);
+					seqQ = est_seqQ(bca, types[sc[1]&0x3f], l_run);
 				} else {
 					for (t = 0; t < n_types; ++t) // look for the reference type
 						if ((sc[t]&0x3f) == ref_type) break;
 					indelQ = (sc[t]>>6) - (sc[0]>>6);
-					tmp = est_seqQ(bca, types[sc[0]&0x3f], l_run);
+					seqQ = est_seqQ(bca, types[sc[0]&0x3f], l_run);
 				}
-				if (indelQ > tmp) indelQ = tmp;
-				p->aux = (sc[0]&0x3f)<<8 | indelQ;
-				sumq[sc[0]&0x3f] += indelQ;
+				p->aux = (sc[0]&0x3f)<<16 | seqQ<<8 | indelQ;
+				sumq[sc[0]&0x3f] += indelQ < seqQ? indelQ : seqQ;
 //				fprintf(stderr, "pos=%d read=%d:%d name=%s call=%d q=%d\n", pos, s, i, bam1_qname(p->b), types[sc[0]&0x3f], indelQ);
 			}
 		}
@@ -278,11 +277,11 @@ int bcf_call_gap_prep(int n, int *n_plp, bam_pileup1_t **plp, int pos, bcf_calla
 		for (s = K = 0; s < n; ++s) {
 			for (i = 0; i < n_plp[s]; ++i, ++K) {
 				bam_pileup1_t *p = plp[s] + i;
-				int x = types[p->aux>>8&0x3f];
+				int x = types[p->aux>>16&0x3f];
 				for (j = 0; j < 4; ++j)
 					if (x == bca->indel_types[j]) break;
-				p->aux = j<<8 | (j == 4? 0 : (p->aux&0xff));
-//				fprintf(stderr, "pos=%d read=%d:%d name=%s call=%d q=%d\n", pos, s, i, bam1_qname(p->b), p->aux>>8&63, p->aux&0xff);
+				p->aux = j<<16 | (j == 4? 0 : (p->aux&0xffff));
+//				fprintf(stderr, "pos=%d read=%d:%d name=%s call=%d q=%d\n", pos, s, i, bam1_qname(p->b), p->aux>>16&63, p->aux&0xff);
 			}
 		}		
 	}
diff --git a/bamtk.c b/bamtk.c
index 2c441cd..5395d96 100644
--- a/bamtk.c
+++ b/bamtk.c
@@ -9,7 +9,7 @@
 #endif
 
 #ifndef PACKAGE_VERSION
-#define PACKAGE_VERSION "0.1.9-5 (r802)"
+#define PACKAGE_VERSION "0.1.9-6 (r803)"
 #endif
 
 int bam_taf2baf(int argc, char *argv[]);
diff --git a/bcftools/bcf.c b/bcftools/bcf.c
index 494dccc..05eae5b 100644
--- a/bcftools/bcf.c
+++ b/bcftools/bcf.c
@@ -306,3 +306,13 @@ int bcf_cpy(bcf1_t *r, const bcf1_t *b)
 		memcpy(r->gi[i].data, b->gi[i].data, r->n_smpl * r->gi[i].len);
 	return 0;
 }
+
+int bcf_is_indel(const bcf1_t *b)
+{
+	char *p;
+	if (strlen(b->ref) > 1) return 1;
+	for (p = b->alt; *p; ++p)
+		if (*p != ',' && p[1] != ',' && p[1] != '\0')
+			return 1;
+	return 0;
+}
diff --git a/bcftools/bcf.h b/bcftools/bcf.h
index 3657895..89feeb8 100644
--- a/bcftools/bcf.h
+++ b/bcftools/bcf.h
@@ -128,6 +128,8 @@ extern "C" {
 	int bcf_shrink_alt(bcf1_t *b, int n);
 	// convert GL to PL
 	int bcf_gl2pl(bcf1_t *b);
+	// if the site is an indel
+	int bcf_is_indel(const bcf1_t *b);
 
 	// string hash table
 	void *bcf_build_refhash(bcf_hdr_t *h);
diff --git a/bcftools/prob1.c b/bcftools/prob1.c
index e3b0f73..04a0e15 100644
--- a/bcftools/prob1.c
+++ b/bcftools/prob1.c
@@ -8,9 +8,9 @@
 #include "kseq.h"
 KSTREAM_INIT(gzFile, gzread, 16384)
 
-#define MC_AVG_ERR 0.007
 #define MC_MAX_EM_ITER 16
 #define MC_EM_EPS 1e-4
+#define MC_DEF_INDEL 0.15
 
 unsigned char seq_nt4_table[256] = {
 	4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4, 
@@ -32,9 +32,9 @@ unsigned char seq_nt4_table[256] = {
 };
 
 struct __bcf_p1aux_t {
-	int n, M, n1;
+	int n, M, n1, is_indel;
 	double *q2p, *pdg; // pdg -> P(D|g)
-	double *phi;
+	double *phi, *phi_indel;
 	double *z, *zswap; // aux for afs
 	double *z1, *z2, *phi1, *phi2; // only calculated when n1 is set
 	double t, t1, t2;
@@ -43,6 +43,14 @@ struct __bcf_p1aux_t {
 	int PL_len;
 };
 
+void bcf_p1_indel_prior(bcf_p1aux_t *ma, double x)
+{
+	int i;
+	for (i = 0; i < ma->M; ++i)
+		ma->phi_indel[i] = ma->phi[i] * x;
+	ma->phi_indel[ma->M] = 1. - ma->phi[ma->M] * x;
+}
+
 static void init_prior(int type, double theta, int M, double *phi)
 {
 	int i;
@@ -63,6 +71,7 @@ static void init_prior(int type, double theta, int M, double *phi)
 void bcf_p1_init_prior(bcf_p1aux_t *ma, int type, double theta)
 {
 	init_prior(type, theta, ma->M, ma->phi);
+	bcf_p1_indel_prior(ma, MC_DEF_INDEL);
 }
 
 void bcf_p1_init_subprior(bcf_p1aux_t *ma, int type, double theta)
@@ -110,6 +119,7 @@ int bcf_p1_read_prior(bcf_p1aux_t *ma, const char *fn)
 	fprintf(stderr, "[%s] heterozygosity=%lf, ", __func__, (double)sum);
 	for (sum = 0., k = 1; k <= ma->M; ++k) sum += k * ma->phi[ma->M - k] / ma->M;
 	fprintf(stderr, "theta=%lf\n", (double)sum);
+	bcf_p1_indel_prior(ma, MC_DEF_INDEL);
 	return 0;
 }
 
@@ -123,6 +133,7 @@ bcf_p1aux_t *bcf_p1_init(int n)
 	ma->q2p = calloc(256, sizeof(double));
 	ma->pdg = calloc(3 * ma->n, sizeof(double));
 	ma->phi = calloc(ma->M + 1, sizeof(double));
+	ma->phi_indel = calloc(ma->M + 1, sizeof(double));
 	ma->phi1 = calloc(ma->M + 1, sizeof(double));
 	ma->phi2 = calloc(ma->M + 1, sizeof(double));
 	ma->z = calloc(2 * ma->n + 1, sizeof(double));
@@ -148,7 +159,7 @@ void bcf_p1_destroy(bcf_p1aux_t *ma)
 {
 	if (ma) {
 		free(ma->q2p); free(ma->pdg);
-		free(ma->phi); free(ma->phi1); free(ma->phi2);
+		free(ma->phi); free(ma->phi_indel); free(ma->phi1); free(ma->phi2);
 		free(ma->z); free(ma->zswap); free(ma->z1); free(ma->z2);
 		free(ma->afs); free(ma->afs1);
 		free(ma);
@@ -303,12 +314,13 @@ static double mc_cal_afs(bcf_p1aux_t *ma)
 {
 	int k;
 	long double sum = 0.;
+	double *phi = ma->is_indel? ma->phi_indel : ma->phi;
 	memset(ma->afs1, 0, sizeof(double) * (ma->M + 1));
 	mc_cal_y(ma);
 	for (k = 0, sum = 0.; k <= ma->M; ++k)
-		sum += (long double)ma->phi[k] * ma->z[k];
+		sum += (long double)phi[k] * ma->z[k];
 	for (k = 0; k <= ma->M; ++k) {
-		ma->afs1[k] = ma->phi[k] * ma->z[k] / sum;
+		ma->afs1[k] = phi[k] * ma->z[k] / sum;
 		if (isnan(ma->afs1[k]) || isinf(ma->afs1[k])) return -1.;
 	}
 	for (k = 0, sum = 0.; k <= ma->M; ++k) {
@@ -348,6 +360,7 @@ int bcf_p1_cal(bcf1_t *b, bcf_p1aux_t *ma, bcf_p1rst_t *rst)
 {
 	int i, k;
 	long double sum = 0.;
+	ma->is_indel = bcf_is_indel(b);
 	// set PL and PL_len
 	for (i = 0; i < b->n_gi; ++i) {
 		if (b->gi[i].fmt == bcf_str2int("PL", 2)) {
-- 
2.39.5