From: Heng Li <lh3@live.co.uk>
Date: Fri, 5 Nov 2010 04:19:14 +0000 (+0000)
Subject:  * this revision is UNSTABLE
X-Git-Url: https://git.donarmstrong.com/?a=commitdiff_plain;h=df7cadac556f302bdd0bc48c235c2b61b699dfe1;p=samtools.git

 * this revision is UNSTABLE

 * indel caller seems working, but it is very insensitive and has
   several things I do not quite understand.
---

diff --git a/bam2bcf.c b/bam2bcf.c
index d5a8dd4..ab5bfcd 100644
--- a/bam2bcf.c
+++ b/bam2bcf.c
@@ -98,6 +98,7 @@ int bcf_call_glfgen_gap(int pos, int _n, const bam_pileup1_t *pl, bcf_callaux_t
 	// fill the bases array
 	memset(r, 0, sizeof(bcf_callret1_t));
 	r->indelreg = 10000;
+	n_ins = n_del = 0;
 	for (i = n = 0; i < _n; ++i) {
 		const bam_pileup1_t *p = pl + i;
 		int q, b, mapQ, indelQ, is_diff, min_dist;
@@ -132,6 +133,7 @@ int bcf_call_glfgen_gap(int pos, int _n, const bam_pileup1_t *pl, bcf_callaux_t
 					if (qual[j] > BAM2BCF_INDELREG_THRES && indelreg == 0)
 						indelreg = j - y + 1;
 				}
+				if (r->indelreg > indelreg) r->indelreg = indelreg;
 			} else {
 				for (j = y; j <= p->qpos; ++j)
 					if (max_left < qual[j]) max_left = qual[j];
@@ -143,6 +145,11 @@ int bcf_call_glfgen_gap(int pos, int _n, const bam_pileup1_t *pl, bcf_callaux_t
 			// estimate the sequencing error rate
 			seqQ = bca->openQ;
 			if (p->indel != 0) seqQ += bca->extQ * (abs(p->indel) - 1); // FIXME: better to model homopolymer
+			if (p->indel > 0) {
+				++n_ins; r->ins_len += p->indel;
+			} else if (p->indel < 0) {
+				++n_del; r->del_len += -p->indel;
+			}
 			if (p->indel != 0) { // a different model for tandem repeats
 				uint8_t *seq = bam1_seq(p->b);
 				int tandemQ, qb = bam1_seqi(seq, p->qpos), l;
@@ -181,6 +188,9 @@ int bcf_call_glfgen_gap(int pos, int _n, const bam_pileup1_t *pl, bcf_callaux_t
 		r->anno[3<<2|is_diff<<1|1] += min_dist * min_dist;
 	}
 	r->depth = n;
+	r->ins_len = n_ins? r->ins_len / n_ins : 0;
+	r->del_len = n_del? r->del_len / n_del : 0;
+	if (r->indelreg >= 10000) r->indelreg = 0;
 	// glfgen
 	errmod_cal(bca->e, n, 2, bca->bases, r->p);
 	return r->depth;	
@@ -191,6 +201,7 @@ int bcf_call_combine(int n, const bcf_callret1_t *calls, int ref_base /*4-bit*/,
 	int ref4, i, j, qsum[4];
 	int64_t tmp;
 	call->ori_ref = ref4 = bam_nt16_nt4_table[ref_base];
+	call->ins_len = call->del_len = 0; call->indelreg = 0;
 	if (ref4 > 4) ref4 = 4;
 	// calculate qsum
 	memset(qsum, 0, 4 * sizeof(int));
@@ -253,6 +264,66 @@ int bcf_call_combine(int n, const bcf_callret1_t *calls, int ref_base /*4-bit*/,
 	return 0;
 }
 
+int bcf_call_combine_gap(int n, const bcf_callret1_t *calls, bcf_call_t *call)
+{
+	int i, j, n_ins, n_del;
+	// combine annotations
+	call->ori_ref = 4;
+	memset(call->anno, 0, 16 * sizeof(int));
+	call->ins_len = call->del_len = 0; call->indelreg = 10000;
+	for (i = call->depth = 0, n_ins = n_del = 0; i < n; ++i) {
+		const bcf_callret1_t *r = calls + i;
+		if (r->depth > 0) {
+			call->depth += r->depth;
+			if (r->ins_len > 0) {
+				call->ins_len += r->ins_len;
+				++n_ins;
+			}
+			if (r->del_len > 0) {
+				call->del_len += r->del_len;
+				++n_del;
+			}
+			if (r->indelreg > 0 && call->indelreg > r->indelreg)
+				call->indelreg = r->indelreg;
+			for (j = 0; j < 16; ++j) call->anno[j] += r->anno[j];
+		}
+	}
+	if (call->depth == 0) return 0; // no indels
+	call->ins_len = n_ins? call->ins_len / n_ins : 0;
+	call->del_len = n_del? call->del_len / n_del : 0;
+	//
+	for (i = 0; i < 5; ++i) call->a[i] = -1;
+	call->a[0] = 0; call->a[1] = 1;
+	call->unseen = -1;
+	call->n_alleles = 2;
+	// set the PL array
+	if (call->n < n) {
+		call->n = n;
+		call->PL = realloc(call->PL, 15 * n);
+	}
+	{
+		int g[3];
+		double sum_min = 0.;
+		g[0] = 0; g[1] = 1; g[2] = 3;
+		for (i = 0; i < n; ++i) {
+			uint8_t *PL = call->PL + 3 * i;
+			const bcf_callret1_t *r = calls + i;
+			float min = 1e37;
+			for (j = 0; j < 3; ++j)
+				if (min > r->p[g[j]]) min = r->p[g[j]];
+			sum_min += min;
+			for (j = 0; j < 3; ++j) {
+				int y;
+				y = (int)(r->p[g[j]] - min + .499);
+				if (y > 255) y = 255;
+				PL[j] = y;
+			}
+		}
+		call->shift = (int)(sum_min + .499);
+	}
+	return 0;
+}
+
 int bcf_call2bcf(int tid, int pos, bcf_call_t *bc, bcf1_t *b, bcf_callret1_t *bcr, int is_SP)
 {
 	extern double kt_fisher_exact(int n11, int n12, int n21, int n22, double *_left, double *_right, double *two);
@@ -262,11 +333,19 @@ int bcf_call2bcf(int tid, int pos, bcf_call_t *bc, bcf1_t *b, bcf_callret1_t *bc
 	b->tid = tid; b->pos = pos; b->qual = 0;
 	s.s = b->str; s.m = b->m_str; s.l = 0;
 	kputc('\0', &s);
-	kputc("ACGTN"[bc->ori_ref], &s); kputc('\0', &s);
-	for (i = 1; i < 5; ++i) {
-		if (bc->a[i] < 0) break;
-		if (i > 1) kputc(',', &s);
-		kputc(bc->unseen == i? 'X' : "ACGT"[bc->a[i]], &s);
+	if (bc->ins_len > 0 || bc->del_len > 0) { // an indel
+		for (i = 0; i < bc->indelreg; ++i) kputc('N', &s);
+		kputc('\0', &s);
+		if (bc->ins_len > 0 && bc->del_len > 0) kputs("<INDEL>", &s);
+		else if (bc->ins_len > 0) kputs("<INS>", &s);
+		else if (bc->del_len > 0) kputs("<DEL>", &s);
+	} else { // SNP
+		kputc("ACGTN"[bc->ori_ref], &s); kputc('\0', &s);
+		for (i = 1; i < 5; ++i) {
+			if (bc->a[i] < 0) break;
+			if (i > 1) kputc(',', &s);
+			kputc(bc->unseen == i? 'X' : "ACGT"[bc->a[i]], &s);
+		}
 	}
 	kputc('\0', &s);
 	kputc('\0', &s);
diff --git a/bam2bcf.h b/bam2bcf.h
index 92f1045..b6514c0 100644
--- a/bam2bcf.h
+++ b/bam2bcf.h
@@ -16,8 +16,8 @@ typedef struct __bcf_callaux_t {
 } bcf_callaux_t;
 
 typedef struct {
-	int depth;
-	float indelreg, ins_len, del_len;
+	int depth, indelreg;
+	float ins_len, del_len;
 	int qsum[4];
 	int anno[16];
 	float p[25];
@@ -26,7 +26,9 @@ typedef struct {
 typedef struct {
 	int a[5]; // alleles: ref, alt, alt2, alt3
 	int n, n_alleles, shift, ori_ref, unseen;
-	int anno[16], depth;
+	int indelreg;
+	float ins_len, del_len;
+	int depth, anno[16];
 	uint8_t *PL;
 } bcf_call_t;
 
@@ -38,6 +40,7 @@ extern "C" {
 	void bcf_call_destroy(bcf_callaux_t *bca);
 	int bcf_call_glfgen(int _n, const bam_pileup1_t *pl, int ref_base /*4-bit*/, bcf_callaux_t *bca, bcf_callret1_t *r);
 	int bcf_call_combine(int n, const bcf_callret1_t *calls, int ref_base /*4-bit*/, bcf_call_t *call);
+	int bcf_call_combine_gap(int n, const bcf_callret1_t *calls, bcf_call_t *call);
 	int bcf_call2bcf(int tid, int pos, bcf_call_t *bc, bcf1_t *b, bcf_callret1_t *bcr, int is_SP);
 	int bcf_call_glfgen_gap(int pos, int _n, const bam_pileup1_t *pl, bcf_callaux_t *bca, bcf_callret1_t *r);
 
diff --git a/bam_plcmd.c b/bam_plcmd.c
index ed230e8..32d050d 100644
--- a/bam_plcmd.c
+++ b/bam_plcmd.c
@@ -742,6 +742,14 @@ static int mpileup(mplp_conf_t *conf, int n, char **fn)
 				if (i < gplp.n) { // at least one of the read contains a gap
 					for (i = 0; i < gplp.n; ++i)
 						bcf_call_glfgen_gap(pos, gplp.n_plp[i], gplp.plp[i], bca, bcr + i);
+					bcf_call_combine_gap(gplp.n, bcr, &bc);
+					if (bc.depth > 0) {
+						b = calloc(1, sizeof(bcf1_t));
+						bcf_call2bcf(tid, pos, &bc, b, (conf->flag&(MPLP_FMT_DP|MPLP_FMT_SP))? bcr : 0,
+									 (conf->flag&MPLP_FMT_SP));
+						bcf_write(bp, bh, b);
+						bcf_destroy(b);
+					}
 				}
 			}
 		} else {