From d73a1b3b94255044787c0e4bfeba37df66e99770 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Thu, 22 Jul 2010 20:37:56 +0000 Subject: [PATCH] * preliminary multisample SNP caller. * something looks not so right, but it largely works --- bam_mcns.c | 70 +++++++++++++++++++++++++++++++++++++++++++++++++++-- bam_mcns.h | 2 ++ bam_plcmd.c | 37 +++++++++++++++++++--------- bam_tview.c | 2 +- 4 files changed, 96 insertions(+), 15 deletions(-) diff --git a/bam_mcns.c b/bam_mcns.c index b69ec0f..cd64b0a 100644 --- a/bam_mcns.c +++ b/bam_mcns.c @@ -9,10 +9,22 @@ struct __mc_aux_t { int n, N; int ref, alt; + double theta; double *q2p, *pdg; // pdg -> P(D|g) + double *alpha, *beta; int *qsum, *bcnt, rcnt[4]; }; +void mc_init_prior(mc_aux_t *ma, double theta) +{ + double sum; + int i; + ma->theta = theta; + for (i = 0, sum = 0.; i < 2 * ma->n; ++i) + sum += (ma->alpha[i] = ma->theta / (2 * ma->n - i)); + ma->alpha[2 * ma->n] = 1. - sum; +} + mc_aux_t *mc_init(int n) // FIXME: assuming diploid { mc_aux_t *ma; @@ -23,8 +35,18 @@ mc_aux_t *mc_init(int n) // FIXME: assuming diploid ma->qsum = calloc(4 * ma->n, sizeof(int)); ma->bcnt = calloc(4 * ma->n, sizeof(int)); ma->pdg = calloc(3 * ma->n, sizeof(double)); + ma->alpha = calloc(2 * ma->n + 1, sizeof(double)); + ma->beta = calloc((2 * ma->n + 1) * 3, sizeof(double)); for (i = 0; i <= MC_MAX_SUMQ; ++i) ma->q2p[i] = pow(10., -i / 10.); + for (i = 0; i <= 2 * ma->n; ++i) { + double *bi = ma->beta + 3 * i; + double f = (double)i / (2 * ma->n); + bi[0] = (1. - f) * (1. - f); + bi[1] = 2 * f * (1. - f); + bi[2] = f * f; + } + mc_init_prior(ma, 1e-3); // the simplest prior return ma; } @@ -33,6 +55,7 @@ void mc_destroy(mc_aux_t *ma) if (ma) { free(ma->qsum); free(ma->bcnt); free(ma->q2p); free(ma->pdg); + free(ma->alpha); free(ma->beta); free(ma); } } @@ -101,7 +124,7 @@ static void cal_pdg(mc_aux_t *ma) pdg[i] = pi[i] > MC_MAX_SUMQ? MC_MAX_SUMQP : ma->q2p[pi[i]]; } } - +// return the reference allele frequency double mc_freq0(int ref, int *n, const bam_pileup1_t **plp, mc_aux_t *ma, int *_ref, int *alt) { int i, acnt[4], j; @@ -120,7 +143,7 @@ double mc_freq0(int ref, int *n, const bam_pileup1_t **plp, mc_aux_t *ma, int *_ *_ref = ma->ref; *alt = ma->alt; return ma->n == 1 || fr < 0.? f0 : fr; } - +// f0 is the reference allele frequency double mc_freq_iter(double f0, mc_aux_t *ma) { double f, f3[3]; @@ -138,3 +161,46 @@ double mc_freq_iter(double f0, mc_aux_t *ma) f /= ma->n * 2.; return f; } + +double mc_ref_prob(mc_aux_t *ma) +{ + int k, i, g; + long double PD = 0., Pref = 0.; + for (k = 0; k <= ma->n * 2; ++k) { + long double x = 1.; + double *bk = ma->beta + k * 3; + for (i = 0; i < ma->n; ++i) { + double y = 0., *pdg = ma->pdg + i * 3; + for (g = 0; g < 3; ++g) + y += pdg[g] * bk[g]; + x *= y; + } + PD += x * ma->alpha[k]; + } + for (k = 0; k <= ma->n * 2; ++k) { + long double x = 1.0; + for (i = 0; i < ma->n; ++i) + x *= ma->pdg[i * 3 + 2] * ma->beta[k * 3 + 2]; + Pref += x * ma->alpha[k]; + } + return Pref / PD; +} + +int mc_call_gt(const mc_aux_t *ma, double f0, int k) +{ + double sum, g[3]; + double max, f3[3], *pdg = ma->pdg + k * 3; + int q, i, max_i; + f3[0] = (1.-f0)*(1.-f0); f3[1] = 2.*f0*(1.-f0); f3[2] = f0*f0; + for (i = 0, sum = 0.; i < 3; ++i) + sum += (g[i] = pdg[i] * f3[i]); + for (i = 0, max = -1., max_i = 0; i < 3; ++i) { + g[i] /= sum; + if (g[i] > max) max = g[i], max_i = i; + } + max = 1. - max; + if (max < 1e-308) max = 1e-308; + q = (int)(-3.434 * log(max) + .499); + if (q > 99) q = 99; + return q<<2|max_i; +} diff --git a/bam_mcns.h b/bam_mcns.h index ac1e41c..6d98de1 100644 --- a/bam_mcns.h +++ b/bam_mcns.h @@ -14,6 +14,8 @@ extern "C" { void mc_destroy(mc_aux_t *ma); double mc_freq0(int ref, int *n, const bam_pileup1_t **plp, mc_aux_t *ma, int *_ref, int *alt); double mc_freq_iter(double f0, mc_aux_t *ma); + double mc_ref_prob(mc_aux_t *ma); + int mc_call_gt(const mc_aux_t *ma, double f0, int k); #ifdef __cplusplus } diff --git a/bam_plcmd.c b/bam_plcmd.c index 708e63e..2854770 100644 --- a/bam_plcmd.c +++ b/bam_plcmd.c @@ -450,7 +450,7 @@ int bam_pileup(int argc, char *argv[]) ***********/ typedef struct { - int vcf, max_mq, min_mq; + int vcf, max_mq, min_mq, var_only; char *reg, *fn_pos; faidx_t *fai; kh_64_t *hash; @@ -520,6 +520,7 @@ static int mpileup(mplp_conf_t *conf, int n, char **fn) if (conf->vcf) { kstring_t s; s.l = s.m = 0; s.s = 0; + puts("##fileformat=VCFv4.0"); kputs("#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT", &s); for (i = 0; i < n; ++i) { const char *p; @@ -548,24 +549,33 @@ static int mpileup(mplp_conf_t *conf, int n, char **fn) ref_tid = tid; } if (conf->vcf) { - double f0, f; // the reference allele frequency - int j, _ref, _alt, _ref0, depth, rms_q, _ref0b; + double f0, f, pref = -1.0; // the reference allele frequency + int j, _ref, _alt, _ref0, depth, rms_q, _ref0b, is_var = 0, qref = 0; uint64_t sqr_sum; _ref0 = _ref0b = (ref && pos < ref_len)? ref[pos] : 'N'; _ref0 = bam_nt16_nt4_table[bam_nt16_table[_ref0]]; f = f0 = mc_freq0(_ref0, n_plp, plp, ma, &_ref, &_alt); if (f >= 0.0) { - double flast = f; - for (j = 0; j < 10; ++j) { + double q, flast = f; + for (j = 0; j < 10; ++j){ f = mc_freq_iter(flast, ma); if (fabs(f - flast) < 1e-3) break; flast = f; } + pref = mc_ref_prob(ma); + is_var = (pref < .5); + q = is_var? pref : 1. - pref; + if (q < 1e-308) q = 1e-308; + qref = (int)(-3.434 * log(q) + .499); + if (qref > 99) qref = 99; } + if (conf->var_only && !is_var) continue; printf("%s\t%d\t.\t%c\t", h->target_name[tid], pos + 1, _ref0b); - if (_ref0 == _ref) putchar("ACGTN"[_alt]); - else printf("%c,%c", "ACGTN"[_ref], "ACGTN"[_alt]); - printf("\t0\t"); // FIXME: currently these not available + if (is_var) { + if (_ref0 == _ref) putchar("ACGTN"[_alt]); + else printf("%c,%c", "ACGTN"[_ref], "ACGTN"[_alt]); + } else putchar('.'); + printf("\t%d\t", qref); if (f0 < 0.) printf("Q13\t"); else printf(".\t"); for (i = depth = 0, sqr_sum = 0; i < n; ++i) { @@ -578,10 +588,12 @@ static int mpileup(mplp_conf_t *conf, int n, char **fn) } rms_q = (int)(sqrt((double)sqr_sum / depth) + .499); printf("DP=%d;MQ=%d;AF=%.3lg", depth, rms_q, f0<0.?0.:1.-f); - printf("\tDP"); + printf("\tGT:GQ:DP"); // output genotype information; FIXME: to be implmented... - for (i = 0; i < n; ++i) - printf("\t%d", n_plp[i]); + for (i = 0; i < n; ++i) { + int x = mc_call_gt(ma, f, i); + printf("\t%c/%c:%d:%d", "10"[((x&3)==2)], "10"[((x&3)>0)], x>>2, n_plp[i]); + } putchar('\n'); } else { printf("%s\t%d\t%c", h->target_name[tid], pos + 1, (ref && pos < ref_len)? ref[pos] : 'N'); @@ -628,7 +640,7 @@ int bam_mpileup(int argc, char *argv[]) mplp_conf_t mplp; memset(&mplp, 0, sizeof(mplp_conf_t)); mplp.max_mq = 60; - while ((c = getopt(argc, argv, "f:r:l:VM:q:")) >= 0) { + while ((c = getopt(argc, argv, "f:r:l:VvM:q:")) >= 0) { switch (c) { case 'f': mplp.fai = fai_load(optarg); @@ -636,6 +648,7 @@ int bam_mpileup(int argc, char *argv[]) break; case 'r': mplp.reg = strdup(optarg); break; case 'l': mplp.fn_pos = strdup(optarg); break; + case 'v': mplp.var_only = 1; break; case 'V': mplp.vcf = 1; break; case 'M': mplp.max_mq = atoi(optarg); break; case 'q': mplp.min_mq = atoi(optarg); break; diff --git a/bam_tview.c b/bam_tview.c index 7b326fc..24cd12a 100644 --- a/bam_tview.c +++ b/bam_tview.c @@ -292,7 +292,7 @@ static void tv_win_goto(tview_t *tv, int *tid, int *pos) } else if (c == KEY_ENTER || c == '\012' || c == '\015') { int _tid = -1, _beg, _end; if (str[0] == '=') { - _beg = strtol(str+1, &p, 10); + _beg = strtol(str+1, &p, 10) - 1; if (_beg > 0) { *pos = _beg; return; -- 2.39.2