From 7795f54fa8aed1e0b1edff61890e9fe25fdb7fe9 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Wed, 14 Jul 2010 20:26:27 +0000 Subject: [PATCH] * samtools-0.1.8-2 (r617) * allele frequency calculation apparently works... --- bam_mcns.c | 37 +++++++++++++++++++++++++------------ bam_plcmd.c | 24 +++++++++++++++++------- bamtk.c | 2 +- 3 files changed, 43 insertions(+), 20 deletions(-) diff --git a/bam_mcns.c b/bam_mcns.c index 845dfcd..b69ec0f 100644 --- a/bam_mcns.c +++ b/bam_mcns.c @@ -2,7 +2,7 @@ #include #include "bam_mcns.h" -#define MC_MIN_QUAL 20 +#define MC_MIN_QUAL 13 #define MC_MAX_SUMQ 3000 #define MC_MAX_SUMQP 1e-300 @@ -10,7 +10,7 @@ struct __mc_aux_t { int n, N; int ref, alt; double *q2p, *pdg; // pdg -> P(D|g) - int *qsum, *bcnt; + int *qsum, *bcnt, rcnt[4]; }; mc_aux_t *mc_init(int n) // FIXME: assuming diploid @@ -42,10 +42,12 @@ static void sum_err(int *n, const bam_pileup1_t **plp, mc_aux_t *ma) int i, j; memset(ma->qsum, 0, sizeof(int) * 4 * ma->n); memset(ma->bcnt, 0, sizeof(int) * 4 * ma->n); + memset(ma->rcnt, 0, sizeof(int) * 4); for (j = 0; j < ma->n; ++j) { - int *qsum = ma->qsum + j * 4; + int tmp, *qsum = ma->qsum + j * 4; int *bcnt = ma->bcnt + j * 4; - for (i = 0; i < n[j]; ++i) { + double r = drand48(), rr; + for (i = tmp = 0; i < n[j]; ++i) { const bam_pileup1_t *p = plp[j] + i; int q, b; if (p->is_del || (p->b->core.flag&BAM_FUNMAP)) continue; @@ -56,6 +58,14 @@ static void sum_err(int *n, const bam_pileup1_t **plp, mc_aux_t *ma) if (b > 3) continue; // N qsum[b] += q; ++bcnt[b]; + ++tmp; + } + if (tmp) { + for (i = 0, rr = 0.; i < 4; ++i) { + rr += (double)bcnt[i] / tmp; + if (rr >= r) break; + } + if (i < 4) ++ma->rcnt[i]; } } } @@ -85,17 +95,17 @@ static void cal_pdg(mc_aux_t *ma) qsum = ma->qsum + j * 4; bcnt = ma->bcnt + j * 4; pi[1] = 3 * (bcnt[ma->ref] + bcnt[ma->alt]); - pi[0] = qsum[ma->alt]; - pi[2] = qsum[ma->ref]; + pi[0] = qsum[ma->ref]; + pi[2] = qsum[ma->alt]; for (i = 0; i < 3; ++i) - pdg[i] = pi[i] < MC_MAX_SUMQ? MC_MAX_SUMQP : ma->q2p[pi[i]]; + pdg[i] = pi[i] > MC_MAX_SUMQ? MC_MAX_SUMQP : ma->q2p[pi[i]]; } } double mc_freq0(int ref, int *n, const bam_pileup1_t **plp, mc_aux_t *ma, int *_ref, int *alt) { int i, acnt[4], j; - double f0; + double fr = -1., f0 = -1.; sum_err(n, plp, ma); set_allele(ref, ma); cal_pdg(ma); @@ -103,17 +113,19 @@ double mc_freq0(int ref, int *n, const bam_pileup1_t **plp, mc_aux_t *ma, int *_ for (i = 0; i < ma->n; ++i) for (j = 0; j < 4; ++j) acnt[j] += ma->bcnt[i * 4 + j]; - f0 = acnt[ma->ref] + acnt[ma->alt] == 0? -1. - : (double)acnt[ref] / (acnt[ma->ref] + acnt[ma->alt]); + if (ma->rcnt[ma->ref] + ma->rcnt[ma->alt]) // never used... + fr = (double)ma->rcnt[ref] / (ma->rcnt[ma->ref] + ma->rcnt[ma->alt]); + if (acnt[ma->ref] + acnt[ma->alt]) + f0 = (double)acnt[ref] / (acnt[ma->ref] + acnt[ma->alt]); *_ref = ma->ref; *alt = ma->alt; - return f0; + return ma->n == 1 || fr < 0.? f0 : fr; } double mc_freq_iter(double f0, mc_aux_t *ma) { double f, f3[3]; int i, j; - f3[0] = f0*f0; f3[1] = 2.*f0*(1.-f0); f3[2] = (1.-f0)*(1.-f0); + f3[0] = (1.-f0)*(1.-f0); f3[1] = 2.*f0*(1.-f0); f3[2] = f0*f0; for (i = 0, f = 0.; i < ma->n; ++i) { double up, dn, *pdg; pdg = ma->pdg + i * 3; @@ -123,5 +135,6 @@ double mc_freq_iter(double f0, mc_aux_t *ma) dn += pdg[j] * f3[j]; f += up / dn; } + f /= ma->n * 2.; return f; } diff --git a/bam_plcmd.c b/bam_plcmd.c index e4de833..708e63e 100644 --- a/bam_plcmd.c +++ b/bam_plcmd.c @@ -548,16 +548,26 @@ static int mpileup(mplp_conf_t *conf, int n, char **fn) ref_tid = tid; } if (conf->vcf) { - double f; - int j, _ref, _alt, _ref0, depth, rms_q; + double f0, f; // the reference allele frequency + int j, _ref, _alt, _ref0, depth, rms_q, _ref0b; uint64_t sqr_sum; - _ref0 = (ref && pos < ref_len)? ref[pos] : 'N'; + _ref0 = _ref0b = (ref && pos < ref_len)? ref[pos] : 'N'; _ref0 = bam_nt16_nt4_table[bam_nt16_table[_ref0]]; - f = mc_freq0(_ref0, n_plp, plp, ma, &_ref, &_alt); - printf("%s\t%d\t.\t%c\t", h->target_name[tid], pos + 1, "ACGTN"[_ref0]); + f = f0 = mc_freq0(_ref0, n_plp, plp, ma, &_ref, &_alt); + if (f >= 0.0) { + double flast = f; + for (j = 0; j < 10; ++j) { + f = mc_freq_iter(flast, ma); + if (fabs(f - flast) < 1e-3) break; + flast = f; + } + } + printf("%s\t%d\t.\t%c\t", h->target_name[tid], pos + 1, _ref0b); if (_ref0 == _ref) putchar("ACGTN"[_alt]); else printf("%c,%c", "ACGTN"[_ref], "ACGTN"[_alt]); - printf("\t0\tPASS\t"); // FIXME: currently these not available + printf("\t0\t"); // FIXME: currently these not available + if (f0 < 0.) printf("Q13\t"); + else printf(".\t"); for (i = depth = 0, sqr_sum = 0; i < n; ++i) { depth += n_plp[i]; for (j = 0; j < n_plp[i]; ++j) { @@ -567,7 +577,7 @@ static int mpileup(mplp_conf_t *conf, int n, char **fn) } } rms_q = (int)(sqrt((double)sqr_sum / depth) + .499); - printf("DP=%d;MQ=%d;AF=%.3lg", depth, rms_q, 1.-f); // FIXME: not working for triallelic alleles + printf("DP=%d;MQ=%d;AF=%.3lg", depth, rms_q, f0<0.?0.:1.-f); printf("\tDP"); // output genotype information; FIXME: to be implmented... for (i = 0; i < n; ++i) diff --git a/bamtk.c b/bamtk.c index f93cf52..b5f6b67 100644 --- a/bamtk.c +++ b/bamtk.c @@ -9,7 +9,7 @@ #endif #ifndef PACKAGE_VERSION -#define PACKAGE_VERSION "0.1.8-1 (r615)" +#define PACKAGE_VERSION "0.1.8-2 (r617)" #endif int bam_taf2baf(int argc, char *argv[]); -- 2.39.2