From f528e7da717a1a2a6ab2672c7c55f37d315452f3 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Tue, 8 Mar 2011 03:07:35 +0000 Subject: [PATCH] added the haploid mode --- bcftools/bcfutils.c | 2 +- bcftools/call1.c | 51 +++++++------- bcftools/prob1.c | 161 +++++++++++++++++++++++++++----------------- bcftools/prob1.h | 3 +- 4 files changed, 123 insertions(+), 94 deletions(-) diff --git a/bcftools/bcfutils.c b/bcftools/bcfutils.c index ae7ec0f..fc1c542 100644 --- a/bcftools/bcfutils.c +++ b/bcftools/bcfutils.c @@ -223,7 +223,7 @@ int bcf_subsam(int n_smpl, int *list, bcf1_t *b) // list MUST BE sorted for (j = 0; j < b->n_gi; ++j) { bcf_ginfo_t *gi = b->gi + j; for (i = 0; i < n_smpl; ++i) - memcpy((uint8_t*)gi->data + i * gi->len, (uint8_t*)gi->data + list[i] * gi->len, gi->len); + if (i != list[i]) memcpy((uint8_t*)gi->data + i * gi->len, (uint8_t*)gi->data + list[i] * gi->len, gi->len); } b->n_smpl = n_smpl; return 0; diff --git a/bcftools/call1.c b/bcftools/call1.c index 286c014..9c0b498 100644 --- a/bcftools/call1.c +++ b/bcftools/call1.c @@ -19,7 +19,6 @@ KSTREAM_INIT(gzFile, gzread, 16384) #define VC_VARONLY 16 #define VC_VCFIN 32 #define VC_UNCOMP 64 -#define VC_HWE 128 #define VC_KEEPALT 256 #define VC_ACGT_ONLY 512 #define VC_QCALL 1024 @@ -31,6 +30,7 @@ KSTREAM_INIT(gzFile, gzread, 16384) typedef struct { int flag, prior_type, n1, n_sub, *sublist; char *fn_list, *prior_file, **subsam, *fn_dict; + uint8_t *ploidy; double theta, pref, indel_frac; } viewconf_t; @@ -67,23 +67,6 @@ khash_t(set64) *bcf_load_pos(const char *fn, bcf_hdr_t *_h) return hash; } -static double test_hwe(const double g[3]) -{ - extern double kf_gammaq(double p, double x); - double fexp, chi2, f[3], n; - int i; - n = g[0] + g[1] + g[2]; - fexp = (2. * g[2] + g[1]) / (2. * n); - if (fexp > 1. - 1e-10) fexp = 1. - 1e-10; - if (fexp < 1e-10) fexp = 1e-10; - f[0] = n * (1. - fexp) * (1. - fexp); - f[1] = n * 2. * fexp * (1. - fexp); - f[2] = n * fexp * fexp; - for (i = 0, chi2 = 0.; i < 3; ++i) - chi2 += (g[i] - f[i]) * (g[i] - f[i]) / f[i]; - return kf_gammaq(.5, chi2 / 2.); -} - typedef struct { double p[4]; int mq, depth, is_tested, d[4]; @@ -151,10 +134,9 @@ static int update_bcf1(int n_smpl, bcf1_t *b, const bcf_p1aux_t *pa, const bcf_p { kstring_t s; int is_var = (pr->p_ref < pref); - double p_hwe, r = is_var? pr->p_ref : pr->p_var, fq; + double r = is_var? pr->p_ref : pr->p_var, fq; anno16_t a; - p_hwe = pr->g[0] >= 0.? test_hwe(pr->g) : 1.0; // only do HWE g[] is calculated test16(b, &a); rm_info(b, "I16="); @@ -174,8 +156,6 @@ static int update_bcf1(int n_smpl, bcf1_t *b, const bcf_p1aux_t *pa, const bcf_p if (pr->pc[0] >= 0.) ksprintf(&s, ";PC4=%g,%g,%g,%g", pr->pc[0], pr->pc[1], pr->pc[2], pr->pc[3]); ksprintf(&s, ";PV4=%.2g,%.2g,%.2g,%.2g", a.p[0], a.p[1], a.p[2], a.p[3]); } - if (pr->g[0] >= 0. && p_hwe <= .2) - ksprintf(&s, ";GC=%.2f,%.2f,%.2f;HWE=%.3f", pr->g[2], pr->g[1], pr->g[0], p_hwe); kputc('\0', &s); kputs(b->fmt, &s); kputc('\0', &s); free(b->str); @@ -214,11 +194,24 @@ static char **read_samples(const char *fn, int *_n) if (fp == 0) return 0; // fail to open file ks = ks_init(fp); while (ks_getuntil(ks, 0, &s, &dret) >= 0) { + int l; if (max == n) { max = max? max<<1 : 4; sam = realloc(sam, sizeof(void*)*max); } - sam[n++] = strdup(s.s); + l = s.l; + sam[n] = malloc(s.l + 2); + strcpy(sam[n], s.s); + sam[n][l+1] = 2; // by default, diploid + if (dret != '\n') { + if (ks_getuntil(ks, 0, &s, &dret) >= 0) { // read ploidy, 1 or 2 + int x = (int)s.s[0] - '0'; + if (x == 1 || x == 2) sam[n][l+1] = x; + else fprintf(stderr, "(%s) ploidy can only be 1 or 2; assume diploid\n", __func__); + } + if (dret != '\n') ks_getuntil(ks, '\n', &s, &dret); + } + ++n; } ks_destroy(ks); gzclose(fp); @@ -285,7 +278,6 @@ int bcfview(int argc, char *argv[]) tid = begin = end = -1; memset(&vc, 0, sizeof(viewconf_t)); vc.prior_type = vc.n1 = -1; vc.theta = 1e-3; vc.pref = 0.5; vc.indel_frac = -1.; - vc.n_sub = 0; vc.subsam = 0; vc.sublist = 0; while ((c = getopt(argc, argv, "N1:l:cHAGvbSuP:t:p:QgLi:IMs:D:")) >= 0) { switch (c) { case '1': vc.n1 = atoi(optarg); break; @@ -299,7 +291,6 @@ int bcfview(int argc, char *argv[]) case 'c': vc.flag |= VC_CALL; break; case 'v': vc.flag |= VC_VARONLY | VC_CALL; break; case 'u': vc.flag |= VC_UNCOMP | VC_BCFOUT; break; - case 'H': vc.flag |= VC_HWE; break; case 'g': vc.flag |= VC_CALL_GT | VC_CALL; break; case 'I': vc.flag |= VC_NO_INDEL; break; case 'M': vc.flag |= VC_ANNO_MAX; break; @@ -308,7 +299,11 @@ int bcfview(int argc, char *argv[]) case 'i': vc.indel_frac = atof(optarg); break; case 'Q': vc.flag |= VC_QCALL; break; case 'L': vc.flag |= VC_ADJLD; break; - case 's': vc.subsam = read_samples(optarg, &vc.n_sub); break; + case 's': vc.subsam = read_samples(optarg, &vc.n_sub); + vc.ploidy = calloc(vc.n_sub + 1, 1); + for (tid = 0; tid < vc.n_sub; ++tid) vc.ploidy[tid] = vc.subsam[tid][strlen(vc.subsam[tid]) + 1]; + tid = -1; + break; case 'P': if (strcmp(optarg, "full") == 0) vc.prior_type = MC_PTYPE_FULL; else if (strcmp(optarg, "cond2") == 0) vc.prior_type = MC_PTYPE_COND2; @@ -370,7 +365,7 @@ int bcfview(int argc, char *argv[]) vcf_hdr_write(bout, hout); } if (vc.flag & VC_CALL) { - p1 = bcf_p1_init(hout->n_smpl); + p1 = bcf_p1_init(hout->n_smpl, vc.ploidy); if (vc.prior_file) { if (bcf_p1_read_prior(p1, vc.prior_file) < 0) { fprintf(stderr, "[%s] fail to read the prior AFS.\n", __func__); @@ -434,7 +429,6 @@ int bcfview(int argc, char *argv[]) if (vc.flag & VC_CALL) { // call variants bcf_p1rst_t pr; bcf_p1_cal(b, p1, &pr); // pr.g[3] is not calculated here - if (vc.flag&VC_HWE) bcf_p1_cal_g3(p1, pr.g); if (n_processed % 100000 == 0) { fprintf(stderr, "[%s] %ld sites processed.\n", __func__, (long)n_processed); bcf_p1_dump_afs(p1); @@ -471,6 +465,7 @@ int bcfview(int argc, char *argv[]) if (hash) kh_destroy(set64, hash); if (vc.fn_list) free(vc.fn_list); if (vc.fn_dict) free(vc.fn_dict); + if (vc.ploidy) free(vc.ploidy); if (vc.n_sub) { int i; for (i = 0; i < vc.n_sub; ++i) free(vc.subsam[i]); diff --git a/bcftools/prob1.c b/bcftools/prob1.c index 193c4a0..4804e6e 100644 --- a/bcftools/prob1.c +++ b/bcftools/prob1.c @@ -3,6 +3,7 @@ #include #include #include +#include #include "prob1.h" #include "kseq.h" @@ -33,6 +34,7 @@ unsigned char seq_nt4_table[256] = { struct __bcf_p1aux_t { int n, M, n1, is_indel; + uint8_t *ploidy; // haploid or diploid ONLY double *q2p, *pdg; // pdg -> P(D|g) double *phi, *phi_indel; double *z, *zswap; // aux for afs @@ -123,25 +125,34 @@ int bcf_p1_read_prior(bcf_p1aux_t *ma, const char *fn) return 0; } -bcf_p1aux_t *bcf_p1_init(int n) +bcf_p1aux_t *bcf_p1_init(int n, uint8_t *ploidy) { bcf_p1aux_t *ma; int i; ma = calloc(1, sizeof(bcf_p1aux_t)); ma->n1 = -1; ma->n = n; ma->M = 2 * n; + if (ploidy) { + ma->ploidy = malloc(n); + memcpy(ma->ploidy, ploidy, n); + for (i = 0, ma->M = 0; i < n; ++i) ma->M += ploidy[i]; + if (ma->M == 2 * n) { + free(ma->ploidy); + ma->ploidy = 0; + } + } ma->q2p = calloc(256, sizeof(double)); ma->pdg = calloc(3 * ma->n, sizeof(double)); ma->phi = calloc(ma->M + 1, sizeof(double)); ma->phi_indel = calloc(ma->M + 1, sizeof(double)); ma->phi1 = calloc(ma->M + 1, sizeof(double)); ma->phi2 = calloc(ma->M + 1, sizeof(double)); - ma->z = calloc(2 * ma->n + 1, sizeof(double)); - ma->zswap = calloc(2 * ma->n + 1, sizeof(double)); + ma->z = calloc(ma->M + 1, sizeof(double)); + ma->zswap = calloc(ma->M + 1, sizeof(double)); ma->z1 = calloc(ma->M + 1, sizeof(double)); // actually we do not need this large ma->z2 = calloc(ma->M + 1, sizeof(double)); - ma->afs = calloc(2 * ma->n + 1, sizeof(double)); - ma->afs1 = calloc(2 * ma->n + 1, sizeof(double)); + ma->afs = calloc(ma->M + 1, sizeof(double)); + ma->afs1 = calloc(ma->M + 1, sizeof(double)); for (i = 0; i < 256; ++i) ma->q2p[i] = pow(10., -i / 10.); bcf_p1_init_prior(ma, MC_PTYPE_FULL, 1e-3); // the simplest prior @@ -151,6 +162,10 @@ bcf_p1aux_t *bcf_p1_init(int n) int bcf_p1_set_n1(bcf_p1aux_t *b, int n1) { if (n1 == 0 || n1 >= b->n) return -1; + if (b->M != b->n * 2) { + fprintf(stderr, "[%s] unable to set `n1' when there are haploid samples.\n", __func__); + return -1; + } b->n1 = n1; return 0; } @@ -158,7 +173,7 @@ int bcf_p1_set_n1(bcf_p1aux_t *b, int n1) void bcf_p1_destroy(bcf_p1aux_t *ma) { if (ma) { - free(ma->q2p); free(ma->pdg); + free(ma->ploidy); free(ma->q2p); free(ma->pdg); free(ma->phi); free(ma->phi_indel); free(ma->phi1); free(ma->phi2); free(ma->z); free(ma->zswap); free(ma->z1); free(ma->z2); free(ma->afs); free(ma->afs1); @@ -207,11 +222,16 @@ int bcf_p1_call_gt(const bcf_p1aux_t *ma, double f0, int k) { double sum, g[3]; double max, f3[3], *pdg = ma->pdg + k * 3; - int q, i, max_i; - f3[0] = (1.-f0)*(1.-f0); f3[1] = 2.*f0*(1.-f0); f3[2] = f0*f0; + int q, i, max_i, ploidy; + ploidy = ma->ploidy? ma->ploidy[k] : 2; + if (ploidy == 2) { + f3[0] = (1.-f0)*(1.-f0); f3[1] = 2.*f0*(1.-f0); f3[2] = f0*f0; + } else { + f3[0] = 1. - f0; f3[1] = 0; f3[2] = f0; + } for (i = 0, sum = 0.; i < 3; ++i) sum += (g[i] = pdg[i] * f3[i]); - for (i = 0, max = -1., max_i = 0; i < 3; ++i) { + for (i = 0, max = -1., max_i = 0; i <= ploidy; ++i) { g[i] /= sum; if (g[i] > max) max = g[i], max_i = i; } @@ -228,6 +248,7 @@ static void mc_cal_y_core(bcf_p1aux_t *ma, int beg) { double *z[2], *tmp, *pdg; int _j, last_min, last_max; + assert(beg == 0 || ma->M == ma->n*2); z[0] = ma->z; z[1] = ma->zswap; pdg = ma->pdg; @@ -236,41 +257,81 @@ static void mc_cal_y_core(bcf_p1aux_t *ma, int beg) z[0][0] = 1.; last_min = last_max = 0; ma->t = 0.; - for (_j = beg; _j < ma->n; ++_j) { - int k, j = _j - beg, _min = last_min, _max = last_max; - double p[3], sum; - pdg = ma->pdg + _j * 3; - p[0] = pdg[0]; p[1] = 2. * pdg[1]; p[2] = pdg[2]; - for (; _min < _max && z[0][_min] < TINY; ++_min) z[0][_min] = z[1][_min] = 0.; - for (; _max > _min && z[0][_max] < TINY; --_max) z[0][_max] = z[1][_max] = 0.; - _max += 2; - if (_min == 0) - k = 0, z[1][k] = (2*j+2-k)*(2*j-k+1) * p[0] * z[0][k]; - if (_min <= 1) - k = 1, z[1][k] = (2*j+2-k)*(2*j-k+1) * p[0] * z[0][k] + k*(2*j+2-k) * p[1] * z[0][k-1]; - for (k = _min < 2? 2 : _min; k <= _max; ++k) - z[1][k] = (2*j+2-k)*(2*j-k+1) * p[0] * z[0][k] - + k*(2*j+2-k) * p[1] * z[0][k-1] - + k*(k-1)* p[2] * z[0][k-2]; - for (k = _min, sum = 0.; k <= _max; ++k) sum += z[1][k]; - ma->t += log(sum / ((2. * j + 2) * (2. * j + 1))); - for (k = _min; k <= _max; ++k) z[1][k] /= sum; - if (_min >= 1) z[1][_min-1] = 0.; - if (_min >= 2) z[1][_min-2] = 0.; - if (j < ma->n - 1) z[1][_max+1] = z[1][_max+2] = 0.; - if (_j == ma->n1 - 1) { // set pop1 - ma->t1 = ma->t; - memcpy(ma->z1, z[1], sizeof(double) * (ma->n1 * 2 + 1)); + if (ma->M == ma->n * 2) { + for (_j = beg; _j < ma->n; ++_j) { + int k, j = _j - beg, _min = last_min, _max = last_max; + double p[3], sum; + pdg = ma->pdg + _j * 3; + p[0] = pdg[0]; p[1] = 2. * pdg[1]; p[2] = pdg[2]; + for (; _min < _max && z[0][_min] < TINY; ++_min) z[0][_min] = z[1][_min] = 0.; + for (; _max > _min && z[0][_max] < TINY; --_max) z[0][_max] = z[1][_max] = 0.; + _max += 2; + if (_min == 0) + k = 0, z[1][k] = (2*j+2-k)*(2*j-k+1) * p[0] * z[0][k]; + if (_min <= 1) + k = 1, z[1][k] = (2*j+2-k)*(2*j-k+1) * p[0] * z[0][k] + k*(2*j+2-k) * p[1] * z[0][k-1]; + for (k = _min < 2? 2 : _min; k <= _max; ++k) + z[1][k] = (2*j+2-k)*(2*j-k+1) * p[0] * z[0][k] + + k*(2*j+2-k) * p[1] * z[0][k-1] + + k*(k-1)* p[2] * z[0][k-2]; + for (k = _min, sum = 0.; k <= _max; ++k) sum += z[1][k]; + ma->t += log(sum / ((2. * j + 2) * (2. * j + 1))); + for (k = _min; k <= _max; ++k) z[1][k] /= sum; + if (_min >= 1) z[1][_min-1] = 0.; + if (_min >= 2) z[1][_min-2] = 0.; + if (j < ma->n - 1) z[1][_max+1] = z[1][_max+2] = 0.; + if (_j == ma->n1 - 1) { // set pop1; ma->n1==-1 when unset + ma->t1 = ma->t; + memcpy(ma->z1, z[1], sizeof(double) * (ma->n1 * 2 + 1)); + } + tmp = z[0]; z[0] = z[1]; z[1] = tmp; + last_min = _min; last_max = _max; + } + } else { // this block is very similar to the block above; these two might be merged in future + int j, M = 0; + for (j = 0; j < ma->n; ++j) { + int k, M0, _min = last_min, _max = last_max; + double p[3], sum; + pdg = ma->pdg + j * 3; + for (; _min < _max && z[0][_min] < TINY; ++_min) z[0][_min] = z[1][_min] = 0.; + for (; _max > _min && z[0][_max] < TINY; --_max) z[0][_max] = z[1][_max] = 0.; + M0 = M; + M += ma->ploidy[j]; + if (ma->ploidy[j] == 1) { + p[0] = pdg[0]; p[1] = pdg[2]; + _max++; + if (_min == 0) k = 0, z[1][k] = (M0+1-k) * p[0] * z[0][k]; + for (k = _min < 1? 1 : _min; k <= _max; ++k) + z[1][k] = (M0+1-k) * p[0] * z[0][k] + k * p[1] * z[0][k-1]; + for (k = _min, sum = 0.; k <= _max; ++k) sum += z[1][k]; + ma->t += log(sum / M); + for (k = _min; k <= _max; ++k) z[1][k] /= sum; + if (_min >= 1) z[1][_min-1] = 0.; + if (j < ma->n - 1) z[1][_max+1] = 0.; + } else if (ma->ploidy[j] == 2) { + p[0] = pdg[0]; p[1] = 2 * pdg[1]; p[2] = pdg[2]; + _max += 2; + if (_min == 0) k = 0, z[1][k] = (M0-k+1) * (M0-k+2) * p[0] * z[0][k]; + if (_min <= 1) k = 1, z[1][k] = (M0-k+1) * (M0-k+2) * p[0] * z[0][k] + k*(M0-k+2) * p[1] * z[0][k-1]; + for (k = _min < 2? 2 : _min; k <= _max; ++k) + z[1][k] = (M0-k+1)*(M0-k+2) * p[0] * z[0][k] + k*(M0-k+2) * p[1] * z[0][k-1] + k*(k-1)* p[2] * z[0][k-2]; + for (k = _min, sum = 0.; k <= _max; ++k) sum += z[1][k]; + ma->t += log(sum / (M * (M - 1.))); + for (k = _min; k <= _max; ++k) z[1][k] /= sum; + if (_min >= 1) z[1][_min-1] = 0.; + if (_min >= 2) z[1][_min-2] = 0.; + if (j < ma->n - 1) z[1][_max+1] = z[1][_max+2] = 0.; + } + tmp = z[0]; z[0] = z[1]; z[1] = tmp; + last_min = _min; last_max = _max; } - tmp = z[0]; z[0] = z[1]; z[1] = tmp; - last_min = _min; last_max = _max; } if (z[0] != ma->z) memcpy(ma->z, z[0], sizeof(double) * (ma->M + 1)); } static void mc_cal_y(bcf_p1aux_t *ma) { - if (ma->n1 > 0 && ma->n1 < ma->n) { + if (ma->n1 > 0 && ma->n1 < ma->n && ma->M == ma->n * 2) { // NB: ma->n1 is ineffective when there are haploid samples int k; long double x; memset(ma->z1, 0, sizeof(double) * (2 * ma->n1 + 1)); @@ -337,32 +398,6 @@ static double mc_cal_afs(bcf_p1aux_t *ma, double *p_ref_folded, double *p_var_fo return sum / ma->M; } -long double bcf_p1_cal_g3(bcf_p1aux_t *p1a, double g[3]) -{ - long double pd = 0., g2[3]; - int i, k; - memset(g2, 0, sizeof(long double) * 3); - for (k = 0; k < p1a->M; ++k) { - double f = (double)k / p1a->M, f3[3], g1[3]; - long double z = 1.; - g1[0] = g1[1] = g1[2] = 0.; - f3[0] = (1. - f) * (1. - f); f3[1] = 2. * f * (1. - f); f3[2] = f * f; - for (i = 0; i < p1a->n; ++i) { - double *pdg = p1a->pdg + i * 3; - double x = pdg[0] * f3[0] + pdg[1] * f3[1] + pdg[2] * f3[2]; - z *= x; - g1[0] += pdg[0] * f3[0] / x; - g1[1] += pdg[1] * f3[1] / x; - g1[2] += pdg[2] * f3[2] / x; - } - pd += p1a->phi[k] * z; - for (i = 0; i < 3; ++i) - g2[i] += p1a->phi[k] * z * g1[i]; - } - for (i = 0; i < 3; ++i) g[i] = g2[i] / pd; - return pd; -} - int bcf_p1_cal(const bcf1_t *b, bcf_p1aux_t *ma, bcf_p1rst_t *rst) { int i, k; diff --git a/bcftools/prob1.h b/bcftools/prob1.h index b4f6a30..eb4dc92 100644 --- a/bcftools/prob1.h +++ b/bcftools/prob1.h @@ -22,7 +22,7 @@ typedef struct { extern "C" { #endif - bcf_p1aux_t *bcf_p1_init(int n); + bcf_p1aux_t *bcf_p1_init(int n, uint8_t *ploidy); void bcf_p1_init_prior(bcf_p1aux_t *ma, int type, double theta); void bcf_p1_init_subprior(bcf_p1aux_t *ma, int type, double theta); void bcf_p1_destroy(bcf_p1aux_t *ma); @@ -30,7 +30,6 @@ extern "C" { int bcf_p1_call_gt(const bcf_p1aux_t *ma, double f0, int k); void bcf_p1_dump_afs(bcf_p1aux_t *ma); int bcf_p1_read_prior(bcf_p1aux_t *ma, const char *fn); - long double bcf_p1_cal_g3(bcf_p1aux_t *p1a, double g[3]); int bcf_p1_set_n1(bcf_p1aux_t *b, int n1); void bcf_p1_set_folded(bcf_p1aux_t *p1a); // only effective when set_n1() is not called -- 2.39.2