#define VC_VARONLY 16
#define VC_VCFIN 32
#define VC_UNCOMP 64
-#define VC_HWE 128
#define VC_KEEPALT 256
#define VC_ACGT_ONLY 512
#define VC_QCALL 1024
typedef struct {
int flag, prior_type, n1, n_sub, *sublist;
char *fn_list, *prior_file, **subsam, *fn_dict;
+ uint8_t *ploidy;
double theta, pref, indel_frac;
} viewconf_t;
return hash;
}
-static double test_hwe(const double g[3])
-{
- extern double kf_gammaq(double p, double x);
- double fexp, chi2, f[3], n;
- int i;
- n = g[0] + g[1] + g[2];
- fexp = (2. * g[2] + g[1]) / (2. * n);
- if (fexp > 1. - 1e-10) fexp = 1. - 1e-10;
- if (fexp < 1e-10) fexp = 1e-10;
- f[0] = n * (1. - fexp) * (1. - fexp);
- f[1] = n * 2. * fexp * (1. - fexp);
- f[2] = n * fexp * fexp;
- for (i = 0, chi2 = 0.; i < 3; ++i)
- chi2 += (g[i] - f[i]) * (g[i] - f[i]) / f[i];
- return kf_gammaq(.5, chi2 / 2.);
-}
-
typedef struct {
double p[4];
int mq, depth, is_tested, d[4];
{
kstring_t s;
int is_var = (pr->p_ref < pref);
- double p_hwe, r = is_var? pr->p_ref : pr->p_var, fq;
+ double r = is_var? pr->p_ref : pr->p_var, fq;
anno16_t a;
- p_hwe = pr->g[0] >= 0.? test_hwe(pr->g) : 1.0; // only do HWE g[] is calculated
test16(b, &a);
rm_info(b, "I16=");
if (pr->pc[0] >= 0.) ksprintf(&s, ";PC4=%g,%g,%g,%g", pr->pc[0], pr->pc[1], pr->pc[2], pr->pc[3]);
ksprintf(&s, ";PV4=%.2g,%.2g,%.2g,%.2g", a.p[0], a.p[1], a.p[2], a.p[3]);
}
- if (pr->g[0] >= 0. && p_hwe <= .2)
- ksprintf(&s, ";GC=%.2f,%.2f,%.2f;HWE=%.3f", pr->g[2], pr->g[1], pr->g[0], p_hwe);
kputc('\0', &s);
kputs(b->fmt, &s); kputc('\0', &s);
free(b->str);
if (fp == 0) return 0; // fail to open file
ks = ks_init(fp);
while (ks_getuntil(ks, 0, &s, &dret) >= 0) {
+ int l;
if (max == n) {
max = max? max<<1 : 4;
sam = realloc(sam, sizeof(void*)*max);
}
- sam[n++] = strdup(s.s);
+ l = s.l;
+ sam[n] = malloc(s.l + 2);
+ strcpy(sam[n], s.s);
+ sam[n][l+1] = 2; // by default, diploid
+ if (dret != '\n') {
+ if (ks_getuntil(ks, 0, &s, &dret) >= 0) { // read ploidy, 1 or 2
+ int x = (int)s.s[0] - '0';
+ if (x == 1 || x == 2) sam[n][l+1] = x;
+ else fprintf(stderr, "(%s) ploidy can only be 1 or 2; assume diploid\n", __func__);
+ }
+ if (dret != '\n') ks_getuntil(ks, '\n', &s, &dret);
+ }
+ ++n;
}
ks_destroy(ks);
gzclose(fp);
tid = begin = end = -1;
memset(&vc, 0, sizeof(viewconf_t));
vc.prior_type = vc.n1 = -1; vc.theta = 1e-3; vc.pref = 0.5; vc.indel_frac = -1.;
- vc.n_sub = 0; vc.subsam = 0; vc.sublist = 0;
while ((c = getopt(argc, argv, "N1:l:cHAGvbSuP:t:p:QgLi:IMs:D:")) >= 0) {
switch (c) {
case '1': vc.n1 = atoi(optarg); break;
case 'c': vc.flag |= VC_CALL; break;
case 'v': vc.flag |= VC_VARONLY | VC_CALL; break;
case 'u': vc.flag |= VC_UNCOMP | VC_BCFOUT; break;
- case 'H': vc.flag |= VC_HWE; break;
case 'g': vc.flag |= VC_CALL_GT | VC_CALL; break;
case 'I': vc.flag |= VC_NO_INDEL; break;
case 'M': vc.flag |= VC_ANNO_MAX; break;
case 'i': vc.indel_frac = atof(optarg); break;
case 'Q': vc.flag |= VC_QCALL; break;
case 'L': vc.flag |= VC_ADJLD; break;
- case 's': vc.subsam = read_samples(optarg, &vc.n_sub); break;
+ case 's': vc.subsam = read_samples(optarg, &vc.n_sub);
+ vc.ploidy = calloc(vc.n_sub + 1, 1);
+ for (tid = 0; tid < vc.n_sub; ++tid) vc.ploidy[tid] = vc.subsam[tid][strlen(vc.subsam[tid]) + 1];
+ tid = -1;
+ break;
case 'P':
if (strcmp(optarg, "full") == 0) vc.prior_type = MC_PTYPE_FULL;
else if (strcmp(optarg, "cond2") == 0) vc.prior_type = MC_PTYPE_COND2;
vcf_hdr_write(bout, hout);
}
if (vc.flag & VC_CALL) {
- p1 = bcf_p1_init(hout->n_smpl);
+ p1 = bcf_p1_init(hout->n_smpl, vc.ploidy);
if (vc.prior_file) {
if (bcf_p1_read_prior(p1, vc.prior_file) < 0) {
fprintf(stderr, "[%s] fail to read the prior AFS.\n", __func__);
if (vc.flag & VC_CALL) { // call variants
bcf_p1rst_t pr;
bcf_p1_cal(b, p1, &pr); // pr.g[3] is not calculated here
- if (vc.flag&VC_HWE) bcf_p1_cal_g3(p1, pr.g);
if (n_processed % 100000 == 0) {
fprintf(stderr, "[%s] %ld sites processed.\n", __func__, (long)n_processed);
bcf_p1_dump_afs(p1);
if (hash) kh_destroy(set64, hash);
if (vc.fn_list) free(vc.fn_list);
if (vc.fn_dict) free(vc.fn_dict);
+ if (vc.ploidy) free(vc.ploidy);
if (vc.n_sub) {
int i;
for (i = 0; i < vc.n_sub; ++i) free(vc.subsam[i]);
#include <string.h>
#include <stdio.h>
#include <errno.h>
+#include <assert.h>
#include "prob1.h"
#include "kseq.h"
struct __bcf_p1aux_t {
int n, M, n1, is_indel;
+ uint8_t *ploidy; // haploid or diploid ONLY
double *q2p, *pdg; // pdg -> P(D|g)
double *phi, *phi_indel;
double *z, *zswap; // aux for afs
return 0;
}
-bcf_p1aux_t *bcf_p1_init(int n)
+bcf_p1aux_t *bcf_p1_init(int n, uint8_t *ploidy)
{
bcf_p1aux_t *ma;
int i;
ma = calloc(1, sizeof(bcf_p1aux_t));
ma->n1 = -1;
ma->n = n; ma->M = 2 * n;
+ if (ploidy) {
+ ma->ploidy = malloc(n);
+ memcpy(ma->ploidy, ploidy, n);
+ for (i = 0, ma->M = 0; i < n; ++i) ma->M += ploidy[i];
+ if (ma->M == 2 * n) {
+ free(ma->ploidy);
+ ma->ploidy = 0;
+ }
+ }
ma->q2p = calloc(256, sizeof(double));
ma->pdg = calloc(3 * ma->n, sizeof(double));
ma->phi = calloc(ma->M + 1, sizeof(double));
ma->phi_indel = calloc(ma->M + 1, sizeof(double));
ma->phi1 = calloc(ma->M + 1, sizeof(double));
ma->phi2 = calloc(ma->M + 1, sizeof(double));
- ma->z = calloc(2 * ma->n + 1, sizeof(double));
- ma->zswap = calloc(2 * ma->n + 1, sizeof(double));
+ ma->z = calloc(ma->M + 1, sizeof(double));
+ ma->zswap = calloc(ma->M + 1, sizeof(double));
ma->z1 = calloc(ma->M + 1, sizeof(double)); // actually we do not need this large
ma->z2 = calloc(ma->M + 1, sizeof(double));
- ma->afs = calloc(2 * ma->n + 1, sizeof(double));
- ma->afs1 = calloc(2 * ma->n + 1, sizeof(double));
+ ma->afs = calloc(ma->M + 1, sizeof(double));
+ ma->afs1 = calloc(ma->M + 1, sizeof(double));
for (i = 0; i < 256; ++i)
ma->q2p[i] = pow(10., -i / 10.);
bcf_p1_init_prior(ma, MC_PTYPE_FULL, 1e-3); // the simplest prior
int bcf_p1_set_n1(bcf_p1aux_t *b, int n1)
{
if (n1 == 0 || n1 >= b->n) return -1;
+ if (b->M != b->n * 2) {
+ fprintf(stderr, "[%s] unable to set `n1' when there are haploid samples.\n", __func__);
+ return -1;
+ }
b->n1 = n1;
return 0;
}
void bcf_p1_destroy(bcf_p1aux_t *ma)
{
if (ma) {
- free(ma->q2p); free(ma->pdg);
+ free(ma->ploidy); free(ma->q2p); free(ma->pdg);
free(ma->phi); free(ma->phi_indel); free(ma->phi1); free(ma->phi2);
free(ma->z); free(ma->zswap); free(ma->z1); free(ma->z2);
free(ma->afs); free(ma->afs1);
{
double sum, g[3];
double max, f3[3], *pdg = ma->pdg + k * 3;
- int q, i, max_i;
- f3[0] = (1.-f0)*(1.-f0); f3[1] = 2.*f0*(1.-f0); f3[2] = f0*f0;
+ int q, i, max_i, ploidy;
+ ploidy = ma->ploidy? ma->ploidy[k] : 2;
+ if (ploidy == 2) {
+ f3[0] = (1.-f0)*(1.-f0); f3[1] = 2.*f0*(1.-f0); f3[2] = f0*f0;
+ } else {
+ f3[0] = 1. - f0; f3[1] = 0; f3[2] = f0;
+ }
for (i = 0, sum = 0.; i < 3; ++i)
sum += (g[i] = pdg[i] * f3[i]);
- for (i = 0, max = -1., max_i = 0; i < 3; ++i) {
+ for (i = 0, max = -1., max_i = 0; i <= ploidy; ++i) {
g[i] /= sum;
if (g[i] > max) max = g[i], max_i = i;
}
{
double *z[2], *tmp, *pdg;
int _j, last_min, last_max;
+ assert(beg == 0 || ma->M == ma->n*2);
z[0] = ma->z;
z[1] = ma->zswap;
pdg = ma->pdg;
z[0][0] = 1.;
last_min = last_max = 0;
ma->t = 0.;
- for (_j = beg; _j < ma->n; ++_j) {
- int k, j = _j - beg, _min = last_min, _max = last_max;
- double p[3], sum;
- pdg = ma->pdg + _j * 3;
- p[0] = pdg[0]; p[1] = 2. * pdg[1]; p[2] = pdg[2];
- for (; _min < _max && z[0][_min] < TINY; ++_min) z[0][_min] = z[1][_min] = 0.;
- for (; _max > _min && z[0][_max] < TINY; --_max) z[0][_max] = z[1][_max] = 0.;
- _max += 2;
- if (_min == 0)
- k = 0, z[1][k] = (2*j+2-k)*(2*j-k+1) * p[0] * z[0][k];
- if (_min <= 1)
- k = 1, z[1][k] = (2*j+2-k)*(2*j-k+1) * p[0] * z[0][k] + k*(2*j+2-k) * p[1] * z[0][k-1];
- for (k = _min < 2? 2 : _min; k <= _max; ++k)
- z[1][k] = (2*j+2-k)*(2*j-k+1) * p[0] * z[0][k]
- + k*(2*j+2-k) * p[1] * z[0][k-1]
- + k*(k-1)* p[2] * z[0][k-2];
- for (k = _min, sum = 0.; k <= _max; ++k) sum += z[1][k];
- ma->t += log(sum / ((2. * j + 2) * (2. * j + 1)));
- for (k = _min; k <= _max; ++k) z[1][k] /= sum;
- if (_min >= 1) z[1][_min-1] = 0.;
- if (_min >= 2) z[1][_min-2] = 0.;
- if (j < ma->n - 1) z[1][_max+1] = z[1][_max+2] = 0.;
- if (_j == ma->n1 - 1) { // set pop1
- ma->t1 = ma->t;
- memcpy(ma->z1, z[1], sizeof(double) * (ma->n1 * 2 + 1));
+ if (ma->M == ma->n * 2) {
+ for (_j = beg; _j < ma->n; ++_j) {
+ int k, j = _j - beg, _min = last_min, _max = last_max;
+ double p[3], sum;
+ pdg = ma->pdg + _j * 3;
+ p[0] = pdg[0]; p[1] = 2. * pdg[1]; p[2] = pdg[2];
+ for (; _min < _max && z[0][_min] < TINY; ++_min) z[0][_min] = z[1][_min] = 0.;
+ for (; _max > _min && z[0][_max] < TINY; --_max) z[0][_max] = z[1][_max] = 0.;
+ _max += 2;
+ if (_min == 0)
+ k = 0, z[1][k] = (2*j+2-k)*(2*j-k+1) * p[0] * z[0][k];
+ if (_min <= 1)
+ k = 1, z[1][k] = (2*j+2-k)*(2*j-k+1) * p[0] * z[0][k] + k*(2*j+2-k) * p[1] * z[0][k-1];
+ for (k = _min < 2? 2 : _min; k <= _max; ++k)
+ z[1][k] = (2*j+2-k)*(2*j-k+1) * p[0] * z[0][k]
+ + k*(2*j+2-k) * p[1] * z[0][k-1]
+ + k*(k-1)* p[2] * z[0][k-2];
+ for (k = _min, sum = 0.; k <= _max; ++k) sum += z[1][k];
+ ma->t += log(sum / ((2. * j + 2) * (2. * j + 1)));
+ for (k = _min; k <= _max; ++k) z[1][k] /= sum;
+ if (_min >= 1) z[1][_min-1] = 0.;
+ if (_min >= 2) z[1][_min-2] = 0.;
+ if (j < ma->n - 1) z[1][_max+1] = z[1][_max+2] = 0.;
+ if (_j == ma->n1 - 1) { // set pop1; ma->n1==-1 when unset
+ ma->t1 = ma->t;
+ memcpy(ma->z1, z[1], sizeof(double) * (ma->n1 * 2 + 1));
+ }
+ tmp = z[0]; z[0] = z[1]; z[1] = tmp;
+ last_min = _min; last_max = _max;
+ }
+ } else { // this block is very similar to the block above; these two might be merged in future
+ int j, M = 0;
+ for (j = 0; j < ma->n; ++j) {
+ int k, M0, _min = last_min, _max = last_max;
+ double p[3], sum;
+ pdg = ma->pdg + j * 3;
+ for (; _min < _max && z[0][_min] < TINY; ++_min) z[0][_min] = z[1][_min] = 0.;
+ for (; _max > _min && z[0][_max] < TINY; --_max) z[0][_max] = z[1][_max] = 0.;
+ M0 = M;
+ M += ma->ploidy[j];
+ if (ma->ploidy[j] == 1) {
+ p[0] = pdg[0]; p[1] = pdg[2];
+ _max++;
+ if (_min == 0) k = 0, z[1][k] = (M0+1-k) * p[0] * z[0][k];
+ for (k = _min < 1? 1 : _min; k <= _max; ++k)
+ z[1][k] = (M0+1-k) * p[0] * z[0][k] + k * p[1] * z[0][k-1];
+ for (k = _min, sum = 0.; k <= _max; ++k) sum += z[1][k];
+ ma->t += log(sum / M);
+ for (k = _min; k <= _max; ++k) z[1][k] /= sum;
+ if (_min >= 1) z[1][_min-1] = 0.;
+ if (j < ma->n - 1) z[1][_max+1] = 0.;
+ } else if (ma->ploidy[j] == 2) {
+ p[0] = pdg[0]; p[1] = 2 * pdg[1]; p[2] = pdg[2];
+ _max += 2;
+ if (_min == 0) k = 0, z[1][k] = (M0-k+1) * (M0-k+2) * p[0] * z[0][k];
+ if (_min <= 1) k = 1, z[1][k] = (M0-k+1) * (M0-k+2) * p[0] * z[0][k] + k*(M0-k+2) * p[1] * z[0][k-1];
+ for (k = _min < 2? 2 : _min; k <= _max; ++k)
+ z[1][k] = (M0-k+1)*(M0-k+2) * p[0] * z[0][k] + k*(M0-k+2) * p[1] * z[0][k-1] + k*(k-1)* p[2] * z[0][k-2];
+ for (k = _min, sum = 0.; k <= _max; ++k) sum += z[1][k];
+ ma->t += log(sum / (M * (M - 1.)));
+ for (k = _min; k <= _max; ++k) z[1][k] /= sum;
+ if (_min >= 1) z[1][_min-1] = 0.;
+ if (_min >= 2) z[1][_min-2] = 0.;
+ if (j < ma->n - 1) z[1][_max+1] = z[1][_max+2] = 0.;
+ }
+ tmp = z[0]; z[0] = z[1]; z[1] = tmp;
+ last_min = _min; last_max = _max;
}
- tmp = z[0]; z[0] = z[1]; z[1] = tmp;
- last_min = _min; last_max = _max;
}
if (z[0] != ma->z) memcpy(ma->z, z[0], sizeof(double) * (ma->M + 1));
}
static void mc_cal_y(bcf_p1aux_t *ma)
{
- if (ma->n1 > 0 && ma->n1 < ma->n) {
+ if (ma->n1 > 0 && ma->n1 < ma->n && ma->M == ma->n * 2) { // NB: ma->n1 is ineffective when there are haploid samples
int k;
long double x;
memset(ma->z1, 0, sizeof(double) * (2 * ma->n1 + 1));
return sum / ma->M;
}
-long double bcf_p1_cal_g3(bcf_p1aux_t *p1a, double g[3])
-{
- long double pd = 0., g2[3];
- int i, k;
- memset(g2, 0, sizeof(long double) * 3);
- for (k = 0; k < p1a->M; ++k) {
- double f = (double)k / p1a->M, f3[3], g1[3];
- long double z = 1.;
- g1[0] = g1[1] = g1[2] = 0.;
- f3[0] = (1. - f) * (1. - f); f3[1] = 2. * f * (1. - f); f3[2] = f * f;
- for (i = 0; i < p1a->n; ++i) {
- double *pdg = p1a->pdg + i * 3;
- double x = pdg[0] * f3[0] + pdg[1] * f3[1] + pdg[2] * f3[2];
- z *= x;
- g1[0] += pdg[0] * f3[0] / x;
- g1[1] += pdg[1] * f3[1] / x;
- g1[2] += pdg[2] * f3[2] / x;
- }
- pd += p1a->phi[k] * z;
- for (i = 0; i < 3; ++i)
- g2[i] += p1a->phi[k] * z * g1[i];
- }
- for (i = 0; i < 3; ++i) g[i] = g2[i] / pd;
- return pd;
-}
-
int bcf_p1_cal(const bcf1_t *b, bcf_p1aux_t *ma, bcf_p1rst_t *rst)
{
int i, k;