struct __mc_aux_t {
int n, N;
int ref, alt;
+ double theta;
double *q2p, *pdg; // pdg -> P(D|g)
+ double *alpha, *beta;
int *qsum, *bcnt, rcnt[4];
};
+void mc_init_prior(mc_aux_t *ma, double theta)
+{
+ double sum;
+ int i;
+ ma->theta = theta;
+ for (i = 0, sum = 0.; i < 2 * ma->n; ++i)
+ sum += (ma->alpha[i] = ma->theta / (2 * ma->n - i));
+ ma->alpha[2 * ma->n] = 1. - sum;
+}
+
mc_aux_t *mc_init(int n) // FIXME: assuming diploid
{
mc_aux_t *ma;
ma->qsum = calloc(4 * ma->n, sizeof(int));
ma->bcnt = calloc(4 * ma->n, sizeof(int));
ma->pdg = calloc(3 * ma->n, sizeof(double));
+ ma->alpha = calloc(2 * ma->n + 1, sizeof(double));
+ ma->beta = calloc((2 * ma->n + 1) * 3, sizeof(double));
for (i = 0; i <= MC_MAX_SUMQ; ++i)
ma->q2p[i] = pow(10., -i / 10.);
+ for (i = 0; i <= 2 * ma->n; ++i) {
+ double *bi = ma->beta + 3 * i;
+ double f = (double)i / (2 * ma->n);
+ bi[0] = (1. - f) * (1. - f);
+ bi[1] = 2 * f * (1. - f);
+ bi[2] = f * f;
+ }
+ mc_init_prior(ma, 1e-3); // the simplest prior
return ma;
}
if (ma) {
free(ma->qsum); free(ma->bcnt);
free(ma->q2p); free(ma->pdg);
+ free(ma->alpha); free(ma->beta);
free(ma);
}
}
pdg[i] = pi[i] > MC_MAX_SUMQ? MC_MAX_SUMQP : ma->q2p[pi[i]];
}
}
-
+// return the reference allele frequency
double mc_freq0(int ref, int *n, const bam_pileup1_t **plp, mc_aux_t *ma, int *_ref, int *alt)
{
int i, acnt[4], j;
*_ref = ma->ref; *alt = ma->alt;
return ma->n == 1 || fr < 0.? f0 : fr;
}
-
+// f0 is the reference allele frequency
double mc_freq_iter(double f0, mc_aux_t *ma)
{
double f, f3[3];
f /= ma->n * 2.;
return f;
}
+
+double mc_ref_prob(mc_aux_t *ma)
+{
+ int k, i, g;
+ long double PD = 0., Pref = 0.;
+ for (k = 0; k <= ma->n * 2; ++k) {
+ long double x = 1.;
+ double *bk = ma->beta + k * 3;
+ for (i = 0; i < ma->n; ++i) {
+ double y = 0., *pdg = ma->pdg + i * 3;
+ for (g = 0; g < 3; ++g)
+ y += pdg[g] * bk[g];
+ x *= y;
+ }
+ PD += x * ma->alpha[k];
+ }
+ for (k = 0; k <= ma->n * 2; ++k) {
+ long double x = 1.0;
+ for (i = 0; i < ma->n; ++i)
+ x *= ma->pdg[i * 3 + 2] * ma->beta[k * 3 + 2];
+ Pref += x * ma->alpha[k];
+ }
+ return Pref / PD;
+}
+
+int mc_call_gt(const mc_aux_t *ma, double f0, int k)
+{
+ double sum, g[3];
+ double max, f3[3], *pdg = ma->pdg + k * 3;
+ int q, i, max_i;
+ f3[0] = (1.-f0)*(1.-f0); f3[1] = 2.*f0*(1.-f0); f3[2] = f0*f0;
+ for (i = 0, sum = 0.; i < 3; ++i)
+ sum += (g[i] = pdg[i] * f3[i]);
+ for (i = 0, max = -1., max_i = 0; i < 3; ++i) {
+ g[i] /= sum;
+ if (g[i] > max) max = g[i], max_i = i;
+ }
+ max = 1. - max;
+ if (max < 1e-308) max = 1e-308;
+ q = (int)(-3.434 * log(max) + .499);
+ if (q > 99) q = 99;
+ return q<<2|max_i;
+}
void mc_destroy(mc_aux_t *ma);
double mc_freq0(int ref, int *n, const bam_pileup1_t **plp, mc_aux_t *ma, int *_ref, int *alt);
double mc_freq_iter(double f0, mc_aux_t *ma);
+ double mc_ref_prob(mc_aux_t *ma);
+ int mc_call_gt(const mc_aux_t *ma, double f0, int k);
#ifdef __cplusplus
}
***********/
typedef struct {
- int vcf, max_mq, min_mq;
+ int vcf, max_mq, min_mq, var_only;
char *reg, *fn_pos;
faidx_t *fai;
kh_64_t *hash;
if (conf->vcf) {
kstring_t s;
s.l = s.m = 0; s.s = 0;
+ puts("##fileformat=VCFv4.0");
kputs("#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT", &s);
for (i = 0; i < n; ++i) {
const char *p;
ref_tid = tid;
}
if (conf->vcf) {
- double f0, f; // the reference allele frequency
- int j, _ref, _alt, _ref0, depth, rms_q, _ref0b;
+ double f0, f, pref = -1.0; // the reference allele frequency
+ int j, _ref, _alt, _ref0, depth, rms_q, _ref0b, is_var = 0, qref = 0;
uint64_t sqr_sum;
_ref0 = _ref0b = (ref && pos < ref_len)? ref[pos] : 'N';
_ref0 = bam_nt16_nt4_table[bam_nt16_table[_ref0]];
f = f0 = mc_freq0(_ref0, n_plp, plp, ma, &_ref, &_alt);
if (f >= 0.0) {
- double flast = f;
- for (j = 0; j < 10; ++j) {
+ double q, flast = f;
+ for (j = 0; j < 10; ++j){
f = mc_freq_iter(flast, ma);
if (fabs(f - flast) < 1e-3) break;
flast = f;
}
+ pref = mc_ref_prob(ma);
+ is_var = (pref < .5);
+ q = is_var? pref : 1. - pref;
+ if (q < 1e-308) q = 1e-308;
+ qref = (int)(-3.434 * log(q) + .499);
+ if (qref > 99) qref = 99;
}
+ if (conf->var_only && !is_var) continue;
printf("%s\t%d\t.\t%c\t", h->target_name[tid], pos + 1, _ref0b);
- if (_ref0 == _ref) putchar("ACGTN"[_alt]);
- else printf("%c,%c", "ACGTN"[_ref], "ACGTN"[_alt]);
- printf("\t0\t"); // FIXME: currently these not available
+ if (is_var) {
+ if (_ref0 == _ref) putchar("ACGTN"[_alt]);
+ else printf("%c,%c", "ACGTN"[_ref], "ACGTN"[_alt]);
+ } else putchar('.');
+ printf("\t%d\t", qref);
if (f0 < 0.) printf("Q13\t");
else printf(".\t");
for (i = depth = 0, sqr_sum = 0; i < n; ++i) {
}
rms_q = (int)(sqrt((double)sqr_sum / depth) + .499);
printf("DP=%d;MQ=%d;AF=%.3lg", depth, rms_q, f0<0.?0.:1.-f);
- printf("\tDP");
+ printf("\tGT:GQ:DP");
// output genotype information; FIXME: to be implmented...
- for (i = 0; i < n; ++i)
- printf("\t%d", n_plp[i]);
+ for (i = 0; i < n; ++i) {
+ int x = mc_call_gt(ma, f, i);
+ printf("\t%c/%c:%d:%d", "10"[((x&3)==2)], "10"[((x&3)>0)], x>>2, n_plp[i]);
+ }
putchar('\n');
} else {
printf("%s\t%d\t%c", h->target_name[tid], pos + 1, (ref && pos < ref_len)? ref[pos] : 'N');
mplp_conf_t mplp;
memset(&mplp, 0, sizeof(mplp_conf_t));
mplp.max_mq = 60;
- while ((c = getopt(argc, argv, "f:r:l:VM:q:")) >= 0) {
+ while ((c = getopt(argc, argv, "f:r:l:VvM:q:")) >= 0) {
switch (c) {
case 'f':
mplp.fai = fai_load(optarg);
break;
case 'r': mplp.reg = strdup(optarg); break;
case 'l': mplp.fn_pos = strdup(optarg); break;
+ case 'v': mplp.var_only = 1; break;
case 'V': mplp.vcf = 1; break;
case 'M': mplp.max_mq = atoi(optarg); break;
case 'q': mplp.min_mq = atoi(optarg); break;
} else if (c == KEY_ENTER || c == '\012' || c == '\015') {
int _tid = -1, _beg, _end;
if (str[0] == '=') {
- _beg = strtol(str+1, &p, 10);
+ _beg = strtol(str+1, &p, 10) - 1;
if (_beg > 0) {
*pos = _beg;
return;