From e561d6e1a3c58df4de0e2856c2cfb9b2b68e7f11 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Tue, 9 Nov 2010 15:45:33 +0000 Subject: [PATCH] * samtools-0.1.9-6 (r803) * mpileup: apply homopolymer correction when calculating GL, instead of before * bcftools: apply a different prior to indels --- bam2bcf.c | 6 ++++-- bam2bcf_indel.c | 17 ++++++++--------- bamtk.c | 2 +- bcftools/bcf.c | 10 ++++++++++ bcftools/bcf.h | 2 ++ bcftools/prob1.c | 25 +++++++++++++++++++------ 6 files changed, 44 insertions(+), 18 deletions(-) diff --git a/bam2bcf.c b/bam2bcf.c index f7c48aa..1ba288c 100644 --- a/bam2bcf.c +++ b/bam2bcf.c @@ -53,11 +53,13 @@ int bcf_call_glfgen(int _n, const bam_pileup1_t *pl, int ref_base, bcf_callaux_t memset(r, 0, sizeof(bcf_callret1_t)); for (i = n = 0; i < _n; ++i) { const bam_pileup1_t *p = pl + i; - int q, b, mapQ, baseQ, is_diff, min_dist; + int q, b, mapQ, baseQ, is_diff, min_dist, seqQ; // set base if (p->is_del || p->is_refskip || (p->b->core.flag&BAM_FUNMAP)) continue; baseQ = q = is_indel? p->aux&0xff : (int)bam1_qual(p->b)[p->qpos]; // base/indel quality + seqQ = is_indel? (p->aux>>8&0xff) : 99; if (q < bca->min_baseQ) continue; + if (q > seqQ) q = seqQ; mapQ = p->b->core.qual < bca->capQ? p->b->core.qual : bca->capQ; if (q > mapQ) q = mapQ; if (q > 63) q = 63; @@ -67,7 +69,7 @@ int bcf_call_glfgen(int _n, const bam_pileup1_t *pl, int ref_base, bcf_callaux_t b = bam_nt16_nt4_table[b? b : ref_base]; // b is the 2-bit base is_diff = (ref4 < 4 && b == ref4)? 0 : 1; } else { - b = p->aux>>8&0x3f; + b = p->aux>>16&0x3f; is_diff = (b != 0); } bca->bases[n++] = q<<5 | (int)bam1_strand(p->b)<<4 | b; diff --git a/bam2bcf_indel.c b/bam2bcf_indel.c index 0632eef..2254cf0 100644 --- a/bam2bcf_indel.c +++ b/bam2bcf_indel.c @@ -228,7 +228,7 @@ int bcf_call_gap_prep(int n, int *n_plp, bam_pileup1_t **plp, int pos, bcf_calla for (s = K = 0; s < n; ++s) { for (i = 0; i < n_plp[s]; ++i, ++K) { bam_pileup1_t *p = plp[s] + i; - int *sct = &score[K*n_types], indelQ; + int *sct = &score[K*n_types], indelQ, seqQ; for (t = 0; t < n_types; ++t) sc[t] = sct[t]<<6 | t; for (t = 1; t < n_types; ++t) // insertion sort for (j = t; j > 0 && sc[j] < sc[j-1]; --j) @@ -241,16 +241,15 @@ int bcf_call_gap_prep(int n, int *n_plp, bam_pileup1_t **plp, int pos, bcf_calla */ if ((sc[0]&0x3f) == ref_type) { indelQ = (sc[1]>>6) - (sc[0]>>6); - tmp = est_seqQ(bca, types[sc[1]&0x3f], l_run); + seqQ = est_seqQ(bca, types[sc[1]&0x3f], l_run); } else { for (t = 0; t < n_types; ++t) // look for the reference type if ((sc[t]&0x3f) == ref_type) break; indelQ = (sc[t]>>6) - (sc[0]>>6); - tmp = est_seqQ(bca, types[sc[0]&0x3f], l_run); + seqQ = est_seqQ(bca, types[sc[0]&0x3f], l_run); } - if (indelQ > tmp) indelQ = tmp; - p->aux = (sc[0]&0x3f)<<8 | indelQ; - sumq[sc[0]&0x3f] += indelQ; + p->aux = (sc[0]&0x3f)<<16 | seqQ<<8 | indelQ; + sumq[sc[0]&0x3f] += indelQ < seqQ? indelQ : seqQ; // fprintf(stderr, "pos=%d read=%d:%d name=%s call=%d q=%d\n", pos, s, i, bam1_qname(p->b), types[sc[0]&0x3f], indelQ); } } @@ -278,11 +277,11 @@ int bcf_call_gap_prep(int n, int *n_plp, bam_pileup1_t **plp, int pos, bcf_calla for (s = K = 0; s < n; ++s) { for (i = 0; i < n_plp[s]; ++i, ++K) { bam_pileup1_t *p = plp[s] + i; - int x = types[p->aux>>8&0x3f]; + int x = types[p->aux>>16&0x3f]; for (j = 0; j < 4; ++j) if (x == bca->indel_types[j]) break; - p->aux = j<<8 | (j == 4? 0 : (p->aux&0xff)); -// fprintf(stderr, "pos=%d read=%d:%d name=%s call=%d q=%d\n", pos, s, i, bam1_qname(p->b), p->aux>>8&63, p->aux&0xff); + p->aux = j<<16 | (j == 4? 0 : (p->aux&0xffff)); +// fprintf(stderr, "pos=%d read=%d:%d name=%s call=%d q=%d\n", pos, s, i, bam1_qname(p->b), p->aux>>16&63, p->aux&0xff); } } } diff --git a/bamtk.c b/bamtk.c index 2c441cd..5395d96 100644 --- a/bamtk.c +++ b/bamtk.c @@ -9,7 +9,7 @@ #endif #ifndef PACKAGE_VERSION -#define PACKAGE_VERSION "0.1.9-5 (r802)" +#define PACKAGE_VERSION "0.1.9-6 (r803)" #endif int bam_taf2baf(int argc, char *argv[]); diff --git a/bcftools/bcf.c b/bcftools/bcf.c index 494dccc..05eae5b 100644 --- a/bcftools/bcf.c +++ b/bcftools/bcf.c @@ -306,3 +306,13 @@ int bcf_cpy(bcf1_t *r, const bcf1_t *b) memcpy(r->gi[i].data, b->gi[i].data, r->n_smpl * r->gi[i].len); return 0; } + +int bcf_is_indel(const bcf1_t *b) +{ + char *p; + if (strlen(b->ref) > 1) return 1; + for (p = b->alt; *p; ++p) + if (*p != ',' && p[1] != ',' && p[1] != '\0') + return 1; + return 0; +} diff --git a/bcftools/bcf.h b/bcftools/bcf.h index 3657895..89feeb8 100644 --- a/bcftools/bcf.h +++ b/bcftools/bcf.h @@ -128,6 +128,8 @@ extern "C" { int bcf_shrink_alt(bcf1_t *b, int n); // convert GL to PL int bcf_gl2pl(bcf1_t *b); + // if the site is an indel + int bcf_is_indel(const bcf1_t *b); // string hash table void *bcf_build_refhash(bcf_hdr_t *h); diff --git a/bcftools/prob1.c b/bcftools/prob1.c index e3b0f73..04a0e15 100644 --- a/bcftools/prob1.c +++ b/bcftools/prob1.c @@ -8,9 +8,9 @@ #include "kseq.h" KSTREAM_INIT(gzFile, gzread, 16384) -#define MC_AVG_ERR 0.007 #define MC_MAX_EM_ITER 16 #define MC_EM_EPS 1e-4 +#define MC_DEF_INDEL 0.15 unsigned char seq_nt4_table[256] = { 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, @@ -32,9 +32,9 @@ unsigned char seq_nt4_table[256] = { }; struct __bcf_p1aux_t { - int n, M, n1; + int n, M, n1, is_indel; double *q2p, *pdg; // pdg -> P(D|g) - double *phi; + double *phi, *phi_indel; double *z, *zswap; // aux for afs double *z1, *z2, *phi1, *phi2; // only calculated when n1 is set double t, t1, t2; @@ -43,6 +43,14 @@ struct __bcf_p1aux_t { int PL_len; }; +void bcf_p1_indel_prior(bcf_p1aux_t *ma, double x) +{ + int i; + for (i = 0; i < ma->M; ++i) + ma->phi_indel[i] = ma->phi[i] * x; + ma->phi_indel[ma->M] = 1. - ma->phi[ma->M] * x; +} + static void init_prior(int type, double theta, int M, double *phi) { int i; @@ -63,6 +71,7 @@ static void init_prior(int type, double theta, int M, double *phi) void bcf_p1_init_prior(bcf_p1aux_t *ma, int type, double theta) { init_prior(type, theta, ma->M, ma->phi); + bcf_p1_indel_prior(ma, MC_DEF_INDEL); } void bcf_p1_init_subprior(bcf_p1aux_t *ma, int type, double theta) @@ -110,6 +119,7 @@ int bcf_p1_read_prior(bcf_p1aux_t *ma, const char *fn) fprintf(stderr, "[%s] heterozygosity=%lf, ", __func__, (double)sum); for (sum = 0., k = 1; k <= ma->M; ++k) sum += k * ma->phi[ma->M - k] / ma->M; fprintf(stderr, "theta=%lf\n", (double)sum); + bcf_p1_indel_prior(ma, MC_DEF_INDEL); return 0; } @@ -123,6 +133,7 @@ bcf_p1aux_t *bcf_p1_init(int n) ma->q2p = calloc(256, sizeof(double)); ma->pdg = calloc(3 * ma->n, sizeof(double)); ma->phi = calloc(ma->M + 1, sizeof(double)); + ma->phi_indel = calloc(ma->M + 1, sizeof(double)); ma->phi1 = calloc(ma->M + 1, sizeof(double)); ma->phi2 = calloc(ma->M + 1, sizeof(double)); ma->z = calloc(2 * ma->n + 1, sizeof(double)); @@ -148,7 +159,7 @@ void bcf_p1_destroy(bcf_p1aux_t *ma) { if (ma) { free(ma->q2p); free(ma->pdg); - free(ma->phi); free(ma->phi1); free(ma->phi2); + free(ma->phi); free(ma->phi_indel); free(ma->phi1); free(ma->phi2); free(ma->z); free(ma->zswap); free(ma->z1); free(ma->z2); free(ma->afs); free(ma->afs1); free(ma); @@ -303,12 +314,13 @@ static double mc_cal_afs(bcf_p1aux_t *ma) { int k; long double sum = 0.; + double *phi = ma->is_indel? ma->phi_indel : ma->phi; memset(ma->afs1, 0, sizeof(double) * (ma->M + 1)); mc_cal_y(ma); for (k = 0, sum = 0.; k <= ma->M; ++k) - sum += (long double)ma->phi[k] * ma->z[k]; + sum += (long double)phi[k] * ma->z[k]; for (k = 0; k <= ma->M; ++k) { - ma->afs1[k] = ma->phi[k] * ma->z[k] / sum; + ma->afs1[k] = phi[k] * ma->z[k] / sum; if (isnan(ma->afs1[k]) || isinf(ma->afs1[k])) return -1.; } for (k = 0, sum = 0.; k <= ma->M; ++k) { @@ -348,6 +360,7 @@ int bcf_p1_cal(bcf1_t *b, bcf_p1aux_t *ma, bcf_p1rst_t *rst) { int i, k; long double sum = 0.; + ma->is_indel = bcf_is_indel(b); // set PL and PL_len for (i = 0; i < b->n_gi; ++i) { if (b->gi[i].fmt == bcf_str2int("PL", 2)) { -- 2.39.2