From: Heng Li Date: Tue, 24 Aug 2010 18:05:50 +0000 (+0000) Subject: * samtools-0.1.8-13 (r698) X-Git-Url: https://git.donarmstrong.com/?p=samtools.git;a=commitdiff_plain;h=f69357a6a9e701f12d1f01c52cc0d2893bb2ac7a * samtools-0.1.8-13 (r698) * perform one-tailed t-test for baseQ, mapQ and endDist --- diff --git a/bam2bcf.c b/bam2bcf.c index 7fe178a..9553225 100644 --- a/bam2bcf.c +++ b/bam2bcf.c @@ -6,14 +6,14 @@ #include "errmod.h" #include "bcftools/bcf.h" -#define END_DIST_THRES 11 - extern void ks_introsort_uint32_t(size_t n, uint32_t a[]); #define CALL_ETA 0.03f #define CALL_MAX 256 #define CALL_DEFTHETA 0.83f +#define CAP_DIST 25 + struct __bcf_callaux_t { int max_bases, capQ, min_baseQ; uint16_t *bases; @@ -40,7 +40,7 @@ void bcf_call_destroy(bcf_callaux_t *bca) int bcf_call_glfgen(int _n, const bam_pileup1_t *pl, int ref_base /*4-bit*/, bcf_callaux_t *bca, bcf_callret1_t *r) { - int i, k, n, ref4; + int i, n, ref4; memset(r, 0, sizeof(bcf_callret1_t)); ref4 = bam_nt16_nt4_table[ref_base]; if (_n == 0) return -1; @@ -52,33 +52,34 @@ int bcf_call_glfgen(int _n, const bam_pileup1_t *pl, int ref_base /*4-bit*/, bcf bca->bases = (uint16_t*)realloc(bca->bases, 2 * bca->max_bases); } // fill the bases array - memset(r->qsum, 0, 4 * sizeof(float)); - for (i = n = 0, r->sum_Q2 = 0; i < _n; ++i) { + memset(r, 0, sizeof(bcf_callret1_t)); + for (i = n = 0; i < _n; ++i) { const bam_pileup1_t *p = pl + i; - int q, b, mapQ; - int min_dist; + int q, b, mapQ, baseQ, is_diff, min_dist; // set base if (p->is_del || (p->b->core.flag&BAM_FUNMAP)) continue; // skip unmapped reads and deleted bases - q = (int)bam1_qual(p->b)[p->qpos]; // base quality + baseQ = q = (int)bam1_qual(p->b)[p->qpos]; // base quality if (q < bca->min_baseQ) continue; mapQ = p->b->core.qual < bca->capQ? p->b->core.qual : bca->capQ; - r->sum_Q2 += mapQ * mapQ; if (q > mapQ) q = mapQ; if (q > 63) q = 63; if (q < 4) q = 4; b = bam1_seqi(bam1_seq(p->b), p->qpos); // base b = bam_nt16_nt4_table[b? b : ref_base]; // b is the 2-bit base bca->bases[n++] = q<<5 | (int)bam1_strand(p->b)<<4 | b; - // collect other information + // collect annotations r->qsum[b] += q; - k = (ref4 < 4 && b == ref4)? 0 : 1; - k = k<<1 | bam1_strand(p->b); - ++r->d[k]; - // calculate min_dist + is_diff = (ref4 < 4 && b == ref4)? 0 : 1; + ++r->anno[0<<2|is_diff<<1|bam1_strand(p->b)]; min_dist = p->b->core.l_qseq - 1 - p->qpos; if (min_dist > p->qpos) min_dist = p->qpos; - k = (k&2) | (min_dist <= END_DIST_THRES); - ++r->ed[k]; + if (min_dist > CAP_DIST) min_dist = CAP_DIST; + r->anno[1<<2|is_diff<<1|0] += baseQ; + r->anno[1<<2|is_diff<<1|1] += baseQ * baseQ; + r->anno[2<<2|is_diff<<1|0] += mapQ; + r->anno[2<<2|is_diff<<1|1] += mapQ * mapQ; + r->anno[3<<2|is_diff<<1|0] += min_dist; + r->anno[3<<2|is_diff<<1|1] += min_dist * min_dist; } r->depth = n; // glfgen @@ -92,7 +93,7 @@ int bcf_call_combine(int n, const bcf_callret1_t *calls, int ref_base /*4-bit*/, int64_t tmp; call->ori_ref = ref4 = bam_nt16_nt4_table[ref_base]; if (ref4 > 4) ref4 = 4; - // calculate esum + // calculate qsum memset(qsum, 0, 4 * sizeof(int)); for (i = 0; i < n; ++i) for (j = 0; j < 4; ++j) @@ -145,14 +146,11 @@ int bcf_call_combine(int n, const bcf_callret1_t *calls, int ref_base /*4-bit*/, call->shift = (int)(sum_min + .499); } // combine annotations - memset(call->d, 0, 4 * sizeof(int)); - memset(call->ed, 0, 4 * sizeof(int)); + memset(call->anno, 0, 16 * sizeof(int)); for (i = call->depth = 0, tmp = 0; i < n; ++i) { call->depth += calls[i].depth; - for (j = 0; j < 4; ++j) call->d[j] += calls[i].d[j], call->ed[j] += calls[i].ed[j]; - tmp += calls[i].sum_Q2; + for (j = 0; j < 16; ++j) call->anno[j] += calls[i].anno[j]; } - call->rmsQ = (int)(sqrt((double)tmp / call->depth) + .499); return 0; } @@ -172,16 +170,10 @@ int bcf_call2bcf(int tid, int pos, bcf_call_t *bc, bcf1_t *b) kputc('\0', &s); kputc('\0', &s); // INFO - kputs("MQ=", &s); kputw(bc->rmsQ, &s); - kputs(";DP4=", &s); - for (i = 0; i < 4; ++i) { - if (i) kputc(',', &s); - kputw(bc->d[i], &s); - } - kputs(";ED4=", &s); - for (i = 0; i < 4; ++i) { + kputs("I16=", &s); + for (i = 0; i < 16; ++i) { if (i) kputc(',', &s); - kputw(bc->ed[i], &s); + kputw(bc->anno[i], &s); } kputc('\0', &s); // FMT diff --git a/bam2bcf.h b/bam2bcf.h index ddc6b61..0dddb92 100644 --- a/bam2bcf.h +++ b/bam2bcf.h @@ -8,15 +8,15 @@ struct __bcf_callaux_t; typedef struct __bcf_callaux_t bcf_callaux_t; typedef struct { - int depth, d[4], ed[4], qsum[4]; - uint64_t sum_Q2; + int depth, qsum[4]; + int anno[16]; float p[25]; } bcf_callret1_t; typedef struct { int a[5]; // alleles: ref, alt, alt2, alt3 int n, n_alleles, shift, ori_ref, unseen; - int d[4], ed[4], depth, rmsQ; + int anno[16], depth; uint8_t *PL; } bcf_call_t; diff --git a/bamtk.c b/bamtk.c index d4b40ea..ce27209 100644 --- a/bamtk.c +++ b/bamtk.c @@ -9,7 +9,7 @@ #endif #ifndef PACKAGE_VERSION -#define PACKAGE_VERSION "0.1.8-12 (r693)" +#define PACKAGE_VERSION "0.1.8-13 (r698)" #endif int bam_taf2baf(int argc, char *argv[]); diff --git a/bcftools/kfunc.c b/bcftools/kfunc.c index b534a31..a637b6c 100644 --- a/bcftools/kfunc.c +++ b/bcftools/kfunc.c @@ -112,16 +112,18 @@ double kf_gammaq(double s, double z) } /* Regularized incomplete beta function. The method is taken from - * Numerical Recipe in C, 2nd edition, section 6.4. The following page - * calculates the incomplete beta function, which equals + * Numerical Recipe in C, 2nd edition, section 6.4. The following web + * page calculates the incomplete beta function, which equals * kf_betai(a,b,x) * gamma(a) * gamma(b) / gamma(a+b): * * http://www.danielsoper.com/statcalc/calc36.aspx */ static double kf_betai_aux(double a, double b, double x) { - double C, D, f, z; + double C, D, f; int j; + if (x == 0.) return 0.; + if (x == 1.) return 1.; f = 1.; C = f; D = 0.; // Modified Lentz's algorithm for computing continued fraction for (j = 1; j < 200; ++j) { @@ -138,8 +140,7 @@ static double kf_betai_aux(double a, double b, double x) f *= d; if (fabs(d - 1.) < KF_GAMMA_EPS) break; } - z = (x == 0. || x == 1.)? 0. : exp(kf_lgamma(a+b) - kf_lgamma(a) - kf_lgamma(b) + a * log(x) + b * log(1.-x)); - return z / a / f; + return exp(kf_lgamma(a+b) - kf_lgamma(a) - kf_lgamma(b) + a * log(x) + b * log(1.-x)) / a / f; } double kf_betai(double a, double b, double x) { diff --git a/bcftools/vcfout.c b/bcftools/vcfout.c index e02e364..827dcd2 100644 --- a/bcftools/vcfout.c +++ b/bcftools/vcfout.c @@ -77,22 +77,54 @@ static double test_hwe(const double g[3]) return kf_gammaq(.5, chi2 / 2.); } -extern double kt_fisher_exact(int n11, int n12, int n21, int n22, double *_left, double *_right, double *two); +typedef struct { + double p[4]; + int mq, depth, is_tested, d[4]; +} anno16_t; -static double test_fisher(bcf1_t *b, const char *key, int d[4], int is_single) +static double ttest(int n1, int n2, int a[4]) { - double left, right, two; - char *p; + extern double kf_betai(double a, double b, double x); + double t, v; + if (n1 == 0 || n2 == 0 || n1 + n2 < 3) return 1.0; + t = (a[0] / n1 - a[2] / n2) / sqrt((a[1] + a[3]) / (n1 + n2 - 2) * (1./n1 + 1./n2)); + v = n1 + n2 - 2; +// printf("%d,%d,%d,%d,%lf\n", a[0], a[1], a[2], a[3], t); + return t < 0.? 1. : .5 * kf_betai(.5*v, .5, v/(v+t*t)); +} + +static int test16_core(int anno[16], anno16_t *a) +{ + extern double kt_fisher_exact(int n11, int n12, int n21, int n22, double *_left, double *_right, double *two); + double left, right; int i; - if ((p = strstr(b->info, key)) == 0) return -1.; + a->p[0] = a->p[1] = a->p[2] = a->p[3] = 1.; + memcpy(a->d, anno, 4 * sizeof(int)); + a->depth = anno[0] + anno[1] + anno[2] + anno[3]; + a->is_tested = (anno[0] + anno[1] > 0 && anno[2] + anno[3] > 0); + if (a->depth == 0) return -1; + a->mq = (int)(sqrt((anno[9] + anno[11]) / a->depth) + .499); + kt_fisher_exact(anno[0], anno[1], anno[2], anno[3], &left, &right, &a->p[0]); + for (i = 1; i < 4; ++i) + a->p[i] = ttest(anno[0] + anno[1], anno[2] + anno[3], anno+4*i); + return 0; +} + +static int test16(bcf1_t *b, anno16_t *a) +{ + char *p; + int i, anno[16]; + a->p[0] = a->p[1] = a->p[2] = a->p[3] = 1.; + a->d[0] = a->d[1] = a->d[2] = a->d[3] = 0.; + a->mq = a->depth = a->is_tested = 0; + if ((p = strstr(b->info, "I16=")) == 0) return -1; p += 4; - for (i = 0; i < 4; ++i) { - d[i] = strtol(p, &p, 10); - if (d[i] == 0 && (errno == EINVAL || errno == ERANGE)) return -2.; + for (i = 0; i < 16; ++i) { + anno[i] = strtol(p, &p, 10); + if (anno[i] == 0 && (errno == EINVAL || errno == ERANGE)) return -2; ++p; } - kt_fisher_exact(d[0], d[1], d[2], d[3], &left, &right, &two); - return is_single? right : two; + return test16_core(anno, a); } static void rm_info(int n_smpl, bcf1_t *b, const char *key) @@ -109,13 +141,13 @@ static void rm_info(int n_smpl, bcf1_t *b, const char *key) static int update_bcf1(int n_smpl, bcf1_t *b, const bcf_p1aux_t *pa, const bcf_p1rst_t *pr, double pref, int flag) { kstring_t s; - int d[4], is_var = (pr->p_ref < pref); - double p_hwe, p_dp, p_ed, r = is_var? pr->p_ref : 1. - pr->p_ref; + int is_var = (pr->p_ref < pref); + double p_hwe, r = is_var? pr->p_ref : 1. - pr->p_ref; + anno16_t a; p_hwe = test_hwe(pr->g); - p_ed = test_fisher(b, "ED4=", d, 1); - p_dp = test_fisher(b, "DP4=", d, 0); - rm_info(n_smpl, b, "ED4="); + test16(b, &a); + rm_info(n_smpl, b, "I16="); memset(&s, 0, sizeof(kstring_t)); kputc('\0', &s); kputs(b->ref, &s); kputc('\0', &s); @@ -126,9 +158,9 @@ static int update_bcf1(int n_smpl, bcf1_t *b, const bcf_p1aux_t *pa, const bcf_p kputs(b->info, &s); if (b->info[0]) kputc(';', &s); ksprintf(&s, "AF1=%.3lf;AFE=%.3lf", 1.-pr->f_em, 1.-pr->f_exp); + ksprintf(&s, ";DP4=%d,%d,%d,%d;MQ=%d", a.d[0], a.d[1], a.d[2], a.d[3], a.mq); + if (a.is_tested) ksprintf(&s, ";PV4=%.2lg,%.2lg,%.2lg,%.2lg", a.p[0], a.p[1], a.p[2], a.p[3]); if (p_hwe <= .2) ksprintf(&s, ";GC=%.2lf,%.2lf,%.2lf;HWE=%.3lf", pr->g[2], pr->g[1], pr->g[0], p_hwe); - if (p_dp >= 0. && p_dp <= .2) ksprintf(&s, ";TDP=%.3lf", p_dp); - if (p_ed >= 0. && p_ed <= .2) ksprintf(&s, ";TED=%.3lf", p_ed); kputc('\0', &s); kputs(b->fmt, &s); kputc('\0', &s); free(b->str);