From 125ac895854cf760a04192a257a4279f2c541164 Mon Sep 17 00:00:00 2001 From: Petr Danecek Date: Thu, 7 Mar 2013 19:21:43 +0000 Subject: [PATCH] backup commit --- bam2bcf.c | 248 +++++++++++++++++++++++++++++++------------- bam2bcf.h | 6 +- bam_plcmd.c | 4 +- bcftools/bcfutils.c | 14 ++- bcftools/call1.c | 34 +++++- bcftools/prob1.c | 193 ++++++++++++++++++++++++++++++---- bcftools/prob1.h | 2 +- samtools.1 | 44 +++++++- 8 files changed, 435 insertions(+), 110 deletions(-) diff --git a/bam2bcf.c b/bam2bcf.c index a51a406..b4f7fa6 100644 --- a/bam2bcf.c +++ b/bam2bcf.c @@ -1,5 +1,6 @@ #include #include +#include #include "bam.h" #include "kstring.h" #include "bam2bcf.h" @@ -30,17 +31,50 @@ bcf_callaux_t *bcf_call_init(double theta, int min_baseQ) return bca; } + +int dist_from_end(const bam_pileup1_t *p) +{ + int icig, n_tot_bases = 0, iread = 0, edist = p->qpos + 1; + for (icig=0; icigb->core.n_cigar; icig++) + { + // Conversion from uint32_t to MIDNSHP + // 0123456 + // MIDNSHP + int cig = bam1_cigar(p->b)[icig] & BAM_CIGAR_MASK; + int ncig = bam1_cigar(p->b)[icig] >> BAM_CIGAR_SHIFT; + if ( cig==0 ) + { + n_tot_bases += ncig; + iread += ncig; + } + else if ( cig==1 ) + { + n_tot_bases += ncig; + iread += ncig; + } + else if ( cig==4 ) + { + iread += ncig; + if ( iread<=p->qpos ) edist -= ncig; + } + } + // distance from either end + if ( edist > n_tot_bases/2. ) edist = n_tot_bases - edist + 1; + if ( edist<0 ) { fprintf(stderr,"uh: %d %d -> %d (%d)\n", p->qpos,n_tot_bases,edist,p->b->core.pos+1); exit(1); } + return edist; +} + void bcf_call_destroy(bcf_callaux_t *bca) { if (bca == 0) return; errmod_destroy(bca->e); + if (bca->npos) { free(bca->ref_pos); free(bca->alt_pos); bca->npos = 0; } free(bca->bases); free(bca->inscns); free(bca); } /* ref_base is the 4-bit representation of the reference base. It is * negative if we are looking at an indel. */ int bcf_call_glfgen(int _n, const bam_pileup1_t *pl, int ref_base, bcf_callaux_t *bca, bcf_callret1_t *r) { - static int *var_pos = NULL, nvar_pos = 0; int i, n, ref4, is_indel, ori_depth = 0; memset(r, 0, sizeof(bcf_callret1_t)); if (ref_base >= 0) { @@ -55,6 +89,7 @@ int bcf_call_glfgen(int _n, const bam_pileup1_t *pl, int ref_base, bcf_callaux_t bca->bases = (uint16_t*)realloc(bca->bases, 2 * bca->max_bases); } // fill the bases array + bca->read_len = 0; for (i = n = r->n_supp = 0; i < _n; ++i) { const bam_pileup1_t *p = pl + i; int q, b, mapQ, baseQ, is_diff, min_dist, seqQ; @@ -92,97 +127,159 @@ int bcf_call_glfgen(int _n, const bam_pileup1_t *pl, int ref_base, bcf_callaux_t r->anno[2<<2|is_diff<<1|1] += mapQ * mapQ; // FIXME: signed int is not enough for thousands of samples r->anno[3<<2|is_diff<<1|0] += min_dist; r->anno[3<<2|is_diff<<1|1] += min_dist * min_dist; + + // collect read positions for ReadPosBias + int epos = dist_from_end(p); + int npos = p->b->core.l_qseq; + if ( bca->npos < npos ) + { + bca->ref_pos = realloc(bca->ref_pos, sizeof(int)*npos); + bca->alt_pos = realloc(bca->alt_pos, sizeof(int)*npos); + int j; + for (j=bca->npos; jref_pos[j] = 0; + for (j=bca->npos; jalt_pos[j] = 0; + bca->npos = npos; + } + if ( bam1_seqi(bam1_seq(p->b),p->qpos) == ref_base ) + bca->ref_pos[epos]++; + else + bca->alt_pos[epos]++; + bca->read_len += npos; + //fprintf(stderr,"qpos=%d epos=%d fwd=%d var=%d len=%d %s\n", p->qpos, epos, p->b->core.flag&BAM_FREVERSE ? 0 : 1, bam1_seqi(bam1_seq(p->b),p->qpos) == ref_base ? 1 : 0, p->b->core.l_qseq, bam1_qname(p->b)); } + if ( n ) bca->read_len /= n; r->depth = n; r->ori_depth = ori_depth; // glfgen errmod_cal(bca->e, n, 5, bca->bases, r->p); + return r->depth; +} - // Calculate the Variant Distance Bias (make it optional?) - if ( nvar_pos < _n ) { - nvar_pos = _n; - var_pos = realloc(var_pos,sizeof(int)*nvar_pos); - } - int alt_dp=0, read_len=0; - for (i=0; i<_n; i++) { - const bam_pileup1_t *p = pl + i; - if ( bam1_seqi(bam1_seq(p->b),p->qpos) == ref_base ) - continue; +double mann_whitney_1947(int n, int m, int U) +{ + if (U<0) return 0; + if (n==0||m==0) return U==0 ? 1 : 0; + return (double)n/(n+m)*mann_whitney_1947(n-1,m,U-m) + (double)m/(n+m)*mann_whitney_1947(n,m-1,U); +} - var_pos[alt_dp] = p->qpos; - if ( (bam1_cigar(p->b)[0]&BAM_CIGAR_MASK)==4 ) - var_pos[alt_dp] -= bam1_cigar(p->b)[0]>>BAM_CIGAR_SHIFT; +void calc_ReadPosBias(bcf_callaux_t *bca, bcf_call_t *call) +{ + int i, nref = 0, nalt = 0; + unsigned long int U = 0; + for (i=0; inpos; i++) + { + nref += bca->ref_pos[i]; + nalt += bca->alt_pos[i]; + U += nref*bca->alt_pos[i]; + bca->ref_pos[i] = 0; + bca->alt_pos[i] = 0; + } + if ( !nref || !nalt ) + { + call->read_pos_bias = -1; + return; + } - alt_dp++; - read_len += p->b->core.l_qseq; + if ( nref>=8 || nalt>=8 ) + { + // normal approximation + double mean = ((double)nref*nalt+1.0)/2.0; + double var2 = (double)nref*nalt*(nref+nalt+1.0)/12.0; + double z = (U-mean)/sqrt(var2); + call->read_pos_bias = z; + //fprintf(stderr,"nref=%d nalt=%d U=%ld mean=%e var=%e zval=%e\n", nref,nalt,U,mean,sqrt(var2),call->read_pos_bias); } - float mvd=0; - int j; - n=0; - for (i=0; i= 1./sqrt(var2*2*M_PI) ) z = 0; // equal to mean + else + { + if ( U >= nref*nalt/2. ) z = sqrt(-2*log(sqrt(var2*2*M_PI)*p)); + else z = -sqrt(-2*log(sqrt(var2*2*M_PI)*p)); } + call->read_pos_bias = z; + //fprintf(stderr,"nref=%d nalt=%d U=%ld p=%e var2=%e zval=%e\n", nref,nalt,U, p,var2,call->read_pos_bias); } - r->mvd[0] = n ? mvd/n : 0; - r->mvd[1] = alt_dp; - r->mvd[2] = alt_dp ? read_len/alt_dp : 0; - - return r->depth; } - -void calc_vdb(int n, const bcf_callret1_t *calls, bcf_call_t *call) +float mean_diff_to_prob(float mdiff, int dp, int readlen) { - // Variant distance bias. Samples merged by means of DP-weighted average. - - float weight=0, tot_prob=0; - - int i; - for (i=0; i2*mu ? 0 : sin(mvd*3.14/2/mu) / (4*mu/3.14); - } - else - { - // Scaled gaussian curve, crude approximation, but behaves well. Using fixed depth for bigger depths. - if ( dp>5 ) - dp = 5; - float sigma2 = (read_len/1.9/(dp+1)) * (read_len/1.9/(dp+1)); - float norm = 1.125*sqrt(2*3.14*sigma2); - float mu = read_len/2.9; - if ( mvd < mu ) - prob = exp(-(mvd-mu)*(mvd-mu)/2/sigma2)/norm; - else - prob = exp(-(mvd-mu)*(mvd-mu)/3.125/sigma2)/norm; - } + float m, v; + if ( dp>=24 ) + { + m = (int)readlen/8.; + if (dp>100) dp = 100; + v = 1.476/(0.182*pow(dp,0.514)); + v = v*(readlen/100.); + } + else + { + m = mv[dp][0]; + v = mv[dp][1]; + m = (int)m*readlen/100.; + v = v*readlen/100.; + v *= 1.2; // allow more variability + } + return 1.0/(v*sqrt(2*M_PI)) * exp(-0.5*((mdiff-m)/v)*((mdiff-m)/v)); +} - //fprintf(stderr,"dp=%d mvd=%d read_len=%d -> prob=%f\n", dp,mvd,read_len,prob); - tot_prob += prob*dp; - weight += dp; +void calc_vdb(bcf_callaux_t *bca, bcf_call_t *call) +{ + int i, dp = 0; + float mean_pos = 0, mean_diff = 0; + for (i=0; inpos; i++) + { + if ( !bca->alt_pos[i] ) continue; + dp += bca->alt_pos[i]; + mean_pos += bca->alt_pos[i]*i; + } + if ( !dp ) + { + call->vdb = -1; + return; + } + mean_pos /= dp; + for (i=0; inpos; i++) + { + if ( !bca->alt_pos[i] ) continue; + mean_diff += bca->alt_pos[i] * fabs(i - mean_pos); } - tot_prob = weight ? tot_prob/weight : 1; - //fprintf(stderr,"prob=%f\n", tot_prob); - call->vdb = tot_prob; + mean_diff /= dp; + call->vdb = mean_diff_to_prob(mean_diff, dp, bca->read_len); } -int bcf_call_combine(int n, const bcf_callret1_t *calls, int ref_base /*4-bit*/, bcf_call_t *call) +/** + * bcf_call_combine() - sets the PL array and VDB, RPB annotations, finds the top two alleles + * @n: number of samples + * @calls: each sample's calls + * @bca: auxiliary data structure for holding temporary values + * @ref_base: the reference base + * @call: filled with the annotations + */ +int bcf_call_combine(int n, const bcf_callret1_t *calls, bcf_callaux_t *bca, int ref_base /*4-bit*/, bcf_call_t *call) { int ref4, i, j, qsum[4]; int64_t tmp; @@ -264,7 +361,8 @@ int bcf_call_combine(int n, const bcf_callret1_t *calls, int ref_base /*4-bit*/, for (j = 0; j < 16; ++j) call->anno[j] += calls[i].anno[j]; } - calc_vdb(n, calls, call); + calc_vdb(bca, call); + calc_ReadPosBias(bca, call); return 0; } @@ -320,8 +418,10 @@ int bcf_call2bcf(int tid, int pos, bcf_call_t *bc, bcf1_t *b, bcf_callret1_t *bc kputw(bc->anno[i], &s); } ksprintf(&s,";QS=%f,%f,%f,%f", bc->qsum[0],bc->qsum[1],bc->qsum[2],bc->qsum[3]); - if (bc->vdb != 1) + if (bc->vdb != -1) ksprintf(&s, ";VDB=%.4f", bc->vdb); + if (bc->read_pos_bias != -1 ) + ksprintf(&s, ";RPB=%.4f", bc->read_pos_bias); kputc('\0', &s); // FMT kputs("PL", &s); diff --git a/bam2bcf.h b/bam2bcf.h index 8ac6b79..46f91f9 100644 --- a/bam2bcf.h +++ b/bam2bcf.h @@ -17,10 +17,12 @@ typedef struct __bcf_callaux_t { int min_support, max_support; // for collecting indel candidates double min_frac, max_frac; // for collecting indel candidates int per_sample_flt; // indel filtering strategy + int *ref_pos, *alt_pos, npos; // for ReadPosBias // for internal uses int max_bases; int indel_types[4]; int maxins, indelreg; + int read_len; char *inscns; uint16_t *bases; errmod_t *e; @@ -31,7 +33,6 @@ typedef struct { int depth, n_supp, ori_depth, qsum[4]; int anno[16]; float p[25]; - int mvd[3]; // mean variant distance, number of variant reads, average read length } bcf_callret1_t; typedef struct { @@ -42,6 +43,7 @@ typedef struct { int anno[16], depth, ori_depth; uint8_t *PL; float vdb; // variant distance bias + float read_pos_bias; } bcf_call_t; #ifdef __cplusplus @@ -51,7 +53,7 @@ extern "C" { bcf_callaux_t *bcf_call_init(double theta, int min_baseQ); void bcf_call_destroy(bcf_callaux_t *bca); int bcf_call_glfgen(int _n, const bam_pileup1_t *pl, int ref_base, bcf_callaux_t *bca, bcf_callret1_t *r); - int bcf_call_combine(int n, const bcf_callret1_t *calls, int ref_base /*4-bit*/, bcf_call_t *call); + int bcf_call_combine(int n, const bcf_callret1_t *calls, bcf_callaux_t *bca, int ref_base /*4-bit*/, bcf_call_t *call); int bcf_call2bcf(int tid, int pos, bcf_call_t *bc, bcf1_t *b, bcf_callret1_t *bcr, int fmt_flag, const bcf_callaux_t *bca, const char *ref); int bcf_call_gap_prep(int n, int *n_plp, bam_pileup1_t **plp, int pos, bcf_callaux_t *bca, const char *ref, diff --git a/bam_plcmd.c b/bam_plcmd.c index 51dece5..3f5d424 100644 --- a/bam_plcmd.c +++ b/bam_plcmd.c @@ -312,7 +312,7 @@ static int mpileup(mplp_conf_t *conf, int n, char **fn) ref16 = bam_nt16_table[_ref0]; for (i = 0; i < gplp.n; ++i) bcf_call_glfgen(gplp.n_plp[i], gplp.plp[i], ref16, bca, bcr + i); - bcf_call_combine(gplp.n, bcr, ref16, &bc); + bcf_call_combine(gplp.n, bcr, bca, ref16, &bc); bcf_call2bcf(tid, pos, &bc, b, bcr, conf->fmt_flag, 0, 0); bcf_write(bp, bh, b); bcf_destroy(b); @@ -320,7 +320,7 @@ static int mpileup(mplp_conf_t *conf, int n, char **fn) if (!(conf->flag&MPLP_NO_INDEL) && total_depth < max_indel_depth && bcf_call_gap_prep(gplp.n, gplp.n_plp, gplp.plp, pos, bca, ref, rghash) >= 0) { for (i = 0; i < gplp.n; ++i) bcf_call_glfgen(gplp.n_plp[i], gplp.plp[i], -1, bca, bcr + i); - if (bcf_call_combine(gplp.n, bcr, -1, &bc) >= 0) { + if (bcf_call_combine(gplp.n, bcr, bca, -1, &bc) >= 0) { b = calloc(1, sizeof(bcf1_t)); bcf_call2bcf(tid, pos, &bc, b, bcr, conf->fmt_flag, bca, ref); bcf_write(bp, bh, b); diff --git a/bcftools/bcfutils.c b/bcftools/bcfutils.c index 1321c13..7638085 100644 --- a/bcftools/bcfutils.c +++ b/bcftools/bcfutils.c @@ -74,7 +74,6 @@ void bcf_fit_alt(bcf1_t *b, int mask) int i,j,nals=0; for (i=0; in_alleles <= nals ) return; // update ALT, in principle any of the alleles can be removed @@ -241,7 +240,7 @@ int bcf_fix_gt(bcf1_t *b) bcf_ginfo_t gt; // check the presence of the GT FMT if ((s = strstr(b->fmt, ":GT")) == 0) return 0; // no GT or GT is already the first - if (s[3] != '\0' && s[3] != ':') return 0; // :GTX in fact + assert(s[3] == '\0' || s[3] == ':'); // :GTX in fact tmp = bcf_str2int("GT", 2); for (i = 0; i < b->n_gi; ++i) if (b->gi[i].fmt == tmp) break; @@ -250,7 +249,10 @@ int bcf_fix_gt(bcf1_t *b) // move GT to the first for (; i > 0; --i) b->gi[i] = b->gi[i-1]; b->gi[0] = gt; - memmove(b->fmt + 3, b->fmt, s + 1 - b->fmt); + if ( s[3]==0 ) + memmove(b->fmt + 3, b->fmt, s - b->fmt); // :GT + else + memmove(b->fmt + 3, b->fmt, s - b->fmt + 1); // :GT: b->fmt[0] = 'G'; b->fmt[1] = 'T'; b->fmt[2] = ':'; return 0; } @@ -395,7 +397,11 @@ bcf_hdr_t *bcf_hdr_subsam(const bcf_hdr_t *h0, int n, char *const* samples, int kputs(samples[i], &s); kputc('\0', &s); } } - if (j < n) fprintf(stderr, "<%s> %d samples in the list but not in BCF.", __func__, n - j); + if (j < n) + { + fprintf(stderr, "<%s> %d samples in the list but not in BCF.", __func__, n - j); + exit(1); + } kh_destroy(str2id, hash); h = calloc(1, sizeof(bcf_hdr_t)); *h = *h0; diff --git a/bcftools/call1.c b/bcftools/call1.c index 22ff2ac..240f0ee 100644 --- a/bcftools/call1.c +++ b/bcftools/call1.c @@ -190,7 +190,27 @@ static char **read_samples(const char *fn, int *_n) *_n = 0; s.l = s.m = 0; s.s = 0; fp = gzopen(fn, "r"); - if (fp == 0) return 0; // fail to open file + if (fp == 0) + { + // interpret as sample names, not as a file name + const char *t = fn, *p = t; + while (*t) + { + t++; + if ( *t==',' || !*t ) + { + sam = realloc(sam, sizeof(void*)*(n+1)); + sam[n] = (char*) malloc(sizeof(char)*(t-p+2)); + memcpy(sam[n], p, t-p); + sam[n][t-p] = 0; + sam[n][t-p+1] = 2; // assume diploid + p = t+1; + n++; + } + } + *_n = n; + return sam; // fail to open file + } ks = ks_init(fp); while (ks_getuntil(ks, 0, &s, &dret) >= 0) { int l; @@ -266,8 +286,16 @@ static void write_header(bcf_hdr_t *h) kputs("##INFO=\n", &str); if (!strstr(str.s, "##INFO=\n", &str); + if (!strstr(str.s, "##INFO=\n", &str); + if (!strstr(str.s, "##INFO=\n", &str); + if (!strstr(str.s, "##INFO=\n", &str); + if (!strstr(str.s, "##INFO=\n", &str); if (!strstr(str.s, "##INFO=\n", &str); + kputs("##INFO=\n", &str); if (!strstr(str.s, "##FORMAT=\n", &str); if (!strstr(str.s, "##FORMAT==0 ) { bcf_p1_set_ploidy(b, p1); // could be improved: do this per site to allow pseudo-autosomal regions - int gts = call_multiallelic_gt(b,p1,vc.min_ma_lrt); + int gts = call_multiallelic_gt(b, p1, vc.min_ma_lrt, vc.flag&VC_VARONLY); if ( gts<=1 && vc.flag & VC_VARONLY ) continue; } else if (vc.flag & VC_CALL) { // call variants diff --git a/bcftools/prob1.c b/bcftools/prob1.c index 5051cc3..d655722 100644 --- a/bcftools/prob1.c +++ b/bcftools/prob1.c @@ -203,7 +203,124 @@ void bcf_p1_destroy(bcf_p1aux_t *ma) extern double kf_gammap(double s, double z); int test16(bcf1_t *b, anno16_t *a); -int call_multiallelic_gt(bcf1_t *b, bcf_p1aux_t *ma, double threshold) +// Wigginton 2005, PMID: 15789306 +// written by Jan Wigginton +double calc_hwe(int obs_hom1, int obs_hom2, int obs_hets) +{ + if (obs_hom1 + obs_hom2 + obs_hets == 0 ) return 1; + + assert(obs_hom1 >= 0 && obs_hom2 >= 0 && obs_hets >= 0); + + int obs_homc = obs_hom1 < obs_hom2 ? obs_hom2 : obs_hom1; + int obs_homr = obs_hom1 < obs_hom2 ? obs_hom1 : obs_hom2; + + int rare_copies = 2 * obs_homr + obs_hets; + int genotypes = obs_hets + obs_homc + obs_homr; + + double *het_probs = (double*) calloc(rare_copies+1, sizeof(double)); + + /* start at midpoint */ + int mid = rare_copies * (2 * genotypes - rare_copies) / (2 * genotypes); + + /* check to ensure that midpoint and rare alleles have same parity */ + if ((rare_copies & 1) ^ (mid & 1)) mid++; + + int curr_hets = mid; + int curr_homr = (rare_copies - mid) / 2; + int curr_homc = genotypes - curr_hets - curr_homr; + + het_probs[mid] = 1.0; + double sum = het_probs[mid]; + for (curr_hets = mid; curr_hets > 1; curr_hets -= 2) + { + het_probs[curr_hets - 2] = het_probs[curr_hets] * curr_hets * (curr_hets - 1.0) / (4.0 * (curr_homr + 1.0) * (curr_homc + 1.0)); + sum += het_probs[curr_hets - 2]; + + /* 2 fewer heterozygotes for next iteration -> add one rare, one common homozygote */ + curr_homr++; + curr_homc++; + } + + curr_hets = mid; + curr_homr = (rare_copies - mid) / 2; + curr_homc = genotypes - curr_hets - curr_homr; + for (curr_hets = mid; curr_hets <= rare_copies - 2; curr_hets += 2) + { + het_probs[curr_hets + 2] = het_probs[curr_hets] * 4.0 * curr_homr * curr_homc /((curr_hets + 2.0) * (curr_hets + 1.0)); + sum += het_probs[curr_hets + 2]; + + /* add 2 heterozygotes for next iteration -> subtract one rare, one common homozygote */ + curr_homr--; + curr_homc--; + } + int i; + for (i = 0; i <= rare_copies; i++) het_probs[i] /= sum; + + /* p-value calculation for p_hwe */ + double p_hwe = 0.0; + for (i = 0; i <= rare_copies; i++) + { + if (het_probs[i] > het_probs[obs_hets]) + continue; + p_hwe += het_probs[i]; + } + + p_hwe = p_hwe > 1.0 ? 1.0 : p_hwe; + free(het_probs); + return p_hwe; + +} + + +static void _bcf1_set_ref(bcf1_t *b, int idp) +{ + kstring_t s; + int old_n_gi = b->n_gi; + s.m = b->m_str; s.l = b->l_str - 1; s.s = b->str; + kputs(":GT", &s); kputc('\0', &s); + b->m_str = s.m; b->l_str = s.l; b->str = s.s; + bcf_sync(b); + + // Call GTs + int isample, an = 0; + for (isample = 0; isample < b->n_smpl; isample++) + { + if ( idp>=0 && ((uint16_t*)b->gi[idp].data)[isample]==0 ) + ((uint8_t*)b->gi[old_n_gi].data)[isample] = 1<<7; + else + { + ((uint8_t*)b->gi[old_n_gi].data)[isample] = 0; + an += b->ploidy ? b->ploidy[isample] : 2; + } + } + bcf_fit_alt(b,1); + b->qual = 999; + + // Prepare BCF for output: ref, alt, filter, info, format + memset(&s, 0, sizeof(kstring_t)); kputc('\0', &s); + kputs(b->ref, &s); kputc('\0', &s); + kputs(b->alt, &s); kputc('\0', &s); kputc('\0', &s); + { + ksprintf(&s, "AN=%d;", an); + kputs(b->info, &s); + anno16_t a; + int has_I16 = test16(b, &a) >= 0? 1 : 0; + if (has_I16 ) + { + if ( a.is_tested) ksprintf(&s, ";PV4=%.2g,%.2g,%.2g,%.2g", a.p[0], a.p[1], a.p[2], a.p[3]); + ksprintf(&s, ";DP4=%d,%d,%d,%d;MQ=%d", a.d[0], a.d[1], a.d[2], a.d[3], a.mq); + } + kputc('\0', &s); + rm_info(&s, "I16="); + rm_info(&s, "QS="); + } + kputs(b->fmt, &s); kputc('\0', &s); + free(b->str); + b->m_str = s.m; b->l_str = s.l; b->str = s.s; + bcf_sync(b); +} + +int call_multiallelic_gt(bcf1_t *b, bcf_p1aux_t *ma, double threshold, int var_only) { int nals = 1; char *p; @@ -214,8 +331,6 @@ int call_multiallelic_gt(bcf1_t *b, bcf_p1aux_t *ma, double threshold) } if ( b->alt[0] && !*p ) nals++; - if ( nals==1 ) return 1; - if ( nals>4 ) { if ( *b->ref=='N' ) return 0; @@ -223,10 +338,9 @@ int call_multiallelic_gt(bcf1_t *b, bcf_p1aux_t *ma, double threshold) exit(1); } - // find PL and DP FORMAT indexes + // find PL, DV and DP FORMAT indexes uint8_t *pl = NULL; - int npl = 0, idp=-1; - int i; + int i, npl = 0, idp = -1, idv = -1; for (i = 0; i < b->n_gi; ++i) { if (b->gi[i].fmt == bcf_str2int("PL", 2)) @@ -234,7 +348,13 @@ int call_multiallelic_gt(bcf1_t *b, bcf_p1aux_t *ma, double threshold) pl = (uint8_t*)b->gi[i].data; npl = b->gi[i].len; } - if (b->gi[i].fmt == bcf_str2int("DP", 2)) idp=i; + else if (b->gi[i].fmt == bcf_str2int("DP", 2)) idp=i; + else if (b->gi[i].fmt == bcf_str2int("DV", 2)) idv=i; + } + if ( nals==1 ) + { + if ( !var_only ) _bcf1_set_ref(b, idp); + return 1; } if ( !pl ) return -1; @@ -265,9 +385,9 @@ int call_multiallelic_gt(bcf1_t *b, bcf_p1aux_t *ma, double threshold) if ( sscanf(p+3,"%lf,%lf,%lf,%lf",&qsum[0],&qsum[1],&qsum[2],&qsum[3])!=4 ) { fprintf(stderr,"Could not parse %s\n",p); exit(1); } - // Calculate the most likely combination of alleles + // Calculate the most likely combination of alleles, remembering the most and second most likely set int ia,ib,ic, max_als=0, max_als2=0; - double ref_lk = 0, max_lk = INT_MIN, max_lk2 = INT_MIN, lk_sum = INT_MIN; + double ref_lk = 0, max_lk = INT_MIN, max_lk2 = INT_MIN, lk_sum = INT_MIN, lk_sums[3]; for (ia=0; ialk_sum ? lk_tot + log(1+exp(lk_sum-lk_tot)) : lk_sum + log(1+exp(lk_tot-lk_sum)); } + lk_sums[0] = lk_sum; if ( nals>1 ) { for (ia=0; ialk_sum ? lk_tot + log(1+exp(lk_sum-lk_tot)) : lk_sum + log(1+exp(lk_tot-lk_sum)); } } + lk_sums[1] = lk_sum; } if ( nals>2 ) { @@ -349,6 +471,7 @@ int call_multiallelic_gt(bcf1_t *b, bcf_p1aux_t *ma, double threshold) } } } + lk_sums[2] = lk_sum; } // Should we add another allele, does it increase the likelihood significantly? @@ -357,9 +480,12 @@ int call_multiallelic_gt(bcf1_t *b, bcf_p1aux_t *ma, double threshold) for (i=0; in_smpl; isample++) { int ploidy = b->ploidy ? b->ploidy[isample] : 2; double *p = pdg + isample*npdg; int ia, als = 0; - double lk = 0, lk_sum=0; + double lk = 0, lk_s = 0; for (ia=0; ia lk ) { lk = _lk; als = ia<<3 | ia; } - lk_sum += _lk; + lk_s += _lk; } if ( ploidy==2 ) { @@ -397,26 +524,48 @@ int call_multiallelic_gt(bcf1_t *b, bcf_p1aux_t *ma, double threshold) int iab = iaa - ia + ib; double _lk = 2*qsum[ia]*qsum[ib]*p[iab]; if ( _lk > lk ) { lk = _lk; als = ib<<3 | ia; } - lk_sum += _lk; + lk_s += _lk; } } } - lk = -log(1-lk/lk_sum)/0.2302585; - if ( idp>=0 && ((uint16_t*)b->gi[idp].data)[isample]==0 ) + lk = -log(1-lk/lk_s)/0.2302585; + int dp = 0; + if ( idp>=0 && (dp=((uint16_t*)b->gi[idp].data)[isample])==0 ) { + // no coverage ((uint8_t*)b->gi[old_n_gi].data)[isample] = 1<<7; ((uint8_t*)b->gi[old_n_gi+1].data)[isample] = 0; continue; } + if ( lk>99 ) lk = 99; ((uint8_t*)b->gi[old_n_gi].data)[isample] = als; - ((uint8_t*)b->gi[old_n_gi+1].data)[isample] = lk<100 ? (int)lk : 99; + ((uint8_t*)b->gi[old_n_gi+1].data)[isample] = (int)lk; + + // For MDV annotation + int dv; + if ( als && idv>=0 && (dv=((uint16_t*)b->gi[idv].data)[isample]) ) + { + if ( max_dv < dv ) max_dv = dv; + dp_nref += dp; + } + + // For HWE annotation; multiple ALT alleles treated as one + if ( !als ) nRR++; + else if ( !(als>>3&7) || !(als&7) ) nRA++; + else nAA++; gts |= 1<<(als>>3&7) | 1<<(als&7); ac[ als>>3&7 ]++; ac[ als&7 ]++; } + free(pdg); bcf_fit_alt(b,max_als); + // The VCF spec is ambiguous about QUAL: is it the probability of anything else + // (that is QUAL(non-ref) = P(ref)+P(any non-ref other than ALT)) or is it + // QUAL(non-ref)=P(ref) and QUAL(ref)=1-P(ref)? Assuming the latter. + b->qual = gts>1 ? -4.343*(ref_lk - lk_sum) : -4.343*log(1-exp(ref_lk - lk_sum)); + if ( b->qual>999 ) b->qual = 999; // Prepare BCF for output: ref, alt, filter, info, format memset(&s, 0, sizeof(kstring_t)); kputc('\0', &s); @@ -449,6 +598,14 @@ int call_multiallelic_gt(bcf1_t *b, bcf_p1aux_t *ma, double threshold) { if ( a.is_tested) ksprintf(&s, ";PV4=%.2g,%.2g,%.2g,%.2g", a.p[0], a.p[1], a.p[2], a.p[3]); ksprintf(&s, ";DP4=%d,%d,%d,%d;MQ=%d", a.d[0], a.d[1], a.d[2], a.d[3], a.mq); + ksprintf(&s, ";QBD=%e", b->qual/(a.d[0] + a.d[1] + a.d[2] + a.d[3])); + if ( dp_nref ) ksprintf(&s, ";QBDNR=%e", b->qual/dp_nref); + if ( max_dv ) ksprintf(&s, ";MDV=%d", max_dv); + } + if ( nAA+nRA ) + { + double hwe = calc_hwe(nAA, nRR, nRA); + ksprintf(&s, ";HWE=%e", hwe); } kputc('\0', &s); rm_info(&s, "I16="); @@ -457,12 +614,8 @@ int call_multiallelic_gt(bcf1_t *b, bcf_p1aux_t *ma, double threshold) kputs(b->fmt, &s); kputc('\0', &s); free(b->str); b->m_str = s.m; b->l_str = s.l; b->str = s.s; - b->qual = gts>1 ? -4.343*(ref_lk - lk_sum) : -4.343*(max_lk - lk_sum); - if ( b->qual>999 ) b->qual = 999; bcf_sync(b); - - free(pdg); return gts; } diff --git a/bcftools/prob1.h b/bcftools/prob1.h index eb0b145..6f93155 100644 --- a/bcftools/prob1.h +++ b/bcftools/prob1.h @@ -33,7 +33,7 @@ extern "C" { void bcf_p1_destroy(bcf_p1aux_t *ma); void bcf_p1_set_ploidy(bcf1_t *b, bcf_p1aux_t *ma); int bcf_p1_cal(const bcf1_t *b, int do_contrast, bcf_p1aux_t *ma, bcf_p1rst_t *rst); - int call_multiallelic_gt(bcf1_t *b, bcf_p1aux_t *ma, double threshold); + int call_multiallelic_gt(bcf1_t *b, bcf_p1aux_t *ma, double threshold, int var_only); int bcf_p1_call_gt(const bcf_p1aux_t *ma, double f0, int k); void bcf_p1_dump_afs(bcf_p1aux_t *ma); int bcf_p1_read_prior(bcf_p1aux_t *ma, const char *fn); diff --git a/samtools.1 b/samtools.1 index 718ac39..4b3f75a 100644 --- a/samtools.1 +++ b/samtools.1 @@ -30,7 +30,7 @@ bcftools index in.bcf .PP bcftools view in.bcf chr2:100-200 > out.vcf .PP -bcftools view -vc in.bcf > out.vcf 2> out.afs +bcftools view -Nvm0.99 in.bcf > out.vcf 2> out.afs .SH DESCRIPTION .PP @@ -140,17 +140,38 @@ to another samtools command. .TP .B tview -samtools tview [ref.fasta] +samtools tview +.RB [ \-p +.IR chr:pos ] +.RB [ \-s +.IR STR ] +.RB [ \-d +.IR display ] +.RI +.RI [ref.fasta] Text alignment viewer (based on the ncurses library). In the viewer, press `?' for help and press `g' to check the alignment start from a region in the format like `chr10:10,000,000' or `=10,000,000' when viewing the same reference sequence. +.B Options: +.RS +.TP 14 +.BI -d \ display +Output as (H)tml or (C)urses or (T)ext +.TP +.BI -p \ chr:pos +Go directly to this position +.TP +.BI -s \ STR +Display only reads from this sample or read group +.RE + .TP .B mpileup -.B samtools mpileup -.RB [ \-EBug ] +samtools mpileup +.RB [ \-EBugp ] .RB [ \-C .IR capQcoef ] .RB [ \-r @@ -297,6 +318,10 @@ Phred-scaled gap open sequencing error probability. Reducing .I INT leads to more indel calls. [40] .TP +.BI -p +Apply -m and -F thresholds per sample to increase sensitivity of calling. +By default both options are applied to reads pooled from all samples. +.TP .BI -P \ STR Comma dilimited list of platforms (determined by .BR @RG-PL ) @@ -570,6 +595,8 @@ Minimum base quality to be used in het calling. [13] .IR mutRate ] .RB [ \-p .IR varThres ] +.RB [ \-m +.IR varThres ] .RB [ \-P .IR prior ] .RB [ \-1 @@ -652,6 +679,12 @@ Call per-sample genotypes at variant sites (force -c) .BI -i \ FLOAT Ratio of INDEL-to-SNP mutation rate [0.15] .TP +.BI -m \ FLOAT +New model for improved multiallelic and rare-variant calling. Another +ALT allele is accepted if P(chi^2) of LRT exceeds the FLOAT threshold. The +parameter seems robust and the actual value usually does not affect the results +much; a good value to use is 0.99. This is the recommended calling method. [0] +.TP .BI -p \ FLOAT A site is considered to be a variant if P(ref|D)