From ec294adf095b60c90e57e31f3af1335138c5a22a Mon Sep 17 00:00:00 2001 From: Petr Danecek Date: Mon, 2 Jul 2012 11:36:41 +0100 Subject: [PATCH] Alternative model for multiallelic and rare-variant calling. Proof-of-principle implementation. --- bam2bcf.c | 11 +- bam2bcf.h | 1 + bcftools/bcf.c | 59 ++++------ bcftools/bcf.h | 3 + bcftools/bcfutils.c | 116 +++++++++++++++++++ bcftools/call1.c | 16 ++- bcftools/prob1.c | 273 +++++++++++++++++++++++++++++++++++++------- bcftools/prob1.h | 2 + 8 files changed, 401 insertions(+), 80 deletions(-) diff --git a/bam2bcf.c b/bam2bcf.c index 502d832..ed78ea6 100644 --- a/bam2bcf.c +++ b/bam2bcf.c @@ -195,6 +195,8 @@ int bcf_call_combine(int n, const bcf_callret1_t *calls, int ref_base /*4-bit*/, for (i = 0; i < n; ++i) for (j = 0; j < 4; ++j) qsum[j] += calls[i].qsum[j]; + int qsum_tot=0; + for (j=0; j<4; j++) qsum_tot += qsum[j]; for (j = 0; j < 4; ++j) qsum[j] = qsum[j] << 2 | j; // find the top 2 alleles for (i = 1; i < 4; ++i) // insertion sort @@ -206,9 +208,15 @@ int bcf_call_combine(int n, const bcf_callret1_t *calls, int ref_base /*4-bit*/, call->a[0] = ref4; for (i = 3, j = 1; i >= 0; --i) { if ((qsum[i]&3) != ref4) { - if (qsum[i]>>2 != 0) call->a[j++] = qsum[i]&3; + if (qsum[i]>>2 != 0) + { + call->qsum[j] = (float)(qsum[i]>>2)/qsum_tot; + call->a[j++] = qsum[i]&3; + } else break; } + else + call->qsum[0] = (float)(qsum[i]>>2)/qsum_tot; } if (ref_base >= 0) { // for SNPs, find the "unseen" base if (((ref4 < 4 && j < 4) || (ref4 == 4 && j < 5)) && i >= 0) @@ -311,6 +319,7 @@ int bcf_call2bcf(int tid, int pos, bcf_call_t *bc, bcf1_t *b, bcf_callret1_t *bc if (i) kputc(',', &s); kputw(bc->anno[i], &s); } + ksprintf(&s,";QS=%f,%f,%f,%f", bc->qsum[0],bc->qsum[1],bc->qsum[2],bc->qsum[3]); if (bc->vdb != 1) ksprintf(&s, ";VDB=%.4f", bc->vdb); kputc('\0', &s); diff --git a/bam2bcf.h b/bam2bcf.h index a31ac9d..e3ecc07 100644 --- a/bam2bcf.h +++ b/bam2bcf.h @@ -36,6 +36,7 @@ typedef struct { typedef struct { int a[5]; // alleles: ref, alt, alt2, alt3 + float qsum[4]; int n, n_alleles, shift, ori_ref, unseen; int n_supp; // number of supporting non-reference reads int anno[16], depth, ori_depth; diff --git a/bcftools/bcf.c b/bcftools/bcf.c index 0524408..e3daaeb 100644 --- a/bcftools/bcf.c +++ b/bcftools/bcf.c @@ -240,31 +240,24 @@ void bcf_fmt_core(const bcf_hdr_t *h, bcf1_t *b, kstring_t *s) } } for (j = 0; j < h->n_smpl; ++j) { - - // Determine GT with maximum PL (multiple ALT sites only) - int imax=-1; - if ( iPL!=-1 ) { - uint8_t *d = (uint8_t*)b->gi[iPL].data + j * x; - int k,identical=1; - imax=0; - for (k=1; kploidy ? b->ploidy[j] : 2; kputc('\t', s); for (i = 0; i < b->n_gi; ++i) { if (i) kputc(':', s); if (b->gi[i].fmt == bcf_str2int("PL", 2)) { uint8_t *d = (uint8_t*)b->gi[i].data + j * x; int k; - for (k = 0; k < x; ++k) { - if (k > 0) kputc(',', s); - kputw(d[k], s); - } + if ( ploidy==1 ) + for (k=0; kn_alleles; k++) + { + if (k>0) kputc(',', s); + kputw(d[(k+1)*(k+2)/2-1], s); + } + else + for (k = 0; k < x; ++k) { + if (k > 0) kputc(',', s); + kputw(d[k], s); + } } else if (b->gi[i].fmt == bcf_str2int("DP", 2) || b->gi[i].fmt == bcf_str2int("DV", 2)) { kputw(((uint16_t*)b->gi[i].data)[j], s); } else if (b->gi[i].fmt == bcf_str2int("GQ", 2)) { @@ -273,28 +266,22 @@ void bcf_fmt_core(const bcf_hdr_t *h, bcf1_t *b, kstring_t *s) kputw(((int32_t*)b->gi[i].data)[j], s); } else if (b->gi[i].fmt == bcf_str2int("GT", 2)) { int y = ((uint8_t*)b->gi[i].data)[j]; - if ( y>>7&1 ) - kputsn("./.", 3, s); - else if ( imax==-1 ) + if ( ploidy==1 ) { - kputc('0' + (y>>3&7), s); - kputc("/|"[y>>6&1], s); - kputc('0' + (y&7), s); + if ( y>>7&1 ) + kputsn(".", 3, s); + else + kputc('0' + (y>>3&7), s); } else { - // Arguably, the while loop will be faster than two sqrts - int n = 0; - int row = 1; - while ( n>7&1 ) + kputsn("./.", 3, s); + else { + kputc('0' + (y>>3&7), s); + kputc("/|"[y>>6&1], s); + kputc('0' + (y&7), s); } - row--; - kputw(imax-n+row, s); - kputc("/|"[y>>6&1], s); - kputw(row, s); } } else if (b->gi[i].fmt == bcf_str2int("GL", 2)) { float *d = (float*)b->gi[i].data + j * x; diff --git a/bcftools/bcf.h b/bcftools/bcf.h index 822ae5c..4fe3ed7 100644 --- a/bcftools/bcf.h +++ b/bcftools/bcf.h @@ -73,6 +73,7 @@ typedef struct { bcf_ginfo_t *gi; // array of geno fields int n_alleles, n_smpl; // number of alleles and samples // derived info: ref, alt, flt, info, fmt (<-str), n_gi (<-fmt), n_alleles (<-alt), n_smpl (<-bcf_hdr_t::n_smpl) + uint8_t *ploidy; // ploidy of all samples; if NULL, ploidy of 2 is assumed. } bcf1_t; typedef struct { @@ -142,6 +143,8 @@ extern "C" { // keep the first n alleles and discard the rest int bcf_shrink_alt(bcf1_t *b, int n); + // keep the masked alleles and discard the rest + void bcf_fit_alt(bcf1_t *b, int mask); // convert GL to PL int bcf_gl2pl(bcf1_t *b); // if the site is an indel diff --git a/bcftools/bcfutils.c b/bcftools/bcfutils.c index 0eab4c1..7988e58 100644 --- a/bcftools/bcfutils.c +++ b/bcftools/bcfutils.c @@ -1,5 +1,6 @@ #include #include +#include #include "bcf.h" #include "kstring.h" #include "khash.h" @@ -66,6 +67,121 @@ int bcf_str2id_add(void *_hash, const char *str) return kh_val(hash, k); } +void bcf_fit_alt(bcf1_t *b, int mask) +{ + mask |= 1; // REF must be always present + + int i,j,nals=0; + for (i=0; in_alleles <= nals ) return; + +#if DBG + printf("fit_alt: %s, %s, mask=%d, nals=%d: ", b->ref, b->alt, mask, nals); + for (i=0; i1 ) + { + char *dst, *src; + int n=0, nalts=nals-1; + for (src=dst=p=b->alt, i=1; *p; p++) + { + if ( *p!=',' ) continue; + + if ( mask&1<=nalts ) { *dst=0; break; } + src = p+1; + } + if ( nalt, *p = '\0'; +#if DBG + printf("fit_alt: %s, mask=%d\n", b->alt, mask); +#endif + p++; + memmove(p, b->flt, b->str + b->l_str - b->flt); + b->l_str -= b->flt - p; + + // update PL and GT + int ipl=-1, igt=-1; + for (i = 0; i < b->n_gi; ++i) + { + bcf_ginfo_t *g = b->gi + i; + if (g->fmt == bcf_str2int("PL", 2)) ipl = i; + if (g->fmt == bcf_str2int("GT", 2)) igt = i; + } + + // .. create mapping between old and new indexes + int npl = nals * (nals+1) / 2; + int *map = malloc(sizeof(int)*(npl>b->n_alleles ? npl : b->n_alleles)); + int kori=0,knew=0; + for (i=0; in_alleles; i++) + { + for (j=0; j<=i; j++) + { + int skip=0; + if ( i && !(mask&1<n_smpl; + for (i = 0; i < b->n_gi; ++i) + { + bcf_ginfo_t *g = b->gi + i; + if (g->fmt == bcf_str2int("PL", 2)) + { + g->len = npl; + uint8_t *d = (uint8_t*)g->data; + int ismpl, npl_ori = b->n_alleles * (b->n_alleles + 1) / 2; + for (knew=ismpl=0; ismpln_alleles; i++) + map[i] = mask&1<gi[igt].data)[i]; + int a1 = (gt>>3)&7; + int a2 = gt&7; + assert( map[a1]>=0 && map[a2]>=0 ); + ((uint8_t*)b->gi[igt].data)[i] = ((1<<7|1<<6)>) | map[a1]<<3 | map[a2]; + } + free(map); + b->n_alleles = nals; +} + int bcf_shrink_alt(bcf1_t *b, int n) { char *p; diff --git a/bcftools/call1.c b/bcftools/call1.c index 6c53008..13f5533 100644 --- a/bcftools/call1.c +++ b/bcftools/call1.c @@ -40,7 +40,7 @@ typedef struct { uint32_t *trio_aux; char *prior_file, **subsam, *fn_dict; uint8_t *ploidy; - double theta, pref, indel_frac, min_perm_p, min_smpl_frac, min_lrt; + double theta, pref, indel_frac, min_perm_p, min_smpl_frac, min_lrt, min_ma_lrt; void *bed; } viewconf_t; @@ -319,9 +319,9 @@ int bcfview(int argc, char *argv[]) tid = begin = end = -1; memset(&vc, 0, sizeof(viewconf_t)); - vc.prior_type = vc.n1 = -1; vc.theta = 1e-3; vc.pref = 0.5; vc.indel_frac = -1.; vc.n_perm = 0; vc.min_perm_p = 0.01; vc.min_smpl_frac = 0; vc.min_lrt = 1; + vc.prior_type = vc.n1 = -1; vc.theta = 1e-3; vc.pref = 0.5; vc.indel_frac = -1.; vc.n_perm = 0; vc.min_perm_p = 0.01; vc.min_smpl_frac = 0; vc.min_lrt = 1; vc.min_ma_lrt = -1; memset(qcnt, 0, 8 * 256); - while ((c = getopt(argc, argv, "FN1:l:cC:eHAGvbSuP:t:p:QgLi:IMs:D:U:X:d:T:Yw")) >= 0) { + while ((c = getopt(argc, argv, "FN1:l:cC:eHAGvbSuP:t:p:QgLi:IMs:D:U:X:d:T:Ywm:")) >= 0) { switch (c) { case '1': vc.n1 = atoi(optarg); break; case 'l': vc.bed = bed_read(optarg); break; @@ -341,6 +341,7 @@ int bcfview(int argc, char *argv[]) case 'w': vc.flag |= VC_INDEL_ONLY; break; case 'M': vc.flag |= VC_ANNO_MAX; break; case 'Y': vc.flag |= VC_QCNT; break; + case 'm': vc.min_ma_lrt = atof(optarg); break; case 't': vc.theta = atof(optarg); break; case 'p': vc.pref = atof(optarg); break; case 'i': vc.indel_frac = atof(optarg); break; @@ -396,6 +397,7 @@ int bcfview(int argc, char *argv[]) fprintf(stderr, " -g call genotypes at variant sites (force -c)\n"); fprintf(stderr, " -i FLOAT indel-to-substitution ratio [%.4g]\n", vc.indel_frac); fprintf(stderr, " -I skip indels\n"); + fprintf(stderr, " -m FLOAT alternative model for multiallelic and rare-variant calling, include if P(chi^2)>=FLOAT\n"); fprintf(stderr, " -p FLOAT variant if P(ref|D)=0 ) + { + int gts = call_multiallelic_gt(b,p1,vc.min_ma_lrt); + if ( gts==0 && vc.flag & VC_VARONLY ) continue; + } + else if (vc.flag & VC_CALL) { // call variants bcf_p1rst_t pr; int calret = bcf_p1_cal(b, (em[7] >= 0 && em[7] < vc.min_lrt), p1, &pr); if (n_processed % 100000 == 0) { diff --git a/bcftools/prob1.c b/bcftools/prob1.c index 83bd8e2..2a9c036 100644 --- a/bcftools/prob1.c +++ b/bcftools/prob1.c @@ -4,7 +4,9 @@ #include #include #include +#include #include "prob1.h" +#include "kstring.h" #include "kseq.h" KSTREAM_INIT(gzFile, gzread, 16384) @@ -174,6 +176,13 @@ int bcf_p1_set_n1(bcf_p1aux_t *b, int n1) return 0; } +void bcf_p1_set_ploidy(bcf1_t *b, bcf_p1aux_t *ma) +{ + // bcf_p1aux_t fields are not visible outside of prob1.c, hence this wrapper. + // Ideally, this should set ploidy per site to allow pseudo-autosomal regions + b->ploidy = ma->ploidy; +} + void bcf_p1_destroy(bcf_p1aux_t *ma) { if (ma) { @@ -191,54 +200,240 @@ void bcf_p1_destroy(bcf_p1aux_t *ma) } } -static int cal_pdg(const bcf1_t *b, bcf_p1aux_t *ma) +extern double kf_gammap(double s, double z); + +int call_multiallelic_gt(bcf1_t *b, bcf_p1aux_t *ma, double threshold) { - int i, j; - int n = (b->n_alleles+1)*b->n_alleles/2; - double *lk = alloca(n * sizeof(long)); - memset(lk, 0, sizeof(double) * n); - for (j = 0; j < ma->n; ++j) { - const uint8_t *pi = ma->PL + j * ma->PL_len; - double *pdg = ma->pdg + j * 3; - pdg[0] = ma->q2p[pi[2]]; pdg[1] = ma->q2p[pi[1]]; pdg[2] = ma->q2p[pi[0]]; - for (i=0; ialt; *p; p++) + { + if ( *p=='X' || p[0]=='.' ) break; + if ( p[0]==',' ) nals++; } + if ( b->alt[0] && !*p ) nals++; + + if ( nals==1 ) return 1; - double norm=lk[0]; - for (i=1; in_alleles; i++) + if ( nals>4 ) { fprintf(stderr,"too many alts: %d\n", nals); exit(1); } + + // set PL and PL_len + uint8_t *pl = NULL; + int npl = 0, idp=-1; + int i; + for (i = 0; i < b->n_gi; ++i) + { + if (b->gi[i].fmt == bcf_str2int("PL", 2)) + { + pl = (uint8_t*)b->gi[i].data; + npl = b->gi[i].len; + } + if (b->gi[i].fmt == bcf_str2int("DP", 2)) idp=i; + } + if ( !pl ) return -1; + + int npdg = nals*(nals+1)/2; + float *pdg,*_pdg; + _pdg = pdg = malloc(sizeof(float)*ma->n*npdg); + for (i=0; in; i++) + { + int j; + float sum = 0; + for (j=0; jinfo, "QS=")) == 0) { fprintf(stderr,"INFO/QS is required with -m, exiting\n"); exit(1); } + float qsum[4]; + if ( sscanf(p+3,"%f,%f,%f,%f",&qsum[0],&qsum[1],&qsum[2],&qsum[3])!=4 ) { fprintf(stderr,"Could not parse %s\n",p); exit(1); } + + + int ia,ib,ic, max_als=0, max_als2=0; + float max_lk = INT_MIN, max_lk2 = INT_MIN, lk_sum = INT_MIN; + for (ia=0; ian; isample++) + { + float *p = pdg + isample*npdg; + assert( log(p[iaa]) <= 0 ); + lk_tot += log(p[iaa]); + } + if ( max_lklk_sum ? lk_tot + log(1+exp(lk_sum-lk_tot)) : lk_sum + log(1+exp(lk_tot-lk_sum)); + } + for (ia=0; ian; isample++) + { + if ( b->ploidy && b->ploidy[isample]==1 ) continue; + float *p = pdg + isample*npdg; + assert( log(fa*p[iaa] + fb*p[ibb] + fab*p[iab]) <= 0 ); + lk_tot += log(fa*p[iaa] + fb*p[ibb] + fab*p[iab]); + } + if ( max_lklk_sum ? lk_tot + log(1+exp(lk_sum-lk_tot)) : lk_sum + log(1+exp(lk_tot-lk_sum)); + } } - #endif - for (i=0; iis_indel ? b->n_alleles : b->n_alleles-1; - for (i=0; in_alleles; j++) { k=j*(j+1)/2+i; pr+=lk[k]; } - #if DBG - printf("%d\t%e\n", i,pr); - #endif - if (pmaxn; isample++) + { + if ( b->ploidy && b->ploidy[isample]==1 ) continue; + float *p = pdg + isample*npdg; + assert( log(fa*p[iaa] + fb*p[ibb] + fc*p[icc] + fab*p[iab] + fac*p[iac] + fbc*p[ibc]) <= 0 ); + lk_tot += log(fa*p[iaa] + fb*p[ibb] + fc*p[icc] + fab*p[iab] + fac*p[iac] + fbc*p[ibc]); + } + if ( max_lklk_sum ? lk_tot + log(1+exp(lk_sum-lk_tot)) : lk_sum + log(1+exp(lk_tot-lk_sum)); + } + } } - return i-1; + + int n1=0, n2=0; + for (i=0; iref, &s); kputc('\0', &s); + kputs(b->alt, &s); kputc('\0', &s); kputc('\0', &s); + kputs(b->info, &s); if (b->info[0]) kputc(';', &s); kputc('\0', &s); + kputs(b->fmt, &s); kputc('\0', &s); + free(b->str); + b->m_str = s.m; b->l_str = s.l; b->str = s.s; + b->qual = -4.343*(log(1-exp(max_lk-lk_sum))); + if ( b->qual>999 ) b->qual = 999; + //bcf_sync(b); + + int x, old_n_gi = b->n_gi; + s.m = b->m_str; s.l = b->l_str - 1; s.s = b->str; + kputs(":GT:GQ", &s); kputc('\0', &s); + b->m_str = s.m; b->l_str = s.l; b->str = s.s; + bcf_sync(b); + + // Call GT + int isample, gts=0; + for (isample = 0; isample < b->n_smpl; isample++) + { + int ploidy = b->ploidy ? b->ploidy[isample] : 2; + float *p = pdg + isample*npdg; + int ia, als = 0; + float lk = INT_MIN, lk_sum=0; + for (ia=0; ia lk ) { lk = _lk; als = ia<<3 | ia; } + lk_sum += _lk; + } + if ( ploidy==2 ) + { + for (ia=0; ia lk ) { lk = _lk; als = ib<<3 | ia; } + lk_sum += _lk; + } + } + } + lk = -log(1-lk/lk_sum)/0.2302585; + if ( idp>=0 && ((uint16_t*)b->gi[idp].data)[isample]==0 ) + { + als |= 1<<7; + lk = 0; + } + ((uint8_t*)b->gi[old_n_gi].data)[isample] = als; + ((uint8_t*)b->gi[old_n_gi+1].data)[isample] = lk<100 ? (int)lk : 99; + + gts |= (als>>3&7) | (als&7); + } + bcf_fit_alt(b,max_als); + bcf_sync(b); + + free(pdg); + return gts; +} + +static int cal_pdg(const bcf1_t *b, bcf_p1aux_t *ma) +{ + int i, j; + long *p, tmp; + p = alloca(b->n_alleles * sizeof(long)); + memset(p, 0, sizeof(long) * b->n_alleles); + for (j = 0; j < ma->n; ++j) { + const uint8_t *pi = ma->PL + j * ma->PL_len; + double *pdg = ma->pdg + j * 3; + pdg[0] = ma->q2p[pi[2]]; pdg[1] = ma->q2p[pi[1]]; pdg[2] = ma->q2p[pi[0]]; + for (i = 0; i < b->n_alleles; ++i) + p[i] += (int)pi[(i+1)*(i+2)/2-1]; + } + for (i = 0; i < b->n_alleles; ++i) p[i] = p[i]<<4 | i; + for (i = 1; i < b->n_alleles; ++i) // insertion sort + for (j = i; j > 0 && p[j] < p[j-1]; --j) + tmp = p[j], p[j] = p[j-1], p[j-1] = tmp; + for (i = b->n_alleles - 1; i >= 0; --i) + if ((p[i]&0xf) == 0) break; + return i; } + int bcf_p1_call_gt(const bcf_p1aux_t *ma, double f0, int k) { double sum, g[3]; diff --git a/bcftools/prob1.h b/bcftools/prob1.h index 0a51a0a..88ed5bf 100644 --- a/bcftools/prob1.h +++ b/bcftools/prob1.h @@ -26,7 +26,9 @@ extern "C" { void bcf_p1_init_prior(bcf_p1aux_t *ma, int type, double theta); void bcf_p1_init_subprior(bcf_p1aux_t *ma, int type, double theta); void bcf_p1_destroy(bcf_p1aux_t *ma); + void bcf_p1_set_ploidy(bcf1_t *b, bcf_p1aux_t *ma); int bcf_p1_cal(const bcf1_t *b, int do_contrast, bcf_p1aux_t *ma, bcf_p1rst_t *rst); + int call_multiallelic_gt(bcf1_t *b, bcf_p1aux_t *ma, double threshold); int bcf_p1_call_gt(const bcf_p1aux_t *ma, double f0, int k); void bcf_p1_dump_afs(bcf_p1aux_t *ma); int bcf_p1_read_prior(bcf_p1aux_t *ma, const char *fn); -- 2.39.2