#include <stdio.h>
#include <errno.h>
#include <assert.h>
+#include <limits.h>
#include "prob1.h"
+#include "kstring.h"
#include "kseq.h"
KSTREAM_INIT(gzFile, gzread, 16384)
return 0;
}
+void bcf_p1_set_ploidy(bcf1_t *b, bcf_p1aux_t *ma)
+{
+ // bcf_p1aux_t fields are not visible outside of prob1.c, hence this wrapper.
+ // Ideally, this should set ploidy per site to allow pseudo-autosomal regions
+ b->ploidy = ma->ploidy;
+}
+
void bcf_p1_destroy(bcf_p1aux_t *ma)
{
if (ma) {
}
}
+extern double kf_gammap(double s, double z);
+
+int call_multiallelic_gt(bcf1_t *b, bcf_p1aux_t *ma, double threshold)
+{
+ int nals = 1;
+ char *p;
+ for (p=b->alt; *p; p++)
+ {
+ if ( *p=='X' || p[0]=='.' ) break;
+ if ( p[0]==',' ) nals++;
+ }
+ if ( b->alt[0] && !*p ) nals++;
+
+ if ( nals==1 ) return 1;
+
+ if ( nals>4 ) { fprintf(stderr,"too many alts: %d\n", nals); exit(1); }
+
+ // set PL and PL_len
+ uint8_t *pl = NULL;
+ int npl = 0, idp=-1;
+ int i;
+ for (i = 0; i < b->n_gi; ++i)
+ {
+ if (b->gi[i].fmt == bcf_str2int("PL", 2))
+ {
+ pl = (uint8_t*)b->gi[i].data;
+ npl = b->gi[i].len;
+ }
+ if (b->gi[i].fmt == bcf_str2int("DP", 2)) idp=i;
+ }
+ if ( !pl ) return -1;
+
+ int npdg = nals*(nals+1)/2;
+ float *pdg,*_pdg;
+ _pdg = pdg = malloc(sizeof(float)*ma->n*npdg);
+ for (i=0; i<ma->n; i++)
+ {
+ int j;
+ float sum = 0;
+ for (j=0; j<npdg; j++)
+ {
+ _pdg[j] = pow(10,-0.1*pl[j]);
+ sum += _pdg[j];
+ }
+ if ( sum )
+ for (j=0; j<npdg; j++) _pdg[j] /= sum;
+ _pdg += npdg;
+ pl += npl;
+ }
+
+ if ((p = strstr(b->info, "QS=")) == 0) { fprintf(stderr,"INFO/QS is required with -m, exiting\n"); exit(1); }
+ float qsum[4];
+ if ( sscanf(p+3,"%f,%f,%f,%f",&qsum[0],&qsum[1],&qsum[2],&qsum[3])!=4 ) { fprintf(stderr,"Could not parse %s\n",p); exit(1); }
+
+
+ int ia,ib,ic, max_als=0, max_als2=0;
+ float max_lk = INT_MIN, max_lk2 = INT_MIN, lk_sum = INT_MIN;
+ for (ia=0; ia<nals; ia++)
+ {
+ //if ( ia && qsum[ia]==0 ) continue;
+ float lk_tot = 0;
+ int iaa = (ia+1)*(ia+2)/2-1;
+ int isample;
+ for (isample=0; isample<ma->n; isample++)
+ {
+ float *p = pdg + isample*npdg;
+ assert( log(p[iaa]) <= 0 );
+ lk_tot += log(p[iaa]);
+ }
+ if ( max_lk<lk_tot ) { max_lk2 = max_lk; max_als2 = max_als; max_lk = lk_tot; max_als = 1<<ia; }
+ else if ( max_lk2<lk_tot ) { max_lk2 = lk_tot; max_als2 = 1<<ia; }
+ lk_sum = lk_tot>lk_sum ? lk_tot + log(1+exp(lk_sum-lk_tot)) : lk_sum + log(1+exp(lk_tot-lk_sum));
+ }
+ for (ia=0; ia<nals; ia++)
+ {
+ if ( qsum[ia]==0 ) continue;
+ //if ( ia && qsum[ia]==0 ) continue;
+ for (ib=0; ib<ia; ib++)
+ {
+ if ( qsum[ib]==0 ) continue;
+ //if ( ib && qsum[ib]==0 ) continue;
+ //if ( qsum[ia]+qsum[ib]==0 ) continue;
+ float lk_tot = 0;
+ float fa = qsum[ia]/(qsum[ia]+qsum[ib]);
+ float fb = qsum[ib]/(qsum[ia]+qsum[ib]);
+ float fab = 2*fa*fb; fa *= fa; fb *= fb;
+ int isample, iaa = (ia+1)*(ia+2)/2-1, ibb = (ib+1)*(ib+2)/2-1, iab = iaa - ia + ib;
+ for (isample=0; isample<ma->n; isample++)
+ {
+ if ( b->ploidy && b->ploidy[isample]==1 ) continue;
+ float *p = pdg + isample*npdg;
+ assert( log(fa*p[iaa] + fb*p[ibb] + fab*p[iab]) <= 0 );
+ lk_tot += log(fa*p[iaa] + fb*p[ibb] + fab*p[iab]);
+ }
+ if ( max_lk<lk_tot ) { max_lk2 = max_lk; max_als2 = max_als; max_lk = lk_tot; max_als = 1<<ia|1<<ib; }
+ else if ( max_lk2<lk_tot ) { max_lk2 = lk_tot; max_als2 = 1<<ia|1<<ib; }
+ lk_sum = lk_tot>lk_sum ? lk_tot + log(1+exp(lk_sum-lk_tot)) : lk_sum + log(1+exp(lk_tot-lk_sum));
+ }
+ }
+ for (ia=0; ia<nals; ia++)
+ {
+ if ( qsum[ia]==0 ) continue;
+ //if ( ia && qsum[ia]==0 ) continue;
+ for (ib=0; ib<ia; ib++)
+ {
+ if ( qsum[ib]==0 ) continue;
+ //if ( ib && qsum[ib]==0 ) continue;
+ for (ic=0; ic<ib; ic++)
+ {
+ if ( qsum[ic]==0 ) continue;
+ //if ( ic && qsum[ic]==0 ) continue;
+ //if ( qsum[ia]+qsum[ib]+qsum[ic]==0 ) continue;
+ float lk_tot = 0;
+ float fa = qsum[ia]/(qsum[ia]+qsum[ib]+qsum[ic]);
+ float fb = qsum[ib]/(qsum[ia]+qsum[ib]+qsum[ic]);
+ float fc = qsum[ic]/(qsum[ia]+qsum[ib]+qsum[ic]);
+ float fab = 2*fa*fb, fac = 2*fa*fc, fbc = 2*fb*fc; fa *= fa; fb *= fb; fc *= fc;
+ int isample, iaa = (ia+1)*(ia+2)/2-1, ibb = (ib+1)*(ib+2)/2-1, icc = (ic+1)*(ic+2)/2-1;
+ int iab = iaa - ia + ib, iac = iaa - ia + ic, ibc = ibb - ib + ic;
+ for (isample=0; isample<ma->n; isample++)
+ {
+ if ( b->ploidy && b->ploidy[isample]==1 ) continue;
+ float *p = pdg + isample*npdg;
+ assert( log(fa*p[iaa] + fb*p[ibb] + fc*p[icc] + fab*p[iab] + fac*p[iac] + fbc*p[ibc]) <= 0 );
+ lk_tot += log(fa*p[iaa] + fb*p[ibb] + fc*p[icc] + fab*p[iab] + fac*p[iac] + fbc*p[ibc]);
+ }
+ if ( max_lk<lk_tot ) { max_lk2 = max_lk; max_als2 = max_als; max_lk = lk_tot; max_als = 1<<ia|1<<ib|1<<ic; }
+ else if ( max_lk2<lk_tot ) { max_lk2 = lk_tot; max_als2 = 1<<ia|1<<ib|1<<ic; }
+ lk_sum = lk_tot>lk_sum ? lk_tot + log(1+exp(lk_sum-lk_tot)) : lk_sum + log(1+exp(lk_tot-lk_sum));
+ }
+ }
+ }
+
+ int n1=0, n2=0;
+ for (i=0; i<nals; i++) if ( max_als&1<<i) n1++;
+ for (i=0; i<nals; i++) if ( max_als2&1<<i) n2++;
+ if ( n2<n1 && kf_gammap(1,2.0*(max_lk-max_lk2))<threshold )
+ {
+ max_lk = max_lk2;
+ max_als = max_als2;
+ }
+
+ // Get the BCF record ready for output
+ kstring_t s;
+ memset(&s, 0, sizeof(kstring_t));
+ kputc('\0', &s); kputs(b->ref, &s); kputc('\0', &s);
+ kputs(b->alt, &s); kputc('\0', &s); kputc('\0', &s);
+ kputs(b->info, &s); if (b->info[0]) kputc(';', &s); kputc('\0', &s);
+ kputs(b->fmt, &s); kputc('\0', &s);
+ free(b->str);
+ b->m_str = s.m; b->l_str = s.l; b->str = s.s;
+ b->qual = -4.343*(log(1-exp(max_lk-lk_sum)));
+ if ( b->qual>999 ) b->qual = 999;
+ //bcf_sync(b);
+
+ int x, old_n_gi = b->n_gi;
+ s.m = b->m_str; s.l = b->l_str - 1; s.s = b->str;
+ kputs(":GT:GQ", &s); kputc('\0', &s);
+ b->m_str = s.m; b->l_str = s.l; b->str = s.s;
+ bcf_sync(b);
+
+ // Call GT
+ int isample, gts=0;
+ for (isample = 0; isample < b->n_smpl; isample++)
+ {
+ int ploidy = b->ploidy ? b->ploidy[isample] : 2;
+ float *p = pdg + isample*npdg;
+ int ia, als = 0;
+ float lk = INT_MIN, lk_sum=0;
+ for (ia=0; ia<nals; ia++)
+ {
+ if ( !(max_als&1<<ia) ) continue;
+ int iaa = (ia+1)*(ia+2)/2-1;
+ float _lk = p[iaa]*qsum[ia]*qsum[ia];
+ if ( _lk > lk ) { lk = _lk; als = ia<<3 | ia; }
+ lk_sum += _lk;
+ }
+ if ( ploidy==2 )
+ {
+ for (ia=0; ia<nals; ia++)
+ {
+ if ( !(max_als&1<<ia) ) continue;
+ int iaa = (ia+1)*(ia+2)/2-1;
+ for (ib=0; ib<ia; ib++)
+ {
+ if ( !(max_als&1<<ib) ) continue;
+ int iab = iaa - ia + ib;
+ float _lk = 2*qsum[ia]*qsum[ib]*p[iab];
+ if ( _lk > lk ) { lk = _lk; als = ib<<3 | ia; }
+ lk_sum += _lk;
+ }
+ }
+ }
+ lk = -log(1-lk/lk_sum)/0.2302585;
+ if ( idp>=0 && ((uint16_t*)b->gi[idp].data)[isample]==0 )
+ {
+ als |= 1<<7;
+ lk = 0;
+ }
+ ((uint8_t*)b->gi[old_n_gi].data)[isample] = als;
+ ((uint8_t*)b->gi[old_n_gi+1].data)[isample] = lk<100 ? (int)lk : 99;
+
+ gts |= (als>>3&7) | (als&7);
+ }
+ bcf_fit_alt(b,max_als);
+ bcf_sync(b);
+
+ free(pdg);
+ return gts;
+}
+
static int cal_pdg(const bcf1_t *b, bcf_p1aux_t *ma)
{
- int i, j;
- long *p, tmp;
- p = alloca(b->n_alleles * sizeof(long));
- memset(p, 0, sizeof(long) * b->n_alleles);
- for (j = 0; j < ma->n; ++j) {
- const uint8_t *pi = ma->PL + j * ma->PL_len;
- double *pdg = ma->pdg + j * 3;
- pdg[0] = ma->q2p[pi[2]]; pdg[1] = ma->q2p[pi[1]]; pdg[2] = ma->q2p[pi[0]];
- for (i = 0; i < b->n_alleles; ++i)
- p[i] += (int)pi[(i+1)*(i+2)/2-1];
- }
- for (i = 0; i < b->n_alleles; ++i) p[i] = p[i]<<4 | i;
- for (i = 1; i < b->n_alleles; ++i) // insertion sort
- for (j = i; j > 0 && p[j] < p[j-1]; --j)
- tmp = p[j], p[j] = p[j-1], p[j-1] = tmp;
- for (i = b->n_alleles - 1; i >= 0; --i)
- if ((p[i]&0xf) == 0) break;
- return i;
+ int i, j;
+ long *p, tmp;
+ p = alloca(b->n_alleles * sizeof(long));
+ memset(p, 0, sizeof(long) * b->n_alleles);
+ for (j = 0; j < ma->n; ++j) {
+ const uint8_t *pi = ma->PL + j * ma->PL_len;
+ double *pdg = ma->pdg + j * 3;
+ pdg[0] = ma->q2p[pi[2]]; pdg[1] = ma->q2p[pi[1]]; pdg[2] = ma->q2p[pi[0]];
+ for (i = 0; i < b->n_alleles; ++i)
+ p[i] += (int)pi[(i+1)*(i+2)/2-1];
+ }
+ for (i = 0; i < b->n_alleles; ++i) p[i] = p[i]<<4 | i;
+ for (i = 1; i < b->n_alleles; ++i) // insertion sort
+ for (j = i; j > 0 && p[j] < p[j-1]; --j)
+ tmp = p[j], p[j] = p[j-1], p[j-1] = tmp;
+ for (i = b->n_alleles - 1; i >= 0; --i)
+ if ((p[i]&0xf) == 0) break;
+ return i;
}
+
int bcf_p1_call_gt(const bcf_p1aux_t *ma, double f0, int k)
{
double sum, g[3];