* negative if we are looking at an indel. */
int bcf_call_glfgen(int _n, const bam_pileup1_t *pl, int ref_base, bcf_callaux_t *bca, bcf_callret1_t *r)
{
+ static int *var_pos = NULL, nvar_pos = 0;
int i, n, ref4, is_indel, ori_depth = 0;
memset(r, 0, sizeof(bcf_callret1_t));
if (ref_base >= 0) {
r->depth = n; r->ori_depth = ori_depth;
// glfgen
errmod_cal(bca->e, n, 5, bca->bases, r->p);
+
+ // Calculate the Variant Distance Bias (make it optional?)
+ if ( nvar_pos < _n ) {
+ nvar_pos = _n;
+ var_pos = realloc(var_pos,sizeof(int)*nvar_pos);
+ }
+ int alt_dp=0, read_len=0;
+ for (i=0; i<_n; i++) {
+ const bam_pileup1_t *p = pl + i;
+ if ( bam1_seqi(bam1_seq(p->b),p->qpos) == ref_base )
+ continue;
+
+ var_pos[alt_dp] = p->qpos;
+ if ( (bam1_cigar(p->b)[0]&BAM_CIGAR_MASK)==4 )
+ var_pos[alt_dp] -= bam1_cigar(p->b)[0]>>BAM_CIGAR_SHIFT;
+
+ alt_dp++;
+ read_len += p->b->core.l_qseq;
+ }
+ float mvd=0;
+ int j;
+ n=0;
+ for (i=0; i<alt_dp; i++) {
+ for (j=0; j<i; j++) {
+ mvd += abs(var_pos[i] - var_pos[j]);
+ n++;
+ }
+ }
+ r->mvd[0] = n ? mvd/n : 0;
+ r->mvd[1] = alt_dp;
+ r->mvd[2] = alt_dp ? read_len/alt_dp : 0;
+
return r->depth;
}
+
+void calc_vdb(int n, const bcf_callret1_t *calls, bcf_call_t *call)
+{
+ // Variant distance bias. Samples merged by means of DP-weighted average.
+
+ float weight=0, tot_prob=0;
+
+ int i;
+ for (i=0; i<n; i++)
+ {
+ int mvd = calls[i].mvd[0];
+ int dp = calls[i].mvd[1];
+ int read_len = calls[i].mvd[2];
+
+ if ( dp<2 ) continue;
+
+ float prob = 0;
+ if ( dp==2 )
+ {
+ // Exact formula
+ prob = (mvd==0) ? 1.0/read_len : (read_len-mvd)*2.0/read_len/read_len;
+ }
+ else if ( dp==3 )
+ {
+ // Sin, quite accurate approximation
+ float mu = read_len/2.9;
+ prob = mvd>2*mu ? 0 : sin(mvd*3.14/2/mu) / (4*mu/3.14);
+ }
+ else
+ {
+ // Scaled gaussian curve, crude approximation, but behaves well. Using fixed depth for bigger depths.
+ if ( dp>5 )
+ dp = 5;
+ float sigma2 = (read_len/1.9/(dp+1)) * (read_len/1.9/(dp+1));
+ float norm = 1.125*sqrt(2*3.14*sigma2);
+ float mu = read_len/2.9;
+ if ( mvd < mu )
+ prob = exp(-(mvd-mu)*(mvd-mu)/2/sigma2)/norm;
+ else
+ prob = exp(-(mvd-mu)*(mvd-mu)/3.125/sigma2)/norm;
+ }
+
+ //fprintf(stderr,"dp=%d mvd=%d read_len=%d -> prob=%f\n", dp,mvd,read_len,prob);
+ tot_prob += prob*dp;
+ weight += dp;
+ }
+ tot_prob = weight ? tot_prob/weight : 1;
+ //fprintf(stderr,"prob=%f\n", tot_prob);
+ call->vdb = tot_prob;
+}
+
int bcf_call_combine(int n, const bcf_callret1_t *calls, int ref_base /*4-bit*/, bcf_call_t *call)
{
int ref4, i, j, qsum[4];
call->ori_depth += calls[i].ori_depth;
for (j = 0; j < 16; ++j) call->anno[j] += calls[i].anno[j];
}
+
+ calc_vdb(n, calls, call);
+
return 0;
}
if (i) kputc(',', &s);
kputw(bc->anno[i], &s);
}
+ if ( bc->vdb!=1 )
+ {
+ ksprintf(&s, ";VDB=%.4f", bc->vdb);
+ }
kputc('\0', &s);
// FMT
kputs("PL", &s);
int depth, ori_depth, qsum[4];
int anno[16];
float p[25];
+ int mvd[3]; // mean variant distance, number of variant reads, average read length
} bcf_callret1_t;
typedef struct {
int n, n_alleles, shift, ori_ref, unseen;
int anno[16], depth, ori_depth;
uint8_t *PL;
+ float vdb; // variant distance bias
} bcf_call_t;
#ifdef __cplusplus
bam1_qname(b), last_coor, c->pos, c->tid+1);
return NULL;
}
- if (c->tid >= 0) insert_offset2(&idx->index2[b->core.tid], b, last_off);
+ if (c->tid >= 0 && !(c->flag & BAM_FUNMAP)) insert_offset2(&idx->index2[b->core.tid], b, last_off);
if (c->bin != last_bin) { // then possibly write the binning index
if (save_bin != 0xffffffffu) // save_bin==0xffffffffu only happens to the first record
insert_offset(idx->index[save_tid], save_bin, save_off, last_off);
if (em[5] >= 0 && em[6] >= 0) ksprintf(&s, ";AF2=%.4g,%.4g", 1 - em[5], 1 - em[6]);
if (em[7] >= 0) ksprintf(&s, ";LRT=%.3g", em[7]);
if (em[8] >= 0) ksprintf(&s, ";LRT2=%.3g", em[8]);
- //if (em[9] >= 0) ksprintf(&s, ";LRT1=%.3g", em[9]);
}
if (cons_llr > 0) {
ksprintf(&s, ";CLR=%d", cons_llr);
kputs("##INFO=<ID=QCHI2,Number=1,Type=Integer,Description=\"Phred scaled PCHI2.\">\n", &str);
if (!strstr(str.s, "##INFO=<ID=RP,"))
kputs("##INFO=<ID=PR,Number=1,Type=Integer,Description=\"# permutations yielding a smaller PCHI2.\">\n", &str);
+ if (!strstr(str.s, "##INFO=<ID=VDB,"))
+ kputs("##INFO=<ID=VDB,Number=1,Type=Float,Description=\"Variant Distance Bias\">\n", &str);
if (!strstr(str.s, "##FORMAT=<ID=GT,"))
kputs("##FORMAT=<ID=GT,Number=1,Type=String,Description=\"Genotype\">\n", &str);
if (!strstr(str.s, "##FORMAT=<ID=GQ,"))
if (!strstr(str.s, "##FORMAT=<ID=SP,"))
kputs("##FORMAT=<ID=SP,Number=1,Type=Integer,Description=\"Phred-scaled strand bias P-value\">\n", &str);
if (!strstr(str.s, "##FORMAT=<ID=PL,"))
- kputs("##FORMAT=<ID=PL,Number=-1,Type=Integer,Description=\"List of Phred-scaled genotype likelihoods, number of values is (#ALT+1)*(#ALT+2)/2\">\n", &str);
+ kputs("##FORMAT=<ID=PL,Number=G,Type=Integer,Description=\"List of Phred-scaled genotype likelihoods\">\n", &str);
h->l_txt = str.l + 1; h->txt = str.s;
}
// x[5..6]: group1 freq, group2 freq
// x[7]: 1-degree P-value
// x[8]: 2-degree P-value
-// x[9]: 1-degree P-value with freq estimated from genotypes
int bcf_em1(const bcf1_t *b, int n1, int flag, double x[10])
{
double *pdg;
x[6] = freqml(x[0], n1, n, pdg);
}
if ((flag & 1<<7) && n1 > 0 && n1 < n) { // 1-degree P-value
- double f[3], f3[3][3];
+ double f[3], f3[3][3], tmp;
f[0] = x[0]; f[1] = x[5]; f[2] = x[6];
for (i = 0; i < 3; ++i)
f3[i][0] = (1-f[i])*(1-f[i]), f3[i][1] = 2*f[i]*(1-f[i]), f3[i][2] = f[i]*f[i];
- x[7] = kf_gammaq(.5, log(lk_ratio_test(n, n1, pdg, f3)));
+ tmp = log(lk_ratio_test(n, n1, pdg, f3));
+ if (tmp < 0) tmp = 0;
+ x[7] = kf_gammaq(.5, tmp);
}
if ((flag & 3<<8) && n1 > 0 && n1 < n) { // 2-degree P-value
double g[3][3], tmp;
for (i = 0; i < ITER_MAX; ++i)
if (g3_iter(g[2], pdg, n1, n) < EPS) break;
tmp = log(lk_ratio_test(n, n1, pdg, g));
+ if (tmp < 0) tmp = 0;
x[8] = kf_gammaq(1., tmp);
- x[9] = kf_gammaq(.5, tmp);
}
// free
free(pdg);
double p = 1., Pr1 = 0.;
for (i = 0; i <= l_query + 1; ++i) {
p *= s[i];
- if (p < 1e-100) Pr += -4.343 * log(p), p = 1.;
+ if (p < 1e-100) Pr1 += -4.343 * log(p), p = 1.;
}
Pr1 += -4.343 * log(p * l_ref * l_query);
Pr = (int)(Pr1 + .499);
int bam_smpl_add(bam_sample_t *sm, const char *fn, const char *txt)
{
const char *p = txt, *q, *r;
- kstring_t buf;
+ kstring_t buf, first_sm;
int n = 0;
khash_t(sm) *sm2id = (khash_t(sm)*)sm->sm2id;
if (txt == 0) {
return 0;
}
memset(&buf, 0, sizeof(kstring_t));
+ memset(&first_sm, 0, sizeof(kstring_t));
while ((q = strstr(p, "@RG")) != 0) {
p = q + 3;
r = q = 0;
oq = *u; or = *v; *u = *v = '\0';
buf.l = 0; kputs(fn, &buf); kputc('/', &buf); kputs(q, &buf);
add_pair(sm, sm2id, buf.s, r);
+ if ( !first_sm.s )
+ kputs(r,&first_sm);
*u = oq; *v = or;
} else break;
p = q > r? q : r;
++n;
}
if (n == 0) add_pair(sm, sm2id, fn, fn);
+ // If there is only one RG tag present in the header and reads are not annotated, don't refuse to work but
+ // use the tag instead.
+ else if ( n==1 && first_sm.s )
+ add_pair(sm,sm2id,fn,first_sm.s);
+ if ( first_sm.s )
+ free(first_sm.s);
+
// add_pair(sm, sm2id, fn, fn);
free(buf.s);
return 0;