From cb5f44947629a68d16a48d32eef6e42a66e482fe Mon Sep 17 00:00:00 2001 From: Heng Li Date: Wed, 17 Aug 2011 18:26:22 +0000 Subject: [PATCH] * bugfix in index: large memory when a read pos is 1 * bugfix in BAQ: incorrect probability (rare) * compute variant distance bias (commited by Petr) * bugfix for computing LRT2: LRT2=nan --- bam2bcf.c | 91 ++++++++++++++++++++++++++++++++++++++++++++++++ bam2bcf.h | 2 ++ bam_index.c | 2 +- bcftools/call1.c | 5 +-- bcftools/em.c | 9 ++--- kprobaln.c | 2 +- sample.c | 12 ++++++- 7 files changed, 114 insertions(+), 9 deletions(-) diff --git a/bam2bcf.c b/bam2bcf.c index f263fad..dec3305 100644 --- a/bam2bcf.c +++ b/bam2bcf.c @@ -39,6 +39,7 @@ void bcf_call_destroy(bcf_callaux_t *bca) * negative if we are looking at an indel. */ int bcf_call_glfgen(int _n, const bam_pileup1_t *pl, int ref_base, bcf_callaux_t *bca, bcf_callret1_t *r) { + static int *var_pos = NULL, nvar_pos = 0; int i, n, ref4, is_indel, ori_depth = 0; memset(r, 0, sizeof(bcf_callret1_t)); if (ref_base >= 0) { @@ -94,9 +95,92 @@ int bcf_call_glfgen(int _n, const bam_pileup1_t *pl, int ref_base, bcf_callaux_t r->depth = n; r->ori_depth = ori_depth; // glfgen errmod_cal(bca->e, n, 5, bca->bases, r->p); + + // Calculate the Variant Distance Bias (make it optional?) + if ( nvar_pos < _n ) { + nvar_pos = _n; + var_pos = realloc(var_pos,sizeof(int)*nvar_pos); + } + int alt_dp=0, read_len=0; + for (i=0; i<_n; i++) { + const bam_pileup1_t *p = pl + i; + if ( bam1_seqi(bam1_seq(p->b),p->qpos) == ref_base ) + continue; + + var_pos[alt_dp] = p->qpos; + if ( (bam1_cigar(p->b)[0]&BAM_CIGAR_MASK)==4 ) + var_pos[alt_dp] -= bam1_cigar(p->b)[0]>>BAM_CIGAR_SHIFT; + + alt_dp++; + read_len += p->b->core.l_qseq; + } + float mvd=0; + int j; + n=0; + for (i=0; imvd[0] = n ? mvd/n : 0; + r->mvd[1] = alt_dp; + r->mvd[2] = alt_dp ? read_len/alt_dp : 0; + return r->depth; } + +void calc_vdb(int n, const bcf_callret1_t *calls, bcf_call_t *call) +{ + // Variant distance bias. Samples merged by means of DP-weighted average. + + float weight=0, tot_prob=0; + + int i; + for (i=0; i2*mu ? 0 : sin(mvd*3.14/2/mu) / (4*mu/3.14); + } + else + { + // Scaled gaussian curve, crude approximation, but behaves well. Using fixed depth for bigger depths. + if ( dp>5 ) + dp = 5; + float sigma2 = (read_len/1.9/(dp+1)) * (read_len/1.9/(dp+1)); + float norm = 1.125*sqrt(2*3.14*sigma2); + float mu = read_len/2.9; + if ( mvd < mu ) + prob = exp(-(mvd-mu)*(mvd-mu)/2/sigma2)/norm; + else + prob = exp(-(mvd-mu)*(mvd-mu)/3.125/sigma2)/norm; + } + + //fprintf(stderr,"dp=%d mvd=%d read_len=%d -> prob=%f\n", dp,mvd,read_len,prob); + tot_prob += prob*dp; + weight += dp; + } + tot_prob = weight ? tot_prob/weight : 1; + //fprintf(stderr,"prob=%f\n", tot_prob); + call->vdb = tot_prob; +} + int bcf_call_combine(int n, const bcf_callret1_t *calls, int ref_base /*4-bit*/, bcf_call_t *call) { int ref4, i, j, qsum[4]; @@ -170,6 +254,9 @@ int bcf_call_combine(int n, const bcf_callret1_t *calls, int ref_base /*4-bit*/, call->ori_depth += calls[i].ori_depth; for (j = 0; j < 16; ++j) call->anno[j] += calls[i].anno[j]; } + + calc_vdb(n, calls, call); + return 0; } @@ -223,6 +310,10 @@ int bcf_call2bcf(int tid, int pos, bcf_call_t *bc, bcf1_t *b, bcf_callret1_t *bc if (i) kputc(',', &s); kputw(bc->anno[i], &s); } + if ( bc->vdb!=1 ) + { + ksprintf(&s, ";VDB=%.4f", bc->vdb); + } kputc('\0', &s); // FMT kputs("PL", &s); diff --git a/bam2bcf.h b/bam2bcf.h index 9585672..4af080c 100644 --- a/bam2bcf.h +++ b/bam2bcf.h @@ -26,6 +26,7 @@ typedef struct { int depth, ori_depth, qsum[4]; int anno[16]; float p[25]; + int mvd[3]; // mean variant distance, number of variant reads, average read length } bcf_callret1_t; typedef struct { @@ -33,6 +34,7 @@ typedef struct { int n, n_alleles, shift, ori_ref, unseen; int anno[16], depth, ori_depth; uint8_t *PL; + float vdb; // variant distance bias } bcf_call_t; #ifdef __cplusplus diff --git a/bam_index.c b/bam_index.c index 66d8eb8..9610a26 100644 --- a/bam_index.c +++ b/bam_index.c @@ -188,7 +188,7 @@ bam_index_t *bam_index_core(bamFile fp) bam1_qname(b), last_coor, c->pos, c->tid+1); return NULL; } - if (c->tid >= 0) insert_offset2(&idx->index2[b->core.tid], b, last_off); + if (c->tid >= 0 && !(c->flag & BAM_FUNMAP)) insert_offset2(&idx->index2[b->core.tid], b, last_off); if (c->bin != last_bin) { // then possibly write the binning index if (save_bin != 0xffffffffu) // save_bin==0xffffffffu only happens to the first record insert_offset(idx->index[save_tid], save_bin, save_off, last_off); diff --git a/bcftools/call1.c b/bcftools/call1.c index b0baee2..3cc4649 100644 --- a/bcftools/call1.c +++ b/bcftools/call1.c @@ -131,7 +131,6 @@ static int update_bcf1(bcf1_t *b, const bcf_p1aux_t *pa, const bcf_p1rst_t *pr, if (em[5] >= 0 && em[6] >= 0) ksprintf(&s, ";AF2=%.4g,%.4g", 1 - em[5], 1 - em[6]); if (em[7] >= 0) ksprintf(&s, ";LRT=%.3g", em[7]); if (em[8] >= 0) ksprintf(&s, ";LRT2=%.3g", em[8]); - //if (em[9] >= 0) ksprintf(&s, ";LRT1=%.3g", em[9]); } if (cons_llr > 0) { ksprintf(&s, ";CLR=%d", cons_llr); @@ -274,6 +273,8 @@ static void write_header(bcf_hdr_t *h) kputs("##INFO=\n", &str); if (!strstr(str.s, "##INFO=\n", &str); + if (!strstr(str.s, "##INFO=\n", &str); if (!strstr(str.s, "##FORMAT=\n", &str); if (!strstr(str.s, "##FORMAT=\n", &str); if (!strstr(str.s, "##FORMAT=\n", &str); + kputs("##FORMAT=\n", &str); h->l_txt = str.l + 1; h->txt = str.s; } diff --git a/bcftools/em.c b/bcftools/em.c index 32b400f..b7dfe1a 100644 --- a/bcftools/em.c +++ b/bcftools/em.c @@ -168,7 +168,6 @@ static double lk_ratio_test(int n, int n1, const double *pdg, double f3[3][3]) // x[5..6]: group1 freq, group2 freq // x[7]: 1-degree P-value // x[8]: 2-degree P-value -// x[9]: 1-degree P-value with freq estimated from genotypes int bcf_em1(const bcf1_t *b, int n1, int flag, double x[10]) { double *pdg; @@ -208,11 +207,13 @@ int bcf_em1(const bcf1_t *b, int n1, int flag, double x[10]) x[6] = freqml(x[0], n1, n, pdg); } if ((flag & 1<<7) && n1 > 0 && n1 < n) { // 1-degree P-value - double f[3], f3[3][3]; + double f[3], f3[3][3], tmp; f[0] = x[0]; f[1] = x[5]; f[2] = x[6]; for (i = 0; i < 3; ++i) f3[i][0] = (1-f[i])*(1-f[i]), f3[i][1] = 2*f[i]*(1-f[i]), f3[i][2] = f[i]*f[i]; - x[7] = kf_gammaq(.5, log(lk_ratio_test(n, n1, pdg, f3))); + tmp = log(lk_ratio_test(n, n1, pdg, f3)); + if (tmp < 0) tmp = 0; + x[7] = kf_gammaq(.5, tmp); } if ((flag & 3<<8) && n1 > 0 && n1 < n) { // 2-degree P-value double g[3][3], tmp; @@ -222,8 +223,8 @@ int bcf_em1(const bcf1_t *b, int n1, int flag, double x[10]) for (i = 0; i < ITER_MAX; ++i) if (g3_iter(g[2], pdg, n1, n) < EPS) break; tmp = log(lk_ratio_test(n, n1, pdg, g)); + if (tmp < 0) tmp = 0; x[8] = kf_gammaq(1., tmp); - x[9] = kf_gammaq(.5, tmp); } // free free(pdg); diff --git a/kprobaln.c b/kprobaln.c index 5201c1a..894a2ae 100644 --- a/kprobaln.c +++ b/kprobaln.c @@ -161,7 +161,7 @@ int kpa_glocal(const uint8_t *_ref, int l_ref, const uint8_t *_query, int l_quer double p = 1., Pr1 = 0.; for (i = 0; i <= l_query + 1; ++i) { p *= s[i]; - if (p < 1e-100) Pr += -4.343 * log(p), p = 1.; + if (p < 1e-100) Pr1 += -4.343 * log(p), p = 1.; } Pr1 += -4.343 * log(p * l_ref * l_query); Pr = (int)(Pr1 + .499); diff --git a/sample.c b/sample.c index 4e4b8a4..830b9d1 100644 --- a/sample.c +++ b/sample.c @@ -52,7 +52,7 @@ static void add_pair(bam_sample_t *sm, khash_t(sm) *sm2id, const char *key, cons int bam_smpl_add(bam_sample_t *sm, const char *fn, const char *txt) { const char *p = txt, *q, *r; - kstring_t buf; + kstring_t buf, first_sm; int n = 0; khash_t(sm) *sm2id = (khash_t(sm)*)sm->sm2id; if (txt == 0) { @@ -60,6 +60,7 @@ int bam_smpl_add(bam_sample_t *sm, const char *fn, const char *txt) return 0; } memset(&buf, 0, sizeof(kstring_t)); + memset(&first_sm, 0, sizeof(kstring_t)); while ((q = strstr(p, "@RG")) != 0) { p = q + 3; r = q = 0; @@ -73,12 +74,21 @@ int bam_smpl_add(bam_sample_t *sm, const char *fn, const char *txt) oq = *u; or = *v; *u = *v = '\0'; buf.l = 0; kputs(fn, &buf); kputc('/', &buf); kputs(q, &buf); add_pair(sm, sm2id, buf.s, r); + if ( !first_sm.s ) + kputs(r,&first_sm); *u = oq; *v = or; } else break; p = q > r? q : r; ++n; } if (n == 0) add_pair(sm, sm2id, fn, fn); + // If there is only one RG tag present in the header and reads are not annotated, don't refuse to work but + // use the tag instead. + else if ( n==1 && first_sm.s ) + add_pair(sm,sm2id,fn,first_sm.s); + if ( first_sm.s ) + free(first_sm.s); + // add_pair(sm, sm2id, fn, fn); free(buf.s); return 0; -- 2.39.2