From: Petr Danecek Date: Tue, 3 Jul 2012 12:03:26 +0000 (+0100) Subject: Bug fixes X-Git-Url: https://git.donarmstrong.com/?p=samtools.git;a=commitdiff_plain;h=74e6aebb5c07d14cdccc5f8abb40fd4a00b2be86 Bug fixes --- diff --git a/bam2bcf.c b/bam2bcf.c index ed78ea6..f69684d 100644 --- a/bam2bcf.c +++ b/bam2bcf.c @@ -196,7 +196,7 @@ int bcf_call_combine(int n, const bcf_callret1_t *calls, int ref_base /*4-bit*/, for (j = 0; j < 4; ++j) qsum[j] += calls[i].qsum[j]; int qsum_tot=0; - for (j=0; j<4; j++) qsum_tot += qsum[j]; + for (j=0; j<4; j++) { qsum_tot += qsum[j]; call->qsum[j] = 0; } for (j = 0; j < 4; ++j) qsum[j] = qsum[j] << 2 | j; // find the top 2 alleles for (i = 1; i < 4; ++i) // insertion sort @@ -210,7 +210,7 @@ int bcf_call_combine(int n, const bcf_callret1_t *calls, int ref_base /*4-bit*/, if ((qsum[i]&3) != ref4) { if (qsum[i]>>2 != 0) { - call->qsum[j] = (float)(qsum[i]>>2)/qsum_tot; + if ( j<4 ) call->qsum[j] = (float)(qsum[i]>>2)/qsum_tot; // ref N can make j>=4 call->a[j++] = qsum[i]&3; } else break; @@ -313,7 +313,7 @@ int bcf_call2bcf(int tid, int pos, bcf_call_t *bc, bcf1_t *b, bcf_callret1_t *bc } kputc('\0', &s); // INFO - if (bc->ori_ref < 0) kputs("INDEL;", &s); + if (bc->ori_ref < 0) ksprintf(&s,"INDEL;IS=%d,%f;", bca->max_support, bca->max_frac); kputs("DP=", &s); kputw(bc->ori_depth, &s); kputs(";I16=", &s); for (i = 0; i < 16; ++i) { if (i) kputc(',', &s); diff --git a/bam2bcf.h b/bam2bcf.h index e3ecc07..8ac6b79 100644 --- a/bam2bcf.h +++ b/bam2bcf.h @@ -14,8 +14,8 @@ typedef struct __bcf_callaux_t { int capQ, min_baseQ; int openQ, extQ, tandemQ; // for indels - int min_support; // for collecting indel candidates - double min_frac; // for collecting indel candidates + int min_support, max_support; // for collecting indel candidates + double min_frac, max_frac; // for collecting indel candidates int per_sample_flt; // indel filtering strategy // for internal uses int max_bases; diff --git a/bam2bcf_indel.c b/bam2bcf_indel.c index 87736c3..9aea5f9 100644 --- a/bam2bcf_indel.c +++ b/bam2bcf_indel.c @@ -142,6 +142,7 @@ int bcf_call_gap_prep(int n, int *n_plp, bam_pileup1_t **plp, int pos, bcf_calla if (s == n) return -1; // there is no indel at this position. for (s = N = 0; s < n; ++s) N += n_plp[s]; // N is the total number of reads { // find out how many types of indels are present + bca->max_support = bca->max_frac = 0; int m, n_alt = 0, n_tot = 0, indel_support_ok = 0; uint32_t *aux; aux = calloc(N + 1, 4); @@ -161,8 +162,10 @@ int bcf_call_gap_prep(int n, int *n_plp, bam_pileup1_t **plp, int pos, bcf_calla j = bam_cigar2qlen(&p->b->core, bam1_cigar(p->b)); if (j > max_rd_len) max_rd_len = j; } - if ( !indel_support_ok && na >= bca->min_support && (double)na/nt >= bca->min_frac ) + float frac = (float)na/nt; + if ( !indel_support_ok && na >= bca->min_support && frac >= bca->min_frac ) indel_support_ok = 1; + if ( na > bca->max_support && frac > 0 ) bca->max_support = na, bca->max_frac = frac; n_alt += na; n_tot += nt; } @@ -175,12 +178,12 @@ int bcf_call_gap_prep(int n, int *n_plp, bam_pileup1_t **plp, int pos, bcf_calla // squeeze out identical types for (i = 1, n_types = 1; i < m; ++i) if (aux[i] != aux[i-1]) ++n_types; - // Taking totals makes hard to call rare indels - if ( !bca->per_sample_flt ) - indel_support_ok = ( (double)n_alt / n_tot < bca->min_frac || n_alt < bca->min_support ) ? 0 : 1; - if ( n_types == 1 || !indel_support_ok ) { // then skip - free(aux); return -1; - } + // Taking totals makes it hard to call rare indels + if ( !bca->per_sample_flt ) + indel_support_ok = ( (float)n_alt / n_tot < bca->min_frac || n_alt < bca->min_support ) ? 0 : 1; + if ( n_types == 1 || !indel_support_ok ) { // then skip + free(aux); return -1; + } if (n_types >= 64) { free(aux); if (bam_verbose >= 2) diff --git a/bcftools/bcf.c b/bcftools/bcf.c index e3daaeb..87ff6c7 100644 --- a/bcftools/bcf.c +++ b/bcftools/bcf.c @@ -323,6 +323,31 @@ int bcf_append_info(bcf1_t *b, const char *info, int l) return 0; } +int remove_tag(char *str, char *tag, char delim) +{ + char *tmp = str, *p; + int len_diff = 0, ori_len = strlen(str); + while ( *tmp && (p = strstr(tmp,tag)) ) + { + if ( p>str ) + { + if ( *(p-1)!=delim ) { tmp=p+1; continue; } // shared substring + p--; + } + char *q=p+1; + while ( *q && *q!=delim ) q++; + if ( p==str && *q ) q++; + len_diff += q-p; + if ( ! *q ) { *p = 0; break; } + else + memmove(p,q,ori_len-(int)(q-p)+1); + } + if ( len_diff==ori_len ) + str[0]='.', str[1]=0, len_diff--; + + return len_diff; +} + int bcf_cpy(bcf1_t *r, const bcf1_t *b) { char *t1 = r->str; diff --git a/bcftools/bcf.h b/bcftools/bcf.h index 4fe3ed7..7e451a6 100644 --- a/bcftools/bcf.h +++ b/bcftools/bcf.h @@ -123,6 +123,8 @@ extern "C" { char *bcf_fmt(const bcf_hdr_t *h, bcf1_t *b); // append more info int bcf_append_info(bcf1_t *b, const char *info, int l); + // remove tag + int remove_tag(char *string, char *tag, char delim); // copy int bcf_cpy(bcf1_t *r, const bcf1_t *b); diff --git a/bcftools/bcfutils.c b/bcftools/bcfutils.c index 7988e58..1321c13 100644 --- a/bcftools/bcfutils.c +++ b/bcftools/bcfutils.c @@ -77,12 +77,6 @@ void bcf_fit_alt(bcf1_t *b, int mask) if ( b->n_alleles <= nals ) return; -#if DBG - printf("fit_alt: %s, %s, mask=%d, nals=%d: ", b->ref, b->alt, mask, nals); - for (i=0; i1 ) @@ -118,9 +112,6 @@ void bcf_fit_alt(bcf1_t *b, int mask) p = dst; } else p = b->alt, *p = '\0'; -#if DBG - printf("fit_alt: %s, mask=%d\n", b->alt, mask); -#endif p++; memmove(p, b->flt, b->str + b->l_str - b->flt); b->l_str -= b->flt - p; @@ -180,6 +171,7 @@ void bcf_fit_alt(bcf1_t *b, int mask) } free(map); b->n_alleles = nals; + bcf_sync(b); } int bcf_shrink_alt(bcf1_t *b, int n) diff --git a/bcftools/call1.c b/bcftools/call1.c index 13f5533..415426a 100644 --- a/bcftools/call1.c +++ b/bcftools/call1.c @@ -250,6 +250,12 @@ static void write_header(bcf_hdr_t *h) kputs("##INFO=\n", &str); if (!strstr(str.s, "##INFO=\n", &str); + if (!strstr(str.s, "##INFO=\n", &str); + if (!strstr(str.s, "##INFO=\n", &str); + if (!strstr(str.s, "##INFO=\n", &str); if (!strstr(str.s, "##INFO=\n", &str); if (!strstr(str.s, "##INFO==0 ) { int gts = call_multiallelic_gt(b,p1,vc.min_ma_lrt); - if ( gts==0 && vc.flag & VC_VARONLY ) continue; + if ( gts<=1 && vc.flag & VC_VARONLY ) continue; } else if (vc.flag & VC_CALL) { // call variants bcf_p1rst_t pr; diff --git a/bcftools/prob1.c b/bcftools/prob1.c index 2a9c036..bc2aff8 100644 --- a/bcftools/prob1.c +++ b/bcftools/prob1.c @@ -215,7 +215,12 @@ int call_multiallelic_gt(bcf1_t *b, bcf_p1aux_t *ma, double threshold) if ( nals==1 ) return 1; - if ( nals>4 ) { fprintf(stderr,"too many alts: %d\n", nals); exit(1); } + if ( nals>4 ) + { + if ( *b->ref=='N' ) return 0; + fprintf(stderr,"Not ready for this, more than 4 alleles at %d: %s, %s\n", b->pos+1, b->ref,b->alt); + exit(1); + } // set PL and PL_len uint8_t *pl = NULL; @@ -329,6 +334,7 @@ int call_multiallelic_gt(bcf1_t *b, bcf_p1aux_t *ma, double threshold) if ( max_lklk_sum ? lk_tot + log(1+exp(lk_sum-lk_tot)) : lk_sum + log(1+exp(lk_tot-lk_sum)); + //printf(" %f\n", lk_sum); } } } @@ -342,27 +348,16 @@ int call_multiallelic_gt(bcf1_t *b, bcf_p1aux_t *ma, double threshold) max_als = max_als2; } - // Get the BCF record ready for output + // Get the BCF record ready for GT and GQ kstring_t s; - memset(&s, 0, sizeof(kstring_t)); - kputc('\0', &s); kputs(b->ref, &s); kputc('\0', &s); - kputs(b->alt, &s); kputc('\0', &s); kputc('\0', &s); - kputs(b->info, &s); if (b->info[0]) kputc(';', &s); kputc('\0', &s); - kputs(b->fmt, &s); kputc('\0', &s); - free(b->str); - b->m_str = s.m; b->l_str = s.l; b->str = s.s; - b->qual = -4.343*(log(1-exp(max_lk-lk_sum))); - if ( b->qual>999 ) b->qual = 999; - //bcf_sync(b); - - int x, old_n_gi = b->n_gi; + int old_n_gi = b->n_gi; s.m = b->m_str; s.l = b->l_str - 1; s.s = b->str; kputs(":GT:GQ", &s); kputc('\0', &s); b->m_str = s.m; b->l_str = s.l; b->str = s.s; bcf_sync(b); - // Call GT - int isample, gts=0; + // Call GTs + int isample, gts=0, ac[4] = {0,0,0,0}; for (isample = 0; isample < b->n_smpl; isample++) { int ploidy = b->ploidy ? b->ploidy[isample] : 2; @@ -396,17 +391,59 @@ int call_multiallelic_gt(bcf1_t *b, bcf_p1aux_t *ma, double threshold) lk = -log(1-lk/lk_sum)/0.2302585; if ( idp>=0 && ((uint16_t*)b->gi[idp].data)[isample]==0 ) { - als |= 1<<7; - lk = 0; + ((uint8_t*)b->gi[old_n_gi].data)[isample] = als | 1<<7; + ((uint8_t*)b->gi[old_n_gi+1].data)[isample] = 0; + continue; } - ((uint8_t*)b->gi[old_n_gi].data)[isample] = als; + ((uint8_t*)b->gi[old_n_gi].data)[isample] = als; ((uint8_t*)b->gi[old_n_gi+1].data)[isample] = lk<100 ? (int)lk : 99; - gts |= (als>>3&7) | (als&7); + gts |= 1<<(als>>3&7) | 1<<(als&7); + ac[ als>>3&7 ]++; + ac[ als&7 ]++; } bcf_fit_alt(b,max_als); + + // prepare ref, alt, filter, info, format + memset(&s, 0, sizeof(kstring_t)); kputc('\0', &s); + kputs(b->ref, &s); kputc('\0', &s); + kputs(b->alt, &s); kputc('\0', &s); kputc('\0', &s); + { + kstring_t info; memset(&info, 0, sizeof(kstring_t)); + int an=0, nalts=0; + for (i=0; i0 && ac[i] ) nalts++; + } + ksprintf(&info, "AN=%d;", an); + if ( nalts ) + { + kputs("AC=", &info); + for (i=1; i0 ) kputc(',', &info); + } + kputc(';', &info); + } + kputs(b->info, &info); + info.l -= remove_tag(info.s, "I16=", ';'); + info.l -= remove_tag(info.s, "QS=", ';'); + kputc('\0', &info); + kputs(info.s, &s); kputc('\0', &s); + free(info.s); + } + kputs(b->fmt, &s); kputc('\0', &s); + free(b->str); + b->m_str = s.m; b->l_str = s.l; b->str = s.s; + b->qual = -4.343*(log(1-exp(max_lk-lk_sum))); + if ( b->qual>999 ) b->qual = 999; bcf_sync(b); + free(pdg); return gts; }