From 6418571502ba9c6bd4eb930c7ae0309f7f251b4e Mon Sep 17 00:00:00 2001 From: Heng Li Date: Wed, 1 Sep 2010 19:21:41 +0000 Subject: [PATCH] Write the correct ALT and PL in the SNP calling mode. --- bcftools/bcf.c | 14 ++++++++++---- bcftools/bcf.h | 1 + bcftools/bcfutils.c | 36 ++++++++++++++++++++++++++++++++++++ bcftools/call1.c | 17 ++++++++++------- 4 files changed, 57 insertions(+), 11 deletions(-) diff --git a/bcftools/bcf.c b/bcftools/bcf.c index 8801cea..19969eb 100644 --- a/bcftools/bcf.c +++ b/bcftools/bcf.c @@ -109,12 +109,18 @@ int bcf_sync(int n_smpl, bcf1_t *b) b->ref = b->alt = b->flt = b->info = b->fmt = 0; for (p = b->str, n = 0; p < b->str + b->l_str; ++p) if (*p == 0 && p+1 != b->str + b->l_str) tmp[n++] = p + 1; - if (n != 5) return -1; + if (n != 5) { + fprintf(stderr, "[bcf_sync] incorrect number of fields (%d != 5)\n", n); + return -1; + } b->ref = tmp[0]; b->alt = tmp[1]; b->flt = tmp[2]; b->info = tmp[3]; b->fmt = tmp[4]; // set n_alleles - for (p = b->alt, n = 1; *p; ++p) - if (*p == ',') ++n; - b->n_alleles = n + 1; + if (*b->alt == 0) b->n_alleles = 1; + else { + for (p = b->alt, n = 1; *p; ++p) + if (*p == ',') ++n; + b->n_alleles = n + 1; + } // set n_gi and gi[i].fmt for (p = b->fmt, n = 1; *p; ++p) if (*p == ':') ++n; diff --git a/bcftools/bcf.h b/bcftools/bcf.h index 8e4ecf9..e6ba225 100644 --- a/bcftools/bcf.h +++ b/bcftools/bcf.h @@ -65,6 +65,7 @@ extern "C" { int vcf_write(bcf_t *bp, bcf_hdr_t *h, bcf1_t *b); int vcf_read(bcf_t *bp, bcf_hdr_t *h, bcf1_t *b); + int bcf_shrink_alt(int n_smpl, bcf1_t *b, int n); void *bcf_build_refhash(bcf_hdr_t *h); void bcf_str2id_destroy(void *_hash); int bcf_str2id_add(void *_hash, const char *str); diff --git a/bcftools/bcfutils.c b/bcftools/bcfutils.c index 3010ef3..6908bc4 100644 --- a/bcftools/bcfutils.c +++ b/bcftools/bcfutils.c @@ -1,4 +1,5 @@ #include "bcf.h" +#include "kstring.h" #include "khash.h" KHASH_MAP_INIT_STR(str2id, int) @@ -46,3 +47,38 @@ int bcf_str2id_add(void *_hash, const char *str) kh_val(hash, k) = kh_size(hash) - 1; return kh_val(hash, k); } + +int bcf_shrink_alt(int n_smpl, bcf1_t *b, int n) +{ + char *p; + int i, j, k; + int *z; + if (b->n_alleles <= n) return -1; + if (n > 1) { + for (p = b->alt, k = 1; *p; ++p) + if (*p == ',' && ++k == n) break; + *p = '\0'; + } else p = b->alt, *p = '\0'; + ++p; + memmove(p, b->flt, b->str + b->l_str - b->flt); + b->l_str -= b->flt - p; + z = alloca(sizeof(int) / 2 * n * (n+1)); + for (i = k = 0; i < n; ++i) + for (j = 0; j < n - i; ++j) + z[k++] = i * b->n_alleles + j; + for (i = 0; i < b->n_gi; ++i) { + bcf_ginfo_t *g = b->gi + i; + if (g->fmt == bcf_str2int("PL", 2)) { + int l, x = b->n_alleles * (b->n_alleles + 1) / 2; + uint8_t *d = (uint8_t*)g->data; + g->len = n * (n + 1) / 2; + for (l = k = 0; l < n_smpl; ++l) { + uint8_t *dl = d + l * x; + for (j = 0; j < g->len; ++j) d[k++] = dl[z[j]]; + } + } // FIXME: to add GL + } + b->n_alleles = n; + bcf_sync(n_smpl, b); + return 0; +} diff --git a/bcftools/call1.c b/bcftools/call1.c index e8ecb51..1060513 100644 --- a/bcftools/call1.c +++ b/bcftools/call1.c @@ -21,6 +21,7 @@ KSTREAM_INIT(gzFile, gzread, 16384) #define VC_VCFIN 32 #define VC_UNCOMP 64 #define VC_HWE 128 +#define VC_KEEPALT 256 typedef struct { int flag, prior_type, n1; @@ -141,7 +142,7 @@ static void rm_info(int n_smpl, bcf1_t *b, const char *key) bcf_sync(n_smpl, b); } -static int update_bcf1(int n_smpl, bcf1_t *b, const bcf_p1aux_t *pa, const bcf_p1rst_t *pr, double pref) +static int update_bcf1(int n_smpl, bcf1_t *b, const bcf_p1aux_t *pa, const bcf_p1rst_t *pr, double pref, int flag) { kstring_t s; int is_var = (pr->p_ref < pref); @@ -154,10 +155,7 @@ static int update_bcf1(int n_smpl, bcf1_t *b, const bcf_p1aux_t *pa, const bcf_p memset(&s, 0, sizeof(kstring_t)); kputc('\0', &s); kputs(b->ref, &s); kputc('\0', &s); - if (is_var) { - kputs(b->alt, &s); - } - kputc('\0', &s); kputc('\0', &s); + kputs(b->alt, &s); kputc('\0', &s); kputc('\0', &s); kputs(b->info, &s); if (b->info[0]) kputc(';', &s); ksprintf(&s, "AF1=%.3lf;AFE=%.3lf", 1.-pr->f_em, 1.-pr->f_exp); @@ -175,6 +173,9 @@ static int update_bcf1(int n_smpl, bcf1_t *b, const bcf_p1aux_t *pa, const bcf_p b->qual = r < 1e-100? 99 : -3.434 * log(r); if (b->qual > 99) b->qual = 99; bcf_sync(n_smpl, b); + if (!is_var) bcf_shrink_alt(n_smpl, b, 1); + else if (!(flag&VC_KEEPALT)) + bcf_shrink_alt(n_smpl, b, pr->rank0 < 2? 2 : pr->rank0+1); return is_var; } @@ -194,12 +195,13 @@ int bcfview(int argc, char *argv[]) tid = begin = end = -1; memset(&vc, 0, sizeof(viewconf_t)); vc.prior_type = vc.n1 = -1; vc.theta = 1e-3; vc.pref = 0.9; - while ((c = getopt(argc, argv, "1:l:cHGvLbSuP:t:p:")) >= 0) { + while ((c = getopt(argc, argv, "1:l:cHAGvLbSuP:t:p:")) >= 0) { switch (c) { case '1': vc.n1 = atoi(optarg); break; case 'l': vc.fn_list = strdup(optarg); break; case 'G': vc.flag |= VC_NO_GENO; break; case 'L': vc.flag |= VC_NO_PL; break; + case 'A': vc.flag |= VC_KEEPALT; break; case 'b': vc.flag |= VC_BCFOUT; break; case 'S': vc.flag |= VC_VCFIN; break; case 'c': vc.flag |= VC_CALL; break; @@ -223,6 +225,7 @@ int bcfview(int argc, char *argv[]) fprintf(stderr, " -b output BCF instead of VCF\n"); fprintf(stderr, " -u uncompressed BCF output\n"); fprintf(stderr, " -S input is VCF\n"); + fprintf(stderr, " -A keep all possible alternate alleles at variant sites\n"); fprintf(stderr, " -G suppress all individual genotype information\n"); fprintf(stderr, " -L discard the PL genotype field\n"); fprintf(stderr, " -H perform Hardy-Weinberg test (slower)\n"); @@ -288,7 +291,7 @@ int bcfview(int argc, char *argv[]) if (vc.flag&VC_HWE) bcf_p1_cal_g3(p1, pr.g); if ((n_processed + 1) % 50000 == 0) bcf_p1_dump_afs(p1); if (pr.p_ref >= vc.pref && (vc.flag & VC_VARONLY)) continue; - update_bcf1(h->n_smpl, b, p1, &pr, vc.pref); + update_bcf1(h->n_smpl, b, p1, &pr, vc.pref, vc.flag); } if (vc.flag & VC_NO_GENO) { b->n_gi = 0; -- 2.39.2