b->ref = b->alt = b->flt = b->info = b->fmt = 0;
for (p = b->str, n = 0; p < b->str + b->l_str; ++p)
if (*p == 0 && p+1 != b->str + b->l_str) tmp[n++] = p + 1;
- if (n != 5) return -1;
+ if (n != 5) {
+ fprintf(stderr, "[bcf_sync] incorrect number of fields (%d != 5)\n", n);
+ return -1;
+ }
b->ref = tmp[0]; b->alt = tmp[1]; b->flt = tmp[2]; b->info = tmp[3]; b->fmt = tmp[4];
// set n_alleles
- for (p = b->alt, n = 1; *p; ++p)
- if (*p == ',') ++n;
- b->n_alleles = n + 1;
+ if (*b->alt == 0) b->n_alleles = 1;
+ else {
+ for (p = b->alt, n = 1; *p; ++p)
+ if (*p == ',') ++n;
+ b->n_alleles = n + 1;
+ }
// set n_gi and gi[i].fmt
for (p = b->fmt, n = 1; *p; ++p)
if (*p == ':') ++n;
#include "bcf.h"
+#include "kstring.h"
#include "khash.h"
KHASH_MAP_INIT_STR(str2id, int)
kh_val(hash, k) = kh_size(hash) - 1;
return kh_val(hash, k);
}
+
+int bcf_shrink_alt(int n_smpl, bcf1_t *b, int n)
+{
+ char *p;
+ int i, j, k;
+ int *z;
+ if (b->n_alleles <= n) return -1;
+ if (n > 1) {
+ for (p = b->alt, k = 1; *p; ++p)
+ if (*p == ',' && ++k == n) break;
+ *p = '\0';
+ } else p = b->alt, *p = '\0';
+ ++p;
+ memmove(p, b->flt, b->str + b->l_str - b->flt);
+ b->l_str -= b->flt - p;
+ z = alloca(sizeof(int) / 2 * n * (n+1));
+ for (i = k = 0; i < n; ++i)
+ for (j = 0; j < n - i; ++j)
+ z[k++] = i * b->n_alleles + j;
+ for (i = 0; i < b->n_gi; ++i) {
+ bcf_ginfo_t *g = b->gi + i;
+ if (g->fmt == bcf_str2int("PL", 2)) {
+ int l, x = b->n_alleles * (b->n_alleles + 1) / 2;
+ uint8_t *d = (uint8_t*)g->data;
+ g->len = n * (n + 1) / 2;
+ for (l = k = 0; l < n_smpl; ++l) {
+ uint8_t *dl = d + l * x;
+ for (j = 0; j < g->len; ++j) d[k++] = dl[z[j]];
+ }
+ } // FIXME: to add GL
+ }
+ b->n_alleles = n;
+ bcf_sync(n_smpl, b);
+ return 0;
+}
#define VC_VCFIN 32
#define VC_UNCOMP 64
#define VC_HWE 128
+#define VC_KEEPALT 256
typedef struct {
int flag, prior_type, n1;
bcf_sync(n_smpl, b);
}
-static int update_bcf1(int n_smpl, bcf1_t *b, const bcf_p1aux_t *pa, const bcf_p1rst_t *pr, double pref)
+static int update_bcf1(int n_smpl, bcf1_t *b, const bcf_p1aux_t *pa, const bcf_p1rst_t *pr, double pref, int flag)
{
kstring_t s;
int is_var = (pr->p_ref < pref);
memset(&s, 0, sizeof(kstring_t));
kputc('\0', &s); kputs(b->ref, &s); kputc('\0', &s);
- if (is_var) {
- kputs(b->alt, &s);
- }
- kputc('\0', &s); kputc('\0', &s);
+ kputs(b->alt, &s); kputc('\0', &s); kputc('\0', &s);
kputs(b->info, &s);
if (b->info[0]) kputc(';', &s);
ksprintf(&s, "AF1=%.3lf;AFE=%.3lf", 1.-pr->f_em, 1.-pr->f_exp);
b->qual = r < 1e-100? 99 : -3.434 * log(r);
if (b->qual > 99) b->qual = 99;
bcf_sync(n_smpl, b);
+ if (!is_var) bcf_shrink_alt(n_smpl, b, 1);
+ else if (!(flag&VC_KEEPALT))
+ bcf_shrink_alt(n_smpl, b, pr->rank0 < 2? 2 : pr->rank0+1);
return is_var;
}
tid = begin = end = -1;
memset(&vc, 0, sizeof(viewconf_t));
vc.prior_type = vc.n1 = -1; vc.theta = 1e-3; vc.pref = 0.9;
- while ((c = getopt(argc, argv, "1:l:cHGvLbSuP:t:p:")) >= 0) {
+ while ((c = getopt(argc, argv, "1:l:cHAGvLbSuP:t:p:")) >= 0) {
switch (c) {
case '1': vc.n1 = atoi(optarg); break;
case 'l': vc.fn_list = strdup(optarg); break;
case 'G': vc.flag |= VC_NO_GENO; break;
case 'L': vc.flag |= VC_NO_PL; break;
+ case 'A': vc.flag |= VC_KEEPALT; break;
case 'b': vc.flag |= VC_BCFOUT; break;
case 'S': vc.flag |= VC_VCFIN; break;
case 'c': vc.flag |= VC_CALL; break;
fprintf(stderr, " -b output BCF instead of VCF\n");
fprintf(stderr, " -u uncompressed BCF output\n");
fprintf(stderr, " -S input is VCF\n");
+ fprintf(stderr, " -A keep all possible alternate alleles at variant sites\n");
fprintf(stderr, " -G suppress all individual genotype information\n");
fprintf(stderr, " -L discard the PL genotype field\n");
fprintf(stderr, " -H perform Hardy-Weinberg test (slower)\n");
if (vc.flag&VC_HWE) bcf_p1_cal_g3(p1, pr.g);
if ((n_processed + 1) % 50000 == 0) bcf_p1_dump_afs(p1);
if (pr.p_ref >= vc.pref && (vc.flag & VC_VARONLY)) continue;
- update_bcf1(h->n_smpl, b, p1, &pr, vc.pref);
+ update_bcf1(h->n_smpl, b, p1, &pr, vc.pref, vc.flag);
}
if (vc.flag & VC_NO_GENO) {
b->n_gi = 0;