From 39eb14826a5ef0de7f5982390e7e2b0d671848ee Mon Sep 17 00:00:00 2001 From: Heng Li Date: Wed, 1 Sep 2010 22:31:17 +0000 Subject: [PATCH] * fixed bugs in parsing VCF * parser now works with GT/GQ/DP/PL/GL --- bcftools/bcf.c | 20 ++++++++++++++++---- bcftools/vcf.c | 42 ++++++++++++++++++++++++++++++++++++------ 2 files changed, 52 insertions(+), 10 deletions(-) diff --git a/bcftools/bcf.c b/bcftools/bcf.c index 19969eb..3393aad 100644 --- a/bcftools/bcf.c +++ b/bcftools/bcf.c @@ -143,7 +143,7 @@ int bcf_sync(int n_smpl, bcf1_t *b) } else if (b->gi[i].fmt == bcf_str2int("GQ", 2) || b->gi[i].fmt == bcf_str2int("GT", 2)) { b->gi[i].len = 1; } else if (b->gi[i].fmt == bcf_str2int("GL", 2)) { - b->gi[i].len = 4; + b->gi[i].len = b->n_alleles * (b->n_alleles + 1) / 2 * 4; } b->gi[i].data = realloc(b->gi[i].data, n_smpl * b->gi[i].len); } @@ -245,9 +245,21 @@ void bcf_fmt_core(const bcf_hdr_t *h, bcf1_t *b, kstring_t *s) kputw(((uint8_t*)b->gi[i].data)[j], s); } else if (b->gi[i].fmt == bcf_str2int("GT", 2)) { int y = ((uint8_t*)b->gi[i].data)[j]; - kputc('0' + (y>>3&7), s); - kputc("/|"[y>>6&1], s); - kputc('0' + (y&7), s); + if (y>>7&1) { + kputsn("./.", 3, s); + } else { + kputc('0' + (y>>3&7), s); + kputc("/|"[y>>6&1], s); + kputc('0' + (y&7), s); + } + } else if (b->gi[i].fmt == bcf_str2int("GL", 2)) { + float *d = (float*)b->gi[i].data + j * x; + int k; + //printf("- %lx\n", d); + for (k = 0; k < x; ++k) { + if (k > 0) kputc(',', s); + ksprintf(s, "%.2f", d[k]); + } } } } diff --git a/bcftools/vcf.c b/bcftools/vcf.c index 81c7061..23aad3c 100644 --- a/bcftools/vcf.c +++ b/bcftools/vcf.c @@ -150,19 +150,38 @@ int vcf_read(bcf_t *bp, bcf_hdr_t *h, bcf1_t *b) } b->tid = tid; } else if (k == 1) { // pos - b->pos = atoi(p); + b->pos = atoi(p) - 1; } else if (k == 5) { // qual b->qual = (p[0] >= '0' && p[0] <= '9')? atoi(p) : 0; } else if (k <= 8) { // variable length strings kputs(p, &str); kputc('\0', &str); b->l_str = str.l; b->m_str = str.m; b->str = str.s; if (k == 8) bcf_sync(h->n_smpl, b); - } else { + } else { // k > 9 + if (strncmp(p, "./.", 3) == 0) { + for (i = 0; i < b->n_gi; ++i) { + if (b->gi[i].fmt == bcf_str2int("GT", 2)) { + ((uint8_t*)b->gi[i].data)[k-9] = 1<<7; + } else if (b->gi[i].fmt == bcf_str2int("GQ", 2)) { + ((uint8_t*)b->gi[i].data)[k-9] = 0; + } else if (b->gi[i].fmt == bcf_str2int("DP", 2)) { + ((uint16_t*)b->gi[i].data)[k-9] = 0; + } else if (b->gi[i].fmt == bcf_str2int("PL", 2)) { + int y = b->n_alleles * (b->n_alleles + 1) / 2; + memset((uint8_t*)b->gi[i].data + (k - 9) * y, 0, y); + } else if (b->gi[i].fmt == bcf_str2int("GL", 2)) { + int y = b->n_alleles * (b->n_alleles + 1) / 2; + memset((float*)b->gi[i].data + (k - 9) * y, 0, y * 4); + } + } + goto endblock; + } for (q = kstrtok(p, ":", &a2), i = 0; q && i < b->n_gi; q = kstrtok(0, 0, &a2), ++i) { if (b->gi[i].fmt == bcf_str2int("GT", 2)) { ((uint8_t*)b->gi[i].data)[k-9] = (q[0] - '0')<<3 | (q[2] - '0') | (q[1] == '/'? 0 : 1) << 6; } else if (b->gi[i].fmt == bcf_str2int("GQ", 2)) { - int x = strtol(q, &q, 10); + double _x = strtod(q, &q); + int x = (int)(_x + .499); if (x > 255) x = 255; ((uint8_t*)b->gi[i].data)[k-9] = x; } else if (b->gi[i].fmt == bcf_str2int("DP", 2)) { @@ -170,16 +189,27 @@ int vcf_read(bcf_t *bp, bcf_hdr_t *h, bcf1_t *b) if (x > 0xffff) x = 0xffff; ((uint16_t*)b->gi[i].data)[k-9] = x; } else if (b->gi[i].fmt == bcf_str2int("PL", 2)) { - int x, j; + int x, y, j; uint8_t *data = (uint8_t*)b->gi[i].data; - for (j = 0; j < b->gi[i].len; ++j) { + y = b->n_alleles * (b->n_alleles + 1) / 2; + for (j = 0; j < y; ++j) { x = strtol(q, &q, 10); if (x > 255) x = 255; - data[i * b->gi[i].len + j] = x; + data[(k-9) * y + j] = x; + ++q; + } + } else if (b->gi[i].fmt == bcf_str2int("GL", 2)) { + int j, y; + float x, *data = (float*)b->gi[i].data; + y = b->n_alleles * (b->n_alleles + 1) / 2; + for (j = 0; j < y; ++j) { + x = strtod(q, &q); + data[(k-9) * y + j] = x; ++q; } } } + endblock: i = i; } } h->l_nm = rn.l; h->name = rn.s; -- 2.39.5