]> git.donarmstrong.com Git - samtools.git/commitdiff
* fixed bugs in parsing VCF
authorHeng Li <lh3@live.co.uk>
Wed, 1 Sep 2010 22:31:17 +0000 (22:31 +0000)
committerHeng Li <lh3@live.co.uk>
Wed, 1 Sep 2010 22:31:17 +0000 (22:31 +0000)
 * parser now works with GT/GQ/DP/PL/GL

bcftools/bcf.c
bcftools/vcf.c

index 19969eb5d4a4f15ad0e4e9f2486123794e8b7410..3393aad1edef67c24c548dbc290b15ac02c14caf 100644 (file)
@@ -143,7 +143,7 @@ int bcf_sync(int n_smpl, bcf1_t *b)
                } else if (b->gi[i].fmt == bcf_str2int("GQ", 2) || b->gi[i].fmt == bcf_str2int("GT", 2)) {
                        b->gi[i].len = 1;
                } else if (b->gi[i].fmt == bcf_str2int("GL", 2)) {
-                       b->gi[i].len = 4;
+                       b->gi[i].len = b->n_alleles * (b->n_alleles + 1) / 2 * 4;
                }
                b->gi[i].data = realloc(b->gi[i].data, n_smpl * b->gi[i].len);
        }
@@ -245,9 +245,21 @@ void bcf_fmt_core(const bcf_hdr_t *h, bcf1_t *b, kstring_t *s)
                                kputw(((uint8_t*)b->gi[i].data)[j], s);
                        } else if (b->gi[i].fmt == bcf_str2int("GT", 2)) {
                                int y = ((uint8_t*)b->gi[i].data)[j];
-                               kputc('0' + (y>>3&7), s);
-                               kputc("/|"[y>>6&1], s);
-                               kputc('0' + (y&7), s);
+                               if (y>>7&1) {
+                                       kputsn("./.", 3, s);
+                               } else {
+                                       kputc('0' + (y>>3&7), s);
+                                       kputc("/|"[y>>6&1], s);
+                                       kputc('0' + (y&7), s);
+                               }
+                       } else if (b->gi[i].fmt == bcf_str2int("GL", 2)) {
+                               float *d = (float*)b->gi[i].data + j * x;
+                               int k;
+                               //printf("- %lx\n", d);
+                               for (k = 0; k < x; ++k) {
+                                       if (k > 0) kputc(',', s);
+                                       ksprintf(s, "%.2f", d[k]);
+                               }
                        }
                }
        }
index 81c7061c9cbc4462ade1d36825100b5b760c5975..23aad3c0c97b02c12d9643d18c87e68ef4d87dc2 100644 (file)
@@ -150,19 +150,38 @@ int vcf_read(bcf_t *bp, bcf_hdr_t *h, bcf1_t *b)
                        }
                        b->tid = tid;
                } else if (k == 1) { // pos
-                       b->pos = atoi(p);
+                       b->pos = atoi(p) - 1;
                } else if (k == 5) { // qual
                        b->qual = (p[0] >= '0' && p[0] <= '9')? atoi(p) : 0;
                } else if (k <= 8) { // variable length strings
                        kputs(p, &str); kputc('\0', &str);
                        b->l_str = str.l; b->m_str = str.m; b->str = str.s;
                        if (k == 8) bcf_sync(h->n_smpl, b);
-               } else {
+               } else { // k > 9
+                       if (strncmp(p, "./.", 3) == 0) {
+                               for (i = 0; i < b->n_gi; ++i) {
+                                       if (b->gi[i].fmt == bcf_str2int("GT", 2)) {
+                                               ((uint8_t*)b->gi[i].data)[k-9] = 1<<7;
+                                       } else if (b->gi[i].fmt == bcf_str2int("GQ", 2)) {
+                                               ((uint8_t*)b->gi[i].data)[k-9] = 0;
+                                       } else if (b->gi[i].fmt == bcf_str2int("DP", 2)) {
+                                               ((uint16_t*)b->gi[i].data)[k-9] = 0;
+                                       } else if (b->gi[i].fmt == bcf_str2int("PL", 2)) {
+                                               int y = b->n_alleles * (b->n_alleles + 1) / 2;
+                                               memset((uint8_t*)b->gi[i].data + (k - 9) * y, 0, y);
+                                       } else if (b->gi[i].fmt == bcf_str2int("GL", 2)) {
+                                               int y = b->n_alleles * (b->n_alleles + 1) / 2;
+                                               memset((float*)b->gi[i].data + (k - 9) * y, 0, y * 4);
+                                       }
+                               }
+                               goto endblock;
+                       }
                        for (q = kstrtok(p, ":", &a2), i = 0; q && i < b->n_gi; q = kstrtok(0, 0, &a2), ++i) {
                                if (b->gi[i].fmt == bcf_str2int("GT", 2)) {
                                        ((uint8_t*)b->gi[i].data)[k-9] = (q[0] - '0')<<3 | (q[2] - '0') | (q[1] == '/'? 0 : 1) << 6;
                                } else if (b->gi[i].fmt == bcf_str2int("GQ", 2)) {
-                                       int x = strtol(q, &q, 10);
+                                       double _x = strtod(q, &q);
+                                       int x = (int)(_x + .499);
                                        if (x > 255) x = 255;
                                        ((uint8_t*)b->gi[i].data)[k-9] = x;
                                } else if (b->gi[i].fmt == bcf_str2int("DP", 2)) {
@@ -170,16 +189,27 @@ int vcf_read(bcf_t *bp, bcf_hdr_t *h, bcf1_t *b)
                                        if (x > 0xffff) x = 0xffff;
                                        ((uint16_t*)b->gi[i].data)[k-9] = x;
                                } else if (b->gi[i].fmt == bcf_str2int("PL", 2)) {
-                                       int x, j;
+                                       int x, y, j;
                                        uint8_t *data = (uint8_t*)b->gi[i].data;
-                                       for (j = 0; j < b->gi[i].len; ++j) {
+                                       y = b->n_alleles * (b->n_alleles + 1) / 2;
+                                       for (j = 0; j < y; ++j) {
                                                x = strtol(q, &q, 10);
                                                if (x > 255) x = 255;
-                                               data[i * b->gi[i].len + j] = x;
+                                               data[(k-9) * y + j] = x;
+                                               ++q;
+                                       }
+                               } else if (b->gi[i].fmt == bcf_str2int("GL", 2)) {
+                                       int j, y;
+                                       float x, *data = (float*)b->gi[i].data;
+                                       y = b->n_alleles * (b->n_alleles + 1) / 2;
+                                       for (j = 0; j < y; ++j) {
+                                               x = strtod(q, &q);
+                                               data[(k-9) * y + j] = x;
                                                ++q;
                                        }
                                }
                        }
+               endblock: i = i;
                }
        }
        h->l_nm = rn.l; h->name = rn.s;