git.donarmstrong.com Git - samtools.git/blobdiff - bcftools/vcf.c
Fix memory leaks:
[samtools.git] / bcftools / vcf.c
index 647ee1f7fe50f0bc6f16b3c008e279ffa519f6af..e8526a38706357c4aa2d067fcf9c64dadd8978f6 100644 (file)
@@ -13,6 +13,7 @@ typedef struct {
        kstream_t *ks;
        void *refhash;
        kstring_t line;
+       int max_ref;
 } vcf_t;
 
 bcf_hdr_t *vcf_hdr_read(bcf_t *bp)
@@ -21,7 +22,7 @@ bcf_hdr_t *vcf_hdr_read(bcf_t *bp)
        int dret;
        vcf_t *v;
        bcf_hdr_t *h;
-       if (!bp->is_vcf) return 0;
+       if (!bp->is_vcf) return bcf_hdr_read(bp);
        h = calloc(1, sizeof(bcf_hdr_t));
        v = (vcf_t*)bp->v;
        v->line.l = 0;
@@ -29,20 +30,22 @@ bcf_hdr_t *vcf_hdr_read(bcf_t *bp)
        memset(&smpl, 0, sizeof(kstring_t));
        while (ks_getuntil(v->ks, '\n', &v->line, &dret) >= 0) {
                if (v->line.l < 2) continue;
-               if (v->line.s[0] != '#') return 0; // no sample line
+               if (v->line.s[0] != '#') {
+            free(meta.s);
+            free(smpl.s);
+            free(h);
+            return 0; // no sample line
+        }
                if (v->line.s[0] == '#' && v->line.s[1] == '#') {
                        kputsn(v->line.s, v->line.l, &meta); kputc('\n', &meta);
                } else if (v->line.s[0] == '#') {
                        int k;
-                       char *p, *q, *r;
-                       for (q = v->line.s, p = q + 1, k = 0; *p; ++p) {
-                               if (*p == '\t' || *(p+1) == 0) {
-                                       r = *(p+1) == 0? p+1 : p;
-                                       if (k >= 9) {
-                                               kputsn(q, r - q, &smpl);
-                                               kputc('\0', &smpl);
-                                       }
-                                       q = p + 1; ++k;
+                       ks_tokaux_t aux;
+                       char *p;
+                       for (p = kstrtok(v->line.s, "\t\n", &aux), k = 0; p; p = kstrtok(0, 0, &aux), ++k) {
+                               if (k >= 9) {
+                                       kputsn(p, aux.p - p, &smpl);
+                                       kputc('\0', &smpl);
                                }
                        }
                        break;
@@ -60,25 +63,52 @@ bcf_t *vcf_open(const char *fn, const char *mode)
 {
        bcf_t *bp;
        vcf_t *v;
+       if (strchr(mode, 'b')) return bcf_open(fn, mode);
        bp = calloc(1, sizeof(bcf_t));
        v = calloc(1, sizeof(vcf_t));
        bp->is_vcf = 1;
        bp->v = v;
+       v->refhash = bcf_str2id_init();
        if (strchr(mode, 'r')) {
                v->fp = strcmp(fn, "-")? gzopen(fn, "r") : gzdopen(fileno(stdin), "r");
                v->ks = ks_init(v->fp);
        } else if (strchr(mode, 'w'))
-               v->fpout = strcmp(fn, "-")? fopen(fn, "w") : fdopen(fileno(stdout), "w");
+               v->fpout = strcmp(fn, "-")? fopen(fn, "w") : stdout;
        return bp;
 }
 
-void bcf_hdr_clear(bcf_hdr_t *b);
+int vcf_dictread(bcf_t *bp, bcf_hdr_t *h, const char *fn)
+{
+       vcf_t *v;
+       gzFile fp;
+       kstream_t *ks;
+       kstring_t s, rn;
+       int dret;
+       if (bp == 0) return -1;
+       if (!bp->is_vcf) return 0;
+       s.l = s.m = 0; s.s = 0;
+       rn.m = rn.l = h->l_nm; rn.s = h->name;
+       v = (vcf_t*)bp->v;
+       fp = gzopen(fn, "r");
+       ks = ks_init(fp);
+       while (ks_getuntil(ks, 0, &s, &dret) >= 0) {
+               bcf_str2id_add(v->refhash, strdup(s.s));
+               kputs(s.s, &rn); kputc('\0', &rn);
+               if (dret != '\n') ks_getuntil(ks, '\n', &s, &dret);
+       }
+       ks_destroy(ks);
+       gzclose(fp);
+       h->l_nm = rn.l; h->name = rn.s;
+       bcf_hdr_sync(h);
+       free(s.s);
+       return 0;
+}
 
 int vcf_close(bcf_t *bp)
 {
        vcf_t *v;
        if (bp == 0) return -1;
-       if (bp->v == 0) return -1;
+       if (!bp->is_vcf) return bcf_close(bp);
        v = (vcf_t*)bp->v;
        if (v->fp) {
                ks_destroy(v->ks);
@@ -86,6 +116,7 @@ int vcf_close(bcf_t *bp)
        }
        if (v->fpout) fclose(v->fpout);
        free(v->line.s);
+       bcf_str2id_thorough_destroy(v->refhash);
        free(v);
        free(bp);
        return 0;
@@ -94,33 +125,125 @@ int vcf_close(bcf_t *bp)
 int vcf_hdr_write(bcf_t *bp, const bcf_hdr_t *h)
 {
        vcf_t *v = (vcf_t*)bp->v;
-       int i;
-       if (v == 0 || v->fpout == 0) return -1;
-       fwrite(h->txt, 1, h->l_txt, v->fpout);
-       fprintf(v->fpout, "#CHROM\tPOS\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT");
+       int i, has_ver = 0;
+       if (!bp->is_vcf) return bcf_hdr_write(bp, h);
+       if (h->l_txt > 0) {
+               if (strstr(h->txt, "##fileformat=")) has_ver = 1;
+               if (has_ver == 0) fprintf(v->fpout, "##fileformat=VCFv4.1\n");
+               fwrite(h->txt, 1, h->l_txt - 1, v->fpout);
+       }
+       if (h->l_txt == 0) fprintf(v->fpout, "##fileformat=VCFv4.1\n");
+       fprintf(v->fpout, "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT");
        for (i = 0; i < h->n_smpl; ++i)
                fprintf(v->fpout, "\t%s", h->sns[i]);
        fputc('\n', v->fpout);
        return 0;
 }
 
-int vcf_read(bcf_t *bp, bcf1_t *b)
+int vcf_write(bcf_t *bp, bcf_hdr_t *h, bcf1_t *b)
 {
-       int dret;
        vcf_t *v = (vcf_t*)bp->v;
-       v->line.l = 0;
-       if (ks_getuntil(v->ks, '\n', &v->line, &dret) < 0) return -1;
+       extern void bcf_fmt_core(const bcf_hdr_t *h, bcf1_t *b, kstring_t *s);
+       if (!bp->is_vcf) return bcf_write(bp, h, b);
+       bcf_fmt_core(h, b, &v->line);
+       fwrite(v->line.s, 1, v->line.l, v->fpout);
+       fputc('\n', v->fpout);
        return v->line.l + 1;
 }
 
-int vcf_test(int argc, char *argv[])
+int vcf_read(bcf_t *bp, bcf_hdr_t *h, bcf1_t *b)
 {
-       bcf_t *bp, *bpout;
-       bcf_hdr_t *h;
-       bp = vcf_open(argv[1], "r");
-       bpout = vcf_open("-", "w");
-       h = vcf_hdr_read(bpout);
-       vcf_hdr_write(bpout, h);
-       vcf_close(bp);
-       return 0;
+       int dret, k, i, sync = 0;
+       vcf_t *v = (vcf_t*)bp->v;
+       char *p, *q;
+       kstring_t str, rn;
+       ks_tokaux_t aux, a2;
+       if (!bp->is_vcf) return bcf_read(bp, h, b);
+       v->line.l = 0;
+       str.l = 0; str.m = b->m_str; str.s = b->str;
+       rn.l = rn.m = h->l_nm; rn.s = h->name;
+       if (ks_getuntil(v->ks, '\n', &v->line, &dret) < 0) return -1;
+       b->n_smpl = h->n_smpl;
+       for (p = kstrtok(v->line.s, "\t", &aux), k = 0; p; p = kstrtok(0, 0, &aux), ++k) {
+               *(char*)aux.p = 0;
+               if (k == 0) { // ref
+                       int tid = bcf_str2id(v->refhash, p);
+                       if (tid < 0) {
+                               tid = bcf_str2id_add(v->refhash, strdup(p));
+                               kputs(p, &rn); kputc('\0', &rn);
+                               sync = 1;
+                       }
+                       b->tid = tid;
+               } else if (k == 1) { // pos
+                       b->pos = atoi(p) - 1;
+               } else if (k == 5) { // qual
+                       b->qual = (p[0] >= '0' && p[0] <= '9')? atof(p) : 0;
+               } else if (k <= 8) { // variable length strings
+                       kputs(p, &str); kputc('\0', &str);
+                       b->l_str = str.l; b->m_str = str.m; b->str = str.s;
+                       if (k == 8) bcf_sync(b);
+               } else { // k > 9
+                       if (strncmp(p, "./.", 3) == 0) {
+                               for (i = 0; i < b->n_gi; ++i) {
+                                       if (b->gi[i].fmt == bcf_str2int("GT", 2)) {
+                                               ((uint8_t*)b->gi[i].data)[k-9] = 1<<7;
+                                       } else if (b->gi[i].fmt == bcf_str2int("GQ", 2)) {
+                                               ((uint8_t*)b->gi[i].data)[k-9] = 0;
+                                       } else if (b->gi[i].fmt == bcf_str2int("SP", 2)) {
+                                               ((int32_t*)b->gi[i].data)[k-9] = 0;
+                                       } else if (b->gi[i].fmt == bcf_str2int("DP", 2) || b->gi[i].fmt == bcf_str2int("DV", 2)) {
+                                               ((uint16_t*)b->gi[i].data)[k-9] = 0;
+                                       } else if (b->gi[i].fmt == bcf_str2int("PL", 2)) {
+                                               int y = b->n_alleles * (b->n_alleles + 1) / 2;
+                                               memset((uint8_t*)b->gi[i].data + (k - 9) * y, 0, y);
+                                       } else if (b->gi[i].fmt == bcf_str2int("GL", 2)) {
+                                               int y = b->n_alleles * (b->n_alleles + 1) / 2;
+                                               memset((float*)b->gi[i].data + (k - 9) * y, 0, y * 4);
+                                       }
+                               }
+                               goto endblock;
+                       }
+                       for (q = kstrtok(p, ":", &a2), i = 0; q && i < b->n_gi; q = kstrtok(0, 0, &a2), ++i) {
+                               if (b->gi[i].fmt == bcf_str2int("GT", 2)) {
+                                       ((uint8_t*)b->gi[i].data)[k-9] = (q[0] - '0')<<3 | (q[2] - '0') | (q[1] == '/'? 0 : 1) << 6;
+                               } else if (b->gi[i].fmt == bcf_str2int("GQ", 2)) {
+                                       double _x = strtod(q, &q);
+                                       int x = (int)(_x + .499);
+                                       if (x > 255) x = 255;
+                                       ((uint8_t*)b->gi[i].data)[k-9] = x;
+                               } else if (b->gi[i].fmt == bcf_str2int("SP", 2)) {
+                                       int x = strtol(q, &q, 10);
+                                       if (x > 0xffff) x = 0xffff;
+                                       ((uint32_t*)b->gi[i].data)[k-9] = x;
+                               } else if (b->gi[i].fmt == bcf_str2int("DP", 2) || b->gi[i].fmt == bcf_str2int("DV", 2)) {
+                                       int x = strtol(q, &q, 10);
+                                       if (x > 0xffff) x = 0xffff;
+                                       ((uint16_t*)b->gi[i].data)[k-9] = x;
+                               } else if (b->gi[i].fmt == bcf_str2int("PL", 2)) {
+                                       int x, y, j;
+                                       uint8_t *data = (uint8_t*)b->gi[i].data;
+                                       y = b->n_alleles * (b->n_alleles + 1) / 2;
+                                       for (j = 0; j < y; ++j) {
+                                               x = strtol(q, &q, 10);
+                                               if (x > 255) x = 255;
+                                               data[(k-9) * y + j] = x;
+                                               ++q;
+                                       }
+                               } else if (b->gi[i].fmt == bcf_str2int("GL", 2)) {
+                                       int j, y;
+                                       float x, *data = (float*)b->gi[i].data;
+                                       y = b->n_alleles * (b->n_alleles + 1) / 2;
+                                       for (j = 0; j < y; ++j) {
+                                               x = strtod(q, &q);
+                                               data[(k-9) * y + j] = x > 0? -x/10. : x;
+                                               ++q;
+                                       }
+                               }
+                       }
+               endblock: i = i;
+               }
+       }
+       h->l_nm = rn.l; h->name = rn.s;
+       if (sync) bcf_hdr_sync(h);
+       return v->line.l + 1;
 }