From a90c2f68b793d5c35df8943bc0264024fb270f4a Mon Sep 17 00:00:00 2001 From: Heng Li Date: Thu, 5 Aug 2010 19:46:40 +0000 Subject: [PATCH] * added prelimiary VCF parser (not finished) * change struct a bit --- bcftools/Makefile | 4 +- bcftools/bcf.c | 73 +++++++++++-------------- bcftools/bcf.h | 17 +++--- bcftools/bcfutils.c | 10 ++++ bcftools/index.c | 12 +++-- bcftools/main.c | 1 + bcftools/vcf.c | 126 ++++++++++++++++++++++++++++++++++++++++++++ bcftools/vcfout.c | 20 +++---- 8 files changed, 198 insertions(+), 65 deletions(-) create mode 100644 bcftools/vcf.c diff --git a/bcftools/Makefile b/bcftools/Makefile index 7fdf41f..d2db831 100644 --- a/bcftools/Makefile +++ b/bcftools/Makefile @@ -1,7 +1,7 @@ CC= gcc -CFLAGS= -g -Wall -O2 #-fPIC #-m64 #-arch ppc +CFLAGS= -g -Wall #-O2 #-fPIC #-m64 #-arch ppc DFLAGS= -D_FILE_OFFSET_BITS=64 -D_USE_KNETFILE -LOBJS= bcf.o bcfutils.o prob1.o index.o +LOBJS= bcf.o vcf.o bcfutils.o prob1.o index.o OMISC= .. AOBJS= vcfout.o main.o $(OMISC)/kstring.o $(OMISC)/bgzf.o $(OMISC)/knetfile.o PROG= bcftools diff --git a/bcftools/bcf.c b/bcftools/bcf.c index e406319..aa1bb03 100644 --- a/bcftools/bcf.c +++ b/bcftools/bcf.c @@ -4,8 +4,6 @@ #include "kstring.h" #include "bcf.h" -int bcf_hdr_read(bcf_t *b); - void bcf_hdr_clear(bcf_hdr_t *b) { free(b->name); free(b->sname); free(b->txt); free(b->ns); free(b->sns); @@ -22,25 +20,18 @@ bcf_t *bcf_open(const char *fn, const char *mode) b->fp = strcmp(fn, "-")? bgzf_open(fn, mode) : bgzf_fdopen(fileno(stdin), mode); } b->fp->owned_file = 1; - if (strchr(mode, 'r')) bcf_hdr_read(b); return b; } int bcf_close(bcf_t *b) { - int ret; if (b == 0) return 0; - ret = bgzf_close(b->fp); - free(b->h.name); free(b->h.sname); free(b->h.txt); free(b->h.ns); free(b->h.sns); - free(b); - return ret; + return bgzf_close(b->fp); } -int bcf_hdr_write(bcf_t *b) +int bcf_hdr_write(bcf_t *b, const bcf_hdr_t *h) { - bcf_hdr_t *h; - if (b == 0) return -1; - h = &b->h; + if (b == 0 || h == 0) return -1; bgzf_write(b->fp, "BCF\4", 4); bgzf_write(b->fp, &h->l_nm, 4); bgzf_write(b->fp, h->name, h->l_nm); @@ -52,26 +43,12 @@ int bcf_hdr_write(bcf_t *b) return 16 + h->l_nm + h->l_smpl + h->l_txt; } -int bcf_hdr_cpy(bcf_hdr_t *h, const bcf_hdr_t *h0) -{ - *h = *h0; - h->name = malloc(h->l_nm); - h->sname = malloc(h->l_smpl); - h->txt = malloc(h->l_txt); - memcpy(h->name, h0->name, h->l_nm); - memcpy(h->sname, h0->sname, h->l_smpl); - memcpy(h->txt, h0->txt, h->l_txt); - bcf_hdr_sync(h); - return 0; -} - -int bcf_hdr_read(bcf_t *b) +bcf_hdr_t *bcf_hdr_read(bcf_t *b) { uint8_t magic[4]; bcf_hdr_t *h; - if (b == 0) return -1; - bcf_hdr_clear(&b->h); - h = &b->h; + if (b == 0) return 0; + h = calloc(1, sizeof(bcf_hdr_t)); bgzf_read(b->fp, magic, 4); bgzf_read(b->fp, &h->l_nm, 4); h->name = malloc(h->l_nm); @@ -82,8 +59,15 @@ int bcf_hdr_read(bcf_t *b) bgzf_read(b->fp, &h->l_txt, 4); h->txt = malloc(h->l_txt); bgzf_read(b->fp, h->txt, h->l_txt); - bcf_hdr_sync(&b->h); - return 16 + h->l_nm + h->l_smpl + h->l_txt; + bcf_hdr_sync(h); + return h; +} + +void bcf_hdr_destroy(bcf_hdr_t *h) +{ + if (h == 0) return; + free(h->name); free(h->sname); free(h->txt); free(h->ns); free(h->sns); + free(h); } static inline char **cnt_null(int l, char *str, int *_n) @@ -105,7 +89,10 @@ static inline char **cnt_null(int l, char *str, int *_n) int bcf_hdr_sync(bcf_hdr_t *b) { if (b == 0) return -1; - b->ns = cnt_null(b->l_nm, b->name, &b->n_ref); + if (b->ns) free(b->ns); + if (b->sns) free(b->sns); + if (b->l_nm) b->ns = cnt_null(b->l_nm, b->name, &b->n_ref); + else b->ns = 0, b->n_ref = 0; b->sns = cnt_null(b->l_smpl, b->sname, &b->n_smpl); return 0; } @@ -161,7 +148,7 @@ int bcf_sync(int n_smpl, bcf1_t *b) return 0; } -int bcf_write(bcf_t *bp, const bcf1_t *b) +int bcf_write(bcf_t *bp, const bcf_hdr_t *h, const bcf1_t *b) { uint32_t x; int i, l = 0; @@ -173,18 +160,18 @@ int bcf_write(bcf_t *bp, const bcf1_t *b) bgzf_write(bp->fp, b->str, b->l_str); l = 12 + b->l_str; for (i = 0; i < b->n_gi; ++i) { - bgzf_write(bp->fp, b->gi[i].data, b->gi[i].len * bp->h.n_smpl); - l += b->gi[i].len * bp->h.n_smpl; + bgzf_write(bp->fp, b->gi[i].data, b->gi[i].len * h->n_smpl); + l += b->gi[i].len * h->n_smpl; } return l; } -int bcf_read(bcf_t *bp, bcf1_t *b) +int bcf_read(bcf_t *bp, const bcf_hdr_t *h, bcf1_t *b) { int i, l = 0; uint32_t x; if (b == 0) return -1; - if (bgzf_read(bp->fp, &b->tid, 4) == 0) return 0; + if (bgzf_read(bp->fp, &b->tid, 4) == 0) return -1; bgzf_read(bp->fp, &b->pos, 4); bgzf_read(bp->fp, &x, 4); b->qual = x >> 24; b->l_str = x << 8 >> 8; @@ -195,10 +182,10 @@ int bcf_read(bcf_t *bp, bcf1_t *b) } bgzf_read(bp->fp, b->str, b->l_str); l = 12 + b->l_str; - bcf_sync(bp->h.n_smpl, b); + bcf_sync(h->n_smpl, b); for (i = 0; i < b->n_gi; ++i) { - bgzf_read(bp->fp, b->gi[i].data, b->gi[i].len * bp->h.n_smpl); - l += b->gi[i].len * bp->h.n_smpl; + bgzf_read(bp->fp, b->gi[i].data, b->gi[i].len * h->n_smpl); + l += b->gi[i].len * h->n_smpl; } return l; } @@ -221,12 +208,12 @@ static inline void fmt_str(const char *p, kstring_t *s) else kputs(p, s); } -char *bcf_fmt(bcf_t *bp, bcf1_t *b) +char *bcf_fmt(const bcf_hdr_t *h, bcf1_t *b) { kstring_t s; int i, j, x; memset(&s, 0, sizeof(kstring_t)); - kputs(bp->h.ns[b->tid], &s); kputc('\t', &s); + kputs(h->ns[b->tid], &s); kputc('\t', &s); kputw(b->pos + 1, &s); kputc('\t', &s); fmt_str(b->str, &s); kputc('\t', &s); fmt_str(b->ref, &s); kputc('\t', &s); @@ -239,7 +226,7 @@ char *bcf_fmt(bcf_t *bp, bcf1_t *b) fmt_str(b->fmt, &s); } x = b->n_alleles * (b->n_alleles + 1) / 2; - for (j = 0; j < bp->h.n_smpl; ++j) { + for (j = 0; j < h->n_smpl; ++j) { kputc('\t', &s); for (i = 0; i < b->n_gi; ++i) { if (i) kputc(':', &s); diff --git a/bcftools/bcf.h b/bcftools/bcf.h index 1c0436a..0923574 100644 --- a/bcftools/bcf.h +++ b/bcftools/bcf.h @@ -2,6 +2,7 @@ #define BCF_H #include +#include #include "bgzf.h" typedef struct { @@ -32,8 +33,9 @@ typedef struct { } bcf_hdr_t; typedef struct { + int is_vcf; + void *v; BGZF *fp; - bcf_hdr_t h; } bcf_t; struct __bcf_idx_t; @@ -45,14 +47,17 @@ extern "C" { bcf_t *bcf_open(const char *fn, const char *mode); int bcf_close(bcf_t *b); - int bcf_read(bcf_t *bp, bcf1_t *b); + int bcf_read(bcf_t *bp, const bcf_hdr_t *h, bcf1_t *b); int bcf_sync(int n_smpl, bcf1_t *b); - int bcf_write(bcf_t *bp, const bcf1_t *b); - int bcf_hdr_write(bcf_t *b); + int bcf_write(bcf_t *bp, const bcf_hdr_t *h, const bcf1_t *b); + bcf_hdr_t *bcf_hdr_read(bcf_t *b); + int bcf_hdr_write(bcf_t *b, const bcf_hdr_t *h); int bcf_hdr_sync(bcf_hdr_t *b); - int bcf_hdr_cpy(bcf_hdr_t *h, const bcf_hdr_t *h0); + void bcf_hdr_destroy(bcf_hdr_t *h); int bcf_destroy(bcf1_t *b); - char *bcf_fmt(bcf_t *bp, bcf1_t *b); + char *bcf_fmt(const bcf_hdr_t *h, bcf1_t *b); + + int vcf_close(bcf_t *bp); void *bcf_build_refhash(bcf_hdr_t *h); void bcf_str2id_destroy(void *_hash); diff --git a/bcftools/bcfutils.c b/bcftools/bcfutils.c index e0e4a06..861e33f 100644 --- a/bcftools/bcfutils.c +++ b/bcftools/bcfutils.c @@ -15,6 +15,16 @@ void *bcf_build_refhash(bcf_hdr_t *h) return hash; } +void *bcf_str2id_init() +{ + return kh_init(str2id); +} + +int bcf_str2id_put(void *_hash, const char *str, int id) +{ + return 0; +} + void bcf_str2id_destroy(void *_hash) { khash_t(str2id) *hash = (khash_t(str2id)*)_hash; diff --git a/bcftools/index.c b/bcftools/index.c index b2663d4..768519c 100644 --- a/bcftools/index.c +++ b/bcftools/index.c @@ -56,7 +56,7 @@ static void fill_missing(bcf_idx_t *idx) } } -bcf_idx_t *bcf_idx_core(bcf_t *bp) +bcf_idx_t *bcf_idx_core(bcf_t *bp, bcf_hdr_t *h) { bcf_idx_t *idx; int32_t last_coor, last_tid; @@ -69,12 +69,12 @@ bcf_idx_t *bcf_idx_core(bcf_t *bp) b = calloc(1, sizeof(bcf1_t)); str = calloc(1, sizeof(kstring_t)); idx = (bcf_idx_t*)calloc(1, sizeof(bcf_idx_t)); - idx->n = bp->h.n_ref; - idx->index2 = calloc(bp->h.n_ref, sizeof(bcf_lidx_t)); + idx->n = h->n_ref; + idx->index2 = calloc(h->n_ref, sizeof(bcf_lidx_t)); last_tid = 0xffffffffu; last_off = bgzf_tell(fp); last_coor = 0xffffffffu; - while ((ret = bcf_read(bp, b)) > 0) { + while ((ret = bcf_read(bp, h, b)) > 0) { int end, tmp; if (last_tid != b->tid) { // change of chromosomes last_tid = b->tid; @@ -255,11 +255,13 @@ int bcf_idx_build2(const char *fn, const char *_fnidx) BGZF *fpidx; bcf_t *bp; bcf_idx_t *idx; + bcf_hdr_t *h; if ((bp = bcf_open(fn, "r")) == 0) { fprintf(stderr, "[bcf_idx_build2] fail to open the BAM file.\n"); return -1; } - idx = bcf_idx_core(bp); + h = bcf_hdr_read(bp); + idx = bcf_idx_core(bp, h); bcf_close(bp); if (_fnidx == 0) { fnidx = (char*)calloc(strlen(fn) + 5, 1); diff --git a/bcftools/main.c b/bcftools/main.c index e271efd..017ac43 100644 --- a/bcftools/main.c +++ b/bcftools/main.c @@ -17,6 +17,7 @@ int main(int argc, char *argv[]) } if (strcmp(argv[1], "view") == 0) return bcfview(argc-1, argv+1); if (strcmp(argv[1], "index") == 0) return bcf_main_index(argc-1, argv+1); + if (strcmp(argv[1], "test") == 0) return vcf_test(argc-1, argv+1); else { fprintf(stderr, "[main] Unrecognized command.\n"); return 1; diff --git a/bcftools/vcf.c b/bcftools/vcf.c new file mode 100644 index 0000000..647ee1f --- /dev/null +++ b/bcftools/vcf.c @@ -0,0 +1,126 @@ +#include +#include +#include +#include +#include "bcf.h" +#include "kstring.h" +#include "kseq.h" +KSTREAM_INIT(gzFile, gzread, 4096) + +typedef struct { + gzFile fp; + FILE *fpout; + kstream_t *ks; + void *refhash; + kstring_t line; +} vcf_t; + +bcf_hdr_t *vcf_hdr_read(bcf_t *bp) +{ + kstring_t meta, smpl; + int dret; + vcf_t *v; + bcf_hdr_t *h; + if (!bp->is_vcf) return 0; + h = calloc(1, sizeof(bcf_hdr_t)); + v = (vcf_t*)bp->v; + v->line.l = 0; + memset(&meta, 0, sizeof(kstring_t)); + memset(&smpl, 0, sizeof(kstring_t)); + while (ks_getuntil(v->ks, '\n', &v->line, &dret) >= 0) { + if (v->line.l < 2) continue; + if (v->line.s[0] != '#') return 0; // no sample line + if (v->line.s[0] == '#' && v->line.s[1] == '#') { + kputsn(v->line.s, v->line.l, &meta); kputc('\n', &meta); + } else if (v->line.s[0] == '#') { + int k; + char *p, *q, *r; + for (q = v->line.s, p = q + 1, k = 0; *p; ++p) { + if (*p == '\t' || *(p+1) == 0) { + r = *(p+1) == 0? p+1 : p; + if (k >= 9) { + kputsn(q, r - q, &smpl); + kputc('\0', &smpl); + } + q = p + 1; ++k; + } + } + break; + } + } + kputc('\0', &meta); + h->name = 0; + h->sname = smpl.s; h->l_smpl = smpl.l; + h->txt = meta.s; h->l_txt = meta.l; + bcf_hdr_sync(h); + return h; +} + +bcf_t *vcf_open(const char *fn, const char *mode) +{ + bcf_t *bp; + vcf_t *v; + bp = calloc(1, sizeof(bcf_t)); + v = calloc(1, sizeof(vcf_t)); + bp->is_vcf = 1; + bp->v = v; + if (strchr(mode, 'r')) { + v->fp = strcmp(fn, "-")? gzopen(fn, "r") : gzdopen(fileno(stdin), "r"); + v->ks = ks_init(v->fp); + } else if (strchr(mode, 'w')) + v->fpout = strcmp(fn, "-")? fopen(fn, "w") : fdopen(fileno(stdout), "w"); + return bp; +} + +void bcf_hdr_clear(bcf_hdr_t *b); + +int vcf_close(bcf_t *bp) +{ + vcf_t *v; + if (bp == 0) return -1; + if (bp->v == 0) return -1; + v = (vcf_t*)bp->v; + if (v->fp) { + ks_destroy(v->ks); + gzclose(v->fp); + } + if (v->fpout) fclose(v->fpout); + free(v->line.s); + free(v); + free(bp); + return 0; +} + +int vcf_hdr_write(bcf_t *bp, const bcf_hdr_t *h) +{ + vcf_t *v = (vcf_t*)bp->v; + int i; + if (v == 0 || v->fpout == 0) return -1; + fwrite(h->txt, 1, h->l_txt, v->fpout); + fprintf(v->fpout, "#CHROM\tPOS\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT"); + for (i = 0; i < h->n_smpl; ++i) + fprintf(v->fpout, "\t%s", h->sns[i]); + fputc('\n', v->fpout); + return 0; +} + +int vcf_read(bcf_t *bp, bcf1_t *b) +{ + int dret; + vcf_t *v = (vcf_t*)bp->v; + v->line.l = 0; + if (ks_getuntil(v->ks, '\n', &v->line, &dret) < 0) return -1; + return v->line.l + 1; +} + +int vcf_test(int argc, char *argv[]) +{ + bcf_t *bp, *bpout; + bcf_hdr_t *h; + bp = vcf_open(argv[1], "r"); + bpout = vcf_open("-", "w"); + h = vcf_hdr_read(bpout); + vcf_hdr_write(bpout, h); + vcf_close(bp); + return 0; +} diff --git a/bcftools/vcfout.c b/bcftools/vcfout.c index d71dfd9..075fb61 100644 --- a/bcftools/vcfout.c +++ b/bcftools/vcfout.c @@ -88,6 +88,7 @@ int bcfview(int argc, char *argv[]) uint64_t n_processed = 0; viewconf_t vc; bcf_p1aux_t *p1 = 0; + bcf_hdr_t *h; int tid, begin, end; khash_t(set64) *hash = 0; @@ -117,18 +118,18 @@ int bcfview(int argc, char *argv[]) b = calloc(1, sizeof(bcf1_t)); bp = bcf_open(argv[optind], "r"); + h = bcf_hdr_read(bp); if (vc.flag & VC_BCF) { bout = bcf_open("-", "w"); - bcf_hdr_cpy(&bout->h, &bp->h); - bcf_hdr_write(bout); + bcf_hdr_write(bout, h); } if (vc.flag & VC_CALL) { - p1 = bcf_p1_init(bp->h.n_smpl); + p1 = bcf_p1_init(h->n_smpl); bcf_p1_init_prior(p1, vc.prior_type, vc.theta); } - if (vc.fn_list) hash = bcf_load_pos(vc.fn_list, &bp->h); + if (vc.fn_list) hash = bcf_load_pos(vc.fn_list, h); if (optind + 1 < argc) { - void *str2id = bcf_build_refhash(&bp->h); + void *str2id = bcf_build_refhash(h); if (bcf_parse_region(str2id, argv[optind+1], &tid, &begin, &end) >= 0) { bcf_idx_t *idx; idx = bcf_idx_load(argv[optind]); @@ -140,7 +141,7 @@ int bcfview(int argc, char *argv[]) } } } - while (bcf_read(bp, b) > 0) { + while (bcf_read(bp, h, b) > 0) { if (hash) { uint64_t x = (uint64_t)b->tid<<32 | b->pos; khint_t k = kh_get(set64, hash, x); @@ -157,24 +158,25 @@ int bcfview(int argc, char *argv[]) int is_var; bcf_p1_cal(b, p1, &pr); is_var = update_bcf1(b, p1, &pr, vc.flag); - bcf_sync(bp->h.n_smpl, b); + bcf_sync(h->n_smpl, b); if ((n_processed + 1) % 50000 == 0) bcf_p1_dump_afs(p1); if ((vc.flag & VC_VARONLY) && !is_var) continue; } - if (vc.flag & VC_BCF) bcf_write(bout, b); + if (vc.flag & VC_BCF) bcf_write(bout, h, b); else { char *vcf; if (vc.flag & VC_NO_GENO) { b->n_gi = 0; b->fmt[0] = '\0'; } - vcf = bcf_fmt(bp, b); + vcf = bcf_fmt(h, b); puts(vcf); free(vcf); } ++n_processed; } if (vc.flag & VC_CALL) bcf_p1_dump_afs(p1); + bcf_hdr_destroy(h); bcf_close(bp); bcf_close(bout); bcf_destroy(b); if (hash) kh_destroy(set64, hash); -- 2.39.2