CC= gcc
-CFLAGS= -g -Wall -O2 #-fPIC #-m64 #-arch ppc
+CFLAGS= -g -Wall #-O2 #-fPIC #-m64 #-arch ppc
DFLAGS= -D_FILE_OFFSET_BITS=64 -D_USE_KNETFILE
-LOBJS= bcf.o bcfutils.o prob1.o index.o
+LOBJS= bcf.o vcf.o bcfutils.o prob1.o index.o
OMISC= ..
AOBJS= vcfout.o main.o $(OMISC)/kstring.o $(OMISC)/bgzf.o $(OMISC)/knetfile.o
PROG= bcftools
#include "kstring.h"
#include "bcf.h"
-int bcf_hdr_read(bcf_t *b);
-
void bcf_hdr_clear(bcf_hdr_t *b)
{
free(b->name); free(b->sname); free(b->txt); free(b->ns); free(b->sns);
b->fp = strcmp(fn, "-")? bgzf_open(fn, mode) : bgzf_fdopen(fileno(stdin), mode);
}
b->fp->owned_file = 1;
- if (strchr(mode, 'r')) bcf_hdr_read(b);
return b;
}
int bcf_close(bcf_t *b)
{
- int ret;
if (b == 0) return 0;
- ret = bgzf_close(b->fp);
- free(b->h.name); free(b->h.sname); free(b->h.txt); free(b->h.ns); free(b->h.sns);
- free(b);
- return ret;
+ return bgzf_close(b->fp);
}
-int bcf_hdr_write(bcf_t *b)
+int bcf_hdr_write(bcf_t *b, const bcf_hdr_t *h)
{
- bcf_hdr_t *h;
- if (b == 0) return -1;
- h = &b->h;
+ if (b == 0 || h == 0) return -1;
bgzf_write(b->fp, "BCF\4", 4);
bgzf_write(b->fp, &h->l_nm, 4);
bgzf_write(b->fp, h->name, h->l_nm);
return 16 + h->l_nm + h->l_smpl + h->l_txt;
}
-int bcf_hdr_cpy(bcf_hdr_t *h, const bcf_hdr_t *h0)
-{
- *h = *h0;
- h->name = malloc(h->l_nm);
- h->sname = malloc(h->l_smpl);
- h->txt = malloc(h->l_txt);
- memcpy(h->name, h0->name, h->l_nm);
- memcpy(h->sname, h0->sname, h->l_smpl);
- memcpy(h->txt, h0->txt, h->l_txt);
- bcf_hdr_sync(h);
- return 0;
-}
-
-int bcf_hdr_read(bcf_t *b)
+bcf_hdr_t *bcf_hdr_read(bcf_t *b)
{
uint8_t magic[4];
bcf_hdr_t *h;
- if (b == 0) return -1;
- bcf_hdr_clear(&b->h);
- h = &b->h;
+ if (b == 0) return 0;
+ h = calloc(1, sizeof(bcf_hdr_t));
bgzf_read(b->fp, magic, 4);
bgzf_read(b->fp, &h->l_nm, 4);
h->name = malloc(h->l_nm);
bgzf_read(b->fp, &h->l_txt, 4);
h->txt = malloc(h->l_txt);
bgzf_read(b->fp, h->txt, h->l_txt);
- bcf_hdr_sync(&b->h);
- return 16 + h->l_nm + h->l_smpl + h->l_txt;
+ bcf_hdr_sync(h);
+ return h;
+}
+
+void bcf_hdr_destroy(bcf_hdr_t *h)
+{
+ if (h == 0) return;
+ free(h->name); free(h->sname); free(h->txt); free(h->ns); free(h->sns);
+ free(h);
}
static inline char **cnt_null(int l, char *str, int *_n)
int bcf_hdr_sync(bcf_hdr_t *b)
{
if (b == 0) return -1;
- b->ns = cnt_null(b->l_nm, b->name, &b->n_ref);
+ if (b->ns) free(b->ns);
+ if (b->sns) free(b->sns);
+ if (b->l_nm) b->ns = cnt_null(b->l_nm, b->name, &b->n_ref);
+ else b->ns = 0, b->n_ref = 0;
b->sns = cnt_null(b->l_smpl, b->sname, &b->n_smpl);
return 0;
}
return 0;
}
-int bcf_write(bcf_t *bp, const bcf1_t *b)
+int bcf_write(bcf_t *bp, const bcf_hdr_t *h, const bcf1_t *b)
{
uint32_t x;
int i, l = 0;
bgzf_write(bp->fp, b->str, b->l_str);
l = 12 + b->l_str;
for (i = 0; i < b->n_gi; ++i) {
- bgzf_write(bp->fp, b->gi[i].data, b->gi[i].len * bp->h.n_smpl);
- l += b->gi[i].len * bp->h.n_smpl;
+ bgzf_write(bp->fp, b->gi[i].data, b->gi[i].len * h->n_smpl);
+ l += b->gi[i].len * h->n_smpl;
}
return l;
}
-int bcf_read(bcf_t *bp, bcf1_t *b)
+int bcf_read(bcf_t *bp, const bcf_hdr_t *h, bcf1_t *b)
{
int i, l = 0;
uint32_t x;
if (b == 0) return -1;
- if (bgzf_read(bp->fp, &b->tid, 4) == 0) return 0;
+ if (bgzf_read(bp->fp, &b->tid, 4) == 0) return -1;
bgzf_read(bp->fp, &b->pos, 4);
bgzf_read(bp->fp, &x, 4);
b->qual = x >> 24; b->l_str = x << 8 >> 8;
}
bgzf_read(bp->fp, b->str, b->l_str);
l = 12 + b->l_str;
- bcf_sync(bp->h.n_smpl, b);
+ bcf_sync(h->n_smpl, b);
for (i = 0; i < b->n_gi; ++i) {
- bgzf_read(bp->fp, b->gi[i].data, b->gi[i].len * bp->h.n_smpl);
- l += b->gi[i].len * bp->h.n_smpl;
+ bgzf_read(bp->fp, b->gi[i].data, b->gi[i].len * h->n_smpl);
+ l += b->gi[i].len * h->n_smpl;
}
return l;
}
else kputs(p, s);
}
-char *bcf_fmt(bcf_t *bp, bcf1_t *b)
+char *bcf_fmt(const bcf_hdr_t *h, bcf1_t *b)
{
kstring_t s;
int i, j, x;
memset(&s, 0, sizeof(kstring_t));
- kputs(bp->h.ns[b->tid], &s); kputc('\t', &s);
+ kputs(h->ns[b->tid], &s); kputc('\t', &s);
kputw(b->pos + 1, &s); kputc('\t', &s);
fmt_str(b->str, &s); kputc('\t', &s);
fmt_str(b->ref, &s); kputc('\t', &s);
fmt_str(b->fmt, &s);
}
x = b->n_alleles * (b->n_alleles + 1) / 2;
- for (j = 0; j < bp->h.n_smpl; ++j) {
+ for (j = 0; j < h->n_smpl; ++j) {
kputc('\t', &s);
for (i = 0; i < b->n_gi; ++i) {
if (i) kputc(':', &s);
#define BCF_H
#include <stdint.h>
+#include <zlib.h>
#include "bgzf.h"
typedef struct {
} bcf_hdr_t;
typedef struct {
+ int is_vcf;
+ void *v;
BGZF *fp;
- bcf_hdr_t h;
} bcf_t;
struct __bcf_idx_t;
bcf_t *bcf_open(const char *fn, const char *mode);
int bcf_close(bcf_t *b);
- int bcf_read(bcf_t *bp, bcf1_t *b);
+ int bcf_read(bcf_t *bp, const bcf_hdr_t *h, bcf1_t *b);
int bcf_sync(int n_smpl, bcf1_t *b);
- int bcf_write(bcf_t *bp, const bcf1_t *b);
- int bcf_hdr_write(bcf_t *b);
+ int bcf_write(bcf_t *bp, const bcf_hdr_t *h, const bcf1_t *b);
+ bcf_hdr_t *bcf_hdr_read(bcf_t *b);
+ int bcf_hdr_write(bcf_t *b, const bcf_hdr_t *h);
int bcf_hdr_sync(bcf_hdr_t *b);
- int bcf_hdr_cpy(bcf_hdr_t *h, const bcf_hdr_t *h0);
+ void bcf_hdr_destroy(bcf_hdr_t *h);
int bcf_destroy(bcf1_t *b);
- char *bcf_fmt(bcf_t *bp, bcf1_t *b);
+ char *bcf_fmt(const bcf_hdr_t *h, bcf1_t *b);
+
+ int vcf_close(bcf_t *bp);
void *bcf_build_refhash(bcf_hdr_t *h);
void bcf_str2id_destroy(void *_hash);
return hash;
}
+void *bcf_str2id_init()
+{
+ return kh_init(str2id);
+}
+
+int bcf_str2id_put(void *_hash, const char *str, int id)
+{
+ return 0;
+}
+
void bcf_str2id_destroy(void *_hash)
{
khash_t(str2id) *hash = (khash_t(str2id)*)_hash;
}
}
-bcf_idx_t *bcf_idx_core(bcf_t *bp)
+bcf_idx_t *bcf_idx_core(bcf_t *bp, bcf_hdr_t *h)
{
bcf_idx_t *idx;
int32_t last_coor, last_tid;
b = calloc(1, sizeof(bcf1_t));
str = calloc(1, sizeof(kstring_t));
idx = (bcf_idx_t*)calloc(1, sizeof(bcf_idx_t));
- idx->n = bp->h.n_ref;
- idx->index2 = calloc(bp->h.n_ref, sizeof(bcf_lidx_t));
+ idx->n = h->n_ref;
+ idx->index2 = calloc(h->n_ref, sizeof(bcf_lidx_t));
last_tid = 0xffffffffu;
last_off = bgzf_tell(fp); last_coor = 0xffffffffu;
- while ((ret = bcf_read(bp, b)) > 0) {
+ while ((ret = bcf_read(bp, h, b)) > 0) {
int end, tmp;
if (last_tid != b->tid) { // change of chromosomes
last_tid = b->tid;
BGZF *fpidx;
bcf_t *bp;
bcf_idx_t *idx;
+ bcf_hdr_t *h;
if ((bp = bcf_open(fn, "r")) == 0) {
fprintf(stderr, "[bcf_idx_build2] fail to open the BAM file.\n");
return -1;
}
- idx = bcf_idx_core(bp);
+ h = bcf_hdr_read(bp);
+ idx = bcf_idx_core(bp, h);
bcf_close(bp);
if (_fnidx == 0) {
fnidx = (char*)calloc(strlen(fn) + 5, 1);
}
if (strcmp(argv[1], "view") == 0) return bcfview(argc-1, argv+1);
if (strcmp(argv[1], "index") == 0) return bcf_main_index(argc-1, argv+1);
+ if (strcmp(argv[1], "test") == 0) return vcf_test(argc-1, argv+1);
else {
fprintf(stderr, "[main] Unrecognized command.\n");
return 1;
--- /dev/null
+#include <zlib.h>
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+#include "bcf.h"
+#include "kstring.h"
+#include "kseq.h"
+KSTREAM_INIT(gzFile, gzread, 4096)
+
+typedef struct {
+ gzFile fp;
+ FILE *fpout;
+ kstream_t *ks;
+ void *refhash;
+ kstring_t line;
+} vcf_t;
+
+bcf_hdr_t *vcf_hdr_read(bcf_t *bp)
+{
+ kstring_t meta, smpl;
+ int dret;
+ vcf_t *v;
+ bcf_hdr_t *h;
+ if (!bp->is_vcf) return 0;
+ h = calloc(1, sizeof(bcf_hdr_t));
+ v = (vcf_t*)bp->v;
+ v->line.l = 0;
+ memset(&meta, 0, sizeof(kstring_t));
+ memset(&smpl, 0, sizeof(kstring_t));
+ while (ks_getuntil(v->ks, '\n', &v->line, &dret) >= 0) {
+ if (v->line.l < 2) continue;
+ if (v->line.s[0] != '#') return 0; // no sample line
+ if (v->line.s[0] == '#' && v->line.s[1] == '#') {
+ kputsn(v->line.s, v->line.l, &meta); kputc('\n', &meta);
+ } else if (v->line.s[0] == '#') {
+ int k;
+ char *p, *q, *r;
+ for (q = v->line.s, p = q + 1, k = 0; *p; ++p) {
+ if (*p == '\t' || *(p+1) == 0) {
+ r = *(p+1) == 0? p+1 : p;
+ if (k >= 9) {
+ kputsn(q, r - q, &smpl);
+ kputc('\0', &smpl);
+ }
+ q = p + 1; ++k;
+ }
+ }
+ break;
+ }
+ }
+ kputc('\0', &meta);
+ h->name = 0;
+ h->sname = smpl.s; h->l_smpl = smpl.l;
+ h->txt = meta.s; h->l_txt = meta.l;
+ bcf_hdr_sync(h);
+ return h;
+}
+
+bcf_t *vcf_open(const char *fn, const char *mode)
+{
+ bcf_t *bp;
+ vcf_t *v;
+ bp = calloc(1, sizeof(bcf_t));
+ v = calloc(1, sizeof(vcf_t));
+ bp->is_vcf = 1;
+ bp->v = v;
+ if (strchr(mode, 'r')) {
+ v->fp = strcmp(fn, "-")? gzopen(fn, "r") : gzdopen(fileno(stdin), "r");
+ v->ks = ks_init(v->fp);
+ } else if (strchr(mode, 'w'))
+ v->fpout = strcmp(fn, "-")? fopen(fn, "w") : fdopen(fileno(stdout), "w");
+ return bp;
+}
+
+void bcf_hdr_clear(bcf_hdr_t *b);
+
+int vcf_close(bcf_t *bp)
+{
+ vcf_t *v;
+ if (bp == 0) return -1;
+ if (bp->v == 0) return -1;
+ v = (vcf_t*)bp->v;
+ if (v->fp) {
+ ks_destroy(v->ks);
+ gzclose(v->fp);
+ }
+ if (v->fpout) fclose(v->fpout);
+ free(v->line.s);
+ free(v);
+ free(bp);
+ return 0;
+}
+
+int vcf_hdr_write(bcf_t *bp, const bcf_hdr_t *h)
+{
+ vcf_t *v = (vcf_t*)bp->v;
+ int i;
+ if (v == 0 || v->fpout == 0) return -1;
+ fwrite(h->txt, 1, h->l_txt, v->fpout);
+ fprintf(v->fpout, "#CHROM\tPOS\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT");
+ for (i = 0; i < h->n_smpl; ++i)
+ fprintf(v->fpout, "\t%s", h->sns[i]);
+ fputc('\n', v->fpout);
+ return 0;
+}
+
+int vcf_read(bcf_t *bp, bcf1_t *b)
+{
+ int dret;
+ vcf_t *v = (vcf_t*)bp->v;
+ v->line.l = 0;
+ if (ks_getuntil(v->ks, '\n', &v->line, &dret) < 0) return -1;
+ return v->line.l + 1;
+}
+
+int vcf_test(int argc, char *argv[])
+{
+ bcf_t *bp, *bpout;
+ bcf_hdr_t *h;
+ bp = vcf_open(argv[1], "r");
+ bpout = vcf_open("-", "w");
+ h = vcf_hdr_read(bpout);
+ vcf_hdr_write(bpout, h);
+ vcf_close(bp);
+ return 0;
+}
uint64_t n_processed = 0;
viewconf_t vc;
bcf_p1aux_t *p1 = 0;
+ bcf_hdr_t *h;
int tid, begin, end;
khash_t(set64) *hash = 0;
b = calloc(1, sizeof(bcf1_t));
bp = bcf_open(argv[optind], "r");
+ h = bcf_hdr_read(bp);
if (vc.flag & VC_BCF) {
bout = bcf_open("-", "w");
- bcf_hdr_cpy(&bout->h, &bp->h);
- bcf_hdr_write(bout);
+ bcf_hdr_write(bout, h);
}
if (vc.flag & VC_CALL) {
- p1 = bcf_p1_init(bp->h.n_smpl);
+ p1 = bcf_p1_init(h->n_smpl);
bcf_p1_init_prior(p1, vc.prior_type, vc.theta);
}
- if (vc.fn_list) hash = bcf_load_pos(vc.fn_list, &bp->h);
+ if (vc.fn_list) hash = bcf_load_pos(vc.fn_list, h);
if (optind + 1 < argc) {
- void *str2id = bcf_build_refhash(&bp->h);
+ void *str2id = bcf_build_refhash(h);
if (bcf_parse_region(str2id, argv[optind+1], &tid, &begin, &end) >= 0) {
bcf_idx_t *idx;
idx = bcf_idx_load(argv[optind]);
}
}
}
- while (bcf_read(bp, b) > 0) {
+ while (bcf_read(bp, h, b) > 0) {
if (hash) {
uint64_t x = (uint64_t)b->tid<<32 | b->pos;
khint_t k = kh_get(set64, hash, x);
int is_var;
bcf_p1_cal(b, p1, &pr);
is_var = update_bcf1(b, p1, &pr, vc.flag);
- bcf_sync(bp->h.n_smpl, b);
+ bcf_sync(h->n_smpl, b);
if ((n_processed + 1) % 50000 == 0) bcf_p1_dump_afs(p1);
if ((vc.flag & VC_VARONLY) && !is_var) continue;
}
- if (vc.flag & VC_BCF) bcf_write(bout, b);
+ if (vc.flag & VC_BCF) bcf_write(bout, h, b);
else {
char *vcf;
if (vc.flag & VC_NO_GENO) {
b->n_gi = 0;
b->fmt[0] = '\0';
}
- vcf = bcf_fmt(bp, b);
+ vcf = bcf_fmt(h, b);
puts(vcf);
free(vcf);
}
++n_processed;
}
if (vc.flag & VC_CALL) bcf_p1_dump_afs(p1);
+ bcf_hdr_destroy(h);
bcf_close(bp); bcf_close(bout);
bcf_destroy(b);
if (hash) kh_destroy(set64, hash);