From f418206d4563377a981d66804873599e4e050842 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Fri, 17 Sep 2010 14:11:37 +0000 Subject: [PATCH] * added comments * VCF->BCF is not possible without knowing the sequence dictionary before hand... --- bcftools/Makefile | 6 ++- bcftools/bcf.c | 8 +--- bcftools/bcf.h | 105 ++++++++++++++++++++++++++++++++++++---------- bcftools/vcf.c | 7 ---- 4 files changed, 88 insertions(+), 38 deletions(-) diff --git a/bcftools/Makefile b/bcftools/Makefile index 9592025..2dbcf59 100644 --- a/bcftools/Makefile +++ b/bcftools/Makefile @@ -34,8 +34,12 @@ bcftools:lib $(AOBJS) $(CC) $(CFLAGS) -o $@ $(AOBJS) -lm $(LIBPATH) -lz -L. -lbcf bcf.o:bcf.h +vcf.o:bcf.h +index.o:bcf.h +bcfutils.o:bcf.h prob1.o:prob1.h bcf.h -vcfout.o:bcf.h prob1.h +call1.o:prob1.h bcf.h +bcf2qcall.o:bcf.h main.o:bcf.h bcf.pdf:bcf.tex diff --git a/bcftools/bcf.c b/bcftools/bcf.c index 2fd1696..feeda57 100644 --- a/bcftools/bcf.c +++ b/bcftools/bcf.c @@ -4,12 +4,6 @@ #include "kstring.h" #include "bcf.h" -void bcf_hdr_clear(bcf_hdr_t *b) -{ - free(b->name); free(b->sname); free(b->txt); free(b->ns); free(b->sns); - memset(b, 0, sizeof(bcf_hdr_t)); -} - bcf_t *bcf_open(const char *fn, const char *mode) { bcf_t *b; @@ -110,7 +104,7 @@ int bcf_sync(bcf1_t *b) for (p = b->str, n = 0; p < b->str + b->l_str; ++p) if (*p == 0 && p+1 != b->str + b->l_str) tmp[n++] = p + 1; if (n != 5) { - fprintf(stderr, "[bcf_sync] incorrect number of fields (%d != 5)\n", n); + fprintf(stderr, "[%s] incorrect number of fields (%d != 5). Corrupted file?\n", __func__, n); return -1; } b->ref = tmp[0]; b->alt = tmp[1]; b->flt = tmp[2]; b->info = tmp[3]; b->fmt = tmp[4]; diff --git a/bcftools/bcf.h b/bcftools/bcf.h index 92b2c03..7286111 100644 --- a/bcftools/bcf.h +++ b/bcftools/bcf.h @@ -1,3 +1,30 @@ +/* The MIT License + + Copyright (c) 2010 Broad Institute + + Permission is hereby granted, free of charge, to any person obtaining + a copy of this software and associated documentation files (the + "Software"), to deal in the Software without restriction, including + without limitation the rights to use, copy, modify, merge, publish, + distribute, sublicense, and/or sell copies of the Software, and to + permit persons to whom the Software is furnished to do so, subject to + the following conditions: + + The above copyright notice and this permission notice shall be + included in all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + SOFTWARE. +*/ + +/* Contact: Heng Li */ + #ifndef BCF_H #define BCF_H @@ -5,38 +32,49 @@ #include #include "bgzf.h" +/* + A member in the structs below is said to "primary" if its content + cannot be inferred from other members in any of structs below; a + member is said to be "derived" if its content can be derived from + other members. For example, bcf1_t::str is primary as this comes from + the input data, while bcf1_t::info is derived as it can always be + correctly set if we know bcf1_t::str. Derived members are for quick + access to the content and must be synchronized with the primary data. + */ + typedef struct { - uint32_t fmt; - int len; // len is the unit length - void *data; - // derived info: fmt, len + uint32_t fmt; // format of the block, set by bcf_str2int(). + int len; // length of data for each individual + void *data; // concatenated data + // derived info: fmt, len (<-bcf1_t::fmt) } bcf_ginfo_t; typedef struct { - int32_t tid, pos; - int32_t l_str, m_str; - float qual; - char *str, *ref, *alt, *flt, *info, *fmt; // fmt, ref, alt and info point to str - int n_gi, m_gi; - bcf_ginfo_t *gi; - int n_alleles, n_smpl; - // derived info: ref, alt, flt, info, fmt, n_gi, n_alleles + int32_t tid, pos; // refID and 0-based position + int32_t l_str, m_str; // length and the allocated size of ->str + float qual; // SNP quality + char *str; // concatenated string of variable length strings in VCF (from col.2 to col.7) + char *ref, *alt, *flt, *info, *fmt; // they all point to ->str; no memory allocation + int n_gi, m_gi; // number and the allocated size of geno fields + bcf_ginfo_t *gi; // array of geno fields + int n_alleles, n_smpl; // number of alleles and samples + // derived info: ref, alt, flt, info, fmt (<-str), n_gi (<-fmt), n_alleles (<-alt), n_smpl (<-bcf_hdr_t::n_smpl) } bcf1_t; typedef struct { - int32_t n_ref, n_smpl; - int32_t l_nm; - int32_t l_smpl; - int32_t l_txt; - char *name, *sname, *txt; - char **ns, **sns; - // derived info: n_ref, n_smpl, ns, sns + int32_t n_ref, n_smpl; // number of reference sequences and samples + int32_t l_nm; // length of concatenated sequence names; 0 padded + int32_t l_smpl; // length of concatenated sample names; 0 padded + int32_t l_txt; // length of header text (lines started with ##) + char *name, *sname, *txt; // concatenated sequence names, sample names and header text + char **ns, **sns; // array of sequence and sample names; point to name and sname, respectively + // derived info: n_ref (<-name), n_smpl (<-sname), ns (<-name), sns (<-sname) } bcf_hdr_t; typedef struct { - int is_vcf; - void *v; - BGZF *fp; + int is_vcf; // if the file in operation is a VCF + void *v; // auxillary data structure for VCF + BGZF *fp; // file handler for BCF } bcf_t; struct __bcf_idx_t; @@ -46,34 +84,55 @@ typedef struct __bcf_idx_t bcf_idx_t; extern "C" { #endif + // open a BCF file; for BCF file only bcf_t *bcf_open(const char *fn, const char *mode); + // close file int bcf_close(bcf_t *b); + // read one record from BCF; return -1 on end-of-file, and <-1 for errors int bcf_read(bcf_t *bp, const bcf_hdr_t *h, bcf1_t *b); + // call this function if b->str is changed int bcf_sync(bcf1_t *b); + // write a BCF record int bcf_write(bcf_t *bp, const bcf_hdr_t *h, const bcf1_t *b); + // read the BCF header; BCF only bcf_hdr_t *bcf_hdr_read(bcf_t *b); + // write the BCF header int bcf_hdr_write(bcf_t *b, const bcf_hdr_t *h); + // set bcf_hdr_t::ns and bcf_hdr_t::sns int bcf_hdr_sync(bcf_hdr_t *b); + // destroy the header void bcf_hdr_destroy(bcf_hdr_t *h); + // destroy a record int bcf_destroy(bcf1_t *b); + // BCF->VCF conversion char *bcf_fmt(const bcf_hdr_t *h, bcf1_t *b); + // open a VCF or BCF file if "b" is set in "mode" bcf_t *vcf_open(const char *fn, const char *mode); + // close a VCF/BCF file int vcf_close(bcf_t *bp); + // read the VCF/BCF header bcf_hdr_t *vcf_hdr_read(bcf_t *bp); + // read a VCF/BCF record; return -1 on end-of-file and <-1 for errors + int vcf_read(bcf_t *bp, bcf_hdr_t *h, bcf1_t *b); + // write the VCF header int vcf_hdr_write(bcf_t *bp, const bcf_hdr_t *h); + // write a VCF record int vcf_write(bcf_t *bp, bcf_hdr_t *h, bcf1_t *b); - int vcf_read(bcf_t *bp, bcf_hdr_t *h, bcf1_t *b); + // keep the first n alleles and discard the rest int bcf_shrink_alt(bcf1_t *b, int n); + // convert GL to PL int bcf_gl2pl(bcf1_t *b); + // string hash table void *bcf_build_refhash(bcf_hdr_t *h); void bcf_str2id_destroy(void *_hash); int bcf_str2id_add(void *_hash, const char *str); int bcf_str2id(void *_hash, const char *str); void *bcf_str2id_init(); + // indexing related functions int bcf_idx_build(const char *fn); uint64_t bcf_idx_query(const bcf_idx_t *idx, int tid, int beg); int bcf_parse_region(void *str2id, const char *str, int *tid, int *begin, int *end); diff --git a/bcftools/vcf.c b/bcftools/vcf.c index 51ccb74..ebca869 100644 --- a/bcftools/vcf.c +++ b/bcftools/vcf.c @@ -102,13 +102,6 @@ int vcf_hdr_write(bcf_t *bp, const bcf_hdr_t *h) if (strstr(h->txt, "##SQ=")) has_ref = 1; } if (has_ver == 0) fprintf(v->fpout, "##fileformat=VCFv4.0\n"); - if (!has_ref) { - fprintf(v->fpout, "##SQ="); - for (i = 0; i < h->n_ref; ++i) { - fprintf(v->fpout, "%s", h->ns[i]); - fputc(i == h->n_ref - 1? '\n' : ',', v->fpout); - } - } fprintf(v->fpout, "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT"); for (i = 0; i < h->n_smpl; ++i) fprintf(v->fpout, "\t%s", h->sns[i]); -- 2.39.5