3 Copyright (c) 2010 Broad Institute
5 Permission is hereby granted, free of charge, to any person obtaining
6 a copy of this software and associated documentation files (the
7 "Software"), to deal in the Software without restriction, including
8 without limitation the rights to use, copy, modify, merge, publish,
9 distribute, sublicense, and/or sell copies of the Software, and to
10 permit persons to whom the Software is furnished to do so, subject to
11 the following conditions:
13 The above copyright notice and this permission notice shall be
14 included in all copies or substantial portions of the Software.
16 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
17 EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
18 MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
19 NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
20 BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
21 ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
22 CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
26 /* Contact: Heng Li <lh3@live.co.uk> */
36 A member in the structs below is said to be "primary" if its content
37 cannot be inferred from other members in any of the structs below; a
38 member is said to be "derived" if its content can be derived from
39 other members. For example, bcf1_t::str is primary as this comes from
40 the input data, while bcf1_t::info is derived as it can always be
41 correctly set if we know bcf1_t::str. Derived members are for quick
42 access to the content and must be synchronized with the primary data.
// Members describing one block of per-individual data: a format code, the
// per-individual length, and the concatenated payload.
// NOTE(review): the enclosing struct declaration is not visible in this view;
// presumably this is bcf_ginfo_t (bcf1_t::gi is an array of that type) -- confirm.
46 uint32_t fmt; // format of the block, set by bcf_str2int().
47 int len; // length of data for each individual
48 void *data; // concatenated data; untyped here -- NOTE(review): element layout presumably depends on fmt, confirm
49 // derived info: fmt, len (<-bcf1_t::fmt)
// Members of a single record: coordinates, quality, the packed string
// columns, and the per-sample genotype blocks.
// NOTE(review): the enclosing struct declaration is not visible in this view;
// the surrounding comments refer to these members as bcf1_t::str, bcf1_t::info, etc.
53 int32_t tid, pos; // refID and 0-based position
54 int32_t l_str, m_str; // length and the allocated size of ->str
55 float qual; // SNP quality
56 char *str; // concatenated string of variable length strings in VCF (from col.2 to col.7)
// The five pointers below alias into ->str and own no memory; per the API
// comments, bcf_sync() is the function to call after ->str is changed.
57 char *ref, *alt, *flt, *info, *fmt; // they all point to ->str; no memory allocation
58 int n_gi, m_gi; // number and the allocated size of geno fields
59 bcf_ginfo_t *gi; // array of geno fields
60 int n_alleles, n_smpl; // number of alleles and samples
61 // derived info: ref, alt, flt, info, fmt (<-str), n_gi (<-fmt), n_alleles (<-alt), n_smpl (<-bcf_hdr_t::n_smpl)
// Header members: counts, the lengths of three concatenated text blobs, the
// blobs themselves, and per-name pointer arrays into two of them.
// NOTE(review): the enclosing struct declaration is not visible in this view;
// presumably this is bcf_hdr_t (bcf1_t's comments cite bcf_hdr_t::n_smpl) -- confirm.
65 int32_t n_ref, n_smpl; // number of reference sequences and samples
66 int32_t l_nm; // length of concatenated sequence names; 0 padded
67 int32_t l_smpl; // length of concatenated sample names; 0 padded
68 int32_t l_txt; // length of header text (lines started with ##)
69 char *name, *sname, *txt; // concatenated sequence names, sample names and header text
// ns/sns hold pointers into name/sname rather than copies; per the API
// comments, bcf_hdr_sync() is the function that sets them.
70 char **ns, **sns; // array of sequence and sample names; point to name and sname, respectively
71 // derived info: n_ref (<-name), n_smpl (<-sname), ns (<-name), sns (<-sname)
// State for one open file: a VCF/BCF flag, VCF-side auxiliary data, and the
// BGZF handle used for BCF.
// NOTE(review): the enclosing struct declaration is not visible in this view;
// presumably this is bcf_t, the handle type returned by bcf_open()/vcf_open() -- confirm.
75 int is_vcf; // if the file in operation is a VCF
76 void *v; // auxiliary data structure for VCF
77 BGZF *fp; // file handle for BCF
// Index type, declared here only as a forward typedef of struct __bcf_idx_t;
// no member layout is visible in this view, and the index functions below
// take and return it by pointer.
81 typedef struct __bcf_idx_t bcf_idx_t;
// BCF-level API: open/close, record and header I/O, and record utilities.
87 // open a BCF file; for BCF file only
88 bcf_t *bcf_open(const char *fn, const char *mode);
// NOTE(review): undocumented in this view; presumably closes a handle obtained from bcf_open() -- confirm return convention
90 int bcf_close(bcf_t *b);
91 // read one record from BCF; return -1 on end-of-file, and <-1 for errors
92 int bcf_read(bcf_t *bp, const bcf_hdr_t *h, bcf1_t *b);
93 // call this function if b->str is changed
94 int bcf_sync(bcf1_t *b);
// NOTE(review): undocumented in this view; presumably writes one record to a BCF file (header and record are both const) -- confirm
96 int bcf_write(bcf_t *bp, const bcf_hdr_t *h, const bcf1_t *b);
97 // read the BCF header; BCF only
98 bcf_hdr_t *bcf_hdr_read(bcf_t *b);
99 // write the BCF header
100 int bcf_hdr_write(bcf_t *b, const bcf_hdr_t *h);
101 // set bcf_hdr_t::ns and bcf_hdr_t::sns
102 int bcf_hdr_sync(bcf_hdr_t *b);
103 // destroy the header
104 void bcf_hdr_destroy(bcf_hdr_t *h);
// NOTE(review): undocumented in this view; presumably releases a record -- confirm whether b itself is freed and what the int result means
106 int bcf_destroy(bcf1_t *b);
107 // BCF->VCF conversion
108 char *bcf_fmt(const bcf_hdr_t *h, bcf1_t *b);
// NOTE(review): undocumented in this view; presumably appends l bytes of info to the record's INFO string -- confirm
110 int bcf_append_info(bcf1_t *b, const char *info, int l);
// NOTE(review): undocumented in this view; the const qualifier marks b as the source and r as the destination -- confirm copy depth
112 int bcf_cpy(bcf1_t *r, const bcf1_t *b);
// VCF-or-BCF API: the same operations as the bcf_* calls, for handles that
// may be either format. Unlike bcf_read()/bcf_write(), vcf_read() and
// vcf_write() take a non-const header pointer.
// NOTE(review): the reason for the non-const header is not visible here.
114 // open a VCF or BCF file if "b" is set in "mode"
115 bcf_t *vcf_open(const char *fn, const char *mode);
116 // close a VCF/BCF file
117 int vcf_close(bcf_t *bp);
118 // read the VCF/BCF header
119 bcf_hdr_t *vcf_hdr_read(bcf_t *bp);
120 // read a VCF/BCF record; return -1 on end-of-file and <-1 for errors
121 int vcf_read(bcf_t *bp, bcf_hdr_t *h, bcf1_t *b);
122 // write the VCF header
123 int vcf_hdr_write(bcf_t *bp, const bcf_hdr_t *h);
124 // write a VCF record
125 int vcf_write(bcf_t *bp, bcf_hdr_t *h, bcf1_t *b);
// Record-editing helpers and a string-to-id hash passed around as an opaque void *.
127 // keep the first n alleles and discard the rest
128 int bcf_shrink_alt(bcf1_t *b, int n);
// NOTE(review): undocumented in this view; name suggests a GL -> PL genotype-field conversion -- confirm
130 int bcf_gl2pl(bcf1_t *b);
// NOTE(review): undocumented in this view; presumably builds a hash over the header's reference names, usable with the bcf_str2id_* calls -- confirm
133 void *bcf_build_refhash(bcf_hdr_t *h);
// The hash is handled as an untyped pointer (_hash); its concrete type is not visible here.
134 void bcf_str2id_destroy(void *_hash);
135 int bcf_str2id_add(void *_hash, const char *str);
136 int bcf_str2id(void *_hash, const char *str);
// NOTE(review): the empty parameter list leaves the arguments unspecified in C before C23; `(void)` would let the compiler check calls
137 void *bcf_str2id_init();
139 // indexing related functions
// NOTE(review): the individual functions are undocumented in this view; the notes below are inferred from signatures only.
// Takes a file name; presumably builds an index for that file -- confirm where the index is written.
140 int bcf_idx_build(const char *fn);
// Takes a read-only index plus a reference id and start position and returns a 64-bit value -- presumably a file offset, confirm.
141 uint64_t bcf_idx_query(const bcf_idx_t *idx, int tid, int beg);
// Parses a region string into *tid, *begin and *end; str2id is an untyped pointer, presumably a bcf_str2id hash -- confirm.
142 int bcf_parse_region(void *str2id, const char *str, int *tid, int *begin, int *end);
// Returns an index pointer for file name fn; presumably released with bcf_idx_destroy() -- confirm.
143 bcf_idx_t *bcf_idx_load(const char *fn);
144 void bcf_idx_destroy(bcf_idx_t *idx);
150 static inline uint32_t bcf_str2int(const char *str, int l)
154 for (i = 0; i < l && i < 4; ++i) {
155 if (str[i] == 0) return x;