CC= gcc
-CFLAGS= -g -Wall -O2 #-m64 #-arch ppc
+CFLAGS= -g -Wall #-O2 #-m64 #-arch ppc
DFLAGS= -D_FILE_OFFSET_BITS=64 -D_USE_KNETFILE -D_CURSES_LIB=1
KNETFILE_O= knetfile.o
LOBJS= bgzf.o kstring.o bam_aux.o bam.o bam_import.o sam.o bam_index.o \
$(KNETFILE_O) bam_sort.o sam_header.o bam_reheader.o
AOBJS= bam_tview.o bam_maqcns.o bam_plcmd.o sam_view.o \
bam_rmdup.o bam_rmdupse.o bam_mate.o bam_stat.o bam_color.o \
- bamtk.o kaln.o bam_mcns.o
+ bamtk.o kaln.o bam_mcns.o bam2bcf.o bcf.o
PROG= samtools
INCLUDES=
SUBDIRS= . misc
sam.o:sam.h bam.h
bam_import.o:bam.h kseq.h khash.h razf.h
bam_pileup.o:bam.h razf.h ksort.h
-bam_plcmd.o:bam.h faidx.h bam_maqcns.h glf.h
+bam_plcmd.o:bam.h faidx.h bam_maqcns.h glf.h bam_mcns.h bcf.h bam2bcf.h
bam_index.o:bam.h khash.h ksort.h razf.h bam_endian.h
bam_lpileup.o:bam.h ksort.h
bam_tview.o:bam.h faidx.h bam_maqcns.h
bam_md.o:bam.h faidx.h
glf.o:glf.h
sam_header.o:sam_header.h khash.h
+bcf.o:bcf.h
+bam2bcf.o:bam2bcf.h bcf.h
faidx.o:faidx.h razf.h khash.h
faidx_main.o:faidx.h razf.h
--- /dev/null
+#include <math.h>
+#include <stdint.h>
+#include "bam.h"
+#include "kstring.h"
+#include "bam2bcf.h"
+#include "bcf.h"
+
+extern void ks_introsort_uint32_t(size_t n, uint32_t a[]);
+
+#define CALL_ETA 0.03f
+#define CALL_MAX 256
+#define CALL_DEFTHETA 0.85f
+
+struct __bcf_callaux_t {
+ int max_info, capQ;
+ double *fk;
+ uint32_t *info;
+};
+
+bcf_callaux_t *bcf_call_init(double theta)
+{
+ bcf_callaux_t *bca;
+ int n;
+ if (theta <= 0.) theta = CALL_DEFTHETA;
+ bca = calloc(1, sizeof(bcf_callaux_t));
+ bca->capQ = 60;
+ bca->fk = calloc(CALL_MAX, sizeof(double));
+ bca->fk[0] = 1.;
+ for (n = 1; n < CALL_MAX; ++n)
+ bca->fk[n] = theta >= 1.? 1. : pow(theta, n) * (1.0 - CALL_ETA) + CALL_ETA;
+ bca->fk[CALL_MAX-1] = 0.;
+ return bca;
+}
+
+void bcf_call_destroy(bcf_callaux_t *bca)
+{
+ if (bca) {
+ free(bca->info); free(bca->fk); free(bca);
+ }
+}
+
+typedef struct {
+ float esum[4], fsum[4];
+ uint32_t c[4];
+ int w[8];
+} auxaux_t;
+
+/*
+ The following code is nearly identical to bam_maqcns_glfgen() under
+ the simplified SOAPsnp model. It does the following:
+
+ 1) Collect strand, base, quality and mapping quality information for
+ each base and combine them in an integer:
+
+ x = min{baseQ,mapQ}<<24 | 1<<21 | strand<<18 | base<<16 | baseQ<<8 | mapQ
+
+ 2) Sort the list of integers for the next step.
+
+ 3) For each base, calculate e_b, the sum of weighted qualities. For
+ each type of base on each strand, the best quality has the highest
+ weight. Only the top 255 bases on each strand are used (different
+ from maqcns).
+
+ 4) Rescale the total read depth to 255.
+
+ 5) Calculate Q(D|g) = -10\log_{10}P(D|g) (d_a is the allele count):
+
+ Q(D|<aa>)=\sum_{b\not=a}e_b
+ Q(D|<aA>)=3*(d_a+d_A)+\sum_{b\not=a,b\not=A}e_b
+ */
+int bcf_call_glfgen(int _n, const bam_pileup1_t *pl, int ref_base /*4-bit*/, bcf_callaux_t *bca, bcf_callret1_t *r)
+{
+ int i, j, k, c, n;
+ float *p = r->p;
+ auxaux_t aux;
+
+ memset(r, 0, sizeof(bcf_callret1_t));
+ if (_n == 0) return -1;
+
+ // enlarge the aux array if necessary
+ if (bca->max_info < _n) {
+ bca->max_info = _n;
+ kroundup32(bca->max_info);
+ bca->info = (uint32_t*)realloc(bca->info, 4 * bca->max_info);
+ }
+ // fill the aux array
+ for (i = n = 0; i < _n; ++i) {
+ const bam_pileup1_t *p = pl + i;
+ uint32_t q, x = 0, qq;
+ if (p->is_del || (p->b->core.flag&BAM_FUNMAP)) continue; // skip unmapped reads and deleted bases
+ q = (uint32_t)bam1_qual(p->b)[p->qpos]; // base quality
+ x |= (uint32_t)bam1_strand(p->b) << 18 | q << 8 | p->b->core.qual;
+ if (p->b->core.qual < q) q = p->b->core.qual; // cap the overall quality at mapping quality
+ x |= q << 24;
+ qq = bam1_seqi(bam1_seq(p->b), p->qpos); // base
+ q = bam_nt16_nt4_table[qq? qq : ref_base]; // q is the 2-bit base
+ if (q < 4) x |= 1 << 21 | q << 16;
+
+ bca->info[n++] = x;
+ }
+ ks_introsort_uint32_t(n, bca->info);
+ r->depth = n;
+ // generate esum and fsum
+ memset(&aux, 0, sizeof(auxaux_t));
+ for (j = n - 1, r->sum_Q2 = 0; j >= 0; --j) { // calculate esum and fsum
+ uint32_t info = bca->info[j];
+ int tmp;
+ if (info>>24 < 4 && (info>>8&0x3f) != 0) info = 4<<24 | (info&0xffffff);
+ k = info>>16&7;
+ if (info>>24 > 0) {
+ aux.esum[k&3] += bca->fk[aux.w[k]] * (info>>24);
+ aux.fsum[k&3] += bca->fk[aux.w[k]];
+ if (aux.w[k] + 1 < CALL_MAX) ++aux.w[k];
+ ++aux.c[k&3];
+ }
+ tmp = (int)(info&0xff) < bca->capQ? (int)(info&0xff) : bca->capQ;
+ r->sum_Q2 += tmp * tmp;
+ }
+ memcpy(r->esum, aux.esum, 4 * sizeof(float));
+ // rescale ->c[]
+ for (j = c = 0; j != 4; ++j) c += aux.c[j];
+ if (c > 255) {
+ for (j = 0; j != 4; ++j) aux.c[j] = (int)(254.0 * aux.c[j] / c + 0.499);
+ for (j = c = 0; j != 4; ++j) c += aux.c[j];
+ }
+ // generate likelihood
+ for (j = 0; j != 4; ++j) {
+ float tmp;
+ // homozygous
+ for (k = 0, tmp = 0.0; k != 4; ++k)
+ if (j != k) tmp += aux.esum[k];
+ p[j<<2|j] = tmp; // anything that is not j
+ // heterozygous
+ for (k = j + 1; k < 4; ++k) {
+ for (i = 0, tmp = 0.0; i != 4; ++i)
+ if (i != j && i != k) tmp += aux.esum[i];
+ p[j<<2|k] = p[k<<2|j] = 3.01 * (aux.c[j] + aux.c[k]) + tmp;
+ }
+ }
+ return 0;
+}
+
+/*
+ 1) Find the top 2 bases (from esum[4]).
+
+ 2) If the reference base is among the top 2, consider the locus is
+ potentially biallelic and set call->a[2] as -1; otherwise, the
+ locus is potentially triallelic. If the reference is ambiguous,
+ take the weakest call as the pseudo-reference.
+ */
+int bcf_call_combine(int n, const bcf_callret1_t *calls, int ref_base /*4-bit*/, bcf_call_t *call)
+{
+ int ref4, i, j;
+ int64_t sum[4], tmp;
+ call->ori_ref = ref4 = bam_nt16_nt4_table[ref_base];
+ if (ref4 > 4) ref4 = 4;
+ { // calculate esum
+ double esum[4];
+ memset(esum, 0, sizeof(double) * 4);
+ for (i = 0; i < n; ++i) {
+ for (j = 0; j < 4; ++j)
+ esum[j] += calls[i].esum[j];
+ }
+ for (j = 0; j < 4; ++j)
+ sum[j] = (int)(esum[j] * 100. + .499) << 2 | j;
+ }
+ // find the top 2 alleles
+ for (i = 1; i < 4; ++i) // insertion sort
+ for (j = i; j > 0 && sum[j] < sum[j-1]; --j)
+ tmp = sum[j], sum[j] = sum[j-1], sum[j-1] = tmp;
+ call->a[0] = sum[3]&3; call->a[1] = sum[2]&3; call->a[2] = -1;
+ // set the reference allele and alternative allele(s)
+ if (ref4 != call->a[0]) { // ref is not the best
+ if (ref4 < 4) { // not ambiguous
+ if (ref4 == call->a[1]) tmp = call->a[0], call->a[0] = call->a[1], call->a[1] = tmp; // switch 0 and 1
+ else call->a[2] = call->a[1], call->a[1] = call->a[0], call->a[0] = ref4; // triallele
+ }
+ }
+ // set the PL array
+ if (call->n < n) {
+ call->n = n;
+ call->PL = realloc(call->PL, 6 * n);
+ }
+ {
+ int x, g[6], z;
+ double sum_min = 0.;
+ call->n_alleles = call->a[2] < 0? 2 : 3;
+ x = call->n_alleles * (call->n_alleles + 1) / 2;
+ // get the possible genotypes
+ for (i = z = 0; i < call->n_alleles; ++i)
+ for (j = i; j < call->n_alleles; ++j)
+ g[z++] = call->a[i]<<2 | call->a[j];
+ for (i = 0; i < n; ++i) {
+ uint8_t *PL = call->PL + x * i;
+ const bcf_callret1_t *r = calls + i;
+ float min = 1e37;
+ for (j = 0; j < x; ++j)
+ if (min > r->p[g[j]]) min = r->p[g[j]];
+ sum_min += min;
+ for (j = 0; j < x; ++j) {
+ int y;
+ y = (int)(r->p[g[j]] - min + .499);
+ if (y > 255) y = 255;
+ PL[j] = y;
+ }
+ }
+ call->shift = (int)(sum_min + .499);
+ }
+ for (i = call->depth = 0, tmp = 0; i < n; ++i) {
+ call->depth += calls[i].depth;
+ tmp += calls[i].sum_Q2;
+ }
+ call->rmsQ = (int)(sqrt((double)tmp / call->depth) + .499);
+ return 0;
+}
+
+int bcf_call2bcf(int tid, int pos, bcf_call_t *bc, bcf1_t *b)
+{
+ kstring_t s;
+ int i, beg;
+ b->tid = tid; b->pos = pos; b->qual = 0;
+ s.s = b->str; s.m = b->m_str; s.l = 0;
+ kputc('\0', &s);
+ kputc("ACGTN"[bc->ori_ref], &s); kputc('\0', &s);
+ beg = bc->ori_ref > 3? 0 : 1;
+ for (i = beg; i < 4; ++i) {
+ if (bc->a[i] < 0) break;
+ if (i > beg) kputc(',', &s);
+ kputc("ACGT"[bc->a[i]], &s);
+ }
+ kputc('\0', &s);
+ kputc('\0', &s);
+ kputs("MQ=", &s); kputw(bc->rmsQ, &s); kputc('\0', &s);
+ kputs("PL", &s); kputc('\0', &s);
+ b->m_str = s.m; b->str = s.s; b->l_str = s.l;
+ bcf_sync(bc->n, b);
+ memcpy(b->gi[0].data, bc->PL, b->gi[0].len * bc->n);
+ return 0;
+}
--- /dev/null
+#ifndef BAM2BCF_H
+#define BAM2BCF_H
+
+#include <stdint.h>
+#include "bcf.h"
+
+struct __bcf_callaux_t;
+typedef struct __bcf_callaux_t bcf_callaux_t;
+
+typedef struct {
+ int depth;
+ uint64_t sum_Q2;
+ float p[16], esum[4];
+} bcf_callret1_t;
+
+typedef struct {
+ int a[4]; // alleles: ref, alt, alt2, alt3
+ int n, n_alleles, shift, ori_ref;
+ int depth, rmsQ;
+ uint8_t *PL;
+} bcf_call_t;
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+ bcf_callaux_t *bcf_call_init(double theta);
+ void bcf_call_destroy(bcf_callaux_t *bca);
+ int bcf_call_glfgen(int _n, const bam_pileup1_t *pl, int ref_base /*4-bit*/, bcf_callaux_t *bca, bcf_callret1_t *r);
+ int bcf_call_combine(int n, const bcf_callret1_t *calls, int ref_base /*4-bit*/, bcf_call_t *call);
+ int bcf_call2bcf(int tid, int pos, bcf_call_t *bc, bcf1_t *b);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
#include "faidx.h"
#include "bam_maqcns.h"
#include "bam_mcns.h"
+#include "bam2bcf.h"
#include "khash.h"
#include "glf.h"
#include "kstring.h"
#define MPLP_VCF 0x1
#define MPLP_VAR 0x2
#define MPLP_AFALL 0x8
+#define MPLP_GLF 0x10
#define MPLP_AFS_BLOCK 0x10000
static int mpileup(mplp_conf_t *conf, int n, char **fn)
{
mplp_aux_t **data;
- mc_aux_t *ma = 0;
int i, tid, pos, *n_plp, beg0 = 0, end0 = 1u<<29, ref_len, ref_tid;
const bam_pileup1_t **plp;
bam_mplp_t iter;
uint64_t N = 0;
char *ref;
khash_t(64) *hash = 0;
- // allocate
+
+ mc_aux_t *ma = 0;
+
+ bcf_callaux_t *bca = 0;
+ bcf_callret1_t *bcr = 0;
+ bcf_call_t bc;
+ bcf_t *bp = 0;
+
+ memset(&bc, 0, sizeof(bcf_call_t));
data = calloc(n, sizeof(void*));
plp = calloc(n, sizeof(void*));
n_plp = calloc(n, sizeof(int*));
+
// read the header and initialize data
for (i = 0; i < n; ++i) {
bam_header_t *h_tmp;
}
if (conf->fn_pos) hash = load_pos(conf->fn_pos, h);
// write the VCF header
- if (conf->flag & MPLP_VCF) {
+ if (conf->flag & MPLP_GLF) {
+ kstring_t s;
+ s.l = s.m = 0; s.s = 0;
+ bp = bcf_open("-", "w");
+ for (i = 0; i < h->n_targets; ++i) {
+ kputs(h->target_name[i], &s);
+ kputc('\0', &s);
+ }
+ bp->h.l_nm = s.l;
+ bp->h.name = malloc(s.l);
+ memcpy(bp->h.name, s.s, s.l);
+ s.l = 0;
+ for (i = 0; i < n; ++i) {
+ const char *p;
+ if ((p = strstr(fn[i], ".bam")) != 0)
+ kputsn(fn[i], p - fn[i], &s);
+ else kputs(fn[i], &s);
+ kputc('\0', &s);
+ }
+ bp->h.l_smpl = s.l;
+ bp->h.sname = malloc(s.l);
+ memcpy(bp->h.sname, s.s, s.l);
+ bp->h.l_txt = 0;
+ free(s.s);
+ bcf_hdr_sync(&bp->h);
+ bcf_hdr_write(bp);
+ } else if (conf->flag & MPLP_VCF) {
kstring_t s;
s.l = s.m = 0; s.s = 0;
puts("##fileformat=VCFv4.0");
free(s.s);
}
// mpileup
- if (conf->flag & MPLP_VCF) {
+ if (conf->flag & MPLP_GLF) {
+ bca = bcf_call_init(-1.);
+ bcr = calloc(n, sizeof(bcf_callret1_t));
+ } else if (conf->flag & MPLP_VCF) {
ma = mc_init(n);
mc_init_prior(ma, conf->prior_type, conf->theta);
}
if (conf->fai) ref = fai_fetch(conf->fai, h->target_name[tid], &ref_len);
ref_tid = tid;
}
- if (conf->flag & MPLP_VCF) {
+ if (conf->flag & MPLP_GLF) {
+ int _ref0, ref16;
+ bcf1_t *b = calloc(1, sizeof(bcf1_t));
+ _ref0 = (ref && pos < ref_len)? ref[pos] : 'N';
+ ref16 = bam_nt16_table[_ref0];
+ for (i = 0; i < n; ++i)
+ bcf_call_glfgen(n_plp[i], plp[i], ref16, bca, bcr + i);
+ bcf_call_combine(n, bcr, ref16, &bc);
+ bcf_call2bcf(tid, pos, &bc, b);
+ bcf_write(bp, b);
+ //fprintf(stderr, "%d,%d,%d\n", b->tid, b->pos, b->l_str);
+ bcf_destroy(b);
+ } else if (conf->flag & MPLP_VCF) {
mc_rst_t r;
int j, _ref0, depth, rms_q, _ref0b, is_var = 0, qref = 0, level = 2, tot;
uint64_t sqr_sum;
putchar('\n');
}
}
+ bcf_close(bp);
if (conf->flag&MPLP_VCF) mc_dump_afs(ma);
if (hash) { // free the hash table
khint_t k;
if (kh_exist(hash, k)) free(kh_val(hash, k));
kh_destroy(64, hash);
}
+ bcf_call_destroy(bca); free(bc.PL); free(bcr);
mc_destroy(ma);
bam_mplp_destroy(iter);
bam_header_destroy(h);
mplp.max_mq = 60;
mplp.prior_type = MC_PTYPE_FULL;
mplp.theta = 1e-3;
- while ((c = getopt(argc, argv, "vVcFSP:f:r:l:VM:q:t:")) >= 0) {
+ while ((c = getopt(argc, argv, "gvVcFSP:f:r:l:VM:q:t:")) >= 0) {
switch (c) {
case 't': mplp.theta = atof(optarg); break;
case 'P':
break;
case 'r': mplp.reg = strdup(optarg); break;
case 'l': mplp.fn_pos = strdup(optarg); break;
+ case 'g': mplp.flag |= MPLP_GLF; break;
case 'V':
case 'c': mplp.flag |= MPLP_VCF; break;
case 'F': mplp.flag |= MPLP_AFALL; break;
case 'q': mplp.min_mq = atoi(optarg); break;
}
}
+ if (mplp.flag&MPLP_GLF) mplp.flag &= ~MPLP_VCF;
if (argc == 1) {
fprintf(stderr, "\n");
fprintf(stderr, "Usage: samtools mpileup [options] in1.bam [in2.bam [...]]\n\n");
fprintf(stderr, " -t FLOAT scaled mutation rate [%lg]\n", mplp.theta);
fprintf(stderr, " -P STR prior: full, flat, cond2 [full]\n");
fprintf(stderr, " -c generate VCF output (consensus calling)\n");
+ fprintf(stderr, " -g generate GLF output\n");
fprintf(stderr, " -v show variant sites only\n");
fprintf(stderr, "\n");
fprintf(stderr, "Notes: Assuming error independency and diploid individuals.\n\n");
--- /dev/null
+#include <string.h>
+#include <ctype.h>
+#include <stdio.h>
+#include "kstring.h"
+#include "bcf.h"
+
+int bcf_hdr_read(bcf_t *b);
+
+void bcf_hdr_clear(bcf_hdr_t *b)
+{
+ free(b->name); free(b->sname); free(b->txt); free(b->ns); free(b->sns);
+ memset(b, 0, sizeof(bcf_hdr_t));
+}
+
+bcf_t *bcf_open(const char *fn, const char *mode)
+{
+ bcf_t *b;
+ b = calloc(1, sizeof(bcf_t));
+ if (strchr(mode, 'w')) {
+ b->fp = strcmp(fn, "-")? bgzf_open(fn, mode) : bgzf_fdopen(fileno(stdout), "w");
+ } else {
+ b->fp = strcmp(fn, "-")? bgzf_open(fn, mode) : bgzf_fdopen(fileno(stdin), "r");
+ }
+ b->fp->owned_file = 1;
+ if (strchr(mode, 'r')) bcf_hdr_read(b);
+ return b;
+}
+
+int bcf_close(bcf_t *b)
+{
+ int ret;
+ if (b == 0) return 0;
+ ret = bgzf_close(b->fp);
+ free(b->h.name); free(b->h.sname); free(b->h.txt); free(b->h.ns); free(b->h.sns);
+ free(b);
+ return ret;
+}
+
+int bcf_hdr_write(bcf_t *b)
+{
+ bcf_hdr_t *h;
+ if (b == 0) return -1;
+ h = &b->h;
+ bgzf_write(b->fp, "BCF\4", 4);
+ bgzf_write(b->fp, &h->l_nm, 4);
+ bgzf_write(b->fp, h->name, h->l_nm);
+ bgzf_write(b->fp, &h->l_smpl, 4);
+ bgzf_write(b->fp, h->sname, h->l_smpl);
+ bgzf_write(b->fp, &h->l_txt, 4);
+ bgzf_write(b->fp, h->txt, h->l_txt);
+ bgzf_flush(b->fp);
+ return 16 + h->l_nm + h->l_smpl + h->l_txt;
+}
+
+int bcf_hdr_read(bcf_t *b)
+{
+ uint8_t magic[4];
+ bcf_hdr_t *h;
+ if (b == 0) return -1;
+ bcf_hdr_clear(&b->h);
+ h = &b->h;
+ bgzf_read(b->fp, magic, 4);
+ bgzf_read(b->fp, &h->l_nm, 4);
+ h->name = malloc(h->l_nm);
+ bgzf_read(b->fp, h->name, h->l_nm);
+ bgzf_read(b->fp, &h->l_smpl, 4);
+ h->sname = malloc(h->l_smpl);
+ bgzf_read(b->fp, h->sname, h->l_smpl);
+ bgzf_read(b->fp, &h->l_txt, 4);
+ h->txt = malloc(h->l_txt);
+ bgzf_read(b->fp, h->txt, h->l_txt);
+ bcf_hdr_sync(&b->h);
+ return 16 + h->l_nm + h->l_smpl + h->l_txt;
+}
+
+static inline char **cnt_null(int l, char *str, int *_n)
+{
+ int n = 0;
+ char *p, **list;
+ *_n = 0;
+ if (l == 0 || str == 0) return 0;
+ for (p = str; p != str + l; ++p)
+ if (*p == 0) ++n;
+ *_n = n;
+ list = calloc(n, sizeof(void*));
+ list[0] = str;
+ for (p = str, n = 1; p < str + l - 1; ++p)
+ if (*p == 0) list[n++] = p + 1;
+ return list;
+}
+
+int bcf_hdr_sync(bcf_hdr_t *b)
+{
+ if (b == 0) return -1;
+ b->ns = cnt_null(b->l_nm, b->name, &b->n_ref);
+ b->sns = cnt_null(b->l_smpl, b->sname, &b->n_smpl);
+ return 0;
+}
+
+#define char2int(s) (((int)s[0])<<8|s[1])
+
+int bcf_sync(int n_smpl, bcf1_t *b)
+{
+ char *p, *tmp[4], *s;
+ int i, n, c;
+ // set ref, alt, flt, info, fmt
+ b->ref = b->alt = b->flt = b->info = b->fmt = 0;
+ for (p = b->str, n = 0; p < b->str + b->l_str; ++p)
+ if (*p == 0 && p+1 != b->str + b->l_str) tmp[n++] = p + 1;
+ if (n != 5) return -1;
+ b->ref = tmp[0]; b->alt = tmp[1]; b->flt = tmp[2]; b->info = tmp[3]; b->fmt = tmp[4];
+ // set n_alleles
+ for (p = b->alt, n = 1; *p; ++p)
+ if (*p == ',') ++n;
+ b->n_alleles = n;
+ c = toupper(*b->ref);
+ if (b->alt - b->ref == 2 && (c == 'A' || c == 'C' || c == 'G' || c == 'T'))
+ ++b->n_alleles;
+ // set n_gi and gi[i].fmt
+ for (p = b->fmt, n = 1; *p; ++p)
+ if (*p == ':') ++n;
+ if (n > b->m_gi) {
+ int old_m = b->m_gi;
+ b->m_gi = n;
+ kroundup32(b->m_gi);
+ b->gi = realloc(b->gi, b->m_gi * sizeof(bcf_ginfo_t));
+ memset(b->gi + old_m, 0, (b->m_gi - old_m) * sizeof(bcf_ginfo_t));
+ }
+ b->n_gi = n;
+ for (p = s = b->fmt, n = 0; *p; ++p) {
+ if (*p == ':' || *(p+1) == 0) {
+ char *q = *p == ':'? p : p + 1;
+ if ((q - s) != 2) return -2;
+ b->gi[n].fmt = char2int(s);
+ s = q;
+ }
+ }
+ // set gi[i].len
+ for (i = 0; i < b->n_gi; ++i) {
+ if (b->gi[i].fmt == char2int("PL")) {
+ b->gi[i].len = b->n_alleles * (b->n_alleles + 1) / 2;
+ } else if (b->gi[i].fmt == char2int("DP") || b->gi[i].fmt == char2int("HQ")) {
+ b->gi[i].len = 2;
+ } else if (b->gi[i].fmt == char2int("GQ") || b->gi[i].fmt == char2int("GT")) {
+ b->gi[i].len = 1;
+ } else if (b->gi[i].fmt == char2int("GL")) {
+ b->gi[i].len = 4;
+ }
+ b->gi[i].data = malloc(n_smpl * b->gi[i].len);
+ }
+ return 0;
+}
+
+int bcf_write(bcf_t *bp, const bcf1_t *b)
+{
+ uint32_t x;
+ int i, l = 0;
+ if (b == 0) return -1;
+ bgzf_write(bp->fp, &b->tid, 4);
+ bgzf_write(bp->fp, &b->pos, 4);
+ x = b->qual<<24 | b->l_str;
+ bgzf_write(bp->fp, &x, 4);
+ bgzf_write(bp->fp, b->str, b->l_str);
+ l = 12 + b->l_str;
+ for (i = 0; i < b->n_gi; ++i) {
+ bgzf_write(bp->fp, b->gi[i].data, b->gi[i].len * bp->h.n_smpl);
+ l += b->gi[i].len * bp->h.n_smpl;
+ }
+ return l;
+}
+
+int bcf_read(bcf_t *bp, bcf1_t *b)
+{
+ int i, l = 0;
+ uint32_t x;
+ if (b == 0) return -1;
+ if (bgzf_read(bp->fp, &b->tid, 4) == 0) return 0;
+ bgzf_read(bp->fp, &b->pos, 4);
+ bgzf_read(bp->fp, &x, 4);
+ b->qual = x >> 24; b->l_str = x << 8 >> 8;
+ if (b->l_str > b->m_str) {
+ b->m_str = b->l_str;
+ kroundup32(b->m_str);
+ b->str = realloc(b->str, b->m_str);
+ }
+ bgzf_read(bp->fp, b->str, b->l_str);
+ l = 12 + b->l_str;
+ bcf_sync(bp->h.n_smpl, b);
+ for (i = 0; i < b->n_gi; ++i) {
+ bgzf_read(bp->fp, b->gi[i].data, b->gi[i].len * bp->h.n_smpl);
+ l += b->gi[i].len * bp->h.n_smpl;
+ }
+ return l;
+}
+
+int bcf_destroy(bcf1_t *b)
+{
+ int i;
+ if (b == 0) return -1;
+ free(b->str);
+ for (i = 0; i < b->n_gi; ++i)
+ free(b->gi[i].data);
+ free(b->gi);
+ free(b);
+ return 0;
+}
+
+static inline void fmt_str(const char *p, kstring_t *s)
+{
+ if (*p == 0) kputc('.', s);
+ else kputs(p, s);
+}
+
+char *bcf_fmt(bcf_t *bp, bcf1_t *b)
+{
+ kstring_t s;
+ int i, j, x;
+ memset(&s, 0, sizeof(kstring_t));
+ kputs(bp->h.ns[b->tid], &s); kputc('\t', &s);
+ kputw(b->pos + 1, &s); kputc('\t', &s);
+ fmt_str(b->str, &s); kputc('\t', &s);
+ fmt_str(b->ref, &s); kputc('\t', &s);
+ fmt_str(b->alt, &s); kputc('\t', &s);
+ kputw(b->qual, &s); kputc('\t', &s);
+ fmt_str(b->info, &s); kputc('\t', &s);
+ fmt_str(b->fmt, &s);
+ x = b->n_alleles * (b->n_alleles + 1) / 2;
+ for (j = 0; j < bp->h.n_smpl; ++j) {
+ kputc('\t', &s);
+ for (i = 0; i < b->n_gi; ++i) {
+ if (i) kputc(':', &s);
+ if (b->gi[i].fmt == char2int("PL")) {
+ uint8_t *d = (uint8_t*)b->gi[i].data + j * x;
+ int k;
+ for (k = 0; k < x; ++k) {
+ if (k > 0) kputc(',', &s);
+ kputw(d[k], &s);
+ }
+ } else if (b->gi[i].fmt == char2int("DP")) {
+ kputw(((uint16_t*)b->gi[i].data)[j], &s);
+ } else if (b->gi[i].fmt == char2int("GQ")) {
+ kputw(((uint8_t*)b->gi[i].data)[j], &s);
+ }
+ }
+ }
+ return s.s;
+}
--- /dev/null
+#ifndef BCF_H
+#define BCF_H
+
+#include <stdint.h>
+#include "bgzf.h"
+
+typedef struct {
+ int fmt, len; // len is the unit length
+ void *data;
+ // derived info: fmt, len
+} bcf_ginfo_t;
+
+typedef struct {
+ int32_t tid, pos;
+ uint32_t qual:8, l_str:24;
+ int m_str;
+ char *str, *ref, *alt, *flt, *info, *fmt; // fmt, ref, alt and info point to str
+ int n_gi, m_gi;
+ bcf_ginfo_t *gi;
+ int n_alleles;
+ // derived info: ref, alt, flt, info, fmt, n_gi, n_alleles
+} bcf1_t;
+
+typedef struct {
+ int32_t n_ref, n_smpl;
+ int32_t l_nm;
+ int32_t l_smpl;
+ int32_t l_txt;
+ char *name, *sname, *txt;
+ char **ns, **sns;
+ // derived info: n_ref, n_smpl, ns, sns
+} bcf_hdr_t;
+
+typedef struct {
+ BGZF *fp;
+ bcf_hdr_t h;
+} bcf_t;
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+ bcf_t *bcf_open(const char *fn, const char *mode);
+ int bcf_close(bcf_t *b);
+ int bcf_read(bcf_t *bp, bcf1_t *b);
+ int bcf_sync(int n_smpl, bcf1_t *b);
+ int bcf_write(bcf_t *bp, const bcf1_t *b);
+ int bcf_hdr_write(bcf_t *b);
+ int bcf_hdr_sync(bcf_hdr_t *b);
+ int bcf_destroy(bcf1_t *b);
+ char *bcf_fmt(bcf_t *bp, bcf1_t *b);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif