$(KNETFILE_O) bam_sort.o sam_header.o bam_reheader.o
AOBJS= bam_tview.o bam_maqcns.o bam_plcmd.o sam_view.o \
bam_rmdup.o bam_rmdupse.o bam_mate.o bam_stat.o bam_color.o \
- bamtk.o kaln.o bam_mcns.o bam2bcf.o bcf.o
+ bamtk.o kaln.o bam_mcns.o bam2bcf.o
PROG= samtools
-INCLUDES=
-SUBDIRS= . misc
+INCLUDES= -Ibcftools
+SUBDIRS= . bcftools misc
LIBPATH=
LIBCURSES= -lcurses # -lXCurses
list='$(SUBDIRS)'; for subdir in $$list; do \
cd $$subdir; \
$(MAKE) CC="$(CC)" DFLAGS="$(DFLAGS)" CFLAGS="$(CFLAGS)" \
- INCLUDES="$(INCLUDES)" LIBPATH="$(LIBPATH)" $$target || exit 1; \
+ LIBPATH="$(LIBPATH)" $$target || exit 1; \
cd $$wdir; \
done;
libbam.a:$(LOBJS)
$(AR) -cru $@ $(LOBJS)
-samtools:$(AOBJS) libbam.a
- $(CC) $(CFLAGS) -o $@ $(AOBJS) libbam.a -lm $(LIBPATH) $(LIBCURSES) -lz
+samtools:lib-recur $(AOBJS)
+ $(CC) $(CFLAGS) -o $@ $(AOBJS) libbam.a -lm $(LIBPATH) $(LIBCURSES) -lz -Lbcftools -lbcf
razip:razip.o razf.o $(KNETFILE_O)
$(CC) $(CFLAGS) -o $@ razf.o razip.o $(KNETFILE_O) -lz
+++ /dev/null
-#include <string.h>
-#include <ctype.h>
-#include <stdio.h>
-#include "kstring.h"
-#include "bcf.h"
-
-int bcf_hdr_read(bcf_t *b);
-
-void bcf_hdr_clear(bcf_hdr_t *b)
-{
- free(b->name); free(b->sname); free(b->txt); free(b->ns); free(b->sns);
- memset(b, 0, sizeof(bcf_hdr_t));
-}
-
-bcf_t *bcf_open(const char *fn, const char *mode)
-{
- bcf_t *b;
- b = calloc(1, sizeof(bcf_t));
- if (strchr(mode, 'w')) {
- b->fp = strcmp(fn, "-")? bgzf_open(fn, mode) : bgzf_fdopen(fileno(stdout), "w");
- } else {
- b->fp = strcmp(fn, "-")? bgzf_open(fn, mode) : bgzf_fdopen(fileno(stdin), "r");
- }
- b->fp->owned_file = 1;
- if (strchr(mode, 'r')) bcf_hdr_read(b);
- return b;
-}
-
-int bcf_close(bcf_t *b)
-{
- int ret;
- if (b == 0) return 0;
- ret = bgzf_close(b->fp);
- free(b->h.name); free(b->h.sname); free(b->h.txt); free(b->h.ns); free(b->h.sns);
- free(b);
- return ret;
-}
-
-int bcf_hdr_write(bcf_t *b)
-{
- bcf_hdr_t *h;
- if (b == 0) return -1;
- h = &b->h;
- bgzf_write(b->fp, "BCF\4", 4);
- bgzf_write(b->fp, &h->l_nm, 4);
- bgzf_write(b->fp, h->name, h->l_nm);
- bgzf_write(b->fp, &h->l_smpl, 4);
- bgzf_write(b->fp, h->sname, h->l_smpl);
- bgzf_write(b->fp, &h->l_txt, 4);
- bgzf_write(b->fp, h->txt, h->l_txt);
- bgzf_flush(b->fp);
- return 16 + h->l_nm + h->l_smpl + h->l_txt;
-}
-
-int bcf_hdr_cpy(bcf_hdr_t *h, const bcf_hdr_t *h0)
-{
- *h = *h0;
- h->name = malloc(h->l_nm);
- h->sname = malloc(h->l_smpl);
- h->txt = malloc(h->l_txt);
- memcpy(h->name, h0->name, h->l_nm);
- memcpy(h->sname, h0->sname, h->l_smpl);
- memcpy(h->txt, h0->txt, h->l_txt);
- bcf_hdr_sync(h);
- return 0;
-}
-
-int bcf_hdr_read(bcf_t *b)
-{
- uint8_t magic[4];
- bcf_hdr_t *h;
- if (b == 0) return -1;
- bcf_hdr_clear(&b->h);
- h = &b->h;
- bgzf_read(b->fp, magic, 4);
- bgzf_read(b->fp, &h->l_nm, 4);
- h->name = malloc(h->l_nm);
- bgzf_read(b->fp, h->name, h->l_nm);
- bgzf_read(b->fp, &h->l_smpl, 4);
- h->sname = malloc(h->l_smpl);
- bgzf_read(b->fp, h->sname, h->l_smpl);
- bgzf_read(b->fp, &h->l_txt, 4);
- h->txt = malloc(h->l_txt);
- bgzf_read(b->fp, h->txt, h->l_txt);
- bcf_hdr_sync(&b->h);
- return 16 + h->l_nm + h->l_smpl + h->l_txt;
-}
-
-static inline char **cnt_null(int l, char *str, int *_n)
-{
- int n = 0;
- char *p, **list;
- *_n = 0;
- if (l == 0 || str == 0) return 0;
- for (p = str; p != str + l; ++p)
- if (*p == 0) ++n;
- *_n = n;
- list = calloc(n, sizeof(void*));
- list[0] = str;
- for (p = str, n = 1; p < str + l - 1; ++p)
- if (*p == 0) list[n++] = p + 1;
- return list;
-}
-
-int bcf_hdr_sync(bcf_hdr_t *b)
-{
- if (b == 0) return -1;
- b->ns = cnt_null(b->l_nm, b->name, &b->n_ref);
- b->sns = cnt_null(b->l_smpl, b->sname, &b->n_smpl);
- return 0;
-}
-
-#define char2int(s) (((int)s[0])<<8|s[1])
-
-int bcf_sync(int n_smpl, bcf1_t *b)
-{
- char *p, *tmp[5], *s;
- int i, n;
- // set ref, alt, flt, info, fmt
- b->ref = b->alt = b->flt = b->info = b->fmt = 0;
- for (p = b->str, n = 0; p < b->str + b->l_str; ++p)
- if (*p == 0 && p+1 != b->str + b->l_str) tmp[n++] = p + 1;
- if (n != 5) return -1;
- b->ref = tmp[0]; b->alt = tmp[1]; b->flt = tmp[2]; b->info = tmp[3]; b->fmt = tmp[4];
- // set n_alleles
- for (p = b->alt, n = 1; *p; ++p)
- if (*p == ',') ++n;
- b->n_alleles = n + 1;
- // set n_gi and gi[i].fmt
- for (p = b->fmt, n = 1; *p; ++p)
- if (*p == ':') ++n;
- if (n > b->m_gi) {
- int old_m = b->m_gi;
- b->m_gi = n;
- kroundup32(b->m_gi);
- b->gi = realloc(b->gi, b->m_gi * sizeof(bcf_ginfo_t));
- memset(b->gi + old_m, 0, (b->m_gi - old_m) * sizeof(bcf_ginfo_t));
- }
- b->n_gi = n;
- for (p = s = b->fmt, n = 0; *p; ++p) {
- if (*p == ':' || *(p+1) == 0) {
- char *q = *p == ':'? p : p + 1;
- if ((q - s) != 2) return -2;
- b->gi[n].fmt = char2int(s);
- s = q;
- }
- }
- // set gi[i].len
- for (i = 0; i < b->n_gi; ++i) {
- if (b->gi[i].fmt == char2int("PL")) {
- b->gi[i].len = b->n_alleles * (b->n_alleles + 1) / 2;
- } else if (b->gi[i].fmt == char2int("DP") || b->gi[i].fmt == char2int("HQ")) {
- b->gi[i].len = 2;
- } else if (b->gi[i].fmt == char2int("GQ") || b->gi[i].fmt == char2int("GT")) {
- b->gi[i].len = 1;
- } else if (b->gi[i].fmt == char2int("GL")) {
- b->gi[i].len = 4;
- }
- b->gi[i].data = realloc(b->gi[i].data, n_smpl * b->gi[i].len);
- }
- return 0;
-}
-
-int bcf_write(bcf_t *bp, const bcf1_t *b)
-{
- uint32_t x;
- int i, l = 0;
- if (b == 0) return -1;
- bgzf_write(bp->fp, &b->tid, 4);
- bgzf_write(bp->fp, &b->pos, 4);
- x = b->qual<<24 | b->l_str;
- bgzf_write(bp->fp, &x, 4);
- bgzf_write(bp->fp, b->str, b->l_str);
- l = 12 + b->l_str;
- for (i = 0; i < b->n_gi; ++i) {
- bgzf_write(bp->fp, b->gi[i].data, b->gi[i].len * bp->h.n_smpl);
- l += b->gi[i].len * bp->h.n_smpl;
- }
- return l;
-}
-
-int bcf_read(bcf_t *bp, bcf1_t *b)
-{
- int i, l = 0;
- uint32_t x;
- if (b == 0) return -1;
- if (bgzf_read(bp->fp, &b->tid, 4) == 0) return 0;
- bgzf_read(bp->fp, &b->pos, 4);
- bgzf_read(bp->fp, &x, 4);
- b->qual = x >> 24; b->l_str = x << 8 >> 8;
- if (b->l_str > b->m_str) {
- b->m_str = b->l_str;
- kroundup32(b->m_str);
- b->str = realloc(b->str, b->m_str);
- }
- bgzf_read(bp->fp, b->str, b->l_str);
- l = 12 + b->l_str;
- bcf_sync(bp->h.n_smpl, b);
- for (i = 0; i < b->n_gi; ++i) {
- bgzf_read(bp->fp, b->gi[i].data, b->gi[i].len * bp->h.n_smpl);
- l += b->gi[i].len * bp->h.n_smpl;
- }
- return l;
-}
-
-int bcf_destroy(bcf1_t *b)
-{
- int i;
- if (b == 0) return -1;
- free(b->str);
- for (i = 0; i < b->n_gi; ++i)
- free(b->gi[i].data);
- free(b->gi);
- free(b);
- return 0;
-}
-
-static inline void fmt_str(const char *p, kstring_t *s)
-{
- if (*p == 0) kputc('.', s);
- else kputs(p, s);
-}
-
-char *bcf_fmt(bcf_t *bp, bcf1_t *b)
-{
- kstring_t s;
- int i, j, x;
- memset(&s, 0, sizeof(kstring_t));
- kputs(bp->h.ns[b->tid], &s); kputc('\t', &s);
- kputw(b->pos + 1, &s); kputc('\t', &s);
- fmt_str(b->str, &s); kputc('\t', &s);
- fmt_str(b->ref, &s); kputc('\t', &s);
- fmt_str(b->alt, &s); kputc('\t', &s);
- kputw(b->qual, &s); kputc('\t', &s);
- fmt_str(b->flt, &s); kputc('\t', &s);
- fmt_str(b->info, &s);
- if (b->fmt[0]) {
- kputc('\t', &s);
- fmt_str(b->fmt, &s);
- }
- x = b->n_alleles * (b->n_alleles + 1) / 2;
- for (j = 0; j < bp->h.n_smpl; ++j) {
- kputc('\t', &s);
- for (i = 0; i < b->n_gi; ++i) {
- if (i) kputc(':', &s);
- if (b->gi[i].fmt == char2int("PL")) {
- uint8_t *d = (uint8_t*)b->gi[i].data + j * x;
- int k;
- for (k = 0; k < x; ++k) {
- if (k > 0) kputc(',', &s);
- kputw(d[k], &s);
- }
- } else if (b->gi[i].fmt == char2int("DP")) {
- kputw(((uint16_t*)b->gi[i].data)[j], &s);
- } else if (b->gi[i].fmt == char2int("GQ")) {
- kputw(((uint8_t*)b->gi[i].data)[j], &s);
- }
- }
- }
- return s.s;
-}
+++ /dev/null
-#ifndef BCF_H
-#define BCF_H
-
-#include <stdint.h>
-#include "bgzf.h"
-
-typedef struct {
- int fmt, len; // len is the unit length
- void *data;
- // derived info: fmt, len
-} bcf_ginfo_t;
-
-typedef struct {
- int32_t tid, pos;
- uint32_t qual:8, l_str:24;
- int m_str;
- char *str, *ref, *alt, *flt, *info, *fmt; // fmt, ref, alt and info point to str
- int n_gi, m_gi;
- bcf_ginfo_t *gi;
- int n_alleles;
- // derived info: ref, alt, flt, info, fmt, n_gi, n_alleles
-} bcf1_t;
-
-typedef struct {
- int32_t n_ref, n_smpl;
- int32_t l_nm;
- int32_t l_smpl;
- int32_t l_txt;
- char *name, *sname, *txt;
- char **ns, **sns;
- // derived info: n_ref, n_smpl, ns, sns
-} bcf_hdr_t;
-
-typedef struct {
- BGZF *fp;
- bcf_hdr_t h;
-} bcf_t;
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
- bcf_t *bcf_open(const char *fn, const char *mode);
- int bcf_close(bcf_t *b);
- int bcf_read(bcf_t *bp, bcf1_t *b);
- int bcf_sync(int n_smpl, bcf1_t *b);
- int bcf_write(bcf_t *bp, const bcf1_t *b);
- int bcf_hdr_write(bcf_t *b);
- int bcf_hdr_sync(bcf_hdr_t *b);
- int bcf_destroy(bcf1_t *b);
- char *bcf_fmt(bcf_t *bp, bcf1_t *b);
-
-#ifdef __cplusplus
-}
-#endif
-
-#endif
--- /dev/null
+CC= gcc
+CFLAGS= -g -Wall -O2 #-fPIC #-m64 #-arch ppc
+DFLAGS= -D_FILE_OFFSET_BITS=64 -D_USE_KNETFILE
+LOBJS= bcf.o bcfutils.o prob1.o index.o
+OMISC= ..
+AOBJS= vcfout.o main.o $(OMISC)/kstring.o $(OMISC)/bgzf.o $(OMISC)/knetfile.o
+PROG= bcftools
+INCLUDES= -I..
+SUBDIRS= .
+
+.SUFFIXES:.c .o
+
+.c.o:
+ $(CC) -c $(CFLAGS) $(DFLAGS) $(INCLUDES) $< -o $@
+
+all-recur lib-recur clean-recur cleanlocal-recur install-recur:
+ @target=`echo $@ | sed s/-recur//`; \
+ wdir=`pwd`; \
+ list='$(SUBDIRS)'; for subdir in $$list; do \
+ cd $$subdir; \
+ $(MAKE) CC="$(CC)" DFLAGS="$(DFLAGS)" CFLAGS="$(CFLAGS)" \
+ INCLUDES="$(INCLUDES)" LIBPATH="$(LIBPATH)" $$target || exit 1; \
+ cd $$wdir; \
+ done;
+
+all:$(PROG)
+
+lib:libbcf.a
+
+libbcf.a:$(LOBJS)
+ $(AR) -cru $@ $(LOBJS)
+
+bcftools:lib $(AOBJS)
+ $(CC) $(CFLAGS) -o $@ $(AOBJS) -lm $(LIBPATH) -lz -L. -lbcf
+
+bcf.o:bcf.h
+prob1.o:prob1.h bcf.h
+vcfout.o:bcf.h prob1.h
+main.o:bcf.h
+
+bcf.pdf:bcf.tex
+ pdflatex bcf
+
+cleanlocal:
+ rm -fr gmon.out *.o a.out *.dSYM $(PROG) *~ *.a bcf.aux bcf.log bcf.pdf *.class libbcf.*.dylib libbcf.so*
+
+clean:cleanlocal-recur
--- /dev/null
+#include <string.h>
+#include <ctype.h>
+#include <stdio.h>
+#include "kstring.h"
+#include "bcf.h"
+
+int bcf_hdr_read(bcf_t *b);
+
+void bcf_hdr_clear(bcf_hdr_t *b)
+{
+ free(b->name); free(b->sname); free(b->txt); free(b->ns); free(b->sns);
+ memset(b, 0, sizeof(bcf_hdr_t));
+}
+
+bcf_t *bcf_open(const char *fn, const char *mode)
+{
+ bcf_t *b;
+ b = calloc(1, sizeof(bcf_t));
+ if (strchr(mode, 'w')) {
+ b->fp = strcmp(fn, "-")? bgzf_open(fn, mode) : bgzf_fdopen(fileno(stdout), "w");
+ } else {
+ b->fp = strcmp(fn, "-")? bgzf_open(fn, mode) : bgzf_fdopen(fileno(stdin), "r");
+ }
+ b->fp->owned_file = 1;
+ if (strchr(mode, 'r')) bcf_hdr_read(b);
+ return b;
+}
+
+int bcf_close(bcf_t *b)
+{
+ int ret;
+ if (b == 0) return 0;
+ ret = bgzf_close(b->fp);
+ free(b->h.name); free(b->h.sname); free(b->h.txt); free(b->h.ns); free(b->h.sns);
+ free(b);
+ return ret;
+}
+
+int bcf_hdr_write(bcf_t *b)
+{
+ bcf_hdr_t *h;
+ if (b == 0) return -1;
+ h = &b->h;
+ bgzf_write(b->fp, "BCF\4", 4);
+ bgzf_write(b->fp, &h->l_nm, 4);
+ bgzf_write(b->fp, h->name, h->l_nm);
+ bgzf_write(b->fp, &h->l_smpl, 4);
+ bgzf_write(b->fp, h->sname, h->l_smpl);
+ bgzf_write(b->fp, &h->l_txt, 4);
+ bgzf_write(b->fp, h->txt, h->l_txt);
+ bgzf_flush(b->fp);
+ return 16 + h->l_nm + h->l_smpl + h->l_txt;
+}
+
+int bcf_hdr_cpy(bcf_hdr_t *h, const bcf_hdr_t *h0)
+{
+ *h = *h0;
+ h->name = malloc(h->l_nm);
+ h->sname = malloc(h->l_smpl);
+ h->txt = malloc(h->l_txt);
+ memcpy(h->name, h0->name, h->l_nm);
+ memcpy(h->sname, h0->sname, h->l_smpl);
+ memcpy(h->txt, h0->txt, h->l_txt);
+ bcf_hdr_sync(h);
+ return 0;
+}
+
+int bcf_hdr_read(bcf_t *b)
+{
+ uint8_t magic[4];
+ bcf_hdr_t *h;
+ if (b == 0) return -1;
+ bcf_hdr_clear(&b->h);
+ h = &b->h;
+ bgzf_read(b->fp, magic, 4);
+ bgzf_read(b->fp, &h->l_nm, 4);
+ h->name = malloc(h->l_nm);
+ bgzf_read(b->fp, h->name, h->l_nm);
+ bgzf_read(b->fp, &h->l_smpl, 4);
+ h->sname = malloc(h->l_smpl);
+ bgzf_read(b->fp, h->sname, h->l_smpl);
+ bgzf_read(b->fp, &h->l_txt, 4);
+ h->txt = malloc(h->l_txt);
+ bgzf_read(b->fp, h->txt, h->l_txt);
+ bcf_hdr_sync(&b->h);
+ return 16 + h->l_nm + h->l_smpl + h->l_txt;
+}
+
+static inline char **cnt_null(int l, char *str, int *_n)
+{
+ int n = 0;
+ char *p, **list;
+ *_n = 0;
+ if (l == 0 || str == 0) return 0;
+ for (p = str; p != str + l; ++p)
+ if (*p == 0) ++n;
+ *_n = n;
+ list = calloc(n, sizeof(void*));
+ list[0] = str;
+ for (p = str, n = 1; p < str + l - 1; ++p)
+ if (*p == 0) list[n++] = p + 1;
+ return list;
+}
+
+int bcf_hdr_sync(bcf_hdr_t *b)
+{
+ if (b == 0) return -1;
+ b->ns = cnt_null(b->l_nm, b->name, &b->n_ref);
+ b->sns = cnt_null(b->l_smpl, b->sname, &b->n_smpl);
+ return 0;
+}
+
+#define char2int(s) (((int)s[0])<<8|s[1])
+
+int bcf_sync(int n_smpl, bcf1_t *b)
+{
+ char *p, *tmp[5], *s;
+ int i, n;
+ // set ref, alt, flt, info, fmt
+ b->ref = b->alt = b->flt = b->info = b->fmt = 0;
+ for (p = b->str, n = 0; p < b->str + b->l_str; ++p)
+ if (*p == 0 && p+1 != b->str + b->l_str) tmp[n++] = p + 1;
+ if (n != 5) return -1;
+ b->ref = tmp[0]; b->alt = tmp[1]; b->flt = tmp[2]; b->info = tmp[3]; b->fmt = tmp[4];
+ // set n_alleles
+ for (p = b->alt, n = 1; *p; ++p)
+ if (*p == ',') ++n;
+ b->n_alleles = n + 1;
+ // set n_gi and gi[i].fmt
+ for (p = b->fmt, n = 1; *p; ++p)
+ if (*p == ':') ++n;
+ if (n > b->m_gi) {
+ int old_m = b->m_gi;
+ b->m_gi = n;
+ kroundup32(b->m_gi);
+ b->gi = realloc(b->gi, b->m_gi * sizeof(bcf_ginfo_t));
+ memset(b->gi + old_m, 0, (b->m_gi - old_m) * sizeof(bcf_ginfo_t));
+ }
+ b->n_gi = n;
+ for (p = s = b->fmt, n = 0; *p; ++p) {
+ if (*p == ':' || *(p+1) == 0) {
+ char *q = *p == ':'? p : p + 1;
+ if ((q - s) != 2) return -2;
+ b->gi[n].fmt = char2int(s);
+ s = q;
+ }
+ }
+ // set gi[i].len
+ for (i = 0; i < b->n_gi; ++i) {
+ if (b->gi[i].fmt == char2int("PL")) {
+ b->gi[i].len = b->n_alleles * (b->n_alleles + 1) / 2;
+ } else if (b->gi[i].fmt == char2int("DP") || b->gi[i].fmt == char2int("HQ")) {
+ b->gi[i].len = 2;
+ } else if (b->gi[i].fmt == char2int("GQ") || b->gi[i].fmt == char2int("GT")) {
+ b->gi[i].len = 1;
+ } else if (b->gi[i].fmt == char2int("GL")) {
+ b->gi[i].len = 4;
+ }
+ b->gi[i].data = realloc(b->gi[i].data, n_smpl * b->gi[i].len);
+ }
+ return 0;
+}
+
+int bcf_write(bcf_t *bp, const bcf1_t *b)
+{
+ uint32_t x;
+ int i, l = 0;
+ if (b == 0) return -1;
+ bgzf_write(bp->fp, &b->tid, 4);
+ bgzf_write(bp->fp, &b->pos, 4);
+ x = b->qual<<24 | b->l_str;
+ bgzf_write(bp->fp, &x, 4);
+ bgzf_write(bp->fp, b->str, b->l_str);
+ l = 12 + b->l_str;
+ for (i = 0; i < b->n_gi; ++i) {
+ bgzf_write(bp->fp, b->gi[i].data, b->gi[i].len * bp->h.n_smpl);
+ l += b->gi[i].len * bp->h.n_smpl;
+ }
+ return l;
+}
+
+int bcf_read(bcf_t *bp, bcf1_t *b)
+{
+ int i, l = 0;
+ uint32_t x;
+ if (b == 0) return -1;
+ if (bgzf_read(bp->fp, &b->tid, 4) == 0) return 0;
+ bgzf_read(bp->fp, &b->pos, 4);
+ bgzf_read(bp->fp, &x, 4);
+ b->qual = x >> 24; b->l_str = x << 8 >> 8;
+ if (b->l_str > b->m_str) {
+ b->m_str = b->l_str;
+ kroundup32(b->m_str);
+ b->str = realloc(b->str, b->m_str);
+ }
+ bgzf_read(bp->fp, b->str, b->l_str);
+ l = 12 + b->l_str;
+ bcf_sync(bp->h.n_smpl, b);
+ for (i = 0; i < b->n_gi; ++i) {
+ bgzf_read(bp->fp, b->gi[i].data, b->gi[i].len * bp->h.n_smpl);
+ l += b->gi[i].len * bp->h.n_smpl;
+ }
+ return l;
+}
+
+int bcf_destroy(bcf1_t *b)
+{
+ int i;
+ if (b == 0) return -1;
+ free(b->str);
+ for (i = 0; i < b->n_gi; ++i)
+ free(b->gi[i].data);
+ free(b->gi);
+ free(b);
+ return 0;
+}
+
+static inline void fmt_str(const char *p, kstring_t *s)
+{
+ if (*p == 0) kputc('.', s);
+ else kputs(p, s);
+}
+
+char *bcf_fmt(bcf_t *bp, bcf1_t *b)
+{
+ kstring_t s;
+ int i, j, x;
+ memset(&s, 0, sizeof(kstring_t));
+ kputs(bp->h.ns[b->tid], &s); kputc('\t', &s);
+ kputw(b->pos + 1, &s); kputc('\t', &s);
+ fmt_str(b->str, &s); kputc('\t', &s);
+ fmt_str(b->ref, &s); kputc('\t', &s);
+ fmt_str(b->alt, &s); kputc('\t', &s);
+ kputw(b->qual, &s); kputc('\t', &s);
+ fmt_str(b->flt, &s); kputc('\t', &s);
+ fmt_str(b->info, &s);
+ if (b->fmt[0]) {
+ kputc('\t', &s);
+ fmt_str(b->fmt, &s);
+ }
+ x = b->n_alleles * (b->n_alleles + 1) / 2;
+ for (j = 0; j < bp->h.n_smpl; ++j) {
+ kputc('\t', &s);
+ for (i = 0; i < b->n_gi; ++i) {
+ if (i) kputc(':', &s);
+ if (b->gi[i].fmt == char2int("PL")) {
+ uint8_t *d = (uint8_t*)b->gi[i].data + j * x;
+ int k;
+ for (k = 0; k < x; ++k) {
+ if (k > 0) kputc(',', &s);
+ kputw(d[k], &s);
+ }
+ } else if (b->gi[i].fmt == char2int("DP")) {
+ kputw(((uint16_t*)b->gi[i].data)[j], &s);
+ } else if (b->gi[i].fmt == char2int("GQ")) {
+ kputw(((uint8_t*)b->gi[i].data)[j], &s);
+ }
+ }
+ }
+ return s.s;
+}
--- /dev/null
+#ifndef BCF_H
+#define BCF_H
+
+#include <stdint.h>
+#include "bgzf.h"
+
+typedef struct {
+ int fmt, len; // len is the unit length
+ void *data;
+ // derived info: fmt, len
+} bcf_ginfo_t;
+
+typedef struct {
+ int32_t tid, pos;
+ uint32_t qual:8, l_str:24;
+ int m_str;
+ char *str, *ref, *alt, *flt, *info, *fmt; // fmt, ref, alt and info point to str
+ int n_gi, m_gi;
+ bcf_ginfo_t *gi;
+ int n_alleles;
+ // derived info: ref, alt, flt, info, fmt, n_gi, n_alleles
+} bcf1_t;
+
+typedef struct {
+ int32_t n_ref, n_smpl;
+ int32_t l_nm;
+ int32_t l_smpl;
+ int32_t l_txt;
+ char *name, *sname, *txt;
+ char **ns, **sns;
+ // derived info: n_ref, n_smpl, ns, sns
+} bcf_hdr_t;
+
+typedef struct {
+ BGZF *fp;
+ bcf_hdr_t h;
+} bcf_t;
+
+struct __bcf_idx_t;
+typedef struct __bcf_idx_t bcf_idx_t;
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+ bcf_t *bcf_open(const char *fn, const char *mode);
+ int bcf_close(bcf_t *b);
+ int bcf_read(bcf_t *bp, bcf1_t *b);
+ int bcf_sync(int n_smpl, bcf1_t *b);
+ int bcf_write(bcf_t *bp, const bcf1_t *b);
+ int bcf_hdr_write(bcf_t *b);
+ int bcf_hdr_sync(bcf_hdr_t *b);
+ int bcf_hdr_cpy(bcf_hdr_t *h, const bcf_hdr_t *h0);
+ int bcf_destroy(bcf1_t *b);
+ char *bcf_fmt(bcf_t *bp, bcf1_t *b);
+
+ void *bcf_build_refhash(bcf_hdr_t *h);
+ void bcf_str2id_destroy(void *_hash);
+ int bcf_str2id(void *_hash, const char *str);
+
+ int bcf_idx_build(const char *fn);
+ uint64_t bcf_idx_query(const bcf_idx_t *idx, int tid, int beg, int end);
+ int bcf_parse_region(void *str2id, const char *str, int *tid, int *begin, int *end);
+ bcf_idx_t *bcf_idx_load(const char *fn);
+ void bcf_idx_destroy(bcf_idx_t *idx);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
--- /dev/null
+#include "bcf.h"
+#include "khash.h"
+KHASH_MAP_INIT_STR(str2id, int)
+
+void *bcf_build_refhash(bcf_hdr_t *h)
+{
+ khash_t(str2id) *hash;
+ int i, ret;
+ hash = kh_init(str2id);
+ for (i = 0; i < h->n_ref; ++i) {
+ khint_t k;
+ k = kh_put(str2id, hash, h->ns[i], &ret); // FIXME: check ret
+ kh_val(hash, k) = i;
+ }
+ return hash;
+}
+
+void bcf_str2id_destroy(void *_hash)
+{
+ khash_t(str2id) *hash = (khash_t(str2id)*)_hash;
+ if (hash) kh_destroy(str2id, hash); // Note that strings are not freed.
+}
+
+int bcf_str2id(void *_hash, const char *str)
+{
+ khash_t(str2id) *hash = (khash_t(str2id)*)_hash;
+ khint_t k;
+ if (!hash) return -1;
+ k = kh_get(str2id, hash, str);
+ return k == kh_end(hash)? -1 : kh_val(hash, k);
+}
+
--- /dev/null
+#include <assert.h>
+#include <ctype.h>
+#include <sys/stat.h>
+#include "bam_endian.h"
+#include "kstring.h"
+#include "bcf.h"
+#ifdef _USE_KNETFILE
+#include "knetfile.h"
+#endif
+
+#define TAD_LIDX_SHIFT 13
+
+typedef struct {
+ int32_t n, m;
+ uint64_t *offset;
+} bcf_lidx_t;
+
+struct __bcf_idx_t {
+ int32_t n;
+ bcf_lidx_t *index2;
+};
+
+/************
+ * indexing *
+ ************/
+
+static inline void insert_offset2(bcf_lidx_t *index2, int _beg, int _end, uint64_t offset)
+{
+ int i, beg, end;
+ beg = _beg >> TAD_LIDX_SHIFT;
+ end = (_end - 1) >> TAD_LIDX_SHIFT;
+ if (index2->m < end + 1) {
+ int old_m = index2->m;
+ index2->m = end + 1;
+ kroundup32(index2->m);
+ index2->offset = (uint64_t*)realloc(index2->offset, index2->m * 8);
+ memset(index2->offset + old_m, 0, 8 * (index2->m - old_m));
+ }
+ if (beg == end) {
+ if (index2->offset[beg] == 0) index2->offset[beg] = offset;
+ } else {
+ for (i = beg; i <= end; ++i)
+ if (index2->offset[i] == 0) index2->offset[i] = offset;
+ }
+ if (index2->n < end + 1) index2->n = end + 1;
+}
+
+static void fill_missing(bcf_idx_t *idx)
+{
+ int i, j;
+ for (i = 0; i < idx->n; ++i) {
+ bcf_lidx_t *idx2 = &idx->index2[i];
+ for (j = 1; j < idx2->n; ++j)
+ if (idx2->offset[j] == 0)
+ idx2->offset[j] = idx2->offset[j-1];
+ }
+}
+
+bcf_idx_t *bcf_idx_core(bcf_t *bp)
+{
+ bcf_idx_t *idx;
+ int32_t last_coor, last_tid;
+ uint64_t last_off;
+ kstring_t *str;
+ BGZF *fp = bp->fp;
+ bcf1_t *b;
+ int ret;
+
+ b = calloc(1, sizeof(bcf1_t));
+ str = calloc(1, sizeof(kstring_t));
+ idx = (bcf_idx_t*)calloc(1, sizeof(bcf_idx_t));
+ idx->n = bp->h.n_ref;
+ idx->index2 = calloc(bp->h.n_ref, sizeof(bcf_lidx_t));
+
+ last_tid = 0xffffffffu;
+ last_off = bgzf_tell(fp); last_coor = 0xffffffffu;
+ while ((ret = bcf_read(bp, b)) > 0) {
+ int end, tmp;
+ if (last_tid != b->tid) { // change of chromosomes
+ last_tid = b->tid;
+ } else if (last_coor > b->pos) {
+ fprintf(stderr, "[bcf_idx_core] the input is out of order\n");
+ free(str->s); free(str); free(idx); bcf_destroy(b);
+ return 0;
+ }
+ tmp = strlen(b->ref);
+ end = b->pos + (tmp > 0? tmp : 1);
+ insert_offset2(&idx->index2[b->tid], b->pos, end, last_off);
+ last_off = bgzf_tell(fp);
+ last_coor = b->pos;
+ }
+ fill_missing(idx);
+ free(str->s); free(str); bcf_destroy(b);
+ return idx;
+}
+
+void bcf_idx_destroy(bcf_idx_t *idx)
+{
+ int i;
+ if (idx == 0) return;
+ for (i = 0; i < idx->n; ++i) free(idx->index2[i].offset);
+ free(idx->index2);
+ free(idx);
+}
+
+/******************
+ * index file I/O *
+ ******************/
+
+void bcf_idx_save(const bcf_idx_t *idx, BGZF *fp)
+{
+ int32_t i, ti_is_be;
+ ti_is_be = bam_is_big_endian();
+ bgzf_write(fp, "BCI\4", 4);
+ if (ti_is_be) {
+ uint32_t x = idx->n;
+ bgzf_write(fp, bam_swap_endian_4p(&x), 4);
+ } else bgzf_write(fp, &idx->n, 4);
+ for (i = 0; i < idx->n; ++i) {
+ bcf_lidx_t *index2 = idx->index2 + i;
+ // write linear index (index2)
+ if (ti_is_be) {
+ int x = index2->n;
+ bgzf_write(fp, bam_swap_endian_4p(&x), 4);
+ } else bgzf_write(fp, &index2->n, 4);
+ if (ti_is_be) { // big endian
+ int x;
+ for (x = 0; (int)x < index2->n; ++x)
+ bam_swap_endian_8p(&index2->offset[x]);
+ bgzf_write(fp, index2->offset, 8 * index2->n);
+ for (x = 0; (int)x < index2->n; ++x)
+ bam_swap_endian_8p(&index2->offset[x]);
+ } else bgzf_write(fp, index2->offset, 8 * index2->n);
+ }
+}
+
+static bcf_idx_t *bcf_idx_load_core(BGZF *fp)
+{
+ int i, ti_is_be;
+ char magic[4];
+ bcf_idx_t *idx;
+ ti_is_be = bam_is_big_endian();
+ if (fp == 0) {
+ fprintf(stderr, "[%s] fail to load index.\n", __func__);
+ return 0;
+ }
+ bgzf_read(fp, magic, 4);
+ if (strncmp(magic, "BCI\4", 4)) {
+ fprintf(stderr, "[%s] wrong magic number.\n", __func__);
+ return 0;
+ }
+ idx = (bcf_idx_t*)calloc(1, sizeof(bcf_idx_t));
+ bgzf_read(fp, &idx->n, 4);
+ if (ti_is_be) bam_swap_endian_4p(&idx->n);
+ idx->index2 = (bcf_lidx_t*)calloc(idx->n, sizeof(bcf_lidx_t));
+ for (i = 0; i < idx->n; ++i) {
+ bcf_lidx_t *index2 = idx->index2 + i;
+ int j;
+ bgzf_read(fp, &index2->n, 4);
+ if (ti_is_be) bam_swap_endian_4p(&index2->n);
+ index2->m = index2->n;
+ index2->offset = (uint64_t*)calloc(index2->m, 8);
+ bgzf_read(fp, index2->offset, index2->n * 8);
+ if (ti_is_be)
+ for (j = 0; j < index2->n; ++j) bam_swap_endian_8p(&index2->offset[j]);
+ }
+ return idx;
+}
+
+bcf_idx_t *bcf_idx_load_local(const char *fnidx)
+{
+ BGZF *fp;
+ fp = bgzf_open(fnidx, "r");
+ if (fp) {
+ bcf_idx_t *idx = bcf_idx_load_core(fp);
+ bgzf_close(fp);
+ return idx;
+ } else return 0;
+}
+
+#ifdef _USE_KNETFILE
+static void download_from_remote(const char *url)
+{
+ const int buf_size = 1 * 1024 * 1024;
+ char *fn;
+ FILE *fp;
+ uint8_t *buf;
+ knetFile *fp_remote;
+ int l;
+ if (strstr(url, "ftp://") != url && strstr(url, "http://") != url) return;
+ l = strlen(url);
+ for (fn = (char*)url + l - 1; fn >= url; --fn)
+ if (*fn == '/') break;
+ ++fn; // fn now points to the file name
+ fp_remote = knet_open(url, "r");
+ if (fp_remote == 0) {
+ fprintf(stderr, "[download_from_remote] fail to open remote file.\n");
+ return;
+ }
+ if ((fp = fopen(fn, "w")) == 0) {
+ fprintf(stderr, "[download_from_remote] fail to create file in the working directory.\n");
+ knet_close(fp_remote);
+ return;
+ }
+ buf = (uint8_t*)calloc(buf_size, 1);
+ while ((l = knet_read(fp_remote, buf, buf_size)) != 0)
+ fwrite(buf, 1, l, fp);
+ free(buf);
+ fclose(fp);
+ knet_close(fp_remote);
+}
+#else
+static void download_from_remote(const char *url)
+{
+ return;
+}
+#endif
+
+static char *get_local_version(const char *fn)
+{
+ struct stat sbuf;
+ char *fnidx = (char*)calloc(strlen(fn) + 5, 1);
+ strcat(strcpy(fnidx, fn), ".bci");
+ if ((strstr(fnidx, "ftp://") == fnidx || strstr(fnidx, "http://") == fnidx)) {
+ char *p, *url;
+ int l = strlen(fnidx);
+ for (p = fnidx + l - 1; p >= fnidx; --p)
+ if (*p == '/') break;
+ url = fnidx; fnidx = strdup(p + 1);
+ if (stat(fnidx, &sbuf) == 0) {
+ free(url);
+ return fnidx;
+ }
+ fprintf(stderr, "[%s] downloading the index file...\n", __func__);
+ download_from_remote(url);
+ free(url);
+ }
+ if (stat(fnidx, &sbuf) == 0) return fnidx;
+ free(fnidx); return 0;
+}
+
+bcf_idx_t *bcf_idx_load(const char *fn)
+{
+ bcf_idx_t *idx;
+ char *fname = get_local_version(fn);
+ if (fname == 0) return 0;
+ idx = bcf_idx_load_local(fname);
+ free(fname);
+ return idx;
+}
+
+int bcf_idx_build2(const char *fn, const char *_fnidx)
+{
+ char *fnidx;
+ BGZF *fpidx;
+ bcf_t *bp;
+ bcf_idx_t *idx;
+ if ((bp = bcf_open(fn, "r")) == 0) {
+ fprintf(stderr, "[bcf_idx_build2] fail to open the BAM file.\n");
+ return -1;
+ }
+ idx = bcf_idx_core(bp);
+ bcf_close(bp);
+ if (_fnidx == 0) {
+ fnidx = (char*)calloc(strlen(fn) + 5, 1);
+ strcpy(fnidx, fn); strcat(fnidx, ".bci");
+ } else fnidx = strdup(_fnidx);
+ fpidx = bgzf_open(fnidx, "w");
+ if (fpidx == 0) {
+ fprintf(stderr, "[bcf_idx_build2] fail to create the index file.\n");
+ free(fnidx);
+ return -1;
+ }
+ bcf_idx_save(idx, fpidx);
+ bcf_idx_destroy(idx);
+ bgzf_close(fpidx);
+ free(fnidx);
+ return 0;
+}
+
+int bcf_idx_build(const char *fn)
+{
+ return bcf_idx_build2(fn, 0);
+}
+
+/********************************************
+ * parse a region in the format chr:beg-end *
+ ********************************************/
+
+int bcf_parse_region(void *str2id, const char *str, int *tid, int *begin, int *end)
+{
+ char *s, *p;
+ int i, l, k;
+ l = strlen(str);
+ p = s = (char*)malloc(l+1);
+ /* squeeze out "," */
+ for (i = k = 0; i != l; ++i)
+ if (str[i] != ',' && !isspace(str[i])) s[k++] = str[i];
+ s[k] = 0;
+ for (i = 0; i != k; ++i) if (s[i] == ':') break;
+ s[i] = 0;
+ if ((*tid = bcf_str2id(str2id, s)) < 0) {
+ free(s);
+ return -1;
+ }
+ if (i == k) { /* dump the whole sequence */
+ *begin = 0; *end = 1<<29; free(s);
+ return 0;
+ }
+ for (p = s + i + 1; i != k; ++i) if (s[i] == '-') break;
+ *begin = atoi(p);
+ if (i < k) {
+ p = s + i + 1;
+ *end = atoi(p);
+ } else *end = 1<<29;
+ if (*begin > 0) --*begin;
+ free(s);
+ if (*begin > *end) return -1;
+ return 0;
+}
+
+/*******************************
+ * retrieve a specified region *
+ *******************************/
+
+uint64_t bcf_idx_query(const bcf_idx_t *idx, int tid, int beg, int end)
+{
+ uint64_t min_off;
+ if (beg < 0) beg = 0;
+ if (end < beg) return 0xffffffffffffffffull;
+ min_off = (beg>>TAD_LIDX_SHIFT >= idx->index2[tid].n)? idx->index2[tid].offset[idx->index2[tid].n-1]
+ : idx->index2[tid].offset[beg>>TAD_LIDX_SHIFT];
+ assert(min_off);
+ return min_off;
+}
+
+int bcf_main_index(int argc, char *argv[])
+{
+ if (argc == 1) {
+ fprintf(stderr, "Usage: bcftools index <in.bcf>\n");
+ return 1;
+ }
+ bcf_idx_build(argv[1]);
+ return 0;
+}
--- /dev/null
+#include <string.h>
+#include <stdlib.h>
+#include "bcf.h"
+
+int bcfview(int argc, char *argv[]);
+int bcf_main_index(int argc, char *argv[]);
+
+int main(int argc, char *argv[])
+{
+ if (argc == 1) {
+ fprintf(stderr, "\n");
+ fprintf(stderr, "Usage: bcftools <command> <arguments>\n\n");
+ fprintf(stderr, "Command: view print, extract, convert and call SNPs from BCF\n");
+ fprintf(stderr, " index index BCF\n");
+ fprintf(stderr, "\n");
+ return 1;
+ }
+ if (strcmp(argv[1], "view") == 0) return bcfview(argc-1, argv+1);
+ if (strcmp(argv[1], "index") == 0) return bcf_main_index(argc-1, argv+1);
+ else {
+ fprintf(stderr, "[main] Unrecognized command.\n");
+ return 1;
+ }
+ return 0;
+}
--- /dev/null
+#include <math.h>
+#include <stdlib.h>
+#include <string.h>
+#include <stdio.h>
+#include "prob1.h"
+
+#define MC_AVG_ERR 0.007
+#define MC_MAX_EM_ITER 16
+#define MC_EM_EPS 1e-4
+
+unsigned char seq_nt4_table[256] = {
+ 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
+ 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
+ 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4 /*'-'*/, 4, 4,
+ 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
+ 4, 0, 4, 1, 4, 4, 4, 2, 4, 4, 4, 4, 4, 4, 4, 4,
+ 4, 4, 4, 4, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
+ 4, 0, 4, 1, 4, 4, 4, 2, 4, 4, 4, 4, 4, 4, 4, 4,
+ 4, 4, 4, 4, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
+ 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
+ 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
+ 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
+ 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
+ 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
+ 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
+ 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
+ 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4
+};
+
+struct __bcf_p1aux_t {
+ int n, M;
+ double *q2p, *pdg; // pdg -> P(D|g)
+ double *phi, *CMk; // CMk=\binom{M}{k}
+ double *z, *zswap; // aux for afs
+ double *afs, *afs1; // afs: accumulative AFS; afs1: site posterior distribution
+ const uint8_t *PL; // point to PL
+ int PL_len;
+};
+
+void bcf_p1_init_prior(bcf_p1aux_t *ma, int type, double theta)
+{
+ int i;
+ if (type == MC_PTYPE_COND2) {
+ for (i = 0; i <= ma->M; ++i)
+ ma->phi[i] = 2. * (i + 1) / (ma->M + 1) / (ma->M + 2);
+ } else if (type == MC_PTYPE_FLAT) {
+ for (i = 0; i <= ma->M; ++i)
+ ma->phi[i] = 1. / (ma->M + 1);
+ } else {
+ double sum;
+ for (i = 0, sum = 0.; i < ma->M; ++i)
+ sum += (ma->phi[i] = theta / (ma->M - i));
+ ma->phi[ma->M] = 1. - sum;
+ }
+}
+
+bcf_p1aux_t *bcf_p1_init(int n) // FIXME: assuming diploid
+{
+ bcf_p1aux_t *ma;
+ int i;
+ ma = calloc(1, sizeof(bcf_p1aux_t));
+ ma->n = n; ma->M = 2 * n;
+ ma->q2p = calloc(256, sizeof(double));
+ ma->pdg = calloc(3 * ma->n, sizeof(double));
+ ma->phi = calloc(ma->M + 1, sizeof(double));
+ ma->CMk = calloc(ma->M + 1, sizeof(double));
+ ma->z = calloc(2 * ma->n + 1, sizeof(double));
+ ma->zswap = calloc(2 * ma->n + 1, sizeof(double));
+ ma->afs = calloc(2 * ma->n + 1, sizeof(double));
+ ma->afs1 = calloc(2 * ma->n + 1, sizeof(double));
+ for (i = 0; i < 256; ++i)
+ ma->q2p[i] = pow(10., -i / 10.);
+ for (i = 0; i <= ma->M; ++i)
+ ma->CMk[i] = exp(lgamma(ma->M + 1) - lgamma(i + 1) - lgamma(ma->M - i + 1));
+ bcf_p1_init_prior(ma, MC_PTYPE_FULL, 1e-3); // the simplest prior
+ return ma;
+}
+
+void bcf_p1_destroy(bcf_p1aux_t *ma)
+{
+ if (ma) {
+ free(ma->q2p); free(ma->pdg);
+ free(ma->phi); free(ma->CMk);
+ free(ma->z); free(ma->zswap);
+ free(ma->afs); free(ma->afs1);
+ free(ma);
+ }
+}
+
+#define char2int(s) (((int)s[0])<<8|s[1])
+
+static int cal_pdg(const bcf1_t *b, bcf_p1aux_t *ma)
+{
+ int i, j, k;
+ long *p, tmp;
+ p = alloca(b->n_alleles * sizeof(long));
+ memset(p, 0, sizeof(long) * b->n_alleles);
+ for (j = 0; j < ma->n; ++j) {
+ const uint8_t *pi = ma->PL + j * ma->PL_len;
+ double *pdg = ma->pdg + j * 3;
+ pdg[0] = ma->q2p[pi[b->n_alleles]]; pdg[1] = ma->q2p[pi[1]]; pdg[2] = ma->q2p[pi[0]];
+ for (i = k = 0; i < b->n_alleles; ++i) {
+ p[i] += (int)pi[k];
+ k += b->n_alleles - i;
+ }
+ }
+ for (i = 0; i < b->n_alleles; ++i) p[i] = p[i]<<4 | i;
+ for (i = 1; i < b->n_alleles; ++i) // insertion sort
+ for (j = i; j > 0 && p[j] < p[j-1]; --j)
+ tmp = p[j], p[j] = p[j-1], p[j-1] = tmp;
+ for (i = b->n_alleles - 1; i >= 0; --i)
+ if ((p[i]&0xf) == 0) break;
+ return i;
+}
+// f0 is the reference allele frequency
+static double mc_freq_iter(double f0, const bcf_p1aux_t *ma)
+{
+ double f, f3[3];
+ int i;
+ f3[0] = (1.-f0)*(1.-f0); f3[1] = 2.*f0*(1.-f0); f3[2] = f0*f0;
+ for (i = 0, f = 0.; i < ma->n; ++i) {
+ double *pdg;
+ pdg = ma->pdg + i * 3;
+ f += (pdg[1] * f3[1] + 2. * pdg[2] * f3[2])
+ / (pdg[0] * f3[0] + pdg[1] * f3[1] + pdg[2] * f3[2]);
+ }
+ f /= ma->n * 2.;
+ return f;
+}
+
+int bcf_p1_call_gt(const bcf_p1aux_t *ma, double f0, int k)
+{
+ double sum, g[3];
+ double max, f3[3], *pdg = ma->pdg + k * 3;
+ int q, i, max_i;
+ f3[0] = (1.-f0)*(1.-f0); f3[1] = 2.*f0*(1.-f0); f3[2] = f0*f0;
+ for (i = 0, sum = 0.; i < 3; ++i)
+ sum += (g[i] = pdg[i] * f3[i]);
+ for (i = 0, max = -1., max_i = 0; i < 3; ++i) {
+ g[i] /= sum;
+ if (g[i] > max) max = g[i], max_i = i;
+ }
+ max = 1. - max;
+ if (max < 1e-308) max = 1e-308;
+ q = (int)(-3.434 * log(max) + .499);
+ if (q > 99) q = 99;
+ return q<<2|max_i;
+}
+
+static void mc_cal_z(bcf_p1aux_t *ma)
+{
+ double *z[2], *tmp, *pdg;
+ int i, j;
+ z[0] = ma->z;
+ z[1] = ma->zswap;
+ pdg = ma->pdg;
+ z[0][0] = 1.; z[0][1] = z[0][2] = 0.;
+ for (j = 0; j < ma->n; ++j) {
+ int max = (j + 1) * 2;
+ double p[3];
+ pdg = ma->pdg + j * 3;
+ p[0] = pdg[0]; p[1] = 2. * pdg[1]; p[2] = pdg[2];
+ z[1][0] = p[0] * z[0][0];
+ z[1][1] = p[0] * z[0][1] + p[1] * z[0][0];
+ for (i = 2; i <= max; ++i)
+ z[1][i] = p[0] * z[0][i] + p[1] * z[0][i-1] + p[2] * z[0][i-2];
+ if (j < ma->n - 1) z[1][max+1] = z[1][max+2] = 0.;
+ tmp = z[0]; z[0] = z[1]; z[1] = tmp;
+ }
+ if (z[0] != ma->z) memcpy(ma->z, z[0], sizeof(double) * (ma->M + 1));
+}
+
+static double mc_cal_afs(bcf_p1aux_t *ma)
+{
+ int k;
+ long double sum = 0.;
+ memset(ma->afs1, 0, sizeof(double) * (ma->M + 1));
+ mc_cal_z(ma);
+ for (k = 0, sum = 0.; k <= ma->M; ++k)
+ sum += (long double)ma->phi[k] * ma->z[k] / ma->CMk[k];
+ for (k = 0; k <= ma->M; ++k) {
+ ma->afs1[k] = ma->phi[k] * ma->z[k] / ma->CMk[k] / sum;
+ if (isnan(ma->afs1[k]) || isinf(ma->afs1[k])) return -1.;
+ }
+ for (k = 0, sum = 0.; k <= ma->M; ++k) {
+ ma->afs[k] += ma->afs1[k];
+ sum += k * ma->afs1[k];
+ }
+ return sum / ma->M;
+}
+
+int bcf_p1_cal(bcf1_t *b, bcf_p1aux_t *ma, bcf_p1rst_t *rst)
+{
+ int i, k;
+ long double sum = 0.;
+ // set PL and PL_len
+ for (i = 0; i < b->n_gi; ++i) {
+ if (b->gi[i].fmt == char2int("PL")) {
+ ma->PL = (uint8_t*)b->gi[i].data;
+ ma->PL_len = b->gi[i].len;
+ break;
+ }
+ }
+ if (b->n_alleles < 2) return -1; // FIXME: find a better solution
+ //
+ rst->rank0 = cal_pdg(b, ma);
+ rst->f_exp = mc_cal_afs(ma);
+ rst->p_ref = ma->afs1[ma->M];
+ // calculate f_flat and f_em
+ for (k = 0, sum = 0.; k <= ma->M; ++k)
+ sum += (long double)ma->z[k] / ma->CMk[k];
+ rst->f_flat = 0.;
+ for (k = 0; k <= ma->M; ++k) {
+ double p = ma->z[k] / ma->CMk[k] / sum;
+ rst->f_flat += k * p;
+ }
+ rst->f_flat /= ma->M;
+ { // calculate f_em
+ double flast = rst->f_flat;
+ for (i = 0; i < MC_MAX_EM_ITER; ++i) {
+ rst->f_em = mc_freq_iter(flast, ma);
+ if (fabs(rst->f_em - flast) < MC_EM_EPS) break;
+ flast = rst->f_em;
+ }
+ }
+ return 0;
+}
+
+void bcf_p1_dump_afs(bcf_p1aux_t *ma)
+{
+ int k;
+ fprintf(stderr, "[afs]");
+ for (k = 0; k <= ma->M; ++k)
+ fprintf(stderr, " %d:%.3lf", k, ma->afs[ma->M - k]);
+ fprintf(stderr, "\n");
+ memset(ma->afs, 0, sizeof(double) * (ma->M + 1));
+}
--- /dev/null
+#ifndef BCF_PROB1_H
+#define BCF_PROB1_H
+
+#include "bcf.h"
+
+struct __bcf_p1aux_t;
+typedef struct __bcf_p1aux_t bcf_p1aux_t;
+
+typedef struct {
+ int rank0;
+ double f_em, f_exp, f_flat, p_ref;
+} bcf_p1rst_t;
+
+#define MC_PTYPE_FULL 1
+#define MC_PTYPE_COND2 2
+#define MC_PTYPE_FLAT 3
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+ bcf_p1aux_t *bcf_p1_init(int n);
+ void bcf_p1_init_prior(bcf_p1aux_t *ma, int type, double theta);
+ void bcf_p1_destroy(bcf_p1aux_t *ma);
+ int bcf_p1_cal(bcf1_t *b, bcf_p1aux_t *ma, bcf_p1rst_t *rst);
+ int bcf_p1_call_gt(const bcf_p1aux_t *ma, double f0, int k);
+ void bcf_p1_dump_afs(bcf_p1aux_t *ma);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
--- /dev/null
+#include <unistd.h>
+#include <stdlib.h>
+#include <math.h>
+#include <zlib.h>
+#include "bcf.h"
+#include "prob1.h"
+#include "kstring.h"
+
+#include "khash.h"
+KHASH_SET_INIT_INT64(set64)
+
+#include "kseq.h"
+KSTREAM_INIT(gzFile, gzread, 16384)
+
+#define VC_NO_PL 1
+#define VC_NO_GENO 2
+#define VC_BCF 4
+#define VC_CALL 8
+#define VC_VARONLY 16
+
+typedef struct {
+ int flag, prior_type;
+ char *fn_list;
+ double theta;
+} viewconf_t;
+
+khash_t(set64) *bcf_load_pos(const char *fn, bcf_hdr_t *_h)
+{
+ void *str2id;
+ gzFile fp;
+ kstream_t *ks;
+ int ret, dret, lineno = 1;
+ kstring_t *str;
+ khash_t(set64) *hash = 0;
+
+ hash = kh_init(set64);
+ str2id = bcf_build_refhash(_h);
+ str = calloc(1, sizeof(kstring_t));
+ fp = strcmp(fn, "-")? gzopen(fn, "r") : gzdopen(fileno(stdin), "r");
+ ks = ks_init(fp);
+ while (ks_getuntil(ks, 0, str, &dret) >= 0) {
+ int tid = bcf_str2id(str2id, str->s);
+ if (tid >= 0 && dret != '\n') {
+ if (ks_getuntil(ks, 0, str, &dret) >= 0) {
+ uint64_t x = (uint64_t)tid<<32 | (atoi(str->s) - 1);
+ kh_put(set64, hash, x, &ret);
+ } else break;
+ } else fprintf(stderr, "[%s] %s is not a reference name (line %d).\n", __func__, str->s, lineno);
+ if (dret != '\n') while ((dret = ks_getc(ks)) > 0 && dret != '\n');
+ if (dret < 0) break;
+ ++lineno;
+ }
+ bcf_str2id_destroy(str2id);
+ ks_destroy(ks);
+ gzclose(fp);
+ free(str->s); free(str);
+ return hash;
+}
+
+static int update_bcf1(bcf1_t *b, const bcf_p1aux_t *pa, const bcf_p1rst_t *pr, int flag)
+{
+ kstring_t s;
+ int x, is_var = (pr->p_ref < .5);
+ double r = is_var? pr->p_ref : 1. - pr->p_ref;
+ memset(&s, 0, sizeof(kstring_t));
+ kputc('\0', &s); kputs(b->ref, &s); kputc('\0', &s);
+ if (is_var) {
+ kputs(b->alt, &s);
+ }
+ kputc('\0', &s); kputc('\0', &s);
+ kputs(b->info, &s);
+ if (b->info[0]) kputc(';', &s);
+ ksprintf(&s, "AF1=%.3lf;AFE=%.3lf", 1.-pr->f_em, 1.-pr->f_exp);
+ kputc('\0', &s);
+ kputs(b->fmt, &s); kputc('\0', &s);
+ free(b->str);
+ b->m_str = s.m; b->l_str = s.l; b->str = s.s;
+ x = (int)(r < 1e-100? 99 : -3.434 * log(r) + .499);
+ b->qual = x > 99? 99 : x;
+ return is_var;
+}
+
+int bcfview(int argc, char *argv[])
+{
+ bcf_t *bp, *bout = 0;
+ bcf1_t *b;
+ int c;
+ uint64_t n_processed = 0;
+ viewconf_t vc;
+ bcf_p1aux_t *p1 = 0;
+ int tid, begin, end;
+ khash_t(set64) *hash = 0;
+
+ tid = begin = end = -1;
+ memset(&vc, 0, sizeof(viewconf_t));
+ vc.prior_type = -1; vc.theta = 1e-3;
+ while ((c = getopt(argc, argv, "l:cGvLbP:t:")) >= 0) {
+ switch (c) {
+ case 'l': vc.fn_list = strdup(optarg); break;
+ case 'G': vc.flag |= VC_NO_GENO; break;
+ case 'L': vc.flag |= VC_NO_PL; break;
+ case 'b': vc.flag |= VC_BCF; break;
+ case 'c': vc.flag |= VC_CALL; break;
+ case 'v': vc.flag |= VC_VARONLY; break;
+ case 't': vc.theta = atof(optarg); break;
+ case 'P':
+ if (strcmp(optarg, "full") == 0) vc.prior_type = MC_PTYPE_FULL;
+ else if (strcmp(optarg, "cond2") == 0) vc.prior_type = MC_PTYPE_COND2;
+ else if (strcmp(optarg, "flat") == 0) vc.prior_type = MC_PTYPE_FLAT;
+ break;
+ }
+ }
+ if (argc == optind) {
+ fprintf(stderr, "Usage: bcftools view [-cGPb] [-l list] <in.bcf> [reg]\n");
+ return 1;
+ }
+
+ b = calloc(1, sizeof(bcf1_t));
+ bp = bcf_open(argv[optind], "r");
+ if (vc.flag & VC_BCF) {
+ bout = bcf_open("-", "w");
+ bcf_hdr_cpy(&bout->h, &bp->h);
+ bcf_hdr_write(bout);
+ }
+ if (vc.flag & VC_CALL) {
+ p1 = bcf_p1_init(bp->h.n_smpl);
+ bcf_p1_init_prior(p1, vc.prior_type, vc.theta);
+ }
+ if (vc.fn_list) hash = bcf_load_pos(vc.fn_list, &bp->h);
+ if (optind + 1 < argc) {
+ void *str2id = bcf_build_refhash(&bp->h);
+ if (bcf_parse_region(str2id, argv[optind+1], &tid, &begin, &end) >= 0) {
+ bcf_idx_t *idx;
+ idx = bcf_idx_load(argv[optind]);
+ if (idx) {
+ uint64_t off;
+ off = bcf_idx_query(idx, tid, begin, end);
+ bgzf_seek(bp->fp, off, SEEK_SET);
+ bcf_idx_destroy(idx);
+ }
+ }
+ }
+ while (bcf_read(bp, b) > 0) {
+ if (hash) {
+ uint64_t x = (uint64_t)b->tid<<32 | b->pos;
+ khint_t k = kh_get(set64, hash, x);
+ if (k == kh_end(hash)) continue;
+ }
+ if (tid >= 0) {
+ int l = strlen(b->ref);
+ l = b->pos + (l > 0? l : 1);
+ if (b->tid != tid || b->pos >= end) break;
+ if (!(l > begin && end > b->pos)) continue;
+ }
+ if (vc.flag & VC_CALL) {
+ bcf_p1rst_t pr;
+ int is_var;
+ bcf_p1_cal(b, p1, &pr);
+ is_var = update_bcf1(b, p1, &pr, vc.flag);
+ bcf_sync(bp->h.n_smpl, b);
+ if ((n_processed + 1) % 50000 == 0) bcf_p1_dump_afs(p1);
+ if ((vc.flag & VC_VARONLY) && !is_var) continue;
+ }
+ if (vc.flag & VC_BCF) bcf_write(bout, b);
+ else {
+ char *vcf;
+ if (vc.flag & VC_NO_GENO) {
+ b->n_gi = 0;
+ b->fmt[0] = '\0';
+ }
+ vcf = bcf_fmt(bp, b);
+ puts(vcf);
+ free(vcf);
+ }
+ ++n_processed;
+ }
+ if (vc.flag & VC_CALL) bcf_p1_dump_afs(p1);
+ bcf_close(bp); bcf_close(bout);
+ bcf_destroy(b);
+ if (hash) kh_destroy(set64, hash);
+ if (vc.fn_list) free(vc.fn_list);
+ if (p1) bcf_p1_destroy(p1);
+ return 0;
+}
+