From: Heng Li Date: Wed, 4 Aug 2010 01:04:00 +0000 (+0000) Subject: move bcftools to samtools X-Git-Url: https://git.donarmstrong.com/?p=samtools.git;a=commitdiff_plain;h=fd187467e3c5f026a4d934c5b0cb68a42939f6e1 move bcftools to samtools --- diff --git a/Makefile b/Makefile index 7d98b24..8a7418e 100644 --- a/Makefile +++ b/Makefile @@ -7,10 +7,10 @@ LOBJS= bgzf.o kstring.o bam_aux.o bam.o bam_import.o sam.o bam_index.o \ $(KNETFILE_O) bam_sort.o sam_header.o bam_reheader.o AOBJS= bam_tview.o bam_maqcns.o bam_plcmd.o sam_view.o \ bam_rmdup.o bam_rmdupse.o bam_mate.o bam_stat.o bam_color.o \ - bamtk.o kaln.o bam_mcns.o bam2bcf.o bcf.o + bamtk.o kaln.o bam_mcns.o bam2bcf.o PROG= samtools -INCLUDES= -SUBDIRS= . misc +INCLUDES= -Ibcftools +SUBDIRS= . bcftools misc LIBPATH= LIBCURSES= -lcurses # -lXCurses @@ -25,7 +25,7 @@ all-recur lib-recur clean-recur cleanlocal-recur install-recur: list='$(SUBDIRS)'; for subdir in $$list; do \ cd $$subdir; \ $(MAKE) CC="$(CC)" DFLAGS="$(DFLAGS)" CFLAGS="$(CFLAGS)" \ - INCLUDES="$(INCLUDES)" LIBPATH="$(LIBPATH)" $$target || exit 1; \ + LIBPATH="$(LIBPATH)" $$target || exit 1; \ cd $$wdir; \ done; @@ -39,8 +39,8 @@ lib:libbam.a libbam.a:$(LOBJS) $(AR) -cru $@ $(LOBJS) -samtools:$(AOBJS) libbam.a - $(CC) $(CFLAGS) -o $@ $(AOBJS) libbam.a -lm $(LIBPATH) $(LIBCURSES) -lz +samtools:lib-recur $(AOBJS) + $(CC) $(CFLAGS) -o $@ $(AOBJS) libbam.a -lm $(LIBPATH) $(LIBCURSES) -lz -Lbcftools -lbcf razip:razip.o razf.o $(KNETFILE_O) $(CC) $(CFLAGS) -o $@ razf.o razip.o $(KNETFILE_O) -lz diff --git a/bcf.c b/bcf.c deleted file mode 100644 index f8b55c2..0000000 --- a/bcf.c +++ /dev/null @@ -1,261 +0,0 @@ -#include -#include -#include -#include "kstring.h" -#include "bcf.h" - -int bcf_hdr_read(bcf_t *b); - -void bcf_hdr_clear(bcf_hdr_t *b) -{ - free(b->name); free(b->sname); free(b->txt); free(b->ns); free(b->sns); - memset(b, 0, sizeof(bcf_hdr_t)); -} - -bcf_t *bcf_open(const char *fn, const char *mode) -{ - bcf_t *b; - b = calloc(1, sizeof(bcf_t)); - if (strchr(mode, 'w')) { - b->fp = strcmp(fn, "-")? bgzf_open(fn, mode) : bgzf_fdopen(fileno(stdout), "w"); - } else { - b->fp = strcmp(fn, "-")? bgzf_open(fn, mode) : bgzf_fdopen(fileno(stdin), "r"); - } - b->fp->owned_file = 1; - if (strchr(mode, 'r')) bcf_hdr_read(b); - return b; -} - -int bcf_close(bcf_t *b) -{ - int ret; - if (b == 0) return 0; - ret = bgzf_close(b->fp); - free(b->h.name); free(b->h.sname); free(b->h.txt); free(b->h.ns); free(b->h.sns); - free(b); - return ret; -} - -int bcf_hdr_write(bcf_t *b) -{ - bcf_hdr_t *h; - if (b == 0) return -1; - h = &b->h; - bgzf_write(b->fp, "BCF\4", 4); - bgzf_write(b->fp, &h->l_nm, 4); - bgzf_write(b->fp, h->name, h->l_nm); - bgzf_write(b->fp, &h->l_smpl, 4); - bgzf_write(b->fp, h->sname, h->l_smpl); - bgzf_write(b->fp, &h->l_txt, 4); - bgzf_write(b->fp, h->txt, h->l_txt); - bgzf_flush(b->fp); - return 16 + h->l_nm + h->l_smpl + h->l_txt; -} - -int bcf_hdr_cpy(bcf_hdr_t *h, const bcf_hdr_t *h0) -{ - *h = *h0; - h->name = malloc(h->l_nm); - h->sname = malloc(h->l_smpl); - h->txt = malloc(h->l_txt); - memcpy(h->name, h0->name, h->l_nm); - memcpy(h->sname, h0->sname, h->l_smpl); - memcpy(h->txt, h0->txt, h->l_txt); - bcf_hdr_sync(h); - return 0; -} - -int bcf_hdr_read(bcf_t *b) -{ - uint8_t magic[4]; - bcf_hdr_t *h; - if (b == 0) return -1; - bcf_hdr_clear(&b->h); - h = &b->h; - bgzf_read(b->fp, magic, 4); - bgzf_read(b->fp, &h->l_nm, 4); - h->name = malloc(h->l_nm); - bgzf_read(b->fp, h->name, h->l_nm); - bgzf_read(b->fp, &h->l_smpl, 4); - h->sname = malloc(h->l_smpl); - bgzf_read(b->fp, h->sname, h->l_smpl); - bgzf_read(b->fp, &h->l_txt, 4); - h->txt = malloc(h->l_txt); - bgzf_read(b->fp, h->txt, h->l_txt); - bcf_hdr_sync(&b->h); - return 16 + h->l_nm + h->l_smpl + h->l_txt; -} - -static inline char **cnt_null(int l, char *str, int *_n) -{ - int n = 0; - char *p, **list; - *_n = 0; - if (l == 0 || str == 0) return 0; - for (p = str; p != str + l; ++p) - if (*p == 0) ++n; - *_n = n; - list = calloc(n, sizeof(void*)); - list[0] = str; - for (p = str, n = 1; p < str + l - 1; ++p) - if (*p == 0) list[n++] = p + 1; - return list; -} - -int bcf_hdr_sync(bcf_hdr_t *b) -{ - if (b == 0) return -1; - b->ns = cnt_null(b->l_nm, b->name, &b->n_ref); - b->sns = cnt_null(b->l_smpl, b->sname, &b->n_smpl); - return 0; -} - -#define char2int(s) (((int)s[0])<<8|s[1]) - -int bcf_sync(int n_smpl, bcf1_t *b) -{ - char *p, *tmp[5], *s; - int i, n; - // set ref, alt, flt, info, fmt - b->ref = b->alt = b->flt = b->info = b->fmt = 0; - for (p = b->str, n = 0; p < b->str + b->l_str; ++p) - if (*p == 0 && p+1 != b->str + b->l_str) tmp[n++] = p + 1; - if (n != 5) return -1; - b->ref = tmp[0]; b->alt = tmp[1]; b->flt = tmp[2]; b->info = tmp[3]; b->fmt = tmp[4]; - // set n_alleles - for (p = b->alt, n = 1; *p; ++p) - if (*p == ',') ++n; - b->n_alleles = n + 1; - // set n_gi and gi[i].fmt - for (p = b->fmt, n = 1; *p; ++p) - if (*p == ':') ++n; - if (n > b->m_gi) { - int old_m = b->m_gi; - b->m_gi = n; - kroundup32(b->m_gi); - b->gi = realloc(b->gi, b->m_gi * sizeof(bcf_ginfo_t)); - memset(b->gi + old_m, 0, (b->m_gi - old_m) * sizeof(bcf_ginfo_t)); - } - b->n_gi = n; - for (p = s = b->fmt, n = 0; *p; ++p) { - if (*p == ':' || *(p+1) == 0) { - char *q = *p == ':'? p : p + 1; - if ((q - s) != 2) return -2; - b->gi[n].fmt = char2int(s); - s = q; - } - } - // set gi[i].len - for (i = 0; i < b->n_gi; ++i) { - if (b->gi[i].fmt == char2int("PL")) { - b->gi[i].len = b->n_alleles * (b->n_alleles + 1) / 2; - } else if (b->gi[i].fmt == char2int("DP") || b->gi[i].fmt == char2int("HQ")) { - b->gi[i].len = 2; - } else if (b->gi[i].fmt == char2int("GQ") || b->gi[i].fmt == char2int("GT")) { - b->gi[i].len = 1; - } else if (b->gi[i].fmt == char2int("GL")) { - b->gi[i].len = 4; - } - b->gi[i].data = realloc(b->gi[i].data, n_smpl * b->gi[i].len); - } - return 0; -} - -int bcf_write(bcf_t *bp, const bcf1_t *b) -{ - uint32_t x; - int i, l = 0; - if (b == 0) return -1; - bgzf_write(bp->fp, &b->tid, 4); - bgzf_write(bp->fp, &b->pos, 4); - x = b->qual<<24 | b->l_str; - bgzf_write(bp->fp, &x, 4); - bgzf_write(bp->fp, b->str, b->l_str); - l = 12 + b->l_str; - for (i = 0; i < b->n_gi; ++i) { - bgzf_write(bp->fp, b->gi[i].data, b->gi[i].len * bp->h.n_smpl); - l += b->gi[i].len * bp->h.n_smpl; - } - return l; -} - -int bcf_read(bcf_t *bp, bcf1_t *b) -{ - int i, l = 0; - uint32_t x; - if (b == 0) return -1; - if (bgzf_read(bp->fp, &b->tid, 4) == 0) return 0; - bgzf_read(bp->fp, &b->pos, 4); - bgzf_read(bp->fp, &x, 4); - b->qual = x >> 24; b->l_str = x << 8 >> 8; - if (b->l_str > b->m_str) { - b->m_str = b->l_str; - kroundup32(b->m_str); - b->str = realloc(b->str, b->m_str); - } - bgzf_read(bp->fp, b->str, b->l_str); - l = 12 + b->l_str; - bcf_sync(bp->h.n_smpl, b); - for (i = 0; i < b->n_gi; ++i) { - bgzf_read(bp->fp, b->gi[i].data, b->gi[i].len * bp->h.n_smpl); - l += b->gi[i].len * bp->h.n_smpl; - } - return l; -} - -int bcf_destroy(bcf1_t *b) -{ - int i; - if (b == 0) return -1; - free(b->str); - for (i = 0; i < b->n_gi; ++i) - free(b->gi[i].data); - free(b->gi); - free(b); - return 0; -} - -static inline void fmt_str(const char *p, kstring_t *s) -{ - if (*p == 0) kputc('.', s); - else kputs(p, s); -} - -char *bcf_fmt(bcf_t *bp, bcf1_t *b) -{ - kstring_t s; - int i, j, x; - memset(&s, 0, sizeof(kstring_t)); - kputs(bp->h.ns[b->tid], &s); kputc('\t', &s); - kputw(b->pos + 1, &s); kputc('\t', &s); - fmt_str(b->str, &s); kputc('\t', &s); - fmt_str(b->ref, &s); kputc('\t', &s); - fmt_str(b->alt, &s); kputc('\t', &s); - kputw(b->qual, &s); kputc('\t', &s); - fmt_str(b->flt, &s); kputc('\t', &s); - fmt_str(b->info, &s); - if (b->fmt[0]) { - kputc('\t', &s); - fmt_str(b->fmt, &s); - } - x = b->n_alleles * (b->n_alleles + 1) / 2; - for (j = 0; j < bp->h.n_smpl; ++j) { - kputc('\t', &s); - for (i = 0; i < b->n_gi; ++i) { - if (i) kputc(':', &s); - if (b->gi[i].fmt == char2int("PL")) { - uint8_t *d = (uint8_t*)b->gi[i].data + j * x; - int k; - for (k = 0; k < x; ++k) { - if (k > 0) kputc(',', &s); - kputw(d[k], &s); - } - } else if (b->gi[i].fmt == char2int("DP")) { - kputw(((uint16_t*)b->gi[i].data)[j], &s); - } else if (b->gi[i].fmt == char2int("GQ")) { - kputw(((uint8_t*)b->gi[i].data)[j], &s); - } - } - } - return s.s; -} diff --git a/bcf.h b/bcf.h deleted file mode 100644 index 7512f3e..0000000 --- a/bcf.h +++ /dev/null @@ -1,57 +0,0 @@ -#ifndef BCF_H -#define BCF_H - -#include -#include "bgzf.h" - -typedef struct { - int fmt, len; // len is the unit length - void *data; - // derived info: fmt, len -} bcf_ginfo_t; - -typedef struct { - int32_t tid, pos; - uint32_t qual:8, l_str:24; - int m_str; - char *str, *ref, *alt, *flt, *info, *fmt; // fmt, ref, alt and info point to str - int n_gi, m_gi; - bcf_ginfo_t *gi; - int n_alleles; - // derived info: ref, alt, flt, info, fmt, n_gi, n_alleles -} bcf1_t; - -typedef struct { - int32_t n_ref, n_smpl; - int32_t l_nm; - int32_t l_smpl; - int32_t l_txt; - char *name, *sname, *txt; - char **ns, **sns; - // derived info: n_ref, n_smpl, ns, sns -} bcf_hdr_t; - -typedef struct { - BGZF *fp; - bcf_hdr_t h; -} bcf_t; - -#ifdef __cplusplus -extern "C" { -#endif - - bcf_t *bcf_open(const char *fn, const char *mode); - int bcf_close(bcf_t *b); - int bcf_read(bcf_t *bp, bcf1_t *b); - int bcf_sync(int n_smpl, bcf1_t *b); - int bcf_write(bcf_t *bp, const bcf1_t *b); - int bcf_hdr_write(bcf_t *b); - int bcf_hdr_sync(bcf_hdr_t *b); - int bcf_destroy(bcf1_t *b); - char *bcf_fmt(bcf_t *bp, bcf1_t *b); - -#ifdef __cplusplus -} -#endif - -#endif diff --git a/bcftools/Makefile b/bcftools/Makefile new file mode 100644 index 0000000..ec3fa74 --- /dev/null +++ b/bcftools/Makefile @@ -0,0 +1,47 @@ +CC= gcc +CFLAGS= -g -Wall -O2 #-fPIC #-m64 #-arch ppc +DFLAGS= -D_FILE_OFFSET_BITS=64 -D_USE_KNETFILE +LOBJS= bcf.o bcfutils.o prob1.o index.o +OMISC= .. +AOBJS= vcfout.o main.o $(OMISC)/kstring.o $(OMISC)/bgzf.o $(OMISC)/knetfile.o +PROG= bcftools +INCLUDES= -I.. +SUBDIRS= . + +.SUFFIXES:.c .o + +.c.o: + $(CC) -c $(CFLAGS) $(DFLAGS) $(INCLUDES) $< -o $@ + +all-recur lib-recur clean-recur cleanlocal-recur install-recur: + @target=`echo $@ | sed s/-recur//`; \ + wdir=`pwd`; \ + list='$(SUBDIRS)'; for subdir in $$list; do \ + cd $$subdir; \ + $(MAKE) CC="$(CC)" DFLAGS="$(DFLAGS)" CFLAGS="$(CFLAGS)" \ + INCLUDES="$(INCLUDES)" LIBPATH="$(LIBPATH)" $$target || exit 1; \ + cd $$wdir; \ + done; + +all:$(PROG) + +lib:libbcf.a + +libbcf.a:$(LOBJS) + $(AR) -cru $@ $(LOBJS) + +bcftools:lib $(AOBJS) + $(CC) $(CFLAGS) -o $@ $(AOBJS) -lm $(LIBPATH) -lz -L. -lbcf + +bcf.o:bcf.h +prob1.o:prob1.h bcf.h +vcfout.o:bcf.h prob1.h +main.o:bcf.h + +bcf.pdf:bcf.tex + pdflatex bcf + +cleanlocal: + rm -fr gmon.out *.o a.out *.dSYM $(PROG) *~ *.a bcf.aux bcf.log bcf.pdf *.class libbcf.*.dylib libbcf.so* + +clean:cleanlocal-recur diff --git a/bcftools/bcf.c b/bcftools/bcf.c new file mode 100644 index 0000000..f8b55c2 --- /dev/null +++ b/bcftools/bcf.c @@ -0,0 +1,261 @@ +#include +#include +#include +#include "kstring.h" +#include "bcf.h" + +int bcf_hdr_read(bcf_t *b); + +void bcf_hdr_clear(bcf_hdr_t *b) +{ + free(b->name); free(b->sname); free(b->txt); free(b->ns); free(b->sns); + memset(b, 0, sizeof(bcf_hdr_t)); +} + +bcf_t *bcf_open(const char *fn, const char *mode) +{ + bcf_t *b; + b = calloc(1, sizeof(bcf_t)); + if (strchr(mode, 'w')) { + b->fp = strcmp(fn, "-")? bgzf_open(fn, mode) : bgzf_fdopen(fileno(stdout), "w"); + } else { + b->fp = strcmp(fn, "-")? bgzf_open(fn, mode) : bgzf_fdopen(fileno(stdin), "r"); + } + b->fp->owned_file = 1; + if (strchr(mode, 'r')) bcf_hdr_read(b); + return b; +} + +int bcf_close(bcf_t *b) +{ + int ret; + if (b == 0) return 0; + ret = bgzf_close(b->fp); + free(b->h.name); free(b->h.sname); free(b->h.txt); free(b->h.ns); free(b->h.sns); + free(b); + return ret; +} + +int bcf_hdr_write(bcf_t *b) +{ + bcf_hdr_t *h; + if (b == 0) return -1; + h = &b->h; + bgzf_write(b->fp, "BCF\4", 4); + bgzf_write(b->fp, &h->l_nm, 4); + bgzf_write(b->fp, h->name, h->l_nm); + bgzf_write(b->fp, &h->l_smpl, 4); + bgzf_write(b->fp, h->sname, h->l_smpl); + bgzf_write(b->fp, &h->l_txt, 4); + bgzf_write(b->fp, h->txt, h->l_txt); + bgzf_flush(b->fp); + return 16 + h->l_nm + h->l_smpl + h->l_txt; +} + +int bcf_hdr_cpy(bcf_hdr_t *h, const bcf_hdr_t *h0) +{ + *h = *h0; + h->name = malloc(h->l_nm); + h->sname = malloc(h->l_smpl); + h->txt = malloc(h->l_txt); + memcpy(h->name, h0->name, h->l_nm); + memcpy(h->sname, h0->sname, h->l_smpl); + memcpy(h->txt, h0->txt, h->l_txt); + bcf_hdr_sync(h); + return 0; +} + +int bcf_hdr_read(bcf_t *b) +{ + uint8_t magic[4]; + bcf_hdr_t *h; + if (b == 0) return -1; + bcf_hdr_clear(&b->h); + h = &b->h; + bgzf_read(b->fp, magic, 4); + bgzf_read(b->fp, &h->l_nm, 4); + h->name = malloc(h->l_nm); + bgzf_read(b->fp, h->name, h->l_nm); + bgzf_read(b->fp, &h->l_smpl, 4); + h->sname = malloc(h->l_smpl); + bgzf_read(b->fp, h->sname, h->l_smpl); + bgzf_read(b->fp, &h->l_txt, 4); + h->txt = malloc(h->l_txt); + bgzf_read(b->fp, h->txt, h->l_txt); + bcf_hdr_sync(&b->h); + return 16 + h->l_nm + h->l_smpl + h->l_txt; +} + +static inline char **cnt_null(int l, char *str, int *_n) +{ + int n = 0; + char *p, **list; + *_n = 0; + if (l == 0 || str == 0) return 0; + for (p = str; p != str + l; ++p) + if (*p == 0) ++n; + *_n = n; + list = calloc(n, sizeof(void*)); + list[0] = str; + for (p = str, n = 1; p < str + l - 1; ++p) + if (*p == 0) list[n++] = p + 1; + return list; +} + +int bcf_hdr_sync(bcf_hdr_t *b) +{ + if (b == 0) return -1; + b->ns = cnt_null(b->l_nm, b->name, &b->n_ref); + b->sns = cnt_null(b->l_smpl, b->sname, &b->n_smpl); + return 0; +} + +#define char2int(s) (((int)s[0])<<8|s[1]) + +int bcf_sync(int n_smpl, bcf1_t *b) +{ + char *p, *tmp[5], *s; + int i, n; + // set ref, alt, flt, info, fmt + b->ref = b->alt = b->flt = b->info = b->fmt = 0; + for (p = b->str, n = 0; p < b->str + b->l_str; ++p) + if (*p == 0 && p+1 != b->str + b->l_str) tmp[n++] = p + 1; + if (n != 5) return -1; + b->ref = tmp[0]; b->alt = tmp[1]; b->flt = tmp[2]; b->info = tmp[3]; b->fmt = tmp[4]; + // set n_alleles + for (p = b->alt, n = 1; *p; ++p) + if (*p == ',') ++n; + b->n_alleles = n + 1; + // set n_gi and gi[i].fmt + for (p = b->fmt, n = 1; *p; ++p) + if (*p == ':') ++n; + if (n > b->m_gi) { + int old_m = b->m_gi; + b->m_gi = n; + kroundup32(b->m_gi); + b->gi = realloc(b->gi, b->m_gi * sizeof(bcf_ginfo_t)); + memset(b->gi + old_m, 0, (b->m_gi - old_m) * sizeof(bcf_ginfo_t)); + } + b->n_gi = n; + for (p = s = b->fmt, n = 0; *p; ++p) { + if (*p == ':' || *(p+1) == 0) { + char *q = *p == ':'? p : p + 1; + if ((q - s) != 2) return -2; + b->gi[n].fmt = char2int(s); + s = q; + } + } + // set gi[i].len + for (i = 0; i < b->n_gi; ++i) { + if (b->gi[i].fmt == char2int("PL")) { + b->gi[i].len = b->n_alleles * (b->n_alleles + 1) / 2; + } else if (b->gi[i].fmt == char2int("DP") || b->gi[i].fmt == char2int("HQ")) { + b->gi[i].len = 2; + } else if (b->gi[i].fmt == char2int("GQ") || b->gi[i].fmt == char2int("GT")) { + b->gi[i].len = 1; + } else if (b->gi[i].fmt == char2int("GL")) { + b->gi[i].len = 4; + } + b->gi[i].data = realloc(b->gi[i].data, n_smpl * b->gi[i].len); + } + return 0; +} + +int bcf_write(bcf_t *bp, const bcf1_t *b) +{ + uint32_t x; + int i, l = 0; + if (b == 0) return -1; + bgzf_write(bp->fp, &b->tid, 4); + bgzf_write(bp->fp, &b->pos, 4); + x = b->qual<<24 | b->l_str; + bgzf_write(bp->fp, &x, 4); + bgzf_write(bp->fp, b->str, b->l_str); + l = 12 + b->l_str; + for (i = 0; i < b->n_gi; ++i) { + bgzf_write(bp->fp, b->gi[i].data, b->gi[i].len * bp->h.n_smpl); + l += b->gi[i].len * bp->h.n_smpl; + } + return l; +} + +int bcf_read(bcf_t *bp, bcf1_t *b) +{ + int i, l = 0; + uint32_t x; + if (b == 0) return -1; + if (bgzf_read(bp->fp, &b->tid, 4) == 0) return 0; + bgzf_read(bp->fp, &b->pos, 4); + bgzf_read(bp->fp, &x, 4); + b->qual = x >> 24; b->l_str = x << 8 >> 8; + if (b->l_str > b->m_str) { + b->m_str = b->l_str; + kroundup32(b->m_str); + b->str = realloc(b->str, b->m_str); + } + bgzf_read(bp->fp, b->str, b->l_str); + l = 12 + b->l_str; + bcf_sync(bp->h.n_smpl, b); + for (i = 0; i < b->n_gi; ++i) { + bgzf_read(bp->fp, b->gi[i].data, b->gi[i].len * bp->h.n_smpl); + l += b->gi[i].len * bp->h.n_smpl; + } + return l; +} + +int bcf_destroy(bcf1_t *b) +{ + int i; + if (b == 0) return -1; + free(b->str); + for (i = 0; i < b->n_gi; ++i) + free(b->gi[i].data); + free(b->gi); + free(b); + return 0; +} + +static inline void fmt_str(const char *p, kstring_t *s) +{ + if (*p == 0) kputc('.', s); + else kputs(p, s); +} + +char *bcf_fmt(bcf_t *bp, bcf1_t *b) +{ + kstring_t s; + int i, j, x; + memset(&s, 0, sizeof(kstring_t)); + kputs(bp->h.ns[b->tid], &s); kputc('\t', &s); + kputw(b->pos + 1, &s); kputc('\t', &s); + fmt_str(b->str, &s); kputc('\t', &s); + fmt_str(b->ref, &s); kputc('\t', &s); + fmt_str(b->alt, &s); kputc('\t', &s); + kputw(b->qual, &s); kputc('\t', &s); + fmt_str(b->flt, &s); kputc('\t', &s); + fmt_str(b->info, &s); + if (b->fmt[0]) { + kputc('\t', &s); + fmt_str(b->fmt, &s); + } + x = b->n_alleles * (b->n_alleles + 1) / 2; + for (j = 0; j < bp->h.n_smpl; ++j) { + kputc('\t', &s); + for (i = 0; i < b->n_gi; ++i) { + if (i) kputc(':', &s); + if (b->gi[i].fmt == char2int("PL")) { + uint8_t *d = (uint8_t*)b->gi[i].data + j * x; + int k; + for (k = 0; k < x; ++k) { + if (k > 0) kputc(',', &s); + kputw(d[k], &s); + } + } else if (b->gi[i].fmt == char2int("DP")) { + kputw(((uint16_t*)b->gi[i].data)[j], &s); + } else if (b->gi[i].fmt == char2int("GQ")) { + kputw(((uint8_t*)b->gi[i].data)[j], &s); + } + } + } + return s.s; +} diff --git a/bcftools/bcf.h b/bcftools/bcf.h new file mode 100644 index 0000000..1c0436a --- /dev/null +++ b/bcftools/bcf.h @@ -0,0 +1,71 @@ +#ifndef BCF_H +#define BCF_H + +#include +#include "bgzf.h" + +typedef struct { + int fmt, len; // len is the unit length + void *data; + // derived info: fmt, len +} bcf_ginfo_t; + +typedef struct { + int32_t tid, pos; + uint32_t qual:8, l_str:24; + int m_str; + char *str, *ref, *alt, *flt, *info, *fmt; // fmt, ref, alt and info point to str + int n_gi, m_gi; + bcf_ginfo_t *gi; + int n_alleles; + // derived info: ref, alt, flt, info, fmt, n_gi, n_alleles +} bcf1_t; + +typedef struct { + int32_t n_ref, n_smpl; + int32_t l_nm; + int32_t l_smpl; + int32_t l_txt; + char *name, *sname, *txt; + char **ns, **sns; + // derived info: n_ref, n_smpl, ns, sns +} bcf_hdr_t; + +typedef struct { + BGZF *fp; + bcf_hdr_t h; +} bcf_t; + +struct __bcf_idx_t; +typedef struct __bcf_idx_t bcf_idx_t; + +#ifdef __cplusplus +extern "C" { +#endif + + bcf_t *bcf_open(const char *fn, const char *mode); + int bcf_close(bcf_t *b); + int bcf_read(bcf_t *bp, bcf1_t *b); + int bcf_sync(int n_smpl, bcf1_t *b); + int bcf_write(bcf_t *bp, const bcf1_t *b); + int bcf_hdr_write(bcf_t *b); + int bcf_hdr_sync(bcf_hdr_t *b); + int bcf_hdr_cpy(bcf_hdr_t *h, const bcf_hdr_t *h0); + int bcf_destroy(bcf1_t *b); + char *bcf_fmt(bcf_t *bp, bcf1_t *b); + + void *bcf_build_refhash(bcf_hdr_t *h); + void bcf_str2id_destroy(void *_hash); + int bcf_str2id(void *_hash, const char *str); + + int bcf_idx_build(const char *fn); + uint64_t bcf_idx_query(const bcf_idx_t *idx, int tid, int beg, int end); + int bcf_parse_region(void *str2id, const char *str, int *tid, int *begin, int *end); + bcf_idx_t *bcf_idx_load(const char *fn); + void bcf_idx_destroy(bcf_idx_t *idx); + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/bcftools/bcfutils.c b/bcftools/bcfutils.c new file mode 100644 index 0000000..e0e4a06 --- /dev/null +++ b/bcftools/bcfutils.c @@ -0,0 +1,32 @@ +#include "bcf.h" +#include "khash.h" +KHASH_MAP_INIT_STR(str2id, int) + +void *bcf_build_refhash(bcf_hdr_t *h) +{ + khash_t(str2id) *hash; + int i, ret; + hash = kh_init(str2id); + for (i = 0; i < h->n_ref; ++i) { + khint_t k; + k = kh_put(str2id, hash, h->ns[i], &ret); // FIXME: check ret + kh_val(hash, k) = i; + } + return hash; +} + +void bcf_str2id_destroy(void *_hash) +{ + khash_t(str2id) *hash = (khash_t(str2id)*)_hash; + if (hash) kh_destroy(str2id, hash); // Note that strings are not freed. +} + +int bcf_str2id(void *_hash, const char *str) +{ + khash_t(str2id) *hash = (khash_t(str2id)*)_hash; + khint_t k; + if (!hash) return -1; + k = kh_get(str2id, hash, str); + return k == kh_end(hash)? -1 : kh_val(hash, k); +} + diff --git a/bcftools/index.c b/bcftools/index.c new file mode 100644 index 0000000..b2663d4 --- /dev/null +++ b/bcftools/index.c @@ -0,0 +1,345 @@ +#include +#include +#include +#include "bam_endian.h" +#include "kstring.h" +#include "bcf.h" +#ifdef _USE_KNETFILE +#include "knetfile.h" +#endif + +#define TAD_LIDX_SHIFT 13 + +typedef struct { + int32_t n, m; + uint64_t *offset; +} bcf_lidx_t; + +struct __bcf_idx_t { + int32_t n; + bcf_lidx_t *index2; +}; + +/************ + * indexing * + ************/ + +static inline void insert_offset2(bcf_lidx_t *index2, int _beg, int _end, uint64_t offset) +{ + int i, beg, end; + beg = _beg >> TAD_LIDX_SHIFT; + end = (_end - 1) >> TAD_LIDX_SHIFT; + if (index2->m < end + 1) { + int old_m = index2->m; + index2->m = end + 1; + kroundup32(index2->m); + index2->offset = (uint64_t*)realloc(index2->offset, index2->m * 8); + memset(index2->offset + old_m, 0, 8 * (index2->m - old_m)); + } + if (beg == end) { + if (index2->offset[beg] == 0) index2->offset[beg] = offset; + } else { + for (i = beg; i <= end; ++i) + if (index2->offset[i] == 0) index2->offset[i] = offset; + } + if (index2->n < end + 1) index2->n = end + 1; +} + +static void fill_missing(bcf_idx_t *idx) +{ + int i, j; + for (i = 0; i < idx->n; ++i) { + bcf_lidx_t *idx2 = &idx->index2[i]; + for (j = 1; j < idx2->n; ++j) + if (idx2->offset[j] == 0) + idx2->offset[j] = idx2->offset[j-1]; + } +} + +bcf_idx_t *bcf_idx_core(bcf_t *bp) +{ + bcf_idx_t *idx; + int32_t last_coor, last_tid; + uint64_t last_off; + kstring_t *str; + BGZF *fp = bp->fp; + bcf1_t *b; + int ret; + + b = calloc(1, sizeof(bcf1_t)); + str = calloc(1, sizeof(kstring_t)); + idx = (bcf_idx_t*)calloc(1, sizeof(bcf_idx_t)); + idx->n = bp->h.n_ref; + idx->index2 = calloc(bp->h.n_ref, sizeof(bcf_lidx_t)); + + last_tid = 0xffffffffu; + last_off = bgzf_tell(fp); last_coor = 0xffffffffu; + while ((ret = bcf_read(bp, b)) > 0) { + int end, tmp; + if (last_tid != b->tid) { // change of chromosomes + last_tid = b->tid; + } else if (last_coor > b->pos) { + fprintf(stderr, "[bcf_idx_core] the input is out of order\n"); + free(str->s); free(str); free(idx); bcf_destroy(b); + return 0; + } + tmp = strlen(b->ref); + end = b->pos + (tmp > 0? tmp : 1); + insert_offset2(&idx->index2[b->tid], b->pos, end, last_off); + last_off = bgzf_tell(fp); + last_coor = b->pos; + } + fill_missing(idx); + free(str->s); free(str); bcf_destroy(b); + return idx; +} + +void bcf_idx_destroy(bcf_idx_t *idx) +{ + int i; + if (idx == 0) return; + for (i = 0; i < idx->n; ++i) free(idx->index2[i].offset); + free(idx->index2); + free(idx); +} + +/****************** + * index file I/O * + ******************/ + +void bcf_idx_save(const bcf_idx_t *idx, BGZF *fp) +{ + int32_t i, ti_is_be; + ti_is_be = bam_is_big_endian(); + bgzf_write(fp, "BCI\4", 4); + if (ti_is_be) { + uint32_t x = idx->n; + bgzf_write(fp, bam_swap_endian_4p(&x), 4); + } else bgzf_write(fp, &idx->n, 4); + for (i = 0; i < idx->n; ++i) { + bcf_lidx_t *index2 = idx->index2 + i; + // write linear index (index2) + if (ti_is_be) { + int x = index2->n; + bgzf_write(fp, bam_swap_endian_4p(&x), 4); + } else bgzf_write(fp, &index2->n, 4); + if (ti_is_be) { // big endian + int x; + for (x = 0; (int)x < index2->n; ++x) + bam_swap_endian_8p(&index2->offset[x]); + bgzf_write(fp, index2->offset, 8 * index2->n); + for (x = 0; (int)x < index2->n; ++x) + bam_swap_endian_8p(&index2->offset[x]); + } else bgzf_write(fp, index2->offset, 8 * index2->n); + } +} + +static bcf_idx_t *bcf_idx_load_core(BGZF *fp) +{ + int i, ti_is_be; + char magic[4]; + bcf_idx_t *idx; + ti_is_be = bam_is_big_endian(); + if (fp == 0) { + fprintf(stderr, "[%s] fail to load index.\n", __func__); + return 0; + } + bgzf_read(fp, magic, 4); + if (strncmp(magic, "BCI\4", 4)) { + fprintf(stderr, "[%s] wrong magic number.\n", __func__); + return 0; + } + idx = (bcf_idx_t*)calloc(1, sizeof(bcf_idx_t)); + bgzf_read(fp, &idx->n, 4); + if (ti_is_be) bam_swap_endian_4p(&idx->n); + idx->index2 = (bcf_lidx_t*)calloc(idx->n, sizeof(bcf_lidx_t)); + for (i = 0; i < idx->n; ++i) { + bcf_lidx_t *index2 = idx->index2 + i; + int j; + bgzf_read(fp, &index2->n, 4); + if (ti_is_be) bam_swap_endian_4p(&index2->n); + index2->m = index2->n; + index2->offset = (uint64_t*)calloc(index2->m, 8); + bgzf_read(fp, index2->offset, index2->n * 8); + if (ti_is_be) + for (j = 0; j < index2->n; ++j) bam_swap_endian_8p(&index2->offset[j]); + } + return idx; +} + +bcf_idx_t *bcf_idx_load_local(const char *fnidx) +{ + BGZF *fp; + fp = bgzf_open(fnidx, "r"); + if (fp) { + bcf_idx_t *idx = bcf_idx_load_core(fp); + bgzf_close(fp); + return idx; + } else return 0; +} + +#ifdef _USE_KNETFILE +static void download_from_remote(const char *url) +{ + const int buf_size = 1 * 1024 * 1024; + char *fn; + FILE *fp; + uint8_t *buf; + knetFile *fp_remote; + int l; + if (strstr(url, "ftp://") != url && strstr(url, "http://") != url) return; + l = strlen(url); + for (fn = (char*)url + l - 1; fn >= url; --fn) + if (*fn == '/') break; + ++fn; // fn now points to the file name + fp_remote = knet_open(url, "r"); + if (fp_remote == 0) { + fprintf(stderr, "[download_from_remote] fail to open remote file.\n"); + return; + } + if ((fp = fopen(fn, "w")) == 0) { + fprintf(stderr, "[download_from_remote] fail to create file in the working directory.\n"); + knet_close(fp_remote); + return; + } + buf = (uint8_t*)calloc(buf_size, 1); + while ((l = knet_read(fp_remote, buf, buf_size)) != 0) + fwrite(buf, 1, l, fp); + free(buf); + fclose(fp); + knet_close(fp_remote); +} +#else +static void download_from_remote(const char *url) +{ + return; +} +#endif + +static char *get_local_version(const char *fn) +{ + struct stat sbuf; + char *fnidx = (char*)calloc(strlen(fn) + 5, 1); + strcat(strcpy(fnidx, fn), ".bci"); + if ((strstr(fnidx, "ftp://") == fnidx || strstr(fnidx, "http://") == fnidx)) { + char *p, *url; + int l = strlen(fnidx); + for (p = fnidx + l - 1; p >= fnidx; --p) + if (*p == '/') break; + url = fnidx; fnidx = strdup(p + 1); + if (stat(fnidx, &sbuf) == 0) { + free(url); + return fnidx; + } + fprintf(stderr, "[%s] downloading the index file...\n", __func__); + download_from_remote(url); + free(url); + } + if (stat(fnidx, &sbuf) == 0) return fnidx; + free(fnidx); return 0; +} + +bcf_idx_t *bcf_idx_load(const char *fn) +{ + bcf_idx_t *idx; + char *fname = get_local_version(fn); + if (fname == 0) return 0; + idx = bcf_idx_load_local(fname); + free(fname); + return idx; +} + +int bcf_idx_build2(const char *fn, const char *_fnidx) +{ + char *fnidx; + BGZF *fpidx; + bcf_t *bp; + bcf_idx_t *idx; + if ((bp = bcf_open(fn, "r")) == 0) { + fprintf(stderr, "[bcf_idx_build2] fail to open the BAM file.\n"); + return -1; + } + idx = bcf_idx_core(bp); + bcf_close(bp); + if (_fnidx == 0) { + fnidx = (char*)calloc(strlen(fn) + 5, 1); + strcpy(fnidx, fn); strcat(fnidx, ".bci"); + } else fnidx = strdup(_fnidx); + fpidx = bgzf_open(fnidx, "w"); + if (fpidx == 0) { + fprintf(stderr, "[bcf_idx_build2] fail to create the index file.\n"); + free(fnidx); + return -1; + } + bcf_idx_save(idx, fpidx); + bcf_idx_destroy(idx); + bgzf_close(fpidx); + free(fnidx); + return 0; +} + +int bcf_idx_build(const char *fn) +{ + return bcf_idx_build2(fn, 0); +} + +/******************************************** + * parse a region in the format chr:beg-end * + ********************************************/ + +int bcf_parse_region(void *str2id, const char *str, int *tid, int *begin, int *end) +{ + char *s, *p; + int i, l, k; + l = strlen(str); + p = s = (char*)malloc(l+1); + /* squeeze out "," */ + for (i = k = 0; i != l; ++i) + if (str[i] != ',' && !isspace(str[i])) s[k++] = str[i]; + s[k] = 0; + for (i = 0; i != k; ++i) if (s[i] == ':') break; + s[i] = 0; + if ((*tid = bcf_str2id(str2id, s)) < 0) { + free(s); + return -1; + } + if (i == k) { /* dump the whole sequence */ + *begin = 0; *end = 1<<29; free(s); + return 0; + } + for (p = s + i + 1; i != k; ++i) if (s[i] == '-') break; + *begin = atoi(p); + if (i < k) { + p = s + i + 1; + *end = atoi(p); + } else *end = 1<<29; + if (*begin > 0) --*begin; + free(s); + if (*begin > *end) return -1; + return 0; +} + +/******************************* + * retrieve a specified region * + *******************************/ + +uint64_t bcf_idx_query(const bcf_idx_t *idx, int tid, int beg, int end) +{ + uint64_t min_off; + if (beg < 0) beg = 0; + if (end < beg) return 0xffffffffffffffffull; + min_off = (beg>>TAD_LIDX_SHIFT >= idx->index2[tid].n)? idx->index2[tid].offset[idx->index2[tid].n-1] + : idx->index2[tid].offset[beg>>TAD_LIDX_SHIFT]; + assert(min_off); + return min_off; +} + +int bcf_main_index(int argc, char *argv[]) +{ + if (argc == 1) { + fprintf(stderr, "Usage: bcftools index \n"); + return 1; + } + bcf_idx_build(argv[1]); + return 0; +} diff --git a/bcftools/main.c b/bcftools/main.c new file mode 100644 index 0000000..e271efd --- /dev/null +++ b/bcftools/main.c @@ -0,0 +1,25 @@ +#include +#include +#include "bcf.h" + +int bcfview(int argc, char *argv[]); +int bcf_main_index(int argc, char *argv[]); + +int main(int argc, char *argv[]) +{ + if (argc == 1) { + fprintf(stderr, "\n"); + fprintf(stderr, "Usage: bcftools \n\n"); + fprintf(stderr, "Command: view print, extract, convert and call SNPs from BCF\n"); + fprintf(stderr, " index index BCF\n"); + fprintf(stderr, "\n"); + return 1; + } + if (strcmp(argv[1], "view") == 0) return bcfview(argc-1, argv+1); + if (strcmp(argv[1], "index") == 0) return bcf_main_index(argc-1, argv+1); + else { + fprintf(stderr, "[main] Unrecognized command.\n"); + return 1; + } + return 0; +} diff --git a/bcftools/prob1.c b/bcftools/prob1.c new file mode 100644 index 0000000..c5896f7 --- /dev/null +++ b/bcftools/prob1.c @@ -0,0 +1,237 @@ +#include +#include +#include +#include +#include "prob1.h" + +#define MC_AVG_ERR 0.007 +#define MC_MAX_EM_ITER 16 +#define MC_EM_EPS 1e-4 + +unsigned char seq_nt4_table[256] = { + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4 /*'-'*/, 4, 4, + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, + 4, 0, 4, 1, 4, 4, 4, 2, 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, + 4, 0, 4, 1, 4, 4, 4, 2, 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4 +}; + +struct __bcf_p1aux_t { + int n, M; + double *q2p, *pdg; // pdg -> P(D|g) + double *phi, *CMk; // CMk=\binom{M}{k} + double *z, *zswap; // aux for afs + double *afs, *afs1; // afs: accumulative AFS; afs1: site posterior distribution + const uint8_t *PL; // point to PL + int PL_len; +}; + +void bcf_p1_init_prior(bcf_p1aux_t *ma, int type, double theta) +{ + int i; + if (type == MC_PTYPE_COND2) { + for (i = 0; i <= ma->M; ++i) + ma->phi[i] = 2. * (i + 1) / (ma->M + 1) / (ma->M + 2); + } else if (type == MC_PTYPE_FLAT) { + for (i = 0; i <= ma->M; ++i) + ma->phi[i] = 1. / (ma->M + 1); + } else { + double sum; + for (i = 0, sum = 0.; i < ma->M; ++i) + sum += (ma->phi[i] = theta / (ma->M - i)); + ma->phi[ma->M] = 1. - sum; + } +} + +bcf_p1aux_t *bcf_p1_init(int n) // FIXME: assuming diploid +{ + bcf_p1aux_t *ma; + int i; + ma = calloc(1, sizeof(bcf_p1aux_t)); + ma->n = n; ma->M = 2 * n; + ma->q2p = calloc(256, sizeof(double)); + ma->pdg = calloc(3 * ma->n, sizeof(double)); + ma->phi = calloc(ma->M + 1, sizeof(double)); + ma->CMk = calloc(ma->M + 1, sizeof(double)); + ma->z = calloc(2 * ma->n + 1, sizeof(double)); + ma->zswap = calloc(2 * ma->n + 1, sizeof(double)); + ma->afs = calloc(2 * ma->n + 1, sizeof(double)); + ma->afs1 = calloc(2 * ma->n + 1, sizeof(double)); + for (i = 0; i < 256; ++i) + ma->q2p[i] = pow(10., -i / 10.); + for (i = 0; i <= ma->M; ++i) + ma->CMk[i] = exp(lgamma(ma->M + 1) - lgamma(i + 1) - lgamma(ma->M - i + 1)); + bcf_p1_init_prior(ma, MC_PTYPE_FULL, 1e-3); // the simplest prior + return ma; +} + +void bcf_p1_destroy(bcf_p1aux_t *ma) +{ + if (ma) { + free(ma->q2p); free(ma->pdg); + free(ma->phi); free(ma->CMk); + free(ma->z); free(ma->zswap); + free(ma->afs); free(ma->afs1); + free(ma); + } +} + +#define char2int(s) (((int)s[0])<<8|s[1]) + +static int cal_pdg(const bcf1_t *b, bcf_p1aux_t *ma) +{ + int i, j, k; + long *p, tmp; + p = alloca(b->n_alleles * sizeof(long)); + memset(p, 0, sizeof(long) * b->n_alleles); + for (j = 0; j < ma->n; ++j) { + const uint8_t *pi = ma->PL + j * ma->PL_len; + double *pdg = ma->pdg + j * 3; + pdg[0] = ma->q2p[pi[b->n_alleles]]; pdg[1] = ma->q2p[pi[1]]; pdg[2] = ma->q2p[pi[0]]; + for (i = k = 0; i < b->n_alleles; ++i) { + p[i] += (int)pi[k]; + k += b->n_alleles - i; + } + } + for (i = 0; i < b->n_alleles; ++i) p[i] = p[i]<<4 | i; + for (i = 1; i < b->n_alleles; ++i) // insertion sort + for (j = i; j > 0 && p[j] < p[j-1]; --j) + tmp = p[j], p[j] = p[j-1], p[j-1] = tmp; + for (i = b->n_alleles - 1; i >= 0; --i) + if ((p[i]&0xf) == 0) break; + return i; +} +// f0 is the reference allele frequency +static double mc_freq_iter(double f0, const bcf_p1aux_t *ma) +{ + double f, f3[3]; + int i; + f3[0] = (1.-f0)*(1.-f0); f3[1] = 2.*f0*(1.-f0); f3[2] = f0*f0; + for (i = 0, f = 0.; i < ma->n; ++i) { + double *pdg; + pdg = ma->pdg + i * 3; + f += (pdg[1] * f3[1] + 2. * pdg[2] * f3[2]) + / (pdg[0] * f3[0] + pdg[1] * f3[1] + pdg[2] * f3[2]); + } + f /= ma->n * 2.; + return f; +} + +int bcf_p1_call_gt(const bcf_p1aux_t *ma, double f0, int k) +{ + double sum, g[3]; + double max, f3[3], *pdg = ma->pdg + k * 3; + int q, i, max_i; + f3[0] = (1.-f0)*(1.-f0); f3[1] = 2.*f0*(1.-f0); f3[2] = f0*f0; + for (i = 0, sum = 0.; i < 3; ++i) + sum += (g[i] = pdg[i] * f3[i]); + for (i = 0, max = -1., max_i = 0; i < 3; ++i) { + g[i] /= sum; + if (g[i] > max) max = g[i], max_i = i; + } + max = 1. - max; + if (max < 1e-308) max = 1e-308; + q = (int)(-3.434 * log(max) + .499); + if (q > 99) q = 99; + return q<<2|max_i; +} + +static void mc_cal_z(bcf_p1aux_t *ma) +{ + double *z[2], *tmp, *pdg; + int i, j; + z[0] = ma->z; + z[1] = ma->zswap; + pdg = ma->pdg; + z[0][0] = 1.; z[0][1] = z[0][2] = 0.; + for (j = 0; j < ma->n; ++j) { + int max = (j + 1) * 2; + double p[3]; + pdg = ma->pdg + j * 3; + p[0] = pdg[0]; p[1] = 2. * pdg[1]; p[2] = pdg[2]; + z[1][0] = p[0] * z[0][0]; + z[1][1] = p[0] * z[0][1] + p[1] * z[0][0]; + for (i = 2; i <= max; ++i) + z[1][i] = p[0] * z[0][i] + p[1] * z[0][i-1] + p[2] * z[0][i-2]; + if (j < ma->n - 1) z[1][max+1] = z[1][max+2] = 0.; + tmp = z[0]; z[0] = z[1]; z[1] = tmp; + } + if (z[0] != ma->z) memcpy(ma->z, z[0], sizeof(double) * (ma->M + 1)); +} + +static double mc_cal_afs(bcf_p1aux_t *ma) +{ + int k; + long double sum = 0.; + memset(ma->afs1, 0, sizeof(double) * (ma->M + 1)); + mc_cal_z(ma); + for (k = 0, sum = 0.; k <= ma->M; ++k) + sum += (long double)ma->phi[k] * ma->z[k] / ma->CMk[k]; + for (k = 0; k <= ma->M; ++k) { + ma->afs1[k] = ma->phi[k] * ma->z[k] / ma->CMk[k] / sum; + if (isnan(ma->afs1[k]) || isinf(ma->afs1[k])) return -1.; + } + for (k = 0, sum = 0.; k <= ma->M; ++k) { + ma->afs[k] += ma->afs1[k]; + sum += k * ma->afs1[k]; + } + return sum / ma->M; +} + +int bcf_p1_cal(bcf1_t *b, bcf_p1aux_t *ma, bcf_p1rst_t *rst) +{ + int i, k; + long double sum = 0.; + // set PL and PL_len + for (i = 0; i < b->n_gi; ++i) { + if (b->gi[i].fmt == char2int("PL")) { + ma->PL = (uint8_t*)b->gi[i].data; + ma->PL_len = b->gi[i].len; + break; + } + } + if (b->n_alleles < 2) return -1; // FIXME: find a better solution + // + rst->rank0 = cal_pdg(b, ma); + rst->f_exp = mc_cal_afs(ma); + rst->p_ref = ma->afs1[ma->M]; + // calculate f_flat and f_em + for (k = 0, sum = 0.; k <= ma->M; ++k) + sum += (long double)ma->z[k] / ma->CMk[k]; + rst->f_flat = 0.; + for (k = 0; k <= ma->M; ++k) { + double p = ma->z[k] / ma->CMk[k] / sum; + rst->f_flat += k * p; + } + rst->f_flat /= ma->M; + { // calculate f_em + double flast = rst->f_flat; + for (i = 0; i < MC_MAX_EM_ITER; ++i) { + rst->f_em = mc_freq_iter(flast, ma); + if (fabs(rst->f_em - flast) < MC_EM_EPS) break; + flast = rst->f_em; + } + } + return 0; +} + +void bcf_p1_dump_afs(bcf_p1aux_t *ma) +{ + int k; + fprintf(stderr, "[afs]"); + for (k = 0; k <= ma->M; ++k) + fprintf(stderr, " %d:%.3lf", k, ma->afs[ma->M - k]); + fprintf(stderr, "\n"); + memset(ma->afs, 0, sizeof(double) * (ma->M + 1)); +} diff --git a/bcftools/prob1.h b/bcftools/prob1.h new file mode 100644 index 0000000..d34a75f --- /dev/null +++ b/bcftools/prob1.h @@ -0,0 +1,33 @@ +#ifndef BCF_PROB1_H +#define BCF_PROB1_H + +#include "bcf.h" + +struct __bcf_p1aux_t; +typedef struct __bcf_p1aux_t bcf_p1aux_t; + +typedef struct { + int rank0; + double f_em, f_exp, f_flat, p_ref; +} bcf_p1rst_t; + +#define MC_PTYPE_FULL 1 +#define MC_PTYPE_COND2 2 +#define MC_PTYPE_FLAT 3 + +#ifdef __cplusplus +extern "C" { +#endif + + bcf_p1aux_t *bcf_p1_init(int n); + void bcf_p1_init_prior(bcf_p1aux_t *ma, int type, double theta); + void bcf_p1_destroy(bcf_p1aux_t *ma); + int bcf_p1_cal(bcf1_t *b, bcf_p1aux_t *ma, bcf_p1rst_t *rst); + int bcf_p1_call_gt(const bcf_p1aux_t *ma, double f0, int k); + void bcf_p1_dump_afs(bcf_p1aux_t *ma); + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/bcftools/vcfout.c b/bcftools/vcfout.c new file mode 100644 index 0000000..d71dfd9 --- /dev/null +++ b/bcftools/vcfout.c @@ -0,0 +1,185 @@ +#include +#include +#include +#include +#include "bcf.h" +#include "prob1.h" +#include "kstring.h" + +#include "khash.h" +KHASH_SET_INIT_INT64(set64) + +#include "kseq.h" +KSTREAM_INIT(gzFile, gzread, 16384) + +#define VC_NO_PL 1 +#define VC_NO_GENO 2 +#define VC_BCF 4 +#define VC_CALL 8 +#define VC_VARONLY 16 + +typedef struct { + int flag, prior_type; + char *fn_list; + double theta; +} viewconf_t; + +khash_t(set64) *bcf_load_pos(const char *fn, bcf_hdr_t *_h) +{ + void *str2id; + gzFile fp; + kstream_t *ks; + int ret, dret, lineno = 1; + kstring_t *str; + khash_t(set64) *hash = 0; + + hash = kh_init(set64); + str2id = bcf_build_refhash(_h); + str = calloc(1, sizeof(kstring_t)); + fp = strcmp(fn, "-")? gzopen(fn, "r") : gzdopen(fileno(stdin), "r"); + ks = ks_init(fp); + while (ks_getuntil(ks, 0, str, &dret) >= 0) { + int tid = bcf_str2id(str2id, str->s); + if (tid >= 0 && dret != '\n') { + if (ks_getuntil(ks, 0, str, &dret) >= 0) { + uint64_t x = (uint64_t)tid<<32 | (atoi(str->s) - 1); + kh_put(set64, hash, x, &ret); + } else break; + } else fprintf(stderr, "[%s] %s is not a reference name (line %d).\n", __func__, str->s, lineno); + if (dret != '\n') while ((dret = ks_getc(ks)) > 0 && dret != '\n'); + if (dret < 0) break; + ++lineno; + } + bcf_str2id_destroy(str2id); + ks_destroy(ks); + gzclose(fp); + free(str->s); free(str); + return hash; +} + +static int update_bcf1(bcf1_t *b, const bcf_p1aux_t *pa, const bcf_p1rst_t *pr, int flag) +{ + kstring_t s; + int x, is_var = (pr->p_ref < .5); + double r = is_var? pr->p_ref : 1. - pr->p_ref; + memset(&s, 0, sizeof(kstring_t)); + kputc('\0', &s); kputs(b->ref, &s); kputc('\0', &s); + if (is_var) { + kputs(b->alt, &s); + } + kputc('\0', &s); kputc('\0', &s); + kputs(b->info, &s); + if (b->info[0]) kputc(';', &s); + ksprintf(&s, "AF1=%.3lf;AFE=%.3lf", 1.-pr->f_em, 1.-pr->f_exp); + kputc('\0', &s); + kputs(b->fmt, &s); kputc('\0', &s); + free(b->str); + b->m_str = s.m; b->l_str = s.l; b->str = s.s; + x = (int)(r < 1e-100? 99 : -3.434 * log(r) + .499); + b->qual = x > 99? 99 : x; + return is_var; +} + +int bcfview(int argc, char *argv[]) +{ + bcf_t *bp, *bout = 0; + bcf1_t *b; + int c; + uint64_t n_processed = 0; + viewconf_t vc; + bcf_p1aux_t *p1 = 0; + int tid, begin, end; + khash_t(set64) *hash = 0; + + tid = begin = end = -1; + memset(&vc, 0, sizeof(viewconf_t)); + vc.prior_type = -1; vc.theta = 1e-3; + while ((c = getopt(argc, argv, "l:cGvLbP:t:")) >= 0) { + switch (c) { + case 'l': vc.fn_list = strdup(optarg); break; + case 'G': vc.flag |= VC_NO_GENO; break; + case 'L': vc.flag |= VC_NO_PL; break; + case 'b': vc.flag |= VC_BCF; break; + case 'c': vc.flag |= VC_CALL; break; + case 'v': vc.flag |= VC_VARONLY; break; + case 't': vc.theta = atof(optarg); break; + case 'P': + if (strcmp(optarg, "full") == 0) vc.prior_type = MC_PTYPE_FULL; + else if (strcmp(optarg, "cond2") == 0) vc.prior_type = MC_PTYPE_COND2; + else if (strcmp(optarg, "flat") == 0) vc.prior_type = MC_PTYPE_FLAT; + break; + } + } + if (argc == optind) { + fprintf(stderr, "Usage: bcftools view [-cGPb] [-l list] [reg]\n"); + return 1; + } + + b = calloc(1, sizeof(bcf1_t)); + bp = bcf_open(argv[optind], "r"); + if (vc.flag & VC_BCF) { + bout = bcf_open("-", "w"); + bcf_hdr_cpy(&bout->h, &bp->h); + bcf_hdr_write(bout); + } + if (vc.flag & VC_CALL) { + p1 = bcf_p1_init(bp->h.n_smpl); + bcf_p1_init_prior(p1, vc.prior_type, vc.theta); + } + if (vc.fn_list) hash = bcf_load_pos(vc.fn_list, &bp->h); + if (optind + 1 < argc) { + void *str2id = bcf_build_refhash(&bp->h); + if (bcf_parse_region(str2id, argv[optind+1], &tid, &begin, &end) >= 0) { + bcf_idx_t *idx; + idx = bcf_idx_load(argv[optind]); + if (idx) { + uint64_t off; + off = bcf_idx_query(idx, tid, begin, end); + bgzf_seek(bp->fp, off, SEEK_SET); + bcf_idx_destroy(idx); + } + } + } + while (bcf_read(bp, b) > 0) { + if (hash) { + uint64_t x = (uint64_t)b->tid<<32 | b->pos; + khint_t k = kh_get(set64, hash, x); + if (k == kh_end(hash)) continue; + } + if (tid >= 0) { + int l = strlen(b->ref); + l = b->pos + (l > 0? l : 1); + if (b->tid != tid || b->pos >= end) break; + if (!(l > begin && end > b->pos)) continue; + } + if (vc.flag & VC_CALL) { + bcf_p1rst_t pr; + int is_var; + bcf_p1_cal(b, p1, &pr); + is_var = update_bcf1(b, p1, &pr, vc.flag); + bcf_sync(bp->h.n_smpl, b); + if ((n_processed + 1) % 50000 == 0) bcf_p1_dump_afs(p1); + if ((vc.flag & VC_VARONLY) && !is_var) continue; + } + if (vc.flag & VC_BCF) bcf_write(bout, b); + else { + char *vcf; + if (vc.flag & VC_NO_GENO) { + b->n_gi = 0; + b->fmt[0] = '\0'; + } + vcf = bcf_fmt(bp, b); + puts(vcf); + free(vcf); + } + ++n_processed; + } + if (vc.flag & VC_CALL) bcf_p1_dump_afs(p1); + bcf_close(bp); bcf_close(bout); + bcf_destroy(b); + if (hash) kh_destroy(set64, hash); + if (vc.fn_list) free(vc.fn_list); + if (p1) bcf_p1_destroy(p1); + return 0; +} +