From fd187467e3c5f026a4d934c5b0cb68a42939f6e1 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Wed, 4 Aug 2010 01:04:00 +0000 Subject: [PATCH] move bcftools to samtools --- Makefile | 12 +- bcftools/Makefile | 47 ++++++ bcf.c => bcftools/bcf.c | 0 bcf.h => bcftools/bcf.h | 14 ++ bcftools/bcfutils.c | 32 ++++ bcftools/index.c | 345 ++++++++++++++++++++++++++++++++++++++++ bcftools/main.c | 25 +++ bcftools/prob1.c | 237 +++++++++++++++++++++++++++ bcftools/prob1.h | 33 ++++ bcftools/vcfout.c | 185 +++++++++++++++++++++ 10 files changed, 924 insertions(+), 6 deletions(-) create mode 100644 bcftools/Makefile rename bcf.c => bcftools/bcf.c (100%) rename bcf.h => bcftools/bcf.h (68%) create mode 100644 bcftools/bcfutils.c create mode 100644 bcftools/index.c create mode 100644 bcftools/main.c create mode 100644 bcftools/prob1.c create mode 100644 bcftools/prob1.h create mode 100644 bcftools/vcfout.c diff --git a/Makefile b/Makefile index 7d98b24..8a7418e 100644 --- a/Makefile +++ b/Makefile @@ -7,10 +7,10 @@ LOBJS= bgzf.o kstring.o bam_aux.o bam.o bam_import.o sam.o bam_index.o \ $(KNETFILE_O) bam_sort.o sam_header.o bam_reheader.o AOBJS= bam_tview.o bam_maqcns.o bam_plcmd.o sam_view.o \ bam_rmdup.o bam_rmdupse.o bam_mate.o bam_stat.o bam_color.o \ - bamtk.o kaln.o bam_mcns.o bam2bcf.o bcf.o + bamtk.o kaln.o bam_mcns.o bam2bcf.o PROG= samtools -INCLUDES= -SUBDIRS= . misc +INCLUDES= -Ibcftools +SUBDIRS= . bcftools misc LIBPATH= LIBCURSES= -lcurses # -lXCurses @@ -25,7 +25,7 @@ all-recur lib-recur clean-recur cleanlocal-recur install-recur: list='$(SUBDIRS)'; for subdir in $$list; do \ cd $$subdir; \ $(MAKE) CC="$(CC)" DFLAGS="$(DFLAGS)" CFLAGS="$(CFLAGS)" \ - INCLUDES="$(INCLUDES)" LIBPATH="$(LIBPATH)" $$target || exit 1; \ + LIBPATH="$(LIBPATH)" $$target || exit 1; \ cd $$wdir; \ done; @@ -39,8 +39,8 @@ lib:libbam.a libbam.a:$(LOBJS) $(AR) -cru $@ $(LOBJS) -samtools:$(AOBJS) libbam.a - $(CC) $(CFLAGS) -o $@ $(AOBJS) libbam.a -lm $(LIBPATH) $(LIBCURSES) -lz +samtools:lib-recur $(AOBJS) + $(CC) $(CFLAGS) -o $@ $(AOBJS) libbam.a -lm $(LIBPATH) $(LIBCURSES) -lz -Lbcftools -lbcf razip:razip.o razf.o $(KNETFILE_O) $(CC) $(CFLAGS) -o $@ razf.o razip.o $(KNETFILE_O) -lz diff --git a/bcftools/Makefile b/bcftools/Makefile new file mode 100644 index 0000000..ec3fa74 --- /dev/null +++ b/bcftools/Makefile @@ -0,0 +1,47 @@ +CC= gcc +CFLAGS= -g -Wall -O2 #-fPIC #-m64 #-arch ppc +DFLAGS= -D_FILE_OFFSET_BITS=64 -D_USE_KNETFILE +LOBJS= bcf.o bcfutils.o prob1.o index.o +OMISC= .. +AOBJS= vcfout.o main.o $(OMISC)/kstring.o $(OMISC)/bgzf.o $(OMISC)/knetfile.o +PROG= bcftools +INCLUDES= -I.. +SUBDIRS= . + +.SUFFIXES:.c .o + +.c.o: + $(CC) -c $(CFLAGS) $(DFLAGS) $(INCLUDES) $< -o $@ + +all-recur lib-recur clean-recur cleanlocal-recur install-recur: + @target=`echo $@ | sed s/-recur//`; \ + wdir=`pwd`; \ + list='$(SUBDIRS)'; for subdir in $$list; do \ + cd $$subdir; \ + $(MAKE) CC="$(CC)" DFLAGS="$(DFLAGS)" CFLAGS="$(CFLAGS)" \ + INCLUDES="$(INCLUDES)" LIBPATH="$(LIBPATH)" $$target || exit 1; \ + cd $$wdir; \ + done; + +all:$(PROG) + +lib:libbcf.a + +libbcf.a:$(LOBJS) + $(AR) -cru $@ $(LOBJS) + +bcftools:lib $(AOBJS) + $(CC) $(CFLAGS) -o $@ $(AOBJS) -lm $(LIBPATH) -lz -L. -lbcf + +bcf.o:bcf.h +prob1.o:prob1.h bcf.h +vcfout.o:bcf.h prob1.h +main.o:bcf.h + +bcf.pdf:bcf.tex + pdflatex bcf + +cleanlocal: + rm -fr gmon.out *.o a.out *.dSYM $(PROG) *~ *.a bcf.aux bcf.log bcf.pdf *.class libbcf.*.dylib libbcf.so* + +clean:cleanlocal-recur diff --git a/bcf.c b/bcftools/bcf.c similarity index 100% rename from bcf.c rename to bcftools/bcf.c diff --git a/bcf.h b/bcftools/bcf.h similarity index 68% rename from bcf.h rename to bcftools/bcf.h index 7512f3e..1c0436a 100644 --- a/bcf.h +++ b/bcftools/bcf.h @@ -36,6 +36,9 @@ typedef struct { bcf_hdr_t h; } bcf_t; +struct __bcf_idx_t; +typedef struct __bcf_idx_t bcf_idx_t; + #ifdef __cplusplus extern "C" { #endif @@ -47,9 +50,20 @@ extern "C" { int bcf_write(bcf_t *bp, const bcf1_t *b); int bcf_hdr_write(bcf_t *b); int bcf_hdr_sync(bcf_hdr_t *b); + int bcf_hdr_cpy(bcf_hdr_t *h, const bcf_hdr_t *h0); int bcf_destroy(bcf1_t *b); char *bcf_fmt(bcf_t *bp, bcf1_t *b); + void *bcf_build_refhash(bcf_hdr_t *h); + void bcf_str2id_destroy(void *_hash); + int bcf_str2id(void *_hash, const char *str); + + int bcf_idx_build(const char *fn); + uint64_t bcf_idx_query(const bcf_idx_t *idx, int tid, int beg, int end); + int bcf_parse_region(void *str2id, const char *str, int *tid, int *begin, int *end); + bcf_idx_t *bcf_idx_load(const char *fn); + void bcf_idx_destroy(bcf_idx_t *idx); + #ifdef __cplusplus } #endif diff --git a/bcftools/bcfutils.c b/bcftools/bcfutils.c new file mode 100644 index 0000000..e0e4a06 --- /dev/null +++ b/bcftools/bcfutils.c @@ -0,0 +1,32 @@ +#include "bcf.h" +#include "khash.h" +KHASH_MAP_INIT_STR(str2id, int) + +void *bcf_build_refhash(bcf_hdr_t *h) +{ + khash_t(str2id) *hash; + int i, ret; + hash = kh_init(str2id); + for (i = 0; i < h->n_ref; ++i) { + khint_t k; + k = kh_put(str2id, hash, h->ns[i], &ret); // FIXME: check ret + kh_val(hash, k) = i; + } + return hash; +} + +void bcf_str2id_destroy(void *_hash) +{ + khash_t(str2id) *hash = (khash_t(str2id)*)_hash; + if (hash) kh_destroy(str2id, hash); // Note that strings are not freed. +} + +int bcf_str2id(void *_hash, const char *str) +{ + khash_t(str2id) *hash = (khash_t(str2id)*)_hash; + khint_t k; + if (!hash) return -1; + k = kh_get(str2id, hash, str); + return k == kh_end(hash)? -1 : kh_val(hash, k); +} + diff --git a/bcftools/index.c b/bcftools/index.c new file mode 100644 index 0000000..b2663d4 --- /dev/null +++ b/bcftools/index.c @@ -0,0 +1,345 @@ +#include +#include +#include +#include "bam_endian.h" +#include "kstring.h" +#include "bcf.h" +#ifdef _USE_KNETFILE +#include "knetfile.h" +#endif + +#define TAD_LIDX_SHIFT 13 + +typedef struct { + int32_t n, m; + uint64_t *offset; +} bcf_lidx_t; + +struct __bcf_idx_t { + int32_t n; + bcf_lidx_t *index2; +}; + +/************ + * indexing * + ************/ + +static inline void insert_offset2(bcf_lidx_t *index2, int _beg, int _end, uint64_t offset) +{ + int i, beg, end; + beg = _beg >> TAD_LIDX_SHIFT; + end = (_end - 1) >> TAD_LIDX_SHIFT; + if (index2->m < end + 1) { + int old_m = index2->m; + index2->m = end + 1; + kroundup32(index2->m); + index2->offset = (uint64_t*)realloc(index2->offset, index2->m * 8); + memset(index2->offset + old_m, 0, 8 * (index2->m - old_m)); + } + if (beg == end) { + if (index2->offset[beg] == 0) index2->offset[beg] = offset; + } else { + for (i = beg; i <= end; ++i) + if (index2->offset[i] == 0) index2->offset[i] = offset; + } + if (index2->n < end + 1) index2->n = end + 1; +} + +static void fill_missing(bcf_idx_t *idx) +{ + int i, j; + for (i = 0; i < idx->n; ++i) { + bcf_lidx_t *idx2 = &idx->index2[i]; + for (j = 1; j < idx2->n; ++j) + if (idx2->offset[j] == 0) + idx2->offset[j] = idx2->offset[j-1]; + } +} + +bcf_idx_t *bcf_idx_core(bcf_t *bp) +{ + bcf_idx_t *idx; + int32_t last_coor, last_tid; + uint64_t last_off; + kstring_t *str; + BGZF *fp = bp->fp; + bcf1_t *b; + int ret; + + b = calloc(1, sizeof(bcf1_t)); + str = calloc(1, sizeof(kstring_t)); + idx = (bcf_idx_t*)calloc(1, sizeof(bcf_idx_t)); + idx->n = bp->h.n_ref; + idx->index2 = calloc(bp->h.n_ref, sizeof(bcf_lidx_t)); + + last_tid = 0xffffffffu; + last_off = bgzf_tell(fp); last_coor = 0xffffffffu; + while ((ret = bcf_read(bp, b)) > 0) { + int end, tmp; + if (last_tid != b->tid) { // change of chromosomes + last_tid = b->tid; + } else if (last_coor > b->pos) { + fprintf(stderr, "[bcf_idx_core] the input is out of order\n"); + free(str->s); free(str); free(idx); bcf_destroy(b); + return 0; + } + tmp = strlen(b->ref); + end = b->pos + (tmp > 0? tmp : 1); + insert_offset2(&idx->index2[b->tid], b->pos, end, last_off); + last_off = bgzf_tell(fp); + last_coor = b->pos; + } + fill_missing(idx); + free(str->s); free(str); bcf_destroy(b); + return idx; +} + +void bcf_idx_destroy(bcf_idx_t *idx) +{ + int i; + if (idx == 0) return; + for (i = 0; i < idx->n; ++i) free(idx->index2[i].offset); + free(idx->index2); + free(idx); +} + +/****************** + * index file I/O * + ******************/ + +void bcf_idx_save(const bcf_idx_t *idx, BGZF *fp) +{ + int32_t i, ti_is_be; + ti_is_be = bam_is_big_endian(); + bgzf_write(fp, "BCI\4", 4); + if (ti_is_be) { + uint32_t x = idx->n; + bgzf_write(fp, bam_swap_endian_4p(&x), 4); + } else bgzf_write(fp, &idx->n, 4); + for (i = 0; i < idx->n; ++i) { + bcf_lidx_t *index2 = idx->index2 + i; + // write linear index (index2) + if (ti_is_be) { + int x = index2->n; + bgzf_write(fp, bam_swap_endian_4p(&x), 4); + } else bgzf_write(fp, &index2->n, 4); + if (ti_is_be) { // big endian + int x; + for (x = 0; (int)x < index2->n; ++x) + bam_swap_endian_8p(&index2->offset[x]); + bgzf_write(fp, index2->offset, 8 * index2->n); + for (x = 0; (int)x < index2->n; ++x) + bam_swap_endian_8p(&index2->offset[x]); + } else bgzf_write(fp, index2->offset, 8 * index2->n); + } +} + +static bcf_idx_t *bcf_idx_load_core(BGZF *fp) +{ + int i, ti_is_be; + char magic[4]; + bcf_idx_t *idx; + ti_is_be = bam_is_big_endian(); + if (fp == 0) { + fprintf(stderr, "[%s] fail to load index.\n", __func__); + return 0; + } + bgzf_read(fp, magic, 4); + if (strncmp(magic, "BCI\4", 4)) { + fprintf(stderr, "[%s] wrong magic number.\n", __func__); + return 0; + } + idx = (bcf_idx_t*)calloc(1, sizeof(bcf_idx_t)); + bgzf_read(fp, &idx->n, 4); + if (ti_is_be) bam_swap_endian_4p(&idx->n); + idx->index2 = (bcf_lidx_t*)calloc(idx->n, sizeof(bcf_lidx_t)); + for (i = 0; i < idx->n; ++i) { + bcf_lidx_t *index2 = idx->index2 + i; + int j; + bgzf_read(fp, &index2->n, 4); + if (ti_is_be) bam_swap_endian_4p(&index2->n); + index2->m = index2->n; + index2->offset = (uint64_t*)calloc(index2->m, 8); + bgzf_read(fp, index2->offset, index2->n * 8); + if (ti_is_be) + for (j = 0; j < index2->n; ++j) bam_swap_endian_8p(&index2->offset[j]); + } + return idx; +} + +bcf_idx_t *bcf_idx_load_local(const char *fnidx) +{ + BGZF *fp; + fp = bgzf_open(fnidx, "r"); + if (fp) { + bcf_idx_t *idx = bcf_idx_load_core(fp); + bgzf_close(fp); + return idx; + } else return 0; +} + +#ifdef _USE_KNETFILE +static void download_from_remote(const char *url) +{ + const int buf_size = 1 * 1024 * 1024; + char *fn; + FILE *fp; + uint8_t *buf; + knetFile *fp_remote; + int l; + if (strstr(url, "ftp://") != url && strstr(url, "http://") != url) return; + l = strlen(url); + for (fn = (char*)url + l - 1; fn >= url; --fn) + if (*fn == '/') break; + ++fn; // fn now points to the file name + fp_remote = knet_open(url, "r"); + if (fp_remote == 0) { + fprintf(stderr, "[download_from_remote] fail to open remote file.\n"); + return; + } + if ((fp = fopen(fn, "w")) == 0) { + fprintf(stderr, "[download_from_remote] fail to create file in the working directory.\n"); + knet_close(fp_remote); + return; + } + buf = (uint8_t*)calloc(buf_size, 1); + while ((l = knet_read(fp_remote, buf, buf_size)) != 0) + fwrite(buf, 1, l, fp); + free(buf); + fclose(fp); + knet_close(fp_remote); +} +#else +static void download_from_remote(const char *url) +{ + return; +} +#endif + +static char *get_local_version(const char *fn) +{ + struct stat sbuf; + char *fnidx = (char*)calloc(strlen(fn) + 5, 1); + strcat(strcpy(fnidx, fn), ".bci"); + if ((strstr(fnidx, "ftp://") == fnidx || strstr(fnidx, "http://") == fnidx)) { + char *p, *url; + int l = strlen(fnidx); + for (p = fnidx + l - 1; p >= fnidx; --p) + if (*p == '/') break; + url = fnidx; fnidx = strdup(p + 1); + if (stat(fnidx, &sbuf) == 0) { + free(url); + return fnidx; + } + fprintf(stderr, "[%s] downloading the index file...\n", __func__); + download_from_remote(url); + free(url); + } + if (stat(fnidx, &sbuf) == 0) return fnidx; + free(fnidx); return 0; +} + +bcf_idx_t *bcf_idx_load(const char *fn) +{ + bcf_idx_t *idx; + char *fname = get_local_version(fn); + if (fname == 0) return 0; + idx = bcf_idx_load_local(fname); + free(fname); + return idx; +} + +int bcf_idx_build2(const char *fn, const char *_fnidx) +{ + char *fnidx; + BGZF *fpidx; + bcf_t *bp; + bcf_idx_t *idx; + if ((bp = bcf_open(fn, "r")) == 0) { + fprintf(stderr, "[bcf_idx_build2] fail to open the BAM file.\n"); + return -1; + } + idx = bcf_idx_core(bp); + bcf_close(bp); + if (_fnidx == 0) { + fnidx = (char*)calloc(strlen(fn) + 5, 1); + strcpy(fnidx, fn); strcat(fnidx, ".bci"); + } else fnidx = strdup(_fnidx); + fpidx = bgzf_open(fnidx, "w"); + if (fpidx == 0) { + fprintf(stderr, "[bcf_idx_build2] fail to create the index file.\n"); + free(fnidx); + return -1; + } + bcf_idx_save(idx, fpidx); + bcf_idx_destroy(idx); + bgzf_close(fpidx); + free(fnidx); + return 0; +} + +int bcf_idx_build(const char *fn) +{ + return bcf_idx_build2(fn, 0); +} + +/******************************************** + * parse a region in the format chr:beg-end * + ********************************************/ + +int bcf_parse_region(void *str2id, const char *str, int *tid, int *begin, int *end) +{ + char *s, *p; + int i, l, k; + l = strlen(str); + p = s = (char*)malloc(l+1); + /* squeeze out "," */ + for (i = k = 0; i != l; ++i) + if (str[i] != ',' && !isspace(str[i])) s[k++] = str[i]; + s[k] = 0; + for (i = 0; i != k; ++i) if (s[i] == ':') break; + s[i] = 0; + if ((*tid = bcf_str2id(str2id, s)) < 0) { + free(s); + return -1; + } + if (i == k) { /* dump the whole sequence */ + *begin = 0; *end = 1<<29; free(s); + return 0; + } + for (p = s + i + 1; i != k; ++i) if (s[i] == '-') break; + *begin = atoi(p); + if (i < k) { + p = s + i + 1; + *end = atoi(p); + } else *end = 1<<29; + if (*begin > 0) --*begin; + free(s); + if (*begin > *end) return -1; + return 0; +} + +/******************************* + * retrieve a specified region * + *******************************/ + +uint64_t bcf_idx_query(const bcf_idx_t *idx, int tid, int beg, int end) +{ + uint64_t min_off; + if (beg < 0) beg = 0; + if (end < beg) return 0xffffffffffffffffull; + min_off = (beg>>TAD_LIDX_SHIFT >= idx->index2[tid].n)? idx->index2[tid].offset[idx->index2[tid].n-1] + : idx->index2[tid].offset[beg>>TAD_LIDX_SHIFT]; + assert(min_off); + return min_off; +} + +int bcf_main_index(int argc, char *argv[]) +{ + if (argc == 1) { + fprintf(stderr, "Usage: bcftools index \n"); + return 1; + } + bcf_idx_build(argv[1]); + return 0; +} diff --git a/bcftools/main.c b/bcftools/main.c new file mode 100644 index 0000000..e271efd --- /dev/null +++ b/bcftools/main.c @@ -0,0 +1,25 @@ +#include +#include +#include "bcf.h" + +int bcfview(int argc, char *argv[]); +int bcf_main_index(int argc, char *argv[]); + +int main(int argc, char *argv[]) +{ + if (argc == 1) { + fprintf(stderr, "\n"); + fprintf(stderr, "Usage: bcftools \n\n"); + fprintf(stderr, "Command: view print, extract, convert and call SNPs from BCF\n"); + fprintf(stderr, " index index BCF\n"); + fprintf(stderr, "\n"); + return 1; + } + if (strcmp(argv[1], "view") == 0) return bcfview(argc-1, argv+1); + if (strcmp(argv[1], "index") == 0) return bcf_main_index(argc-1, argv+1); + else { + fprintf(stderr, "[main] Unrecognized command.\n"); + return 1; + } + return 0; +} diff --git a/bcftools/prob1.c b/bcftools/prob1.c new file mode 100644 index 0000000..c5896f7 --- /dev/null +++ b/bcftools/prob1.c @@ -0,0 +1,237 @@ +#include +#include +#include +#include +#include "prob1.h" + +#define MC_AVG_ERR 0.007 +#define MC_MAX_EM_ITER 16 +#define MC_EM_EPS 1e-4 + +unsigned char seq_nt4_table[256] = { + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4 /*'-'*/, 4, 4, + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, + 4, 0, 4, 1, 4, 4, 4, 2, 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, + 4, 0, 4, 1, 4, 4, 4, 2, 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4 +}; + +struct __bcf_p1aux_t { + int n, M; + double *q2p, *pdg; // pdg -> P(D|g) + double *phi, *CMk; // CMk=\binom{M}{k} + double *z, *zswap; // aux for afs + double *afs, *afs1; // afs: accumulative AFS; afs1: site posterior distribution + const uint8_t *PL; // point to PL + int PL_len; +}; + +void bcf_p1_init_prior(bcf_p1aux_t *ma, int type, double theta) +{ + int i; + if (type == MC_PTYPE_COND2) { + for (i = 0; i <= ma->M; ++i) + ma->phi[i] = 2. * (i + 1) / (ma->M + 1) / (ma->M + 2); + } else if (type == MC_PTYPE_FLAT) { + for (i = 0; i <= ma->M; ++i) + ma->phi[i] = 1. / (ma->M + 1); + } else { + double sum; + for (i = 0, sum = 0.; i < ma->M; ++i) + sum += (ma->phi[i] = theta / (ma->M - i)); + ma->phi[ma->M] = 1. - sum; + } +} + +bcf_p1aux_t *bcf_p1_init(int n) // FIXME: assuming diploid +{ + bcf_p1aux_t *ma; + int i; + ma = calloc(1, sizeof(bcf_p1aux_t)); + ma->n = n; ma->M = 2 * n; + ma->q2p = calloc(256, sizeof(double)); + ma->pdg = calloc(3 * ma->n, sizeof(double)); + ma->phi = calloc(ma->M + 1, sizeof(double)); + ma->CMk = calloc(ma->M + 1, sizeof(double)); + ma->z = calloc(2 * ma->n + 1, sizeof(double)); + ma->zswap = calloc(2 * ma->n + 1, sizeof(double)); + ma->afs = calloc(2 * ma->n + 1, sizeof(double)); + ma->afs1 = calloc(2 * ma->n + 1, sizeof(double)); + for (i = 0; i < 256; ++i) + ma->q2p[i] = pow(10., -i / 10.); + for (i = 0; i <= ma->M; ++i) + ma->CMk[i] = exp(lgamma(ma->M + 1) - lgamma(i + 1) - lgamma(ma->M - i + 1)); + bcf_p1_init_prior(ma, MC_PTYPE_FULL, 1e-3); // the simplest prior + return ma; +} + +void bcf_p1_destroy(bcf_p1aux_t *ma) +{ + if (ma) { + free(ma->q2p); free(ma->pdg); + free(ma->phi); free(ma->CMk); + free(ma->z); free(ma->zswap); + free(ma->afs); free(ma->afs1); + free(ma); + } +} + +#define char2int(s) (((int)s[0])<<8|s[1]) + +static int cal_pdg(const bcf1_t *b, bcf_p1aux_t *ma) +{ + int i, j, k; + long *p, tmp; + p = alloca(b->n_alleles * sizeof(long)); + memset(p, 0, sizeof(long) * b->n_alleles); + for (j = 0; j < ma->n; ++j) { + const uint8_t *pi = ma->PL + j * ma->PL_len; + double *pdg = ma->pdg + j * 3; + pdg[0] = ma->q2p[pi[b->n_alleles]]; pdg[1] = ma->q2p[pi[1]]; pdg[2] = ma->q2p[pi[0]]; + for (i = k = 0; i < b->n_alleles; ++i) { + p[i] += (int)pi[k]; + k += b->n_alleles - i; + } + } + for (i = 0; i < b->n_alleles; ++i) p[i] = p[i]<<4 | i; + for (i = 1; i < b->n_alleles; ++i) // insertion sort + for (j = i; j > 0 && p[j] < p[j-1]; --j) + tmp = p[j], p[j] = p[j-1], p[j-1] = tmp; + for (i = b->n_alleles - 1; i >= 0; --i) + if ((p[i]&0xf) == 0) break; + return i; +} +// f0 is the reference allele frequency +static double mc_freq_iter(double f0, const bcf_p1aux_t *ma) +{ + double f, f3[3]; + int i; + f3[0] = (1.-f0)*(1.-f0); f3[1] = 2.*f0*(1.-f0); f3[2] = f0*f0; + for (i = 0, f = 0.; i < ma->n; ++i) { + double *pdg; + pdg = ma->pdg + i * 3; + f += (pdg[1] * f3[1] + 2. * pdg[2] * f3[2]) + / (pdg[0] * f3[0] + pdg[1] * f3[1] + pdg[2] * f3[2]); + } + f /= ma->n * 2.; + return f; +} + +int bcf_p1_call_gt(const bcf_p1aux_t *ma, double f0, int k) +{ + double sum, g[3]; + double max, f3[3], *pdg = ma->pdg + k * 3; + int q, i, max_i; + f3[0] = (1.-f0)*(1.-f0); f3[1] = 2.*f0*(1.-f0); f3[2] = f0*f0; + for (i = 0, sum = 0.; i < 3; ++i) + sum += (g[i] = pdg[i] * f3[i]); + for (i = 0, max = -1., max_i = 0; i < 3; ++i) { + g[i] /= sum; + if (g[i] > max) max = g[i], max_i = i; + } + max = 1. - max; + if (max < 1e-308) max = 1e-308; + q = (int)(-3.434 * log(max) + .499); + if (q > 99) q = 99; + return q<<2|max_i; +} + +static void mc_cal_z(bcf_p1aux_t *ma) +{ + double *z[2], *tmp, *pdg; + int i, j; + z[0] = ma->z; + z[1] = ma->zswap; + pdg = ma->pdg; + z[0][0] = 1.; z[0][1] = z[0][2] = 0.; + for (j = 0; j < ma->n; ++j) { + int max = (j + 1) * 2; + double p[3]; + pdg = ma->pdg + j * 3; + p[0] = pdg[0]; p[1] = 2. * pdg[1]; p[2] = pdg[2]; + z[1][0] = p[0] * z[0][0]; + z[1][1] = p[0] * z[0][1] + p[1] * z[0][0]; + for (i = 2; i <= max; ++i) + z[1][i] = p[0] * z[0][i] + p[1] * z[0][i-1] + p[2] * z[0][i-2]; + if (j < ma->n - 1) z[1][max+1] = z[1][max+2] = 0.; + tmp = z[0]; z[0] = z[1]; z[1] = tmp; + } + if (z[0] != ma->z) memcpy(ma->z, z[0], sizeof(double) * (ma->M + 1)); +} + +static double mc_cal_afs(bcf_p1aux_t *ma) +{ + int k; + long double sum = 0.; + memset(ma->afs1, 0, sizeof(double) * (ma->M + 1)); + mc_cal_z(ma); + for (k = 0, sum = 0.; k <= ma->M; ++k) + sum += (long double)ma->phi[k] * ma->z[k] / ma->CMk[k]; + for (k = 0; k <= ma->M; ++k) { + ma->afs1[k] = ma->phi[k] * ma->z[k] / ma->CMk[k] / sum; + if (isnan(ma->afs1[k]) || isinf(ma->afs1[k])) return -1.; + } + for (k = 0, sum = 0.; k <= ma->M; ++k) { + ma->afs[k] += ma->afs1[k]; + sum += k * ma->afs1[k]; + } + return sum / ma->M; +} + +int bcf_p1_cal(bcf1_t *b, bcf_p1aux_t *ma, bcf_p1rst_t *rst) +{ + int i, k; + long double sum = 0.; + // set PL and PL_len + for (i = 0; i < b->n_gi; ++i) { + if (b->gi[i].fmt == char2int("PL")) { + ma->PL = (uint8_t*)b->gi[i].data; + ma->PL_len = b->gi[i].len; + break; + } + } + if (b->n_alleles < 2) return -1; // FIXME: find a better solution + // + rst->rank0 = cal_pdg(b, ma); + rst->f_exp = mc_cal_afs(ma); + rst->p_ref = ma->afs1[ma->M]; + // calculate f_flat and f_em + for (k = 0, sum = 0.; k <= ma->M; ++k) + sum += (long double)ma->z[k] / ma->CMk[k]; + rst->f_flat = 0.; + for (k = 0; k <= ma->M; ++k) { + double p = ma->z[k] / ma->CMk[k] / sum; + rst->f_flat += k * p; + } + rst->f_flat /= ma->M; + { // calculate f_em + double flast = rst->f_flat; + for (i = 0; i < MC_MAX_EM_ITER; ++i) { + rst->f_em = mc_freq_iter(flast, ma); + if (fabs(rst->f_em - flast) < MC_EM_EPS) break; + flast = rst->f_em; + } + } + return 0; +} + +void bcf_p1_dump_afs(bcf_p1aux_t *ma) +{ + int k; + fprintf(stderr, "[afs]"); + for (k = 0; k <= ma->M; ++k) + fprintf(stderr, " %d:%.3lf", k, ma->afs[ma->M - k]); + fprintf(stderr, "\n"); + memset(ma->afs, 0, sizeof(double) * (ma->M + 1)); +} diff --git a/bcftools/prob1.h b/bcftools/prob1.h new file mode 100644 index 0000000..d34a75f --- /dev/null +++ b/bcftools/prob1.h @@ -0,0 +1,33 @@ +#ifndef BCF_PROB1_H +#define BCF_PROB1_H + +#include "bcf.h" + +struct __bcf_p1aux_t; +typedef struct __bcf_p1aux_t bcf_p1aux_t; + +typedef struct { + int rank0; + double f_em, f_exp, f_flat, p_ref; +} bcf_p1rst_t; + +#define MC_PTYPE_FULL 1 +#define MC_PTYPE_COND2 2 +#define MC_PTYPE_FLAT 3 + +#ifdef __cplusplus +extern "C" { +#endif + + bcf_p1aux_t *bcf_p1_init(int n); + void bcf_p1_init_prior(bcf_p1aux_t *ma, int type, double theta); + void bcf_p1_destroy(bcf_p1aux_t *ma); + int bcf_p1_cal(bcf1_t *b, bcf_p1aux_t *ma, bcf_p1rst_t *rst); + int bcf_p1_call_gt(const bcf_p1aux_t *ma, double f0, int k); + void bcf_p1_dump_afs(bcf_p1aux_t *ma); + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/bcftools/vcfout.c b/bcftools/vcfout.c new file mode 100644 index 0000000..d71dfd9 --- /dev/null +++ b/bcftools/vcfout.c @@ -0,0 +1,185 @@ +#include +#include +#include +#include +#include "bcf.h" +#include "prob1.h" +#include "kstring.h" + +#include "khash.h" +KHASH_SET_INIT_INT64(set64) + +#include "kseq.h" +KSTREAM_INIT(gzFile, gzread, 16384) + +#define VC_NO_PL 1 +#define VC_NO_GENO 2 +#define VC_BCF 4 +#define VC_CALL 8 +#define VC_VARONLY 16 + +typedef struct { + int flag, prior_type; + char *fn_list; + double theta; +} viewconf_t; + +khash_t(set64) *bcf_load_pos(const char *fn, bcf_hdr_t *_h) +{ + void *str2id; + gzFile fp; + kstream_t *ks; + int ret, dret, lineno = 1; + kstring_t *str; + khash_t(set64) *hash = 0; + + hash = kh_init(set64); + str2id = bcf_build_refhash(_h); + str = calloc(1, sizeof(kstring_t)); + fp = strcmp(fn, "-")? gzopen(fn, "r") : gzdopen(fileno(stdin), "r"); + ks = ks_init(fp); + while (ks_getuntil(ks, 0, str, &dret) >= 0) { + int tid = bcf_str2id(str2id, str->s); + if (tid >= 0 && dret != '\n') { + if (ks_getuntil(ks, 0, str, &dret) >= 0) { + uint64_t x = (uint64_t)tid<<32 | (atoi(str->s) - 1); + kh_put(set64, hash, x, &ret); + } else break; + } else fprintf(stderr, "[%s] %s is not a reference name (line %d).\n", __func__, str->s, lineno); + if (dret != '\n') while ((dret = ks_getc(ks)) > 0 && dret != '\n'); + if (dret < 0) break; + ++lineno; + } + bcf_str2id_destroy(str2id); + ks_destroy(ks); + gzclose(fp); + free(str->s); free(str); + return hash; +} + +static int update_bcf1(bcf1_t *b, const bcf_p1aux_t *pa, const bcf_p1rst_t *pr, int flag) +{ + kstring_t s; + int x, is_var = (pr->p_ref < .5); + double r = is_var? pr->p_ref : 1. - pr->p_ref; + memset(&s, 0, sizeof(kstring_t)); + kputc('\0', &s); kputs(b->ref, &s); kputc('\0', &s); + if (is_var) { + kputs(b->alt, &s); + } + kputc('\0', &s); kputc('\0', &s); + kputs(b->info, &s); + if (b->info[0]) kputc(';', &s); + ksprintf(&s, "AF1=%.3lf;AFE=%.3lf", 1.-pr->f_em, 1.-pr->f_exp); + kputc('\0', &s); + kputs(b->fmt, &s); kputc('\0', &s); + free(b->str); + b->m_str = s.m; b->l_str = s.l; b->str = s.s; + x = (int)(r < 1e-100? 99 : -3.434 * log(r) + .499); + b->qual = x > 99? 99 : x; + return is_var; +} + +int bcfview(int argc, char *argv[]) +{ + bcf_t *bp, *bout = 0; + bcf1_t *b; + int c; + uint64_t n_processed = 0; + viewconf_t vc; + bcf_p1aux_t *p1 = 0; + int tid, begin, end; + khash_t(set64) *hash = 0; + + tid = begin = end = -1; + memset(&vc, 0, sizeof(viewconf_t)); + vc.prior_type = -1; vc.theta = 1e-3; + while ((c = getopt(argc, argv, "l:cGvLbP:t:")) >= 0) { + switch (c) { + case 'l': vc.fn_list = strdup(optarg); break; + case 'G': vc.flag |= VC_NO_GENO; break; + case 'L': vc.flag |= VC_NO_PL; break; + case 'b': vc.flag |= VC_BCF; break; + case 'c': vc.flag |= VC_CALL; break; + case 'v': vc.flag |= VC_VARONLY; break; + case 't': vc.theta = atof(optarg); break; + case 'P': + if (strcmp(optarg, "full") == 0) vc.prior_type = MC_PTYPE_FULL; + else if (strcmp(optarg, "cond2") == 0) vc.prior_type = MC_PTYPE_COND2; + else if (strcmp(optarg, "flat") == 0) vc.prior_type = MC_PTYPE_FLAT; + break; + } + } + if (argc == optind) { + fprintf(stderr, "Usage: bcftools view [-cGPb] [-l list] [reg]\n"); + return 1; + } + + b = calloc(1, sizeof(bcf1_t)); + bp = bcf_open(argv[optind], "r"); + if (vc.flag & VC_BCF) { + bout = bcf_open("-", "w"); + bcf_hdr_cpy(&bout->h, &bp->h); + bcf_hdr_write(bout); + } + if (vc.flag & VC_CALL) { + p1 = bcf_p1_init(bp->h.n_smpl); + bcf_p1_init_prior(p1, vc.prior_type, vc.theta); + } + if (vc.fn_list) hash = bcf_load_pos(vc.fn_list, &bp->h); + if (optind + 1 < argc) { + void *str2id = bcf_build_refhash(&bp->h); + if (bcf_parse_region(str2id, argv[optind+1], &tid, &begin, &end) >= 0) { + bcf_idx_t *idx; + idx = bcf_idx_load(argv[optind]); + if (idx) { + uint64_t off; + off = bcf_idx_query(idx, tid, begin, end); + bgzf_seek(bp->fp, off, SEEK_SET); + bcf_idx_destroy(idx); + } + } + } + while (bcf_read(bp, b) > 0) { + if (hash) { + uint64_t x = (uint64_t)b->tid<<32 | b->pos; + khint_t k = kh_get(set64, hash, x); + if (k == kh_end(hash)) continue; + } + if (tid >= 0) { + int l = strlen(b->ref); + l = b->pos + (l > 0? l : 1); + if (b->tid != tid || b->pos >= end) break; + if (!(l > begin && end > b->pos)) continue; + } + if (vc.flag & VC_CALL) { + bcf_p1rst_t pr; + int is_var; + bcf_p1_cal(b, p1, &pr); + is_var = update_bcf1(b, p1, &pr, vc.flag); + bcf_sync(bp->h.n_smpl, b); + if ((n_processed + 1) % 50000 == 0) bcf_p1_dump_afs(p1); + if ((vc.flag & VC_VARONLY) && !is_var) continue; + } + if (vc.flag & VC_BCF) bcf_write(bout, b); + else { + char *vcf; + if (vc.flag & VC_NO_GENO) { + b->n_gi = 0; + b->fmt[0] = '\0'; + } + vcf = bcf_fmt(bp, b); + puts(vcf); + free(vcf); + } + ++n_processed; + } + if (vc.flag & VC_CALL) bcf_p1_dump_afs(p1); + bcf_close(bp); bcf_close(bout); + bcf_destroy(b); + if (hash) kh_destroy(set64, hash); + if (vc.fn_list) free(vc.fn_list); + if (p1) bcf_p1_destroy(p1); + return 0; +} + -- 2.39.2