From 7eaf231bcd3c44fad56f8d7200e44a37ae453c88 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Mon, 2 Aug 2010 20:07:25 +0000 Subject: [PATCH] Generate binary VCF --- Makefile | 8 +- bam2bcf.c | 239 ++++++++++++++++++++++++++++++++++++++++++++++++++ bam2bcf.h | 37 ++++++++ bam_plcmd.c | 68 +++++++++++++-- bcf.c | 247 ++++++++++++++++++++++++++++++++++++++++++++++++++++ bcf.h | 57 ++++++++++++ 6 files changed, 647 insertions(+), 9 deletions(-) create mode 100644 bam2bcf.c create mode 100644 bam2bcf.h create mode 100644 bcf.c create mode 100644 bcf.h diff --git a/Makefile b/Makefile index 88d8217..4f89dc1 100644 --- a/Makefile +++ b/Makefile @@ -1,5 +1,5 @@ CC= gcc -CFLAGS= -g -Wall -O2 #-m64 #-arch ppc +CFLAGS= -g -Wall #-O2 #-m64 #-arch ppc DFLAGS= -D_FILE_OFFSET_BITS=64 -D_USE_KNETFILE -D_CURSES_LIB=1 KNETFILE_O= knetfile.o LOBJS= bgzf.o kstring.o bam_aux.o bam.o bam_import.o sam.o bam_index.o \ @@ -7,7 +7,7 @@ LOBJS= bgzf.o kstring.o bam_aux.o bam.o bam_import.o sam.o bam_index.o \ $(KNETFILE_O) bam_sort.o sam_header.o bam_reheader.o AOBJS= bam_tview.o bam_maqcns.o bam_plcmd.o sam_view.o \ bam_rmdup.o bam_rmdupse.o bam_mate.o bam_stat.o bam_color.o \ - bamtk.o kaln.o bam_mcns.o + bamtk.o kaln.o bam_mcns.o bam2bcf.o bcf.o PROG= samtools INCLUDES= SUBDIRS= . misc @@ -53,7 +53,7 @@ bam.o:bam.h razf.h bam_endian.h kstring.h sam_header.h sam.o:sam.h bam.h bam_import.o:bam.h kseq.h khash.h razf.h bam_pileup.o:bam.h razf.h ksort.h -bam_plcmd.o:bam.h faidx.h bam_maqcns.h glf.h +bam_plcmd.o:bam.h faidx.h bam_maqcns.h glf.h bam_mcns.h bcf.h bam2bcf.h bam_index.o:bam.h khash.h ksort.h razf.h bam_endian.h bam_lpileup.o:bam.h ksort.h bam_tview.o:bam.h faidx.h bam_maqcns.h @@ -62,6 +62,8 @@ bam_sort.o:bam.h ksort.h razf.h bam_md.o:bam.h faidx.h glf.o:glf.h sam_header.o:sam_header.h khash.h +bcf.o:bcf.h +bam2bcf.o:bam2bcf.h bcf.h faidx.o:faidx.h razf.h khash.h faidx_main.o:faidx.h razf.h diff --git a/bam2bcf.c b/bam2bcf.c new file mode 100644 index 0000000..a919cbf --- /dev/null +++ b/bam2bcf.c @@ -0,0 +1,239 @@ +#include +#include +#include "bam.h" +#include "kstring.h" +#include "bam2bcf.h" +#include "bcf.h" + +extern void ks_introsort_uint32_t(size_t n, uint32_t a[]); + +#define CALL_ETA 0.03f +#define CALL_MAX 256 +#define CALL_DEFTHETA 0.85f + +struct __bcf_callaux_t { + int max_info, capQ; + double *fk; + uint32_t *info; +}; + +bcf_callaux_t *bcf_call_init(double theta) +{ + bcf_callaux_t *bca; + int n; + if (theta <= 0.) theta = CALL_DEFTHETA; + bca = calloc(1, sizeof(bcf_callaux_t)); + bca->capQ = 60; + bca->fk = calloc(CALL_MAX, sizeof(double)); + bca->fk[0] = 1.; + for (n = 1; n < CALL_MAX; ++n) + bca->fk[n] = theta >= 1.? 1. : pow(theta, n) * (1.0 - CALL_ETA) + CALL_ETA; + bca->fk[CALL_MAX-1] = 0.; + return bca; +} + +void bcf_call_destroy(bcf_callaux_t *bca) +{ + if (bca) { + free(bca->info); free(bca->fk); free(bca); + } +} + +typedef struct { + float esum[4], fsum[4]; + uint32_t c[4]; + int w[8]; +} auxaux_t; + +/* + The following code is nearly identical to bam_maqcns_glfgen() under + the simplified SOAPsnp model. It does the following: + + 1) Collect strand, base, quality and mapping quality information for + each base and combine them in an integer: + + x = min{baseQ,mapQ}<<24 | 1<<21 | strand<<18 | base<<16 | baseQ<<8 | mapQ + + 2) Sort the list of integers for the next step. + + 3) For each base, calculate e_b, the sum of weighted qualities. For + each type of base on each strand, the best quality has the highest + weight. Only the top 255 bases on each strand are used (different + from maqcns). + + 4) Rescale the total read depth to 255. + + 5) Calculate Q(D|g) = -10\log_{10}P(D|g) (d_a is the allele count): + + Q(D|)=\sum_{b\not=a}e_b + Q(D|)=3*(d_a+d_A)+\sum_{b\not=a,b\not=A}e_b + */ +int bcf_call_glfgen(int _n, const bam_pileup1_t *pl, int ref_base /*4-bit*/, bcf_callaux_t *bca, bcf_callret1_t *r) +{ + int i, j, k, c, n; + float *p = r->p; + auxaux_t aux; + + memset(r, 0, sizeof(bcf_callret1_t)); + if (_n == 0) return -1; + + // enlarge the aux array if necessary + if (bca->max_info < _n) { + bca->max_info = _n; + kroundup32(bca->max_info); + bca->info = (uint32_t*)realloc(bca->info, 4 * bca->max_info); + } + // fill the aux array + for (i = n = 0; i < _n; ++i) { + const bam_pileup1_t *p = pl + i; + uint32_t q, x = 0, qq; + if (p->is_del || (p->b->core.flag&BAM_FUNMAP)) continue; // skip unmapped reads and deleted bases + q = (uint32_t)bam1_qual(p->b)[p->qpos]; // base quality + x |= (uint32_t)bam1_strand(p->b) << 18 | q << 8 | p->b->core.qual; + if (p->b->core.qual < q) q = p->b->core.qual; // cap the overall quality at mapping quality + x |= q << 24; + qq = bam1_seqi(bam1_seq(p->b), p->qpos); // base + q = bam_nt16_nt4_table[qq? qq : ref_base]; // q is the 2-bit base + if (q < 4) x |= 1 << 21 | q << 16; + + bca->info[n++] = x; + } + ks_introsort_uint32_t(n, bca->info); + r->depth = n; + // generate esum and fsum + memset(&aux, 0, sizeof(auxaux_t)); + for (j = n - 1, r->sum_Q2 = 0; j >= 0; --j) { // calculate esum and fsum + uint32_t info = bca->info[j]; + int tmp; + if (info>>24 < 4 && (info>>8&0x3f) != 0) info = 4<<24 | (info&0xffffff); + k = info>>16&7; + if (info>>24 > 0) { + aux.esum[k&3] += bca->fk[aux.w[k]] * (info>>24); + aux.fsum[k&3] += bca->fk[aux.w[k]]; + if (aux.w[k] + 1 < CALL_MAX) ++aux.w[k]; + ++aux.c[k&3]; + } + tmp = (int)(info&0xff) < bca->capQ? (int)(info&0xff) : bca->capQ; + r->sum_Q2 += tmp * tmp; + } + memcpy(r->esum, aux.esum, 4 * sizeof(float)); + // rescale ->c[] + for (j = c = 0; j != 4; ++j) c += aux.c[j]; + if (c > 255) { + for (j = 0; j != 4; ++j) aux.c[j] = (int)(254.0 * aux.c[j] / c + 0.499); + for (j = c = 0; j != 4; ++j) c += aux.c[j]; + } + // generate likelihood + for (j = 0; j != 4; ++j) { + float tmp; + // homozygous + for (k = 0, tmp = 0.0; k != 4; ++k) + if (j != k) tmp += aux.esum[k]; + p[j<<2|j] = tmp; // anything that is not j + // heterozygous + for (k = j + 1; k < 4; ++k) { + for (i = 0, tmp = 0.0; i != 4; ++i) + if (i != j && i != k) tmp += aux.esum[i]; + p[j<<2|k] = p[k<<2|j] = 3.01 * (aux.c[j] + aux.c[k]) + tmp; + } + } + return 0; +} + +/* + 1) Find the top 2 bases (from esum[4]). + + 2) If the reference base is among the top 2, consider the locus is + potentially biallelic and set call->a[2] as -1; otherwise, the + locus is potentially triallelic. If the reference is ambiguous, + take the weakest call as the pseudo-reference. + */ +int bcf_call_combine(int n, const bcf_callret1_t *calls, int ref_base /*4-bit*/, bcf_call_t *call) +{ + int ref4, i, j; + int64_t sum[4], tmp; + call->ori_ref = ref4 = bam_nt16_nt4_table[ref_base]; + if (ref4 > 4) ref4 = 4; + { // calculate esum + double esum[4]; + memset(esum, 0, sizeof(double) * 4); + for (i = 0; i < n; ++i) { + for (j = 0; j < 4; ++j) + esum[j] += calls[i].esum[j]; + } + for (j = 0; j < 4; ++j) + sum[j] = (int)(esum[j] * 100. + .499) << 2 | j; + } + // find the top 2 alleles + for (i = 1; i < 4; ++i) // insertion sort + for (j = i; j > 0 && sum[j] < sum[j-1]; --j) + tmp = sum[j], sum[j] = sum[j-1], sum[j-1] = tmp; + call->a[0] = sum[3]&3; call->a[1] = sum[2]&3; call->a[2] = -1; + // set the reference allele and alternative allele(s) + if (ref4 != call->a[0]) { // ref is not the best + if (ref4 < 4) { // not ambiguous + if (ref4 == call->a[1]) tmp = call->a[0], call->a[0] = call->a[1], call->a[1] = tmp; // switch 0 and 1 + else call->a[2] = call->a[1], call->a[1] = call->a[0], call->a[0] = ref4; // triallele + } + } + // set the PL array + if (call->n < n) { + call->n = n; + call->PL = realloc(call->PL, 6 * n); + } + { + int x, g[6], z; + double sum_min = 0.; + call->n_alleles = call->a[2] < 0? 2 : 3; + x = call->n_alleles * (call->n_alleles + 1) / 2; + // get the possible genotypes + for (i = z = 0; i < call->n_alleles; ++i) + for (j = i; j < call->n_alleles; ++j) + g[z++] = call->a[i]<<2 | call->a[j]; + for (i = 0; i < n; ++i) { + uint8_t *PL = call->PL + x * i; + const bcf_callret1_t *r = calls + i; + float min = 1e37; + for (j = 0; j < x; ++j) + if (min > r->p[g[j]]) min = r->p[g[j]]; + sum_min += min; + for (j = 0; j < x; ++j) { + int y; + y = (int)(r->p[g[j]] - min + .499); + if (y > 255) y = 255; + PL[j] = y; + } + } + call->shift = (int)(sum_min + .499); + } + for (i = call->depth = 0, tmp = 0; i < n; ++i) { + call->depth += calls[i].depth; + tmp += calls[i].sum_Q2; + } + call->rmsQ = (int)(sqrt((double)tmp / call->depth) + .499); + return 0; +} + +int bcf_call2bcf(int tid, int pos, bcf_call_t *bc, bcf1_t *b) +{ + kstring_t s; + int i, beg; + b->tid = tid; b->pos = pos; b->qual = 0; + s.s = b->str; s.m = b->m_str; s.l = 0; + kputc('\0', &s); + kputc("ACGTN"[bc->ori_ref], &s); kputc('\0', &s); + beg = bc->ori_ref > 3? 0 : 1; + for (i = beg; i < 4; ++i) { + if (bc->a[i] < 0) break; + if (i > beg) kputc(',', &s); + kputc("ACGT"[bc->a[i]], &s); + } + kputc('\0', &s); + kputc('\0', &s); + kputs("MQ=", &s); kputw(bc->rmsQ, &s); kputc('\0', &s); + kputs("PL", &s); kputc('\0', &s); + b->m_str = s.m; b->str = s.s; b->l_str = s.l; + bcf_sync(bc->n, b); + memcpy(b->gi[0].data, bc->PL, b->gi[0].len * bc->n); + return 0; +} diff --git a/bam2bcf.h b/bam2bcf.h new file mode 100644 index 0000000..3d9a804 --- /dev/null +++ b/bam2bcf.h @@ -0,0 +1,37 @@ +#ifndef BAM2BCF_H +#define BAM2BCF_H + +#include +#include "bcf.h" + +struct __bcf_callaux_t; +typedef struct __bcf_callaux_t bcf_callaux_t; + +typedef struct { + int depth; + uint64_t sum_Q2; + float p[16], esum[4]; +} bcf_callret1_t; + +typedef struct { + int a[4]; // alleles: ref, alt, alt2, alt3 + int n, n_alleles, shift, ori_ref; + int depth, rmsQ; + uint8_t *PL; +} bcf_call_t; + +#ifdef __cplusplus +extern "C" { +#endif + + bcf_callaux_t *bcf_call_init(double theta); + void bcf_call_destroy(bcf_callaux_t *bca); + int bcf_call_glfgen(int _n, const bam_pileup1_t *pl, int ref_base /*4-bit*/, bcf_callaux_t *bca, bcf_callret1_t *r); + int bcf_call_combine(int n, const bcf_callret1_t *calls, int ref_base /*4-bit*/, bcf_call_t *call); + int bcf_call2bcf(int tid, int pos, bcf_call_t *bc, bcf1_t *b); + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/bam_plcmd.c b/bam_plcmd.c index bea9169..8affabf 100644 --- a/bam_plcmd.c +++ b/bam_plcmd.c @@ -6,6 +6,7 @@ #include "faidx.h" #include "bam_maqcns.h" #include "bam_mcns.h" +#include "bam2bcf.h" #include "khash.h" #include "glf.h" #include "kstring.h" @@ -452,6 +453,7 @@ int bam_pileup(int argc, char *argv[]) #define MPLP_VCF 0x1 #define MPLP_VAR 0x2 #define MPLP_AFALL 0x8 +#define MPLP_GLF 0x10 #define MPLP_AFS_BLOCK 0x10000 @@ -482,7 +484,6 @@ static int mplp_func(void *data, bam1_t *b) static int mpileup(mplp_conf_t *conf, int n, char **fn) { mplp_aux_t **data; - mc_aux_t *ma = 0; int i, tid, pos, *n_plp, beg0 = 0, end0 = 1u<<29, ref_len, ref_tid; const bam_pileup1_t **plp; bam_mplp_t iter; @@ -490,10 +491,19 @@ static int mpileup(mplp_conf_t *conf, int n, char **fn) uint64_t N = 0; char *ref; khash_t(64) *hash = 0; - // allocate + + mc_aux_t *ma = 0; + + bcf_callaux_t *bca = 0; + bcf_callret1_t *bcr = 0; + bcf_call_t bc; + bcf_t *bp = 0; + + memset(&bc, 0, sizeof(bcf_call_t)); data = calloc(n, sizeof(void*)); plp = calloc(n, sizeof(void*)); n_plp = calloc(n, sizeof(int*)); + // read the header and initialize data for (i = 0; i < n; ++i) { bam_header_t *h_tmp; @@ -525,7 +535,33 @@ static int mpileup(mplp_conf_t *conf, int n, char **fn) } if (conf->fn_pos) hash = load_pos(conf->fn_pos, h); // write the VCF header - if (conf->flag & MPLP_VCF) { + if (conf->flag & MPLP_GLF) { + kstring_t s; + s.l = s.m = 0; s.s = 0; + bp = bcf_open("-", "w"); + for (i = 0; i < h->n_targets; ++i) { + kputs(h->target_name[i], &s); + kputc('\0', &s); + } + bp->h.l_nm = s.l; + bp->h.name = malloc(s.l); + memcpy(bp->h.name, s.s, s.l); + s.l = 0; + for (i = 0; i < n; ++i) { + const char *p; + if ((p = strstr(fn[i], ".bam")) != 0) + kputsn(fn[i], p - fn[i], &s); + else kputs(fn[i], &s); + kputc('\0', &s); + } + bp->h.l_smpl = s.l; + bp->h.sname = malloc(s.l); + memcpy(bp->h.sname, s.s, s.l); + bp->h.l_txt = 0; + free(s.s); + bcf_hdr_sync(&bp->h); + bcf_hdr_write(bp); + } else if (conf->flag & MPLP_VCF) { kstring_t s; s.l = s.m = 0; s.s = 0; puts("##fileformat=VCFv4.0"); @@ -546,7 +582,10 @@ static int mpileup(mplp_conf_t *conf, int n, char **fn) free(s.s); } // mpileup - if (conf->flag & MPLP_VCF) { + if (conf->flag & MPLP_GLF) { + bca = bcf_call_init(-1.); + bcr = calloc(n, sizeof(bcf_callret1_t)); + } else if (conf->flag & MPLP_VCF) { ma = mc_init(n); mc_init_prior(ma, conf->prior_type, conf->theta); } @@ -564,7 +603,19 @@ static int mpileup(mplp_conf_t *conf, int n, char **fn) if (conf->fai) ref = fai_fetch(conf->fai, h->target_name[tid], &ref_len); ref_tid = tid; } - if (conf->flag & MPLP_VCF) { + if (conf->flag & MPLP_GLF) { + int _ref0, ref16; + bcf1_t *b = calloc(1, sizeof(bcf1_t)); + _ref0 = (ref && pos < ref_len)? ref[pos] : 'N'; + ref16 = bam_nt16_table[_ref0]; + for (i = 0; i < n; ++i) + bcf_call_glfgen(n_plp[i], plp[i], ref16, bca, bcr + i); + bcf_call_combine(n, bcr, ref16, &bc); + bcf_call2bcf(tid, pos, &bc, b); + bcf_write(bp, b); + //fprintf(stderr, "%d,%d,%d\n", b->tid, b->pos, b->l_str); + bcf_destroy(b); + } else if (conf->flag & MPLP_VCF) { mc_rst_t r; int j, _ref0, depth, rms_q, _ref0b, is_var = 0, qref = 0, level = 2, tot; uint64_t sqr_sum; @@ -636,6 +687,7 @@ static int mpileup(mplp_conf_t *conf, int n, char **fn) putchar('\n'); } } + bcf_close(bp); if (conf->flag&MPLP_VCF) mc_dump_afs(ma); if (hash) { // free the hash table khint_t k; @@ -643,6 +695,7 @@ static int mpileup(mplp_conf_t *conf, int n, char **fn) if (kh_exist(hash, k)) free(kh_val(hash, k)); kh_destroy(64, hash); } + bcf_call_destroy(bca); free(bc.PL); free(bcr); mc_destroy(ma); bam_mplp_destroy(iter); bam_header_destroy(h); @@ -663,7 +716,7 @@ int bam_mpileup(int argc, char *argv[]) mplp.max_mq = 60; mplp.prior_type = MC_PTYPE_FULL; mplp.theta = 1e-3; - while ((c = getopt(argc, argv, "vVcFSP:f:r:l:VM:q:t:")) >= 0) { + while ((c = getopt(argc, argv, "gvVcFSP:f:r:l:VM:q:t:")) >= 0) { switch (c) { case 't': mplp.theta = atof(optarg); break; case 'P': @@ -681,6 +734,7 @@ int bam_mpileup(int argc, char *argv[]) break; case 'r': mplp.reg = strdup(optarg); break; case 'l': mplp.fn_pos = strdup(optarg); break; + case 'g': mplp.flag |= MPLP_GLF; break; case 'V': case 'c': mplp.flag |= MPLP_VCF; break; case 'F': mplp.flag |= MPLP_AFALL; break; @@ -689,6 +743,7 @@ int bam_mpileup(int argc, char *argv[]) case 'q': mplp.min_mq = atoi(optarg); break; } } + if (mplp.flag&MPLP_GLF) mplp.flag &= ~MPLP_VCF; if (argc == 1) { fprintf(stderr, "\n"); fprintf(stderr, "Usage: samtools mpileup [options] in1.bam [in2.bam [...]]\n\n"); @@ -700,6 +755,7 @@ int bam_mpileup(int argc, char *argv[]) fprintf(stderr, " -t FLOAT scaled mutation rate [%lg]\n", mplp.theta); fprintf(stderr, " -P STR prior: full, flat, cond2 [full]\n"); fprintf(stderr, " -c generate VCF output (consensus calling)\n"); + fprintf(stderr, " -g generate GLF output\n"); fprintf(stderr, " -v show variant sites only\n"); fprintf(stderr, "\n"); fprintf(stderr, "Notes: Assuming error independency and diploid individuals.\n\n"); diff --git a/bcf.c b/bcf.c new file mode 100644 index 0000000..12f6634 --- /dev/null +++ b/bcf.c @@ -0,0 +1,247 @@ +#include +#include +#include +#include "kstring.h" +#include "bcf.h" + +int bcf_hdr_read(bcf_t *b); + +void bcf_hdr_clear(bcf_hdr_t *b) +{ + free(b->name); free(b->sname); free(b->txt); free(b->ns); free(b->sns); + memset(b, 0, sizeof(bcf_hdr_t)); +} + +bcf_t *bcf_open(const char *fn, const char *mode) +{ + bcf_t *b; + b = calloc(1, sizeof(bcf_t)); + if (strchr(mode, 'w')) { + b->fp = strcmp(fn, "-")? bgzf_open(fn, mode) : bgzf_fdopen(fileno(stdout), "w"); + } else { + b->fp = strcmp(fn, "-")? bgzf_open(fn, mode) : bgzf_fdopen(fileno(stdin), "r"); + } + b->fp->owned_file = 1; + if (strchr(mode, 'r')) bcf_hdr_read(b); + return b; +} + +int bcf_close(bcf_t *b) +{ + int ret; + if (b == 0) return 0; + ret = bgzf_close(b->fp); + free(b->h.name); free(b->h.sname); free(b->h.txt); free(b->h.ns); free(b->h.sns); + free(b); + return ret; +} + +int bcf_hdr_write(bcf_t *b) +{ + bcf_hdr_t *h; + if (b == 0) return -1; + h = &b->h; + bgzf_write(b->fp, "BCF\4", 4); + bgzf_write(b->fp, &h->l_nm, 4); + bgzf_write(b->fp, h->name, h->l_nm); + bgzf_write(b->fp, &h->l_smpl, 4); + bgzf_write(b->fp, h->sname, h->l_smpl); + bgzf_write(b->fp, &h->l_txt, 4); + bgzf_write(b->fp, h->txt, h->l_txt); + bgzf_flush(b->fp); + return 16 + h->l_nm + h->l_smpl + h->l_txt; +} + +int bcf_hdr_read(bcf_t *b) +{ + uint8_t magic[4]; + bcf_hdr_t *h; + if (b == 0) return -1; + bcf_hdr_clear(&b->h); + h = &b->h; + bgzf_read(b->fp, magic, 4); + bgzf_read(b->fp, &h->l_nm, 4); + h->name = malloc(h->l_nm); + bgzf_read(b->fp, h->name, h->l_nm); + bgzf_read(b->fp, &h->l_smpl, 4); + h->sname = malloc(h->l_smpl); + bgzf_read(b->fp, h->sname, h->l_smpl); + bgzf_read(b->fp, &h->l_txt, 4); + h->txt = malloc(h->l_txt); + bgzf_read(b->fp, h->txt, h->l_txt); + bcf_hdr_sync(&b->h); + return 16 + h->l_nm + h->l_smpl + h->l_txt; +} + +static inline char **cnt_null(int l, char *str, int *_n) +{ + int n = 0; + char *p, **list; + *_n = 0; + if (l == 0 || str == 0) return 0; + for (p = str; p != str + l; ++p) + if (*p == 0) ++n; + *_n = n; + list = calloc(n, sizeof(void*)); + list[0] = str; + for (p = str, n = 1; p < str + l - 1; ++p) + if (*p == 0) list[n++] = p + 1; + return list; +} + +int bcf_hdr_sync(bcf_hdr_t *b) +{ + if (b == 0) return -1; + b->ns = cnt_null(b->l_nm, b->name, &b->n_ref); + b->sns = cnt_null(b->l_smpl, b->sname, &b->n_smpl); + return 0; +} + +#define char2int(s) (((int)s[0])<<8|s[1]) + +int bcf_sync(int n_smpl, bcf1_t *b) +{ + char *p, *tmp[4], *s; + int i, n, c; + // set ref, alt, flt, info, fmt + b->ref = b->alt = b->flt = b->info = b->fmt = 0; + for (p = b->str, n = 0; p < b->str + b->l_str; ++p) + if (*p == 0 && p+1 != b->str + b->l_str) tmp[n++] = p + 1; + if (n != 5) return -1; + b->ref = tmp[0]; b->alt = tmp[1]; b->flt = tmp[2]; b->info = tmp[3]; b->fmt = tmp[4]; + // set n_alleles + for (p = b->alt, n = 1; *p; ++p) + if (*p == ',') ++n; + b->n_alleles = n; + c = toupper(*b->ref); + if (b->alt - b->ref == 2 && (c == 'A' || c == 'C' || c == 'G' || c == 'T')) + ++b->n_alleles; + // set n_gi and gi[i].fmt + for (p = b->fmt, n = 1; *p; ++p) + if (*p == ':') ++n; + if (n > b->m_gi) { + int old_m = b->m_gi; + b->m_gi = n; + kroundup32(b->m_gi); + b->gi = realloc(b->gi, b->m_gi * sizeof(bcf_ginfo_t)); + memset(b->gi + old_m, 0, (b->m_gi - old_m) * sizeof(bcf_ginfo_t)); + } + b->n_gi = n; + for (p = s = b->fmt, n = 0; *p; ++p) { + if (*p == ':' || *(p+1) == 0) { + char *q = *p == ':'? p : p + 1; + if ((q - s) != 2) return -2; + b->gi[n].fmt = char2int(s); + s = q; + } + } + // set gi[i].len + for (i = 0; i < b->n_gi; ++i) { + if (b->gi[i].fmt == char2int("PL")) { + b->gi[i].len = b->n_alleles * (b->n_alleles + 1) / 2; + } else if (b->gi[i].fmt == char2int("DP") || b->gi[i].fmt == char2int("HQ")) { + b->gi[i].len = 2; + } else if (b->gi[i].fmt == char2int("GQ") || b->gi[i].fmt == char2int("GT")) { + b->gi[i].len = 1; + } else if (b->gi[i].fmt == char2int("GL")) { + b->gi[i].len = 4; + } + b->gi[i].data = malloc(n_smpl * b->gi[i].len); + } + return 0; +} + +int bcf_write(bcf_t *bp, const bcf1_t *b) +{ + uint32_t x; + int i, l = 0; + if (b == 0) return -1; + bgzf_write(bp->fp, &b->tid, 4); + bgzf_write(bp->fp, &b->pos, 4); + x = b->qual<<24 | b->l_str; + bgzf_write(bp->fp, &x, 4); + bgzf_write(bp->fp, b->str, b->l_str); + l = 12 + b->l_str; + for (i = 0; i < b->n_gi; ++i) { + bgzf_write(bp->fp, b->gi[i].data, b->gi[i].len * bp->h.n_smpl); + l += b->gi[i].len * bp->h.n_smpl; + } + return l; +} + +int bcf_read(bcf_t *bp, bcf1_t *b) +{ + int i, l = 0; + uint32_t x; + if (b == 0) return -1; + if (bgzf_read(bp->fp, &b->tid, 4) == 0) return 0; + bgzf_read(bp->fp, &b->pos, 4); + bgzf_read(bp->fp, &x, 4); + b->qual = x >> 24; b->l_str = x << 8 >> 8; + if (b->l_str > b->m_str) { + b->m_str = b->l_str; + kroundup32(b->m_str); + b->str = realloc(b->str, b->m_str); + } + bgzf_read(bp->fp, b->str, b->l_str); + l = 12 + b->l_str; + bcf_sync(bp->h.n_smpl, b); + for (i = 0; i < b->n_gi; ++i) { + bgzf_read(bp->fp, b->gi[i].data, b->gi[i].len * bp->h.n_smpl); + l += b->gi[i].len * bp->h.n_smpl; + } + return l; +} + +int bcf_destroy(bcf1_t *b) +{ + int i; + if (b == 0) return -1; + free(b->str); + for (i = 0; i < b->n_gi; ++i) + free(b->gi[i].data); + free(b->gi); + free(b); + return 0; +} + +static inline void fmt_str(const char *p, kstring_t *s) +{ + if (*p == 0) kputc('.', s); + else kputs(p, s); +} + +char *bcf_fmt(bcf_t *bp, bcf1_t *b) +{ + kstring_t s; + int i, j, x; + memset(&s, 0, sizeof(kstring_t)); + kputs(bp->h.ns[b->tid], &s); kputc('\t', &s); + kputw(b->pos + 1, &s); kputc('\t', &s); + fmt_str(b->str, &s); kputc('\t', &s); + fmt_str(b->ref, &s); kputc('\t', &s); + fmt_str(b->alt, &s); kputc('\t', &s); + kputw(b->qual, &s); kputc('\t', &s); + fmt_str(b->info, &s); kputc('\t', &s); + fmt_str(b->fmt, &s); + x = b->n_alleles * (b->n_alleles + 1) / 2; + for (j = 0; j < bp->h.n_smpl; ++j) { + kputc('\t', &s); + for (i = 0; i < b->n_gi; ++i) { + if (i) kputc(':', &s); + if (b->gi[i].fmt == char2int("PL")) { + uint8_t *d = (uint8_t*)b->gi[i].data + j * x; + int k; + for (k = 0; k < x; ++k) { + if (k > 0) kputc(',', &s); + kputw(d[k], &s); + } + } else if (b->gi[i].fmt == char2int("DP")) { + kputw(((uint16_t*)b->gi[i].data)[j], &s); + } else if (b->gi[i].fmt == char2int("GQ")) { + kputw(((uint8_t*)b->gi[i].data)[j], &s); + } + } + } + return s.s; +} diff --git a/bcf.h b/bcf.h new file mode 100644 index 0000000..7512f3e --- /dev/null +++ b/bcf.h @@ -0,0 +1,57 @@ +#ifndef BCF_H +#define BCF_H + +#include +#include "bgzf.h" + +typedef struct { + int fmt, len; // len is the unit length + void *data; + // derived info: fmt, len +} bcf_ginfo_t; + +typedef struct { + int32_t tid, pos; + uint32_t qual:8, l_str:24; + int m_str; + char *str, *ref, *alt, *flt, *info, *fmt; // fmt, ref, alt and info point to str + int n_gi, m_gi; + bcf_ginfo_t *gi; + int n_alleles; + // derived info: ref, alt, flt, info, fmt, n_gi, n_alleles +} bcf1_t; + +typedef struct { + int32_t n_ref, n_smpl; + int32_t l_nm; + int32_t l_smpl; + int32_t l_txt; + char *name, *sname, *txt; + char **ns, **sns; + // derived info: n_ref, n_smpl, ns, sns +} bcf_hdr_t; + +typedef struct { + BGZF *fp; + bcf_hdr_t h; +} bcf_t; + +#ifdef __cplusplus +extern "C" { +#endif + + bcf_t *bcf_open(const char *fn, const char *mode); + int bcf_close(bcf_t *b); + int bcf_read(bcf_t *bp, bcf1_t *b); + int bcf_sync(int n_smpl, bcf1_t *b); + int bcf_write(bcf_t *bp, const bcf1_t *b); + int bcf_hdr_write(bcf_t *b); + int bcf_hdr_sync(bcf_hdr_t *b); + int bcf_destroy(bcf1_t *b); + char *bcf_fmt(bcf_t *bp, bcf1_t *b); + +#ifdef __cplusplus +} +#endif + +#endif -- 2.39.2