--- /dev/null
+#include <math.h>
+#include <stdint.h>
+#include "bam.h"
+#include "kstring.h"
+#include "bam2bcf.h"
+#include "errmod.h"
+#include "bcftools/bcf.h"
+
+extern void ks_introsort_uint32_t(size_t n, uint32_t a[]);
+
+#define CALL_ETA 0.03f
+#define CALL_MAX 256
+#define CALL_DEFTHETA 0.83f
+
+#define CAP_DIST 25
+
+bcf_callaux_t *bcf_call_init(double theta, int min_baseQ)
+{
+ bcf_callaux_t *bca;
+ if (theta <= 0.) theta = CALL_DEFTHETA;
+ bca = calloc(1, sizeof(bcf_callaux_t));
+ bca->capQ = 60;
+ bca->openQ = 40; bca->extQ = 20; bca->tandemQ = 100;
+ bca->min_baseQ = min_baseQ;
+ bca->e = errmod_init(1. - theta);
+ return bca;
+}
+
+void bcf_call_destroy(bcf_callaux_t *bca)
+{
+ if (bca == 0) return;
+ errmod_destroy(bca->e);
+ free(bca->bases); free(bca->inscns); free(bca);
+}
+/* ref_base is the 4-bit representation of the reference base. It is
+ * negative if we are looking at an indel. */
+int bcf_call_glfgen(int _n, const bam_pileup1_t *pl, int ref_base, bcf_callaux_t *bca, bcf_callret1_t *r)
+{
+ int i, n, ref4, is_indel, ori_depth = 0;
+ memset(r, 0, sizeof(bcf_callret1_t));
+ if (ref_base >= 0) {
+ ref4 = bam_nt16_nt4_table[ref_base];
+ is_indel = 0;
+ } else ref4 = 4, is_indel = 1;
+ if (_n == 0) return -1;
+ // enlarge the bases array if necessary
+ if (bca->max_bases < _n) {
+ bca->max_bases = _n;
+ kroundup32(bca->max_bases);
+ bca->bases = (uint16_t*)realloc(bca->bases, 2 * bca->max_bases);
+ }
+ // fill the bases array
+ memset(r, 0, sizeof(bcf_callret1_t));
+ for (i = n = 0; i < _n; ++i) {
+ const bam_pileup1_t *p = pl + i;
+ int q, b, mapQ, baseQ, is_diff, min_dist, seqQ;
+ // set base
+ if (p->is_del || p->is_refskip || (p->b->core.flag&BAM_FUNMAP)) continue;
+ ++ori_depth;
+ baseQ = q = is_indel? p->aux&0xff : (int)bam1_qual(p->b)[p->qpos]; // base/indel quality
+ seqQ = is_indel? (p->aux>>8&0xff) : 99;
+ if (q < bca->min_baseQ) continue;
+ if (q > seqQ) q = seqQ;
+ mapQ = p->b->core.qual < bca->capQ? p->b->core.qual : bca->capQ;
+ if (q > mapQ) q = mapQ;
+ if (q > 63) q = 63;
+ if (q < 4) q = 4;
+ if (!is_indel) {
+ b = bam1_seqi(bam1_seq(p->b), p->qpos); // base
+ b = bam_nt16_nt4_table[b? b : ref_base]; // b is the 2-bit base
+ is_diff = (ref4 < 4 && b == ref4)? 0 : 1;
+ } else {
+ b = p->aux>>16&0x3f;
+ is_diff = (b != 0);
+ }
+ bca->bases[n++] = q<<5 | (int)bam1_strand(p->b)<<4 | b;
+ // collect annotations
+ r->qsum[b] += q;
+ ++r->anno[0<<2|is_diff<<1|bam1_strand(p->b)];
+ min_dist = p->b->core.l_qseq - 1 - p->qpos;
+ if (min_dist > p->qpos) min_dist = p->qpos;
+ if (min_dist > CAP_DIST) min_dist = CAP_DIST;
+ r->anno[1<<2|is_diff<<1|0] += baseQ;
+ r->anno[1<<2|is_diff<<1|1] += baseQ * baseQ;
+ r->anno[2<<2|is_diff<<1|0] += mapQ;
+ r->anno[2<<2|is_diff<<1|1] += mapQ * mapQ;
+ r->anno[3<<2|is_diff<<1|0] += min_dist;
+ r->anno[3<<2|is_diff<<1|1] += min_dist * min_dist;
+ }
+ r->depth = n; r->ori_depth = ori_depth;
+ // glfgen
+ errmod_cal(bca->e, n, 5, bca->bases, r->p);
+ return r->depth;
+}
+
+int bcf_call_combine(int n, const bcf_callret1_t *calls, int ref_base /*4-bit*/, bcf_call_t *call)
+{
+ int ref4, i, j, qsum[4];
+ int64_t tmp;
+ if (ref_base >= 0) {
+ call->ori_ref = ref4 = bam_nt16_nt4_table[ref_base];
+ if (ref4 > 4) ref4 = 4;
+ } else call->ori_ref = -1, ref4 = 0;
+ // calculate qsum
+ memset(qsum, 0, 4 * sizeof(int));
+ for (i = 0; i < n; ++i)
+ for (j = 0; j < 4; ++j)
+ qsum[j] += calls[i].qsum[j];
+ for (j = 0; j < 4; ++j) qsum[j] = qsum[j] << 2 | j;
+ // find the top 2 alleles
+ for (i = 1; i < 4; ++i) // insertion sort
+ for (j = i; j > 0 && qsum[j] < qsum[j-1]; --j)
+ tmp = qsum[j], qsum[j] = qsum[j-1], qsum[j-1] = tmp;
+ // set the reference allele and alternative allele(s)
+ for (i = 0; i < 5; ++i) call->a[i] = -1;
+ call->unseen = -1;
+ call->a[0] = ref4;
+ for (i = 3, j = 1; i >= 0; --i) {
+ if ((qsum[i]&3) != ref4) {
+ if (qsum[i]>>2 != 0) call->a[j++] = qsum[i]&3;
+ else break;
+ }
+ }
+ if (ref_base >= 0) { // for SNPs, find the "unseen" base
+ if (((ref4 < 4 && j < 4) || (ref4 == 4 && j < 5)) && i >= 0)
+ call->unseen = j, call->a[j++] = qsum[i]&3;
+ call->n_alleles = j;
+ } else {
+ call->n_alleles = j;
+ if (call->n_alleles == 1) return -1; // no reliable supporting read. stop doing anything
+ }
+ // set the PL array
+ if (call->n < n) {
+ call->n = n;
+ call->PL = realloc(call->PL, 15 * n);
+ }
+ {
+ int x, g[15], z;
+ double sum_min = 0.;
+ x = call->n_alleles * (call->n_alleles + 1) / 2;
+ // get the possible genotypes
+ for (i = z = 0; i < call->n_alleles; ++i)
+ for (j = i; j < call->n_alleles; ++j)
+ g[z++] = call->a[i] * 5 + call->a[j];
+ for (i = 0; i < n; ++i) {
+ uint8_t *PL = call->PL + x * i;
+ const bcf_callret1_t *r = calls + i;
+ float min = 1e37;
+ for (j = 0; j < x; ++j)
+ if (min > r->p[g[j]]) min = r->p[g[j]];
+ sum_min += min;
+ for (j = 0; j < x; ++j) {
+ int y;
+ y = (int)(r->p[g[j]] - min + .499);
+ if (y > 255) y = 255;
+ PL[j] = y;
+ }
+ }
+// if (ref_base < 0) fprintf(stderr, "%d,%d,%f,%d\n", call->n_alleles, x, sum_min, call->unseen);
+ call->shift = (int)(sum_min + .499);
+ }
+ // combine annotations
+ memset(call->anno, 0, 16 * sizeof(int));
+ for (i = call->depth = call->ori_depth = 0, tmp = 0; i < n; ++i) {
+ call->depth += calls[i].depth;
+ call->ori_depth += calls[i].ori_depth;
+ for (j = 0; j < 16; ++j) call->anno[j] += calls[i].anno[j];
+ }
+ return 0;
+}
+
+int bcf_call2bcf(int tid, int pos, bcf_call_t *bc, bcf1_t *b, bcf_callret1_t *bcr, int is_SP,
+ const bcf_callaux_t *bca, const char *ref)
+{
+ extern double kt_fisher_exact(int n11, int n12, int n21, int n22, double *_left, double *_right, double *two);
+ kstring_t s;
+ int i, j;
+ b->n_smpl = bc->n;
+ b->tid = tid; b->pos = pos; b->qual = 0;
+ s.s = b->str; s.m = b->m_str; s.l = 0;
+ kputc('\0', &s);
+ if (bc->ori_ref < 0) { // an indel
+ // write REF
+ kputc(ref[pos], &s);
+ for (j = 0; j < bca->indelreg; ++j) kputc(ref[pos+1+j], &s);
+ kputc('\0', &s);
+ // write ALT
+ kputc(ref[pos], &s);
+ for (i = 1; i < 4; ++i) {
+ if (bc->a[i] < 0) break;
+ if (i > 1) {
+ kputc(',', &s); kputc(ref[pos], &s);
+ }
+ if (bca->indel_types[bc->a[i]] < 0) { // deletion
+ for (j = -bca->indel_types[bc->a[i]]; j < bca->indelreg; ++j)
+ kputc(ref[pos+1+j], &s);
+ } else { // insertion; cannot be a reference unless a bug
+ char *inscns = &bca->inscns[bc->a[i] * bca->maxins];
+ for (j = 0; j < bca->indel_types[bc->a[i]]; ++j)
+ kputc("ACGTN"[(int)inscns[j]], &s);
+ for (j = 0; j < bca->indelreg; ++j) kputc(ref[pos+1+j], &s);
+ }
+ }
+ kputc('\0', &s);
+ } else { // a SNP
+ kputc("ACGTN"[bc->ori_ref], &s); kputc('\0', &s);
+ for (i = 1; i < 5; ++i) {
+ if (bc->a[i] < 0) break;
+ if (i > 1) kputc(',', &s);
+ kputc(bc->unseen == i? 'X' : "ACGT"[bc->a[i]], &s);
+ }
+ kputc('\0', &s);
+ }
+ kputc('\0', &s);
+ // INFO
+ if (bc->ori_ref < 0) kputs("INDEL;", &s);
+ kputs("DP=", &s); kputw(bc->ori_depth, &s); kputs(";I16=", &s);
+ for (i = 0; i < 16; ++i) {
+ if (i) kputc(',', &s);
+ kputw(bc->anno[i], &s);
+ }
+ kputc('\0', &s);
+ // FMT
+ kputs("PL", &s);
+ if (bcr) {
+ kputs(":DP", &s);
+ if (is_SP) kputs(":SP", &s);
+ }
+ kputc('\0', &s);
+ b->m_str = s.m; b->str = s.s; b->l_str = s.l;
+ bcf_sync(b);
+ memcpy(b->gi[0].data, bc->PL, b->gi[0].len * bc->n);
+ if (bcr) {
+ uint16_t *dp = (uint16_t*)b->gi[1].data;
+ uint8_t *sp = is_SP? b->gi[2].data : 0;
+ for (i = 0; i < bc->n; ++i) {
+ bcf_callret1_t *p = bcr + i;
+ dp[i] = p->depth < 0xffff? p->depth : 0xffff;
+ if (is_SP) {
+ if (p->anno[0] + p->anno[1] < 2 || p->anno[2] + p->anno[3] < 2
+ || p->anno[0] + p->anno[2] < 2 || p->anno[1] + p->anno[3] < 2)
+ {
+ sp[i] = 0;
+ } else {
+ double left, right, two;
+ int x;
+ kt_fisher_exact(p->anno[0], p->anno[1], p->anno[2], p->anno[3], &left, &right, &two);
+ x = (int)(-4.343 * log(two) + .499);
+ if (x > 255) x = 255;
+ sp[i] = x;
+ }
+ }
+ }
+ }
+ return 0;
+}
--- /dev/null
+#ifndef BAM2BCF_H
+#define BAM2BCF_H
+
+#include <stdint.h>
+#include "errmod.h"
+#include "bcftools/bcf.h"
+
+#define B2B_INDEL_NULL 10000
+
+typedef struct __bcf_callaux_t {
+ int capQ, min_baseQ;
+ int openQ, extQ, tandemQ;
+ // for internal uses
+ int max_bases;
+ int indel_types[4];
+ int maxins, indelreg;
+ char *inscns;
+ uint16_t *bases;
+ errmod_t *e;
+ void *rghash;
+} bcf_callaux_t;
+
+typedef struct {
+ int depth, ori_depth, qsum[4];
+ int anno[16];
+ float p[25];
+} bcf_callret1_t;
+
+typedef struct {
+ int a[5]; // alleles: ref, alt, alt2, alt3
+ int n, n_alleles, shift, ori_ref, unseen;
+ int anno[16], depth, ori_depth;
+ uint8_t *PL;
+} bcf_call_t;
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+ bcf_callaux_t *bcf_call_init(double theta, int min_baseQ);
+ void bcf_call_destroy(bcf_callaux_t *bca);
+ int bcf_call_glfgen(int _n, const bam_pileup1_t *pl, int ref_base, bcf_callaux_t *bca, bcf_callret1_t *r);
+ int bcf_call_combine(int n, const bcf_callret1_t *calls, int ref_base /*4-bit*/, bcf_call_t *call);
+ int bcf_call2bcf(int tid, int pos, bcf_call_t *bc, bcf1_t *b, bcf_callret1_t *bcr, int is_SP,
+ const bcf_callaux_t *bca, const char *ref);
+ int bcf_call_gap_prep(int n, int *n_plp, bam_pileup1_t **plp, int pos, bcf_callaux_t *bca, const char *ref,
+ const void *rghash);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
--- /dev/null
+#include <assert.h>
+#include <ctype.h>
+#include <string.h>
+#include "bam.h"
+#include "bam2bcf.h"
+#include "ksort.h"
+#include "kaln.h"
+#include "kprobaln.h"
+#include "khash.h"
+KHASH_SET_INIT_STR(rg)
+
+#define MINUS_CONST 0x10000000
+#define INDEL_WINDOW_SIZE 50
+#define MIN_SUPPORT_COEF 500
+
+void *bcf_call_add_rg(void *_hash, const char *hdtext, const char *list)
+{
+ const char *s, *p, *q, *r, *t;
+ khash_t(rg) *hash;
+ if (list == 0 || hdtext == 0) return _hash;
+ if (_hash == 0) _hash = kh_init(rg);
+ hash = (khash_t(rg)*)_hash;
+ if ((s = strstr(hdtext, "@RG\t")) == 0) return hash;
+ do {
+ t = strstr(s + 4, "@RG\t"); // the next @RG
+ if ((p = strstr(s, "\tID:")) != 0) p += 4;
+ if ((q = strstr(s, "\tPL:")) != 0) q += 4;
+ if (p && q && (t == 0 || (p < t && q < t))) { // ID and PL are both present
+ int lp, lq;
+ char *x;
+ for (r = p; *r && *r != '\t' && *r != '\n'; ++r); lp = r - p;
+ for (r = q; *r && *r != '\t' && *r != '\n'; ++r); lq = r - q;
+ x = calloc((lp > lq? lp : lq) + 1, 1);
+ for (r = q; *r && *r != '\t' && *r != '\n'; ++r) x[r-q] = *r;
+ if (strstr(list, x)) { // insert ID to the hash table
+ khint_t k;
+ int ret;
+ for (r = p; *r && *r != '\t' && *r != '\n'; ++r) x[r-p] = *r;
+ x[r-p] = 0;
+ k = kh_get(rg, hash, x);
+ if (k == kh_end(hash)) k = kh_put(rg, hash, x, &ret);
+ else free(x);
+ } else free(x);
+ }
+ s = t;
+ } while (s);
+ return hash;
+}
+
+void bcf_call_del_rghash(void *_hash)
+{
+ khint_t k;
+ khash_t(rg) *hash = (khash_t(rg)*)_hash;
+ if (hash == 0) return;
+ for (k = kh_begin(hash); k < kh_end(hash); ++k)
+ if (kh_exist(hash, k))
+ free((char*)kh_key(hash, k));
+ kh_destroy(rg, hash);
+}
+
+static int tpos2qpos(const bam1_core_t *c, const uint32_t *cigar, int32_t tpos, int is_left, int32_t *_tpos)
+{
+ int k, x = c->pos, y = 0, last_y = 0;
+ *_tpos = c->pos;
+ for (k = 0; k < c->n_cigar; ++k) {
+ int op = cigar[k] & BAM_CIGAR_MASK;
+ int l = cigar[k] >> BAM_CIGAR_SHIFT;
+ if (op == BAM_CMATCH) {
+ if (c->pos > tpos) return y;
+ if (x + l > tpos) {
+ *_tpos = tpos;
+ return y + (tpos - x);
+ }
+ x += l; y += l;
+ last_y = y;
+ } else if (op == BAM_CINS || op == BAM_CSOFT_CLIP) y += l;
+ else if (op == BAM_CDEL || op == BAM_CREF_SKIP) {
+ if (x + l > tpos) {
+ *_tpos = is_left? x : x + l;
+ return y;
+ }
+ x += l;
+ }
+ }
+ *_tpos = x;
+ return last_y;
+}
+// FIXME: check if the inserted sequence is consistent with the homopolymer run
+// l is the relative gap length and l_run is the length of the homopolymer on the reference
+static inline int est_seqQ(const bcf_callaux_t *bca, int l, int l_run)
+{
+ int q, qh;
+ q = bca->openQ + bca->extQ * (abs(l) - 1);
+ qh = l_run >= 3? (int)(bca->tandemQ * (double)abs(l) / l_run + .499) : 1000;
+ return q < qh? q : qh;
+}
+
+static inline int est_indelreg(int pos, const char *ref, int l, char *ins4)
+{
+ int i, j, max = 0, max_i = pos, score = 0;
+ l = abs(l);
+ for (i = pos + 1, j = 0; ref[i]; ++i, ++j) {
+ if (ins4) score += (toupper(ref[i]) != "ACGTN"[(int)ins4[j%l]])? -10 : 1;
+ else score += (toupper(ref[i]) != toupper(ref[pos+1+j%l]))? -10 : 1;
+ if (score < 0) break;
+ if (max < score) max = score, max_i = i;
+ }
+ return max_i - pos;
+}
+
+int bcf_call_gap_prep(int n, int *n_plp, bam_pileup1_t **plp, int pos, bcf_callaux_t *bca, const char *ref,
+ const void *rghash)
+{
+ extern void ks_introsort_uint32_t(int, uint32_t*);
+ int i, s, j, k, t, n_types, *types, max_rd_len, left, right, max_ins, *score1, *score2, max_ref2;
+ int N, K, l_run, ref_type, n_alt;
+ char *inscns = 0, *ref2, *query;
+ khash_t(rg) *hash = (khash_t(rg)*)rghash;
+ if (ref == 0 || bca == 0) return -1;
+ // mark filtered reads
+ if (rghash) {
+ N = 0;
+ for (s = N = 0; s < n; ++s) {
+ for (i = 0; i < n_plp[s]; ++i) {
+ bam_pileup1_t *p = plp[s] + i;
+ const uint8_t *rg = bam_aux_get(p->b, "RG");
+ p->aux = 1; // filtered by default
+ if (rg) {
+ khint_t k = kh_get(rg, hash, (const char*)(rg + 1));
+ if (k != kh_end(hash)) p->aux = 0, ++N; // not filtered
+ }
+ }
+ }
+ if (N == 0) return -1; // no reads left
+ }
+ // determine if there is a gap
+ for (s = N = 0; s < n; ++s) {
+ for (i = 0; i < n_plp[s]; ++i)
+ if (plp[s][i].indel != 0) break;
+ if (i < n_plp[s]) break;
+ }
+ if (s == n) return -1; // there is no indel at this position.
+ for (s = N = 0; s < n; ++s) N += n_plp[s]; // N is the total number of reads
+ { // find out how many types of indels are present
+ int m, n_alt = 0, n_tot = 0;
+ uint32_t *aux;
+ aux = calloc(N + 1, 4);
+ m = max_rd_len = 0;
+ aux[m++] = MINUS_CONST; // zero indel is always a type
+ for (s = 0; s < n; ++s) {
+ for (i = 0; i < n_plp[s]; ++i) {
+ const bam_pileup1_t *p = plp[s] + i;
+ if (rghash == 0 || p->aux == 0) {
+ ++n_tot;
+ if (p->indel != 0) {
+ ++n_alt;
+ aux[m++] = MINUS_CONST + p->indel;
+ }
+ }
+ j = bam_cigar2qlen(&p->b->core, bam1_cigar(p->b));
+ if (j > max_rd_len) max_rd_len = j;
+ }
+ }
+ ks_introsort(uint32_t, m, aux);
+ // squeeze out identical types
+ for (i = 1, n_types = 1; i < m; ++i)
+ if (aux[i] != aux[i-1]) ++n_types;
+ if (n_types == 1 || n_alt * MIN_SUPPORT_COEF < n_tot) { // no indels or too few supporting reads
+ free(aux); return -1;
+ }
+ types = (int*)calloc(n_types, sizeof(int));
+ t = 0;
+ types[t++] = aux[0] - MINUS_CONST;
+ for (i = 1; i < m; ++i)
+ if (aux[i] != aux[i-1])
+ types[t++] = aux[i] - MINUS_CONST;
+ free(aux);
+ for (t = 0; t < n_types; ++t)
+ if (types[t] == 0) break;
+ ref_type = t; // the index of the reference type (0)
+ assert(n_types < 64);
+ }
+ { // calculate left and right boundary
+ left = pos > INDEL_WINDOW_SIZE? pos - INDEL_WINDOW_SIZE : 0;
+ right = pos + INDEL_WINDOW_SIZE;
+ if (types[0] < 0) right -= types[0];
+ // in case the alignments stand out the reference
+ for (i = pos; i < right; ++i)
+ if (ref[i] == 0) break;
+ right = i;
+ }
+ { // the length of the homopolymer run around the current position
+ int c = bam_nt16_table[(int)ref[pos + 1]];
+ if (c == 15) l_run = 1;
+ else {
+ for (i = pos + 2; ref[i]; ++i)
+ if (bam_nt16_table[(int)ref[i]] != c) break;
+ l_run = i;
+ for (i = pos; i >= 0; --i)
+ if (bam_nt16_table[(int)ref[i]] != c) break;
+ l_run -= i + 1;
+ }
+ }
+ // construct the consensus sequence
+ max_ins = types[n_types - 1]; // max_ins is at least 0
+ if (max_ins > 0) {
+ int *inscns_aux = calloc(4 * n_types * max_ins, sizeof(int));
+ // count the number of occurrences of each base at each position for each type of insertion
+ for (t = 0; t < n_types; ++t) {
+ if (types[t] > 0) {
+ for (s = 0; s < n; ++s) {
+ for (i = 0; i < n_plp[s]; ++i) {
+ bam_pileup1_t *p = plp[s] + i;
+ if (p->indel == types[t]) {
+ uint8_t *seq = bam1_seq(p->b);
+ for (k = 1; k <= p->indel; ++k) {
+ int c = bam_nt16_nt4_table[bam1_seqi(seq, p->qpos + k)];
+ if (c < 4) ++inscns_aux[(t*max_ins+(k-1))*4 + c];
+ }
+ }
+ }
+ }
+ }
+ }
+ // use the majority rule to construct the consensus
+ inscns = calloc(n_types * max_ins, 1);
+ for (t = 0; t < n_types; ++t) {
+ for (j = 0; j < types[t]; ++j) {
+ int max = 0, max_k = -1, *ia = &inscns_aux[(t*max_ins+j)*4];
+ for (k = 0; k < 4; ++k)
+ if (ia[k] > max)
+ max = ia[k], max_k = k;
+ inscns[t*max_ins + j] = max? max_k : 4;
+ }
+ }
+ free(inscns_aux);
+ }
+ // compute the likelihood given each type of indel for each read
+ max_ref2 = right - left + 2 + 2 * (max_ins > -types[0]? max_ins : -types[0]);
+ ref2 = calloc(max_ref2, 1);
+ query = calloc(right - left + max_rd_len + max_ins + 2, 1);
+ score1 = calloc(N * n_types, sizeof(int));
+ score2 = calloc(N * n_types, sizeof(int));
+ bca->indelreg = 0;
+ for (t = 0; t < n_types; ++t) {
+ int l, ir;
+ kpa_par_t apf1 = { 1e-4, 1e-2, 10 }, apf2 = { 1e-6, 1e-3, 10 };
+ apf1.bw = apf2.bw = abs(types[t]) + 3;
+ // compute indelreg
+ if (types[t] == 0) ir = 0;
+ else if (types[t] > 0) ir = est_indelreg(pos, ref, types[t], &inscns[t*max_ins]);
+ else ir = est_indelreg(pos, ref, -types[t], 0);
+ if (ir > bca->indelreg) bca->indelreg = ir;
+// fprintf(stderr, "%d, %d, %d\n", pos, types[t], ir);
+ // write ref2
+ for (k = 0, j = left; j <= pos; ++j)
+ ref2[k++] = bam_nt16_nt4_table[bam_nt16_table[(int)ref[j]]];
+ if (types[t] <= 0) j += -types[t];
+ else for (l = 0; l < types[t]; ++l)
+ ref2[k++] = inscns[t*max_ins + l];
+ if (types[0] < 0) { // mask deleted sequences to avoid a particular error in the model.
+ int jj, tmp = types[t] >= 0? -types[0] : -types[0] + types[t];
+ for (jj = 0; jj < tmp && j < right && ref[j]; ++jj, ++j)
+ ref2[k++] = 4;
+ }
+ for (; j < right && ref[j]; ++j)
+ ref2[k++] = bam_nt16_nt4_table[bam_nt16_table[(int)ref[j]]];
+ for (; k < max_ref2; ++k) ref2[k] = 4;
+ if (j < right) right = j;
+ // align each read to ref2
+ for (s = K = 0; s < n; ++s) {
+ for (i = 0; i < n_plp[s]; ++i, ++K) {
+ bam_pileup1_t *p = plp[s] + i;
+ int qbeg, qend, tbeg, tend, sc;
+ uint8_t *seq = bam1_seq(p->b);
+ // FIXME: the following skips soft clips, but using them may be more sensitive.
+ // determine the start and end of sequences for alignment
+ qbeg = tpos2qpos(&p->b->core, bam1_cigar(p->b), left, 0, &tbeg);
+ qend = tpos2qpos(&p->b->core, bam1_cigar(p->b), right, 1, &tend);
+ if (types[t] < 0) {
+ int l = -types[t];
+ tbeg = tbeg - l > left? tbeg - l : left;
+ }
+ // write the query sequence
+ for (l = qbeg; l < qend; ++l)
+ query[l - qbeg] = bam_nt16_nt4_table[bam1_seqi(seq, l)];
+ { // do realignment; this is the bottleneck
+ const uint8_t *qual = bam1_qual(p->b), *bq;
+ uint8_t *qq;
+ qq = calloc(qend - qbeg, 1);
+ bq = (uint8_t*)bam_aux_get(p->b, "ZQ");
+ if (bq) ++bq; // skip type
+ for (l = qbeg; l < qend; ++l) {
+ qq[l - qbeg] = bq? qual[l] + (bq[l] - 64) : qual[l];
+ if (qq[l - qbeg] > 30) qq[l - qbeg] = 30;
+ if (qq[l - qbeg] < 7) qq[l - qbeg] = 7;
+ }
+ sc = kpa_glocal((uint8_t*)ref2 + tbeg - left, tend - tbeg + abs(types[t]),
+ (uint8_t*)query, qend - qbeg, qq, &apf1, 0, 0);
+ l = (int)(100. * sc / (qend - qbeg) + .499); // used for adjusting indelQ below
+ if (l > 255) l = 255;
+ score1[K*n_types + t] = score2[K*n_types + t] = sc<<8 | l;
+ if (sc > 5) {
+ sc = kpa_glocal((uint8_t*)ref2 + tbeg - left, tend - tbeg + abs(types[t]),
+ (uint8_t*)query, qend - qbeg, qq, &apf2, 0, 0);
+ l = (int)(100. * sc / (qend - qbeg) + .499);
+ if (l > 255) l = 255;
+ score2[K*n_types + t] = sc<<8 | l;
+ }
+ free(qq);
+ }
+/*
+ for (l = 0; l < tend - tbeg + abs(types[t]); ++l)
+ fputc("ACGTN"[(int)ref2[tbeg-left+l]], stderr);
+ fputc('\n', stderr);
+ for (l = 0; l < qend - qbeg; ++l) fputc("ACGTN"[(int)query[l]], stderr);
+ fputc('\n', stderr);
+ fprintf(stderr, "pos=%d type=%d read=%d:%d name=%s qbeg=%d tbeg=%d score=%d\n", pos, types[t], s, i, bam1_qname(p->b), qbeg, tbeg, sc);
+*/
+ }
+ }
+ }
+ free(ref2); free(query);
+ { // compute indelQ
+ int *sc, tmp, *sumq;
+ sc = alloca(n_types * sizeof(int));
+ sumq = alloca(n_types * sizeof(int));
+ memset(sumq, 0, sizeof(int) * n_types);
+ for (s = K = 0; s < n; ++s) {
+ for (i = 0; i < n_plp[s]; ++i, ++K) {
+ bam_pileup1_t *p = plp[s] + i;
+ int *sct = &score1[K*n_types], indelQ1, indelQ2, seqQ, indelQ;
+ for (t = 0; t < n_types; ++t) sc[t] = sct[t]<<6 | t;
+ for (t = 1; t < n_types; ++t) // insertion sort
+ for (j = t; j > 0 && sc[j] < sc[j-1]; --j)
+ tmp = sc[j], sc[j] = sc[j-1], sc[j-1] = tmp;
+ /* errmod_cal() assumes that if the call is wrong, the
+ * likelihoods of other events are equal. This is about
+ * right for substitutions, but is not desired for
+ * indels. To reuse errmod_cal(), I have to make
+ * compromise for multi-allelic indels.
+ */
+ if ((sc[0]&0x3f) == ref_type) {
+ indelQ1 = (sc[1]>>14) - (sc[0]>>14);
+ seqQ = est_seqQ(bca, types[sc[1]&0x3f], l_run);
+ } else {
+ for (t = 0; t < n_types; ++t) // look for the reference type
+ if ((sc[t]&0x3f) == ref_type) break;
+ indelQ1 = (sc[t]>>14) - (sc[0]>>14);
+ seqQ = est_seqQ(bca, types[sc[0]&0x3f], l_run);
+ }
+ tmp = sc[0]>>6 & 0xff;
+ indelQ1 = tmp > 111? 0 : (int)((1. - tmp/111.) * indelQ1 + .499); // reduce indelQ
+ sct = &score2[K*n_types];
+ for (t = 0; t < n_types; ++t) sc[t] = sct[t]<<6 | t;
+ for (t = 1; t < n_types; ++t) // insertion sort
+ for (j = t; j > 0 && sc[j] < sc[j-1]; --j)
+ tmp = sc[j], sc[j] = sc[j-1], sc[j-1] = tmp;
+ if ((sc[0]&0x3f) == ref_type) {
+ indelQ2 = (sc[1]>>14) - (sc[0]>>14);
+ } else {
+ for (t = 0; t < n_types; ++t) // look for the reference type
+ if ((sc[t]&0x3f) == ref_type) break;
+ indelQ2 = (sc[t]>>14) - (sc[0]>>14);
+ }
+ tmp = sc[0]>>6 & 0xff;
+ indelQ2 = tmp > 111? 0 : (int)((1. - tmp/111.) * indelQ2 + .499);
+ // pick the smaller between indelQ1 and indelQ2
+ indelQ = indelQ1 < indelQ2? indelQ1 : indelQ2;
+ p->aux = (sc[0]&0x3f)<<16 | seqQ<<8 | indelQ;
+ sumq[sc[0]&0x3f] += indelQ < seqQ? indelQ : seqQ;
+// fprintf(stderr, "pos=%d read=%d:%d name=%s call=%d q=%d\n", pos, s, i, bam1_qname(p->b), types[sc[0]&0x3f], indelQ);
+ }
+ }
+ // determine bca->indel_types[] and bca->inscns
+ bca->maxins = max_ins;
+ bca->inscns = realloc(bca->inscns, bca->maxins * 4);
+ for (t = 0; t < n_types; ++t)
+ sumq[t] = sumq[t]<<6 | t;
+ for (t = 1; t < n_types; ++t) // insertion sort
+ for (j = t; j > 0 && sumq[j] > sumq[j-1]; --j)
+ tmp = sumq[j], sumq[j] = sumq[j-1], sumq[j-1] = tmp;
+ for (t = 0; t < n_types; ++t) // look for the reference type
+ if ((sumq[t]&0x3f) == ref_type) break;
+ if (t) { // then move the reference type to the first
+ tmp = sumq[t];
+ for (; t > 0; --t) sumq[t] = sumq[t-1];
+ sumq[0] = tmp;
+ }
+ for (t = 0; t < 4; ++t) bca->indel_types[t] = B2B_INDEL_NULL;
+ for (t = 0; t < 4 && t < n_types; ++t) {
+ bca->indel_types[t] = types[sumq[t]&0x3f];
+ memcpy(&bca->inscns[t * bca->maxins], &inscns[(sumq[t]&0x3f) * max_ins], bca->maxins);
+ }
+ // update p->aux
+ for (s = n_alt = 0; s < n; ++s) {
+ for (i = 0; i < n_plp[s]; ++i) {
+ bam_pileup1_t *p = plp[s] + i;
+ int x = types[p->aux>>16&0x3f];
+ for (j = 0; j < 4; ++j)
+ if (x == bca->indel_types[j]) break;
+ p->aux = j<<16 | (j == 4? 0 : (p->aux&0xffff));
+ if ((p->aux>>16&0x3f) > 0) ++n_alt;
+// fprintf(stderr, "X pos=%d read=%d:%d name=%s call=%d type=%d q=%d seqQ=%d\n", pos, s, i, bam1_qname(p->b), p->aux>>16&63, bca->indel_types[p->aux>>16&63], p->aux&0xff, p->aux>>8&0xff);
+ }
+ }
+ }
+ free(score1); free(score2);
+ // free
+ free(types); free(inscns);
+ return n_alt > 0? 0 : -1;
+}
--- /dev/null
+CC= gcc
+CFLAGS= -g -Wall -O2 #-m64 #-arch ppc
+DFLAGS= -D_FILE_OFFSET_BITS=64 -D_USE_KNETFILE
+LOBJS= bcf.o vcf.o bcfutils.o prob1.o ld.o kfunc.o index.o fet.o bcf2qcall.o
+OMISC= ..
+AOBJS= call1.o main.o $(OMISC)/kstring.o $(OMISC)/bgzf.o $(OMISC)/knetfile.o
+PROG= bcftools
+INCLUDES=
+SUBDIRS= .
+
+.SUFFIXES:.c .o
+
+.c.o:
+ $(CC) -c $(CFLAGS) $(DFLAGS) -I.. $(INCLUDES) $< -o $@
+
+all-recur lib-recur clean-recur cleanlocal-recur install-recur:
+ @target=`echo $@ | sed s/-recur//`; \
+ wdir=`pwd`; \
+ list='$(SUBDIRS)'; for subdir in $$list; do \
+ cd $$subdir; \
+ $(MAKE) CC="$(CC)" DFLAGS="$(DFLAGS)" CFLAGS="$(CFLAGS)" \
+ INCLUDES="$(INCLUDES)" LIBPATH="$(LIBPATH)" $$target || exit 1; \
+ cd $$wdir; \
+ done;
+
+all:$(PROG)
+
+lib:libbcf.a
+
+libbcf.a:$(LOBJS)
+ $(AR) -cru $@ $(LOBJS)
+
+bcftools:lib $(AOBJS)
+ $(CC) $(CFLAGS) -o $@ $(AOBJS) -lm $(LIBPATH) -lz -L. -lbcf
+
+bcf.o:bcf.h
+vcf.o:bcf.h
+index.o:bcf.h
+bcfutils.o:bcf.h
+prob1.o:prob1.h bcf.h
+call1.o:prob1.h bcf.h
+bcf2qcall.o:bcf.h
+main.o:bcf.h
+
+bcf.pdf:bcf.tex
+ pdflatex bcf
+
+cleanlocal:
+ rm -fr gmon.out *.o a.out *.dSYM $(PROG) *~ *.a bcf.aux bcf.log bcf.pdf *.class libbcf.*.dylib libbcf.so*
+
+clean:cleanlocal-recur
--- /dev/null
+The view command of bcftools calls variants, tests Hardy-Weinberg
+equilibrium (HWE), tests allele balances and estimates allele frequency.
+
+This command calls a site as a potential variant if P(ref|D,F) is below
+0.9 (controlled by the -p option), where D is data and F is the prior
+allele frequency spectrum (AFS).
+
+The view command performs two types of allele balance tests, both based
+on Fisher's exact test for 2x2 contingency tables with the row variable
+being reference allele or not. In the first table, the column variable
+is strand. Two-tail P-value is taken. We test if variant bases tend to
+come from one strand. In the second table, the column variable is
+whether a base appears in the first or the last 11bp of the read.
+One-tail P-value is taken. We test if variant bases tend to occur
+towards the end of reads, which is usually an indication of
+misalignment.
+
+Site allele frequency is estimated in two ways. In the first way, the
+frequency is esimated as \argmax_f P(D|f) under the assumption of
+HWE. Prior AFS is not used. In the second way, the frequency is
+estimated as the posterior expectation of allele counts \sum_k
+kP(k|D,F), dividied by the total number of haplotypes. HWE is not
+assumed, but the estimate depends on the prior AFS. The two estimates
+largely agree when the signal is strong, but may differ greatly on weak
+sites as in this case, the prior plays an important role.
+
+To test HWE, we calculate the posterior distribution of genotypes
+(ref-hom, het and alt-hom). Chi-square test is performed. It is worth
+noting that the model used here is prior dependent and assumes HWE,
+which is different from both models for allele frequency estimate. The
+new model actually yields a third estimate of site allele frequency.
+
+The estimate allele frequency spectrum is printed to stderr per 64k
+sites. The estimate is in fact only the first round of a EM
+procedure. The second model (not the model for HWE testing) is used to
+estimate the AFS.
\ No newline at end of file
--- /dev/null
+#!/usr/bin/perl -w
+
+use strict;
+use warnings;
+use Carp;
+
+my $opts = parse_params();
+bcf_fix();
+
+exit;
+
+#--------------------------------
+
+sub error
+{
+ my (@msg) = @_;
+ if ( scalar @msg ) { confess @msg; }
+ die
+ "Usage: bcftools view test.bcf | bcf-fix.pl > test.vcf\n",
+ "Options:\n",
+ " -h, -?, --help This help message.\n",
+ "\n";
+}
+
+
+sub parse_params
+{
+ my $opts = {};
+ while (my $arg=shift(@ARGV))
+ {
+ if ( $arg eq '-?' || $arg eq '-h' || $arg eq '--help' ) { error(); }
+ error("Unknown parameter \"$arg\". Run -h for help.\n");
+ }
+ return $opts;
+}
+
+sub bcf_fix
+{
+ while (my $line=<STDIN>)
+ {
+ if ( $line=~/^#CHROM/ )
+ {
+ print
+qq[##INFO=<ID=DP4,Number=4,Type=Integer,Description="Read depth for 1) forward reference bases; 2) reverse ref; 3) forward non-ref; 4) reverse non-ref">
+##INFO=<ID=PV4,Number=4,Type=Float,Description="P-values for 1) strand bias (exact test); 2) baseQ bias (t-test); 3) mapQ bias (t); 4) tail distance bias (t)">
+##INFO=<ID=AF1,Number=1,Type=Float,Description="EM estimate of site allele frequency without prior">
+##INFO=<ID=AFE,Number=1,Type=Float,Description="Posterior expectation of site allele frequency (with prior)">
+##INFO=<ID=HWE,Number=1,Type=Float,Description="P-value for Hardy-Weinberg equillibrium (chi-square test)">
+##INFO=<ID=MQ,Number=1,Type=Integer,Descriptin="RMS mapping quality">
+##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">
+##FORMAT=<ID=GQ,Number=1,Type=Integer,Description="Genotype Quality">
+##FORMAT=<ID=GL,Number=3,Type=Float,Description="Likelihoods for RR,RA,AA genotypes (R=ref,A=alt)">
+];
+ print $line;
+ }
+ elsif ( $line=~/^#/ )
+ {
+ print $line;
+ }
+ else
+ {
+ my @items = split(/\t/,$line);
+ my @tags = split(/:/,$items[8]); # FORMAT tags
+
+ my $nidx=2;
+ my @idxs; # Mapping which defines new ordering: $idxs[$inew]=$iold; GT comes first, PL second
+ for (my $i=0; $i<@tags; $i++)
+ {
+ if ( $tags[$i] eq 'GT' ) { $idxs[0]=$i; }
+ elsif ( $tags[$i] eq 'PL' ) { $idxs[1]=$i; }
+ else { $idxs[$nidx++]=$i; }
+ }
+ if ( !exists($tags[0]) or !exists($tags[1]) ) { error("FIXME: expected GT and PL in the format field.\n"); }
+
+ # First fix the FORMAT column
+ $items[8] = 'GT:GL';
+ for (my $i=2; $i<@tags; $i++)
+ {
+ $items[8] .= ':'.$tags[$idxs[$i]];
+ }
+
+ # Now all the genotype columns
+ for (my $iitem=9; $iitem<@items; $iitem++)
+ {
+ @tags = split(/:/,$items[$iitem]);
+ $items[$iitem] = $tags[$idxs[0]] .':';
+
+ # GL=-PL/10
+ my ($a,$b,$c) = split(/,/,$tags[$idxs[1]]);
+ $items[$iitem] .= sprintf "%.2f,%.2f,%.2f",-$a/10.,-$b/10.,-$c/10.;
+
+ for (my $itag=2; $itag<@tags; $itag++)
+ {
+ $items[$iitem] .= ':'.$tags[$idxs[$itag]];
+ }
+ }
+ print join("\t",@items);
+ }
+ }
+}
+
--- /dev/null
+#include <string.h>
+#include <ctype.h>
+#include <stdio.h>
+#include "kstring.h"
+#include "bcf.h"
+
+bcf_t *bcf_open(const char *fn, const char *mode)
+{
+ bcf_t *b;
+ b = calloc(1, sizeof(bcf_t));
+ if (strchr(mode, 'w')) {
+ b->fp = strcmp(fn, "-")? bgzf_open(fn, mode) : bgzf_fdopen(fileno(stdout), mode);
+ } else {
+ b->fp = strcmp(fn, "-")? bgzf_open(fn, mode) : bgzf_fdopen(fileno(stdin), mode);
+ }
+#ifndef BCF_LITE
+ b->fp->owned_file = 1;
+#endif
+ return b;
+}
+
+int bcf_close(bcf_t *b)
+{
+ int ret;
+ if (b == 0) return 0;
+ ret = bgzf_close(b->fp);
+ free(b);
+ return ret;
+}
+
+int bcf_hdr_write(bcf_t *b, const bcf_hdr_t *h)
+{
+ if (b == 0 || h == 0) return -1;
+ bgzf_write(b->fp, "BCF\4", 4);
+ bgzf_write(b->fp, &h->l_nm, 4);
+ bgzf_write(b->fp, h->name, h->l_nm);
+ bgzf_write(b->fp, &h->l_smpl, 4);
+ bgzf_write(b->fp, h->sname, h->l_smpl);
+ bgzf_write(b->fp, &h->l_txt, 4);
+ bgzf_write(b->fp, h->txt, h->l_txt);
+ bgzf_flush(b->fp);
+ return 16 + h->l_nm + h->l_smpl + h->l_txt;
+}
+
+bcf_hdr_t *bcf_hdr_read(bcf_t *b)
+{
+ uint8_t magic[4];
+ bcf_hdr_t *h;
+ if (b == 0) return 0;
+ h = calloc(1, sizeof(bcf_hdr_t));
+ bgzf_read(b->fp, magic, 4);
+ bgzf_read(b->fp, &h->l_nm, 4);
+ h->name = malloc(h->l_nm);
+ bgzf_read(b->fp, h->name, h->l_nm);
+ bgzf_read(b->fp, &h->l_smpl, 4);
+ h->sname = malloc(h->l_smpl);
+ bgzf_read(b->fp, h->sname, h->l_smpl);
+ bgzf_read(b->fp, &h->l_txt, 4);
+ h->txt = malloc(h->l_txt);
+ bgzf_read(b->fp, h->txt, h->l_txt);
+ bcf_hdr_sync(h);
+ return h;
+}
+
+void bcf_hdr_destroy(bcf_hdr_t *h)
+{
+ if (h == 0) return;
+ free(h->name); free(h->sname); free(h->txt); free(h->ns); free(h->sns);
+ free(h);
+}
+
+static inline char **cnt_null(int l, char *str, int *_n)
+{
+ int n = 0;
+ char *p, **list;
+ *_n = 0;
+ if (l == 0 || str == 0) return 0;
+ for (p = str; p != str + l; ++p)
+ if (*p == 0) ++n;
+ *_n = n;
+ list = calloc(n, sizeof(void*));
+ list[0] = str;
+ for (p = str, n = 1; p < str + l - 1; ++p)
+ if (*p == 0) list[n++] = p + 1;
+ return list;
+}
+
+int bcf_hdr_sync(bcf_hdr_t *b)
+{
+ if (b == 0) return -1;
+ if (b->ns) free(b->ns);
+ if (b->sns) free(b->sns);
+ if (b->l_nm) b->ns = cnt_null(b->l_nm, b->name, &b->n_ref);
+ else b->ns = 0, b->n_ref = 0;
+ b->sns = cnt_null(b->l_smpl, b->sname, &b->n_smpl);
+ return 0;
+}
+
+int bcf_sync(bcf1_t *b)
+{
+ char *p, *tmp[5];
+ int i, n, n_smpl = b->n_smpl;
+ ks_tokaux_t aux;
+ // set ref, alt, flt, info, fmt
+ b->ref = b->alt = b->flt = b->info = b->fmt = 0;
+ for (p = b->str, n = 0; p < b->str + b->l_str; ++p)
+ if (*p == 0 && p+1 != b->str + b->l_str) tmp[n++] = p + 1;
+ if (n != 5) {
+ fprintf(stderr, "[%s] incorrect number of fields (%d != 5). Corrupted file?\n", __func__, n);
+ return -1;
+ }
+ b->ref = tmp[0]; b->alt = tmp[1]; b->flt = tmp[2]; b->info = tmp[3]; b->fmt = tmp[4];
+ // set n_alleles
+ if (*b->alt == 0) b->n_alleles = 1;
+ else {
+ for (p = b->alt, n = 1; *p; ++p)
+ if (*p == ',') ++n;
+ b->n_alleles = n + 1;
+ }
+ // set n_gi and gi[i].fmt
+ for (p = b->fmt, n = 1; *p; ++p)
+ if (*p == ':') ++n;
+ if (n > b->m_gi) {
+ int old_m = b->m_gi;
+ b->m_gi = n;
+ kroundup32(b->m_gi);
+ b->gi = realloc(b->gi, b->m_gi * sizeof(bcf_ginfo_t));
+ memset(b->gi + old_m, 0, (b->m_gi - old_m) * sizeof(bcf_ginfo_t));
+ }
+ b->n_gi = n;
+ for (p = kstrtok(b->fmt, ":", &aux), n = 0; p; p = kstrtok(0, 0, &aux))
+ b->gi[n++].fmt = bcf_str2int(p, aux.p - p);
+ // set gi[i].len
+ for (i = 0; i < b->n_gi; ++i) {
+ if (b->gi[i].fmt == bcf_str2int("PL", 2)) {
+ b->gi[i].len = b->n_alleles * (b->n_alleles + 1) / 2;
+ } else if (b->gi[i].fmt == bcf_str2int("DP", 2) || b->gi[i].fmt == bcf_str2int("HQ", 2)) {
+ b->gi[i].len = 2;
+ } else if (b->gi[i].fmt == bcf_str2int("GQ", 2) || b->gi[i].fmt == bcf_str2int("GT", 2)
+ || b->gi[i].fmt == bcf_str2int("SP", 2))
+ {
+ b->gi[i].len = 1;
+ } else if (b->gi[i].fmt == bcf_str2int("GL", 2)) {
+ b->gi[i].len = b->n_alleles * (b->n_alleles + 1) / 2 * 4;
+ }
+ b->gi[i].data = realloc(b->gi[i].data, n_smpl * b->gi[i].len);
+ }
+ return 0;
+}
+
+int bcf_write(bcf_t *bp, const bcf_hdr_t *h, const bcf1_t *b)
+{
+ int i, l = 0;
+ if (b == 0) return -1;
+ bgzf_write(bp->fp, &b->tid, 4);
+ bgzf_write(bp->fp, &b->pos, 4);
+ bgzf_write(bp->fp, &b->qual, 4);
+ bgzf_write(bp->fp, &b->l_str, 4);
+ bgzf_write(bp->fp, b->str, b->l_str);
+ l = 12 + b->l_str;
+ for (i = 0; i < b->n_gi; ++i) {
+ bgzf_write(bp->fp, b->gi[i].data, b->gi[i].len * h->n_smpl);
+ l += b->gi[i].len * h->n_smpl;
+ }
+ return l;
+}
+
+int bcf_read(bcf_t *bp, const bcf_hdr_t *h, bcf1_t *b)
+{
+ int i, l = 0;
+ if (b == 0) return -1;
+ if (bgzf_read(bp->fp, &b->tid, 4) == 0) return -1;
+ b->n_smpl = h->n_smpl;
+ bgzf_read(bp->fp, &b->pos, 4);
+ bgzf_read(bp->fp, &b->qual, 4);
+ bgzf_read(bp->fp, &b->l_str, 4);
+ if (b->l_str > b->m_str) {
+ b->m_str = b->l_str;
+ kroundup32(b->m_str);
+ b->str = realloc(b->str, b->m_str);
+ }
+ bgzf_read(bp->fp, b->str, b->l_str);
+ l = 12 + b->l_str;
+ if (bcf_sync(b) < 0) return -2;
+ for (i = 0; i < b->n_gi; ++i) {
+ bgzf_read(bp->fp, b->gi[i].data, b->gi[i].len * h->n_smpl);
+ l += b->gi[i].len * h->n_smpl;
+ }
+ return l;
+}
+
+int bcf_destroy(bcf1_t *b)
+{
+ int i;
+ if (b == 0) return -1;
+ free(b->str);
+ for (i = 0; i < b->m_gi; ++i)
+ free(b->gi[i].data);
+ free(b->gi);
+ free(b);
+ return 0;
+}
+
+static inline void fmt_str(const char *p, kstring_t *s)
+{
+ if (*p == 0) kputc('.', s);
+ else kputs(p, s);
+}
+
+void bcf_fmt_core(const bcf_hdr_t *h, bcf1_t *b, kstring_t *s)
+{
+ int i, j, x;
+ s->l = 0;
+ if (h->n_ref) kputs(h->ns[b->tid], s);
+ else kputw(b->tid, s);
+ kputc('\t', s);
+ kputw(b->pos + 1, s); kputc('\t', s);
+ fmt_str(b->str, s); kputc('\t', s);
+ fmt_str(b->ref, s); kputc('\t', s);
+ fmt_str(b->alt, s); kputc('\t', s);
+ ksprintf(s, "%.3g", b->qual); kputc('\t', s);
+ fmt_str(b->flt, s); kputc('\t', s);
+ fmt_str(b->info, s);
+ if (b->fmt[0]) {
+ kputc('\t', s);
+ fmt_str(b->fmt, s);
+ }
+ x = b->n_alleles * (b->n_alleles + 1) / 2;
+ if (b->n_gi == 0) return;
+ for (j = 0; j < h->n_smpl; ++j) {
+ kputc('\t', s);
+ for (i = 0; i < b->n_gi; ++i) {
+ if (i) kputc(':', s);
+ if (b->gi[i].fmt == bcf_str2int("PL", 2)) {
+ uint8_t *d = (uint8_t*)b->gi[i].data + j * x;
+ int k;
+ for (k = 0; k < x; ++k) {
+ if (k > 0) kputc(',', s);
+ kputw(d[k], s);
+ }
+ } else if (b->gi[i].fmt == bcf_str2int("DP", 2)) {
+ kputw(((uint16_t*)b->gi[i].data)[j], s);
+ } else if (b->gi[i].fmt == bcf_str2int("GQ", 2) || b->gi[i].fmt == bcf_str2int("SP", 2)) {
+ kputw(((uint8_t*)b->gi[i].data)[j], s);
+ } else if (b->gi[i].fmt == bcf_str2int("GT", 2)) {
+ int y = ((uint8_t*)b->gi[i].data)[j];
+ if (y>>7&1) {
+ kputsn("./.", 3, s);
+ } else {
+ kputc('0' + (y>>3&7), s);
+ kputc("/|"[y>>6&1], s);
+ kputc('0' + (y&7), s);
+ }
+ } else if (b->gi[i].fmt == bcf_str2int("GL", 2)) {
+ float *d = (float*)b->gi[i].data + j * x;
+ int k;
+ //printf("- %lx\n", d);
+ for (k = 0; k < x; ++k) {
+ if (k > 0) kputc(',', s);
+ ksprintf(s, "%.2f", d[k]);
+ }
+ }
+ }
+ }
+}
+
+char *bcf_fmt(const bcf_hdr_t *h, bcf1_t *b)
+{
+ kstring_t s;
+ s.l = s.m = 0; s.s = 0;
+ bcf_fmt_core(h, b, &s);
+ return s.s;
+}
+
+int bcf_append_info(bcf1_t *b, const char *info, int l)
+{
+ int shift = b->fmt - b->str;
+ int l_fmt = b->l_str - shift;
+ char *ori = b->str;
+ if (b->l_str + l > b->m_str) { // enlarge if necessary
+ b->m_str = b->l_str + l;
+ kroundup32(b->m_str);
+ b->str = realloc(b->str, b->m_str);
+ }
+ memmove(b->str + shift + l, b->str + shift, l_fmt); // move the FORMAT field
+ memcpy(b->str + shift - 1, info, l); // append to the INFO field
+ b->str[shift + l - 1] = '\0';
+ b->fmt = b->str + shift + l;
+ b->l_str += l;
+ if (ori != b->str) bcf_sync(b); // synchronize when realloc changes the pointer
+ return 0;
+}
+
+int bcf_cpy(bcf1_t *r, const bcf1_t *b)
+{
+ char *t1 = r->str;
+ bcf_ginfo_t *t2 = r->gi;
+ int i, t3 = r->m_str, t4 = r->m_gi;
+ *r = *b;
+ r->str = t1; r->gi = t2; r->m_str = t3; r->m_gi = t4;
+ if (r->m_str < b->m_str) {
+ r->m_str = b->m_str;
+ r->str = realloc(r->str, r->m_str);
+ }
+ memcpy(r->str, b->str, r->m_str);
+ bcf_sync(r); // calling bcf_sync() is simple but inefficient
+ for (i = 0; i < r->n_gi; ++i)
+ memcpy(r->gi[i].data, b->gi[i].data, r->n_smpl * r->gi[i].len);
+ return 0;
+}
+
+int bcf_is_indel(const bcf1_t *b)
+{
+ char *p;
+ if (strlen(b->ref) > 1) return 1;
+ for (p = b->alt; *p; ++p)
+ if (*p != ',' && p[1] != ',' && p[1] != '\0')
+ return 1;
+ return 0;
+}
--- /dev/null
+/* The MIT License
+
+ Copyright (c) 2010 Broad Institute
+
+ Permission is hereby granted, free of charge, to any person obtaining
+ a copy of this software and associated documentation files (the
+ "Software"), to deal in the Software without restriction, including
+ without limitation the rights to use, copy, modify, merge, publish,
+ distribute, sublicense, and/or sell copies of the Software, and to
+ permit persons to whom the Software is furnished to do so, subject to
+ the following conditions:
+
+ The above copyright notice and this permission notice shall be
+ included in all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ SOFTWARE.
+*/
+
+/* Contact: Heng Li <lh3@live.co.uk> */
+
+#ifndef BCF_H
+#define BCF_H
+
+#include <stdint.h>
+#include <zlib.h>
+
+#ifndef BCF_LITE
+#include "bgzf.h"
+typedef BGZF *bcfFile;
+#else
+typedef gzFile bcfFile;
+#define bgzf_open(fn, mode) gzopen(fn, mode)
+#define bgzf_fdopen(fd, mode) gzdopen(fd, mode)
+#define bgzf_close(fp) gzclose(fp)
+#define bgzf_read(fp, buf, len) gzread(fp, buf, len)
+#define bgzf_write(fp, buf, len)
+#define bgzf_flush(fp)
+#endif
+
+/*
+ A member in the structs below is said to "primary" if its content
+ cannot be inferred from other members in any of structs below; a
+ member is said to be "derived" if its content can be derived from
+ other members. For example, bcf1_t::str is primary as this comes from
+ the input data, while bcf1_t::info is derived as it can always be
+ correctly set if we know bcf1_t::str. Derived members are for quick
+ access to the content and must be synchronized with the primary data.
+ */
+
+typedef struct {
+ uint32_t fmt; // format of the block, set by bcf_str2int().
+ int len; // length of data for each individual
+ void *data; // concatenated data
+ // derived info: fmt, len (<-bcf1_t::fmt)
+} bcf_ginfo_t;
+
+typedef struct {
+ int32_t tid, pos; // refID and 0-based position
+ int32_t l_str, m_str; // length and the allocated size of ->str
+ float qual; // SNP quality
+ char *str; // concatenated string of variable length strings in VCF (from col.2 to col.7)
+ char *ref, *alt, *flt, *info, *fmt; // they all point to ->str; no memory allocation
+ int n_gi, m_gi; // number and the allocated size of geno fields
+ bcf_ginfo_t *gi; // array of geno fields
+ int n_alleles, n_smpl; // number of alleles and samples
+ // derived info: ref, alt, flt, info, fmt (<-str), n_gi (<-fmt), n_alleles (<-alt), n_smpl (<-bcf_hdr_t::n_smpl)
+} bcf1_t;
+
+typedef struct {
+ int32_t n_ref, n_smpl; // number of reference sequences and samples
+ int32_t l_nm; // length of concatenated sequence names; 0 padded
+ int32_t l_smpl; // length of concatenated sample names; 0 padded
+ int32_t l_txt; // length of header text (lines started with ##)
+ char *name, *sname, *txt; // concatenated sequence names, sample names and header text
+ char **ns, **sns; // array of sequence and sample names; point to name and sname, respectively
+ // derived info: n_ref (<-name), n_smpl (<-sname), ns (<-name), sns (<-sname)
+} bcf_hdr_t;
+
+typedef struct {
+ int is_vcf; // if the file in operation is a VCF
+ void *v; // auxillary data structure for VCF
+ bcfFile fp; // file handler for BCF
+} bcf_t;
+
+struct __bcf_idx_t;
+typedef struct __bcf_idx_t bcf_idx_t;
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+ // open a BCF file; for BCF file only
+ bcf_t *bcf_open(const char *fn, const char *mode);
+ // close file
+ int bcf_close(bcf_t *b);
+ // read one record from BCF; return -1 on end-of-file, and <-1 for errors
+ int bcf_read(bcf_t *bp, const bcf_hdr_t *h, bcf1_t *b);
+ // call this function if b->str is changed
+ int bcf_sync(bcf1_t *b);
+ // write a BCF record
+ int bcf_write(bcf_t *bp, const bcf_hdr_t *h, const bcf1_t *b);
+ // read the BCF header; BCF only
+ bcf_hdr_t *bcf_hdr_read(bcf_t *b);
+ // write the BCF header
+ int bcf_hdr_write(bcf_t *b, const bcf_hdr_t *h);
+ // set bcf_hdr_t::ns and bcf_hdr_t::sns
+ int bcf_hdr_sync(bcf_hdr_t *b);
+ // destroy the header
+ void bcf_hdr_destroy(bcf_hdr_t *h);
+ // destroy a record
+ int bcf_destroy(bcf1_t *b);
+ // BCF->VCF conversion
+ char *bcf_fmt(const bcf_hdr_t *h, bcf1_t *b);
+ // append more info
+ int bcf_append_info(bcf1_t *b, const char *info, int l);
+ // copy
+ int bcf_cpy(bcf1_t *r, const bcf1_t *b);
+
+ // open a VCF or BCF file if "b" is set in "mode"
+ bcf_t *vcf_open(const char *fn, const char *mode);
+ // close a VCF/BCF file
+ int vcf_close(bcf_t *bp);
+ // read the VCF/BCF header
+ bcf_hdr_t *vcf_hdr_read(bcf_t *bp);
+ // read a VCF/BCF record; return -1 on end-of-file and <-1 for errors
+ int vcf_read(bcf_t *bp, bcf_hdr_t *h, bcf1_t *b);
+ // write the VCF header
+ int vcf_hdr_write(bcf_t *bp, const bcf_hdr_t *h);
+ // write a VCF record
+ int vcf_write(bcf_t *bp, bcf_hdr_t *h, bcf1_t *b);
+
+ // keep the first n alleles and discard the rest
+ int bcf_shrink_alt(bcf1_t *b, int n);
+ // convert GL to PL
+ int bcf_gl2pl(bcf1_t *b);
+ // if the site is an indel
+ int bcf_is_indel(const bcf1_t *b);
+
+ // string hash table
+ void *bcf_build_refhash(bcf_hdr_t *h);
+ void bcf_str2id_destroy(void *_hash);
+ int bcf_str2id_add(void *_hash, const char *str);
+ int bcf_str2id(void *_hash, const char *str);
+ void *bcf_str2id_init();
+
+ // indexing related functions
+ int bcf_idx_build(const char *fn);
+ uint64_t bcf_idx_query(const bcf_idx_t *idx, int tid, int beg);
+ int bcf_parse_region(void *str2id, const char *str, int *tid, int *begin, int *end);
+ bcf_idx_t *bcf_idx_load(const char *fn);
+ void bcf_idx_destroy(bcf_idx_t *idx);
+
+#ifdef __cplusplus
+}
+#endif
+
+static inline uint32_t bcf_str2int(const char *str, int l)
+{
+ int i;
+ uint32_t x = 0;
+ for (i = 0; i < l && i < 4; ++i) {
+ if (str[i] == 0) return x;
+ x = x<<8 | str[i];
+ }
+ return x;
+}
+
+#endif
--- /dev/null
+\documentclass[10pt,pdftex]{article}
+\usepackage{color}
+\definecolor{gray}{rgb}{0.7,0.7,0.7}
+
+\setlength{\topmargin}{0.0cm}
+\setlength{\textheight}{21.5cm}
+\setlength{\oddsidemargin}{0cm}
+\setlength{\textwidth}{16.5cm}
+\setlength{\columnsep}{0.6cm}
+
+\begin{document}
+
+\begin{center}
+\begin{tabular}{|l|l|l|l|l|}
+\hline
+\multicolumn{2}{|c|}{\bf Field} & \multicolumn{1}{c|}{\bf Descrption} & \multicolumn{1}{c|}{\bf Type} & \multicolumn{1}{c|}{\bf Value} \\\hline\hline
+\multicolumn{2}{|l|}{\tt magic} & Magic string & {\tt char[4]} & {\tt BCF\char92 4} \\\hline
+\multicolumn{2}{|l|}{\tt l\_nm} & Length of concatenated sequence names & {\tt int32\_t} & \\\hline
+\multicolumn{2}{|l|}{\tt name} & Concatenated names, {\tt NULL} padded & {\tt char[l\_nm]} & \\\hline
+\multicolumn{2}{|l|}{\tt l\_smpl} & Length of concatenated sample names & {\tt int32\_t} & \\\hline
+\multicolumn{2}{|l|}{\tt sname} & Concatenated sample names & {\tt char[l\_smpl]} & \\\hline
+\multicolumn{2}{|l|}{\tt l\_txt} & Length of the meta text (double-hash lines)& {\tt int32\_t} & \\\hline
+\multicolumn{2}{|l|}{\tt text} & Meta text, {\tt NULL} terminated & {\tt char[l\_txt]} & \\\hline
+\multicolumn{5}{|c|}{\it \color{gray}{List of records until the end of the file}}\\\cline{2-5}
+& {\tt seq\_id} & Reference sequence ID & {\tt int32\_t} & \\\cline{2-5}
+& {\tt pos} & Position & {\tt int32\_t} & \\\cline{2-5}
+& {\tt qual} & Variant quality & {\tt float} & \\\cline{2-5}
+& {\tt l\_str} & Length of str & {\tt int32\_t} & \\\cline{2-5}
+& {\tt str} & {\tt ID+REF+ALT+FILTER+INFO+FORMAT}, {\tt NULL} padded & {\tt char[slen]} &\\\cline{2-5}
+& \multicolumn{4}{c|}{Blocks of data; \#blocks and formats defined by {\tt FORMAT} (table below)}\\
+\hline
+\end{tabular}
+\end{center}
+
+\begin{center}
+\begin{tabular}{cll}
+\hline
+\multicolumn{1}{l}{\bf Field} & \multicolumn{1}{l}{\bf Type} & \multicolumn{1}{l}{\bf Description} \\\hline
+{\tt DP} & {\tt uint16\_t[n]} & Read depth \\
+{\tt GL} & {\tt float[n*x]} & Log10 likelihood of data; $x=\frac{m(m+1)}{2}$, $m=\#\{alleles\}$\\
+{\tt GT} & {\tt uint8\_t[n]} & {\tt phase\char60\char60 6 | allele1\char60\char60 3 | allele2} \\
+{\tt GQ} & {\tt uint8\_t[n]} & {Genotype quality}\\
+{\tt HQ} & {\tt uint8\_t[n*2]} & {Haplotype quality}\\
+{\tt PL} & {\tt uint8\_t[n*x]} & {Phred-scaled likelihood of data}\\
+\emph{misc} & {\tt int32\_t+char*} & {\tt NULL} padded concatenated strings (integer equal to the length) \\
+\hline
+\end{tabular}
+\end{center}
+
+\begin{itemize}
+\item The file is {\tt BGZF} compressed.
+\item All integers are little-endian.
+\item In a string, a missing value `.' is an empty C string ``{\tt
+ \char92 0}'' (not ``{\tt .\char92 0}'')
+\item For {\tt GL} and {\tt PL}, likelihoods of genotypes appear in the
+ order of alleles in {\tt REF} and then {\tt ALT}. For example, if {\tt
+ REF=C}, {\tt ALT=T,A}, likelihoods appear in the order of {\tt
+ CC,CT,CA,TT,TA,AA}.
+\item {\tt GL} is an extension to and is backward compatible with the
+ {\tt GL} genotype field in {\tt VCFv4.0}.
+\end{itemize}
+
+\end{document}
\ No newline at end of file
--- /dev/null
+#include <errno.h>
+#include <math.h>
+#include <string.h>
+#include <stdlib.h>
+#include "bcf.h"
+
+static int8_t nt4_table[256] = {
+ 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
+ 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
+ 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4 /*'-'*/, 4, 4,
+ 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
+ 4, 0, 4, 1, 4, 4, 4, 2, 4, 4, 4, 4, 4, 4, 4, 4,
+ 4, 4, 4, 4, 3, 4, 4, 4, -1, 4, 4, 4, 4, 4, 4, 4,
+ 4, 0, 4, 1, 4, 4, 4, 2, 4, 4, 4, 4, 4, 4, 4, 4,
+ 4, 4, 4, 4, 3, 4, 4, 4, -1, 4, 4, 4, 4, 4, 4, 4,
+ 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
+ 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
+ 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
+ 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
+ 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
+ 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
+ 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
+ 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4
+};
+
+static int read_I16(bcf1_t *b, int anno[16])
+{
+ char *p;
+ int i;
+ if ((p = strstr(b->info, "I16=")) == 0) return -1;
+ p += 4;
+ for (i = 0; i < 16; ++i) {
+ anno[i] = strtol(p, &p, 10);
+ if (anno[i] == 0 && (errno == EINVAL || errno == ERANGE)) return -2;
+ ++p;
+ }
+ return 0;
+}
+
+int bcf_2qcall(bcf_hdr_t *h, bcf1_t *b)
+{
+ int a[4], k, g[10], l, map[4], k1, j, i, i0, anno[16], dp, mq, d_rest;
+ char *s;
+ if (b->ref[1] != 0 || b->n_alleles > 4) return -1; // ref is not a single base
+ for (i = 0; i < b->n_gi; ++i)
+ if (b->gi[i].fmt == bcf_str2int("PL", 2)) break;
+ if (i == b->n_gi) return -1; // no PL
+ if (read_I16(b, anno) != 0) return -1; // no I16; FIXME: can be improved
+ d_rest = dp = anno[0] + anno[1] + anno[2] + anno[3];
+ if (dp == 0) return -1; // depth is zero
+ mq = (int)(sqrt((double)(anno[9] + anno[11]) / dp) + .499);
+ i0 = i;
+ a[0] = nt4_table[(int)b->ref[0]];
+ if (a[0] > 3) return -1; // ref is not A/C/G/T
+ a[1] = a[2] = a[3] = -2; // -1 has a special meaning
+ if (b->alt[0] == 0) return -1; // no alternate allele
+ map[0] = map[1] = map[2] = map[3] = -2;
+ map[a[0]] = 0;
+ for (k = 0, s = b->alt, k1 = -1; k < 3 && *s; ++k, s += 2) {
+ if (s[1] != ',' && s[1] != 0) return -1; // ALT is not single base
+ a[k+1] = nt4_table[(int)*s];
+ if (a[k+1] >= 0) map[a[k+1]] = k+1;
+ else k1 = k+1;
+ if (s[1] == 0) break;
+ }
+ for (k = 0; k < 4; ++k)
+ if (map[k] < 0) map[k] = k1;
+ for (i = 0; i < h->n_smpl; ++i) {
+ int d;
+ uint8_t *p = b->gi[i0].data + i * b->gi[i0].len;
+ for (j = 0; j < b->gi[i0].len; ++j)
+ if (p[j]) break;
+ d = (int)((double)d_rest / (h->n_smpl - i) + .499);
+ if (d == 0) d = 1;
+ if (j == b->gi[i0].len) d = 0;
+ d_rest -= d;
+ for (k = j = 0; k < 4; ++k) {
+ for (l = k; l < 4; ++l) {
+ int t, x = map[k], y = map[l];
+ if (x > y) t = x, x = y, y = t;
+ g[j++] = p[x * b->n_alleles - x * (x-1) / 2 + (y - x)];
+ }
+ }
+ printf("%s\t%d\t%c", h->ns[b->tid], b->pos+1, *b->ref);
+ printf("\t%d\t%d\t0", d, mq);
+ for (j = 0; j < 10; ++j)
+ printf("\t%d", g[j]);
+ printf("\t%s\n", h->sns[i]);
+ }
+ return 0;
+}
--- /dev/null
+.TH bcftools 1 "2 October 2010" "bcftools" "Bioinformatics tools"
+.SH NAME
+.PP
+bcftools - Utilities for the Binary Call Format (BCF) and VCF.
+.SH SYNOPSIS
+.PP
+bcftools index in.bcf
+.PP
+bcftools view in.bcf chr2:100-200 > out.vcf
+.PP
+bcftools view -vc in.bcf > out.vcf 2> out.afs
+
+.SH DESCRIPTION
+.PP
+Bcftools is a toolkit for processing VCF/BCF files, calling variants and
+estimating site allele frequencies and allele frequency spectrums.
+
+.SH COMMANDS AND OPTIONS
+
+.TP 10
+.B view
+.B bcftools view
+.RB [ \-cbuSAGgHvNQ ]
+.RB [ \-1
+.IR nGroup1 ]
+.RB [ \-l
+.IR listFile ]
+.RB [ \-t
+.IR mutRate ]
+.RB [ \-p
+.IR varThres ]
+.RB [ \-P
+.IR prior ]
+.I in.bcf
+.RI [ region ]
+
+Convert between BCF and VCF, call variant candidates and estimate allele
+frequencies.
+
+.B OPTIONS:
+.RS
+.TP 10
+.B -b
+Output in the BCF format. The default is VCF.
+.TP
+.B -c
+Call variants.
+.TP
+.B -v
+Output variant sites only (force -c)
+.TP
+.B -g
+Call per-sample genotypes at variant sites (force -c)
+.TP
+.B -u
+Uncompressed BCF output (force -b).
+.TP
+.B -S
+The input is VCF instead of BCF.
+.TP
+.B -A
+Retain all possible alternate alleles at variant sites. By default, this
+command discards unlikely alleles.
+.TP
+.B -G
+Suppress all individual genotype information.
+.TP
+.B -H
+Perform Hardy-Weiberg Equilibrium test. This will add computation time, sometimes considerably.
+.TP
+.B -N
+Skip sites where the REF field is not A/C/G/T
+.TP
+.B -Q
+Output the QCALL likelihood format
+.TP
+.B -f
+Reference-free variant calling mode. In this mode, the prior will be
+folded; a variant is called iff the sample(s) contains at least two
+alleles; the QUAL field in the VCF/BCF output is changed accordingly.
+.TP
+.BI "-1 " INT
+Number of group-1 samples. This option is used for dividing input into
+two groups for comparing. A zero value disables this functionality. [0]
+.TP
+.BI "-l " FILE
+List of sites at which information are outputted [all sites]
+.TP
+.BI "-t " FLOAT
+Scaled muttion rate for variant calling [0.001]
+.TP
+.BI "-p " FLOAT
+A site is considered to be a variant if P(ref|D)<FLOAT [0.5]
+.TP
+.BI "-P " STR
+Prior or initial allele frequency spectrum. If STR can be
+.IR full ,
+.IR cond2 ,
+.I flat
+or the file consisting of error output from a previous variant calling
+run.
+.RE
+
+.TP
+.B index
+.B bcftools index
+.I in.bcf
+
+Index sorted BCF for random access.
+.RE
+
+.TP
+.B cat
+.B bcftools cat
+.I in1.bcf
+.RI [ "in2.bcf " [ ... "]]]"
+
+Concatenate BCF files. The input files are required to be sorted and
+have identical samples appearing in the same order.
+.RE
--- /dev/null
+#include "bcf.h"
+#include "kstring.h"
+#include "khash.h"
+KHASH_MAP_INIT_STR(str2id, int)
+
+void *bcf_build_refhash(bcf_hdr_t *h)
+{
+ khash_t(str2id) *hash;
+ int i, ret;
+ hash = kh_init(str2id);
+ for (i = 0; i < h->n_ref; ++i) {
+ khint_t k;
+ k = kh_put(str2id, hash, h->ns[i], &ret); // FIXME: check ret
+ kh_val(hash, k) = i;
+ }
+ return hash;
+}
+
+void *bcf_str2id_init()
+{
+ return kh_init(str2id);
+}
+
+void bcf_str2id_destroy(void *_hash)
+{
+ khash_t(str2id) *hash = (khash_t(str2id)*)_hash;
+ if (hash) kh_destroy(str2id, hash); // Note that strings are not freed.
+}
+
+int bcf_str2id(void *_hash, const char *str)
+{
+ khash_t(str2id) *hash = (khash_t(str2id)*)_hash;
+ khint_t k;
+ if (!hash) return -1;
+ k = kh_get(str2id, hash, str);
+ return k == kh_end(hash)? -1 : kh_val(hash, k);
+}
+
+int bcf_str2id_add(void *_hash, const char *str)
+{
+ khint_t k;
+ int ret;
+ khash_t(str2id) *hash = (khash_t(str2id)*)_hash;
+ if (!hash) return -1;
+ k = kh_put(str2id, hash, str, &ret);
+ if (ret == 0) return kh_val(hash, k);
+ kh_val(hash, k) = kh_size(hash) - 1;
+ return kh_val(hash, k);
+}
+
+int bcf_shrink_alt(bcf1_t *b, int n)
+{
+ char *p;
+ int i, j, k, *z, n_smpl = b->n_smpl;
+ if (b->n_alleles <= n) return -1;
+ if (n > 1) {
+ for (p = b->alt, k = 1; *p; ++p)
+ if (*p == ',' && ++k == n) break;
+ *p = '\0';
+ } else p = b->alt, *p = '\0';
+ ++p;
+ memmove(p, b->flt, b->str + b->l_str - b->flt);
+ b->l_str -= b->flt - p;
+ z = alloca(sizeof(int) / 2 * n * (n+1));
+ for (i = k = 0; i < n; ++i)
+ for (j = 0; j < n - i; ++j)
+ z[k++] = i * b->n_alleles + j;
+ for (i = 0; i < b->n_gi; ++i) {
+ bcf_ginfo_t *g = b->gi + i;
+ if (g->fmt == bcf_str2int("PL", 2)) {
+ int l, x = b->n_alleles * (b->n_alleles + 1) / 2;
+ uint8_t *d = (uint8_t*)g->data;
+ g->len = n * (n + 1) / 2;
+ for (l = k = 0; l < n_smpl; ++l) {
+ uint8_t *dl = d + l * x;
+ for (j = 0; j < g->len; ++j) d[k++] = dl[z[j]];
+ }
+ } // FIXME: to add GL
+ }
+ b->n_alleles = n;
+ bcf_sync(b);
+ return 0;
+}
+
+int bcf_gl2pl(bcf1_t *b)
+{
+ char *p;
+ int i, n_smpl = b->n_smpl;
+ bcf_ginfo_t *g;
+ float *d0;
+ uint8_t *d1;
+ if (strstr(b->fmt, "PL")) return -1;
+ if ((p = strstr(b->fmt, "GL")) == 0) return -1;
+ *p = 'P';
+ for (i = 0; i < b->n_gi; ++i)
+ if (b->gi[i].fmt == bcf_str2int("GL", 2))
+ break;
+ g = b->gi + i;
+ g->fmt = bcf_str2int("PL", 2);
+ g->len /= 4; // 4 == sizeof(float)
+ d0 = (float*)g->data; d1 = (uint8_t*)g->data;
+ for (i = 0; i < n_smpl * g->len; ++i) {
+ int x = (int)(-10. * d0[i] + .499);
+ if (x > 255) x = 255;
+ if (x < 0) x = 0;
+ d1[i] = x;
+ }
+ return 0;
+}
--- /dev/null
+#include <unistd.h>
+#include <stdlib.h>
+#include <math.h>
+#include <zlib.h>
+#include <errno.h>
+#include "bcf.h"
+#include "prob1.h"
+#include "kstring.h"
+
+#include "khash.h"
+KHASH_SET_INIT_INT64(set64)
+
+#include "kseq.h"
+KSTREAM_INIT(gzFile, gzread, 16384)
+
+#define VC_NO_GENO 2
+#define VC_BCFOUT 4
+#define VC_CALL 8
+#define VC_VARONLY 16
+#define VC_VCFIN 32
+#define VC_UNCOMP 64
+#define VC_HWE 128
+#define VC_KEEPALT 256
+#define VC_ACGT_ONLY 512
+#define VC_QCALL 1024
+#define VC_CALL_GT 2048
+#define VC_ADJLD 4096
+#define VC_NO_INDEL 8192
+#define VC_FOLDED 16384
+
+typedef struct {
+ int flag, prior_type, n1;
+ char *fn_list, *prior_file;
+ double theta, pref, indel_frac;
+} viewconf_t;
+
+khash_t(set64) *bcf_load_pos(const char *fn, bcf_hdr_t *_h)
+{
+ void *str2id;
+ gzFile fp;
+ kstream_t *ks;
+ int ret, dret, lineno = 1;
+ kstring_t *str;
+ khash_t(set64) *hash = 0;
+
+ hash = kh_init(set64);
+ str2id = bcf_build_refhash(_h);
+ str = calloc(1, sizeof(kstring_t));
+ fp = strcmp(fn, "-")? gzopen(fn, "r") : gzdopen(fileno(stdin), "r");
+ ks = ks_init(fp);
+ while (ks_getuntil(ks, 0, str, &dret) >= 0) {
+ int tid = bcf_str2id(str2id, str->s);
+ if (tid >= 0 && dret != '\n') {
+ if (ks_getuntil(ks, 0, str, &dret) >= 0) {
+ uint64_t x = (uint64_t)tid<<32 | (atoi(str->s) - 1);
+ kh_put(set64, hash, x, &ret);
+ } else break;
+ } else fprintf(stderr, "[%s] %s is not a reference name (line %d).\n", __func__, str->s, lineno);
+ if (dret != '\n') while ((dret = ks_getc(ks)) > 0 && dret != '\n');
+ if (dret < 0) break;
+ ++lineno;
+ }
+ bcf_str2id_destroy(str2id);
+ ks_destroy(ks);
+ gzclose(fp);
+ free(str->s); free(str);
+ return hash;
+}
+
+static double test_hwe(const double g[3])
+{
+ extern double kf_gammaq(double p, double x);
+ double fexp, chi2, f[3], n;
+ int i;
+ n = g[0] + g[1] + g[2];
+ fexp = (2. * g[2] + g[1]) / (2. * n);
+ if (fexp > 1. - 1e-10) fexp = 1. - 1e-10;
+ if (fexp < 1e-10) fexp = 1e-10;
+ f[0] = n * (1. - fexp) * (1. - fexp);
+ f[1] = n * 2. * fexp * (1. - fexp);
+ f[2] = n * fexp * fexp;
+ for (i = 0, chi2 = 0.; i < 3; ++i)
+ chi2 += (g[i] - f[i]) * (g[i] - f[i]) / f[i];
+ return kf_gammaq(.5, chi2 / 2.);
+}
+
+typedef struct {
+ double p[4];
+ int mq, depth, is_tested, d[4];
+} anno16_t;
+
+static double ttest(int n1, int n2, int a[4])
+{
+ extern double kf_betai(double a, double b, double x);
+ double t, v, u1, u2;
+ if (n1 == 0 || n2 == 0 || n1 + n2 < 3) return 1.0;
+ u1 = (double)a[0] / n1; u2 = (double)a[2] / n2;
+ if (u1 <= u2) return 1.;
+ t = (u1 - u2) / sqrt(((a[1] - n1 * u1 * u1) + (a[3] - n2 * u2 * u2)) / (n1 + n2 - 2) * (1./n1 + 1./n2));
+ v = n1 + n2 - 2;
+// printf("%d,%d,%d,%d,%lf,%lf,%lf\n", a[0], a[1], a[2], a[3], t, u1, u2);
+ return t < 0.? 1. : .5 * kf_betai(.5*v, .5, v/(v+t*t));
+}
+
+static int test16_core(int anno[16], anno16_t *a)
+{
+ extern double kt_fisher_exact(int n11, int n12, int n21, int n22, double *_left, double *_right, double *two);
+ double left, right;
+ int i;
+ a->p[0] = a->p[1] = a->p[2] = a->p[3] = 1.;
+ memcpy(a->d, anno, 4 * sizeof(int));
+ a->depth = anno[0] + anno[1] + anno[2] + anno[3];
+ a->is_tested = (anno[0] + anno[1] > 0 && anno[2] + anno[3] > 0);
+ if (a->depth == 0) return -1;
+ a->mq = (int)(sqrt((anno[9] + anno[11]) / a->depth) + .499);
+ kt_fisher_exact(anno[0], anno[1], anno[2], anno[3], &left, &right, &a->p[0]);
+ for (i = 1; i < 4; ++i)
+ a->p[i] = ttest(anno[0] + anno[1], anno[2] + anno[3], anno+4*i);
+ return 0;
+}
+
+static int test16(bcf1_t *b, anno16_t *a)
+{
+ char *p;
+ int i, anno[16];
+ a->p[0] = a->p[1] = a->p[2] = a->p[3] = 1.;
+ a->d[0] = a->d[1] = a->d[2] = a->d[3] = 0.;
+ a->mq = a->depth = a->is_tested = 0;
+ if ((p = strstr(b->info, "I16=")) == 0) return -1;
+ p += 4;
+ for (i = 0; i < 16; ++i) {
+ errno = 0; anno[i] = strtol(p, &p, 10);
+ if (anno[i] == 0 && (errno == EINVAL || errno == ERANGE)) return -2;
+ ++p;
+ }
+ return test16_core(anno, a);
+}
+
+static void rm_info(bcf1_t *b, const char *key)
+{
+ char *p, *q;
+ if ((p = strstr(b->info, key)) == 0) return;
+ for (q = p; *q && *q != ';'; ++q);
+ if (p > b->info && *(p-1) == ';') --p;
+ memmove(p, q, b->l_str - (q - b->str));
+ b->l_str -= q - p;
+ bcf_sync(b);
+}
+
+static int update_bcf1(int n_smpl, bcf1_t *b, const bcf_p1aux_t *pa, const bcf_p1rst_t *pr, double pref, int flag)
+{
+ kstring_t s;
+ int is_var = (pr->p_ref < pref);
+ double p_hwe, r = is_var? pr->p_ref : 1. - pr->p_ref;
+ anno16_t a;
+
+ p_hwe = pr->g[0] >= 0.? test_hwe(pr->g) : 1.0; // only do HWE g[] is calculated
+ test16(b, &a);
+ rm_info(b, "I16=");
+
+ memset(&s, 0, sizeof(kstring_t));
+ kputc('\0', &s); kputs(b->ref, &s); kputc('\0', &s);
+ kputs(b->alt, &s); kputc('\0', &s); kputc('\0', &s);
+ kputs(b->info, &s);
+ if (b->info[0]) kputc(';', &s);
+// ksprintf(&s, "AF1=%.4lg;AFE=%.4lg;CI95=%.4lg,%.4lg", 1.-pr->f_em, 1.-pr->f_exp, pr->cil, pr->cih);
+ ksprintf(&s, "AF1=%.4lg;CI95=%.4lg,%.4lg", 1.-pr->f_em, pr->cil, pr->cih);
+ ksprintf(&s, ";DP4=%d,%d,%d,%d;MQ=%d", a.d[0], a.d[1], a.d[2], a.d[3], a.mq);
+ if (a.is_tested) {
+ if (pr->pc[0] >= 0.) ksprintf(&s, ";PC4=%lg,%lg,%lg,%lg", pr->pc[0], pr->pc[1], pr->pc[2], pr->pc[3]);
+ ksprintf(&s, ";PV4=%.2lg,%.2lg,%.2lg,%.2lg", a.p[0], a.p[1], a.p[2], a.p[3]);
+ }
+ if (pr->g[0] >= 0. && p_hwe <= .2)
+ ksprintf(&s, ";GC=%.2lf,%.2lf,%.2lf;HWE=%.3lf", pr->g[2], pr->g[1], pr->g[0], p_hwe);
+ kputc('\0', &s);
+ kputs(b->fmt, &s); kputc('\0', &s);
+ free(b->str);
+ b->m_str = s.m; b->l_str = s.l; b->str = s.s;
+ b->qual = r < 1e-100? 99 : -4.343 * log(r);
+ if (b->qual > 99) b->qual = 99;
+ bcf_sync(b);
+ if (!is_var) bcf_shrink_alt(b, 1);
+ else if (!(flag&VC_KEEPALT))
+ bcf_shrink_alt(b, pr->rank0 < 2? 2 : pr->rank0+1);
+ if (is_var && (flag&VC_CALL_GT)) { // call individual genotype
+ int i, x, old_n_gi = b->n_gi;
+ s.m = b->m_str; s.l = b->l_str - 1; s.s = b->str;
+ kputs(":GT:GQ", &s); kputc('\0', &s);
+ b->m_str = s.m; b->l_str = s.l; b->str = s.s;
+ bcf_sync(b);
+ for (i = 0; i < b->n_smpl; ++i) {
+ x = bcf_p1_call_gt(pa, pr->f_em, i);
+ ((uint8_t*)b->gi[old_n_gi].data)[i] = (x&3) == 0? 1<<3|1 : (x&3) == 1? 1 : 0;
+ ((uint8_t*)b->gi[old_n_gi+1].data)[i] = x>>2;
+ }
+ }
+ return is_var;
+}
+
+double bcf_ld_freq(const bcf1_t *b0, const bcf1_t *b1, double f[4]);
+
+int bcfview(int argc, char *argv[])
+{
+ extern int bcf_2qcall(bcf_hdr_t *h, bcf1_t *b);
+ extern void bcf_p1_indel_prior(bcf_p1aux_t *ma, double x);
+ bcf_t *bp, *bout = 0;
+ bcf1_t *b, *blast;
+ int c;
+ uint64_t n_processed = 0;
+ viewconf_t vc;
+ bcf_p1aux_t *p1 = 0;
+ bcf_hdr_t *h;
+ int tid, begin, end;
+ char moder[4], modew[4];
+ khash_t(set64) *hash = 0;
+
+ tid = begin = end = -1;
+ memset(&vc, 0, sizeof(viewconf_t));
+ vc.prior_type = vc.n1 = -1; vc.theta = 1e-3; vc.pref = 0.5; vc.indel_frac = -1.;
+ while ((c = getopt(argc, argv, "fN1:l:cHAGvbSuP:t:p:QgLi:I")) >= 0) {
+ switch (c) {
+ case 'f': vc.flag |= VC_FOLDED; break;
+ case '1': vc.n1 = atoi(optarg); break;
+ case 'l': vc.fn_list = strdup(optarg); break;
+ case 'N': vc.flag |= VC_ACGT_ONLY; break;
+ case 'G': vc.flag |= VC_NO_GENO; break;
+ case 'A': vc.flag |= VC_KEEPALT; break;
+ case 'b': vc.flag |= VC_BCFOUT; break;
+ case 'S': vc.flag |= VC_VCFIN; break;
+ case 'c': vc.flag |= VC_CALL; break;
+ case 'v': vc.flag |= VC_VARONLY | VC_CALL; break;
+ case 'u': vc.flag |= VC_UNCOMP | VC_BCFOUT; break;
+ case 'H': vc.flag |= VC_HWE; break;
+ case 'g': vc.flag |= VC_CALL_GT | VC_CALL; break;
+ case 'I': vc.flag |= VC_NO_INDEL; break;
+ case 't': vc.theta = atof(optarg); break;
+ case 'p': vc.pref = atof(optarg); break;
+ case 'i': vc.indel_frac = atof(optarg); break;
+ case 'Q': vc.flag |= VC_QCALL; break;
+ case 'L': vc.flag |= VC_ADJLD; break;
+ case 'P':
+ if (strcmp(optarg, "full") == 0) vc.prior_type = MC_PTYPE_FULL;
+ else if (strcmp(optarg, "cond2") == 0) vc.prior_type = MC_PTYPE_COND2;
+ else if (strcmp(optarg, "flat") == 0) vc.prior_type = MC_PTYPE_FLAT;
+ else vc.prior_file = strdup(optarg);
+ break;
+ }
+ }
+ if (argc == optind) {
+ fprintf(stderr, "\n");
+ fprintf(stderr, "Usage: bcftools view [options] <in.bcf> [reg]\n\n");
+ fprintf(stderr, "Options: -c SNP calling\n");
+ fprintf(stderr, " -v output potential variant sites only (force -c)\n");
+ fprintf(stderr, " -g call genotypes at variant sites (force -c)\n");
+ fprintf(stderr, " -b output BCF instead of VCF\n");
+ fprintf(stderr, " -u uncompressed BCF output (force -b)\n");
+ fprintf(stderr, " -S input is VCF\n");
+ fprintf(stderr, " -A keep all possible alternate alleles at variant sites\n");
+ fprintf(stderr, " -G suppress all individual genotype information\n");
+ fprintf(stderr, " -H perform Hardy-Weinberg test (slower)\n");
+ fprintf(stderr, " -N skip sites where REF is not A/C/G/T\n");
+ fprintf(stderr, " -Q output the QCALL likelihood format\n");
+ fprintf(stderr, " -L calculate LD for adjacent sites\n");
+ fprintf(stderr, " -I skip indels\n");
+ fprintf(stderr, " -f reference-free variant calling\n");
+ fprintf(stderr, " -1 INT number of group-1 samples [0]\n");
+ fprintf(stderr, " -l FILE list of sites to output [all sites]\n");
+ fprintf(stderr, " -t FLOAT scaled substitution mutation rate [%.4lg]\n", vc.theta);
+ fprintf(stderr, " -i FLOAT indel-to-substitution ratio [%.4lg]\n", vc.indel_frac);
+ fprintf(stderr, " -p FLOAT variant if P(ref|D)<FLOAT [%.3lg]\n", vc.pref);
+ fprintf(stderr, " -P STR type of prior: full, cond2, flat [full]\n");
+ fprintf(stderr, "\n");
+ return 1;
+ }
+
+ b = calloc(1, sizeof(bcf1_t));
+ blast = calloc(1, sizeof(bcf1_t));
+ strcpy(moder, "r");
+ if (!(vc.flag & VC_VCFIN)) strcat(moder, "b");
+ strcpy(modew, "w");
+ if (vc.flag & VC_BCFOUT) strcat(modew, "b");
+ if (vc.flag & VC_UNCOMP) strcat(modew, "u");
+ bp = vcf_open(argv[optind], moder);
+ h = vcf_hdr_read(bp);
+ bout = vcf_open("-", modew);
+ if (!(vc.flag & VC_QCALL)) vcf_hdr_write(bout, h);
+ if (vc.flag & VC_CALL) {
+ p1 = bcf_p1_init(h->n_smpl);
+ if (vc.prior_file) {
+ if (bcf_p1_read_prior(p1, vc.prior_file) < 0) {
+ fprintf(stderr, "[%s] fail to read the prior AFS.\n", __func__);
+ return 1;
+ }
+ } else bcf_p1_init_prior(p1, vc.prior_type, vc.theta);
+ if (vc.n1 > 0) {
+ bcf_p1_set_n1(p1, vc.n1);
+ bcf_p1_init_subprior(p1, vc.prior_type, vc.theta);
+ }
+ if (vc.indel_frac > 0.) bcf_p1_indel_prior(p1, vc.indel_frac); // otherwise use the default indel_frac
+ if (vc.flag & VC_FOLDED) bcf_p1_set_folded(p1);
+ }
+ if (vc.fn_list) hash = bcf_load_pos(vc.fn_list, h);
+ if (optind + 1 < argc && !(vc.flag&VC_VCFIN)) {
+ void *str2id = bcf_build_refhash(h);
+ if (bcf_parse_region(str2id, argv[optind+1], &tid, &begin, &end) >= 0) {
+ bcf_idx_t *idx;
+ idx = bcf_idx_load(argv[optind]);
+ if (idx) {
+ uint64_t off;
+ off = bcf_idx_query(idx, tid, begin);
+ if (off == 0) {
+ fprintf(stderr, "[%s] no records in the query region.\n", __func__);
+ return 1; // FIXME: a lot of memory leaks...
+ }
+ bgzf_seek(bp->fp, off, SEEK_SET);
+ bcf_idx_destroy(idx);
+ }
+ }
+ }
+ while (vcf_read(bp, h, b) > 0) {
+ int is_indel = bcf_is_indel(b);
+ if ((vc.flag & VC_NO_INDEL) && is_indel) continue;
+ if ((vc.flag & VC_ACGT_ONLY) && !is_indel) {
+ int x;
+ if (b->ref[0] == 0 || b->ref[1] != 0) continue;
+ x = toupper(b->ref[0]);
+ if (x != 'A' && x != 'C' && x != 'G' && x != 'T') continue;
+ }
+ if (hash) {
+ uint64_t x = (uint64_t)b->tid<<32 | b->pos;
+ khint_t k = kh_get(set64, hash, x);
+ if (kh_size(hash) == 0) break;
+ if (k == kh_end(hash)) continue;
+ kh_del(set64, hash, k);
+ }
+ if (tid >= 0) {
+ int l = strlen(b->ref);
+ l = b->pos + (l > 0? l : 1);
+ if (b->tid != tid || b->pos >= end) break;
+ if (!(l > begin && end > b->pos)) continue;
+ }
+ ++n_processed;
+ if (vc.flag & VC_QCALL) { // output QCALL format; STOP here
+ bcf_2qcall(h, b);
+ continue;
+ }
+ if (vc.flag & (VC_CALL|VC_ADJLD)) bcf_gl2pl(b);
+ if (vc.flag & VC_CALL) { // call variants
+ bcf_p1rst_t pr;
+ bcf_p1_cal(b, p1, &pr); // pr.g[3] is not calculated here
+ if (vc.flag&VC_HWE) bcf_p1_cal_g3(p1, pr.g);
+ if (n_processed % 100000 == 0) {
+ fprintf(stderr, "[%s] %ld sites processed.\n", __func__, (long)n_processed);
+ bcf_p1_dump_afs(p1);
+ }
+ if (pr.p_ref >= vc.pref && (vc.flag & VC_VARONLY)) continue;
+ update_bcf1(h->n_smpl, b, p1, &pr, vc.pref, vc.flag);
+ }
+ if (vc.flag & VC_ADJLD) { // compute LD
+ double f[4], r2;
+ if ((r2 = bcf_ld_freq(blast, b, f)) >= 0) {
+ kstring_t s;
+ s.m = s.l = 0; s.s = 0;
+ if (*b->info) kputc(';', &s);
+ ksprintf(&s, "NEIR=%.3lf;NEIF=%.3lf,%.3lf", r2, f[0]+f[2], f[0]+f[1]);
+ bcf_append_info(b, s.s, s.l);
+ free(s.s);
+ }
+ bcf_cpy(blast, b);
+ }
+ if (vc.flag & VC_NO_GENO) { // do not output GENO fields
+ b->n_gi = 0;
+ b->fmt[0] = '\0';
+ }
+ vcf_write(bout, h, b);
+ }
+ if (vc.prior_file) free(vc.prior_file);
+ if (vc.flag & VC_CALL) bcf_p1_dump_afs(p1);
+ bcf_hdr_destroy(h);
+ bcf_destroy(b); bcf_destroy(blast);
+ vcf_close(bp); vcf_close(bout);
+ if (hash) kh_destroy(set64, hash);
+ if (vc.fn_list) free(vc.fn_list);
+ if (p1) bcf_p1_destroy(p1);
+ return 0;
+}
--- /dev/null
+#include <math.h>
+#include <stdlib.h>
+
+/* This program is implemented with ideas from this web page:
+ *
+ * http://www.langsrud.com/fisher.htm
+ */
+
+// log\binom{n}{k}
+static double lbinom(int n, int k)
+{
+ if (k == 0 || n == k) return 0;
+ return lgamma(n+1) - lgamma(k+1) - lgamma(n-k+1);
+}
+
+// n11 n12 | n1_
+// n21 n22 | n2_
+//-----------+----
+// n_1 n_2 | n
+
+// hypergeometric distribution
+static double hypergeo(int n11, int n1_, int n_1, int n)
+{
+ return exp(lbinom(n1_, n11) + lbinom(n-n1_, n_1-n11) - lbinom(n, n_1));
+}
+
+typedef struct {
+ int n11, n1_, n_1, n;
+ double p;
+} hgacc_t;
+
+// incremental version of hypergenometric distribution
+static double hypergeo_acc(int n11, int n1_, int n_1, int n, hgacc_t *aux)
+{
+ if (n1_ || n_1 || n) {
+ aux->n11 = n11; aux->n1_ = n1_; aux->n_1 = n_1; aux->n = n;
+ } else { // then only n11 changed; the rest fixed
+ if (n11%11 && n11 + aux->n - aux->n1_ - aux->n_1) {
+ if (n11 == aux->n11 + 1) { // incremental
+ aux->p *= (double)(aux->n1_ - aux->n11) / n11
+ * (aux->n_1 - aux->n11) / (n11 + aux->n - aux->n1_ - aux->n_1);
+ aux->n11 = n11;
+ return aux->p;
+ }
+ if (n11 == aux->n11 - 1) { // incremental
+ aux->p *= (double)aux->n11 / (aux->n1_ - n11)
+ * (aux->n11 + aux->n - aux->n1_ - aux->n_1) / (aux->n_1 - n11);
+ aux->n11 = n11;
+ return aux->p;
+ }
+ }
+ aux->n11 = n11;
+ }
+ aux->p = hypergeo(aux->n11, aux->n1_, aux->n_1, aux->n);
+ return aux->p;
+}
+
+double kt_fisher_exact(int n11, int n12, int n21, int n22, double *_left, double *_right, double *two)
+{
+ int i, j, max, min;
+ double p, q, left, right;
+ hgacc_t aux;
+ int n1_, n_1, n;
+
+ n1_ = n11 + n12; n_1 = n11 + n21; n = n11 + n12 + n21 + n22; // calculate n1_, n_1 and n
+ max = (n_1 < n1_) ? n_1 : n1_; // max n11, for right tail
+ min = (n1_ + n_1 - n < 0) ? 0 : (n1_ + n_1 - n < 0); // min n11, for left tail
+ *two = *_left = *_right = 1.;
+ if (min == max) return 1.; // no need to do test
+ q = hypergeo_acc(n11, n1_, n_1, n, &aux); // the probability of the current table
+ // left tail
+ p = hypergeo_acc(min, 0, 0, 0, &aux);
+ for (left = 0., i = min + 1; p < 0.99999999 * q; ++i) // loop until underflow
+ left += p, p = hypergeo_acc(i, 0, 0, 0, &aux);
+ --i;
+ if (p < 1.00000001 * q) left += p;
+ else --i;
+ // right tail
+ p = hypergeo_acc(max, 0, 0, 0, &aux);
+ for (right = 0., j = max - 1; p < 0.99999999 * q; --j) // loop until underflow
+ right += p, p = hypergeo_acc(j, 0, 0, 0, &aux);
+ if (p < 1.00000001 * q) right += p;
+ else ++j;
+ // two-tail
+ *two = left + right;
+ if (*two > 1.) *two = 1.;
+ // adjust left and right
+ if (abs(i - n11) < abs(j - n11)) right = 1. - left + q;
+ else left = 1.0 - right + q;
+ *_left = left; *_right = right;
+ return q;
+}
+
+#ifdef FET_MAIN
+#include <stdio.h>
+
+int main(int argc, char *argv[])
+{
+ char id[1024];
+ int n11, n12, n21, n22;
+ double left, right, twotail, prob;
+
+ while (scanf("%s%d%d%d%d", id, &n11, &n12, &n21, &n22) == 5) {
+ prob = kt_fisher_exact(n11, n12, n21, n22, &left, &right, &twotail);
+ printf("%s\t%d\t%d\t%d\t%d\t%.6g\t%.6g\t%.6g\t%.6g\n", id, n11, n12, n21, n22,
+ prob, left, right, twotail);
+ }
+ return 0;
+}
+#endif
--- /dev/null
+#include <assert.h>
+#include <ctype.h>
+#include <sys/stat.h>
+#include "bam_endian.h"
+#include "kstring.h"
+#include "bcf.h"
+#ifdef _USE_KNETFILE
+#include "knetfile.h"
+#endif
+
+#define TAD_LIDX_SHIFT 13
+
+typedef struct {
+ int32_t n, m;
+ uint64_t *offset;
+} bcf_lidx_t;
+
+struct __bcf_idx_t {
+ int32_t n;
+ bcf_lidx_t *index2;
+};
+
+/************
+ * indexing *
+ ************/
+
+static inline void insert_offset2(bcf_lidx_t *index2, int _beg, int _end, uint64_t offset)
+{
+ int i, beg, end;
+ beg = _beg >> TAD_LIDX_SHIFT;
+ end = (_end - 1) >> TAD_LIDX_SHIFT;
+ if (index2->m < end + 1) {
+ int old_m = index2->m;
+ index2->m = end + 1;
+ kroundup32(index2->m);
+ index2->offset = (uint64_t*)realloc(index2->offset, index2->m * 8);
+ memset(index2->offset + old_m, 0, 8 * (index2->m - old_m));
+ }
+ if (beg == end) {
+ if (index2->offset[beg] == 0) index2->offset[beg] = offset;
+ } else {
+ for (i = beg; i <= end; ++i)
+ if (index2->offset[i] == 0) index2->offset[i] = offset;
+ }
+ if (index2->n < end + 1) index2->n = end + 1;
+}
+
+bcf_idx_t *bcf_idx_core(bcf_t *bp, bcf_hdr_t *h)
+{
+ bcf_idx_t *idx;
+ int32_t last_coor, last_tid;
+ uint64_t last_off;
+ kstring_t *str;
+ BGZF *fp = bp->fp;
+ bcf1_t *b;
+ int ret;
+
+ b = calloc(1, sizeof(bcf1_t));
+ str = calloc(1, sizeof(kstring_t));
+ idx = (bcf_idx_t*)calloc(1, sizeof(bcf_idx_t));
+ idx->n = h->n_ref;
+ idx->index2 = calloc(h->n_ref, sizeof(bcf_lidx_t));
+
+ last_tid = 0xffffffffu;
+ last_off = bgzf_tell(fp); last_coor = 0xffffffffu;
+ while ((ret = bcf_read(bp, h, b)) > 0) {
+ int end, tmp;
+ if (last_tid != b->tid) { // change of chromosomes
+ last_tid = b->tid;
+ } else if (last_coor > b->pos) {
+ fprintf(stderr, "[bcf_idx_core] the input is out of order\n");
+ free(str->s); free(str); free(idx); bcf_destroy(b);
+ return 0;
+ }
+ tmp = strlen(b->ref);
+ end = b->pos + (tmp > 0? tmp : 1);
+ insert_offset2(&idx->index2[b->tid], b->pos, end, last_off);
+ last_off = bgzf_tell(fp);
+ last_coor = b->pos;
+ }
+ free(str->s); free(str); bcf_destroy(b);
+ return idx;
+}
+
+void bcf_idx_destroy(bcf_idx_t *idx)
+{
+ int i;
+ if (idx == 0) return;
+ for (i = 0; i < idx->n; ++i) free(idx->index2[i].offset);
+ free(idx->index2);
+ free(idx);
+}
+
+/******************
+ * index file I/O *
+ ******************/
+
+void bcf_idx_save(const bcf_idx_t *idx, BGZF *fp)
+{
+ int32_t i, ti_is_be;
+ ti_is_be = bam_is_big_endian();
+ bgzf_write(fp, "BCI\4", 4);
+ if (ti_is_be) {
+ uint32_t x = idx->n;
+ bgzf_write(fp, bam_swap_endian_4p(&x), 4);
+ } else bgzf_write(fp, &idx->n, 4);
+ for (i = 0; i < idx->n; ++i) {
+ bcf_lidx_t *index2 = idx->index2 + i;
+ // write linear index (index2)
+ if (ti_is_be) {
+ int x = index2->n;
+ bgzf_write(fp, bam_swap_endian_4p(&x), 4);
+ } else bgzf_write(fp, &index2->n, 4);
+ if (ti_is_be) { // big endian
+ int x;
+ for (x = 0; (int)x < index2->n; ++x)
+ bam_swap_endian_8p(&index2->offset[x]);
+ bgzf_write(fp, index2->offset, 8 * index2->n);
+ for (x = 0; (int)x < index2->n; ++x)
+ bam_swap_endian_8p(&index2->offset[x]);
+ } else bgzf_write(fp, index2->offset, 8 * index2->n);
+ }
+}
+
+static bcf_idx_t *bcf_idx_load_core(BGZF *fp)
+{
+ int i, ti_is_be;
+ char magic[4];
+ bcf_idx_t *idx;
+ ti_is_be = bam_is_big_endian();
+ if (fp == 0) {
+ fprintf(stderr, "[%s] fail to load index.\n", __func__);
+ return 0;
+ }
+ bgzf_read(fp, magic, 4);
+ if (strncmp(magic, "BCI\4", 4)) {
+ fprintf(stderr, "[%s] wrong magic number.\n", __func__);
+ return 0;
+ }
+ idx = (bcf_idx_t*)calloc(1, sizeof(bcf_idx_t));
+ bgzf_read(fp, &idx->n, 4);
+ if (ti_is_be) bam_swap_endian_4p(&idx->n);
+ idx->index2 = (bcf_lidx_t*)calloc(idx->n, sizeof(bcf_lidx_t));
+ for (i = 0; i < idx->n; ++i) {
+ bcf_lidx_t *index2 = idx->index2 + i;
+ int j;
+ bgzf_read(fp, &index2->n, 4);
+ if (ti_is_be) bam_swap_endian_4p(&index2->n);
+ index2->m = index2->n;
+ index2->offset = (uint64_t*)calloc(index2->m, 8);
+ bgzf_read(fp, index2->offset, index2->n * 8);
+ if (ti_is_be)
+ for (j = 0; j < index2->n; ++j) bam_swap_endian_8p(&index2->offset[j]);
+ }
+ return idx;
+}
+
+bcf_idx_t *bcf_idx_load_local(const char *fnidx)
+{
+ BGZF *fp;
+ fp = bgzf_open(fnidx, "r");
+ if (fp) {
+ bcf_idx_t *idx = bcf_idx_load_core(fp);
+ bgzf_close(fp);
+ return idx;
+ } else return 0;
+}
+
+#ifdef _USE_KNETFILE
+static void download_from_remote(const char *url)
+{
+ const int buf_size = 1 * 1024 * 1024;
+ char *fn;
+ FILE *fp;
+ uint8_t *buf;
+ knetFile *fp_remote;
+ int l;
+ if (strstr(url, "ftp://") != url && strstr(url, "http://") != url) return;
+ l = strlen(url);
+ for (fn = (char*)url + l - 1; fn >= url; --fn)
+ if (*fn == '/') break;
+ ++fn; // fn now points to the file name
+ fp_remote = knet_open(url, "r");
+ if (fp_remote == 0) {
+ fprintf(stderr, "[download_from_remote] fail to open remote file.\n");
+ return;
+ }
+ if ((fp = fopen(fn, "w")) == 0) {
+ fprintf(stderr, "[download_from_remote] fail to create file in the working directory.\n");
+ knet_close(fp_remote);
+ return;
+ }
+ buf = (uint8_t*)calloc(buf_size, 1);
+ while ((l = knet_read(fp_remote, buf, buf_size)) != 0)
+ fwrite(buf, 1, l, fp);
+ free(buf);
+ fclose(fp);
+ knet_close(fp_remote);
+}
+#else
+static void download_from_remote(const char *url)
+{
+ return;
+}
+#endif
+
+static char *get_local_version(const char *fn)
+{
+ struct stat sbuf;
+ char *fnidx = (char*)calloc(strlen(fn) + 5, 1);
+ strcat(strcpy(fnidx, fn), ".bci");
+ if ((strstr(fnidx, "ftp://") == fnidx || strstr(fnidx, "http://") == fnidx)) {
+ char *p, *url;
+ int l = strlen(fnidx);
+ for (p = fnidx + l - 1; p >= fnidx; --p)
+ if (*p == '/') break;
+ url = fnidx; fnidx = strdup(p + 1);
+ if (stat(fnidx, &sbuf) == 0) {
+ free(url);
+ return fnidx;
+ }
+ fprintf(stderr, "[%s] downloading the index file...\n", __func__);
+ download_from_remote(url);
+ free(url);
+ }
+ if (stat(fnidx, &sbuf) == 0) return fnidx;
+ free(fnidx); return 0;
+}
+
+bcf_idx_t *bcf_idx_load(const char *fn)
+{
+ bcf_idx_t *idx;
+ char *fname = get_local_version(fn);
+ if (fname == 0) return 0;
+ idx = bcf_idx_load_local(fname);
+ free(fname);
+ return idx;
+}
+
+int bcf_idx_build2(const char *fn, const char *_fnidx)
+{
+ char *fnidx;
+ BGZF *fpidx;
+ bcf_t *bp;
+ bcf_idx_t *idx;
+ bcf_hdr_t *h;
+ if ((bp = bcf_open(fn, "r")) == 0) {
+ fprintf(stderr, "[bcf_idx_build2] fail to open the BAM file.\n");
+ return -1;
+ }
+ h = bcf_hdr_read(bp);
+ idx = bcf_idx_core(bp, h);
+ bcf_close(bp);
+ if (_fnidx == 0) {
+ fnidx = (char*)calloc(strlen(fn) + 5, 1);
+ strcpy(fnidx, fn); strcat(fnidx, ".bci");
+ } else fnidx = strdup(_fnidx);
+ fpidx = bgzf_open(fnidx, "w");
+ if (fpidx == 0) {
+ fprintf(stderr, "[bcf_idx_build2] fail to create the index file.\n");
+ free(fnidx);
+ return -1;
+ }
+ bcf_idx_save(idx, fpidx);
+ bcf_idx_destroy(idx);
+ bgzf_close(fpidx);
+ free(fnidx);
+ return 0;
+}
+
+int bcf_idx_build(const char *fn)
+{
+ return bcf_idx_build2(fn, 0);
+}
+
+/********************************************
+ * parse a region in the format chr:beg-end *
+ ********************************************/
+
+int bcf_parse_region(void *str2id, const char *str, int *tid, int *begin, int *end)
+{
+ char *s, *p;
+ int i, l, k;
+ l = strlen(str);
+ p = s = (char*)malloc(l+1);
+ /* squeeze out "," */
+ for (i = k = 0; i != l; ++i)
+ if (str[i] != ',' && !isspace(str[i])) s[k++] = str[i];
+ s[k] = 0;
+ for (i = 0; i != k; ++i) if (s[i] == ':') break;
+ s[i] = 0;
+ if ((*tid = bcf_str2id(str2id, s)) < 0) {
+ free(s);
+ return -1;
+ }
+ if (i == k) { /* dump the whole sequence */
+ *begin = 0; *end = 1<<29; free(s);
+ return 0;
+ }
+ for (p = s + i + 1; i != k; ++i) if (s[i] == '-') break;
+ *begin = atoi(p);
+ if (i < k) {
+ p = s + i + 1;
+ *end = atoi(p);
+ } else *end = 1<<29;
+ if (*begin > 0) --*begin;
+ free(s);
+ if (*begin > *end) return -1;
+ return 0;
+}
+
+/*******************************
+ * retrieve a specified region *
+ *******************************/
+
+uint64_t bcf_idx_query(const bcf_idx_t *idx, int tid, int beg)
+{
+ uint64_t min_off, *offset;
+ int i;
+ if (beg < 0) beg = 0;
+ offset = idx->index2[tid].offset;
+ for (i = beg>>TAD_LIDX_SHIFT; i < idx->index2[tid].n && offset[i] == 0; ++i);
+ min_off = (i == idx->index2[tid].n)? offset[idx->index2[tid].n-1] : offset[i];
+ return min_off;
+}
+
+int bcf_main_index(int argc, char *argv[])
+{
+ if (argc == 1) {
+ fprintf(stderr, "Usage: bcftools index <in.bcf>\n");
+ return 1;
+ }
+ bcf_idx_build(argv[1]);
+ return 0;
+}
--- /dev/null
+#include <math.h>
+
+
+/* Log gamma function
+ * \log{\Gamma(z)}
+ * AS245, 2nd algorithm, http://lib.stat.cmu.edu/apstat/245
+ */
+double kf_lgamma(double z)
+{
+ double x = 0;
+ x += 0.1659470187408462e-06 / (z+7);
+ x += 0.9934937113930748e-05 / (z+6);
+ x -= 0.1385710331296526 / (z+5);
+ x += 12.50734324009056 / (z+4);
+ x -= 176.6150291498386 / (z+3);
+ x += 771.3234287757674 / (z+2);
+ x -= 1259.139216722289 / (z+1);
+ x += 676.5203681218835 / z;
+ x += 0.9999999999995183;
+ return log(x) - 5.58106146679532777 - z + (z-0.5) * log(z+6.5);
+}
+
+/* complementary error function
+ * \frac{2}{\sqrt{\pi}} \int_x^{\infty} e^{-t^2} dt
+ * AS66, 2nd algorithm, http://lib.stat.cmu.edu/apstat/66
+ */
+double kf_erfc(double x)
+{
+ const double p0 = 220.2068679123761;
+ const double p1 = 221.2135961699311;
+ const double p2 = 112.0792914978709;
+ const double p3 = 33.912866078383;
+ const double p4 = 6.37396220353165;
+ const double p5 = .7003830644436881;
+ const double p6 = .03526249659989109;
+ const double q0 = 440.4137358247522;
+ const double q1 = 793.8265125199484;
+ const double q2 = 637.3336333788311;
+ const double q3 = 296.5642487796737;
+ const double q4 = 86.78073220294608;
+ const double q5 = 16.06417757920695;
+ const double q6 = 1.755667163182642;
+ const double q7 = .08838834764831844;
+ double expntl, z, p;
+ z = fabs(x) * M_SQRT2;
+ if (z > 37.) return x > 0.? 0. : 2.;
+ expntl = exp(z * z * - .5);
+ if (z < 10. / M_SQRT2) // for small z
+ p = expntl * ((((((p6 * z + p5) * z + p4) * z + p3) * z + p2) * z + p1) * z + p0)
+ / (((((((q7 * z + q6) * z + q5) * z + q4) * z + q3) * z + q2) * z + q1) * z + q0);
+ else p = expntl / 2.506628274631001 / (z + 1. / (z + 2. / (z + 3. / (z + 4. / (z + .65)))));
+ return x > 0.? 2. * p : 2. * (1. - p);
+}
+
+/* The following computes regularized incomplete gamma functions.
+ * Formulas are taken from Wiki, with additional input from Numerical
+ * Recipes in C (for modified Lentz's algorithm) and AS245
+ * (http://lib.stat.cmu.edu/apstat/245).
+ *
+ * A good online calculator is available at:
+ *
+ * http://www.danielsoper.com/statcalc/calc23.aspx
+ *
+ * It calculates upper incomplete gamma function, which equals
+ * kf_gammaq(s,z)*tgamma(s).
+ */
+
+#define KF_GAMMA_EPS 1e-14
+#define KF_TINY 1e-290
+
+// regularized lower incomplete gamma function, by series expansion
+static double _kf_gammap(double s, double z)
+{
+ double sum, x;
+ int k;
+ for (k = 1, sum = x = 1.; k < 100; ++k) {
+ sum += (x *= z / (s + k));
+ if (x / sum < KF_GAMMA_EPS) break;
+ }
+ return exp(s * log(z) - z - kf_lgamma(s + 1.) + log(sum));
+}
+// regularized upper incomplete gamma function, by continued fraction
+static double _kf_gammaq(double s, double z)
+{
+ int j;
+ double C, D, f;
+ f = 1. + z - s; C = f; D = 0.;
+ // Modified Lentz's algorithm for computing continued fraction
+ // See Numerical Recipes in C, 2nd edition, section 5.2
+ for (j = 1; j < 100; ++j) {
+ double a = j * (s - j), b = (j<<1) + 1 + z - s, d;
+ D = b + a * D;
+ if (D < KF_TINY) D = KF_TINY;
+ C = b + a / C;
+ if (C < KF_TINY) C = KF_TINY;
+ D = 1. / D;
+ d = C * D;
+ f *= d;
+ if (fabs(d - 1.) < KF_GAMMA_EPS) break;
+ }
+ return exp(s * log(z) - z - kf_lgamma(s) - log(f));
+}
+
+double kf_gammap(double s, double z)
+{
+ return z <= 1. || z < s? _kf_gammap(s, z) : 1. - _kf_gammaq(s, z);
+}
+
+double kf_gammaq(double s, double z)
+{
+ return z <= 1. || z < s? 1. - _kf_gammap(s, z) : _kf_gammaq(s, z);
+}
+
+/* Regularized incomplete beta function. The method is taken from
+ * Numerical Recipe in C, 2nd edition, section 6.4. The following web
+ * page calculates the incomplete beta function, which equals
+ * kf_betai(a,b,x) * gamma(a) * gamma(b) / gamma(a+b):
+ *
+ * http://www.danielsoper.com/statcalc/calc36.aspx
+ */
+static double kf_betai_aux(double a, double b, double x)
+{
+ double C, D, f;
+ int j;
+ if (x == 0.) return 0.;
+ if (x == 1.) return 1.;
+ f = 1.; C = f; D = 0.;
+ // Modified Lentz's algorithm for computing continued fraction
+ for (j = 1; j < 200; ++j) {
+ double aa, d;
+ int m = j>>1;
+ aa = (j&1)? -(a + m) * (a + b + m) * x / ((a + 2*m) * (a + 2*m + 1))
+ : m * (b - m) * x / ((a + 2*m - 1) * (a + 2*m));
+ D = 1. + aa * D;
+ if (D < KF_TINY) D = KF_TINY;
+ C = 1. + aa / C;
+ if (C < KF_TINY) C = KF_TINY;
+ D = 1. / D;
+ d = C * D;
+ f *= d;
+ if (fabs(d - 1.) < KF_GAMMA_EPS) break;
+ }
+ return exp(kf_lgamma(a+b) - kf_lgamma(a) - kf_lgamma(b) + a * log(x) + b * log(1.-x)) / a / f;
+}
+double kf_betai(double a, double b, double x)
+{
+ return x < (a + 1.) / (a + b + 2.)? kf_betai_aux(a, b, x) : 1. - kf_betai_aux(b, a, 1. - x);
+}
+
+#ifdef KF_MAIN
+#include <stdio.h>
+int main(int argc, char *argv[])
+{
+ double x = 5.5, y = 3;
+ double a, b;
+ printf("erfc(%lg): %lg, %lg\n", x, erfc(x), kf_erfc(x));
+ printf("upper-gamma(%lg,%lg): %lg\n", x, y, kf_gammaq(y, x)*tgamma(y));
+ a = 2; b = 2; x = 0.5;
+ printf("incomplete-beta(%lg,%lg,%lg): %lg\n", a, b, x, kf_betai(a, b, x) / exp(kf_lgamma(a+b) - kf_lgamma(a) - kf_lgamma(b)));
+ return 0;
+}
+#endif
--- /dev/null
+#include <stdlib.h>
+#include <string.h>
+#include <math.h>
+#include "bcf.h"
+
+static double g_q2p[256];
+
+#define LD_ITER_MAX 50
+#define LD_ITER_EPS 1e-4
+
+#define _G1(h, k) ((h>>1&1) + (k>>1&1))
+#define _G2(h, k) ((h&1) + (k&1))
+
+// 0: the previous site; 1: the current site
+static int freq_iter(int n, double *pdg[2], double f[4])
+{
+ double ff[4];
+ int i, k, h;
+ memset(ff, 0, 4 * sizeof(double));
+ for (i = 0; i < n; ++i) {
+ double *p[2], sum, tmp;
+ p[0] = pdg[0] + i * 3; p[1] = pdg[1] + i * 3;
+ for (k = 0, sum = 0.; k < 4; ++k)
+ for (h = 0; h < 4; ++h)
+ sum += f[k] * f[h] * p[0][_G1(k,h)] * p[1][_G2(k,h)];
+ for (k = 0; k < 4; ++k) {
+ tmp = f[0] * (p[0][_G1(0,k)] * p[1][_G2(0,k)] + p[0][_G1(k,0)] * p[1][_G2(k,0)])
+ + f[1] * (p[0][_G1(1,k)] * p[1][_G2(1,k)] + p[0][_G1(k,1)] * p[1][_G2(k,1)])
+ + f[2] * (p[0][_G1(2,k)] * p[1][_G2(2,k)] + p[0][_G1(k,2)] * p[1][_G2(k,2)])
+ + f[3] * (p[0][_G1(3,k)] * p[1][_G2(3,k)] + p[0][_G1(k,3)] * p[1][_G2(k,3)]);
+ ff[k] += f[k] * tmp / sum;
+ }
+ }
+ for (k = 0; k < 4; ++k) f[k] = ff[k] / (2 * n);
+ return 0;
+}
+
+double bcf_ld_freq(const bcf1_t *b0, const bcf1_t *b1, double f[4])
+{
+ const bcf1_t *b[2];
+ uint8_t *PL[2];
+ int i, j, PL_len[2], n_smpl;
+ double *pdg[2], flast[4], r;
+ // initialize g_q2p if necessary
+ if (g_q2p[0] == 0.)
+ for (i = 0; i < 256; ++i)
+ g_q2p[i] = pow(10., -i / 10.);
+ // initialize others
+ if (b0->n_smpl != b1->n_smpl) return -1; // different number of samples
+ n_smpl = b0->n_smpl;
+ b[0] = b0; b[1] = b1;
+ f[0] = f[1] = f[2] = f[3] = -1.;
+ if (b[0]->n_alleles < 2 || b[1]->n_alleles < 2) return -1; // one allele only
+ // set PL and PL_len
+ for (j = 0; j < 2; ++j) {
+ const bcf1_t *bj = b[j];
+ for (i = 0; i < bj->n_gi; ++i) {
+ if (bj->gi[i].fmt == bcf_str2int("PL", 2)) {
+ PL[j] = (uint8_t*)bj->gi[i].data;
+ PL_len[j] = bj->gi[i].len;
+ break;
+ }
+ }
+ if (i == bj->n_gi) return -1; // no PL
+ }
+ // fill pdg[2]
+ pdg[0] = malloc(3 * n_smpl * sizeof(double));
+ pdg[1] = malloc(3 * n_smpl * sizeof(double));
+ for (j = 0; j < 2; ++j) {
+ for (i = 0; i < n_smpl; ++i) {
+ const uint8_t *pi = PL[j] + i * PL_len[j];
+ double *p = pdg[j] + i * 3;
+ p[0] = g_q2p[pi[b[j]->n_alleles]]; p[1] = g_q2p[pi[1]]; p[2] = g_q2p[pi[0]];
+ }
+ }
+ // iteration
+ f[0] = f[1] = f[2] = f[3] = 0.25; // this is a really bad guess...
+ for (j = 0; j < LD_ITER_MAX; ++j) {
+ double eps = 0;
+ memcpy(flast, f, 4 * sizeof(double));
+ freq_iter(n_smpl, pdg, f);
+ for (i = 0; i < 4; ++i) {
+ double x = fabs(f[i] - flast[i]);
+ if (x > eps) eps = x;
+ }
+ if (eps < LD_ITER_EPS) break;
+ }
+ // free
+ free(pdg[0]); free(pdg[1]);
+ { // calculate r^2
+ double p[2], q[2], D;
+ p[0] = f[0] + f[1]; q[0] = 1 - p[0];
+ p[1] = f[0] + f[2]; q[1] = 1 - p[1];
+ D = f[0] * f[3] - f[1] * f[2];
+ r = sqrt(D * D / (p[0] * p[1] * q[0] * q[1]));
+ // fprintf(stderr, "R(%lf,%lf,%lf,%lf)=%lf\n", f[0], f[1], f[2], f[3], r2);
+ if (isnan(r)) r = -1.;
+ }
+ return r;
+}
--- /dev/null
+#include <string.h>
+#include <stdlib.h>
+#include <sys/stat.h>
+#include "bcf.h"
+
+int bcfview(int argc, char *argv[]);
+int bcf_main_index(int argc, char *argv[]);
+
+#define BUF_SIZE 0x10000
+
+int bcf_cat(int n, char * const *fn)
+{
+ int i;
+ bcf_t *out;
+ uint8_t *buf;
+ buf = malloc(BUF_SIZE);
+ out = bcf_open("-", "w");
+ for (i = 0; i < n; ++i) {
+ bcf_t *in;
+ bcf_hdr_t *h;
+ off_t end;
+ struct stat s;
+ in = bcf_open(fn[i], "r");
+ h = bcf_hdr_read(in);
+ if (i == 0) bcf_hdr_write(out, h);
+ bcf_hdr_destroy(h);
+#ifdef _USE_KNETFILE
+ fstat(knet_fileno(in->fp->x.fpr), &s);
+ end = s.st_size - 28;
+ while (knet_tell(in->fp->x.fpr) < end) {
+ int size = knet_tell(in->fp->x.fpr) + BUF_SIZE < end? BUF_SIZE : end - knet_tell(in->fp->x.fpr);
+ knet_read(in->fp->x.fpr, buf, size);
+ fwrite(buf, 1, size, out->fp->x.fpw);
+ }
+#else
+ abort(); // FIXME: not implemented
+#endif
+ bcf_close(in);
+ }
+ bcf_close(out);
+ free(buf);
+ return 0;
+}
+
+int main(int argc, char *argv[])
+{
+ if (argc == 1) {
+ fprintf(stderr, "\n");
+ fprintf(stderr, "Usage: bcftools <command> <arguments>\n\n");
+ fprintf(stderr, "Command: view print, extract, convert and call SNPs from BCF\n");
+ fprintf(stderr, " index index BCF\n");
+ fprintf(stderr, " cat concatenate BCFs\n");
+ fprintf(stderr, "\n");
+ return 1;
+ }
+ if (strcmp(argv[1], "view") == 0) return bcfview(argc-1, argv+1);
+ else if (strcmp(argv[1], "index") == 0) return bcf_main_index(argc-1, argv+1);
+ else if (strcmp(argv[1], "cat") == 0) return bcf_cat(argc-2, argv+2);
+ else {
+ fprintf(stderr, "[main] Unrecognized command.\n");
+ return 1;
+ }
+ return 0;
+}
--- /dev/null
+#include <math.h>
+#include <stdlib.h>
+#include <string.h>
+#include <stdio.h>
+#include <errno.h>
+#include "prob1.h"
+
+#include "kseq.h"
+KSTREAM_INIT(gzFile, gzread, 16384)
+
+#define MC_MAX_EM_ITER 16
+#define MC_EM_EPS 1e-4
+#define MC_DEF_INDEL 0.15
+
+unsigned char seq_nt4_table[256] = {
+ 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
+ 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
+ 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4 /*'-'*/, 4, 4,
+ 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
+ 4, 0, 4, 1, 4, 4, 4, 2, 4, 4, 4, 4, 4, 4, 4, 4,
+ 4, 4, 4, 4, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
+ 4, 0, 4, 1, 4, 4, 4, 2, 4, 4, 4, 4, 4, 4, 4, 4,
+ 4, 4, 4, 4, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
+ 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
+ 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
+ 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
+ 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
+ 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
+ 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
+ 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
+ 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4
+};
+
+struct __bcf_p1aux_t {
+ int n, M, n1, is_indel, is_folded;
+ double *q2p, *pdg; // pdg -> P(D|g)
+ double *phi, *phi_indel;
+ double *z, *zswap; // aux for afs
+ double *z1, *z2, *phi1, *phi2; // only calculated when n1 is set
+ double t, t1, t2;
+ double *afs, *afs1; // afs: accumulative AFS; afs1: site posterior distribution
+ const uint8_t *PL; // point to PL
+ int PL_len;
+};
+
+static void fold_array(int M, double *x)
+{
+ int k;
+ for (k = 0; k < M/2; ++k)
+ x[k] = x[M-k] = (x[k] + x[M-k]) / 2.;
+}
+
+void bcf_p1_indel_prior(bcf_p1aux_t *ma, double x)
+{
+ int i;
+ for (i = 0; i < ma->M; ++i)
+ ma->phi_indel[i] = ma->phi[i] * x;
+ ma->phi_indel[ma->M] = 1. - ma->phi[ma->M] * x;
+}
+
+static void init_prior(int type, double theta, int M, double *phi)
+{
+ int i;
+ if (type == MC_PTYPE_COND2) {
+ for (i = 0; i <= M; ++i)
+ phi[i] = 2. * (i + 1) / (M + 1) / (M + 2);
+ } else if (type == MC_PTYPE_FLAT) {
+ for (i = 0; i <= M; ++i)
+ phi[i] = 1. / (M + 1);
+ } else {
+ double sum;
+ for (i = 0, sum = 0.; i < M; ++i)
+ sum += (phi[i] = theta / (M - i));
+ phi[M] = 1. - sum;
+ }
+}
+
+void bcf_p1_init_prior(bcf_p1aux_t *ma, int type, double theta)
+{
+ init_prior(type, theta, ma->M, ma->phi);
+ bcf_p1_indel_prior(ma, MC_DEF_INDEL);
+}
+
+void bcf_p1_init_subprior(bcf_p1aux_t *ma, int type, double theta)
+{
+ if (ma->n1 <= 0 || ma->n1 >= ma->M) return;
+ init_prior(type, theta, 2*ma->n1, ma->phi1);
+ init_prior(type, theta, 2*(ma->n - ma->n1), ma->phi2);
+}
+
+int bcf_p1_read_prior(bcf_p1aux_t *ma, const char *fn)
+{
+ gzFile fp;
+ kstring_t s;
+ kstream_t *ks;
+ long double sum;
+ int dret, k;
+ memset(&s, 0, sizeof(kstring_t));
+ fp = strcmp(fn, "-")? gzopen(fn, "r") : gzdopen(fileno(stdin), "r");
+ ks = ks_init(fp);
+ memset(ma->phi, 0, sizeof(double) * (ma->M + 1));
+ while (ks_getuntil(ks, '\n', &s, &dret) >= 0) {
+ if (strstr(s.s, "[afs] ") == s.s) {
+ char *p = s.s + 6;
+ for (k = 0; k <= ma->M; ++k) {
+ int x;
+ double y;
+ x = strtol(p, &p, 10);
+ if (x != k && (errno == EINVAL || errno == ERANGE)) return -1;
+ ++p;
+ y = strtod(p, &p);
+ if (y == 0. && (errno == EINVAL || errno == ERANGE)) return -1;
+ ma->phi[ma->M - k] += y;
+ }
+ }
+ }
+ ks_destroy(ks);
+ gzclose(fp);
+ free(s.s);
+ for (sum = 0., k = 0; k <= ma->M; ++k) sum += ma->phi[k];
+ fprintf(stderr, "[prior]");
+ for (k = 0; k <= ma->M; ++k) ma->phi[k] /= sum;
+ for (k = 0; k <= ma->M; ++k) fprintf(stderr, " %d:%.3lg", k, ma->phi[ma->M - k]);
+ fputc('\n', stderr);
+ for (sum = 0., k = 1; k < ma->M; ++k) sum += ma->phi[ma->M - k] * (2.* k * (ma->M - k) / ma->M / (ma->M - 1));
+ fprintf(stderr, "[%s] heterozygosity=%lf, ", __func__, (double)sum);
+ for (sum = 0., k = 1; k <= ma->M; ++k) sum += k * ma->phi[ma->M - k] / ma->M;
+ fprintf(stderr, "theta=%lf\n", (double)sum);
+ bcf_p1_indel_prior(ma, MC_DEF_INDEL);
+ return 0;
+}
+
+bcf_p1aux_t *bcf_p1_init(int n)
+{
+ bcf_p1aux_t *ma;
+ int i;
+ ma = calloc(1, sizeof(bcf_p1aux_t));
+ ma->n1 = -1;
+ ma->n = n; ma->M = 2 * n;
+ ma->q2p = calloc(256, sizeof(double));
+ ma->pdg = calloc(3 * ma->n, sizeof(double));
+ ma->phi = calloc(ma->M + 1, sizeof(double));
+ ma->phi_indel = calloc(ma->M + 1, sizeof(double));
+ ma->phi1 = calloc(ma->M + 1, sizeof(double));
+ ma->phi2 = calloc(ma->M + 1, sizeof(double));
+ ma->z = calloc(2 * ma->n + 1, sizeof(double));
+ ma->zswap = calloc(2 * ma->n + 1, sizeof(double));
+ ma->z1 = calloc(ma->M + 1, sizeof(double)); // actually we do not need this large
+ ma->z2 = calloc(ma->M + 1, sizeof(double));
+ ma->afs = calloc(2 * ma->n + 1, sizeof(double));
+ ma->afs1 = calloc(2 * ma->n + 1, sizeof(double));
+ for (i = 0; i < 256; ++i)
+ ma->q2p[i] = pow(10., -i / 10.);
+ bcf_p1_init_prior(ma, MC_PTYPE_FULL, 1e-3); // the simplest prior
+ return ma;
+}
+
+int bcf_p1_set_n1(bcf_p1aux_t *b, int n1)
+{
+ if (n1 == 0 || n1 >= b->n) return -1;
+ b->n1 = n1;
+ return 0;
+}
+
+void bcf_p1_set_folded(bcf_p1aux_t *p1a)
+{
+ if (p1a->n1 < 0) {
+ p1a->is_folded = 1;
+ fold_array(p1a->M, p1a->phi);
+ fold_array(p1a->M, p1a->phi_indel);
+ }
+}
+
+void bcf_p1_destroy(bcf_p1aux_t *ma)
+{
+ if (ma) {
+ free(ma->q2p); free(ma->pdg);
+ free(ma->phi); free(ma->phi_indel); free(ma->phi1); free(ma->phi2);
+ free(ma->z); free(ma->zswap); free(ma->z1); free(ma->z2);
+ free(ma->afs); free(ma->afs1);
+ free(ma);
+ }
+}
+
+static int cal_pdg(const bcf1_t *b, bcf_p1aux_t *ma)
+{
+ int i, j, k;
+ long *p, tmp;
+ p = alloca(b->n_alleles * sizeof(long));
+ memset(p, 0, sizeof(long) * b->n_alleles);
+ for (j = 0; j < ma->n; ++j) {
+ const uint8_t *pi = ma->PL + j * ma->PL_len;
+ double *pdg = ma->pdg + j * 3;
+ pdg[0] = ma->q2p[pi[b->n_alleles]]; pdg[1] = ma->q2p[pi[1]]; pdg[2] = ma->q2p[pi[0]];
+ for (i = k = 0; i < b->n_alleles; ++i) {
+ p[i] += (int)pi[k];
+ k += b->n_alleles - i;
+ }
+ }
+ for (i = 0; i < b->n_alleles; ++i) p[i] = p[i]<<4 | i;
+ for (i = 1; i < b->n_alleles; ++i) // insertion sort
+ for (j = i; j > 0 && p[j] < p[j-1]; --j)
+ tmp = p[j], p[j] = p[j-1], p[j-1] = tmp;
+ for (i = b->n_alleles - 1; i >= 0; --i)
+ if ((p[i]&0xf) == 0) break;
+ return i;
+}
+// f0 is the reference allele frequency
+static double mc_freq_iter(double f0, const bcf_p1aux_t *ma)
+{
+ double f, f3[3];
+ int i;
+ f3[0] = (1.-f0)*(1.-f0); f3[1] = 2.*f0*(1.-f0); f3[2] = f0*f0;
+ for (i = 0, f = 0.; i < ma->n; ++i) {
+ double *pdg;
+ pdg = ma->pdg + i * 3;
+ f += (pdg[1] * f3[1] + 2. * pdg[2] * f3[2])
+ / (pdg[0] * f3[0] + pdg[1] * f3[1] + pdg[2] * f3[2]);
+ }
+ f /= ma->n * 2.;
+ return f;
+}
+
+int bcf_p1_call_gt(const bcf_p1aux_t *ma, double f0, int k)
+{
+ double sum, g[3];
+ double max, f3[3], *pdg = ma->pdg + k * 3;
+ int q, i, max_i;
+ f3[0] = (1.-f0)*(1.-f0); f3[1] = 2.*f0*(1.-f0); f3[2] = f0*f0;
+ for (i = 0, sum = 0.; i < 3; ++i)
+ sum += (g[i] = pdg[i] * f3[i]);
+ for (i = 0, max = -1., max_i = 0; i < 3; ++i) {
+ g[i] /= sum;
+ if (g[i] > max) max = g[i], max_i = i;
+ }
+ max = 1. - max;
+ if (max < 1e-308) max = 1e-308;
+ q = (int)(-4.343 * log(max) + .499);
+ if (q > 99) q = 99;
+ return q<<2|max_i;
+}
+
+#define TINY 1e-20
+
+static void mc_cal_y_core(bcf_p1aux_t *ma, int beg)
+{
+ double *z[2], *tmp, *pdg;
+ int _j, last_min, last_max;
+ z[0] = ma->z;
+ z[1] = ma->zswap;
+ pdg = ma->pdg;
+ memset(z[0], 0, sizeof(double) * (ma->M + 1));
+ memset(z[1], 0, sizeof(double) * (ma->M + 1));
+ z[0][0] = 1.;
+ last_min = last_max = 0;
+ ma->t = 0.;
+ for (_j = beg; _j < ma->n; ++_j) {
+ int k, j = _j - beg, _min = last_min, _max = last_max;
+ double p[3], sum;
+ pdg = ma->pdg + _j * 3;
+ p[0] = pdg[0]; p[1] = 2. * pdg[1]; p[2] = pdg[2];
+ for (; _min < _max && z[0][_min] < TINY; ++_min) z[0][_min] = z[1][_min] = 0.;
+ for (; _max > _min && z[0][_max] < TINY; --_max) z[0][_max] = z[1][_max] = 0.;
+ _max += 2;
+ if (_min == 0)
+ k = 0, z[1][k] = (2*j+2-k)*(2*j-k+1) * p[0] * z[0][k];
+ if (_min <= 1)
+ k = 1, z[1][k] = (2*j+2-k)*(2*j-k+1) * p[0] * z[0][k] + k*(2*j+2-k) * p[1] * z[0][k-1];
+ for (k = _min < 2? 2 : _min; k <= _max; ++k)
+ z[1][k] = (2*j+2-k)*(2*j-k+1) * p[0] * z[0][k]
+ + k*(2*j+2-k) * p[1] * z[0][k-1]
+ + k*(k-1)* p[2] * z[0][k-2];
+ for (k = _min, sum = 0.; k <= _max; ++k) sum += z[1][k];
+ ma->t += log(sum / ((2. * j + 2) * (2. * j + 1)));
+ for (k = _min; k <= _max; ++k) z[1][k] /= sum;
+ if (_min >= 1) z[1][_min-1] = 0.;
+ if (_min >= 2) z[1][_min-2] = 0.;
+ if (j < ma->n - 1) z[1][_max+1] = z[1][_max+2] = 0.;
+ if (_j == ma->n1 - 1) { // set pop1
+ ma->t1 = ma->t;
+ memcpy(ma->z1, z[1], sizeof(double) * (ma->n1 * 2 + 1));
+ }
+ tmp = z[0]; z[0] = z[1]; z[1] = tmp;
+ last_min = _min; last_max = _max;
+ }
+ if (z[0] != ma->z) memcpy(ma->z, z[0], sizeof(double) * (ma->M + 1));
+}
+
+static void mc_cal_y(bcf_p1aux_t *ma)
+{
+ if (ma->n1 > 0 && ma->n1 < ma->n) {
+ int k;
+ long double x;
+ memset(ma->z1, 0, sizeof(double) * (2 * ma->n1 + 1));
+ memset(ma->z2, 0, sizeof(double) * (2 * (ma->n - ma->n1) + 1));
+ ma->t1 = ma->t2 = 0.;
+ mc_cal_y_core(ma, ma->n1);
+ ma->t2 = ma->t;
+ memcpy(ma->z2, ma->z, sizeof(double) * (2 * (ma->n - ma->n1) + 1));
+ mc_cal_y_core(ma, 0);
+ // rescale z
+ x = expl(ma->t - (ma->t1 + ma->t2));
+ for (k = 0; k <= ma->M; ++k) ma->z[k] *= x;
+ } else mc_cal_y_core(ma, 0);
+}
+
+static void contrast(bcf_p1aux_t *ma, double pc[4]) // mc_cal_y() must be called before hand
+{
+ int k, n1 = ma->n1, n2 = ma->n - ma->n1;
+ long double sum1, sum2;
+ pc[0] = pc[1] = pc[2] = pc[3] = -1.;
+ if (n1 <= 0 || n2 <= 0) return;
+ for (k = 0, sum1 = 0.; k <= 2*n1; ++k) sum1 += ma->phi1[k] * ma->z1[k];
+ for (k = 0, sum2 = 0.; k <= 2*n2; ++k) sum2 += ma->phi2[k] * ma->z2[k];
+ pc[2] = ma->phi1[2*n1] * ma->z1[2*n1] / sum1;
+ pc[3] = ma->phi2[2*n2] * ma->z2[2*n2] / sum2;
+ for (k = 2; k < 4; ++k) {
+ pc[k] = pc[k] > .5? -(-4.343 * log(1. - pc[k] + TINY) + .499) : -4.343 * log(pc[k] + TINY) + .499;
+ pc[k] = (int)pc[k];
+ if (pc[k] > 99) pc[k] = 99;
+ if (pc[k] < -99) pc[k] = -99;
+ }
+ pc[0] = ma->phi2[2*n2] * ma->z2[2*n2] / sum2 * (1. - ma->phi1[2*n1] * ma->z1[2*n1] / sum1);
+ pc[1] = ma->phi1[2*n1] * ma->z1[2*n1] / sum1 * (1. - ma->phi2[2*n2] * ma->z2[2*n2] / sum2);
+ pc[0] = pc[0] == 1.? 99 : (int)(-4.343 * log(1. - pc[0]) + .499);
+ pc[1] = pc[1] == 1.? 99 : (int)(-4.343 * log(1. - pc[1]) + .499);
+}
+
+static double mc_cal_afs(bcf_p1aux_t *ma)
+{
+ int k;
+ long double sum = 0.;
+ double *phi = ma->is_indel? ma->phi_indel : ma->phi;
+ memset(ma->afs1, 0, sizeof(double) * (ma->M + 1));
+ mc_cal_y(ma);
+ for (k = 0, sum = 0.; k <= ma->M; ++k)
+ sum += (long double)phi[k] * ma->z[k];
+ for (k = 0; k <= ma->M; ++k) {
+ ma->afs1[k] = phi[k] * ma->z[k] / sum;
+ if (isnan(ma->afs1[k]) || isinf(ma->afs1[k])) return -1.;
+ }
+ for (k = 0, sum = 0.; k <= ma->M; ++k) {
+ ma->afs[k] += ma->afs1[k];
+ sum += k * ma->afs1[k];
+ }
+ return sum / ma->M;
+}
+
+long double bcf_p1_cal_g3(bcf_p1aux_t *p1a, double g[3])
+{
+ long double pd = 0., g2[3];
+ int i, k;
+ memset(g2, 0, sizeof(long double) * 3);
+ for (k = 0; k < p1a->M; ++k) {
+ double f = (double)k / p1a->M, f3[3], g1[3];
+ long double z = 1.;
+ g1[0] = g1[1] = g1[2] = 0.;
+ f3[0] = (1. - f) * (1. - f); f3[1] = 2. * f * (1. - f); f3[2] = f * f;
+ for (i = 0; i < p1a->n; ++i) {
+ double *pdg = p1a->pdg + i * 3;
+ double x = pdg[0] * f3[0] + pdg[1] * f3[1] + pdg[2] * f3[2];
+ z *= x;
+ g1[0] += pdg[0] * f3[0] / x;
+ g1[1] += pdg[1] * f3[1] / x;
+ g1[2] += pdg[2] * f3[2] / x;
+ }
+ pd += p1a->phi[k] * z;
+ for (i = 0; i < 3; ++i)
+ g2[i] += p1a->phi[k] * z * g1[i];
+ }
+ for (i = 0; i < 3; ++i) g[i] = g2[i] / pd;
+ return pd;
+}
+
+int bcf_p1_cal(bcf1_t *b, bcf_p1aux_t *ma, bcf_p1rst_t *rst)
+{
+ int i, k;
+ long double sum = 0.;
+ ma->is_indel = bcf_is_indel(b);
+ // set PL and PL_len
+ for (i = 0; i < b->n_gi; ++i) {
+ if (b->gi[i].fmt == bcf_str2int("PL", 2)) {
+ ma->PL = (uint8_t*)b->gi[i].data;
+ ma->PL_len = b->gi[i].len;
+ break;
+ }
+ }
+ if (b->n_alleles < 2) return -1; // FIXME: find a better solution
+ //
+ rst->rank0 = cal_pdg(b, ma);
+ rst->f_exp = mc_cal_afs(ma);
+ rst->p_ref = ma->is_folded? ma->afs1[ma->M] + ma->afs1[0] : ma->afs1[ma->M];
+ // calculate f_flat and f_em
+ for (k = 0, sum = 0.; k <= ma->M; ++k)
+ sum += (long double)ma->z[k];
+ rst->f_flat = 0.;
+ for (k = 0; k <= ma->M; ++k) {
+ double p = ma->z[k] / sum;
+ rst->f_flat += k * p;
+ }
+ rst->f_flat /= ma->M;
+ { // calculate f_em
+ double flast = rst->f_flat;
+ for (i = 0; i < MC_MAX_EM_ITER; ++i) {
+ rst->f_em = mc_freq_iter(flast, ma);
+ if (fabs(rst->f_em - flast) < MC_EM_EPS) break;
+ flast = rst->f_em;
+ }
+ }
+ { // estimate equal-tail credible interval (95% level)
+ int l, h;
+ double p;
+ for (i = 0, p = 0.; i < ma->M; ++i)
+ if (p + ma->afs1[i] > 0.025) break;
+ else p += ma->afs1[i];
+ l = i;
+ for (i = ma->M-1, p = 0.; i >= 0; --i)
+ if (p + ma->afs1[i] > 0.025) break;
+ else p += ma->afs1[i];
+ h = i;
+ rst->cil = (double)(ma->M - h) / ma->M; rst->cih = (double)(ma->M - l) / ma->M;
+ }
+ rst->g[0] = rst->g[1] = rst->g[2] = -1.;
+ contrast(ma, rst->pc);
+ return 0;
+}
+
+void bcf_p1_dump_afs(bcf_p1aux_t *ma)
+{
+ int k;
+ if (ma->is_folded) fold_array(ma->M, ma->afs);
+ fprintf(stderr, "[afs]");
+ for (k = 0; k <= ma->M; ++k)
+ fprintf(stderr, " %d:%.3lf", k, ma->afs[ma->M - k]);
+ fprintf(stderr, "\n");
+ memset(ma->afs, 0, sizeof(double) * (ma->M + 1));
+}
--- /dev/null
+#ifndef BCF_PROB1_H
+#define BCF_PROB1_H
+
+#include "bcf.h"
+
+struct __bcf_p1aux_t;
+typedef struct __bcf_p1aux_t bcf_p1aux_t;
+
+typedef struct {
+ int rank0;
+ double f_em, f_exp, f_flat, p_ref;
+ double cil, cih;
+ double pc[4];
+ double g[3];
+} bcf_p1rst_t;
+
+#define MC_PTYPE_FULL 1
+#define MC_PTYPE_COND2 2
+#define MC_PTYPE_FLAT 3
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+ bcf_p1aux_t *bcf_p1_init(int n);
+ void bcf_p1_init_prior(bcf_p1aux_t *ma, int type, double theta);
+ void bcf_p1_init_subprior(bcf_p1aux_t *ma, int type, double theta);
+ void bcf_p1_destroy(bcf_p1aux_t *ma);
+ int bcf_p1_cal(bcf1_t *b, bcf_p1aux_t *ma, bcf_p1rst_t *rst);
+ int bcf_p1_call_gt(const bcf_p1aux_t *ma, double f0, int k);
+ void bcf_p1_dump_afs(bcf_p1aux_t *ma);
+ int bcf_p1_read_prior(bcf_p1aux_t *ma, const char *fn);
+ long double bcf_p1_cal_g3(bcf_p1aux_t *p1a, double g[3]);
+ int bcf_p1_set_n1(bcf_p1aux_t *b, int n1);
+ void bcf_p1_set_folded(bcf_p1aux_t *p1a); // only effective when set_n1() is not called
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
--- /dev/null
+#include <zlib.h>
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+#include "bcf.h"
+#include "kstring.h"
+#include "kseq.h"
+KSTREAM_INIT(gzFile, gzread, 4096)
+
+typedef struct {
+ gzFile fp;
+ FILE *fpout;
+ kstream_t *ks;
+ void *refhash;
+ kstring_t line;
+ int max_ref;
+} vcf_t;
+
+bcf_hdr_t *vcf_hdr_read(bcf_t *bp)
+{
+ kstring_t meta, smpl;
+ int dret;
+ vcf_t *v;
+ bcf_hdr_t *h;
+ if (!bp->is_vcf) return bcf_hdr_read(bp);
+ h = calloc(1, sizeof(bcf_hdr_t));
+ v = (vcf_t*)bp->v;
+ v->line.l = 0;
+ memset(&meta, 0, sizeof(kstring_t));
+ memset(&smpl, 0, sizeof(kstring_t));
+ while (ks_getuntil(v->ks, '\n', &v->line, &dret) >= 0) {
+ if (v->line.l < 2) continue;
+ if (v->line.s[0] != '#') return 0; // no sample line
+ if (v->line.s[0] == '#' && v->line.s[1] == '#') {
+ kputsn(v->line.s, v->line.l, &meta); kputc('\n', &meta);
+ } else if (v->line.s[0] == '#') {
+ int k;
+ ks_tokaux_t aux;
+ char *p;
+ for (p = kstrtok(v->line.s, "\t\n", &aux), k = 0; p; p = kstrtok(0, 0, &aux), ++k) {
+ if (k >= 9) {
+ kputsn(p, aux.p - p, &smpl);
+ kputc('\0', &smpl);
+ }
+ }
+ break;
+ }
+ }
+ kputc('\0', &meta);
+ h->name = 0;
+ h->sname = smpl.s; h->l_smpl = smpl.l;
+ h->txt = meta.s; h->l_txt = meta.l;
+ bcf_hdr_sync(h);
+ return h;
+}
+
+bcf_t *vcf_open(const char *fn, const char *mode)
+{
+ bcf_t *bp;
+ vcf_t *v;
+ if (strchr(mode, 'b')) return bcf_open(fn, mode);
+ bp = calloc(1, sizeof(bcf_t));
+ v = calloc(1, sizeof(vcf_t));
+ bp->is_vcf = 1;
+ bp->v = v;
+ v->refhash = bcf_str2id_init();
+ if (strchr(mode, 'r')) {
+ v->fp = strcmp(fn, "-")? gzopen(fn, "r") : gzdopen(fileno(stdin), "r");
+ v->ks = ks_init(v->fp);
+ } else if (strchr(mode, 'w'))
+ v->fpout = strcmp(fn, "-")? fopen(fn, "w") : stdout;
+ return bp;
+}
+
+int vcf_close(bcf_t *bp)
+{
+ vcf_t *v;
+ if (bp == 0) return -1;
+ if (!bp->is_vcf) return bcf_close(bp);
+ v = (vcf_t*)bp->v;
+ if (v->fp) {
+ ks_destroy(v->ks);
+ gzclose(v->fp);
+ }
+ if (v->fpout) fclose(v->fpout);
+ free(v->line.s);
+ bcf_str2id_destroy(v->refhash);
+ free(v);
+ free(bp);
+ return 0;
+}
+
+int vcf_hdr_write(bcf_t *bp, const bcf_hdr_t *h)
+{
+ vcf_t *v = (vcf_t*)bp->v;
+ int i, has_ref = 0, has_ver = 0;
+ if (!bp->is_vcf) return bcf_hdr_write(bp, h);
+ if (h->l_txt > 0) {
+ if (strstr(h->txt, "##fileformat=")) has_ver = 1;
+ if (has_ver == 0) fprintf(v->fpout, "##fileformat=VCFv4.0\n");
+ fwrite(h->txt, 1, h->l_txt - 1, v->fpout);
+ if (strstr(h->txt, "##SQ=")) has_ref = 1;
+ }
+ if (has_ver == 0) fprintf(v->fpout, "##fileformat=VCFv4.0\n");
+ fprintf(v->fpout, "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT");
+ for (i = 0; i < h->n_smpl; ++i)
+ fprintf(v->fpout, "\t%s", h->sns[i]);
+ fputc('\n', v->fpout);
+ return 0;
+}
+
+int vcf_write(bcf_t *bp, bcf_hdr_t *h, bcf1_t *b)
+{
+ vcf_t *v = (vcf_t*)bp->v;
+ extern void bcf_fmt_core(const bcf_hdr_t *h, bcf1_t *b, kstring_t *s);
+ if (!bp->is_vcf) return bcf_write(bp, h, b);
+ bcf_fmt_core(h, b, &v->line);
+ fwrite(v->line.s, 1, v->line.l, v->fpout);
+ fputc('\n', v->fpout);
+ return v->line.l + 1;
+}
+
+int vcf_read(bcf_t *bp, bcf_hdr_t *h, bcf1_t *b)
+{
+ int dret, k, i, sync = 0;
+ vcf_t *v = (vcf_t*)bp->v;
+ char *p, *q;
+ kstring_t str, rn;
+ ks_tokaux_t aux, a2;
+ if (!bp->is_vcf) return bcf_read(bp, h, b);
+ v->line.l = 0;
+ str.l = 0; str.m = b->m_str; str.s = b->str;
+ rn.l = rn.m = h->l_nm; rn.s = h->name;
+ if (ks_getuntil(v->ks, '\n', &v->line, &dret) < 0) return -1;
+ b->n_smpl = h->n_smpl;
+ for (p = kstrtok(v->line.s, "\t", &aux), k = 0; p; p = kstrtok(0, 0, &aux), ++k) {
+ *(char*)aux.p = 0;
+ if (k == 0) { // ref
+ int tid = bcf_str2id(v->refhash, p);
+ if (tid < 0) {
+ tid = bcf_str2id_add(v->refhash, p);
+ kputs(p, &rn); kputc('\0', &rn);
+ sync = 1;
+ }
+ b->tid = tid;
+ } else if (k == 1) { // pos
+ b->pos = atoi(p) - 1;
+ } else if (k == 5) { // qual
+ b->qual = (p[0] >= '0' && p[0] <= '9')? atof(p) : 0;
+ } else if (k <= 8) { // variable length strings
+ kputs(p, &str); kputc('\0', &str);
+ b->l_str = str.l; b->m_str = str.m; b->str = str.s;
+ if (k == 8) bcf_sync(b);
+ } else { // k > 9
+ if (strncmp(p, "./.", 3) == 0) {
+ for (i = 0; i < b->n_gi; ++i) {
+ if (b->gi[i].fmt == bcf_str2int("GT", 2)) {
+ ((uint8_t*)b->gi[i].data)[k-9] = 1<<7;
+ } else if (b->gi[i].fmt == bcf_str2int("GQ", 2) || b->gi[i].fmt == bcf_str2int("SP", 2)) {
+ ((uint8_t*)b->gi[i].data)[k-9] = 0;
+ } else if (b->gi[i].fmt == bcf_str2int("DP", 2)) {
+ ((uint16_t*)b->gi[i].data)[k-9] = 0;
+ } else if (b->gi[i].fmt == bcf_str2int("PL", 2)) {
+ int y = b->n_alleles * (b->n_alleles + 1) / 2;
+ memset((uint8_t*)b->gi[i].data + (k - 9) * y, 0, y);
+ } else if (b->gi[i].fmt == bcf_str2int("GL", 2)) {
+ int y = b->n_alleles * (b->n_alleles + 1) / 2;
+ memset((float*)b->gi[i].data + (k - 9) * y, 0, y * 4);
+ }
+ }
+ goto endblock;
+ }
+ for (q = kstrtok(p, ":", &a2), i = 0; q && i < b->n_gi; q = kstrtok(0, 0, &a2), ++i) {
+ if (b->gi[i].fmt == bcf_str2int("GT", 2)) {
+ ((uint8_t*)b->gi[i].data)[k-9] = (q[0] - '0')<<3 | (q[2] - '0') | (q[1] == '/'? 0 : 1) << 6;
+ } else if (b->gi[i].fmt == bcf_str2int("GQ", 2) || b->gi[i].fmt == bcf_str2int("SP", 2)) {
+ double _x = strtod(q, &q);
+ int x = (int)(_x + .499);
+ if (x > 255) x = 255;
+ ((uint8_t*)b->gi[i].data)[k-9] = x;
+ } else if (b->gi[i].fmt == bcf_str2int("DP", 2)) {
+ int x = strtol(q, &q, 10);
+ if (x > 0xffff) x = 0xffff;
+ ((uint16_t*)b->gi[i].data)[k-9] = x;
+ } else if (b->gi[i].fmt == bcf_str2int("PL", 2)) {
+ int x, y, j;
+ uint8_t *data = (uint8_t*)b->gi[i].data;
+ y = b->n_alleles * (b->n_alleles + 1) / 2;
+ for (j = 0; j < y; ++j) {
+ x = strtol(q, &q, 10);
+ if (x > 255) x = 255;
+ data[(k-9) * y + j] = x;
+ ++q;
+ }
+ } else if (b->gi[i].fmt == bcf_str2int("GL", 2)) {
+ int j, y;
+ float x, *data = (float*)b->gi[i].data;
+ y = b->n_alleles * (b->n_alleles + 1) / 2;
+ for (j = 0; j < y; ++j) {
+ x = strtod(q, &q);
+ data[(k-9) * y + j] = x;
+ ++q;
+ }
+ }
+ }
+ endblock: i = i;
+ }
+ }
+ h->l_nm = rn.l; h->name = rn.s;
+ if (sync) bcf_hdr_sync(h);
+ return v->line.l + 1;
+}
--- /dev/null
+#!/usr/bin/perl -w
+
+# Author: lh3
+
+use strict;
+use warnings;
+use Getopt::Std;
+
+&main;
+exit;
+
+sub main {
+ &usage if (@ARGV < 1);
+ my $command = shift(@ARGV);
+ my %func = (subsam=>\&subsam, listsam=>\&listsam, fillac=>\&fillac, qstats=>\&qstats, varFilter=>\&varFilter,
+ hapmap2vcf=>\&hapmap2vcf, ucscsnp2vcf=>\&ucscsnp2vcf, filter4vcf=>\&varFilter, ldstats=>\&ldstats,
+ gapstats=>\&gapstats, splitchr=>\&splitchr);
+ die("Unknown command \"$command\".\n") if (!defined($func{$command}));
+ &{$func{$command}};
+}
+
+sub splitchr {
+ my %opts = (l=>5000000);
+ getopts('l:', \%opts);
+ my $l = $opts{l};
+ die(qq/Usage: vcfutils.pl splitchr [-l $opts{l}] <in.fa.fai>\n/) if (@ARGV == 0 && -t STDIN);
+ while (<>) {
+ my @t = split;
+ my $last = 0;
+ for (my $i = 0; $i < $t[1];) {
+ my $e = ($t[1] - $i) / $l < 1.1? $t[1] : $i + $l;
+ print "$t[0]:".($i+1)."-$e\n";
+ $i = $e;
+ }
+ }
+}
+
+sub subsam {
+ die(qq/Usage: vcfutils.pl subsam <in.vcf> [samples]\n/) if (@ARGV == 0);
+ my ($fh, %h);
+ my $fn = shift(@ARGV);
+ my @col;
+ open($fh, ($fn =~ /\.gz$/)? "gzip -dc $fn |" : $fn) || die;
+ $h{$_} = 1 for (@ARGV);
+ while (<$fh>) {
+ if (/^##/) {
+ print;
+ } elsif (/^#/) {
+ my @t = split;
+ my @s = @t[0..8]; # all fixed fields + FORMAT
+ for (9 .. $#t) {
+ if ($h{$t[$_]}) {
+ push(@s, $t[$_]);
+ push(@col, $_);
+ }
+ }
+ pop(@s) if (@s == 9); # no sample selected; remove the FORMAT field
+ print join("\t", @s), "\n";
+ } else {
+ my @t = split;
+ if (@col == 0) {
+ print join("\t", @t[0..7]), "\n";
+ } else {
+ print join("\t", @t[0..8], map {$t[$_]} @col), "\n";
+ }
+ }
+ }
+ close($fh);
+}
+
+sub listsam {
+ die(qq/Usage: vcfutils.pl listsam <in.vcf>\n/) if (@ARGV == 0 && -t STDIN);
+ while (<>) {
+ if (/^#/ && !/^##/) {
+ my @t = split;
+ print join("\n", @t[9..$#t]), "\n";
+ exit;
+ }
+ }
+}
+
+sub fillac {
+ die(qq/Usage: vcfutils.pl fillac <in.vcf>\n\nNote: The GT field MUST BE present and always appear as the first field.\n/) if (@ARGV == 0 && -t STDIN);
+ while (<>) {
+ if (/^#/) {
+ print;
+ } else {
+ my @t = split;
+ my @c = (0);
+ my $n = 0;
+ my $s = -1;
+ @_ = split(":", $t[8]);
+ for (0 .. $#_) {
+ if ($_[$_] eq 'GT') { $s = $_; last; }
+ }
+ if ($s < 0) {
+ print join("\t", @t), "\n";
+ next;
+ }
+ for (9 .. $#t) {
+ if ($t[$_] =~ /^0,0,0/) {
+ } elsif ($t[$_] =~ /^([^\s:]+:){$s}(\d+).(\d+)/) {
+ ++$c[$2]; ++$c[$3];
+ $n += 2;
+ }
+ }
+ my $AC = "AC=" . join("\t", @c[1..$#c]) . ";AN=$n";
+ my $info = $t[7];
+ $info =~ s/(;?)AC=(\d+)//;
+ $info =~ s/(;?)AN=(\d+)//;
+ if ($info eq '.') {
+ $info = $AC;
+ } else {
+ $info .= ";$AC";
+ }
+ $t[7] = $info;
+ print join("\t", @t), "\n";
+ }
+ }
+}
+
+sub ldstats {
+ my %opts = (t=>0.9);
+ getopts('t:', \%opts);
+ die("Usage: vcfutils.pl ldstats [-t $opts{t}] <in.vcf>\n") if (@ARGV == 0 && -t STDIN);
+ my $cutoff = $opts{t};
+ my ($last, $lastchr) = (0x7fffffff, '');
+ my ($x, $y, $n) = (0, 0, 0);
+ while (<>) {
+ if (/^([^#\s]+)\s(\d+)/) {
+ my ($chr, $pos) = ($1, $2);
+ if (/NEIR=([\d\.]+)/) {
+ ++$n;
+ ++$y, $x += $pos - $last if ($lastchr eq $chr && $pos > $last && $1 > $cutoff);
+ }
+ $last = $pos; $lastchr = $chr;
+ }
+ }
+ print "Number of SNP intervals in strong LD (r > $opts{t}): $y\n";
+ print "Fraction: ", $y/$n, "\n";
+ print "Length: $x\n";
+}
+
+sub qstats {
+ my %opts = (r=>'', s=>0.02, v=>undef);
+ getopts('r:s:v', \%opts);
+ die("Usage: vcfutils.pl qstats [-r ref.vcf] <in.vcf>\n
+Note: This command discards indels. Output: QUAL #non-indel #SNPs #transitions #joint ts/tv #joint/#ref #joint/#non-indel \n") if (@ARGV == 0 && -t STDIN);
+ my %ts = (AG=>1, GA=>1, CT=>1, TC=>1);
+ my %h = ();
+ my $is_vcf = defined($opts{v})? 1 : 0;
+ if ($opts{r}) { # read the reference positions
+ my $fh;
+ open($fh, $opts{r}) || die;
+ while (<$fh>) {
+ next if (/^#/);
+ if ($is_vcf) {
+ my @t = split;
+ $h{$t[0],$t[1]} = $t[4];
+ } else {
+ $h{$1,$2} = 1 if (/^(\S+)\s+(\d+)/);
+ }
+ }
+ close($fh);
+ }
+ my $hsize = scalar(keys %h);
+ my @a;
+ while (<>) {
+ next if (/^#/);
+ my @t = split;
+ next if (length($t[3]) != 1 || uc($t[3]) eq 'N');
+ $t[3] = uc($t[3]); $t[4] = uc($t[4]);
+ my @s = split(',', $t[4]);
+ $t[5] = 3 if ($t[5] eq '.' || $t[5] < 0);
+ next if (length($s[0]) != 1);
+ my $hit;
+ if ($is_vcf) {
+ $hit = 0;
+ my $aa = $h{$t[0],$t[1]};
+ if (defined($aa)) {
+ my @aaa = split(",", $aa);
+ for (@aaa) {
+ $hit = 1 if ($_ eq $s[0]);
+ }
+ }
+ } else {
+ $hit = defined($h{$t[0],$t[1]})? 1 : 0;
+ }
+ push(@a, [$t[5], ($t[4] eq '.' || $t[4] eq $t[3])? 0 : 1, $ts{$t[3].$s[0]}? 1 : 0, $hit]);
+ }
+ push(@a, [-1, 0, 0, 0]); # end marker
+ die("[qstats] No SNP data!\n") if (@a == 0);
+ @a = sort {$b->[0]<=>$a->[0]} @a;
+ my $next = $opts{s};
+ my $last = $a[0];
+ my @c = (0, 0, 0, 0);
+ my @lc;
+ $lc[1] = $lc[2] = 0;
+ for my $p (@a) {
+ if ($p->[0] == -1 || ($p->[0] != $last && $c[0]/@a > $next)) {
+ my @x;
+ $x[0] = sprintf("%.4f", $c[1]-$c[2]? $c[2] / ($c[1] - $c[2]) : 100);
+ $x[1] = sprintf("%.4f", $hsize? $c[3] / $hsize : 0);
+ $x[2] = sprintf("%.4f", $c[3] / $c[1]);
+ my $a = $c[1] - $lc[1];
+ my $b = $c[2] - $lc[2];
+ $x[3] = sprintf("%.4f", $a-$b? $b / ($a-$b) : 100);
+ print join("\t", $last, @c, @x), "\n";
+ $next = $c[0]/@a + $opts{s};
+ $lc[1] = $c[1]; $lc[2] = $c[2];
+ }
+ ++$c[0]; $c[1] += $p->[1]; $c[2] += $p->[2]; $c[3] += $p->[3];
+ $last = $p->[0];
+ }
+}
+
+sub varFilter {
+ my %opts = (d=>2, D=>10000, a=>2, W=>10, Q=>10, w=>10, p=>undef, 1=>1e-4, 2=>1e-100, 3=>0, 4=>1e-4);
+ getopts('pd:D:W:Q:w:a:1:2:3:4:', \%opts);
+ die(qq/
+Usage: vcfutils.pl varFilter [options] <in.vcf>
+
+Options: -Q INT minimum RMS mapping quality for SNPs [$opts{Q}]
+ -d INT minimum read depth [$opts{d}]
+ -D INT maximum read depth [$opts{D}]
+ -a INT minimum number of alternate bases [$opts{a}]
+ -w INT SNP within INT bp around a gap to be filtered [$opts{w}]
+ -W INT window size for filtering adjacent gaps [$opts{W}]
+ -1 FLOAT min P-value for strand bias (given PV4) [$opts{1}]
+ -2 FLOAT min P-value for baseQ bias [$opts{2}]
+ -3 FLOAT min P-value for mapQ bias [$opts{3}]
+ -4 FLOAT min P-value for end distance bias [$opts{4}]
+ -p print filtered variants
+
+Note: Some of the filters rely on annotations generated by SAMtools\/BCFtools.
+\n/) if (@ARGV == 0 && -t STDIN);
+
+ # calculate the window size
+ my ($ol, $ow) = ($opts{W}, $opts{w});
+ my $max_dist = $ol > $ow? $ol : $ow;
+ # the core loop
+ my @staging; # (indel_filtering_score, flt_tag, indel_span; chr, pos, ...)
+ while (<>) {
+ my @t = split;
+ if (/^#/) {
+ print; next;
+ }
+ next if ($t[4] eq '.'); # skip non-var sites
+ # check if the site is a SNP
+ my $type = 1; # SNP
+ if (length($t[3]) > 1) {
+ $type = 2; # MNP
+ my @s = split(',', $t[4]);
+ for (@s) {
+ $type = 3 if (length != length($t[3]));
+ }
+ } else {
+ my @s = split(',', $t[4]);
+ for (@s) {
+ $type = 3 if (length > 1);
+ }
+ }
+ # clear the out-of-range elements
+ while (@staging) {
+ # Still on the same chromosome and the first element's window still affects this position?
+ last if ($staging[0][3] eq $t[0] && $staging[0][4] + $staging[0][2] + $max_dist >= $t[1]);
+ varFilter_aux(shift(@staging), $opts{p}); # calling a function is a bit slower, not much
+ }
+ my $flt = 0;
+ # parse annotations
+ my ($dp, $mq, $dp_alt) = (-1, -1, -1);
+ if ($t[7] =~ /DP4=(\d+),(\d+),(\d+),(\d+)/i) {
+ $dp = $1 + $2 + $3 + $4;
+ $dp_alt = $3 + $4;
+ }
+ if ($t[7] =~ /DP=(\d+)/i) {
+ $dp = $1;
+ }
+ $mq = $1 if ($t[7] =~ /MQ=(\d+)/i);
+ # the depth and mapQ filter
+ if ($dp >= 0) {
+ if ($dp < $opts{d}) {
+ $flt = 2;
+ } elsif ($dp > $opts{D}) {
+ $flt = 3;
+ }
+ }
+ $flt = 4 if ($dp_alt >= 0 && $dp_alt < $opts{a});
+ $flt = 1 if ($flt == 0 && $mq >= 0 && $mq < $opts{Q});
+ $flt = 7 if ($flt == 0 && /PV4=([^,]+),([^,]+),([^,]+),([^,;\t]+)/
+ && ($1<$opts{1} || $2<$opts{2} || $3<$opts{3} || $4<$opts{4}));
+
+ my $score = $t[5] * 100 + $dp_alt;
+ my $rlen = length($t[3]) - 1; # $indel_score<0 for SNPs
+ if ($flt == 0) {
+ if ($type == 3) { # an indel
+ # filtering SNPs and MNPs
+ for my $x (@staging) {
+ next if (($x->[0]&3) == 3 || $x->[1] || $x->[4] + $x->[2] + $ow < $t[1]);
+ $x->[1] = 5;
+ }
+ # check the staging list for indel filtering
+ for my $x (@staging) {
+ next if (($x->[0]&3) != 3 || $x->[1] || $x->[4] + $x->[2] + $ol < $t[1]);
+ if ($x->[0]>>2 < $score) {
+ $x->[1] = 6;
+ } else {
+ $flt = 6; last;
+ }
+ }
+ } else { # SNP or MNP
+ for my $x (@staging) {
+ next if (($x->[0]&3) != 3 || $x->[4] + $x->[2] + $ow < $t[1]);
+ $flt = 5;
+ last;
+ }
+ # check MNP
+ for my $x (@staging) {
+ next if (($x->[0]&3) == 3 || $x->[4] + $x->[2] < $t[1]);
+ if ($x->[0]>>2 < $score) {
+ $x->[1] = 8;
+ } else {
+ $flt = 8; last;
+ }
+ }
+ }
+ }
+ push(@staging, [$score<<2|$type, $flt, $rlen, @t]);
+ }
+ # output the last few elements in the staging list
+ while (@staging) {
+ varFilter_aux(shift @staging, $opts{p});
+ }
+}
+
+sub varFilter_aux {
+ my ($first, $is_print) = @_;
+ if ($first->[1] == 0) {
+ print join("\t", @$first[3 .. @$first-1]), "\n";
+ } elsif ($is_print) {
+ print STDERR join("\t", substr("UQdDaGgPM", $first->[1], 1), @$first[3 .. @$first-1]), "\n";
+ }
+}
+
+sub gapstats {
+ my (@c0, @c1);
+ $c0[$_] = $c1[$_] = 0 for (0 .. 10000);
+ while (<>) {
+ next if (/^#/);
+ my @t = split;
+ next if (length($t[3]) == 1 && $t[4] =~ /^[A-Za-z](,[A-Za-z])*$/); # not an indel
+ my @s = split(',', $t[4]);
+ for my $x (@s) {
+ my $l = length($x) - length($t[3]) + 5000;
+ if ($x =~ /^-/) {
+ $l = -(length($x) - 1) + 5000;
+ } elsif ($x =~ /^\+/) {
+ $l = length($x) - 1 + 5000;
+ }
+ $c0[$l] += 1 / @s;
+ }
+ }
+ for (my $i = 0; $i < 10000; ++$i) {
+ next if ($c0[$i] == 0);
+ $c1[0] += $c0[$i];
+ $c1[1] += $c0[$i] if (($i-5000)%3 == 0);
+ printf("C\t%d\t%.2f\n", ($i-5000), $c0[$i]);
+ }
+ printf("3\t%d\t%d\t%.3f\n", $c1[0], $c1[1], $c1[1]/$c1[0]);
+}
+
+sub ucscsnp2vcf {
+ die("Usage: vcfutils.pl <in.ucsc.snp>\n") if (@ARGV == 0 && -t STDIN);
+ print "##fileformat=VCFv4.0\n";
+ print join("\t", "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO"), "\n";
+ while (<>) {
+ my @t = split("\t");
+ my $indel = ($t[9] =~ /^[ACGT](\/[ACGT])+$/)? 0 : 1;
+ my $pos = $t[2] + 1;
+ my @alt;
+ push(@alt, $t[7]);
+ if ($t[6] eq '-') {
+ $t[9] = reverse($t[9]);
+ $t[9] =~ tr/ACGTRYMKWSNacgtrymkwsn/TGCAYRKMWSNtgcayrkmwsn/;
+ }
+ my @a = split("/", $t[9]);
+ for (@a) {
+ push(@alt, $_) if ($_ ne $alt[0]);
+ }
+ if ($indel) {
+ --$pos;
+ for (0 .. $#alt) {
+ $alt[$_] =~ tr/-//d;
+ $alt[$_] = "N$alt[$_]";
+ }
+ }
+ my $ref = shift(@alt);
+ my $af = $t[13] > 0? ";AF=$t[13]" : '';
+ my $valid = ($t[12] eq 'unknown')? '' : ";valid=$t[12]";
+ my $info = "molType=$t[10];class=$t[11]$valid$af";
+ print join("\t", $t[1], $pos, $t[4], $ref, join(",", @alt), 0, '.', $info), "\n";
+ }
+}
+
+sub hapmap2vcf {
+ die("Usage: vcfutils.pl <in.ucsc.snp> <in.hapmap>\n") if (@ARGV == 0);
+ my $fn = shift(@ARGV);
+ # parse UCSC SNP
+ warn("Parsing UCSC SNPs...\n");
+ my ($fh, %map);
+ open($fh, ($fn =~ /\.gz$/)? "gzip -dc $fn |" : $fn) || die;
+ while (<$fh>) {
+ my @t = split;
+ next if ($t[3] - $t[2] != 1); # not SNP
+ @{$map{$t[4]}} = @t[1,3,7];
+ }
+ close($fh);
+ # write VCF
+ warn("Writing VCF...\n");
+ print "##fileformat=VCFv4.0\n";
+ while (<>) {
+ my @t = split;
+ if ($t[0] eq 'rs#') { # the first line
+ print join("\t", "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT", @t[11..$#t]), "\n";
+ } else {
+ next unless ($map{$t[0]});
+ next if (length($t[1]) != 3); # skip non-SNPs
+ my $a = \@{$map{$t[0]}};
+ my $ref = $a->[2];
+ my @u = split('/', $t[1]);
+ if ($u[1] eq $ref) {
+ $u[1] = $u[0]; $u[0] = $ref;
+ } elsif ($u[0] ne $ref) { next; }
+ my $alt = $u[1];
+ my %w;
+ $w{$u[0]} = 0; $w{$u[1]} = 1;
+ my @s = (@$a[0,1], $t[0], $ref, $alt, 0, '.', '.', 'GT');
+ my $is_tri = 0;
+ for (@t[11..$#t]) {
+ if ($_ eq 'NN') {
+ push(@s, './.');
+ } else {
+ my @a = ($w{substr($_,0,1)}, $w{substr($_,1,1)});
+ if (!defined($a[0]) || !defined($a[1])) {
+ $is_tri = 1;
+ last;
+ }
+ push(@s, "$a[0]/$a[1]");
+ }
+ }
+ next if ($is_tri);
+ print join("\t", @s), "\n";
+ }
+ }
+}
+
+sub usage {
+ die(qq/
+Usage: vcfutils.pl <command> [<arguments>]\n
+Command: subsam get a subset of samples
+ listsam list the samples
+ fillac fill the allele count field
+ qstats SNP stats stratified by QUAL
+ varFilter filtering short variants
+ hapmap2vcf convert the hapmap format to VCF
+ ucscsnp2vcf convert UCSC SNP SQL dump to VCF
+\n/);
+}
--- /dev/null
+#include <math.h>
+#include "errmod.h"
+#include "ksort.h"
+KSORT_INIT_GENERIC(uint16_t)
+
+typedef struct __errmod_coef_t {
+ double *fk, *beta, *lhet;
+} errmod_coef_t;
+
+typedef struct {
+ double fsum[16], bsum[16];
+ uint32_t c[16];
+} call_aux_t;
+
+static errmod_coef_t *cal_coef(double depcorr, double eta)
+{
+ int k, n, q;
+ long double sum, sum1;
+ double *lC;
+ errmod_coef_t *ec;
+
+ ec = calloc(1, sizeof(errmod_coef_t));
+ // initialize ->fk
+ ec->fk = (double*)calloc(256, sizeof(double));
+ ec->fk[0] = 1.0;
+ for (n = 1; n != 256; ++n)
+ ec->fk[n] = pow(1. - depcorr, n) * (1.0 - eta) + eta;
+ // initialize ->coef
+ ec->beta = (double*)calloc(256 * 256 * 64, sizeof(double));
+ lC = (double*)calloc(256 * 256, sizeof(double));
+ for (n = 1; n != 256; ++n) {
+ double lgn = lgamma(n+1);
+ for (k = 1; k <= n; ++k)
+ lC[n<<8|k] = lgn - lgamma(k+1) - lgamma(n-k+1);
+ }
+ for (q = 1; q != 64; ++q) {
+ double e = pow(10.0, -q/10.0);
+ double le = log(e);
+ double le1 = log(1.0 - e);
+ for (n = 1; n <= 255; ++n) {
+ double *beta = ec->beta + (q<<16|n<<8);
+ sum1 = sum = 0.0;
+ for (k = n; k >= 0; --k, sum1 = sum) {
+ sum = sum1 + expl(lC[n<<8|k] + k*le + (n-k)*le1);
+ beta[k] = -10. / M_LN10 * logl(sum1 / sum);
+ }
+ }
+ }
+ // initialize ->lhet
+ ec->lhet = (double*)calloc(256 * 256, sizeof(double));
+ for (n = 0; n < 256; ++n)
+ for (k = 0; k < 256; ++k)
+ ec->lhet[n<<8|k] = lC[n<<8|k] - M_LN2 * n;
+ free(lC);
+ return ec;
+}
+
+errmod_t *errmod_init(float depcorr)
+{
+ errmod_t *em;
+ em = (errmod_t*)calloc(1, sizeof(errmod_t));
+ em->depcorr = depcorr;
+ em->coef = cal_coef(depcorr, 0.03);
+ return em;
+}
+
+void errmod_destroy(errmod_t *em)
+{
+ if (em == 0) return;
+ free(em->coef->lhet); free(em->coef->fk); free(em->coef->beta);
+ free(em->coef); free(em);
+}
+// qual:6, strand:1, base:4
+int errmod_cal(const errmod_t *em, int n, int m, uint16_t *bases, float *q)
+{
+ call_aux_t aux;
+ int i, j, k, w[32];
+
+ if (m > m) return -1;
+ memset(q, 0, m * m * sizeof(float));
+ if (n == 0) return 0;
+ // calculate aux.esum and aux.fsum
+ if (n > 255) { // then sample 255 bases
+ ks_shuffle(uint16_t, n, bases);
+ n = 255;
+ }
+ ks_introsort(uint16_t, n, bases);
+ memset(w, 0, 32 * sizeof(int));
+ memset(&aux, 0, sizeof(call_aux_t));
+ for (j = n - 1; j >= 0; --j) { // calculate esum and fsum
+ uint16_t b = bases[j];
+ int q = b>>5 < 4? 4 : b>>5;
+ if (q > 63) q = 63;
+ k = b&0x1f;
+ aux.fsum[k&0xf] += em->coef->fk[w[k]];
+ aux.bsum[k&0xf] += em->coef->fk[w[k]] * em->coef->beta[q<<16|n<<8|aux.c[k&0xf]];
+ ++aux.c[k&0xf];
+ ++w[k];
+ }
+ // generate likelihood
+ for (j = 0; j != m; ++j) {
+ float tmp1, tmp3;
+ int tmp2, bar_e;
+ // homozygous
+ for (k = 0, tmp1 = tmp3 = 0.0, tmp2 = 0; k != m; ++k) {
+ if (k == j) continue;
+ tmp1 += aux.bsum[k]; tmp2 += aux.c[k]; tmp3 += aux.fsum[k];
+ }
+ if (tmp2) {
+ bar_e = (int)(tmp1 / tmp3 + 0.499);
+ if (bar_e > 63) bar_e = 63;
+ q[j*m+j] = tmp1;
+ }
+ // heterozygous
+ for (k = j + 1; k < m; ++k) {
+ int cjk = aux.c[j] + aux.c[k];
+ for (i = 0, tmp2 = 0, tmp1 = tmp3 = 0.0; i < m; ++i) {
+ if (i == j || i == k) continue;
+ tmp1 += aux.bsum[i]; tmp2 += aux.c[i]; tmp3 += aux.fsum[i];
+ }
+ if (tmp2) {
+ bar_e = (int)(tmp1 / tmp3 + 0.499);
+ if (bar_e > 63) bar_e = 63;
+ q[j*m+k] = q[k*m+j] = -4.343 * em->coef->lhet[cjk<<8|aux.c[k]] + tmp1;
+ } else q[j*m+k] = q[k*m+j] = -4.343 * em->coef->lhet[cjk<<8|aux.c[k]]; // all the bases are either j or k
+ }
+ for (k = 0; k != m; ++k) if (q[j*m+k] < 0.0) q[j*m+k] = 0.0;
+ }
+ return 0;
+}
--- /dev/null
+#ifndef ERRMOD_H
+#define ERRMOD_H
+
+#include <stdint.h>
+
+struct __errmod_coef_t;
+
+typedef struct {
+ double depcorr;
+ struct __errmod_coef_t *coef;
+} errmod_t;
+
+errmod_t *errmod_init(float depcorr);
+void errmod_destroy(errmod_t *em);
+int errmod_cal(const errmod_t *em, int n, int m, uint16_t *bases, float *q);
+
+#endif
--- /dev/null
+/* The MIT License
+
+ Copyright (c) 2003-2006, 2008-2010, by Heng Li <lh3lh3@live.co.uk>
+
+ Permission is hereby granted, free of charge, to any person obtaining
+ a copy of this software and associated documentation files (the
+ "Software"), to deal in the Software without restriction, including
+ without limitation the rights to use, copy, modify, merge, publish,
+ distribute, sublicense, and/or sell copies of the Software, and to
+ permit persons to whom the Software is furnished to do so, subject to
+ the following conditions:
+
+ The above copyright notice and this permission notice shall be
+ included in all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ SOFTWARE.
+*/
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+#include <stdint.h>
+#include <math.h>
+#include "kprobaln.h"
+
+/*****************************************
+ * Probabilistic banded glocal alignment *
+ *****************************************/
+
+#define EI .25
+#define EM .33333333333
+
+static float g_qual2prob[256];
+
+#define set_u(u, b, i, k) { int x=(i)-(b); x=x>0?x:0; (u)=((k)-x+1)*3; }
+
+kpa_par_t kpa_par_def = { 0.001, 0.1, 10 };
+kpa_par_t kpa_par_alt = { 0.0001, 0.01, 10 };
+
+/*
+ The topology of the profile HMM:
+
+ /\ /\ /\ /\
+ I[1] I[k-1] I[k] I[L]
+ ^ \ \ ^ \ ^ \ \ ^
+ | \ \ | \ | \ \ |
+ M[0] M[1] -> ... -> M[k-1] -> M[k] -> ... -> M[L] M[L+1]
+ \ \/ \/ \/ /
+ \ /\ /\ /\ /
+ -> D[k-1] -> D[k] ->
+
+ M[0] points to every {M,I}[k] and every {M,I}[k] points M[L+1].
+
+ On input, _ref is the reference sequence and _query is the query
+ sequence. Both are sequences of 0/1/2/3/4 where 4 stands for an
+ ambiguous residue. iqual is the base quality. c sets the gap open
+ probability, gap extension probability and band width.
+
+ On output, state and q are arrays of length l_query. The higher 30
+ bits give the reference position the query base is matched to and the
+ lower two bits can be 0 (an alignment match) or 1 (an
+ insertion). q[i] gives the phred scaled posterior probability of
+ state[i] being wrong.
+ */
+int kpa_glocal(const uint8_t *_ref, int l_ref, const uint8_t *_query, int l_query, const uint8_t *iqual,
+ const kpa_par_t *c, int *state, uint8_t *q)
+{
+ double **f, **b = 0, *s, m[9], sI, sM, bI, bM, pb;
+ float *qual, *_qual;
+ const uint8_t *ref, *query;
+ int bw, bw2, i, k, is_diff = 0, is_backward = 1, Pr;
+
+ /*** initialization ***/
+ is_backward = state && q? 1 : 0;
+ ref = _ref - 1; query = _query - 1; // change to 1-based coordinate
+ bw = l_ref > l_query? l_ref : l_query;
+ if (bw > c->bw) bw = c->bw;
+ if (bw < abs(l_ref - l_query)) bw = abs(l_ref - l_query);
+ bw2 = bw * 2 + 1;
+ // allocate the forward and backward matrices f[][] and b[][] and the scaling array s[]
+ f = calloc(l_query+1, sizeof(void*));
+ if (is_backward) b = calloc(l_query+1, sizeof(void*));
+ for (i = 0; i <= l_query; ++i) {
+ f[i] = calloc(bw2 * 3 + 6, sizeof(double)); // FIXME: this is over-allocated for very short seqs
+ if (is_backward) b[i] = calloc(bw2 * 3 + 6, sizeof(double));
+ }
+ s = calloc(l_query+2, sizeof(double)); // s[] is the scaling factor to avoid underflow
+ // initialize qual
+ _qual = calloc(l_query, sizeof(float));
+ if (g_qual2prob[0] == 0)
+ for (i = 0; i < 256; ++i)
+ g_qual2prob[i] = pow(10, -i/10.);
+ for (i = 0; i < l_query; ++i) _qual[i] = g_qual2prob[iqual? iqual[i] : 30];
+ qual = _qual - 1;
+ // initialize transition probability
+ sM = sI = 1. / (2 * l_query + 2); // the value here seems not to affect results; FIXME: need proof
+ m[0*3+0] = (1 - c->d - c->d) * (1 - sM); m[0*3+1] = m[0*3+2] = c->d * (1 - sM);
+ m[1*3+0] = (1 - c->e) * (1 - sI); m[1*3+1] = c->e * (1 - sI); m[1*3+2] = 0.;
+ m[2*3+0] = 1 - c->e; m[2*3+1] = 0.; m[2*3+2] = c->e;
+ bM = (1 - c->d) / l_ref; bI = c->d / l_ref; // (bM+bI)*l_ref==1
+ /*** forward ***/
+ // f[0]
+ set_u(k, bw, 0, 0);
+ f[0][k] = s[0] = 1.;
+ { // f[1]
+ double *fi = f[1], sum;
+ int beg = 1, end = l_ref < bw + 1? l_ref : bw + 1, _beg, _end;
+ for (k = beg, sum = 0.; k <= end; ++k) {
+ int u;
+ double e = (ref[k] > 3 || query[1] > 3)? 1. : ref[k] == query[1]? 1. - qual[1] : qual[1] * EM;
+ set_u(u, bw, 1, k);
+ fi[u+0] = e * bM; fi[u+1] = EI * bI;
+ sum += fi[u] + fi[u+1];
+ }
+ // rescale
+ s[1] = sum;
+ set_u(_beg, bw, 1, beg); set_u(_end, bw, 1, end); _end += 2;
+ for (k = _beg; k <= _end; ++k) fi[k] /= sum;
+ }
+ // f[2..l_query]
+ for (i = 2; i <= l_query; ++i) {
+ double *fi = f[i], *fi1 = f[i-1], sum, qli = qual[i];
+ int beg = 1, end = l_ref, x, _beg, _end;
+ uint8_t qyi = query[i];
+ x = i - bw; beg = beg > x? beg : x; // band start
+ x = i + bw; end = end < x? end : x; // band end
+ for (k = beg, sum = 0.; k <= end; ++k) {
+ int u, v11, v01, v10;
+ double e;
+ e = (ref[k] > 3 || qyi > 3)? 1. : ref[k] == qyi? 1. - qli : qli * EM;
+ set_u(u, bw, i, k); set_u(v11, bw, i-1, k-1); set_u(v10, bw, i-1, k); set_u(v01, bw, i, k-1);
+ fi[u+0] = e * (m[0] * fi1[v11+0] + m[3] * fi1[v11+1] + m[6] * fi1[v11+2]);
+ fi[u+1] = EI * (m[1] * fi1[v10+0] + m[4] * fi1[v10+1]);
+ fi[u+2] = m[2] * fi[v01+0] + m[8] * fi[v01+2];
+ sum += fi[u] + fi[u+1] + fi[u+2];
+// fprintf(stderr, "F (%d,%d;%d): %lg,%lg,%lg\n", i, k, u, fi[u], fi[u+1], fi[u+2]); // DEBUG
+ }
+ // rescale
+ s[i] = sum;
+ set_u(_beg, bw, i, beg); set_u(_end, bw, i, end); _end += 2;
+ for (k = _beg, sum = 1./sum; k <= _end; ++k) fi[k] *= sum;
+ }
+ { // f[l_query+1]
+ double sum;
+ for (k = 1, sum = 0.; k <= l_ref; ++k) {
+ int u;
+ set_u(u, bw, l_query, k);
+ if (u < 3 || u >= bw2*3+3) continue;
+ sum += f[l_query][u+0] * sM + f[l_query][u+1] * sI;
+ }
+ s[l_query+1] = sum; // the last scaling factor
+ }
+ { // compute likelihood
+ double p = 1., Pr1 = 0.;
+ for (i = 0; i <= l_query + 1; ++i) {
+ p *= s[i];
+ if (p < 1e-100) Pr += -4.343 * log(p), p = 1.;
+ }
+ Pr1 += -4.343 * log(p * l_ref * l_query);
+ Pr = (int)(Pr1 + .499);
+ if (!is_backward) { // skip backward and MAP
+ for (i = 0; i <= l_query; ++i) free(f[i]);
+ free(f); free(s); free(_qual);
+ return Pr;
+ }
+ }
+ /*** backward ***/
+ // b[l_query] (b[l_query+1][0]=1 and thus \tilde{b}[][]=1/s[l_query+1]; this is where s[l_query+1] comes from)
+ for (k = 1; k <= l_ref; ++k) {
+ int u;
+ double *bi = b[l_query];
+ set_u(u, bw, l_query, k);
+ if (u < 3 || u >= bw2*3+3) continue;
+ bi[u+0] = sM / s[l_query] / s[l_query+1]; bi[u+1] = sI / s[l_query] / s[l_query+1];
+ }
+ // b[l_query-1..1]
+ for (i = l_query - 1; i >= 1; --i) {
+ int beg = 1, end = l_ref, x, _beg, _end;
+ double *bi = b[i], *bi1 = b[i+1], y = (i > 1), qli1 = qual[i+1];
+ uint8_t qyi1 = query[i+1];
+ x = i - bw; beg = beg > x? beg : x;
+ x = i + bw; end = end < x? end : x;
+ for (k = end; k >= beg; --k) {
+ int u, v11, v01, v10;
+ double e;
+ set_u(u, bw, i, k); set_u(v11, bw, i+1, k+1); set_u(v10, bw, i+1, k); set_u(v01, bw, i, k+1);
+ e = (k >= l_ref? 0 : (ref[k+1] > 3 || qyi1 > 3)? 1. : ref[k+1] == qyi1? 1. - qli1 : qli1 * EM) * bi1[v11];
+ bi[u+0] = e * m[0] + EI * m[1] * bi1[v10+1] + m[2] * bi[v01+2]; // bi1[v11] has been foled into e.
+ bi[u+1] = e * m[3] + EI * m[4] * bi1[v10+1];
+ bi[u+2] = (e * m[6] + m[8] * bi[v01+2]) * y;
+// fprintf(stderr, "B (%d,%d;%d): %lg,%lg,%lg\n", i, k, u, bi[u], bi[u+1], bi[u+2]); // DEBUG
+ }
+ // rescale
+ set_u(_beg, bw, i, beg); set_u(_end, bw, i, end); _end += 2;
+ for (k = _beg, y = 1./s[i]; k <= _end; ++k) bi[k] *= y;
+ }
+ { // b[0]
+ int beg = 1, end = l_ref < bw + 1? l_ref : bw + 1;
+ double sum = 0.;
+ for (k = end; k >= beg; --k) {
+ int u;
+ double e = (ref[k] > 3 || query[1] > 3)? 1. : ref[k] == query[1]? 1. - qual[1] : qual[1] * EM;
+ set_u(u, bw, 1, k);
+ if (u < 3 || u >= bw2*3+3) continue;
+ sum += e * b[1][u+0] * bM + EI * b[1][u+1] * bI;
+ }
+ set_u(k, bw, 0, 0);
+ pb = b[0][k] = sum / s[0]; // if everything works as is expected, pb == 1.0
+ }
+ is_diff = fabs(pb - 1.) > 1e-7? 1 : 0;
+ /*** MAP ***/
+ for (i = 1; i <= l_query; ++i) {
+ double sum = 0., *fi = f[i], *bi = b[i], max = 0.;
+ int beg = 1, end = l_ref, x, max_k = -1;
+ x = i - bw; beg = beg > x? beg : x;
+ x = i + bw; end = end < x? end : x;
+ for (k = beg; k <= end; ++k) {
+ int u;
+ double z;
+ set_u(u, bw, i, k);
+ z = fi[u+0] * bi[u+0]; if (z > max) max = z, max_k = (k-1)<<2 | 0; sum += z;
+ z = fi[u+1] * bi[u+1]; if (z > max) max = z, max_k = (k-1)<<2 | 1; sum += z;
+ }
+ max /= sum; sum *= s[i]; // if everything works as is expected, sum == 1.0
+ if (state) state[i-1] = max_k;
+ if (q) k = (int)(-4.343 * log(1. - max) + .499), q[i-1] = k > 100? 99 : k;
+#ifdef _MAIN
+ fprintf(stderr, "(%.10lg,%.10lg) (%d,%d:%c,%c:%d) %lg\n", pb, sum, i-1, max_k>>2,
+ "ACGT"[query[i]], "ACGT"[ref[(max_k>>2)+1]], max_k&3, max); // DEBUG
+#endif
+ }
+ /*** free ***/
+ for (i = 0; i <= l_query; ++i) {
+ free(f[i]); free(b[i]);
+ }
+ free(f); free(b); free(s); free(_qual);
+ return Pr;
+}
+
+#ifdef _MAIN
+#include <unistd.h>
+int main(int argc, char *argv[])
+{
+ uint8_t conv[256], *iqual, *ref, *query;
+ int c, l_ref, l_query, i, q = 30, b = 10, P;
+ while ((c = getopt(argc, argv, "b:q:")) >= 0) {
+ switch (c) {
+ case 'b': b = atoi(optarg); break;
+ case 'q': q = atoi(optarg); break;
+ }
+ }
+ if (optind + 2 > argc) {
+ fprintf(stderr, "Usage: %s [-q %d] [-b %d] <ref> <query>\n", argv[0], q, b); // example: acttc attc
+ return 1;
+ }
+ memset(conv, 4, 256);
+ conv['a'] = conv['A'] = 0; conv['c'] = conv['C'] = 1;
+ conv['g'] = conv['G'] = 2; conv['t'] = conv['T'] = 3;
+ ref = (uint8_t*)argv[optind]; query = (uint8_t*)argv[optind+1];
+ l_ref = strlen((char*)ref); l_query = strlen((char*)query);
+ for (i = 0; i < l_ref; ++i) ref[i] = conv[ref[i]];
+ for (i = 0; i < l_query; ++i) query[i] = conv[query[i]];
+ iqual = malloc(l_query);
+ memset(iqual, q, l_query);
+ kpa_par_def.bw = b;
+ P = kpa_glocal(ref, l_ref, query, l_query, iqual, &kpa_par_alt, 0, 0);
+ fprintf(stderr, "%d\n", P);
+ free(iqual);
+ return 0;
+}
+#endif
--- /dev/null
+/* The MIT License
+
+ Copyright (c) 2003-2006, 2008, 2009 by Heng Li <lh3@live.co.uk>
+
+ Permission is hereby granted, free of charge, to any person obtaining
+ a copy of this software and associated documentation files (the
+ "Software"), to deal in the Software without restriction, including
+ without limitation the rights to use, copy, modify, merge, publish,
+ distribute, sublicense, and/or sell copies of the Software, and to
+ permit persons to whom the Software is furnished to do so, subject to
+ the following conditions:
+
+ The above copyright notice and this permission notice shall be
+ included in all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ SOFTWARE.
+*/
+
+#ifndef LH3_KPROBALN_H_
+#define LH3_KPROBALN_H_
+
+#include <stdint.h>
+
+typedef struct {
+ float d, e;
+ int bw;
+} kpa_par_t;
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+ int kpa_glocal(const uint8_t *_ref, int l_ref, const uint8_t *_query, int l_query, const uint8_t *iqual,
+ const kpa_par_t *c, int *state, uint8_t *q);
+
+#ifdef __cplusplus
+}
+#endif
+
+extern kpa_par_t kpa_par_def, kpa_par_alt;
+
+#endif
--- /dev/null
+import java.io.*;
+import java.lang.*;
+
+public class HmmGlocal
+{
+ private double[] qual2prob;
+ private double cd, ce; // gap open probility [1e-3], gap extension probability [0.1]
+ private int cb; // band width [7]
+
+ public HmmGlocal(final double d, final double e, final int b) {
+ cd = d; ce = e; cb = b;
+ qual2prob = new double[256];
+ for (int i = 0; i < 256; ++i)
+ qual2prob[i] = Math.pow(10, -i/10.);
+ }
+ private static int set_u(final int b, final int i, final int k) {
+ int x = i - b;
+ x = x > 0? x : 0;
+ return (k + 1 - x) * 3;
+ }
+ public int hmm_glocal(final byte[] _ref, final byte[] _query, final byte[] _iqual, int[] state, byte[] q) {
+ int i, k;
+ /*** initialization ***/
+ // change coordinates
+ int l_ref = _ref.length;
+ byte[] ref = new byte[l_ref+1];
+ for (i = 0; i < l_ref; ++i) ref[i+1] = _ref[i]; // FIXME: this is silly...
+ int l_query = _query.length;
+ byte[] query = new byte[l_query+1];
+ double[] qual = new double[l_query+1];
+ for (i = 0; i < l_query; ++i) {
+ query[i+1] = _query[i];
+ qual[i+1] = qual2prob[_iqual[i]];
+ }
+ // set band width
+ int bw2, bw = l_ref > l_query? l_ref : l_query;
+ if (bw > cb) bw = cb;
+ if (bw < Math.abs(l_ref - l_query)) bw = Math.abs(l_ref - l_query);
+ bw2 = bw * 2 + 1;
+ // allocate the forward and backward matrices f[][] and b[][] and the scaling array s[]
+ double[][] f = new double[l_query+1][bw2*3 + 6];
+ double[][] b = new double[l_query+1][bw2*3 + 6];
+ double[] s = new double[l_query+2];
+ // initialize transition probabilities
+ double sM, sI, bM, bI;
+ sM = sI = 1. / (2 * l_query + 2);
+ bM = (1 - cd) / l_query; bI = cd / l_query; // (bM+bI)*l_query==1
+ double[] m = new double[9];
+ m[0*3+0] = (1 - cd - cd) * (1 - sM); m[0*3+1] = m[0*3+2] = cd * (1 - sM);
+ m[1*3+0] = (1 - ce) * (1 - sI); m[1*3+1] = ce * (1 - sI); m[1*3+2] = 0.;
+ m[2*3+0] = 1 - ce; m[2*3+1] = 0.; m[2*3+2] = ce;
+ /*** forward ***/
+ // f[0]
+ f[0][set_u(bw, 0, 0)] = s[0] = 1.;
+ { // f[1]
+ double[] fi = f[1];
+ double sum;
+ int beg = 1, end = l_ref < bw + 1? l_ref : bw + 1, _beg, _end;
+ for (k = beg, sum = 0.; k <= end; ++k) {
+ int u;
+ double e = (ref[k] > 3 || query[1] > 3)? 1. : ref[k] == query[1]? 1. - qual[1] : qual[1] / 3.;
+ u = set_u(bw, 1, k);
+ fi[u+0] = e * bM; fi[u+1] = .25 * bI;
+ sum += fi[u] + fi[u+1];
+ }
+ // rescale
+ s[1] = sum;
+ _beg = set_u(bw, 1, beg); _end = set_u(bw, 1, end); _end += 2;
+ for (k = _beg; k <= _end; ++k) fi[k] /= sum;
+ }
+ // f[2..l_query]
+ for (i = 2; i <= l_query; ++i) {
+ double[] fi = f[i], fi1 = f[i-1];
+ double sum, qli = qual[i];
+ int beg = 1, end = l_ref, x, _beg, _end;
+ byte qyi = query[i];
+ x = i - bw; beg = beg > x? beg : x; // band start
+ x = i + bw; end = end < x? end : x; // band end
+ for (k = beg, sum = 0.; k <= end; ++k) {
+ int u, v11, v01, v10;
+ double e;
+ e = (ref[k] > 3 || qyi > 3)? 1. : ref[k] == qyi? 1. - qli : qli / 3.;
+ u = set_u(bw, i, k); v11 = set_u(bw, i-1, k-1); v10 = set_u(bw, i-1, k); v01 = set_u(bw, i, k-1);
+ fi[u+0] = e * (m[0] * fi1[v11+0] + m[3] * fi1[v11+1] + m[6] * fi1[v11+2]);
+ fi[u+1] = .25 * (m[1] * fi1[v10+0] + m[4] * fi1[v10+1]);
+ fi[u+2] = m[2] * fi[v01+0] + m[8] * fi[v01+2];
+ sum += fi[u] + fi[u+1] + fi[u+2];
+ //System.out.println("("+i+","+k+";"+u+"): "+fi[u]+","+fi[u+1]+","+fi[u+2]);
+ }
+ // rescale
+ s[i] = sum;
+ _beg = set_u(bw, i, beg); _end = set_u(bw, i, end); _end += 2;
+ for (k = _beg, sum = 1./sum; k <= _end; ++k) fi[k] *= sum;
+ }
+ { // f[l_query+1]
+ double sum;
+ for (k = 1, sum = 0.; k <= l_ref; ++k) {
+ int u = set_u(bw, l_query, k);
+ if (u < 3 || u >= bw2*3+3) continue;
+ sum += f[l_query][u+0] * sM + f[l_query][u+1] * sI;
+ }
+ s[l_query+1] = sum; // the last scaling factor
+ }
+ /*** backward ***/
+ // b[l_query] (b[l_query+1][0]=1 and thus \tilde{b}[][]=1/s[l_query+1]; this is where s[l_query+1] comes from)
+ for (k = 1; k <= l_ref; ++k) {
+ int u = set_u(bw, l_query, k);
+ double[] bi = b[l_query];
+ if (u < 3 || u >= bw2*3+3) continue;
+ bi[u+0] = sM / s[l_query] / s[l_query+1]; bi[u+1] = sI / s[l_query] / s[l_query+1];
+ }
+ // b[l_query-1..1]
+ for (i = l_query - 1; i >= 1; --i) {
+ int beg = 1, end = l_ref, x, _beg, _end;
+ double[] bi = b[i], bi1 = b[i+1];
+ double y = (i > 1)? 1. : 0., qli1 = qual[i+1];
+ byte qyi1 = query[i+1];
+ x = i - bw; beg = beg > x? beg : x;
+ x = i + bw; end = end < x? end : x;
+ for (k = end; k >= beg; --k) {
+ int u, v11, v01, v10;
+ double e;
+ u = set_u(bw, i, k); v11 = set_u(bw, i+1, k+1); v10 = set_u(bw, i+1, k); v01 = set_u(bw, i, k+1);
+ e = (k >= l_ref? 0 : (ref[k+1] > 3 || qyi1 > 3)? 1. : ref[k+1] == qyi1? 1. - qli1 : qli1 / 3.) * bi1[v11];
+ bi[u+0] = e * m[0] + .25 * m[1] * bi1[v10+1] + m[2] * bi[v01+2]; // bi1[v11] has been foled into e.
+ bi[u+1] = e * m[3] + .25 * m[4] * bi1[v10+1];
+ bi[u+2] = (e * m[6] + m[8] * bi[v01+2]) * y;
+ }
+ // rescale
+ _beg = set_u(bw, i, beg); _end = set_u(bw, i, end); _end += 2;
+ for (k = _beg, y = 1./s[i]; k <= _end; ++k) bi[k] *= y;
+ }
+ double pb;
+ { // b[0]
+ int beg = 1, end = l_ref < bw + 1? l_ref : bw + 1;
+ double sum = 0.;
+ for (k = end; k >= beg; --k) {
+ int u = set_u(bw, 1, k);
+ double e = (ref[k] > 3 || query[1] > 3)? 1. : ref[k] == query[1]? 1. - qual[1] : qual[1] / 3.;
+ if (u < 3 || u >= bw2*3+3) continue;
+ sum += e * b[1][u+0] * bM + .25 * b[1][u+1] * bI;
+ }
+ pb = b[0][set_u(bw, 0, 0)] = sum / s[0]; // if everything works as is expected, pb == 1.0
+ }
+ int is_diff = Math.abs(pb - 1.) > 1e-7? 1 : 0;
+ /*** MAP ***/
+ for (i = 1; i <= l_query; ++i) {
+ double sum = 0., max = 0.;
+ double[] fi = f[i], bi = b[i];
+ int beg = 1, end = l_ref, x, max_k = -1;
+ x = i - bw; beg = beg > x? beg : x;
+ x = i + bw; end = end < x? end : x;
+ for (k = beg; k <= end; ++k) {
+ int u = set_u(bw, i, k);
+ double z;
+ sum += (z = fi[u+0] * bi[u+0]); if (z > max) { max = z; max_k = (k-1)<<2 | 0; }
+ sum += (z = fi[u+1] * bi[u+1]); if (z > max) { max = z; max_k = (k-1)<<2 | 1; }
+ }
+ max /= sum; sum *= s[i]; // if everything works as is expected, sum == 1.0
+ if (state != null) state[i-1] = max_k;
+ if (q != null) {
+ k = (int)(-4.343 * Math.log(1. - max) + .499);
+ q[i-1] = (byte)(k > 100? 99 : k);
+ }
+ //System.out.println("("+pb+","+sum+")"+" ("+(i-1)+","+(max_k>>2)+","+(max_k&3)+","+max+")");
+ }
+ return 0;
+ }
+
+ public static void main(String[] args) {
+ byte[] ref = {'\0', '\1', '\3', '\3', '\1'};
+ byte[] query = {'\0', '\3', '\3', '\1'};
+ byte[] qual = new byte[4];
+ qual[0] = qual[1] = qual[2] = qual[3] = (byte)20;
+ HmmGlocal hg = new HmmGlocal(1e-3, 0.1, 7);
+ hg.hmm_glocal(ref, query, qual, null, null);
+ }
+}
\ No newline at end of file
--- /dev/null
+#include <stdlib.h>
+#include <string.h>
+#include "sample.h"
+#include "khash.h"
+KHASH_MAP_INIT_STR(sm, int)
+
+bam_sample_t *bam_smpl_init(void)
+{
+ bam_sample_t *s;
+ s = calloc(1, sizeof(bam_sample_t));
+ s->rg2smid = kh_init(sm);
+ s->sm2id = kh_init(sm);
+ return s;
+}
+
+void bam_smpl_destroy(bam_sample_t *sm)
+{
+ int i;
+ khint_t k;
+ khash_t(sm) *rg2smid = (khash_t(sm)*)sm->rg2smid;
+ if (sm == 0) return;
+ for (i = 0; i < sm->n; ++i) free(sm->smpl[i]);
+ free(sm->smpl);
+ for (k = kh_begin(rg2smid); k != kh_end(rg2smid); ++k)
+ if (kh_exist(rg2smid, k)) free((char*)kh_key(rg2smid, k));
+ kh_destroy(sm, sm->rg2smid);
+ kh_destroy(sm, sm->sm2id);
+ free(sm);
+}
+
+static void add_pair(bam_sample_t *sm, khash_t(sm) *sm2id, const char *key, const char *val)
+{
+ khint_t k_rg, k_sm;
+ int ret;
+ khash_t(sm) *rg2smid = (khash_t(sm)*)sm->rg2smid;
+ k_rg = kh_get(sm, rg2smid, key);
+ if (k_rg != kh_end(rg2smid)) return; // duplicated @RG-ID
+ k_rg = kh_put(sm, rg2smid, strdup(key), &ret);
+ k_sm = kh_get(sm, sm2id, val);
+ if (k_sm == kh_end(sm2id)) { // absent
+ if (sm->n == sm->m) {
+ sm->m = sm->m? sm->m<<1 : 1;
+ sm->smpl = realloc(sm->smpl, sizeof(void*) * sm->m);
+ }
+ sm->smpl[sm->n] = strdup(val);
+ k_sm = kh_put(sm, sm2id, sm->smpl[sm->n], &ret);
+ kh_val(sm2id, k_sm) = sm->n++;
+ }
+ kh_val(rg2smid, k_rg) = kh_val(sm2id, k_sm);
+}
+
+int bam_smpl_add(bam_sample_t *sm, const char *fn, const char *txt)
+{
+ const char *p = txt, *q, *r;
+ kstring_t buf;
+ int n = 0;
+ khash_t(sm) *sm2id = (khash_t(sm)*)sm->sm2id;
+ memset(&buf, 0, sizeof(kstring_t));
+ while ((q = strstr(p, "@RG")) != 0) {
+ p = q + 3;
+ r = q = 0;
+ if ((q = strstr(p, "\tID:")) != 0) q += 4;
+ if ((r = strstr(p, "\tSM:")) != 0) r += 4;
+ if (r && q) {
+ char *u, *v;
+ int oq, or;
+ for (u = (char*)q; *u && *u != '\t' && *u != '\n'; ++u);
+ for (v = (char*)r; *v && *v != '\t' && *v != '\n'; ++v);
+ oq = *u; or = *v; *u = *v = '\0';
+ buf.l = 0; kputs(fn, &buf); kputc('/', &buf); kputs(q, &buf);
+ add_pair(sm, sm2id, buf.s, r);
+ *u = oq; *v = or;
+ } else break;
+ p = q > r? q : r;
+ ++n;
+ }
+ if (n == 0) add_pair(sm, sm2id, fn, fn);
+ free(buf.s);
+ return 0;
+}
+
+int bam_smpl_rg2smid(const bam_sample_t *sm, const char *fn, const char *rg, kstring_t *str)
+{
+ khint_t k;
+ khash_t(sm) *rg2smid = (khash_t(sm)*)sm->rg2smid;
+ if (rg) {
+ str->l = 0;
+ kputs(fn, str); kputc('/', str); kputs(rg, str);
+ k = kh_get(sm, rg2smid, str->s);
+ } else k = kh_get(sm, rg2smid, fn);
+ return k == kh_end(rg2smid)? -1 : kh_val(rg2smid, k);
+}
--- /dev/null
+#ifndef BAM_SAMPLE_H
+#define BAM_SAMPLE_H
+
+#include "kstring.h"
+
+typedef struct {
+ int n, m;
+ char **smpl;
+ void *rg2smid, *sm2id;
+} bam_sample_t;
+
+bam_sample_t *bam_smpl_init(void);
+int bam_smpl_add(bam_sample_t *sm, const char *abs, const char *txt);
+int bam_smpl_rg2smid(const bam_sample_t *sm, const char *fn, const char *rg, kstring_t *str);
+void bam_smpl_destroy(bam_sample_t *sm);
+
+#endif