lib:libbam.a
libbam.a:$(LOBJS)
- $(AR) -cru $@ $(LOBJS)
+ $(AR) -csru $@ $(LOBJS)
samtools:lib-recur $(AOBJS)
$(CC) $(CFLAGS) -o $@ $(AOBJS) -Lbcftools $(LIBPATH) libbam.a -lbcf $(LIBCURSES) -lm -lz
@copyright Genome Research Ltd.
*/
-#define BAM_VERSION "0.1.16 (r963:234)"
+#define BAM_VERSION "0.1.16-dev (r969:252)"
#include <stdint.h>
#include <stdlib.h>
return k == kh_end(h)? -1 : kh_value(h, k);
}
-int bam_parse_region(bam_header_t *header, const char *str, int *ref_id, int *begin, int *end)
+int bam_parse_region(bam_header_t *header, const char *str, int *ref_id, int *beg, int *end)
{
- char *s, *p;
- int i, l, k;
+ char *s;
+ int i, l, k, name_end;
khiter_t iter;
khash_t(s) *h;
bam_init_header_hash(header);
h = (khash_t(s)*)header->hash;
- l = strlen(str);
- p = s = (char*)malloc(l+1);
- /* squeeze out "," */
- for (i = k = 0; i != l; ++i)
- if (str[i] != ',' && !isspace(str[i])) s[k++] = str[i];
- s[k] = 0;
- for (i = 0; i != k; ++i) if (s[i] == ':') break;
- s[i] = 0;
- iter = kh_get(s, h, s); /* get the ref_id */
- if (iter == kh_end(h)) { // name not found
- *ref_id = -1; free(s);
- return -1;
- }
- *ref_id = kh_value(h, iter);
- if (i == k) { /* dump the whole sequence */
- *begin = 0; *end = 1<<29; free(s);
- return 0;
- }
- for (p = s + i + 1; i != k; ++i) if (s[i] == '-') break;
- *begin = atoi(p);
- if (i < k) {
- p = s + i + 1;
- *end = atoi(p);
- } else *end = 1<<29;
- if (*begin > 0) --*begin;
+ *ref_id = *beg = *end = -1;
+ name_end = l = strlen(str);
+ s = (char*)malloc(l+1);
+ // remove space
+ for (i = k = 0; i < l; ++i)
+ if (!isspace(str[i])) s[k++] = str[i];
+ s[k] = 0; l = k;
+ // determine the sequence name
+ for (i = l - 1; i >= 0; --i) if (s[i] == ':') break; // look for colon from the end
+ if (i >= 0) name_end = i;
+ if (name_end < l) { // check if this is really the end
+ int n_hyphen = 0;
+ for (i = name_end + 1; i < l; ++i) {
+ if (s[i] == '-') ++n_hyphen;
+ else if (!isdigit(s[i]) && s[i] != ',') break;
+ }
+ if (i < l || n_hyphen > 1) name_end = l; // malformated region string; then take str as the name
+ s[name_end] = 0;
+ iter = kh_get(s, h, s);
+ if (iter == kh_end(h)) { // cannot find the sequence name
+ iter = kh_get(s, h, str); // try str as the name
+ if (iter == kh_end(h)) {
+ if (bam_verbose >= 2) fprintf(stderr, "[%s] fail to determine the sequence name.\n", __func__);
+ free(s); return -1;
+ } else s[name_end] = ':', name_end = l;
+ }
+ } else iter = kh_get(s, h, str);
+ *ref_id = kh_val(h, iter);
+ // parse the interval
+ if (name_end < l) {
+ for (i = k = name_end + 1; i < l; ++i)
+ if (s[i] != ',') s[k++] = s[i];
+ s[k] = 0;
+ *beg = atoi(s + name_end + 1);
+ for (i = name_end + 1; i != k; ++i) if (s[i] == '-') break;
+ *end = i < k? atoi(s + i + 1) : 1<<29;
+ if (*beg > 0) --*beg;
+ } else *beg = 0, *end = 1<<29;
free(s);
- if (*begin > *end) {
- fprintf(stderr, "[bam_parse_region] invalid region.\n");
- return -1;
- }
- return 0;
+ return *beg <= *end? 0 : -1;
}
int32_t bam_aux2i(const uint8_t *s)
else if (str->s[5] == 'S') while (p < str->s + str->l) ((uint16_t*)s)[k++] = (uint16_t)strtol(p, &p, 0), ++p;
else if (str->s[5] == 'i') while (p < str->s + str->l) ((int32_t*)s)[k++] = (int32_t)strtol(p, &p, 0), ++p;
else if (str->s[5] == 'I') while (p < str->s + str->l) ((uint32_t*)s)[k++] = (uint32_t)strtol(p, &p, 0), ++p;
- else if (str->s[5] == 'f') while (p < str->s + str->l) ((float*)s)[k++] = (float)strtof(p, &p), ++p;
+ else if (str->s[5] == 'f') while (p < str->s + str->l) ((float*)s)[k++] = (float)strtod(p, &p), ++p;
else parse_error(fp->n_lines, "unrecognized array type");
s += Bsize * n; doff += size;
} else parse_error(fp->n_lines, "unrecognized type");
save_bin = save_tid = last_tid = last_bin = 0xffffffffu;
save_off = last_off = bam_tell(fp); last_coor = 0xffffffffu;
- n_mapped = n_unmapped = n_no_coor = off_end = 0;
+ n_mapped = n_unmapped = n_no_coor = off_end = 0;
off_beg = off_end = bam_tell(fp);
while ((ret = bam_read1(fp, b)) >= 0) {
if (c->tid < 0) ++n_no_coor;
- if (last_tid != c->tid) { // change of chromosomes
+ if (last_tid < c->tid) { // change of chromosomes
last_tid = c->tid;
last_bin = 0xffffffffu;
+ } else if (last_tid > c->tid) {
+ fprintf(stderr, "[bam_index_core] the alignment is not sorted (%s): %d-th chr > %d-th chr\n",
+ bam1_qname(b), last_tid+1, c->tid+1);
+ return NULL;
} else if (last_coor > c->pos) {
fprintf(stderr, "[bam_index_core] the alignment is not sorted (%s): %u > %u in %d-th chr\n",
bam1_qname(b), last_coor, c->pos, c->tid+1);
- exit(1);
+ return NULL;
}
if (c->tid >= 0) insert_offset2(&idx->index2[b->core.tid], b, last_off);
if (c->bin != last_bin) { // then possibly write the binning index
if (bam_tell(fp) <= last_off) {
fprintf(stderr, "[bam_index_core] bug in BGZF/RAZF: %llx < %llx\n",
(unsigned long long)bam_tell(fp), (unsigned long long)last_off);
- exit(1);
+ return NULL;
}
if (c->flag & BAM_FUNMAP) ++n_unmapped;
else ++n_mapped;
++n_no_coor;
if (c->tid >= 0 && n_no_coor) {
fprintf(stderr, "[bam_index_core] the alignment is not sorted: reads without coordinates prior to reads with coordinates.\n");
- exit(1);
+ return NULL;
}
}
}
}
idx = bam_index_core(fp);
bam_close(fp);
+ if(idx == 0) {
+ fprintf(stderr, "[bam_index_build2] fail to index the BAM file.\n");
+ return -1;
+ }
if (_fnidx == 0) {
fnidx = (char*)calloc(strlen(fn) + 5, 1);
strcpy(fnidx, fn); strcat(fnidx, ".bai");
if (conf->bed && tid >= 0 && !bed_overlap(conf->bed, h->target_name[tid], pos, pos+1)) continue;
if (tid != ref_tid) {
free(ref); ref = 0;
- if (conf->fai) ref = fai_fetch(conf->fai, h->target_name[tid], &ref_len);
+ if (conf->fai) ref = faidx_fetch_seq(conf->fai, h->target_name[tid], 0, 0x7fffffff, &ref_len);
for (i = 0; i < n; ++i) data[i]->ref = ref, data[i]->ref_id = tid;
ref_tid = tid;
}
#ifndef BCF_H
#define BCF_H
+#define BCF_VERSION "0.1.16-dev (r963:240)"
+
#include <stdint.h>
#include <zlib.h>
#include "khash.h"
KHASH_MAP_INIT_STR(str2id, int)
+// FIXME: valgrind report a memory leak in this function. Probably it does not get deallocated...
void *bcf_build_refhash(bcf_hdr_t *h)
{
khash_t(str2id) *hash;
bcf_sync(b);
}
-static int update_bcf1(bcf1_t *b, const bcf_p1aux_t *pa, const bcf_p1rst_t *pr, double pref, int flag, double em[9])
+static int update_bcf1(bcf1_t *b, const bcf_p1aux_t *pa, const bcf_p1rst_t *pr, double pref, int flag, double em[10])
{
kstring_t s;
int has_I16, is_var;
if (em[4] >= 0 && em[4] <= 0.05) ksprintf(&s, ";G3=%.4g,%.4g,%.4g;HWE=%.3g", em[3], em[2], em[1], em[4]);
if (em[5] >= 0 && em[6] >= 0) ksprintf(&s, ";AF2=%.4g,%.4g", 1 - em[5], 1 - em[6]);
if (em[7] >= 0) ksprintf(&s, ";LRT=%.3g", em[7]);
+ if (em[8] >= 0) ksprintf(&s, ";LRT2=%.3g", em[8]);
+ //if (em[9] >= 0) ksprintf(&s, ";LRT1=%.3g", em[9]);
}
if (pr == 0) { // if pr is unset, return
kputc('\0', &s); kputs(b->fmt, &s); kputc('\0', &s);
is_var = (pr->p_ref < pref);
r = is_var? pr->p_ref : pr->p_var;
- ksprintf(&s, ";CI95=%.4g,%.4g", pr->cil, pr->cih); // FIXME: when EM is not used, ";" should be omitted!
+// ksprintf(&s, ";CI95=%.4g,%.4g", pr->cil, pr->cih); // FIXME: when EM is not used, ";" should be omitted!
+ ksprintf(&s, ";AC1=%d", pr->ac);
if (has_I16) ksprintf(&s, ";DP4=%d,%d,%d,%d;MQ=%d", a.d[0], a.d[1], a.d[2], a.d[3], a.mq);
fq = pr->p_ref_folded < 0.5? -4.343 * log(pr->p_ref_folded) : 4.343 * log(pr->p_var_folded);
if (fq < -999) fq = -999;
if (q[i] > 255) q[i] = 255;
}
if (pr->perm_rank >= 0) ksprintf(&s, ";PR=%d", pr->perm_rank);
+ // ksprintf(&s, ";LRT3=%.3g", pr->lrt);
ksprintf(&s, ";PCHI2=%.3g;PC2=%d,%d", q[1], q[2], pr->p_chi2);
}
if (has_I16 && a.is_tested) ksprintf(&s, ";PV4=%.2g,%.2g,%.2g,%.2g", a.p[0], a.p[1], a.p[2], a.p[3]);
if (!strstr(str.s, "##INFO=<ID=FQ,"))
kputs("##INFO=<ID=FQ,Number=1,Type=Float,Description=\"Phred probability of all samples being the same\">\n", &str);
if (!strstr(str.s, "##INFO=<ID=AF1,"))
- kputs("##INFO=<ID=AF1,Number=1,Type=Float,Description=\"Max-likelihood estimate of the site allele frequency of the first ALT allele\">\n", &str);
+ kputs("##INFO=<ID=AF1,Number=1,Type=Float,Description=\"Max-likelihood estimate of the first ALT allele frequency (assuming HWE)\">\n", &str);
+ if (!strstr(str.s, "##INFO=<ID=AC1,"))
+ kputs("##INFO=<ID=AC1,Number=1,Type=Float,Description=\"Max-likelihood estimate of the first ALT allele count (no HWE assumption)\">\n", &str);
if (!strstr(str.s, "##INFO=<ID=G3,"))
kputs("##INFO=<ID=G3,Number=3,Type=Float,Description=\"ML estimate of genotype frequencies\">\n", &str);
if (!strstr(str.s, "##INFO=<ID=HWE,"))
kputs("##INFO=<ID=HWE,Number=1,Type=Float,Description=\"Chi^2 based HWE test P-value based on G3\">\n", &str);
- if (!strstr(str.s, "##INFO=<ID=CI95,"))
- kputs("##INFO=<ID=CI95,Number=2,Type=Float,Description=\"Equal-tail Bayesian credible interval of the site allele frequency at the 95% level\">\n", &str);
+// if (!strstr(str.s, "##INFO=<ID=CI95,"))
+// kputs("##INFO=<ID=CI95,Number=2,Type=Float,Description=\"Equal-tail Bayesian credible interval of the site allele frequency at the 95% level\">\n", &str);
if (!strstr(str.s, "##INFO=<ID=PV4,"))
kputs("##INFO=<ID=PV4,Number=4,Type=Float,Description=\"P-values for strand bias, baseQ bias, mapQ bias and tail distance bias\">\n", &str);
if (!strstr(str.s, "##INFO=<ID=INDEL,"))
}
while (vcf_read(bp, hin, b) > 0) {
int is_indel;
- double em[9];
+ double em[10];
if ((vc.flag & VC_VARONLY) && strcmp(b->alt, "X") == 0) continue;
if ((vc.flag & VC_VARONLY) && vc.min_smpl_frac > 0.) {
extern int bcf_smpl_covered(const bcf1_t *b);
bcf_2qcall(hout, b);
continue;
}
- if (vc.flag & VC_EM) bcf_em1(b, vc.n1, 0xff, em);
+ if (vc.flag & (VC_CALL|VC_ADJLD|VC_EM)) bcf_gl2pl(b);
+ if (vc.flag & VC_EM) bcf_em1(b, vc.n1, 0x1ff, em);
else {
int i;
for (i = 0; i < 9; ++i) em[i] = -1.;
}
- if (vc.flag & (VC_CALL|VC_ADJLD)) bcf_gl2pl(b);
if (vc.flag & VC_CALL) { // call variants
bcf_p1rst_t pr;
int calret = bcf_p1_cal(b, (em[7] >= 0 && em[7] < vc.min_lrt), p1, &pr);
int i, n = 0;
for (i = 0; i < vc.n_perm; ++i) {
#ifdef BCF_PERM_LRT // LRT based permutation is much faster but less robust to artifacts
- double x[9];
+ double x[10];
bcf_shuffle(b, seeds[i]);
bcf_em1(b, vc.n1, 1<<7, x);
if (x[7] < em[7]) ++n;
// x[5..6]: group1 freq, group2 freq
// x[7]: 1-degree P-value
// x[8]: 2-degree P-value
-int bcf_em1(const bcf1_t *b, int n1, int flag, double x[9])
+// x[9]: 1-degree P-value with freq estimated from genotypes
+int bcf_em1(const bcf1_t *b, int n1, int flag, double x[10])
{
double *pdg;
int i, n, n2;
n = b->n_smpl; n2 = n - n1;
pdg = get_pdg3(b);
if (pdg == 0) return -1;
- for (i = 0; i < 9; ++i) x[i] = -1.;
+ for (i = 0; i < 10; ++i) x[i] = -1.; // set to negative
{
if ((x[0] = est_freq(n, pdg)) < 0.) {
free(pdg);
}
x[0] = freqml(x[0], 0, n, pdg);
}
- if (flag & (0xf<<1|1<<8)) { // estimate the genotype frequency and test HWE
+ if (flag & (0xf<<1|3<<8)) { // estimate the genotype frequency and test HWE
double *g = x + 1, f3[3], r;
f3[0] = g[0] = (1 - x[0]) * (1 - x[0]);
f3[1] = g[1] = 2 * x[0] * (1 - x[0]);
f3[i][0] = (1-f[i])*(1-f[i]), f3[i][1] = 2*f[i]*(1-f[i]), f3[i][2] = f[i]*f[i];
x[7] = kf_gammaq(.5, log(lk_ratio_test(n, n1, pdg, f3)));
}
- if ((flag & 1<<8) && n1 > 0 && n1 < n) { // 2-degree P-value
- double g[3][3];
+ if ((flag & 3<<8) && n1 > 0 && n1 < n) { // 2-degree P-value
+ double g[3][3], tmp;
for (i = 0; i < 3; ++i) memcpy(g[i], x + 1, 3 * sizeof(double));
for (i = 0; i < ITER_MAX; ++i)
if (g3_iter(g[1], pdg, 0, n1) < EPS) break;
for (i = 0; i < ITER_MAX; ++i)
if (g3_iter(g[2], pdg, n1, n) < EPS) break;
- x[8] = kf_gammaq(1., log(lk_ratio_test(n, n1, pdg, g)));
+ tmp = log(lk_ratio_test(n, n1, pdg, g));
+ x[8] = kf_gammaq(1., tmp);
+ x[9] = kf_gammaq(.5, tmp);
}
// free
free(pdg);
{
if (argc == 1) {
fprintf(stderr, "\n");
+ fprintf(stderr, "Program: bcftools (Tools for data in the VCF/BCF formats)\n");
+ fprintf(stderr, "Version: %s\n\n", BCF_VERSION);
fprintf(stderr, "Usage: bcftools <command> <arguments>\n\n");
fprintf(stderr, "Command: view print, extract, convert and call SNPs from BCF\n");
fprintf(stderr, " index index BCF\n");
for (k = 0, sum = 0.; k < ma->M; ++k)
sum += ma->afs1[k];
rst->p_var = (double)sum;
+ { // compute the allele count
+ double max = -1;
+ rst->ac = -1;
+ for (k = 0; k <= ma->M; ++k)
+ if (max < ma->z[k]) max = ma->z[k], rst->ac = k;
+ rst->ac = ma->M - rst->ac;
+ }
// calculate f_flat and f_em
for (k = 0, sum = 0.; k <= ma->M; ++k)
sum += (long double)ma->z[k];
{ // estimate equal-tail credible interval (95% level)
int l, h;
double p;
- for (i = 0, p = 0.; i < ma->M; ++i)
+ for (i = 0, p = 0.; i <= ma->M; ++i)
if (p + ma->afs1[i] > 0.025) break;
else p += ma->afs1[i];
l = i;
- for (i = ma->M-1, p = 0.; i >= 0; --i)
+ for (i = ma->M, p = 0.; i >= 0; --i)
if (p + ma->afs1[i] > 0.025) break;
else p += ma->afs1[i];
h = i;
rst->cil = (double)(ma->M - h) / ma->M; rst->cih = (double)(ma->M - l) / ma->M;
}
+ if (ma->n1 > 0) { // compute LRT
+ double max0, max1, max2;
+ for (k = 0, max0 = -1; k <= ma->M; ++k)
+ if (max0 < ma->z[k]) max0 = ma->z[k];
+ for (k = 0, max1 = -1; k <= ma->n1 * 2; ++k)
+ if (max1 < ma->z1[k]) max1 = ma->z1[k];
+ for (k = 0, max2 = -1; k <= ma->M - ma->n1 * 2; ++k)
+ if (max2 < ma->z2[k]) max2 = ma->z2[k];
+ rst->lrt = log(max1 * max2 / max0);
+ rst->lrt = rst->lrt < 0? 1 : kf_gammaq(.5, rst->lrt);
+ } else rst->lrt = -1.0;
rst->cmp[0] = rst->cmp[1] = rst->cmp[2] = rst->p_chi2 = -1.0;
if (do_contrast && rst->p_var > 0.5) // skip contrast2() if the locus is a strong non-variant
rst->p_chi2 = contrast2(ma, rst->cmp);
typedef struct {
int rank0, perm_rank; // NB: perm_rank is always set to -1 by bcf_p1_cal()
+ int ac; // ML alternative allele count
double f_exp, f_flat, p_ref_folded, p_ref, p_var_folded, p_var;
double cil, cih;
- double cmp[3], p_chi2; // used by contrast2()
+ double cmp[3], p_chi2, lrt; // used by contrast2()
} bcf_p1rst_t;
#define MC_PTYPE_FULL 1
int bcf_p1_set_n1(bcf_p1aux_t *b, int n1);
void bcf_p1_set_folded(bcf_p1aux_t *p1a); // only effective when set_n1() is not called
- int bcf_em1(const bcf1_t *b, int n1, int flag, double x[9]);
+ int bcf_em1(const bcf1_t *b, int n1, int flag, double x[10]);
#ifdef __cplusplus
}
char *fai_fetch(const faidx_t *fai, const char *str, int *len)
{
- char *s, *p, c;
- int i, l, k;
+ char *s, c;
+ int i, l, k, name_end;
khiter_t iter;
faidx1_t val;
khash_t(s) *h;
beg = end = -1;
h = fai->hash;
- l = strlen(str);
- p = s = (char*)malloc(l+1);
- /* squeeze out "," */
- for (i = k = 0; i != l; ++i)
- if (str[i] != ',' && !isspace(str[i])) s[k++] = str[i];
- s[k] = 0;
- for (i = 0; i != k; ++i) if (s[i] == ':') break;
- s[i] = 0;
- iter = kh_get(s, h, s); /* get the ref_id */
- if (iter == kh_end(h)) {
- *len = 0;
- free(s); return 0;
- }
+ name_end = l = strlen(str);
+ s = (char*)malloc(l+1);
+ // remove space
+ for (i = k = 0; i < l; ++i)
+ if (!isspace(str[i])) s[k++] = str[i];
+ s[k] = 0; l = k;
+ // determine the sequence name
+ for (i = l - 1; i >= 0; --i) if (s[i] == ':') break; // look for colon from the end
+ if (i >= 0) name_end = i;
+ if (name_end < l) { // check if this is really the end
+ int n_hyphen = 0;
+ for (i = name_end + 1; i < l; ++i) {
+ if (s[i] == '-') ++n_hyphen;
+ else if (!isdigit(s[i]) && s[i] != ',') break;
+ }
+ if (i < l || n_hyphen > 1) name_end = l; // malformated region string; then take str as the name
+ s[name_end] = 0;
+ iter = kh_get(s, h, s);
+ if (iter == kh_end(h)) { // cannot find the sequence name
+ iter = kh_get(s, h, str); // try str as the name
+ if (iter == kh_end(h)) {
+ *len = 0;
+ free(s); return 0;
+ } else s[name_end] = ':', name_end = l;
+ }
+ } else iter = kh_get(s, h, str);
val = kh_value(h, iter);
- if (i == k) { /* dump the whole sequence */
- beg = 0; end = val.len;
- } else {
- for (p = s + i + 1; i != k; ++i) if (s[i] == '-') break;
- beg = atoi(p);
- if (i < k) {
- p = s + i + 1;
- end = atoi(p);
- } else end = val.len;
- }
- if (beg > 0) --beg;
+ // parse the interval
+ if (name_end < l) {
+ for (i = k = name_end + 1; i < l; ++i)
+ if (s[i] != ',') s[k++] = s[i];
+ s[k] = 0;
+ beg = atoi(s + name_end + 1);
+ for (i = name_end + 1; i != k; ++i) if (s[i] == '-') break;
+ end = i < k? atoi(s + i + 1) : val.len;
+ if (beg > 0) --beg;
+ } else beg = 0, end = val.len;
if (beg >= val.len) beg = val.len;
if (end >= val.len) end = val.len;
if (beg > end) beg = end;