From ed5305b1d40c8d34c26221a6a897c8ebb9f4d5c6 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Mon, 9 Aug 2010 05:12:05 +0000 Subject: [PATCH] help messages --- bcftools/README | 36 ++++++++++++++++++++++++++++++++++++ bcftools/vcfout.c | 14 ++++++++++++-- 2 files changed, 48 insertions(+), 2 deletions(-) create mode 100644 bcftools/README diff --git a/bcftools/README b/bcftools/README new file mode 100644 index 0000000..1d7159d --- /dev/null +++ b/bcftools/README @@ -0,0 +1,36 @@ +The view command of bcftools calls variants, tests Hardy-Weinberg +equilibrium (HWE), tests allele balances and estimates allele frequency. + +This command calls a site as a potential variant if P(ref|D,F) is below +0.9 (controlled by the -p option), where D is data and F is the prior +allele frequency spectrum (AFS). + +The view command performs two types of allele balance tests, both based +on Fisher's exact test for 2x2 contingency tables with the row variable +being reference allele or not. In the first table, the column variable +is strand. Two-tail P-value is taken. We test if variant bases tend to +come from one strand. In the second table, the column variable is +whether a base appears in the first or the last 11bp of the read. +One-tail P-value is taken. We test if variant bases tend to occur +towards the end of reads, which is usually an indication of +misalignment. + +Site allele frequency is estimated in two ways. In the first way, the +frequency is esimated as \argmax_f P(D|f) under the assumption of +HWE. Prior AFS is not used. In the second way, the frequency is +estimated as the posterior expectation of allele counts \sum_k +kP(k|D,F), dividied by the total number of haplotypes. HWE is not +assumed, but the estimate depends on the prior AFS. The two estimates +largely agree when the signal is strong, but may differ greatly on weak +sites as in this case, the prior plays an important role. + +To test HWE, we calculate the posterior distribution of genotypes +(ref-hom, het and alt-hom). Chi-square test is performed. It is worth +noting that the model used here is prior dependent and assumes HWE, +which is different from both models for allele frequency estimate. The +new model actually yields a third estimate of site allele frequency. + +The estimate allele frequency spectrum is printed to stderr per 64k +sites. The estimate is in fact only the first round of a EM +procedure. The second model (not the model for HWE testing) is used to +estimate the AFS. \ No newline at end of file diff --git a/bcftools/vcfout.c b/bcftools/vcfout.c index 37107d0..989e831 100644 --- a/bcftools/vcfout.c +++ b/bcftools/vcfout.c @@ -126,7 +126,7 @@ static int update_bcf1(int n_smpl, bcf1_t *b, const bcf_p1aux_t *pa, const bcf_p if (b->info[0]) kputc(';', &s); ksprintf(&s, "AF1=%.3lf;AFE=%.3lf", 1.-pr->f_em, 1.-pr->f_exp); } - if (p_hwe <= .2) ksprintf(&s, ";HWE=%.3lf", p_hwe); + if (p_hwe <= .2) ksprintf(&s, ";GC=%.2lf,%.2lf,%.2lf;HWE=%.3lf", pr->g[2], pr->g[1], pr->g[0], p_hwe); if (p_dp >= 0. && p_dp <= .2) ksprintf(&s, ";TDP=%.3lf", p_dp); if (p_ed >= 0. && p_ed <= .2) ksprintf(&s, ";TED=%.3lf", p_ed); kputc('\0', &s); @@ -172,7 +172,17 @@ int bcfview(int argc, char *argv[]) } } if (argc == optind) { - fprintf(stderr, "Usage: bcftools view [-cGPb] [-l list] [reg]\n"); + fprintf(stderr, "\n"); + fprintf(stderr, "Usage: bcftools view [options] [reg]\n\n"); + fprintf(stderr, "Options: -c SNP calling\n"); + fprintf(stderr, " -G suppress all individual genotype information\n"); + fprintf(stderr, " -L discard the PL genotype field\n"); + fprintf(stderr, " -v output potential variant sites only\n"); + fprintf(stderr, " -l FILE list of sites to output [all sites]\n"); + fprintf(stderr, " -t FLOAT scaled mutation rate [%.4lg]\n", vc.theta); + fprintf(stderr, " -p FLOAT variant if P(ref|D)