help messages

author Heng Li <lh3@live.co.uk>

Mon, 9 Aug 2010 05:12:05 +0000 (05:12 +0000)

committer Heng Li <lh3@live.co.uk>

Mon, 9 Aug 2010 05:12:05 +0000 (05:12 +0000)
author Heng Li <lh3@live.co.uk>
Mon, 9 Aug 2010 05:12:05 +0000 (05:12 +0000)
committer Heng Li <lh3@live.co.uk>
Mon, 9 Aug 2010 05:12:05 +0000 (05:12 +0000)
diff --git a/bcftools/README b/bcftools/README

new file mode 100644 (file)

index 0000000..1d7159d
--- /dev/null
+++ b/bcftools/README
@@ -0,0 +1,36 @@
+The view command of bcftools calls variants, tests Hardy-Weinberg
+equilibrium (HWE), tests allele balances and estimates allele frequency.
+
+This command calls a site as a potential variant if P(ref|D,F) is below
+0.9 (controlled by the -p option), where D is data and F is the prior
+allele frequency spectrum (AFS).
+
+The view command performs two types of allele balance tests, both based
+on Fisher's exact test for 2x2 contingency tables with the row variable
+being reference allele or not. In the first table, the column variable
+is strand. Two-tail P-value is taken. We test if variant bases tend to
+come from one strand. In the second table, the column variable is
+whether a base appears in the first or the last 11bp of the read.
+One-tail P-value is taken. We test if variant bases tend to occur
+towards the end of reads, which is usually an indication of
+misalignment.
+
+Site allele frequency is estimated in two ways. In the first way, the
+frequency is esimated as \argmax_f P(D|f) under the assumption of
+HWE. Prior AFS is not used. In the second way, the frequency is
+estimated as the posterior expectation of allele counts \sum_k
+kP(k|D,F), dividied by the total number of haplotypes. HWE is not
+assumed, but the estimate depends on the prior AFS. The two estimates
+largely agree when the signal is strong, but may differ greatly on weak
+sites as in this case, the prior plays an important role.
+
+To test HWE, we calculate the posterior distribution of genotypes
+(ref-hom, het and alt-hom). Chi-square test is performed. It is worth
+noting that the model used here is prior dependent and assumes HWE,
+which is different from both models for allele frequency estimate. The
+new model actually yields a third estimate of site allele frequency.
+
+The estimate allele frequency spectrum is printed to stderr per 64k
+sites. The estimate is in fact only the first round of a EM
+procedure. The second model (not the model for HWE testing) is used to
+estimate the AFS.
+\ No newline at end of file
diff --git a/bcftools/vcfout.c b/bcftools/vcfout.c

index 37107d097a8e2ff3c42cc0695e786dfa83143f6e..989e831aff90dffc3f53eb984e90bfadca949740 100644 (file)
--- a/bcftools/vcfout.c
+++ b/bcftools/vcfout.c
@@ -126,7 +126,7 @@ static int update_bcf1(int n_smpl, bcf1_t *b, const bcf_p1aux_t *pa, const bcf_p
                 if (b->info[0]) kputc(';', &s);
                 ksprintf(&s, "AF1=%.3lf;AFE=%.3lf", 1.-pr->f_em, 1.-pr->f_exp);
         }
-       if (p_hwe <= .2) ksprintf(&s, ";HWE=%.3lf", p_hwe);
+       if (p_hwe <= .2) ksprintf(&s, ";GC=%.2lf,%.2lf,%.2lf;HWE=%.3lf", pr->g[2], pr->g[1], pr->g[0], p_hwe);
         if (p_dp >= 0. && p_dp <= .2) ksprintf(&s, ";TDP=%.3lf", p_dp);
         if (p_ed >= 0. && p_ed <= .2) ksprintf(&s, ";TED=%.3lf", p_ed);
         kputc('\0', &s);
@@ -172,7 +172,17 @@ int bcfview(int argc, char *argv[])
                 }
         }
         if (argc == optind) {
-               fprintf(stderr, "Usage: bcftools view [-cGPb] [-l list] <in.bcf> [reg]\n");
+               fprintf(stderr, "\n");
+               fprintf(stderr, "Usage:   bcftools view [options] <in.bcf> [reg]\n\n");
+               fprintf(stderr, "Options: -c        SNP calling\n");
+               fprintf(stderr, "         -G        suppress all individual genotype information\n");
+               fprintf(stderr, "         -L        discard the PL genotype field\n");
+               fprintf(stderr, "         -v        output potential variant sites only\n");
+               fprintf(stderr, "         -l FILE   list of sites to output [all sites]\n");
+               fprintf(stderr, "         -t FLOAT  scaled mutation rate [%.4lg]\n", vc.theta);
+               fprintf(stderr, "         -p FLOAT  variant if P(ref|D)<FLOAT [%.3lg]\n", vc.pref);
+               fprintf(stderr, "         -P STR    type of prior: full, cond2, flat [full]\n");
+               fprintf(stderr, "\n");
                 return 1;
         }
author	Heng Li <lh3@live.co.uk>
	Mon, 9 Aug 2010 05:12:05 +0000 (05:12 +0000)
committer	Heng Li <lh3@live.co.uk>
	Mon, 9 Aug 2010 05:12:05 +0000 (05:12 +0000)
bcftools/README	[new file with mode: 0644]	patch \| blob
bcftools/vcfout.c		patch \| blob \| history