From: Heng Li Date: Fri, 22 Apr 2011 03:13:42 +0000 (+0000) Subject: Release samtools-0.1.16 (r963:234) X-Git-Url: https://git.donarmstrong.com/?p=samtools.git;a=commitdiff_plain;h=8ec3e2c3435d8d000ff5edbeb250e8b46c324eca Release samtools-0.1.16 (r963:234) --- diff --git a/NEWS b/NEWS index de55679..a600bb1 100644 --- a/NEWS +++ b/NEWS @@ -1,3 +1,40 @@ +Beta Release 0.1.16 (21 April, 2011) +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Notable changes in samtools: + + * Support the new SAM/BAM type `B' in the latest SAM spec v1.4. + + * When the output file of `samtools merge' exists, do not overwrite it unless + a new command-line option `-f' is applied. + + * Bugfix: BED support is not working when the input BED is not sorted. + + * Bugfix: some reads without coordinates but given on the reverse strand are + lost in merging. + +Notable changes in bcftools: + + * Code cleanup: separated max-likelihood inference and Bayesian inference. + + * Test Hardy-Weinberg equilibrium with a likelihood-ratio test. + + * Provided another association test P-value by likelihood-ratio test. + + * Use Brent's method to estimate the site allele frequency when EM converges + slowly. The resulting ML estimate of allele frequnecy is more accurate. + + * Added the `ldpair' command, which computes r^2 between SNP pairs given in + an input file. + +Also, the `pileup' command, which has been deprecated by `mpileup' since +version 0.1.10, will be dropped in the next release. The old `pileup' command +is substandard and causing a lot of confusion. + +(0.1.16: 21 April 2011, r963:234) + + + Beta Release 0.1.15 (10 April, 2011) ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ diff --git a/bam.h b/bam.h index 2de5b09..e7360bb 100644 --- a/bam.h +++ b/bam.h @@ -40,7 +40,7 @@ @copyright Genome Research Ltd. */ -#define BAM_VERSION "0.1.15 (r949:203)" +#define BAM_VERSION "0.1.16 (r963:234)" #include #include diff --git a/bcftools/bcftools.1 b/bcftools/bcftools.1 index 6d11e77..c6b4968 100644 --- a/bcftools/bcftools.1 +++ b/bcftools/bcftools.1 @@ -95,13 +95,18 @@ Uncompressed BCF output (force -b). .B Consensus/Variant Calling Options: .TP 10 .B -c -Call variants. +Call variants using Bayesian inference. This option automatically invokes option +.BR -e . .TP .BI -d \ FLOAT When .B -v is in use, skip loci where the fraction of samples covered by reads is below FLOAT. [0] .TP +.B -e +Perform max-likelihood inference only, including estimating the site allele frequency, +testing Hardy-Weinberg equlibrium and testing associations with LRT. +.TP .B -g Call per-sample genotypes at variant sites (force -c) .TP diff --git a/bcftools/main.c b/bcftools/main.c index 07d9e93..7293374 100644 --- a/bcftools/main.c +++ b/bcftools/main.c @@ -1,8 +1,12 @@ #include #include #include +#include #include "bcf.h" +#include "kseq.h" +KSTREAM_INIT(gzFile, gzread, 0x10000) + int bcfview(int argc, char *argv[]); int bcf_main_index(int argc, char *argv[]); @@ -42,16 +46,88 @@ int bcf_cat(int n, char * const *fn) return 0; } -int bcf_main_pwld(int argc, char *argv[]) +extern double bcf_pair_freq(const bcf1_t *b0, const bcf1_t *b1, double f[4]); + +int bcf_main_ldpair(int argc, char *argv[]) +{ + bcf_t *fp; + bcf_hdr_t *h; + bcf1_t *b0, *b1; + bcf_idx_t *idx; + kstring_t str; + void *str2id; + gzFile fplist; + kstream_t *ks; + int dret, lineno = 0; + if (argc < 3) { + fprintf(stderr, "Usage: bcftools ldpair \n"); + return 1; + } + fplist = gzopen(argv[2], "rb"); + ks = ks_init(fplist); + memset(&str, 0, sizeof(kstring_t)); + fp = bcf_open(argv[1], "rb"); + h = bcf_hdr_read(fp); + str2id = bcf_build_refhash(h); + idx = bcf_idx_load(argv[1]); + if (idx == 0) { + fprintf(stderr, "[%s] No bcf index is found. Abort!\n", __func__); + return 1; + } + b0 = calloc(1, sizeof(bcf1_t)); + b1 = calloc(1, sizeof(bcf1_t)); + while (ks_getuntil(ks, '\n', &str, &dret) >= 0) { + char *p, *q; + int k; + int tid0 = -1, tid1 = -1, pos0 = -1, pos1 = -1; + ++lineno; + for (p = q = str.s, k = 0; *p; ++p) { + if (*p == ' ' || *p == '\t') { + *p = '\0'; + if (k == 0) tid0 = bcf_str2id(str2id, q); + else if (k == 1) pos0 = atoi(q) - 1; + else if (k == 2) tid1 = strcmp(q, "=")? bcf_str2id(str2id, q) : tid0; + else if (k == 3) pos1 = atoi(q) - 1; + q = p + 1; + ++k; + } + } + if (k == 3) pos1 = atoi(q) - 1; + if (tid0 >= 0 && tid1 >= 0 && pos0 >= 0 && pos1 >= 0) { + uint64_t off; + double r, f[4]; + off = bcf_idx_query(idx, tid0, pos0); + bgzf_seek(fp->fp, off, SEEK_SET); + while (bcf_read(fp, h, b0) >= 0 && b0->pos != pos0); + off = bcf_idx_query(idx, tid1, pos1); + bgzf_seek(fp->fp, off, SEEK_SET); + while (bcf_read(fp, h, b1) >= 0 && b1->pos != pos1); + r = bcf_pair_freq(b0, b1, f); + r *= r; + printf("%s\t%d\t%s\t%d\t%.4g\t%.4g\t%.4g\t%.4g\t%.4g\n", h->ns[tid0], pos0+1, h->ns[tid1], pos1+1, + r, f[0], f[1], f[2], f[3]); + } //else fprintf(stderr, "[%s] Parse error at line %d.\n", __func__, lineno); + } + bcf_destroy(b0); bcf_destroy(b1); + bcf_idx_destroy(idx); + bcf_str2id_destroy(str2id); + bcf_hdr_destroy(h); + bcf_close(fp); + free(str.s); + ks_destroy(ks); + gzclose(fplist); + return 0; +} + +int bcf_main_ld(int argc, char *argv[]) { - extern double bcf_pair_freq(const bcf1_t *b0, const bcf1_t *b1, double f[4]); bcf_t *fp; bcf_hdr_t *h; bcf1_t **b, *b0; int i, j, m, n; double f[4]; if (argc == 1) { - fprintf(stderr, "Usage: bcftools pwld \n"); + fprintf(stderr, "Usage: bcftools ld \n"); return 1; } fp = bcf_open(argv[1], "rb"); @@ -95,12 +171,14 @@ int main(int argc, char *argv[]) fprintf(stderr, " index index BCF\n"); fprintf(stderr, " cat concatenate BCFs\n"); fprintf(stderr, " ld compute all-pair r^2\n"); + fprintf(stderr, " ldpair compute r^2 between requested pairs\n"); fprintf(stderr, "\n"); return 1; } if (strcmp(argv[1], "view") == 0) return bcfview(argc-1, argv+1); else if (strcmp(argv[1], "index") == 0) return bcf_main_index(argc-1, argv+1); - else if (strcmp(argv[1], "ld") == 0) return bcf_main_pwld(argc-1, argv+1); + else if (strcmp(argv[1], "ld") == 0) return bcf_main_ld(argc-1, argv+1); + else if (strcmp(argv[1], "ldpair") == 0) return bcf_main_ldpair(argc-1, argv+1); else if (strcmp(argv[1], "cat") == 0) return bcf_cat(argc-2, argv+2); // cat is different ... else { fprintf(stderr, "[main] Unrecognized command.\n"); diff --git a/samtools.1 b/samtools.1 index ba32392..c71fc87 100644 --- a/samtools.1 +++ b/samtools.1 @@ -1,4 +1,4 @@ -.TH samtools 1 "16 March 2011" "samtools-0.1.14" "Bioinformatics tools" +.TH samtools 1 "21 April 2011" "samtools-0.1.16" "Bioinformatics tools" .SH NAME .PP samtools - Utilities for the Sequence Alignment/Map (SAM) format @@ -285,7 +285,7 @@ Approximately the maximum required memory. [500000000] .TP .B merge -samtools merge [-nur] [-h inh.sam] [-R reg] [...] +samtools merge [-nur1f] [-h inh.sam] [-R reg] [...] Merge multiple sorted alignments. The header reference lists of all the input BAM files, and the @SQ headers of @@ -302,6 +302,12 @@ and the headers of other files will be ignored. .B OPTIONS: .RS .TP 8 +.B -1 +Use zlib compression level 1 to comrpess the output +.TP +.B -f +Force to overwrite the output file if present. +.TP 8 .BI -h \ FILE Use the lines of .I FILE @@ -313,17 +319,18 @@ replacing any header lines that would otherwise be copied from is actually in SAM format, though any alignment records it may contain are ignored.) .TP +.B -n +The input alignments are sorted by read names rather than by chromosomal +coordinates +.TP .BI -R \ STR Merge files in the specified region indicated by .I STR +[null] .TP .B -r Attach an RG tag to each alignment. The tag value is inferred from file names. .TP -.B -n -The input alignments are sorted by read names rather than by chromosomal -coordinates -.TP .B -u Uncompressed BAM output .RE