From 19bf588b6b3425b1e925a2f837f0b8f351d057d7 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Sun, 10 Apr 2011 22:31:54 +0000 Subject: [PATCH] Release samtools-0.1.15 (r949:203) --- NEWS | 69 +++++++++++++++++++++++++++++++++++++++++++----- bam.h | 4 +-- bam2depth.c | 27 +++++++++++-------- bam_plcmd.c | 56 +++++++++++++++++++++------------------ bam_stat.c | 55 +++++++++++++++++++------------------- bcftools/call1.c | 53 +++++++++++++++++++------------------ bcftools/main.c | 1 + 7 files changed, 167 insertions(+), 98 deletions(-) diff --git a/NEWS b/NEWS index 946ba7b..de55679 100644 --- a/NEWS +++ b/NEWS @@ -1,4 +1,37 @@ -Beta release 0.1.14 (16 March, 2011) +Beta Release 0.1.15 (10 April, 2011) +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Noteable changes: + + * Allow to perform variant calling or to extract information in multiple + regions specified by a BED file (`samtools mpileup -l', `samtools view -L' + and `bcftools view -l'). + + * Added the `depth' command to samtools to compute the per-base depth with a + simpler interface. File `bam2depth.c', which implements this command, is the + recommended example on how to use the mpileup APIs. + + * Estimate genotype frequencies with ML; perform chi^2 based Hardy-Weinberg + test using this estimate. + + * For `samtools view', when `-R' is specified, drop read groups in the header + that are not contained in the specified file. + + * For `samtools flagstat', separate QC-pass and QC-fail reads. + + * Improved the command line help of `samtools mpileup' and `bcftools view'. + + * Use a global variable to control the verbose level of samtools stderr + output. Nonetheless, it has not been full utilized. + + * Fixed an issue in association test which may report false associations, + possibly due to floating point underflow. + +(0.1.15: 10 April 2011, r949:203) + + + +Beta release 0.1.14 (21 March, 2011) ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ This release implements a method for testing associations for case-control @@ -9,17 +42,41 @@ has not been evaluated on real data. Another new feature is to make X chromosome variant calls when female and male samples are both present. The user needs to provide a file indicating the -ploidy of each sample. +ploidy of each sample (see also manual bcftools/bcftools.1). + +Other notable changes: + + * Added `bcftools view -F' to parse BCF files generated by samtools r921 or + older which encodes PL in a different way. -Other new features: + * Changed the behavior of `bcftools view -s'. Now when a list of samples is + provided, the samples in the output will be reordered to match the ordering + in the sample list. This change is mainly designed for association test. + + * Sped up `bcftools view -v' for target sequencing given thousands of samples. + Also added a new option `view -d' to skip loci where only a few samples are + covered by reads. + + * Dropped HWE test. This feature has never been implemented properly. An EM + should be much better. To be implemented in future. + + * Added the `cat' command to samtools. This command concatenate BAMs with + identical sequence dictionaries in an efficient way. Modified from bam_cat.c + written by Chris Saunders. + + * Added `samtools view -1' to write BAMs at a low compression level but twice + faster to create. The `sort' command generates temporary files at a low + compression level as well. + + * Added `samtools mpileup -6' to accept "BAM" with Illumina 1.3+ quality + strings (strictly speaking, such a file is not BAM). * Added `samtools mpileup -L' to skip INDEL calling in regions with excessively high coverage. Such regions dramatically slow down mpileup. - * Added `bcftools view -F' to parse BCF files generated by samtools r921 or - older which encode PL in a different way. + * Updated `misc/export2sam.pl', provided by Chris Saunders from Illumina Inc. -(0.1.14: 16 March 2011, r930:163) +(0.1.14: 21 March 2011, r933:170) diff --git a/bam.h b/bam.h index 98612dd..87e3de3 100644 --- a/bam.h +++ b/bam.h @@ -33,14 +33,14 @@ BAM library provides I/O and various operations on manipulating files in the BAM (Binary Alignment/Mapping) or SAM (Sequence Alignment/Map) - format. It now supports importing from or exporting to TAM, sorting, + format. It now supports importing from or exporting to SAM, sorting, merging, generating pileup, and quickly retrieval of reads overlapped with a specified region. @copyright Genome Research Ltd. */ -#define BAM_VERSION "0.1.14 (r947:199)" +#define BAM_VERSION "0.1.15 (r949:203)" #include #include diff --git a/bam2depth.c b/bam2depth.c index 95588f9..ca36b89 100644 --- a/bam2depth.c +++ b/bam2depth.c @@ -13,6 +13,7 @@ typedef struct { // auxiliary data structure bamFile fp; // the file handler bam_iter_t iter; // NULL if a region not specified + int min_mapQ; // mapQ filter } aux_t; void *bed_read(const char *fn); // read a BED or position list file @@ -20,10 +21,12 @@ void bed_destroy(void *_h); // destroy the BED data structure int bed_overlap(const void *_h, const char *chr, int beg, int end); // test if chr:beg-end overlaps // This function reads a BAM alignment from one BAM file. -static int read_bam(void *data, bam1_t *b) +static int read_bam(void *data, bam1_t *b) // read level filters better go here to avoid pileup { aux_t *aux = (aux_t*)data; // data in fact is a pointer to an auxiliary structure - return aux->iter? bam_iter_read(aux->fp, aux->iter, b) : bam_read1(aux->fp, b); + int ret = aux->iter? bam_iter_read(aux->fp, aux->iter, b) : bam_read1(aux->fp, b); + if ((int)b->core.qual < aux->min_mapQ) b->core.flag |= BAM_FUNMAP; + return ret; } #ifdef _MAIN_BAM2DEPTH @@ -32,7 +35,7 @@ int main(int argc, char *argv[]) int main_depth(int argc, char *argv[]) #endif { - int i, n, tid, beg, end, pos, *n_plp, baseQ = 0; + int i, n, tid, beg, end, pos, *n_plp, baseQ = 0, mapQ = 0; const bam_pileup1_t **plp; char *reg = 0; // specified region void *bed = 0; // BED data structure @@ -41,29 +44,31 @@ int main_depth(int argc, char *argv[]) bam_mplp_t mplp; // parse the command line - while ((n = getopt(argc, argv, "r:b:q:")) >= 0) { + while ((n = getopt(argc, argv, "r:b:q:Q:")) >= 0) { switch (n) { case 'r': reg = strdup(optarg); break; // parsing a region requires a BAM header case 'b': bed = bed_read(optarg); break; // BED or position list file can be parsed now case 'q': baseQ = atoi(optarg); break; // base quality threshold + case 'Q': mapQ = atoi(optarg); break; // mapping quality threshold } } if (optind == argc) { - fprintf(stderr, "Usage: bam2depth [-r reg] [-q baseQthres] [-b in.bed] [...]\n"); + fprintf(stderr, "Usage: bam2depth [-r reg] [-q baseQthres] [-Q mapQthres] [-b in.bed] [...]\n"); return 1; } // initialize the auxiliary data structures n = argc - optind; // the number of BAMs on the command line - data = calloc(n, sizeof(void*)); - beg = 0; end = 1<<30; tid = -1; + data = calloc(n, sizeof(void*)); // data[i] for the i-th input + beg = 0; end = 1<<30; tid = -1; // set the default region for (i = 0; i < n; ++i) { bam_header_t *htmp; data[i] = calloc(1, sizeof(aux_t)); data[i]->fp = bam_open(argv[optind+i], "r"); // open BAM + data[i]->min_mapQ = mapQ; // set the mapQ filter htmp = bam_header_read(data[i]->fp); // read the BAM header if (i == 0) { - h = htmp; // keep the header for the 1st BAM + h = htmp; // keep the header of the 1st BAM if (reg) bam_parse_region(h, reg, &tid, &beg, &end); // also parse the region } else bam_header_destroy(htmp); // if not the 1st BAM, trash the header if (tid >= 0) { // if a region is specified and parsed successfully @@ -80,15 +85,15 @@ int main_depth(int argc, char *argv[]) while (bam_mplp_auto(mplp, &tid, &pos, n_plp, plp) > 0) { // come to the next covered position if (pos < beg || pos >= end) continue; // out of range; skip if (bed && bed_overlap(bed, h->target_name[tid], pos, pos + 1) == 0) continue; // not in BED; skip - fputs(h->target_name[tid], stdout); printf("\t%d", pos+1); - for (i = 0; i < n; ++i) { + fputs(h->target_name[tid], stdout); printf("\t%d", pos+1); // a customized printf() would be faster + for (i = 0; i < n; ++i) { // base level filters have to go here int j, m = 0; for (j = 0; j < n_plp[i]; ++j) { const bam_pileup1_t *p = plp[i] + j; // DON'T modfity plp[][] unless you really know if (p->is_del || p->is_refskip) ++m; // having dels or refskips at tid:pos else if (bam1_qual(p->b)[p->qpos] < baseQ) ++m; // low base quality } - printf("\t%d", n_plp[i] - m); + printf("\t%d", n_plp[i] - m); // this the depth to output } putchar('\n'); } diff --git a/bam_plcmd.c b/bam_plcmd.c index a879f47..f6381c3 100644 --- a/bam_plcmd.c +++ b/bam_plcmd.c @@ -437,6 +437,7 @@ int bam_pileup(int argc, char *argv[]) fprintf(stderr, " -G FLOAT prior of an indel between two haplotypes (for -c) [%.4g]\n", d->ido->r_indel); fprintf(stderr, " -I INT phred prob. of an indel in sequencing/prep. (for -c) [%d]\n", d->ido->q_indel); fprintf(stderr, "\n"); + fprintf(stderr, "Warning: Please use the `mpileup' command instead `pileup'. `Pileup' is deprecated!\n\n"); free(fn_list); free(fn_fa); free(d); return 1; } @@ -932,32 +933,35 @@ int bam_mpileup(int argc, char *argv[]) if (use_orphan) mplp.flag &= ~MPLP_NO_ORPHAN; if (argc == 1) { fprintf(stderr, "\n"); - fprintf(stderr, "Usage: samtools mpileup [options] in1.bam [in2.bam [...]]\n\n"); - fprintf(stderr, "Options: -f FILE reference sequence file [null]\n"); - fprintf(stderr, " -r STR region in which pileup is generated [null]\n"); - fprintf(stderr, " -l FILE list of positions (format: chr pos) [null]\n"); - fprintf(stderr, " -b FILE list of input BAM files [null]\n"); - fprintf(stderr, " -M INT cap mapping quality at INT [%d]\n", mplp.max_mq); - fprintf(stderr, " -Q INT min base quality [%d]\n", mplp.min_baseQ); - fprintf(stderr, " -q INT filter out alignment with MQ smaller than INT [%d]\n", mplp.min_mq); - fprintf(stderr, " -d INT max per-BAM depth [%d]\n", mplp.max_depth); - fprintf(stderr, " -L INT max per-sample depth for INDEL calling [%d]\n", mplp.max_indel_depth); - fprintf(stderr, " -P STR comma separated list of platforms for indels [all]\n"); - fprintf(stderr, " -o INT Phred-scaled gap open sequencing error probability [%d]\n", mplp.openQ); - fprintf(stderr, " -e INT Phred-scaled gap extension seq error probability [%d]\n", mplp.extQ); - fprintf(stderr, " -h INT coefficient for homopolyer errors [%d]\n", mplp.tandemQ); - fprintf(stderr, " -m INT minimum gapped reads for indel candidates [%d]\n", mplp.min_support); - fprintf(stderr, " -F FLOAT minimum fraction of gapped reads for candidates [%g]\n", mplp.min_frac); - fprintf(stderr, " -G FILE exclude read groups listed in FILE [null]\n"); - fprintf(stderr, " -6 quality is in the Illumina-1.3+ encoding\n"); - fprintf(stderr, " -A use anomalous read pairs in SNP/INDEL calling\n"); - fprintf(stderr, " -g generate BCF output\n"); - fprintf(stderr, " -u do not compress BCF output\n"); - fprintf(stderr, " -E extended BAQ for higher sensitivity but lower specificity\n"); - fprintf(stderr, " -B disable BAQ computation\n"); - fprintf(stderr, " -D output per-sample DP\n"); - fprintf(stderr, " -S output per-sample SP (strand bias P-value, slow)\n"); - fprintf(stderr, " -I do not perform indel calling\n"); + fprintf(stderr, "Usage: samtools mpileup [options] in1.bam [in2.bam [...]]\n\n"); + fprintf(stderr, "Input options:\n\n"); + fprintf(stderr, " -6 assume the quality is in the Illumina-1.3+ encoding\n"); + fprintf(stderr, " -A count anomalous read pairs\n"); + fprintf(stderr, " -B disable BAQ computation\n"); + fprintf(stderr, " -b FILE list of input BAM files [null]\n"); + fprintf(stderr, " -d INT max per-BAM depth to avoid excessive memory usage [%d]\n", mplp.max_depth); + fprintf(stderr, " -E extended BAQ for higher sensitivity but lower specificity\n"); + fprintf(stderr, " -f FILE faidx indexed reference sequence file [null]\n"); + fprintf(stderr, " -G FILE exclude read groups listed in FILE [null]\n"); + fprintf(stderr, " -l FILE list of positions (chr pos) or regions (BED) [null]\n"); + fprintf(stderr, " -M INT cap mapping quality at INT [%d]\n", mplp.max_mq); + fprintf(stderr, " -r STR region in which pileup is generated [null]\n"); + fprintf(stderr, " -q INT skip alignments with mapQ smaller than INT [%d]\n", mplp.min_mq); + fprintf(stderr, " -Q INT skip bases with baseQ/BAQ smaller than INT [%d]\n", mplp.min_baseQ); + fprintf(stderr, "\nOutput options:\n\n"); + fprintf(stderr, " -D output per-sample DP in BCF (require -g/-u)\n"); + fprintf(stderr, " -g generate BCF output (genotype likelihoods)\n"); + fprintf(stderr, " -S output per-sample strand bias P-value in BCF (require -g/-u)\n"); + fprintf(stderr, " -u generate uncompress BCF output\n"); + fprintf(stderr, "\nSNP/INDEL genotype likelihoods options (effective with `-g' or `-u'):\n\n"); + fprintf(stderr, " -e INT Phred-scaled gap extension seq error probability [%d]\n", mplp.extQ); + fprintf(stderr, " -F FLOAT minimum fraction of gapped reads for candidates [%g]\n", mplp.min_frac); + fprintf(stderr, " -h INT coefficient for homopolymer errors [%d]\n", mplp.tandemQ); + fprintf(stderr, " -I do not perform indel calling\n"); + fprintf(stderr, " -L INT max per-sample depth for INDEL calling [%d]\n", mplp.max_indel_depth); + fprintf(stderr, " -m INT minimum gapped reads for indel candidates [%d]\n", mplp.min_support); + fprintf(stderr, " -o INT Phred-scaled gap open sequencing error probability [%d]\n", mplp.openQ); + fprintf(stderr, " -P STR comma separated list of platforms for indels [all]\n"); fprintf(stderr, "\n"); fprintf(stderr, "Notes: Assuming diploid individuals.\n\n"); return 1; diff --git a/bam_stat.c b/bam_stat.c index ea9deee..f2de0f1 100644 --- a/bam_stat.c +++ b/bam_stat.c @@ -3,31 +3,31 @@ #include "bam.h" typedef struct { - long long n_reads, n_mapped, n_pair_all, n_pair_map, n_pair_good; - long long n_sgltn, n_read1, n_read2; - long long n_qcfail, n_dup; - long long n_diffchr, n_diffhigh; + long long n_reads[2], n_mapped[2], n_pair_all[2], n_pair_map[2], n_pair_good[2]; + long long n_sgltn[2], n_read1[2], n_read2[2]; + long long n_dup[2]; + long long n_diffchr[2], n_diffhigh[2]; } bam_flagstat_t; #define flagstat_loop(s, c) do { \ - ++(s)->n_reads; \ + int w = ((c)->flag & BAM_FQCFAIL)? 1 : 0; \ + ++(s)->n_reads[w]; \ if ((c)->flag & BAM_FPAIRED) { \ - ++(s)->n_pair_all; \ - if ((c)->flag & BAM_FPROPER_PAIR) ++(s)->n_pair_good; \ - if ((c)->flag & BAM_FREAD1) ++(s)->n_read1; \ - if ((c)->flag & BAM_FREAD2) ++(s)->n_read2; \ - if (((c)->flag & BAM_FMUNMAP) && !((c)->flag & BAM_FUNMAP)) ++(s)->n_sgltn; \ + ++(s)->n_pair_all[w]; \ + if ((c)->flag & BAM_FPROPER_PAIR) ++(s)->n_pair_good[w]; \ + if ((c)->flag & BAM_FREAD1) ++(s)->n_read1[w]; \ + if ((c)->flag & BAM_FREAD2) ++(s)->n_read2[w]; \ + if (((c)->flag & BAM_FMUNMAP) && !((c)->flag & BAM_FUNMAP)) ++(s)->n_sgltn[w]; \ if (!((c)->flag & BAM_FUNMAP) && !((c)->flag & BAM_FMUNMAP)) { \ - ++(s)->n_pair_map; \ + ++(s)->n_pair_map[w]; \ if ((c)->mtid != (c)->tid) { \ - ++(s)->n_diffchr; \ - if ((c)->qual >= 5) ++(s)->n_diffhigh; \ + ++(s)->n_diffchr[w]; \ + if ((c)->qual >= 5) ++(s)->n_diffhigh[w]; \ } \ } \ } \ - if (!((c)->flag & BAM_FUNMAP)) ++(s)->n_mapped; \ - if ((c)->flag & BAM_FQCFAIL) ++(s)->n_qcfail; \ - if ((c)->flag & BAM_FDUP) ++(s)->n_dup; \ + if (!((c)->flag & BAM_FUNMAP)) ++(s)->n_mapped[w]; \ + if ((c)->flag & BAM_FDUP) ++(s)->n_dup[w]; \ } while (0) bam_flagstat_t *bam_flagstat_core(bamFile fp) @@ -59,18 +59,17 @@ int bam_flagstat(int argc, char *argv[]) assert(fp); header = bam_header_read(fp); s = bam_flagstat_core(fp); - printf("%lld in total\n", s->n_reads); - printf("%lld QC failure\n", s->n_qcfail); - printf("%lld duplicates\n", s->n_dup); - printf("%lld mapped (%.2f%%)\n", s->n_mapped, (float)s->n_mapped / s->n_reads * 100.0); - printf("%lld paired in sequencing\n", s->n_pair_all); - printf("%lld read1\n", s->n_read1); - printf("%lld read2\n", s->n_read2); - printf("%lld properly paired (%.2f%%)\n", s->n_pair_good, (float)s->n_pair_good / s->n_pair_all * 100.0); - printf("%lld with itself and mate mapped\n", s->n_pair_map); - printf("%lld singletons (%.2f%%)\n", s->n_sgltn, (float)s->n_sgltn / s->n_pair_all * 100.0); - printf("%lld with mate mapped to a different chr\n", s->n_diffchr); - printf("%lld with mate mapped to a different chr (mapQ>=5)\n", s->n_diffhigh); + printf("%lld + %lld in total (QC-passed reads + QC-failed reads)\n", s->n_reads[0], s->n_reads[1]); + printf("%lld + %lld duplicates\n", s->n_dup[0], s->n_dup[1]); + printf("%lld + %lld mapped (%.2f%%:%.2f%%)\n", s->n_mapped[0], s->n_mapped[1], (float)s->n_mapped[0] / s->n_reads[0] * 100.0, (float)s->n_mapped[1] / s->n_reads[1] * 100.0); + printf("%lld + %lld paired in sequencing\n", s->n_pair_all[0], s->n_pair_all[1]); + printf("%lld + %lld read1\n", s->n_read1[0], s->n_read1[1]); + printf("%lld + %lld read2\n", s->n_read2[0], s->n_read2[1]); + printf("%lld + %lld properly paired (%.2f%%:%.2f%%)\n", s->n_pair_good[0], s->n_pair_good[1], (float)s->n_pair_good[0] / s->n_pair_all[0] * 100.0, (float)s->n_pair_good[1] / s->n_pair_all[1] * 100.0); + printf("%lld + %lld with itself and mate mapped\n", s->n_pair_map[0], s->n_pair_map[1]); + printf("%lld + %lld singletons (%.2f%%:%.2f%%)\n", s->n_sgltn[0], s->n_sgltn[1], (float)s->n_sgltn[0] / s->n_pair_all[0] * 100.0, (float)s->n_sgltn[1] / s->n_pair_all[1] * 100.0); + printf("%lld + %lld with mate mapped to a different chr\n", s->n_diffchr[0], s->n_diffchr[1]); + printf("%lld + %lld with mate mapped to a different chr (mapQ>=5)\n", s->n_diffhigh[0], s->n_diffhigh[1]); free(s); bam_header_destroy(header); bam_close(fp); diff --git a/bcftools/call1.c b/bcftools/call1.c index cb33e06..d61d2c4 100644 --- a/bcftools/call1.c +++ b/bcftools/call1.c @@ -332,31 +332,34 @@ int bcfview(int argc, char *argv[]) } if (argc == optind) { fprintf(stderr, "\n"); - fprintf(stderr, "Usage: bcftools view [options] [reg]\n\n"); - fprintf(stderr, "Options: -c SNP calling\n"); - fprintf(stderr, " -v output potential variant sites only (force -c)\n"); - fprintf(stderr, " -g call genotypes at variant sites (force -c)\n"); - fprintf(stderr, " -b output BCF instead of VCF\n"); - fprintf(stderr, " -u uncompressed BCF output (force -b)\n"); - fprintf(stderr, " -S input is VCF\n"); - fprintf(stderr, " -A keep all possible alternate alleles at variant sites\n"); - fprintf(stderr, " -G suppress all individual genotype information\n"); - fprintf(stderr, " -N skip sites where REF is not A/C/G/T\n"); - fprintf(stderr, " -Q output the QCALL likelihood format\n"); - fprintf(stderr, " -L calculate LD for adjacent sites\n"); - fprintf(stderr, " -I skip indels\n"); - fprintf(stderr, " -F PL generated by r921 or before\n"); - fprintf(stderr, " -d FLOAT skip loci where less than FLOAT fraction of samples covered [0]\n"); - fprintf(stderr, " -D FILE sequence dictionary for VCF->BCF conversion [null]\n"); - fprintf(stderr, " -l FILE list of sites to output [all sites]\n"); - fprintf(stderr, " -t FLOAT scaled substitution mutation rate [%.4g]\n", vc.theta); - fprintf(stderr, " -i FLOAT indel-to-substitution ratio [%.4g]\n", vc.indel_frac); - fprintf(stderr, " -p FLOAT variant if P(ref|D) [reg]\n\n"); + fprintf(stderr, "Input/output options:\n\n"); + fprintf(stderr, " -A keep all possible alternate alleles at variant sites\n"); + fprintf(stderr, " -b output BCF instead of VCF\n"); + fprintf(stderr, " -D FILE sequence dictionary for VCF->BCF conversion [null]\n"); + fprintf(stderr, " -F PL generated by r921 or before (which generate old ordering)\n"); + fprintf(stderr, " -G suppress all individual genotype information\n"); + fprintf(stderr, " -l FILE list of sites (chr pos) or regions (BED) to output [all sites]\n"); + fprintf(stderr, " -L calculate LD for adjacent sites\n"); + fprintf(stderr, " -N skip sites where REF is not A/C/G/T\n"); + fprintf(stderr, " -Q output the QCALL likelihood format\n"); + fprintf(stderr, " -s FILE list of samples to use [all samples]\n"); + fprintf(stderr, " -S input is VCF\n"); + fprintf(stderr, " -u uncompressed BCF output (force -b)\n"); + fprintf(stderr, "\nConsensus/variant calling options:\n\n"); + fprintf(stderr, " -c SNP calling\n"); + fprintf(stderr, " -d FLOAT skip loci where less than FLOAT fraction of samples covered [0]\n"); + fprintf(stderr, " -g call genotypes at variant sites (force -c)\n"); + fprintf(stderr, " -i FLOAT indel-to-substitution ratio [%.4g]\n", vc.indel_frac); + fprintf(stderr, " -I skip indels\n"); + fprintf(stderr, " -p FLOAT variant if P(ref|D)