From 72b2e933582d3f3c6d40f2218b2bf931259f33b8 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Tue, 1 Mar 2011 14:51:01 +0000 Subject: [PATCH] * optionally drop phase ambiguous reads * write vcf format 4.1 instead of 4.0 --- ChangeLog | 172 ++++++++++++++++++++++++++++++++++++++++++++++++ NEWS | 84 ++++++++++++++++++++++- bam.h | 2 +- bam2bcf_indel.c | 4 +- bcftools/vcf.c | 4 +- phase.c | 10 ++- samtools.1 | 5 +- 7 files changed, 272 insertions(+), 9 deletions(-) diff --git a/ChangeLog b/ChangeLog index 6324ec8..a471838 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,175 @@ +------------------------------------------------------------------------ +r925 | lh3lh3 | 2011-02-28 15:45:17 -0500 (Mon, 28 Feb 2011) | 2 lines +Changed paths: + M /trunk/samtools/phase.c + +minor changes to a heuristic rule + +------------------------------------------------------------------------ +r924 | lh3lh3 | 2011-02-28 15:24:04 -0500 (Mon, 28 Feb 2011) | 4 lines +Changed paths: + M /trunk/samtools/bam.h + M /trunk/samtools/bcftools/vcfutils.pl + M /trunk/samtools/phase.c + + * 0.1.12-r924:126 + * fixed a bug in phase (due to recent changes) + * fixed a bug in vcf2fq + +------------------------------------------------------------------------ +r923 | lh3lh3 | 2011-02-28 12:57:39 -0500 (Mon, 28 Feb 2011) | 5 lines +Changed paths: + M /trunk/samtools/Makefile + M /trunk/samtools/bam.h + M /trunk/samtools/bam_plcmd.c + M /trunk/samtools/bamtk.c + M /trunk/samtools/phase.c + + * put version number in bam.h + * write version to BCF + * in phase, change the default -q to 37 + * output a little more information during phasing + +------------------------------------------------------------------------ +r922 | lh3lh3 | 2011-02-25 16:40:09 -0500 (Fri, 25 Feb 2011) | 3 lines +Changed paths: + M /trunk/samtools/bam2bcf.c + M /trunk/samtools/bamtk.c + M /trunk/samtools/bcftools/bcf.c + M /trunk/samtools/bcftools/bcf.tex + M /trunk/samtools/bcftools/bcf2qcall.c + M /trunk/samtools/bcftools/bcfutils.c + M /trunk/samtools/bcftools/ld.c + M /trunk/samtools/bcftools/prob1.c + M /trunk/samtools/bcftools/vcf.c + M /trunk/samtools/cut_target.c + + * change the order of PL/GL according to the latest VCF spec + * change the type of SP to int32_t + +------------------------------------------------------------------------ +r921 | lh3lh3 | 2011-02-25 14:40:56 -0500 (Fri, 25 Feb 2011) | 2 lines +Changed paths: + M /trunk/samtools/bcftools/bcf.tex + +update the BCF spec + +------------------------------------------------------------------------ +r920 | lh3lh3 | 2011-02-25 00:59:27 -0500 (Fri, 25 Feb 2011) | 2 lines +Changed paths: + M /trunk/samtools/Makefile + M /trunk/samtools/bam_md.c + M /trunk/samtools/bam_plcmd.c + M /trunk/samtools/bam_sort.c + M /trunk/samtools/bamtk.c + A /trunk/samtools/cut_target.c + M /trunk/samtools/errmod.h + M /trunk/samtools/faidx.c + M /trunk/samtools/khash.h + M /trunk/samtools/kstring.c + M /trunk/samtools/kstring.h + A /trunk/samtools/phase.c + M /trunk/samtools/samtools.1 + +added the phase command + +------------------------------------------------------------------------ +r918 | lh3lh3 | 2011-02-24 10:05:54 -0500 (Thu, 24 Feb 2011) | 2 lines +Changed paths: + M /trunk/samtools/bcftools/prob1.c + M /trunk/samtools/bcftools/prob1.h + +added "const" to bcf_p1_cal() + +------------------------------------------------------------------------ +r917 | lh3lh3 | 2011-02-24 09:36:30 -0500 (Thu, 24 Feb 2011) | 2 lines +Changed paths: + M /trunk/samtools/bam.c + +more meaningful BAM truncation message + +------------------------------------------------------------------------ +r916 | lh3lh3 | 2011-02-24 09:35:06 -0500 (Thu, 24 Feb 2011) | 3 lines +Changed paths: + M /trunk/samtools/bcftools/bcf.c + M /trunk/samtools/bcftools/vcf.c + + * automatically fix errors in GL + * output unrecognized FORMAT as "." + +------------------------------------------------------------------------ +r913 | lh3lh3 | 2011-02-10 22:59:47 -0500 (Thu, 10 Feb 2011) | 2 lines +Changed paths: + M /trunk/samtools/bcftools/bcf.h + M /trunk/samtools/bcftools/call1.c + M /trunk/samtools/bcftools/vcf.c + +finished VCF->BCF conversion + +------------------------------------------------------------------------ +r910 | petulda | 2011-02-03 03:13:48 -0500 (Thu, 03 Feb 2011) | 1 line +Changed paths: + M /trunk/samtools/bcftools/vcfutils.pl + +Prevent division by zero +------------------------------------------------------------------------ +r909 | lh3lh3 | 2011-02-02 11:29:20 -0500 (Wed, 02 Feb 2011) | 2 lines +Changed paths: + M /trunk/samtools/bcftools/call1.c + +fixed a typo in the VCF header + +------------------------------------------------------------------------ +r908 | lh3lh3 | 2011-02-02 11:28:24 -0500 (Wed, 02 Feb 2011) | 3 lines +Changed paths: + M /trunk/samtools/bam2bcf.c + M /trunk/samtools/bam_index.c + + * fixed an out-of-boundary bug + * improved sorting order checking in index + +------------------------------------------------------------------------ +r907 | lh3lh3 | 2011-01-29 22:59:20 -0500 (Sat, 29 Jan 2011) | 4 lines +Changed paths: + M /trunk/samtools/INSTALL + M /trunk/samtools/bam_tview.c + M /trunk/samtools/knetfile.c + + * avoid a segfault when network connect fails + * update INSTALL + * fixed a bug in tview on big-endian by Nathan Weeks + +------------------------------------------------------------------------ +r903 | lh3lh3 | 2011-01-27 14:50:02 -0500 (Thu, 27 Jan 2011) | 3 lines +Changed paths: + M /trunk/samtools/bam2bcf_indel.c + M /trunk/samtools/bam_md.c + + * fixed a rare memory issue in bam_md.c + * fixed a bug in indel calling related to unmapped and refskip reads + +------------------------------------------------------------------------ +r902 | lh3lh3 | 2011-01-23 21:46:18 -0500 (Sun, 23 Jan 2011) | 2 lines +Changed paths: + M /trunk/samtools/bcftools/fet.c + +fixed two minor bugs in Fisher's exact test + +------------------------------------------------------------------------ +r899 | petulda | 2011-01-19 09:28:02 -0500 (Wed, 19 Jan 2011) | 1 line +Changed paths: + M /trunk/samtools/bcftools/vcfutils.pl + +Skip sites with unknown ref +------------------------------------------------------------------------ +r898 | lh3lh3 | 2011-01-15 12:56:05 -0500 (Sat, 15 Jan 2011) | 2 lines +Changed paths: + M /trunk/samtools/ChangeLog + M /trunk/samtools/bam_maqcns.c + M /trunk/samtools/bam_md.c + +move bam_nt16_nt4_table[] from bam_maqcns.c to bam_md.c + ------------------------------------------------------------------------ r896 | lh3lh3 | 2011-01-06 10:52:15 -0500 (Thu, 06 Jan 2011) | 3 lines Changed paths: diff --git a/NEWS b/NEWS index 6b4d8aa..2965046 100644 --- a/NEWS +++ b/NEWS @@ -1,3 +1,85 @@ +Beta release 0.1.13 (28 February, 2011) +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +The most important though largely invisible modification is the change of the +order of genotypes in the PL VCF/BCF tag. This is to conform the upcoming VCF +spec v4.1. The change means that 0.1.13 is not backward compatible with VCF/BCF +generated by samtools older than r921 inclusive. VCF/BCF generated by the new +samtools will contain a line `##fileformat=VCFv4.1' as well as the samtools +version number. + +Single Individual Haplotyping (SIH) is added as an experimental feature. It +originally aims to produce haploid consensus from fosmid pool sequencing, but +also works with short-read data. For short reads, phased blocks are usually too +short to be useful in most applications, but phasing can be powerful for ruling +out some SNPs close to INDELs and some of clustered spurious SNPs between copies +of CNVs. On one data set, phasing based SNP calling is better in terms of both +sensitivity and specificity than the standard methods. + + +Other notable changes in samtools: + + * Construct per-sample consensus to reduced the effect of nearby SNPs in INDEL + calling. This may reduce the power, but improves specificity. + + * Improved sorting order checking in indexing. Now indexing is the preferred way + to check if a BAM is sorted. + + * Added a switch `-E' to mpileup and calmd. This option uses an alternative way + to apply BAQ, which increases sensistivity, especially to MNPs, at the cost of + a little loss in specificity. + + * Added `mpileup -A' to allow to use reads in anomalous pairs in SNP calling. + + * Added `mpileup -m' to allow fine control of INDEL candidates. + + * Added `mpileup -S' to compute per-sample strand bias P-value. + + * Added `mpileup -G' to exclude read groups in variant calling. + + * Fixed segfault in indel calling related to unmapped and refskip reads. + + * Fixed an integer overflow in INDEL calling. This bug produces wrong INDEL + genotypes for longer INDELs, typically over 10bp. + + * Fixed a bug in tview on big-endian machines. + + * Fixed a very rare memory issue in bam_md.c + + * Fixed an out-of-boundary bug in mpileup when the read base is `N'. + + * Fixed a compiling error when the knetfile library is not used. Fixed a + library compiling error due to the lack of bam_nt16_nt4_table[] table. + + +Other notable changes in bcftools: + + * Updated the BCF spec. + + * Added the `FQ' VCF INFO field, which gives the phred-scaled probability + of all samples being the samei (identical to the reference or all homozygous + variants). Option `view -f' has been dropped. + + * Implementated of "vcfutils.pl vcf2fq" to generate a consensus sequence + similar to "samtools.pl pileup2fq". + + * Make sure the GT FORMAT field is always the first FORMAT to conform the VCF + spec. Drop bcf-fix.pl. + + * Output bcftools specific INFO and FORMAT in the VCF header. + + * Added `view -s' to call variants from a subset of samples. + + * Properly convert VCF to BCF with a user provided sequence dictionary. Nonetheless, + custom fields are still unparsed and will be stored as a missing value. + + * Fixed a minor bug in Fisher's exact test; the results are rarely changed. + + +(0.1.13: 28 February 2011, r926+130) + + + Beta release 0.1.12a (2 December, 2010) ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -532,4 +614,4 @@ Beta Release 0.1.1 (22 December, 2008) The is the first public release of samtools. For more information, please check the manual page `samtools.1' and the samtools website -http://samtools.sourceforge.net \ No newline at end of file +http://samtools.sourceforge.net diff --git a/bam.h b/bam.h index 41f8d95..0ab9c21 100644 --- a/bam.h +++ b/bam.h @@ -40,7 +40,7 @@ @copyright Genome Research Ltd. */ -#define BAM_VERSION "0.1.12-r924:126" +#define BAM_VERSION "0.1.12-r925+130" #include #include diff --git a/bam2bcf_indel.c b/bam2bcf_indel.c index c157bc6..899428f 100644 --- a/bam2bcf_indel.c +++ b/bam2bcf_indel.c @@ -420,9 +420,11 @@ int bcf_call_gap_prep(int n, int *n_plp, bam_pileup1_t **plp, int pos, bcf_calla indelQ2 = tmp > 111? 0 : (int)((1. - tmp/111.) * indelQ2 + .499); // pick the smaller between indelQ1 and indelQ2 indelQ = indelQ1 < indelQ2? indelQ1 : indelQ2; + if (indelQ > 255) indelQ = 255; + if (seqQ > 255) seqQ = 255; p->aux = (sc[0]&0x3f)<<16 | seqQ<<8 | indelQ; // use 22 bits in total sumq[sc[0]&0x3f] += indelQ < seqQ? indelQ : seqQ; -// fprintf(stderr, "pos=%d read=%d:%d name=%s call=%d q=%d\n", pos, s, i, bam1_qname(p->b), types[sc[0]&0x3f], indelQ); +// fprintf(stderr, "pos=%d read=%d:%d name=%s call=%d indelQ=%d seqQ=%d\n", pos, s, i, bam1_qname(p->b), types[sc[0]&0x3f], indelQ, seqQ); } } // determine bca->indel_types[] and bca->inscns diff --git a/bcftools/vcf.c b/bcftools/vcf.c index 81a38ea..9daa845 100644 --- a/bcftools/vcf.c +++ b/bcftools/vcf.c @@ -124,10 +124,10 @@ int vcf_hdr_write(bcf_t *bp, const bcf_hdr_t *h) if (!bp->is_vcf) return bcf_hdr_write(bp, h); if (h->l_txt > 0) { if (strstr(h->txt, "##fileformat=")) has_ver = 1; - if (has_ver == 0) fprintf(v->fpout, "##fileformat=VCFv4.0\n"); + if (has_ver == 0) fprintf(v->fpout, "##fileformat=VCFv4.1\n"); fwrite(h->txt, 1, h->l_txt - 1, v->fpout); } - if (h->l_txt == 0) fprintf(v->fpout, "##fileformat=VCFv4.0\n"); + if (h->l_txt == 0) fprintf(v->fpout, "##fileformat=VCFv4.1\n"); fprintf(v->fpout, "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT"); for (i = 0; i < h->n_smpl; ++i) fprintf(v->fpout, "\t%s", h->sns[i]); diff --git a/phase.c b/phase.c index dfe3eea..ef4eff9 100644 --- a/phase.c +++ b/phase.c @@ -17,6 +17,7 @@ KSTREAM_INIT(gzFile, gzread, 16384) #define FLAG_FIX_CHIMERA 0x1 #define FLAG_LIST_EXCL 0x4 +#define FLAG_DROP_AMBI 0x8 typedef struct { // configurations, initialized in the main function @@ -308,7 +309,8 @@ static int clean_seqs(int vpos, nseq_t *hash) static void dump_aln(phaseg_t *g, int min_pos, const nseq_t *hash) { - int i, is_flip; + int i, is_flip, drop_ambi; + drop_ambi = g->flag & FLAG_DROP_AMBI; is_flip = (drand48() < 0.5); for (i = 0; i < g->n; ++i) { int end, which; @@ -322,7 +324,7 @@ static void dump_aln(phaseg_t *g, int min_pos, const nseq_t *hash) if (k == kh_end(hash)) which = 3; else { frag_t *f = &kh_val(hash, k); - if (f->ambig) which = 2; + if (f->ambig) which = drop_ambi? 2 : 3; else if (f->phased && f->flip) which = 2; else if (f->phased == 0) which = 3; else { // phased and not flipped @@ -521,7 +523,7 @@ int main_phase(int argc, char *argv[]) memset(&g, 0, sizeof(phaseg_t)); g.flag = FLAG_FIX_CHIMERA; g.min_varLOD = 37; g.k = 13; g.min_baseQ = 13; g.max_depth = 256; - while ((c = getopt(argc, argv, "Q:eFq:k:b:l:D:")) >= 0) { + while ((c = getopt(argc, argv, "Q:eFq:k:b:l:D:A:")) >= 0) { switch (c) { case 'D': g.max_depth = atoi(optarg); break; case 'q': g.min_varLOD = atoi(optarg); break; @@ -529,6 +531,7 @@ int main_phase(int argc, char *argv[]) case 'k': g.k = atoi(optarg); break; case 'F': g.flag &= ~FLAG_FIX_CHIMERA; break; case 'e': g.flag |= FLAG_LIST_EXCL; break; + case 'A': g.flag |= FLAG_DROP_AMBI; break; case 'b': g.pre = strdup(optarg); break; case 'l': fn_list = strdup(optarg); break; } @@ -543,6 +546,7 @@ int main_phase(int argc, char *argv[]) fprintf(stderr, " -D INT max read depth [%d]\n", g.max_depth); // fprintf(stderr, " -l FILE list of sites to phase [null]\n"); fprintf(stderr, " -F do not attempt to fix chimeras\n"); + fprintf(stderr, " -A drop reads with ambiguous phase\n"); // fprintf(stderr, " -e do not discover SNPs (effective with -l)\n"); fprintf(stderr, "\n"); return 1; diff --git a/samtools.1 b/samtools.1 index 8d299b7..f4a54f5 100644 --- a/samtools.1 +++ b/samtools.1 @@ -414,12 +414,15 @@ designed for cutting fosmid clones from fosmid pool sequencing [Ref. Kitzman et .TP .B phase -samtools phase [-F] [-k len] [-b prefix] [-q minLOD] [-Q minBaseQ] +samtools phase [-AF] [-k len] [-b prefix] [-q minLOD] [-Q minBaseQ] Call and phase heterozygous SNPs. .B OPTIONS: .RS .TP 8 +.B -A +Drop reads with ambiguous phase. +.TP 8 .BI -b \ STR Prefix of BAM output. When this option is in use, phase-0 reads will be saved in file .BR STR .0.bam -- 2.39.2