Merge remote branch 'remotes/master/master' into fb-annots

author Petr Danecek <pd3@sanger.ac.uk>

Tue, 12 Mar 2013 13:25:17 +0000 (13:25 +0000)

committer Petr Danecek <pd3@sanger.ac.uk>

Tue, 12 Mar 2013 13:25:17 +0000 (13:25 +0000)
author Petr Danecek <pd3@sanger.ac.uk>
Tue, 12 Mar 2013 13:25:17 +0000 (13:25 +0000)
committer Petr Danecek <pd3@sanger.ac.uk>
Tue, 12 Mar 2013 13:25:17 +0000 (13:25 +0000)
diff --combined bam_plcmd.c

index e45c8c8c81649630449f78d653cbd037e2046408,8935c5a2f52437057596f6e35ab74dc85a2a569c..9c4f273dcbfe2c3576e9c6ea0e2d7661839bbb85
--- 1/bam_plcmd.c
--- 2/bam_plcmd.c
+++ b/bam_plcmd.c
@@@ -4,6 -4,8 +4,8 @@@
   #include <ctype.h>
   #include <string.h>
   #include <errno.h>
+ #include <sys/stat.h>
+ #include <getopt.h>
   #include "sam.h"
   #include "faidx.h"
   #include "kstring.h"
@@@ -80,6 -82,7 +82,7 @@@ int bed_overlap(const void *_h, const c
   
   typedef struct {
         int max_mq, min_mq, flag, min_baseQ, capQ_thres, max_depth, max_indel_depth, fmt_flag;
+     int rflag_require, rflag_filter;
         int openQ, extQ, tandemQ, min_support; // for indels
         double min_frac; // for indels
         char *reg, *pl_list;
@@@ -117,6 -120,8 +120,8 @@@ static int mplp_func(void *data, bam1_
                         skip = 1;
                         continue;
                 }
+         if (ma->conf->rflag_require && !(ma->conf->rflag_require&b->core.flag)) { skip = 1; continue; }
+         if (ma->conf->rflag_filter && ma->conf->rflag_filter&b->core.flag) { skip = 1; continue; }
                 if (ma->conf->bed) { // test overlap
                         skip = !bed_overlap(ma->conf->bed, ma->h->target_name[b->core.tid], b->core.pos, bam_calend(&b->core, bam1_cigar(b)));
                         if (skip) continue;
@@@ -215,10 -220,6 +220,10 @@@ static int mpileup(mplp_conf_t *conf, i
           }
                 data[i]->conf = conf;
                 h_tmp = bam_header_read(data[i]->fp);
+ +        if ( !h_tmp ) {
+ +            fprintf(stderr,"[%s] fail to read the header of %s\n", __func__, fn[i]);
+ +            exit(1);
+ +        }
                 data[i]->h = i? h : h_tmp; // for i==0, "h" has not been set yet
                 bam_smpl_add(sm, fn[i], (conf->flag&MPLP_IGNORE_RG)? 0 : h_tmp->text);
                 rghash = bcf_call_add_rg(rghash, h_tmp->text, conf->pl_list);
@@@ -316,7 -317,7 +321,7 @@@
                         ref16 = bam_nt16_table[_ref0];
                         for (i = 0; i < gplp.n; ++i)
                                 bcf_call_glfgen(gplp.n_plp[i], gplp.plp[i], ref16, bca, bcr + i);
- -                      bcf_call_combine(gplp.n, bcr, ref16, &bc);
+ +                      bcf_call_combine(gplp.n, bcr, bca, ref16, &bc);
                         bcf_call2bcf(tid, pos, &bc, b, bcr, conf->fmt_flag, 0, 0);
                         bcf_write(bp, bh, b);
                         bcf_destroy(b);
@@@ -324,7 -325,7 +329,7 @@@
                         if (!(conf->flag&MPLP_NO_INDEL) && total_depth < max_indel_depth && bcf_call_gap_prep(gplp.n, gplp.n_plp, gplp.plp, pos, bca, ref, rghash) >= 0) {
                                 for (i = 0; i < gplp.n; ++i)
                                         bcf_call_glfgen(gplp.n_plp[i], gplp.plp[i], -1, bca, bcr + i);
- -                              if (bcf_call_combine(gplp.n, bcr, -1, &bc) >= 0) {
+ +                              if (bcf_call_combine(gplp.n, bcr, bca, -1, &bc) >= 0) {
                                         b = calloc(1, sizeof(bcf1_t));
                                         bcf_call2bcf(tid, pos, &bc, b, bcr, conf->fmt_flag, bca, ref);
                                         bcf_write(bp, bh, b);
@@@ -397,11 -398,15 +402,15 @@@
   }
   
   #define MAX_PATH_LEN 1024
- static int read_file_list(const char *file_list,int *n,char **argv[])
+ int read_file_list(const char *file_list,int *n,char **argv[])
   {
       char buf[MAX_PATH_LEN];
-     int len, nfiles;
-     char **files;
+     int len, nfiles = 0;
+     char **files = NULL;
+     struct stat sb;
+ 
+     *n = 0;
+     *argv = NULL;
   
       FILE *fh = fopen(file_list,"r");
       if ( !fh )
@@@ -410,28 -415,33 +419,33 @@@
           return 1;
       }
   
-     // Speed is not an issue here, determine the number of files by reading the file twice
-     nfiles = 0;
-     while ( fgets(buf,MAX_PATH_LEN,fh) ) nfiles++;
- 
-     if ( fseek(fh, 0L, SEEK_SET) )
-     {
-         fprintf(stderr,"%s: %s\n", file_list,strerror(errno));
-         return 1;
-     }
- 
       files = calloc(nfiles,sizeof(char*));
       nfiles = 0;
       while ( fgets(buf,MAX_PATH_LEN,fh) ) 
       {
+         // allow empty lines and trailing spaces
           len = strlen(buf);
           while ( len>0 && isspace(buf[len-1]) ) len--;
           if ( !len ) continue;
   
-         files[nfiles] = malloc(sizeof(char)*(len+1)); 
-         strncpy(files[nfiles],buf,len);
-         files[nfiles][len] = 0;
+         // check sanity of the file list
+         buf[len] = 0;
+         if (stat(buf, &sb) != 0)
+         {
+             // no such file, check if it is safe to print its name
+             int i, safe_to_print = 1;
+             for (i=0; i<len; i++)
+                 if (!isprint(buf[i])) { safe_to_print = 0; break; } 
+             if ( safe_to_print )
+                 fprintf(stderr,"The file list \"%s\" appears broken, could not locate: %s\n", file_list,buf);
+             else
+                 fprintf(stderr,"Does the file \"%s\" really contain a list of files and do all exist?\n", file_list);
+             return 1;
+         }
+ 
           nfiles++;
+         files = realloc(files,nfiles*sizeof(char*));
+         files[nfiles-1] = strdup(buf);
       }
       fclose(fh);
       if ( !nfiles )
@@@ -460,8 -470,16 +474,16 @@@ int bam_mpileup(int argc, char *argv[]
         mplp.openQ = 40; mplp.extQ = 20; mplp.tandemQ = 100;
         mplp.min_frac = 0.002; mplp.min_support = 1;
         mplp.flag = MPLP_NO_ORPHAN | MPLP_REALN;
-       while ((c = getopt(argc, argv, "Agf:r:l:M:q:Q:uaRC:BDSd:L:b:P:po:e:h:Im:F:EG:6OsV")) >= 0) {
+     static struct option lopts[] = 
+     {
+         {"rf",1,0,1},   // require flag
+         {"ff",1,0,2},   // filter flag
+         {0,0,0,0}
+     };
+       while ((c = getopt_long(argc, argv, "Agf:r:l:M:q:Q:uaRC:BDSd:L:b:P:po:e:h:Im:F:EG:6OsV1:2:",lopts,NULL)) >= 0) {
                 switch (c) {
+         case  1 : mplp.rflag_require = strtol(optarg,0,0); break;
+         case  2 : mplp.rflag_filter  = strtol(optarg,0,0); break;
                 case 'f':
                         mplp.fai = fai_load(optarg);
                         if (mplp.fai == 0) return 1;
@@@ -517,7 -535,7 +539,7 @@@
                 fprintf(stderr, "       -6           assume the quality is in the Illumina-1.3+ encoding\n");
                 fprintf(stderr, "       -A           count anomalous read pairs\n");
                 fprintf(stderr, "       -B           disable BAQ computation\n");
-               fprintf(stderr, "       -b FILE      list of input BAM files [null]\n");
+               fprintf(stderr, "       -b FILE      list of input BAM filenames, one per line [null]\n");
                 fprintf(stderr, "       -C INT       parameter for adjusting mapQ; 0 to disable [0]\n");
                 fprintf(stderr, "       -d INT       max per-BAM depth to avoid excessive memory usage [%d]\n", mplp.max_depth);
                 fprintf(stderr, "       -E           recalculate extended BAQ on the fly thus ignoring existing BQs\n");
@@@ -529,6 -547,8 +551,8 @@@
                 fprintf(stderr, "       -R           ignore RG tags\n");
                 fprintf(stderr, "       -q INT       skip alignments with mapQ smaller than INT [%d]\n", mplp.min_mq);
                 fprintf(stderr, "       -Q INT       skip bases with baseQ/BAQ smaller than INT [%d]\n", mplp.min_baseQ);
+               fprintf(stderr, "       --rf INT     required flags: skip reads with mask bits unset []\n");
+               fprintf(stderr, "       --ff INT     filter flags: skip reads with mask bits set []\n");
                 fprintf(stderr, "\nOutput options:\n\n");
                 fprintf(stderr, "       -D           output per-sample DP in BCF (require -g/-u)\n");
                 fprintf(stderr, "       -g           generate BCF output (genotype likelihoods)\n");
diff --combined bcftools/call1.c

index eb584987d4c4a23b1c8fcab33332e701536dd149,bc1cd407e7967290051b4ffbf4c73938468a43dd..18805a03e7b76ab8db3040cc6e9d87a5e16dadaa
--- 1/bcftools/call1.c
--- 2/bcftools/call1.c
+++ b/bcftools/call1.c
@@@ -190,27 -190,7 +190,27 @@@ static char **read_samples(const char *
         *_n = 0;
         s.l = s.m = 0; s.s = 0;
         fp = gzopen(fn, "r");
- -      if (fp == 0) return 0; // fail to open file
+ +      if (fp == 0) 
+ +    {
+ +        // interpret as sample names, not as a file name
+ +        const char *t = fn, *p = t;
+ +        while (*t)
+ +        {
+ +            t++;
+ +            if ( *t==',' || !*t )
+ +            { 
+ +                sam = realloc(sam, sizeof(void*)*(n+1));
+ +                sam[n] = (char*) malloc(sizeof(char)*(t-p+2));
+ +                memcpy(sam[n], p, t-p);
+ +                sam[n][t-p]   = 0;
+ +                sam[n][t-p+1] = 2;    // assume diploid
+ +                p = t+1;
+ +                n++; 
+ +            }
+ +        }
+ +        *_n = n;
+ +        return sam; // fail to open file
+ +    }
         ks = ks_init(fp);
         while (ks_getuntil(ks, 0, &s, &dret) >= 0) {
                 int l;
@@@ -286,18 -266,8 +286,18 @@@ static void write_header(bcf_hdr_t *h
           kputs("##INFO=<ID=QCHI2,Number=1,Type=Integer,Description=\"Phred scaled PCHI2.\">\n", &str);
       if (!strstr(str.s, "##INFO=<ID=RP,"))
           kputs("##INFO=<ID=PR,Number=1,Type=Integer,Description=\"# permutations yielding a smaller PCHI2.\">\n", &str);
+ +    if (!strstr(str.s, "##INFO=<ID=QBD,"))
+ +        kputs("##INFO=<ID=QBD,Number=1,Type=Float,Description=\"Quality by Depth: QUAL/#reads\">\n", &str);
+ +    if (!strstr(str.s, "##INFO=<ID=QBDNR,"))
+ +        kputs("##INFO=<ID=QBDNR,Number=1,Type=Float,Description=\"Quality by Depth: QUAL/#nref-reads\">\n", &str);
+ +    if (!strstr(str.s, "##INFO=<ID=RPS,"))
+ +        kputs("##INFO=<ID=RPS,Number=3,Type=Float,Description=\"Read Position Stats: depth, average, stddev\">\n", &str);
+ +    if (!strstr(str.s, "##INFO=<ID=RPB,"))
+ +        kputs("##INFO=<ID=RPB,Number=1,Type=Float,Description=\"Read Position Bias\">\n", &str);
+ +    if (!strstr(str.s, "##INFO=<ID=MDV,"))
+ +        kputs("##INFO=<ID=MDV,Number=1,Type=Integer,Description=\"Maximum number of high-quality nonRef reads in samples\">\n", &str);
       if (!strstr(str.s, "##INFO=<ID=VDB,"))
- -        kputs("##INFO=<ID=VDB,Number=1,Type=Float,Description=\"Variant Distance Bias\">\n", &str);
+ +        kputs("##INFO=<ID=VDB,Number=1,Type=Float,Description=\"Variant Distance Bias (v2) for filtering splice-site artefacts in RNA-seq data. Note: this version may be broken.\">\n", &str);
       if (!strstr(str.s, "##FORMAT=<ID=GT,"))
           kputs("##FORMAT=<ID=GT,Number=1,Type=String,Description=\"Genotype\">\n", &str);
       if (!strstr(str.s, "##FORMAT=<ID=GQ,"))
@@@ -328,6 -298,9 +328,9 @@@ int bcfview(int argc, char *argv[]
         extern int bcf_trio_call(uint32_t *prep, const bcf1_t *b, int *llr, int64_t *gt);
         extern int bcf_pair_call(const bcf1_t *b);
         extern int bcf_min_diff(const bcf1_t *b);
+       extern int bcf_p1_get_M(bcf_p1aux_t *b);
+ 
+       extern gzFile bcf_p1_fp_lk;
   
         bcf_t *bp, *bout = 0;
         bcf1_t *b, *blast;
@@@ -343,10 -316,10 +346,10 @@@
         memset(&vc, 0, sizeof(viewconf_t));
         vc.prior_type = vc.n1 = -1; vc.theta = 1e-3; vc.pref = 0.5; vc.indel_frac = -1.; vc.n_perm = 0; vc.min_perm_p = 0.01; vc.min_smpl_frac = 0; vc.min_lrt = 1; vc.min_ma_lrt = -1;
         memset(qcnt, 0, 8 * 256);
-       while ((c = getopt(argc, argv, "FN1:l:cC:eHAGvbSuP:t:p:QgLi:IMs:D:U:X:d:T:Ywm:")) >= 0) {
+       while ((c = getopt(argc, argv, "FN1:l:cC:eHAGvbSuP:t:p:QgLi:IMs:D:U:X:d:T:Ywm:K:")) >= 0) {
                 switch (c) {
                 case '1': vc.n1 = atoi(optarg); break;
-               case 'l': vc.bed = bed_read(optarg); break;
+               case 'l': vc.bed = bed_read(optarg); if (!vc.bed) fprintf(stderr,"Could not read \"%s\"\n", optarg); return 1; break;
                 case 'D': vc.fn_dict = strdup(optarg); break;
                 case 'F': vc.flag |= VC_FIX_PL; break;
                 case 'N': vc.flag |= VC_ACGT_ONLY; break;
@@@ -373,6 -346,7 +376,7 @@@
                 case 'C': vc.min_lrt = atof(optarg); break;
                 case 'X': vc.min_perm_p = atof(optarg); break;
                 case 'd': vc.min_smpl_frac = atof(optarg); break;
+               case 'K': bcf_p1_fp_lk = gzopen(optarg, "w"); break;
                 case 's': vc.subsam = read_samples(optarg, &vc.n_sub);
                         vc.ploidy = calloc(vc.n_sub + 1, 1);
                         for (tid = 0; tid < vc.n_sub; ++tid) vc.ploidy[tid] = vc.subsam[tid][strlen(vc.subsam[tid]) + 1];
@@@ -462,7 -436,7 +466,7 @@@
                         vc.sublist = calloc(vc.n_sub, sizeof(int));
                         hout = bcf_hdr_subsam(hin, vc.n_sub, vc.subsam, vc.sublist);
                 }
-               if (vc.flag & VC_CALL) write_header(hout);
+               write_header(hout); // always print the header
                 vcf_hdr_write(bout, hout);
         }
         if (vc.flag & VC_CALL) {
@@@ -496,6 -470,10 +500,10 @@@
                         }
                 }
         }
+       if (bcf_p1_fp_lk && p1) {
+               int32_t M = bcf_p1_get_M(p1);
+               gzwrite(bcf_p1_fp_lk, &M, 4);
+       }
         while (vcf_read(bp, hin, b) > 0) {
                 int is_indel, cons_llr = -1;
                 int64_t cons_gt = -1;
@@@ -544,15 -522,19 +552,19 @@@
                         int i;
                         for (i = 0; i < 9; ++i) em[i] = -1.;
                 }
-         if ( !(vc.flag&VC_KEEPALT) && vc.flag&VC_CALL && vc.min_ma_lrt>=0 )
+         if ( !(vc.flag&VC_KEEPALT) && (vc.flag&VC_CALL) && vc.min_ma_lrt>=0 )
           {
               bcf_p1_set_ploidy(b, p1); // could be improved: do this per site to allow pseudo-autosomal regions
- -            int gts = call_multiallelic_gt(b,p1,vc.min_ma_lrt);
+ +            int gts = call_multiallelic_gt(b, p1, vc.min_ma_lrt, vc.flag&VC_VARONLY);
               if ( gts<=1 && vc.flag & VC_VARONLY ) continue;
           }
                 else if (vc.flag & VC_CALL) { // call variants
                         bcf_p1rst_t pr;
-                       int calret = bcf_p1_cal(b, (em[7] >= 0 && em[7] < vc.min_lrt), p1, &pr);
+                       int calret;
+                       gzwrite(bcf_p1_fp_lk, &b->tid, 4);
+                       gzwrite(bcf_p1_fp_lk, &b->pos, 4);
+                       gzwrite(bcf_p1_fp_lk, &em[0], sizeof(double));
+                       calret = bcf_p1_cal(b, (em[7] >= 0 && em[7] < vc.min_lrt), p1, &pr);
                         if (n_processed % 100000 == 0) {
                                 fprintf(stderr, "[%s] %ld sites processed.\n", __func__, (long)n_processed);
                                 bcf_p1_dump_afs(p1);
@@@ -597,6 -579,8 +609,8 @@@
                 } else bcf_fix_gt(b);
                 vcf_write(bout, hout, b);
         }
+ 
+       if (bcf_p1_fp_lk) gzclose(bcf_p1_fp_lk);
         if (vc.prior_file) free(vc.prior_file);
         if (vc.flag & VC_CALL) bcf_p1_dump_afs(p1);
         if (hin != hout) bcf_hdr_destroy(hout);
diff --combined bcftools/prob1.c

index d6557223e729eddbf0b2f70bd561c2d3d326143c,ffa608e29d6408c03a858f3db3daba162a1e9ed2..f04cf085f0a499e29bf4470356acf5de90a6b396
--- 1/bcftools/prob1.c
--- 2/bcftools/prob1.c
+++ b/bcftools/prob1.c
@@@ -5,6 -5,7 +5,7 @@@
   #include <errno.h>
   #include <assert.h>
   #include <limits.h>
+ #include <zlib.h>
   #include "prob1.h"
   #include "kstring.h"
   
@@@ -15,6 -16,8 +16,8 @@@ KSTREAM_INIT(gzFile, gzread, 16384
   #define MC_EM_EPS 1e-5
   #define MC_DEF_INDEL 0.15
   
+ gzFile bcf_p1_fp_lk;
+ 
   unsigned char seq_nt4_table[256] = {
         4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4, 
         4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4, 
@@@ -165,6 -168,8 +168,8 @@@ bcf_p1aux_t *bcf_p1_init(int n, uint8_
         return ma;
   }
   
+ int bcf_p1_get_M(bcf_p1aux_t *b) { return b->M; }
+ 
   int bcf_p1_set_n1(bcf_p1aux_t *b, int n1)
   {
         if (n1 == 0 || n1 >= b->n) return -1;
@@@ -203,124 -208,7 +208,124 @@@ void bcf_p1_destroy(bcf_p1aux_t *ma
   extern double kf_gammap(double s, double z);
   int test16(bcf1_t *b, anno16_t *a);
   
- -int call_multiallelic_gt(bcf1_t *b, bcf_p1aux_t *ma, double threshold)
+ +// Wigginton 2005, PMID: 15789306
+ +// written by Jan Wigginton
+ +double calc_hwe(int obs_hom1, int obs_hom2, int obs_hets)
+ +{
+ +    if (obs_hom1 + obs_hom2 + obs_hets == 0 ) return 1;
+ +
+ +    assert(obs_hom1 >= 0 && obs_hom2 >= 0 && obs_hets >= 0);
+ +
+ +    int obs_homc = obs_hom1 < obs_hom2 ? obs_hom2 : obs_hom1;
+ +    int obs_homr = obs_hom1 < obs_hom2 ? obs_hom1 : obs_hom2;
+ +
+ +    int rare_copies = 2 * obs_homr + obs_hets;
+ +    int genotypes   = obs_hets + obs_homc + obs_homr;
+ +
+ +    double *het_probs = (double*) calloc(rare_copies+1, sizeof(double));
+ +
+ +    /* start at midpoint */
+ +    int mid = rare_copies * (2 * genotypes - rare_copies) / (2 * genotypes);
+ +
+ +    /* check to ensure that midpoint and rare alleles have same parity */
+ +    if ((rare_copies & 1) ^ (mid & 1)) mid++;
+ +
+ +    int curr_hets = mid;
+ +    int curr_homr = (rare_copies - mid) / 2;
+ +    int curr_homc = genotypes - curr_hets - curr_homr;
+ +
+ +    het_probs[mid] = 1.0;
+ +    double sum = het_probs[mid];
+ +    for (curr_hets = mid; curr_hets > 1; curr_hets -= 2)
+ +    {
+ +        het_probs[curr_hets - 2] = het_probs[curr_hets] * curr_hets * (curr_hets - 1.0) / (4.0 * (curr_homr + 1.0) * (curr_homc + 1.0));
+ +        sum += het_probs[curr_hets - 2];
+ +
+ +        /* 2 fewer heterozygotes for next iteration -> add one rare, one common homozygote */
+ +        curr_homr++;
+ +        curr_homc++;
+ +    }
+ +
+ +    curr_hets = mid;
+ +    curr_homr = (rare_copies - mid) / 2;
+ +    curr_homc = genotypes - curr_hets - curr_homr;
+ +    for (curr_hets = mid; curr_hets <= rare_copies - 2; curr_hets += 2)
+ +    {
+ +        het_probs[curr_hets + 2] = het_probs[curr_hets] * 4.0 * curr_homr * curr_homc /((curr_hets + 2.0) * (curr_hets + 1.0));
+ +        sum += het_probs[curr_hets + 2];
+ +
+ +        /* add 2 heterozygotes for next iteration -> subtract one rare, one common homozygote */
+ +        curr_homr--;
+ +        curr_homc--;
+ +    }
+ +    int i;
+ +    for (i = 0; i <= rare_copies; i++) het_probs[i] /= sum;
+ +
+ +    /*  p-value calculation for p_hwe  */
+ +    double p_hwe = 0.0;
+ +    for (i = 0; i <= rare_copies; i++)
+ +    {
+ +        if (het_probs[i] > het_probs[obs_hets])
+ +            continue;
+ +        p_hwe += het_probs[i];
+ +    }
+ +
+ +    p_hwe = p_hwe > 1.0 ? 1.0 : p_hwe;
+ +    free(het_probs);
+ +    return p_hwe;
+ +
+ +}
+ +
+ +
+ +static void _bcf1_set_ref(bcf1_t *b, int idp)
+ +{
+ +    kstring_t s;
+ +    int old_n_gi = b->n_gi;
+ +    s.m = b->m_str; s.l = b->l_str - 1; s.s = b->str;
+ +    kputs(":GT", &s); kputc('\0', &s);
+ +    b->m_str = s.m; b->l_str = s.l; b->str = s.s;
+ +    bcf_sync(b);
+ +
+ +    // Call GTs
+ +    int isample, an = 0;
+ +    for (isample = 0; isample < b->n_smpl; isample++) 
+ +    {
+ +        if ( idp>=0 && ((uint16_t*)b->gi[idp].data)[isample]==0 )
+ +            ((uint8_t*)b->gi[old_n_gi].data)[isample] = 1<<7;
+ +        else
+ +        {
+ +            ((uint8_t*)b->gi[old_n_gi].data)[isample] = 0;
+ +            an += b->ploidy ? b->ploidy[isample] : 2;
+ +        }
+ +    }
+ +    bcf_fit_alt(b,1);
+ +    b->qual = 999;
+ +
+ +    // Prepare BCF for output: ref, alt, filter, info, format
+ +    memset(&s, 0, sizeof(kstring_t)); kputc('\0', &s); 
+ +    kputs(b->ref, &s); kputc('\0', &s);
+ +    kputs(b->alt, &s); kputc('\0', &s); kputc('\0', &s);
+ +    {
+ +        ksprintf(&s, "AN=%d;", an);
+ +        kputs(b->info, &s); 
+ +        anno16_t a;
+ +        int has_I16 = test16(b, &a) >= 0? 1 : 0;
+ +        if (has_I16 )
+ +        {
+ +            if ( a.is_tested) ksprintf(&s, ";PV4=%.2g,%.2g,%.2g,%.2g", a.p[0], a.p[1], a.p[2], a.p[3]);
+ +            ksprintf(&s, ";DP4=%d,%d,%d,%d;MQ=%d", a.d[0], a.d[1], a.d[2], a.d[3], a.mq);
+ +        }
+ +        kputc('\0', &s);
+ +        rm_info(&s, "I16=");
+ +        rm_info(&s, "QS=");
+ +    }
+ +    kputs(b->fmt, &s); kputc('\0', &s);
+ +    free(b->str);
+ +    b->m_str = s.m; b->l_str = s.l; b->str = s.s;
+ +    bcf_sync(b);
+ +}
+ +
+ +int call_multiallelic_gt(bcf1_t *b, bcf_p1aux_t *ma, double threshold, int var_only)
   {
       int nals = 1;
       char *p;
@@@ -331,6 -219,8 +336,6 @@@
       }
       if ( b->alt[0] && !*p ) nals++;
   
- -    if ( nals==1 ) return 1;
- -
       if ( nals>4 )
       {
           if ( *b->ref=='N' ) return 0;
@@@ -338,9 -228,10 +343,9 @@@
           exit(1);
       }
   
- -    // find PL and DP FORMAT indexes
+ +    // find PL, DV and DP FORMAT indexes
       uint8_t *pl = NULL;
- -    int npl = 0, idp=-1;
- -    int i;
+ +    int i, npl = 0, idp = -1, idv = -1;
       for (i = 0; i < b->n_gi; ++i) 
       {
           if (b->gi[i].fmt == bcf_str2int("PL", 2)) 
@@@ -348,13 -239,7 +353,13 @@@
               pl  = (uint8_t*)b->gi[i].data;
               npl = b->gi[i].len;
           }
- -        if (b->gi[i].fmt == bcf_str2int("DP", 2))  idp=i;
+ +        else if (b->gi[i].fmt == bcf_str2int("DP", 2))  idp=i;
+ +        else if (b->gi[i].fmt == bcf_str2int("DV", 2))  idv=i;
+ +    }
+ +    if ( nals==1 ) 
+ +    {
+ +        if ( !var_only ) _bcf1_set_ref(b, idp);
+ +        return 1;
       }
       if ( !pl ) return -1;
   
@@@ -385,9 -270,9 +390,9 @@@
       if ( sscanf(p+3,"%lf,%lf,%lf,%lf",&qsum[0],&qsum[1],&qsum[2],&qsum[3])!=4 ) { fprintf(stderr,"Could not parse %s\n",p); exit(1); }
   
   
- -    // Calculate the most likely combination of alleles
+ +    // Calculate the most likely combination of alleles, remembering the most and second most likely set
       int ia,ib,ic, max_als=0, max_als2=0;
- -    double ref_lk = 0, max_lk = INT_MIN, max_lk2 = INT_MIN, lk_sum = INT_MIN;
+ +    double ref_lk = 0, max_lk = INT_MIN, max_lk2 = INT_MIN, lk_sum = INT_MIN, lk_sums[3];
       for (ia=0; ia<nals; ia++)
       {
           double lk_tot = 0;
@@@ -404,7 -289,6 +409,7 @@@
           else if ( max_lk2<lk_tot ) { max_lk2 = lk_tot; max_als2 = 1<<ia; }
           lk_sum = lk_tot>lk_sum ? lk_tot + log(1+exp(lk_sum-lk_tot)) : lk_sum + log(1+exp(lk_tot-lk_sum));
       }
+ +    lk_sums[0] = lk_sum;
       if ( nals>1 )
       {
           for (ia=0; ia<nals; ia++)
@@@ -433,7 -317,6 +438,7 @@@
                   lk_sum = lk_tot>lk_sum ? lk_tot + log(1+exp(lk_sum-lk_tot)) : lk_sum + log(1+exp(lk_tot-lk_sum));
               }
           }
+ +        lk_sums[1] = lk_sum;
       }
       if ( nals>2 )
       {
@@@ -471,7 -354,6 +476,7 @@@
                   }
               }
           }
+ +        lk_sums[2] = lk_sum;
       }
   
       // Should we add another allele, does it increase the likelihood significantly?
@@@ -480,12 -362,9 +485,12 @@@
       for (i=0; i<nals; i++) if ( max_als2&1<<i) n2++;
       if ( n2<n1 && kf_gammap(1,2.0*(max_lk-max_lk2))<threshold )
       {
- -        max_lk = max_lk2;
+ +        // the threshold not exceeded, use the second most likely set with fewer alleles
+ +        max_lk  = max_lk2;
           max_als = max_als2;
+ +        n1 = n2;
       }
+ +    lk_sum = lk_sums[n1-1];
   
       // Get the BCF record ready for GT and GQ
       kstring_t s;
@@@ -497,20 -376,19 +502,20 @@@
   
       // Call GTs
       int isample, gts=0, ac[4] = {0,0,0,0};
+ +    int nRR = 0, nAA = 0, nRA = 0, max_dv = 0, dp_nref = 0;
       for (isample = 0; isample < b->n_smpl; isample++) 
       {
           int ploidy = b->ploidy ? b->ploidy[isample] : 2;
           double *p = pdg + isample*npdg;
           int ia, als = 0;
- -        double lk = 0, lk_sum=0;
+ +        double lk = 0, lk_s = 0;
           for (ia=0; ia<nals; ia++)
           {
               if ( !(max_als&1<<ia) ) continue;
               int iaa = (ia+1)*(ia+2)/2-1;
               double _lk = p[iaa]*qsum[ia]*qsum[ia];
               if ( _lk > lk ) { lk = _lk; als = ia<<3 | ia; }
- -            lk_sum += _lk;
+ +            lk_s += _lk;
           }
           if ( ploidy==2 ) 
           {
@@@ -524,48 -402,26 +529,48 @@@
                       int iab = iaa - ia + ib;
                       double _lk = 2*qsum[ia]*qsum[ib]*p[iab];
                       if ( _lk > lk ) { lk = _lk; als = ib<<3 | ia; }
- -                    lk_sum += _lk;
+ +                    lk_s += _lk;
                   }
               }
           }
- -        lk = -log(1-lk/lk_sum)/0.2302585;
- -        if ( idp>=0 && ((uint16_t*)b->gi[idp].data)[isample]==0 ) 
+ +        lk = -log(1-lk/lk_s)/0.2302585;
+ +        int dp = 0;
+ +        if ( idp>=0 && (dp=((uint16_t*)b->gi[idp].data)[isample])==0 )
           {
+ +            // no coverage
               ((uint8_t*)b->gi[old_n_gi].data)[isample]   = 1<<7;
               ((uint8_t*)b->gi[old_n_gi+1].data)[isample] = 0;
               continue;
           }
+ +        if ( lk>99 ) lk = 99;
           ((uint8_t*)b->gi[old_n_gi].data)[isample]   = als;
- -        ((uint8_t*)b->gi[old_n_gi+1].data)[isample] = lk<100 ? (int)lk : 99;
+ +        ((uint8_t*)b->gi[old_n_gi+1].data)[isample] = (int)lk;
+ +
+ +        // For MDV annotation
+ +        int dv;
+ +        if ( als && idv>=0 && (dv=((uint16_t*)b->gi[idv].data)[isample]) )
+ +        {
+ +            if ( max_dv < dv ) max_dv = dv;
+ +            dp_nref += dp;
+ +        }
+ +
+ +        // For HWE annotation; multiple ALT alleles treated as one
+ +        if ( !als ) nRR++;
+ +        else if ( !(als>>3&7) || !(als&7) ) nRA++;
+ +        else nAA++;
   
           gts |= 1<<(als>>3&7) | 1<<(als&7);
           ac[ als>>3&7 ]++;
           ac[ als&7 ]++;
       }
+ +    free(pdg);
       bcf_fit_alt(b,max_als);
   
+ +    // The VCF spec is ambiguous about QUAL: is it the probability of anything else
+ +    //  (that is QUAL(non-ref) = P(ref)+P(any non-ref other than ALT)) or is it
+ +    //  QUAL(non-ref)=P(ref) and QUAL(ref)=1-P(ref)? Assuming the latter.
+ +    b->qual = gts>1 ? -4.343*(ref_lk - lk_sum) : -4.343*log(1-exp(ref_lk - lk_sum));
+ +    if ( b->qual>999 ) b->qual = 999;
   
       // Prepare BCF for output: ref, alt, filter, info, format
       memset(&s, 0, sizeof(kstring_t)); kputc('\0', &s); 
@@@ -598,14 -454,6 +603,14 @@@
           {
               if ( a.is_tested) ksprintf(&s, ";PV4=%.2g,%.2g,%.2g,%.2g", a.p[0], a.p[1], a.p[2], a.p[3]);
               ksprintf(&s, ";DP4=%d,%d,%d,%d;MQ=%d", a.d[0], a.d[1], a.d[2], a.d[3], a.mq);
+ +            ksprintf(&s, ";QBD=%e", b->qual/(a.d[0] + a.d[1] + a.d[2] + a.d[3]));
+ +            if ( dp_nref ) ksprintf(&s, ";QBDNR=%e", b->qual/dp_nref);
+ +            if ( max_dv ) ksprintf(&s, ";MDV=%d", max_dv);
+ +        }
+ +        if ( nAA+nRA )
+ +        {
+ +            double hwe = calc_hwe(nAA, nRR, nRA);
+ +            ksprintf(&s, ";HWE=%e", hwe);
           }
           kputc('\0', &s);
           rm_info(&s, "I16=");
@@@ -614,8 -462,12 +619,8 @@@
       kputs(b->fmt, &s); kputc('\0', &s);
       free(b->str);
       b->m_str = s.m; b->l_str = s.l; b->str = s.s;
- -    b->qual = gts>1 ? -4.343*(ref_lk - lk_sum) : -4.343*(max_lk - lk_sum);
- -    if ( b->qual>999 ) b->qual = 999;
       bcf_sync(b);
   
- -
- -    free(pdg);
       return gts;
   }
   
@@@ -751,6 -603,8 +756,8 @@@ static void mc_cal_y_core(bcf_p1aux_t *
                 }
         }
         if (z[0] != ma->z) memcpy(ma->z, z[0], sizeof(double) * (ma->M + 1));
+       if (bcf_p1_fp_lk)
+               gzwrite(bcf_p1_fp_lk, ma->z, sizeof(double) * (ma->M + 1));
   }
   
   static void mc_cal_y(bcf_p1aux_t *ma)
diff --combined samtools.1

index 4b3f75a1926f0c4248f228f6378d2ff06e3e0c77,118a6e83079bddd9787e89673d64e7a4b1ddd0b3..869feaac02a64b8e49ba531f5bc164b413da9902
--- 1/samtools.1
--- 2/samtools.1
+++ b/samtools.1
@@@ -30,7 -30,7 +30,7 @@@ bcftools index in.bc
   .PP
   bcftools view in.bcf chr2:100-200 > out.vcf
   .PP
- -bcftools view -vc in.bcf > out.vcf 2> out.afs
+ +bcftools view -Nvm0.99 in.bcf > out.vcf 2> out.afs
   
   .SH DESCRIPTION
   .PP
@@@ -140,38 -140,17 +140,38 @@@ to another samtools command
   
   .TP
   .B tview
- -samtools tview <in.sorted.bam> [ref.fasta]
+ +samtools tview 
+ +.RB [ \-p 
+ +.IR chr:pos ]
+ +.RB [ \-s 
+ +.IR STR ]
+ +.RB [ \-d 
+ +.IR display ] 
+ +.RI <in.sorted.bam> 
+ +.RI [ref.fasta]
   
   Text alignment viewer (based on the ncurses library). In the viewer,
   press `?' for help and press `g' to check the alignment start from a
   region in the format like `chr10:10,000,000' or `=10,000,000' when
   viewing the same reference sequence.
   
+ +.B Options:
+ +.RS
+ +.TP 14
+ +.BI -d \ display
+ +Output as (H)tml or (C)urses or (T)ext
+ +.TP
+ +.BI -p \ chr:pos
+ +Go directly to this position
+ +.TP
+ +.BI -s \ STR
+ +Display only reads from this sample or read group
+ +.RE
+ +
   .TP
   .B mpileup
- -.B samtools mpileup
- -.RB [ \-EBug ]
+ +samtools mpileup
+ +.RB [ \-EBugp ]
   .RB [ \-C
   .IR capQcoef ]
   .RB [ \-r
@@@ -318,10 -297,6 +318,10 @@@ Phred-scaled gap open sequencing error 
   .I INT
   leads to more indel calls. [40]
   .TP
+ +.BI -p
+ +Apply -m and -F thresholds per sample to increase sensitivity of calling.
+ +By default both options are applied to reads pooled from all samples.
+ +.TP
   .BI -P \ STR
   Comma dilimited list of platforms (determined by
   .BR @RG-PL )
@@@ -353,7 -328,7 +353,7 @@@ which enables fast BAM concatenation
   
   .TP
   .B sort
- samtools sort [-no] [-m maxMem] <in.bam> <out.prefix>
+ samtools sort [-nof] [-m maxMem] <in.bam> <out.prefix>
   
   Sort alignments by leftmost coordinates. File
   .I <out.prefix>.bam
@@@ -371,6 -346,13 +371,13 @@@ Output the final alignment to the stand
   .B -n
   Sort by read names rather than by chromosomal coordinates
   .TP
+ .B -f
+ Use
+ .I <out.prefix>
+ as the full output path and do not append
+ .I .bam
+ suffix.
+ .TP
   .BI -m \ INT
   Approximately the maximum required memory. [500000000]
   .RE
@@@ -595,8 -577,6 +602,8 @@@ Minimum base quality to be used in het 
   .IR mutRate ]
   .RB [ \-p
   .IR varThres ]
+ +.RB [ \-m
+ +.IR varThres ]
   .RB [ \-P
   .IR prior ]
   .RB [ \-1
@@@ -679,12 -659,6 +686,12 @@@ Call per-sample genotypes at variant si
   .BI -i \ FLOAT
   Ratio of INDEL-to-SNP mutation rate [0.15]
   .TP
+ +.BI -m \ FLOAT
+ +New model for improved multiallelic and rare-variant calling. Another
+ +ALT allele is accepted if P(chi^2) of LRT exceeds the FLOAT threshold. The 
+ +parameter seems robust and the actual value usually does not affect the results
+ +much; a good value to use is 0.99. This is the recommended calling method. [0]
+ +.TP
   .BI -p \ FLOAT
   A site is considered to be a variant if P(ref|D)<FLOAT [0.5]
   .TP
@@@ -844,9 -818,6 +851,9 @@@ RP int     # permutations yielding a smalle
   CLR   int     Phred log ratio of genotype likelihoods with and without the trio/pair constraint
   UGT   string  Most probable genotype configuration without the trio constraint
   CGT   string  Most probable configuration with the trio constraint
+ +VDB   float   Tests variant positions within reads. Intended for filtering RNA-seq artifacts around splice sites
+ +RPB   float   Mann-Whitney rank-sum test for tail distance bias
+ +HWE   float   Hardy-Weinberg equilibrium test, Wigginton et al., PMID: 15789306
   .TE
   
   .SH EXAMPLES
author	Petr Danecek <pd3@sanger.ac.uk>
	Tue, 12 Mar 2013 13:25:17 +0000 (13:25 +0000)
committer	Petr Danecek <pd3@sanger.ac.uk>
	Tue, 12 Mar 2013 13:25:17 +0000 (13:25 +0000)
		1	2
bam_plcmd.c	patch \|	diff1 \|	diff2 \|	blob \| history
bcftools/call1.c	patch \|	diff1 \|	diff2 \|	blob \| history
bcftools/prob1.c	patch \|	diff1 \|	diff2 \|	blob \| history
samtools.1	patch \|	diff1 \|	diff2 \|	blob \| history