From: Petr Danecek Date: Tue, 12 Mar 2013 13:25:17 +0000 (+0000) Subject: Merge remote branch 'remotes/master/master' into fb-annots X-Git-Url: https://git.donarmstrong.com/?p=samtools.git;a=commitdiff_plain;h=60e0a8467ddbd0b89f15d201dcfe10c8796552b2;hp=-c Merge remote branch 'remotes/master/master' into fb-annots --- 60e0a8467ddbd0b89f15d201dcfe10c8796552b2 diff --combined bam_plcmd.c index e45c8c8,8935c5a..9c4f273 --- a/bam_plcmd.c +++ b/bam_plcmd.c @@@ -4,6 -4,8 +4,8 @@@ #include #include #include + #include + #include #include "sam.h" #include "faidx.h" #include "kstring.h" @@@ -80,6 -82,7 +82,7 @@@ int bed_overlap(const void *_h, const c typedef struct { int max_mq, min_mq, flag, min_baseQ, capQ_thres, max_depth, max_indel_depth, fmt_flag; + int rflag_require, rflag_filter; int openQ, extQ, tandemQ, min_support; // for indels double min_frac; // for indels char *reg, *pl_list; @@@ -117,6 -120,8 +120,8 @@@ static int mplp_func(void *data, bam1_ skip = 1; continue; } + if (ma->conf->rflag_require && !(ma->conf->rflag_require&b->core.flag)) { skip = 1; continue; } + if (ma->conf->rflag_filter && ma->conf->rflag_filter&b->core.flag) { skip = 1; continue; } if (ma->conf->bed) { // test overlap skip = !bed_overlap(ma->conf->bed, ma->h->target_name[b->core.tid], b->core.pos, bam_calend(&b->core, bam1_cigar(b))); if (skip) continue; @@@ -215,10 -220,6 +220,10 @@@ static int mpileup(mplp_conf_t *conf, i } data[i]->conf = conf; h_tmp = bam_header_read(data[i]->fp); + if ( !h_tmp ) { + fprintf(stderr,"[%s] fail to read the header of %s\n", __func__, fn[i]); + exit(1); + } data[i]->h = i? h : h_tmp; // for i==0, "h" has not been set yet bam_smpl_add(sm, fn[i], (conf->flag&MPLP_IGNORE_RG)? 0 : h_tmp->text); rghash = bcf_call_add_rg(rghash, h_tmp->text, conf->pl_list); @@@ -316,7 -317,7 +321,7 @@@ ref16 = bam_nt16_table[_ref0]; for (i = 0; i < gplp.n; ++i) bcf_call_glfgen(gplp.n_plp[i], gplp.plp[i], ref16, bca, bcr + i); - bcf_call_combine(gplp.n, bcr, ref16, &bc); + bcf_call_combine(gplp.n, bcr, bca, ref16, &bc); bcf_call2bcf(tid, pos, &bc, b, bcr, conf->fmt_flag, 0, 0); bcf_write(bp, bh, b); bcf_destroy(b); @@@ -324,7 -325,7 +329,7 @@@ if (!(conf->flag&MPLP_NO_INDEL) && total_depth < max_indel_depth && bcf_call_gap_prep(gplp.n, gplp.n_plp, gplp.plp, pos, bca, ref, rghash) >= 0) { for (i = 0; i < gplp.n; ++i) bcf_call_glfgen(gplp.n_plp[i], gplp.plp[i], -1, bca, bcr + i); - if (bcf_call_combine(gplp.n, bcr, -1, &bc) >= 0) { + if (bcf_call_combine(gplp.n, bcr, bca, -1, &bc) >= 0) { b = calloc(1, sizeof(bcf1_t)); bcf_call2bcf(tid, pos, &bc, b, bcr, conf->fmt_flag, bca, ref); bcf_write(bp, bh, b); @@@ -397,11 -398,15 +402,15 @@@ } #define MAX_PATH_LEN 1024 - static int read_file_list(const char *file_list,int *n,char **argv[]) + int read_file_list(const char *file_list,int *n,char **argv[]) { char buf[MAX_PATH_LEN]; - int len, nfiles; - char **files; + int len, nfiles = 0; + char **files = NULL; + struct stat sb; + + *n = 0; + *argv = NULL; FILE *fh = fopen(file_list,"r"); if ( !fh ) @@@ -410,28 -415,33 +419,33 @@@ return 1; } - // Speed is not an issue here, determine the number of files by reading the file twice - nfiles = 0; - while ( fgets(buf,MAX_PATH_LEN,fh) ) nfiles++; - - if ( fseek(fh, 0L, SEEK_SET) ) - { - fprintf(stderr,"%s: %s\n", file_list,strerror(errno)); - return 1; - } - files = calloc(nfiles,sizeof(char*)); nfiles = 0; while ( fgets(buf,MAX_PATH_LEN,fh) ) { + // allow empty lines and trailing spaces len = strlen(buf); while ( len>0 && isspace(buf[len-1]) ) len--; if ( !len ) continue; - files[nfiles] = malloc(sizeof(char)*(len+1)); - strncpy(files[nfiles],buf,len); - files[nfiles][len] = 0; + // check sanity of the file list + buf[len] = 0; + if (stat(buf, &sb) != 0) + { + // no such file, check if it is safe to print its name + int i, safe_to_print = 1; + for (i=0; i= 0) { + static struct option lopts[] = + { + {"rf",1,0,1}, // require flag + {"ff",1,0,2}, // filter flag + {0,0,0,0} + }; + while ((c = getopt_long(argc, argv, "Agf:r:l:M:q:Q:uaRC:BDSd:L:b:P:po:e:h:Im:F:EG:6OsV1:2:",lopts,NULL)) >= 0) { switch (c) { + case 1 : mplp.rflag_require = strtol(optarg,0,0); break; + case 2 : mplp.rflag_filter = strtol(optarg,0,0); break; case 'f': mplp.fai = fai_load(optarg); if (mplp.fai == 0) return 1; @@@ -517,7 -535,7 +539,7 @@@ fprintf(stderr, " -6 assume the quality is in the Illumina-1.3+ encoding\n"); fprintf(stderr, " -A count anomalous read pairs\n"); fprintf(stderr, " -B disable BAQ computation\n"); - fprintf(stderr, " -b FILE list of input BAM files [null]\n"); + fprintf(stderr, " -b FILE list of input BAM filenames, one per line [null]\n"); fprintf(stderr, " -C INT parameter for adjusting mapQ; 0 to disable [0]\n"); fprintf(stderr, " -d INT max per-BAM depth to avoid excessive memory usage [%d]\n", mplp.max_depth); fprintf(stderr, " -E recalculate extended BAQ on the fly thus ignoring existing BQs\n"); @@@ -529,6 -547,8 +551,8 @@@ fprintf(stderr, " -R ignore RG tags\n"); fprintf(stderr, " -q INT skip alignments with mapQ smaller than INT [%d]\n", mplp.min_mq); fprintf(stderr, " -Q INT skip bases with baseQ/BAQ smaller than INT [%d]\n", mplp.min_baseQ); + fprintf(stderr, " --rf INT required flags: skip reads with mask bits unset []\n"); + fprintf(stderr, " --ff INT filter flags: skip reads with mask bits set []\n"); fprintf(stderr, "\nOutput options:\n\n"); fprintf(stderr, " -D output per-sample DP in BCF (require -g/-u)\n"); fprintf(stderr, " -g generate BCF output (genotype likelihoods)\n"); diff --combined bcftools/call1.c index eb58498,bc1cd40..18805a0 --- a/bcftools/call1.c +++ b/bcftools/call1.c @@@ -190,27 -190,7 +190,27 @@@ static char **read_samples(const char * *_n = 0; s.l = s.m = 0; s.s = 0; fp = gzopen(fn, "r"); - if (fp == 0) return 0; // fail to open file + if (fp == 0) + { + // interpret as sample names, not as a file name + const char *t = fn, *p = t; + while (*t) + { + t++; + if ( *t==',' || !*t ) + { + sam = realloc(sam, sizeof(void*)*(n+1)); + sam[n] = (char*) malloc(sizeof(char)*(t-p+2)); + memcpy(sam[n], p, t-p); + sam[n][t-p] = 0; + sam[n][t-p+1] = 2; // assume diploid + p = t+1; + n++; + } + } + *_n = n; + return sam; // fail to open file + } ks = ks_init(fp); while (ks_getuntil(ks, 0, &s, &dret) >= 0) { int l; @@@ -286,18 -266,8 +286,18 @@@ static void write_header(bcf_hdr_t *h kputs("##INFO=\n", &str); if (!strstr(str.s, "##INFO=\n", &str); + if (!strstr(str.s, "##INFO=\n", &str); + if (!strstr(str.s, "##INFO=\n", &str); + if (!strstr(str.s, "##INFO=\n", &str); + if (!strstr(str.s, "##INFO=\n", &str); + if (!strstr(str.s, "##INFO=\n", &str); if (!strstr(str.s, "##INFO=\n", &str); + kputs("##INFO=\n", &str); if (!strstr(str.s, "##FORMAT=\n", &str); if (!strstr(str.s, "##FORMAT== 0) { + while ((c = getopt(argc, argv, "FN1:l:cC:eHAGvbSuP:t:p:QgLi:IMs:D:U:X:d:T:Ywm:K:")) >= 0) { switch (c) { case '1': vc.n1 = atoi(optarg); break; - case 'l': vc.bed = bed_read(optarg); break; + case 'l': vc.bed = bed_read(optarg); if (!vc.bed) fprintf(stderr,"Could not read \"%s\"\n", optarg); return 1; break; case 'D': vc.fn_dict = strdup(optarg); break; case 'F': vc.flag |= VC_FIX_PL; break; case 'N': vc.flag |= VC_ACGT_ONLY; break; @@@ -373,6 -346,7 +376,7 @@@ case 'C': vc.min_lrt = atof(optarg); break; case 'X': vc.min_perm_p = atof(optarg); break; case 'd': vc.min_smpl_frac = atof(optarg); break; + case 'K': bcf_p1_fp_lk = gzopen(optarg, "w"); break; case 's': vc.subsam = read_samples(optarg, &vc.n_sub); vc.ploidy = calloc(vc.n_sub + 1, 1); for (tid = 0; tid < vc.n_sub; ++tid) vc.ploidy[tid] = vc.subsam[tid][strlen(vc.subsam[tid]) + 1]; @@@ -462,7 -436,7 +466,7 @@@ vc.sublist = calloc(vc.n_sub, sizeof(int)); hout = bcf_hdr_subsam(hin, vc.n_sub, vc.subsam, vc.sublist); } - if (vc.flag & VC_CALL) write_header(hout); + write_header(hout); // always print the header vcf_hdr_write(bout, hout); } if (vc.flag & VC_CALL) { @@@ -496,6 -470,10 +500,10 @@@ } } } + if (bcf_p1_fp_lk && p1) { + int32_t M = bcf_p1_get_M(p1); + gzwrite(bcf_p1_fp_lk, &M, 4); + } while (vcf_read(bp, hin, b) > 0) { int is_indel, cons_llr = -1; int64_t cons_gt = -1; @@@ -544,15 -522,19 +552,19 @@@ int i; for (i = 0; i < 9; ++i) em[i] = -1.; } - if ( !(vc.flag&VC_KEEPALT) && vc.flag&VC_CALL && vc.min_ma_lrt>=0 ) + if ( !(vc.flag&VC_KEEPALT) && (vc.flag&VC_CALL) && vc.min_ma_lrt>=0 ) { bcf_p1_set_ploidy(b, p1); // could be improved: do this per site to allow pseudo-autosomal regions - int gts = call_multiallelic_gt(b,p1,vc.min_ma_lrt); + int gts = call_multiallelic_gt(b, p1, vc.min_ma_lrt, vc.flag&VC_VARONLY); if ( gts<=1 && vc.flag & VC_VARONLY ) continue; } else if (vc.flag & VC_CALL) { // call variants bcf_p1rst_t pr; - int calret = bcf_p1_cal(b, (em[7] >= 0 && em[7] < vc.min_lrt), p1, &pr); + int calret; + gzwrite(bcf_p1_fp_lk, &b->tid, 4); + gzwrite(bcf_p1_fp_lk, &b->pos, 4); + gzwrite(bcf_p1_fp_lk, &em[0], sizeof(double)); + calret = bcf_p1_cal(b, (em[7] >= 0 && em[7] < vc.min_lrt), p1, &pr); if (n_processed % 100000 == 0) { fprintf(stderr, "[%s] %ld sites processed.\n", __func__, (long)n_processed); bcf_p1_dump_afs(p1); @@@ -597,6 -579,8 +609,8 @@@ } else bcf_fix_gt(b); vcf_write(bout, hout, b); } + + if (bcf_p1_fp_lk) gzclose(bcf_p1_fp_lk); if (vc.prior_file) free(vc.prior_file); if (vc.flag & VC_CALL) bcf_p1_dump_afs(p1); if (hin != hout) bcf_hdr_destroy(hout); diff --combined bcftools/prob1.c index d655722,ffa608e..f04cf08 --- a/bcftools/prob1.c +++ b/bcftools/prob1.c @@@ -5,6 -5,7 +5,7 @@@ #include #include #include + #include #include "prob1.h" #include "kstring.h" @@@ -15,6 -16,8 +16,8 @@@ KSTREAM_INIT(gzFile, gzread, 16384 #define MC_EM_EPS 1e-5 #define MC_DEF_INDEL 0.15 + gzFile bcf_p1_fp_lk; + unsigned char seq_nt4_table[256] = { 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, @@@ -165,6 -168,8 +168,8 @@@ bcf_p1aux_t *bcf_p1_init(int n, uint8_ return ma; } + int bcf_p1_get_M(bcf_p1aux_t *b) { return b->M; } + int bcf_p1_set_n1(bcf_p1aux_t *b, int n1) { if (n1 == 0 || n1 >= b->n) return -1; @@@ -203,124 -208,7 +208,124 @@@ void bcf_p1_destroy(bcf_p1aux_t *ma extern double kf_gammap(double s, double z); int test16(bcf1_t *b, anno16_t *a); -int call_multiallelic_gt(bcf1_t *b, bcf_p1aux_t *ma, double threshold) +// Wigginton 2005, PMID: 15789306 +// written by Jan Wigginton +double calc_hwe(int obs_hom1, int obs_hom2, int obs_hets) +{ + if (obs_hom1 + obs_hom2 + obs_hets == 0 ) return 1; + + assert(obs_hom1 >= 0 && obs_hom2 >= 0 && obs_hets >= 0); + + int obs_homc = obs_hom1 < obs_hom2 ? obs_hom2 : obs_hom1; + int obs_homr = obs_hom1 < obs_hom2 ? obs_hom1 : obs_hom2; + + int rare_copies = 2 * obs_homr + obs_hets; + int genotypes = obs_hets + obs_homc + obs_homr; + + double *het_probs = (double*) calloc(rare_copies+1, sizeof(double)); + + /* start at midpoint */ + int mid = rare_copies * (2 * genotypes - rare_copies) / (2 * genotypes); + + /* check to ensure that midpoint and rare alleles have same parity */ + if ((rare_copies & 1) ^ (mid & 1)) mid++; + + int curr_hets = mid; + int curr_homr = (rare_copies - mid) / 2; + int curr_homc = genotypes - curr_hets - curr_homr; + + het_probs[mid] = 1.0; + double sum = het_probs[mid]; + for (curr_hets = mid; curr_hets > 1; curr_hets -= 2) + { + het_probs[curr_hets - 2] = het_probs[curr_hets] * curr_hets * (curr_hets - 1.0) / (4.0 * (curr_homr + 1.0) * (curr_homc + 1.0)); + sum += het_probs[curr_hets - 2]; + + /* 2 fewer heterozygotes for next iteration -> add one rare, one common homozygote */ + curr_homr++; + curr_homc++; + } + + curr_hets = mid; + curr_homr = (rare_copies - mid) / 2; + curr_homc = genotypes - curr_hets - curr_homr; + for (curr_hets = mid; curr_hets <= rare_copies - 2; curr_hets += 2) + { + het_probs[curr_hets + 2] = het_probs[curr_hets] * 4.0 * curr_homr * curr_homc /((curr_hets + 2.0) * (curr_hets + 1.0)); + sum += het_probs[curr_hets + 2]; + + /* add 2 heterozygotes for next iteration -> subtract one rare, one common homozygote */ + curr_homr--; + curr_homc--; + } + int i; + for (i = 0; i <= rare_copies; i++) het_probs[i] /= sum; + + /* p-value calculation for p_hwe */ + double p_hwe = 0.0; + for (i = 0; i <= rare_copies; i++) + { + if (het_probs[i] > het_probs[obs_hets]) + continue; + p_hwe += het_probs[i]; + } + + p_hwe = p_hwe > 1.0 ? 1.0 : p_hwe; + free(het_probs); + return p_hwe; + +} + + +static void _bcf1_set_ref(bcf1_t *b, int idp) +{ + kstring_t s; + int old_n_gi = b->n_gi; + s.m = b->m_str; s.l = b->l_str - 1; s.s = b->str; + kputs(":GT", &s); kputc('\0', &s); + b->m_str = s.m; b->l_str = s.l; b->str = s.s; + bcf_sync(b); + + // Call GTs + int isample, an = 0; + for (isample = 0; isample < b->n_smpl; isample++) + { + if ( idp>=0 && ((uint16_t*)b->gi[idp].data)[isample]==0 ) + ((uint8_t*)b->gi[old_n_gi].data)[isample] = 1<<7; + else + { + ((uint8_t*)b->gi[old_n_gi].data)[isample] = 0; + an += b->ploidy ? b->ploidy[isample] : 2; + } + } + bcf_fit_alt(b,1); + b->qual = 999; + + // Prepare BCF for output: ref, alt, filter, info, format + memset(&s, 0, sizeof(kstring_t)); kputc('\0', &s); + kputs(b->ref, &s); kputc('\0', &s); + kputs(b->alt, &s); kputc('\0', &s); kputc('\0', &s); + { + ksprintf(&s, "AN=%d;", an); + kputs(b->info, &s); + anno16_t a; + int has_I16 = test16(b, &a) >= 0? 1 : 0; + if (has_I16 ) + { + if ( a.is_tested) ksprintf(&s, ";PV4=%.2g,%.2g,%.2g,%.2g", a.p[0], a.p[1], a.p[2], a.p[3]); + ksprintf(&s, ";DP4=%d,%d,%d,%d;MQ=%d", a.d[0], a.d[1], a.d[2], a.d[3], a.mq); + } + kputc('\0', &s); + rm_info(&s, "I16="); + rm_info(&s, "QS="); + } + kputs(b->fmt, &s); kputc('\0', &s); + free(b->str); + b->m_str = s.m; b->l_str = s.l; b->str = s.s; + bcf_sync(b); +} + +int call_multiallelic_gt(bcf1_t *b, bcf_p1aux_t *ma, double threshold, int var_only) { int nals = 1; char *p; @@@ -331,6 -219,8 +336,6 @@@ } if ( b->alt[0] && !*p ) nals++; - if ( nals==1 ) return 1; - if ( nals>4 ) { if ( *b->ref=='N' ) return 0; @@@ -338,9 -228,10 +343,9 @@@ exit(1); } - // find PL and DP FORMAT indexes + // find PL, DV and DP FORMAT indexes uint8_t *pl = NULL; - int npl = 0, idp=-1; - int i; + int i, npl = 0, idp = -1, idv = -1; for (i = 0; i < b->n_gi; ++i) { if (b->gi[i].fmt == bcf_str2int("PL", 2)) @@@ -348,13 -239,7 +353,13 @@@ pl = (uint8_t*)b->gi[i].data; npl = b->gi[i].len; } - if (b->gi[i].fmt == bcf_str2int("DP", 2)) idp=i; + else if (b->gi[i].fmt == bcf_str2int("DP", 2)) idp=i; + else if (b->gi[i].fmt == bcf_str2int("DV", 2)) idv=i; + } + if ( nals==1 ) + { + if ( !var_only ) _bcf1_set_ref(b, idp); + return 1; } if ( !pl ) return -1; @@@ -385,9 -270,9 +390,9 @@@ if ( sscanf(p+3,"%lf,%lf,%lf,%lf",&qsum[0],&qsum[1],&qsum[2],&qsum[3])!=4 ) { fprintf(stderr,"Could not parse %s\n",p); exit(1); } - // Calculate the most likely combination of alleles + // Calculate the most likely combination of alleles, remembering the most and second most likely set int ia,ib,ic, max_als=0, max_als2=0; - double ref_lk = 0, max_lk = INT_MIN, max_lk2 = INT_MIN, lk_sum = INT_MIN; + double ref_lk = 0, max_lk = INT_MIN, max_lk2 = INT_MIN, lk_sum = INT_MIN, lk_sums[3]; for (ia=0; ialk_sum ? lk_tot + log(1+exp(lk_sum-lk_tot)) : lk_sum + log(1+exp(lk_tot-lk_sum)); } + lk_sums[0] = lk_sum; if ( nals>1 ) { for (ia=0; ialk_sum ? lk_tot + log(1+exp(lk_sum-lk_tot)) : lk_sum + log(1+exp(lk_tot-lk_sum)); } } + lk_sums[1] = lk_sum; } if ( nals>2 ) { @@@ -471,7 -354,6 +476,7 @@@ } } } + lk_sums[2] = lk_sum; } // Should we add another allele, does it increase the likelihood significantly? @@@ -480,12 -362,9 +485,12 @@@ for (i=0; in_smpl; isample++) { int ploidy = b->ploidy ? b->ploidy[isample] : 2; double *p = pdg + isample*npdg; int ia, als = 0; - double lk = 0, lk_sum=0; + double lk = 0, lk_s = 0; for (ia=0; ia lk ) { lk = _lk; als = ia<<3 | ia; } - lk_sum += _lk; + lk_s += _lk; } if ( ploidy==2 ) { @@@ -524,48 -402,26 +529,48 @@@ int iab = iaa - ia + ib; double _lk = 2*qsum[ia]*qsum[ib]*p[iab]; if ( _lk > lk ) { lk = _lk; als = ib<<3 | ia; } - lk_sum += _lk; + lk_s += _lk; } } } - lk = -log(1-lk/lk_sum)/0.2302585; - if ( idp>=0 && ((uint16_t*)b->gi[idp].data)[isample]==0 ) + lk = -log(1-lk/lk_s)/0.2302585; + int dp = 0; + if ( idp>=0 && (dp=((uint16_t*)b->gi[idp].data)[isample])==0 ) { + // no coverage ((uint8_t*)b->gi[old_n_gi].data)[isample] = 1<<7; ((uint8_t*)b->gi[old_n_gi+1].data)[isample] = 0; continue; } + if ( lk>99 ) lk = 99; ((uint8_t*)b->gi[old_n_gi].data)[isample] = als; - ((uint8_t*)b->gi[old_n_gi+1].data)[isample] = lk<100 ? (int)lk : 99; + ((uint8_t*)b->gi[old_n_gi+1].data)[isample] = (int)lk; + + // For MDV annotation + int dv; + if ( als && idv>=0 && (dv=((uint16_t*)b->gi[idv].data)[isample]) ) + { + if ( max_dv < dv ) max_dv = dv; + dp_nref += dp; + } + + // For HWE annotation; multiple ALT alleles treated as one + if ( !als ) nRR++; + else if ( !(als>>3&7) || !(als&7) ) nRA++; + else nAA++; gts |= 1<<(als>>3&7) | 1<<(als&7); ac[ als>>3&7 ]++; ac[ als&7 ]++; } + free(pdg); bcf_fit_alt(b,max_als); + // The VCF spec is ambiguous about QUAL: is it the probability of anything else + // (that is QUAL(non-ref) = P(ref)+P(any non-ref other than ALT)) or is it + // QUAL(non-ref)=P(ref) and QUAL(ref)=1-P(ref)? Assuming the latter. + b->qual = gts>1 ? -4.343*(ref_lk - lk_sum) : -4.343*log(1-exp(ref_lk - lk_sum)); + if ( b->qual>999 ) b->qual = 999; // Prepare BCF for output: ref, alt, filter, info, format memset(&s, 0, sizeof(kstring_t)); kputc('\0', &s); @@@ -598,14 -454,6 +603,14 @@@ { if ( a.is_tested) ksprintf(&s, ";PV4=%.2g,%.2g,%.2g,%.2g", a.p[0], a.p[1], a.p[2], a.p[3]); ksprintf(&s, ";DP4=%d,%d,%d,%d;MQ=%d", a.d[0], a.d[1], a.d[2], a.d[3], a.mq); + ksprintf(&s, ";QBD=%e", b->qual/(a.d[0] + a.d[1] + a.d[2] + a.d[3])); + if ( dp_nref ) ksprintf(&s, ";QBDNR=%e", b->qual/dp_nref); + if ( max_dv ) ksprintf(&s, ";MDV=%d", max_dv); + } + if ( nAA+nRA ) + { + double hwe = calc_hwe(nAA, nRR, nRA); + ksprintf(&s, ";HWE=%e", hwe); } kputc('\0', &s); rm_info(&s, "I16="); @@@ -614,8 -462,12 +619,8 @@@ kputs(b->fmt, &s); kputc('\0', &s); free(b->str); b->m_str = s.m; b->l_str = s.l; b->str = s.s; - b->qual = gts>1 ? -4.343*(ref_lk - lk_sum) : -4.343*(max_lk - lk_sum); - if ( b->qual>999 ) b->qual = 999; bcf_sync(b); - - free(pdg); return gts; } @@@ -751,6 -603,8 +756,8 @@@ static void mc_cal_y_core(bcf_p1aux_t * } } if (z[0] != ma->z) memcpy(ma->z, z[0], sizeof(double) * (ma->M + 1)); + if (bcf_p1_fp_lk) + gzwrite(bcf_p1_fp_lk, ma->z, sizeof(double) * (ma->M + 1)); } static void mc_cal_y(bcf_p1aux_t *ma) diff --combined samtools.1 index 4b3f75a,118a6e8..869feaa --- a/samtools.1 +++ b/samtools.1 @@@ -30,7 -30,7 +30,7 @@@ bcftools index in.bc .PP bcftools view in.bcf chr2:100-200 > out.vcf .PP -bcftools view -vc in.bcf > out.vcf 2> out.afs +bcftools view -Nvm0.99 in.bcf > out.vcf 2> out.afs .SH DESCRIPTION .PP @@@ -140,38 -140,17 +140,38 @@@ to another samtools command .TP .B tview -samtools tview [ref.fasta] +samtools tview +.RB [ \-p +.IR chr:pos ] +.RB [ \-s +.IR STR ] +.RB [ \-d +.IR display ] +.RI +.RI [ref.fasta] Text alignment viewer (based on the ncurses library). In the viewer, press `?' for help and press `g' to check the alignment start from a region in the format like `chr10:10,000,000' or `=10,000,000' when viewing the same reference sequence. +.B Options: +.RS +.TP 14 +.BI -d \ display +Output as (H)tml or (C)urses or (T)ext +.TP +.BI -p \ chr:pos +Go directly to this position +.TP +.BI -s \ STR +Display only reads from this sample or read group +.RE + .TP .B mpileup -.B samtools mpileup -.RB [ \-EBug ] +samtools mpileup +.RB [ \-EBugp ] .RB [ \-C .IR capQcoef ] .RB [ \-r @@@ -318,10 -297,6 +318,10 @@@ Phred-scaled gap open sequencing error .I INT leads to more indel calls. [40] .TP +.BI -p +Apply -m and -F thresholds per sample to increase sensitivity of calling. +By default both options are applied to reads pooled from all samples. +.TP .BI -P \ STR Comma dilimited list of platforms (determined by .BR @RG-PL ) @@@ -353,7 -328,7 +353,7 @@@ which enables fast BAM concatenation .TP .B sort - samtools sort [-no] [-m maxMem] + samtools sort [-nof] [-m maxMem] Sort alignments by leftmost coordinates. File .I .bam @@@ -371,6 -346,13 +371,13 @@@ Output the final alignment to the stand .B -n Sort by read names rather than by chromosomal coordinates .TP + .B -f + Use + .I + as the full output path and do not append + .I .bam + suffix. + .TP .BI -m \ INT Approximately the maximum required memory. [500000000] .RE @@@ -595,8 -577,6 +602,8 @@@ Minimum base quality to be used in het .IR mutRate ] .RB [ \-p .IR varThres ] +.RB [ \-m +.IR varThres ] .RB [ \-P .IR prior ] .RB [ \-1 @@@ -679,12 -659,6 +686,12 @@@ Call per-sample genotypes at variant si .BI -i \ FLOAT Ratio of INDEL-to-SNP mutation rate [0.15] .TP +.BI -m \ FLOAT +New model for improved multiallelic and rare-variant calling. Another +ALT allele is accepted if P(chi^2) of LRT exceeds the FLOAT threshold. The +parameter seems robust and the actual value usually does not affect the results +much; a good value to use is 0.99. This is the recommended calling method. [0] +.TP .BI -p \ FLOAT A site is considered to be a variant if P(ref|D)