From: Petr Danecek Date: Fri, 15 Mar 2013 09:27:09 +0000 (+0000) Subject: Merge remote branch 'pd3master/master' X-Git-Url: https://git.donarmstrong.com/?a=commitdiff_plain;h=96b5f2294ac0054230e88913c4983d548069ea4e;hp=3f820f8c565a8162d835a321a5c0ecdd9567d820;p=samtools.git Merge remote branch 'pd3master/master' --- diff --git a/bam2bcf.c b/bam2bcf.c index a51a406..340b10b 100644 --- a/bam2bcf.c +++ b/bam2bcf.c @@ -1,5 +1,6 @@ #include #include +#include #include "bam.h" #include "kstring.h" #include "bam2bcf.h" @@ -27,20 +28,54 @@ bcf_callaux_t *bcf_call_init(double theta, int min_baseQ) bca->min_frac = 0.002; bca->min_support = 1; bca->per_sample_flt = 0; - return bca; + bca->npos = 100; + bca->ref_pos = calloc(bca->npos, sizeof(int)); + bca->alt_pos = calloc(bca->npos, sizeof(int)); + return bca; +} + + +static int get_position(const bam_pileup1_t *p, int *len) +{ + int icig, n_tot_bases = 0, iread = 0, edist = p->qpos + 1; + for (icig=0; icigb->core.n_cigar; icig++) + { + // Conversion from uint32_t to MIDNSHP + // 0123456 + // MIDNSHP + int cig = bam1_cigar(p->b)[icig] & BAM_CIGAR_MASK; + int ncig = bam1_cigar(p->b)[icig] >> BAM_CIGAR_SHIFT; + if ( cig==0 ) + { + n_tot_bases += ncig; + iread += ncig; + } + else if ( cig==1 ) + { + n_tot_bases += ncig; + iread += ncig; + } + else if ( cig==4 ) + { + iread += ncig; + if ( iread<=p->qpos ) edist -= ncig; + } + } + *len = n_tot_bases; + return edist; } void bcf_call_destroy(bcf_callaux_t *bca) { if (bca == 0) return; errmod_destroy(bca->e); + if (bca->npos) { free(bca->ref_pos); free(bca->alt_pos); bca->npos = 0; } free(bca->bases); free(bca->inscns); free(bca); } /* ref_base is the 4-bit representation of the reference base. It is * negative if we are looking at an indel. */ int bcf_call_glfgen(int _n, const bam_pileup1_t *pl, int ref_base, bcf_callaux_t *bca, bcf_callret1_t *r) { - static int *var_pos = NULL, nvar_pos = 0; int i, n, ref4, is_indel, ori_depth = 0; memset(r, 0, sizeof(bcf_callret1_t)); if (ref_base >= 0) { @@ -87,102 +122,168 @@ int bcf_call_glfgen(int _n, const bam_pileup1_t *pl, int ref_base, bcf_callaux_t if (min_dist > p->qpos) min_dist = p->qpos; if (min_dist > CAP_DIST) min_dist = CAP_DIST; r->anno[1<<2|is_diff<<1|0] += baseQ; - r->anno[1<<2|is_diff<<1|1] += baseQ * baseQ; // FIXME: signed int is not enough for thousands of samples + r->anno[1<<2|is_diff<<1|1] += baseQ * baseQ; r->anno[2<<2|is_diff<<1|0] += mapQ; - r->anno[2<<2|is_diff<<1|1] += mapQ * mapQ; // FIXME: signed int is not enough for thousands of samples + r->anno[2<<2|is_diff<<1|1] += mapQ * mapQ; r->anno[3<<2|is_diff<<1|0] += min_dist; r->anno[3<<2|is_diff<<1|1] += min_dist * min_dist; + + // collect read positions for ReadPosBias + int len, pos = get_position(p, &len); + int epos = (double)pos/(len+1) * bca->npos; + if ( bam1_seqi(bam1_seq(p->b),p->qpos) == ref_base ) + bca->ref_pos[epos]++; + else + bca->alt_pos[epos]++; } r->depth = n; r->ori_depth = ori_depth; // glfgen errmod_cal(bca->e, n, 5, bca->bases, r->p); + return r->depth; +} - // Calculate the Variant Distance Bias (make it optional?) - if ( nvar_pos < _n ) { - nvar_pos = _n; - var_pos = realloc(var_pos,sizeof(int)*nvar_pos); - } - int alt_dp=0, read_len=0; - for (i=0; i<_n; i++) { - const bam_pileup1_t *p = pl + i; - if ( bam1_seqi(bam1_seq(p->b),p->qpos) == ref_base ) - continue; +double mann_whitney_1947(int n, int m, int U) +{ + if (U<0) return 0; + if (n==0||m==0) return U==0 ? 1 : 0; + return (double)n/(n+m)*mann_whitney_1947(n-1,m,U-m) + (double)m/(n+m)*mann_whitney_1947(n,m-1,U); +} - var_pos[alt_dp] = p->qpos; - if ( (bam1_cigar(p->b)[0]&BAM_CIGAR_MASK)==4 ) - var_pos[alt_dp] -= bam1_cigar(p->b)[0]>>BAM_CIGAR_SHIFT; +void calc_ReadPosBias(bcf_callaux_t *bca, bcf_call_t *call) +{ + int i, nref = 0, nalt = 0; + unsigned long int U = 0; + for (i=0; inpos; i++) + { + nref += bca->ref_pos[i]; + nalt += bca->alt_pos[i]; + U += nref*bca->alt_pos[i]; + bca->ref_pos[i] = 0; + bca->alt_pos[i] = 0; + } +#if 0 +//todo + double var = 0, avg = (double)(nref+nalt)/bca->npos; + for (i=0; inpos; i++) + { + double ediff = bca->ref_pos[i] + bca->alt_pos[i] - avg; + var += ediff*ediff; + bca->ref_pos[i] = 0; + bca->alt_pos[i] = 0; + } + call->read_pos.avg = avg; + call->read_pos.var = sqrt(var/bca->npos); + call->read_pos.dp = nref+nalt; +#endif + if ( !nref || !nalt ) + { + call->read_pos_bias = -1; + return; + } - alt_dp++; - read_len += p->b->core.l_qseq; + if ( nref>=8 || nalt>=8 ) + { + // normal approximation + double mean = ((double)nref*nalt+1.0)/2.0; + double var2 = (double)nref*nalt*(nref+nalt+1.0)/12.0; + double z = (U-mean)/sqrt(var2); + call->read_pos_bias = z; + //fprintf(stderr,"nref=%d nalt=%d U=%ld mean=%e var=%e zval=%e\n", nref,nalt,U,mean,sqrt(var2),call->read_pos_bias); } - float mvd=0; - int j; - n=0; - for (i=0; i= 1./sqrt(var2*2*M_PI) ) z = 0; // equal to mean + else + { + if ( U >= nref*nalt/2. ) z = sqrt(-2*log(sqrt(var2*2*M_PI)*p)); + else z = -sqrt(-2*log(sqrt(var2*2*M_PI)*p)); } + call->read_pos_bias = z; + //fprintf(stderr,"nref=%d nalt=%d U=%ld p=%e var2=%e zval=%e\n", nref,nalt,U, p,var2,call->read_pos_bias); } - r->mvd[0] = n ? mvd/n : 0; - r->mvd[1] = alt_dp; - r->mvd[2] = alt_dp ? read_len/alt_dp : 0; - - return r->depth; } - -void calc_vdb(int n, const bcf_callret1_t *calls, bcf_call_t *call) +float mean_diff_to_prob(float mdiff, int dp, int readlen) { - // Variant distance bias. Samples merged by means of DP-weighted average. - - float weight=0, tot_prob=0; - - int i; - for (i=0; i2*mu ? 0 : sin(mvd*3.14/2/mu) / (4*mu/3.14); - } - else - { - // Scaled gaussian curve, crude approximation, but behaves well. Using fixed depth for bigger depths. - if ( dp>5 ) - dp = 5; - float sigma2 = (read_len/1.9/(dp+1)) * (read_len/1.9/(dp+1)); - float norm = 1.125*sqrt(2*3.14*sigma2); - float mu = read_len/2.9; - if ( mvd < mu ) - prob = exp(-(mvd-mu)*(mvd-mu)/2/sigma2)/norm; - else - prob = exp(-(mvd-mu)*(mvd-mu)/3.125/sigma2)/norm; - } + float m, v; + if ( dp>=24 ) + { + m = readlen/8.; + if (dp>100) dp = 100; + v = 1.476/(0.182*pow(dp,0.514)); + v = v*(readlen/100.); + } + else + { + m = mv[dp][0]; + v = mv[dp][1]; + m = m*readlen/100.; + v = v*readlen/100.; + v *= 1.2; // allow more variability + } + return 1.0/(v*sqrt(2*M_PI)) * exp(-0.5*((mdiff-m)/v)*((mdiff-m)/v)); +} - //fprintf(stderr,"dp=%d mvd=%d read_len=%d -> prob=%f\n", dp,mvd,read_len,prob); - tot_prob += prob*dp; - weight += dp; +void calc_vdb(bcf_callaux_t *bca, bcf_call_t *call) +{ + int i, dp = 0; + float mean_pos = 0, mean_diff = 0; + for (i=0; inpos; i++) + { + if ( !bca->alt_pos[i] ) continue; + dp += bca->alt_pos[i]; + int j = inpos/2 ? i : bca->npos - i; + mean_pos += bca->alt_pos[i]*j; + } + if ( dp<2 ) + { + call->vdb = -1; + return; + } + mean_pos /= dp; + for (i=0; inpos; i++) + { + if ( !bca->alt_pos[i] ) continue; + int j = inpos/2 ? i : bca->npos - i; + mean_diff += bca->alt_pos[i] * fabs(j - mean_pos); } - tot_prob = weight ? tot_prob/weight : 1; - //fprintf(stderr,"prob=%f\n", tot_prob); - call->vdb = tot_prob; + mean_diff /= dp; + call->vdb = mean_diff_to_prob(mean_diff, dp, bca->npos); } -int bcf_call_combine(int n, const bcf_callret1_t *calls, int ref_base /*4-bit*/, bcf_call_t *call) +/** + * bcf_call_combine() - sets the PL array and VDB, RPB annotations, finds the top two alleles + * @n: number of samples + * @calls: each sample's calls + * @bca: auxiliary data structure for holding temporary values + * @ref_base: the reference base + * @call: filled with the annotations + */ +int bcf_call_combine(int n, const bcf_callret1_t *calls, bcf_callaux_t *bca, int ref_base /*4-bit*/, bcf_call_t *call) { int ref4, i, j, qsum[4]; int64_t tmp; @@ -264,7 +365,8 @@ int bcf_call_combine(int n, const bcf_callret1_t *calls, int ref_base /*4-bit*/, for (j = 0; j < 16; ++j) call->anno[j] += calls[i].anno[j]; } - calc_vdb(n, calls, call); + calc_vdb(bca, call); + calc_ReadPosBias(bca, call); return 0; } @@ -319,9 +421,12 @@ int bcf_call2bcf(int tid, int pos, bcf_call_t *bc, bcf1_t *b, bcf_callret1_t *bc if (i) kputc(',', &s); kputw(bc->anno[i], &s); } + //ksprintf(&s,";RPS=%d,%f,%f", bc->read_pos.dp,bc->read_pos.avg,bc->read_pos.var); ksprintf(&s,";QS=%f,%f,%f,%f", bc->qsum[0],bc->qsum[1],bc->qsum[2],bc->qsum[3]); - if (bc->vdb != 1) - ksprintf(&s, ";VDB=%.4f", bc->vdb); + if (bc->vdb != -1) + ksprintf(&s, ";VDB=%e", bc->vdb); + if (bc->read_pos_bias != -1 ) + ksprintf(&s, ";RPB=%e", bc->read_pos_bias); kputc('\0', &s); // FMT kputs("PL", &s); diff --git a/bam2bcf.h b/bam2bcf.h index 8ac6b79..b2b1825 100644 --- a/bam2bcf.h +++ b/bam2bcf.h @@ -17,10 +17,12 @@ typedef struct __bcf_callaux_t { int min_support, max_support; // for collecting indel candidates double min_frac, max_frac; // for collecting indel candidates int per_sample_flt; // indel filtering strategy + int *ref_pos, *alt_pos, npos; // for ReadPosBias // for internal uses int max_bases; int indel_types[4]; int maxins, indelreg; + int read_len; char *inscns; uint16_t *bases; errmod_t *e; @@ -29,9 +31,8 @@ typedef struct __bcf_callaux_t { typedef struct { int depth, n_supp, ori_depth, qsum[4]; - int anno[16]; + unsigned int anno[16]; float p[25]; - int mvd[3]; // mean variant distance, number of variant reads, average read length } bcf_callret1_t; typedef struct { @@ -39,9 +40,11 @@ typedef struct { float qsum[4]; int n, n_alleles, shift, ori_ref, unseen; int n_supp; // number of supporting non-reference reads - int anno[16], depth, ori_depth; + unsigned int anno[16], depth, ori_depth; uint8_t *PL; float vdb; // variant distance bias + float read_pos_bias; + struct { float avg, var; int dp; } read_pos; } bcf_call_t; #ifdef __cplusplus @@ -51,7 +54,7 @@ extern "C" { bcf_callaux_t *bcf_call_init(double theta, int min_baseQ); void bcf_call_destroy(bcf_callaux_t *bca); int bcf_call_glfgen(int _n, const bam_pileup1_t *pl, int ref_base, bcf_callaux_t *bca, bcf_callret1_t *r); - int bcf_call_combine(int n, const bcf_callret1_t *calls, int ref_base /*4-bit*/, bcf_call_t *call); + int bcf_call_combine(int n, const bcf_callret1_t *calls, bcf_callaux_t *bca, int ref_base /*4-bit*/, bcf_call_t *call); int bcf_call2bcf(int tid, int pos, bcf_call_t *bc, bcf1_t *b, bcf_callret1_t *bcr, int fmt_flag, const bcf_callaux_t *bca, const char *ref); int bcf_call_gap_prep(int n, int *n_plp, bam_pileup1_t **plp, int pos, bcf_callaux_t *bca, const char *ref, diff --git a/bam2bcf_indel.c b/bam2bcf_indel.c index 9aea5f9..30b3f46 100644 --- a/bam2bcf_indel.c +++ b/bam2bcf_indel.c @@ -109,6 +109,9 @@ static inline int est_indelreg(int pos, const char *ref, int l, char *ins4) return max_i - pos; } +/* + * @n: number of samples + */ int bcf_call_gap_prep(int n, int *n_plp, bam_pileup1_t **plp, int pos, bcf_callaux_t *bca, const char *ref, const void *rghash) { @@ -215,6 +218,8 @@ int bcf_call_gap_prep(int n, int *n_plp, bam_pileup1_t **plp, int pos, bcf_calla * reduces the power because sometimes, substitutions caused by * indels are not distinguishable from true mutations. Multiple * sequence realignment helps to increase the power. + * + * Masks mismatches present in at least 70% of the reads with 'N'. */ { // construct per-sample consensus int L = right - left + 1, max_i, max2_i; @@ -258,7 +263,7 @@ int bcf_call_gap_prep(int n, int *n_plp, bam_pileup1_t **plp, int pos, bcf_calla if ((double)(max2&0xffff) / ((max2&0xffff) + (max2>>16)) >= 0.7) max2_i = -1; if (max_i >= 0) r[max_i] = 15; if (max2_i >= 0) r[max2_i] = 15; -// for (i = 0; i < right - left; ++i) fputc("=ACMGRSVTWYHKDBN"[(int)r[i]], stderr); fputc('\n', stderr); + //for (i = 0; i < right - left; ++i) fputc("=ACMGRSVTWYHKDBN"[(int)r[i]], stderr); fputc('\n', stderr); } free(ref0); free(cns); } @@ -275,9 +280,9 @@ int bcf_call_gap_prep(int n, int *n_plp, bam_pileup1_t **plp, int pos, bcf_calla } } // construct the consensus sequence - max_ins = types[n_types - 1]; // max_ins is at least 0 + max_ins = types[n_types - 1]; // max_ins is at least 0 if (max_ins > 0) { - int *inscns_aux = calloc(4 * n_types * max_ins, sizeof(int)); + int *inscns_aux = calloc(5 * n_types * max_ins, sizeof(int)); // count the number of occurrences of each base at each position for each type of insertion for (t = 0; t < n_types; ++t) { if (types[t] > 0) { @@ -288,7 +293,8 @@ int bcf_call_gap_prep(int n, int *n_plp, bam_pileup1_t **plp, int pos, bcf_calla uint8_t *seq = bam1_seq(p->b); for (k = 1; k <= p->indel; ++k) { int c = bam_nt16_nt4_table[bam1_seqi(seq, p->qpos + k)]; - if (c < 4) ++inscns_aux[(t*max_ins+(k-1))*4 + c]; + assert(c<5); + ++inscns_aux[(t*max_ins+(k-1))*5 + c]; } } } @@ -299,11 +305,12 @@ int bcf_call_gap_prep(int n, int *n_plp, bam_pileup1_t **plp, int pos, bcf_calla inscns = calloc(n_types * max_ins, 1); for (t = 0; t < n_types; ++t) { for (j = 0; j < types[t]; ++j) { - int max = 0, max_k = -1, *ia = &inscns_aux[(t*max_ins+j)*4]; - for (k = 0; k < 4; ++k) + int max = 0, max_k = -1, *ia = &inscns_aux[(t*max_ins+j)*5]; + for (k = 0; k < 5; ++k) if (ia[k] > max) max = ia[k], max_k = k; inscns[t*max_ins + j] = max? max_k : 4; + if ( max_k==4 ) { types[t] = 0; break; } // discard insertions which contain N's } } free(inscns_aux); diff --git a/bam_plcmd.c b/bam_plcmd.c index 8935c5a..54a4597 100644 --- a/bam_plcmd.c +++ b/bam_plcmd.c @@ -9,6 +9,7 @@ #include "sam.h" #include "faidx.h" #include "kstring.h" +#include "sam_header.h" static inline int printw(int c, FILE *fp) { @@ -85,7 +86,7 @@ typedef struct { int rflag_require, rflag_filter; int openQ, extQ, tandemQ, min_support; // for indels double min_frac; // for indels - char *reg, *pl_list; + char *reg, *pl_list, *fai_fname; faidx_t *fai; void *bed, *rghash; } mplp_conf_t; @@ -220,6 +221,10 @@ static int mpileup(mplp_conf_t *conf, int n, char **fn) } data[i]->conf = conf; h_tmp = bam_header_read(data[i]->fp); + if ( !h_tmp ) { + fprintf(stderr,"[%s] fail to read the header of %s\n", __func__, fn[i]); + exit(1); + } data[i]->h = i? h : h_tmp; // for i==0, "h" has not been set yet bam_smpl_add(sm, fn[i], (conf->flag&MPLP_IGNORE_RG)? 0 : h_tmp->text); rghash = bcf_call_add_rg(rghash, h_tmp->text, conf->pl_list); @@ -271,9 +276,24 @@ static int mpileup(mplp_conf_t *conf, int n, char **fn) bh->l_smpl = s.l; bh->sname = malloc(s.l); memcpy(bh->sname, s.s, s.l); - bh->txt = malloc(strlen(BAM_VERSION) + 64); - bh->l_txt = 1 + sprintf(bh->txt, "##samtoolsVersion=%s\n", BAM_VERSION); - free(s.s); + s.l = 0; + ksprintf(&s, "##samtoolsVersion=%s\n", BAM_VERSION); + if (conf->fai_fname) ksprintf(&s, "##reference=file://%s\n", conf->fai_fname); + h->dict = sam_header_parse2(h->text); + int nseq; + const char *tags[] = {"SN","LN","UR","M5",NULL}; + char **tbl = sam_header2tbl_n(h->dict, "SQ", tags, &nseq); + for (i=0; i\n", &s); + } + if (tbl) free(tbl); + bh->txt = s.s; + bh->l_txt = 1 + s.l; bcf_hdr_sync(bh); bcf_hdr_write(bp, bh); bca = bcf_call_init(-1., conf->min_baseQ); @@ -317,7 +337,7 @@ static int mpileup(mplp_conf_t *conf, int n, char **fn) ref16 = bam_nt16_table[_ref0]; for (i = 0; i < gplp.n; ++i) bcf_call_glfgen(gplp.n_plp[i], gplp.plp[i], ref16, bca, bcr + i); - bcf_call_combine(gplp.n, bcr, ref16, &bc); + bcf_call_combine(gplp.n, bcr, bca, ref16, &bc); bcf_call2bcf(tid, pos, &bc, b, bcr, conf->fmt_flag, 0, 0); bcf_write(bp, bh, b); bcf_destroy(b); @@ -325,7 +345,7 @@ static int mpileup(mplp_conf_t *conf, int n, char **fn) if (!(conf->flag&MPLP_NO_INDEL) && total_depth < max_indel_depth && bcf_call_gap_prep(gplp.n, gplp.n_plp, gplp.plp, pos, bca, ref, rghash) >= 0) { for (i = 0; i < gplp.n; ++i) bcf_call_glfgen(gplp.n_plp[i], gplp.plp[i], -1, bca, bcr + i); - if (bcf_call_combine(gplp.n, bcr, -1, &bc) >= 0) { + if (bcf_call_combine(gplp.n, bcr, bca, -1, &bc) >= 0) { b = calloc(1, sizeof(bcf1_t)); bcf_call2bcf(tid, pos, &bc, b, bcr, conf->fmt_flag, bca, ref); bcf_write(bp, bh, b); @@ -483,6 +503,7 @@ int bam_mpileup(int argc, char *argv[]) case 'f': mplp.fai = fai_load(optarg); if (mplp.fai == 0) return 1; + mplp.fai_fname = optarg; break; case 'd': mplp.max_depth = atoi(optarg); break; case 'r': mplp.reg = strdup(optarg); break; diff --git a/bcftools/bcfutils.c b/bcftools/bcfutils.c index 1321c13..7638085 100644 --- a/bcftools/bcfutils.c +++ b/bcftools/bcfutils.c @@ -74,7 +74,6 @@ void bcf_fit_alt(bcf1_t *b, int mask) int i,j,nals=0; for (i=0; in_alleles <= nals ) return; // update ALT, in principle any of the alleles can be removed @@ -241,7 +240,7 @@ int bcf_fix_gt(bcf1_t *b) bcf_ginfo_t gt; // check the presence of the GT FMT if ((s = strstr(b->fmt, ":GT")) == 0) return 0; // no GT or GT is already the first - if (s[3] != '\0' && s[3] != ':') return 0; // :GTX in fact + assert(s[3] == '\0' || s[3] == ':'); // :GTX in fact tmp = bcf_str2int("GT", 2); for (i = 0; i < b->n_gi; ++i) if (b->gi[i].fmt == tmp) break; @@ -250,7 +249,10 @@ int bcf_fix_gt(bcf1_t *b) // move GT to the first for (; i > 0; --i) b->gi[i] = b->gi[i-1]; b->gi[0] = gt; - memmove(b->fmt + 3, b->fmt, s + 1 - b->fmt); + if ( s[3]==0 ) + memmove(b->fmt + 3, b->fmt, s - b->fmt); // :GT + else + memmove(b->fmt + 3, b->fmt, s - b->fmt + 1); // :GT: b->fmt[0] = 'G'; b->fmt[1] = 'T'; b->fmt[2] = ':'; return 0; } @@ -395,7 +397,11 @@ bcf_hdr_t *bcf_hdr_subsam(const bcf_hdr_t *h0, int n, char *const* samples, int kputs(samples[i], &s); kputc('\0', &s); } } - if (j < n) fprintf(stderr, "<%s> %d samples in the list but not in BCF.", __func__, n - j); + if (j < n) + { + fprintf(stderr, "<%s> %d samples in the list but not in BCF.", __func__, n - j); + exit(1); + } kh_destroy(str2id, hash); h = calloc(1, sizeof(bcf_hdr_t)); *h = *h0; diff --git a/bcftools/call1.c b/bcftools/call1.c index dda4164..e6373d3 100644 --- a/bcftools/call1.c +++ b/bcftools/call1.c @@ -190,7 +190,27 @@ static char **read_samples(const char *fn, int *_n) *_n = 0; s.l = s.m = 0; s.s = 0; fp = gzopen(fn, "r"); - if (fp == 0) return 0; // fail to open file + if (fp == 0) + { + // interpret as sample names, not as a file name + const char *t = fn, *p = t; + while (*t) + { + t++; + if ( *t==',' || !*t ) + { + sam = realloc(sam, sizeof(void*)*(n+1)); + sam[n] = (char*) malloc(sizeof(char)*(t-p+2)); + memcpy(sam[n], p, t-p); + sam[n][t-p] = 0; + sam[n][t-p+1] = 2; // assume diploid + p = t+1; + n++; + } + } + *_n = n; + return sam; // fail to open file + } ks = ks_init(fp); while (ks_getuntil(ks, 0, &s, &dret) >= 0) { int l; @@ -266,8 +286,16 @@ static void write_header(bcf_hdr_t *h) kputs("##INFO=\n", &str); if (!strstr(str.s, "##INFO=\n", &str); + if (!strstr(str.s, "##INFO=\n", &str); + //if (!strstr(str.s, "##INFO=\n", &str); + if (!strstr(str.s, "##INFO=\n", &str); + if (!strstr(str.s, "##INFO=\n", &str); if (!strstr(str.s, "##INFO=\n", &str); + kputs("##INFO=\n", &str); if (!strstr(str.s, "##FORMAT=\n", &str); if (!strstr(str.s, "##FORMAT==0 ) { bcf_p1_set_ploidy(b, p1); // could be improved: do this per site to allow pseudo-autosomal regions - int gts = call_multiallelic_gt(b,p1,vc.min_ma_lrt); + int gts = call_multiallelic_gt(b, p1, vc.min_ma_lrt, vc.flag&VC_VARONLY); if ( gts<=1 && vc.flag & VC_VARONLY ) continue; } else if (vc.flag & VC_CALL) { // call variants diff --git a/bcftools/prob1.c b/bcftools/prob1.c index ffa608e..3539ee3 100644 --- a/bcftools/prob1.c +++ b/bcftools/prob1.c @@ -208,7 +208,124 @@ void bcf_p1_destroy(bcf_p1aux_t *ma) extern double kf_gammap(double s, double z); int test16(bcf1_t *b, anno16_t *a); -int call_multiallelic_gt(bcf1_t *b, bcf_p1aux_t *ma, double threshold) +// Wigginton 2005, PMID: 15789306 +// written by Jan Wigginton +double calc_hwe(int obs_hom1, int obs_hom2, int obs_hets) +{ + if (obs_hom1 + obs_hom2 + obs_hets == 0 ) return 1; + + assert(obs_hom1 >= 0 && obs_hom2 >= 0 && obs_hets >= 0); + + int obs_homc = obs_hom1 < obs_hom2 ? obs_hom2 : obs_hom1; + int obs_homr = obs_hom1 < obs_hom2 ? obs_hom1 : obs_hom2; + + int rare_copies = 2 * obs_homr + obs_hets; + int genotypes = obs_hets + obs_homc + obs_homr; + + double *het_probs = (double*) calloc(rare_copies+1, sizeof(double)); + + /* start at midpoint */ + int mid = rare_copies * (2 * genotypes - rare_copies) / (2 * genotypes); + + /* check to ensure that midpoint and rare alleles have same parity */ + if ((rare_copies & 1) ^ (mid & 1)) mid++; + + int curr_hets = mid; + int curr_homr = (rare_copies - mid) / 2; + int curr_homc = genotypes - curr_hets - curr_homr; + + het_probs[mid] = 1.0; + double sum = het_probs[mid]; + for (curr_hets = mid; curr_hets > 1; curr_hets -= 2) + { + het_probs[curr_hets - 2] = het_probs[curr_hets] * curr_hets * (curr_hets - 1.0) / (4.0 * (curr_homr + 1.0) * (curr_homc + 1.0)); + sum += het_probs[curr_hets - 2]; + + /* 2 fewer heterozygotes for next iteration -> add one rare, one common homozygote */ + curr_homr++; + curr_homc++; + } + + curr_hets = mid; + curr_homr = (rare_copies - mid) / 2; + curr_homc = genotypes - curr_hets - curr_homr; + for (curr_hets = mid; curr_hets <= rare_copies - 2; curr_hets += 2) + { + het_probs[curr_hets + 2] = het_probs[curr_hets] * 4.0 * curr_homr * curr_homc /((curr_hets + 2.0) * (curr_hets + 1.0)); + sum += het_probs[curr_hets + 2]; + + /* add 2 heterozygotes for next iteration -> subtract one rare, one common homozygote */ + curr_homr--; + curr_homc--; + } + int i; + for (i = 0; i <= rare_copies; i++) het_probs[i] /= sum; + + /* p-value calculation for p_hwe */ + double p_hwe = 0.0; + for (i = 0; i <= rare_copies; i++) + { + if (het_probs[i] > het_probs[obs_hets]) + continue; + p_hwe += het_probs[i]; + } + + p_hwe = p_hwe > 1.0 ? 1.0 : p_hwe; + free(het_probs); + return p_hwe; + +} + + +static void _bcf1_set_ref(bcf1_t *b, int idp) +{ + kstring_t s; + int old_n_gi = b->n_gi; + s.m = b->m_str; s.l = b->l_str - 1; s.s = b->str; + kputs(":GT", &s); kputc('\0', &s); + b->m_str = s.m; b->l_str = s.l; b->str = s.s; + bcf_sync(b); + + // Call GTs + int isample, an = 0; + for (isample = 0; isample < b->n_smpl; isample++) + { + if ( idp>=0 && ((uint16_t*)b->gi[idp].data)[isample]==0 ) + ((uint8_t*)b->gi[old_n_gi].data)[isample] = 1<<7; + else + { + ((uint8_t*)b->gi[old_n_gi].data)[isample] = 0; + an += b->ploidy ? b->ploidy[isample] : 2; + } + } + bcf_fit_alt(b,1); + b->qual = 999; + + // Prepare BCF for output: ref, alt, filter, info, format + memset(&s, 0, sizeof(kstring_t)); kputc('\0', &s); + kputs(b->ref, &s); kputc('\0', &s); + kputs(b->alt, &s); kputc('\0', &s); kputc('\0', &s); + { + ksprintf(&s, "AN=%d;", an); + kputs(b->info, &s); + anno16_t a; + int has_I16 = test16(b, &a) >= 0? 1 : 0; + if (has_I16 ) + { + if ( a.is_tested) ksprintf(&s, ";PV4=%.2g,%.2g,%.2g,%.2g", a.p[0], a.p[1], a.p[2], a.p[3]); + ksprintf(&s, ";DP4=%d,%d,%d,%d;MQ=%d", a.d[0], a.d[1], a.d[2], a.d[3], a.mq); + } + kputc('\0', &s); + rm_info(&s, "I16="); + rm_info(&s, "QS="); + } + kputs(b->fmt, &s); kputc('\0', &s); + free(b->str); + b->m_str = s.m; b->l_str = s.l; b->str = s.s; + bcf_sync(b); +} + +int call_multiallelic_gt(bcf1_t *b, bcf_p1aux_t *ma, double threshold, int var_only) { int nals = 1; char *p; @@ -219,8 +336,6 @@ int call_multiallelic_gt(bcf1_t *b, bcf_p1aux_t *ma, double threshold) } if ( b->alt[0] && !*p ) nals++; - if ( nals==1 ) return 1; - if ( nals>4 ) { if ( *b->ref=='N' ) return 0; @@ -228,10 +343,9 @@ int call_multiallelic_gt(bcf1_t *b, bcf_p1aux_t *ma, double threshold) exit(1); } - // find PL and DP FORMAT indexes + // find PL, DV and DP FORMAT indexes uint8_t *pl = NULL; - int npl = 0, idp=-1; - int i; + int i, npl = 0, idp = -1, idv = -1; for (i = 0; i < b->n_gi; ++i) { if (b->gi[i].fmt == bcf_str2int("PL", 2)) @@ -239,7 +353,13 @@ int call_multiallelic_gt(bcf1_t *b, bcf_p1aux_t *ma, double threshold) pl = (uint8_t*)b->gi[i].data; npl = b->gi[i].len; } - if (b->gi[i].fmt == bcf_str2int("DP", 2)) idp=i; + else if (b->gi[i].fmt == bcf_str2int("DP", 2)) idp=i; + else if (b->gi[i].fmt == bcf_str2int("DV", 2)) idv=i; + } + if ( nals==1 ) + { + if ( !var_only ) _bcf1_set_ref(b, idp); + return 1; } if ( !pl ) return -1; @@ -270,9 +390,9 @@ int call_multiallelic_gt(bcf1_t *b, bcf_p1aux_t *ma, double threshold) if ( sscanf(p+3,"%lf,%lf,%lf,%lf",&qsum[0],&qsum[1],&qsum[2],&qsum[3])!=4 ) { fprintf(stderr,"Could not parse %s\n",p); exit(1); } - // Calculate the most likely combination of alleles + // Calculate the most likely combination of alleles, remembering the most and second most likely set int ia,ib,ic, max_als=0, max_als2=0; - double ref_lk = 0, max_lk = INT_MIN, max_lk2 = INT_MIN, lk_sum = INT_MIN; + double ref_lk = 0, max_lk = INT_MIN, max_lk2 = INT_MIN, lk_sum = INT_MIN, lk_sums[3]; for (ia=0; ialk_sum ? lk_tot + log(1+exp(lk_sum-lk_tot)) : lk_sum + log(1+exp(lk_tot-lk_sum)); } + lk_sums[0] = lk_sum; if ( nals>1 ) { for (ia=0; ialk_sum ? lk_tot + log(1+exp(lk_sum-lk_tot)) : lk_sum + log(1+exp(lk_tot-lk_sum)); } } + lk_sums[1] = lk_sum; } if ( nals>2 ) { @@ -354,6 +476,7 @@ int call_multiallelic_gt(bcf1_t *b, bcf_p1aux_t *ma, double threshold) } } } + lk_sums[2] = lk_sum; } // Should we add another allele, does it increase the likelihood significantly? @@ -362,9 +485,12 @@ int call_multiallelic_gt(bcf1_t *b, bcf_p1aux_t *ma, double threshold) for (i=0; in_smpl; isample++) { int ploidy = b->ploidy ? b->ploidy[isample] : 2; double *p = pdg + isample*npdg; int ia, als = 0; - double lk = 0, lk_sum=0; + double lk = 0, lk_s = 0; for (ia=0; ia lk ) { lk = _lk; als = ia<<3 | ia; } - lk_sum += _lk; + lk_s += _lk; } if ( ploidy==2 ) { @@ -402,26 +529,47 @@ int call_multiallelic_gt(bcf1_t *b, bcf_p1aux_t *ma, double threshold) int iab = iaa - ia + ib; double _lk = 2*qsum[ia]*qsum[ib]*p[iab]; if ( _lk > lk ) { lk = _lk; als = ib<<3 | ia; } - lk_sum += _lk; + lk_s += _lk; } } } - lk = -log(1-lk/lk_sum)/0.2302585; - if ( idp>=0 && ((uint16_t*)b->gi[idp].data)[isample]==0 ) + lk = -log(1-lk/lk_s)/0.2302585; + int dp = 0; + if ( idp>=0 && (dp=((uint16_t*)b->gi[idp].data)[isample])==0 ) { + // no coverage ((uint8_t*)b->gi[old_n_gi].data)[isample] = 1<<7; ((uint8_t*)b->gi[old_n_gi+1].data)[isample] = 0; continue; } + if ( lk>99 ) lk = 99; ((uint8_t*)b->gi[old_n_gi].data)[isample] = als; - ((uint8_t*)b->gi[old_n_gi+1].data)[isample] = lk<100 ? (int)lk : 99; + ((uint8_t*)b->gi[old_n_gi+1].data)[isample] = (int)lk; + + // For MDV annotation + int dv; + if ( als && idv>=0 && (dv=((uint16_t*)b->gi[idv].data)[isample]) ) + { + if ( max_dv < dv ) max_dv = dv; + } + + // For HWE annotation; multiple ALT alleles treated as one + if ( !als ) nRR++; + else if ( !(als>>3&7) || !(als&7) ) nRA++; + else nAA++; gts |= 1<<(als>>3&7) | 1<<(als&7); ac[ als>>3&7 ]++; ac[ als&7 ]++; } + free(pdg); bcf_fit_alt(b,max_als); + // The VCF spec is ambiguous about QUAL: is it the probability of anything else + // (that is QUAL(non-ref) = P(ref)+P(any non-ref other than ALT)) or is it + // QUAL(non-ref)=P(ref) and QUAL(ref)=1-P(ref)? Assuming the latter. + b->qual = gts>1 ? -4.343*(ref_lk - lk_sum) : -4.343*log(1-exp(ref_lk - lk_sum)); + if ( b->qual>999 ) b->qual = 999; // Prepare BCF for output: ref, alt, filter, info, format memset(&s, 0, sizeof(kstring_t)); kputc('\0', &s); @@ -454,6 +602,13 @@ int call_multiallelic_gt(bcf1_t *b, bcf_p1aux_t *ma, double threshold) { if ( a.is_tested) ksprintf(&s, ";PV4=%.2g,%.2g,%.2g,%.2g", a.p[0], a.p[1], a.p[2], a.p[3]); ksprintf(&s, ";DP4=%d,%d,%d,%d;MQ=%d", a.d[0], a.d[1], a.d[2], a.d[3], a.mq); + ksprintf(&s, ";QBD=%e", b->qual/(a.d[0] + a.d[1] + a.d[2] + a.d[3])); + if ( max_dv ) ksprintf(&s, ";MDV=%d", max_dv); + } + if ( nAA+nRA ) + { + double hwe = calc_hwe(nAA, nRR, nRA); + ksprintf(&s, ";HWE=%e", hwe); } kputc('\0', &s); rm_info(&s, "I16="); @@ -462,12 +617,8 @@ int call_multiallelic_gt(bcf1_t *b, bcf_p1aux_t *ma, double threshold) kputs(b->fmt, &s); kputc('\0', &s); free(b->str); b->m_str = s.m; b->l_str = s.l; b->str = s.s; - b->qual = gts>1 ? -4.343*(ref_lk - lk_sum) : -4.343*(max_lk - lk_sum); - if ( b->qual>999 ) b->qual = 999; bcf_sync(b); - - free(pdg); return gts; } diff --git a/bcftools/prob1.h b/bcftools/prob1.h index eb0b145..6f93155 100644 --- a/bcftools/prob1.h +++ b/bcftools/prob1.h @@ -33,7 +33,7 @@ extern "C" { void bcf_p1_destroy(bcf_p1aux_t *ma); void bcf_p1_set_ploidy(bcf1_t *b, bcf_p1aux_t *ma); int bcf_p1_cal(const bcf1_t *b, int do_contrast, bcf_p1aux_t *ma, bcf_p1rst_t *rst); - int call_multiallelic_gt(bcf1_t *b, bcf_p1aux_t *ma, double threshold); + int call_multiallelic_gt(bcf1_t *b, bcf_p1aux_t *ma, double threshold, int var_only); int bcf_p1_call_gt(const bcf_p1aux_t *ma, double f0, int k); void bcf_p1_dump_afs(bcf_p1aux_t *ma); int bcf_p1_read_prior(bcf_p1aux_t *ma, const char *fn); diff --git a/sam_header.c b/sam_header.c index ddc2c38..88b6a1c 100644 --- a/sam_header.c +++ b/sam_header.c @@ -770,4 +770,41 @@ void *sam_header_merge(int n, const void **_dicts) return out_dict; } +char **sam_header2tbl_n(const void *dict, const char type[2], const char *tags[], int *n) +{ + int nout = 0; + char **out = NULL; + + *n = 0; + list_t *l = (list_t *)dict; + if ( !l ) return NULL; + + int i, ntags = 0; + while ( tags[ntags] ) ntags++; + + while (l) + { + HeaderLine *hline = l->data; + if ( hline->type[0]!=type[0] || hline->type[1]!=type[1] ) + { + l = l->next; + continue; + } + out = (char**) realloc(out, sizeof(char*)*(nout+1)*ntags); + for (i=0; ivalue; + } + nout++; + l = l->next; + } + *n = nout; + return out; +} diff --git a/sam_header.h b/sam_header.h index ebea12f..4b0cb03 100644 --- a/sam_header.h +++ b/sam_header.h @@ -19,6 +19,23 @@ extern "C" { void *sam_header2key_val(void *iter, const char type[2], const char key_tag[2], const char value_tag[2], const char **key, const char **value); char **sam_header2list(const void *_dict, char type[2], char key_tag[2], int *_n); + /* + // Usage example + int i, j, n; + const char *tags[] = {"SN","LN","UR","M5",NULL}; + void *dict = sam_header_parse2(bam->header->text); + char **tbl = sam_header2tbl_n(h->dict, "SQ", tags, &n); + for (i=0; i #include #include +#include #include "sam_header.h" #include "sam.h" #include "faidx.h" @@ -273,9 +274,9 @@ int main_samview(int argc, char *argv[]) } view_end: - if (is_count && ret == 0) { - printf("%ld\n", count); // compilers on some platforms may complain about printing int64_t with %ld - } + if (is_count && ret == 0) + printf("%" PRId64 "\n", count); + // close files, free and return free(fn_list); free(fn_ref); free(fn_out); free(g_library); free(g_rg); free(fn_rg); if (g_bed) bed_destroy(g_bed); diff --git a/samtools.1 b/samtools.1 index 118a6e8..5923abd 100644 --- a/samtools.1 +++ b/samtools.1 @@ -1,4 +1,4 @@ -.TH samtools 1 "2 September 2011" "samtools-0.1.18" "Bioinformatics tools" +.TH samtools 1 "15 March 2013" "samtools-0.1.19" "Bioinformatics tools" .SH NAME .PP samtools - Utilities for the Sequence Alignment/Map (SAM) format @@ -30,7 +30,7 @@ bcftools index in.bcf .PP bcftools view in.bcf chr2:100-200 > out.vcf .PP -bcftools view -vc in.bcf > out.vcf 2> out.afs +bcftools view -Nvm0.99 in.bcf > out.vcf 2> out.afs .SH DESCRIPTION .PP @@ -140,17 +140,38 @@ to another samtools command. .TP .B tview -samtools tview [ref.fasta] +samtools tview +.RB [ \-p +.IR chr:pos ] +.RB [ \-s +.IR STR ] +.RB [ \-d +.IR display ] +.RI +.RI [ref.fasta] Text alignment viewer (based on the ncurses library). In the viewer, press `?' for help and press `g' to check the alignment start from a region in the format like `chr10:10,000,000' or `=10,000,000' when viewing the same reference sequence. +.B Options: +.RS +.TP 14 +.BI -d \ display +Output as (H)tml or (C)urses or (T)ext +.TP +.BI -p \ chr:pos +Go directly to this position +.TP +.BI -s \ STR +Display only reads from this sample or read group +.RE + .TP .B mpileup -.B samtools mpileup -.RB [ \-EBug ] +samtools mpileup +.RB [ \-EBugp ] .RB [ \-C .IR capQcoef ] .RB [ \-r @@ -297,6 +318,10 @@ Phred-scaled gap open sequencing error probability. Reducing .I INT leads to more indel calls. [40] .TP +.BI -p +Apply -m and -F thresholds per sample to increase sensitivity of calling. +By default both options are applied to reads pooled from all samples. +.TP .BI -P \ STR Comma dilimited list of platforms (determined by .BR @RG-PL ) @@ -577,6 +602,8 @@ Minimum base quality to be used in het calling. [13] .IR mutRate ] .RB [ \-p .IR varThres ] +.RB [ \-m +.IR varThres ] .RB [ \-P .IR prior ] .RB [ \-1 @@ -659,6 +686,12 @@ Call per-sample genotypes at variant sites (force -c) .BI -i \ FLOAT Ratio of INDEL-to-SNP mutation rate [0.15] .TP +.BI -m \ FLOAT +New model for improved multiallelic and rare-variant calling. Another +ALT allele is accepted if P(chi^2) of LRT exceeds the FLOAT threshold. The +parameter seems robust and the actual value usually does not affect the results +much; a good value to use is 0.99. This is the recommended calling method. [0] +.TP .BI -p \ FLOAT A site is considered to be a variant if P(ref|D) +.br +Samtools latest source: +.br +VCFtools website with stable link to VCF specification: +.br +HTSlib website: