From: Petr Danecek Date: Wed, 12 Sep 2012 14:45:30 +0000 (+0100) Subject: Merge remote branch 'remotes/pd3/master' X-Git-Url: https://git.donarmstrong.com/?a=commitdiff_plain;h=a71383ee344af03b1d8084a7ed8752a12f68e8e7;hp=2e9880c7bfc5b88c6a0bebefd310157a33c84a36;p=samtools.git Merge remote branch 'remotes/pd3/master' Conflicts: misc/bamcheck.c --- diff --git a/bam2bcf.c b/bam2bcf.c index 6ac5dce..a51a406 100644 --- a/bam2bcf.c +++ b/bam2bcf.c @@ -26,6 +26,7 @@ bcf_callaux_t *bcf_call_init(double theta, int min_baseQ) bca->e = errmod_init(1. - theta); bca->min_frac = 0.002; bca->min_support = 1; + bca->per_sample_flt = 0; return bca; } @@ -54,7 +55,6 @@ int bcf_call_glfgen(int _n, const bam_pileup1_t *pl, int ref_base, bcf_callaux_t bca->bases = (uint16_t*)realloc(bca->bases, 2 * bca->max_bases); } // fill the bases array - memset(r, 0, sizeof(bcf_callret1_t)); for (i = n = r->n_supp = 0; i < _n; ++i) { const bam_pileup1_t *p = pl + i; int q, b, mapQ, baseQ, is_diff, min_dist, seqQ; @@ -87,9 +87,9 @@ int bcf_call_glfgen(int _n, const bam_pileup1_t *pl, int ref_base, bcf_callaux_t if (min_dist > p->qpos) min_dist = p->qpos; if (min_dist > CAP_DIST) min_dist = CAP_DIST; r->anno[1<<2|is_diff<<1|0] += baseQ; - r->anno[1<<2|is_diff<<1|1] += baseQ * baseQ; + r->anno[1<<2|is_diff<<1|1] += baseQ * baseQ; // FIXME: signed int is not enough for thousands of samples r->anno[2<<2|is_diff<<1|0] += mapQ; - r->anno[2<<2|is_diff<<1|1] += mapQ * mapQ; + r->anno[2<<2|is_diff<<1|1] += mapQ * mapQ; // FIXME: signed int is not enough for thousands of samples r->anno[3<<2|is_diff<<1|0] += min_dist; r->anno[3<<2|is_diff<<1|1] += min_dist * min_dist; } @@ -195,6 +195,8 @@ int bcf_call_combine(int n, const bcf_callret1_t *calls, int ref_base /*4-bit*/, for (i = 0; i < n; ++i) for (j = 0; j < 4; ++j) qsum[j] += calls[i].qsum[j]; + int qsum_tot=0; + for (j=0; j<4; j++) { qsum_tot += qsum[j]; call->qsum[j] = 0; } for (j = 0; j < 4; ++j) qsum[j] = qsum[j] << 2 | j; // find the top 2 alleles for (i = 1; i < 4; ++i) // insertion sort @@ -206,9 +208,15 @@ int bcf_call_combine(int n, const bcf_callret1_t *calls, int ref_base /*4-bit*/, call->a[0] = ref4; for (i = 3, j = 1; i >= 0; --i) { if ((qsum[i]&3) != ref4) { - if (qsum[i]>>2 != 0) call->a[j++] = qsum[i]&3; + if (qsum[i]>>2 != 0) + { + if ( j<4 ) call->qsum[j] = (float)(qsum[i]>>2)/qsum_tot; // ref N can make j>=4 + call->a[j++] = qsum[i]&3; + } else break; } + else + call->qsum[0] = (float)(qsum[i]>>2)/qsum_tot; } if (ref_base >= 0) { // for SNPs, find the "unseen" base if (((ref4 < 4 && j < 4) || (ref4 == 4 && j < 5)) && i >= 0) @@ -305,12 +313,13 @@ int bcf_call2bcf(int tid, int pos, bcf_call_t *bc, bcf1_t *b, bcf_callret1_t *bc } kputc('\0', &s); // INFO - if (bc->ori_ref < 0) kputs("INDEL;", &s); + if (bc->ori_ref < 0) ksprintf(&s,"INDEL;IS=%d,%f;", bca->max_support, bca->max_frac); kputs("DP=", &s); kputw(bc->ori_depth, &s); kputs(";I16=", &s); for (i = 0; i < 16; ++i) { if (i) kputc(',', &s); kputw(bc->anno[i], &s); } + ksprintf(&s,";QS=%f,%f,%f,%f", bc->qsum[0],bc->qsum[1],bc->qsum[2],bc->qsum[3]); if (bc->vdb != 1) ksprintf(&s, ";VDB=%.4f", bc->vdb); kputc('\0', &s); diff --git a/bam2bcf.h b/bam2bcf.h index a4d8ca5..8ac6b79 100644 --- a/bam2bcf.h +++ b/bam2bcf.h @@ -14,8 +14,9 @@ typedef struct __bcf_callaux_t { int capQ, min_baseQ; int openQ, extQ, tandemQ; // for indels - int min_support; // for collecting indel candidates - double min_frac; // for collecting indel candidates + int min_support, max_support; // for collecting indel candidates + double min_frac, max_frac; // for collecting indel candidates + int per_sample_flt; // indel filtering strategy // for internal uses int max_bases; int indel_types[4]; @@ -35,6 +36,7 @@ typedef struct { typedef struct { int a[5]; // alleles: ref, alt, alt2, alt3 + float qsum[4]; int n, n_alleles, shift, ori_ref, unseen; int n_supp; // number of supporting non-reference reads int anno[16], depth, ori_depth; diff --git a/bam2bcf_indel.c b/bam2bcf_indel.c index ab9e83c..9aea5f9 100644 --- a/bam2bcf_indel.c +++ b/bam2bcf_indel.c @@ -142,37 +142,48 @@ int bcf_call_gap_prep(int n, int *n_plp, bam_pileup1_t **plp, int pos, bcf_calla if (s == n) return -1; // there is no indel at this position. for (s = N = 0; s < n; ++s) N += n_plp[s]; // N is the total number of reads { // find out how many types of indels are present - int m, n_alt = 0, n_tot = 0; + bca->max_support = bca->max_frac = 0; + int m, n_alt = 0, n_tot = 0, indel_support_ok = 0; uint32_t *aux; aux = calloc(N + 1, 4); m = max_rd_len = 0; aux[m++] = MINUS_CONST; // zero indel is always a type for (s = 0; s < n; ++s) { + int na = 0, nt = 0; for (i = 0; i < n_plp[s]; ++i) { const bam_pileup1_t *p = plp[s] + i; if (rghash == 0 || p->aux == 0) { - ++n_tot; + ++nt; if (p->indel != 0) { - ++n_alt; + ++na; aux[m++] = MINUS_CONST + p->indel; } } j = bam_cigar2qlen(&p->b->core, bam1_cigar(p->b)); if (j > max_rd_len) max_rd_len = j; } + float frac = (float)na/nt; + if ( !indel_support_ok && na >= bca->min_support && frac >= bca->min_frac ) + indel_support_ok = 1; + if ( na > bca->max_support && frac > 0 ) bca->max_support = na, bca->max_frac = frac; + n_alt += na; + n_tot += nt; } // To prevent long stretches of N's to be mistaken for indels (sometimes thousands of bases), // check the number of N's in the sequence and skip places where half or more reference bases are Ns. int nN=0; for (i=pos; i-posi ) return -1; + if ( nN*2>i ) { free(aux); return -1; } ks_introsort(uint32_t, m, aux); // squeeze out identical types for (i = 1, n_types = 1; i < m; ++i) if (aux[i] != aux[i-1]) ++n_types; - if (n_types == 1 || (double)n_alt / n_tot < bca->min_frac || n_alt < bca->min_support) { // then skip - free(aux); return -1; - } + // Taking totals makes it hard to call rare indels + if ( !bca->per_sample_flt ) + indel_support_ok = ( (float)n_alt / n_tot < bca->min_frac || n_alt < bca->min_support ) ? 0 : 1; + if ( n_types == 1 || !indel_support_ok ) { // then skip + free(aux); return -1; + } if (n_types >= 64) { free(aux); if (bam_verbose >= 2) diff --git a/bam_plcmd.c b/bam_plcmd.c index 07f0a4f..ed00d2e 100644 --- a/bam_plcmd.c +++ b/bam_plcmd.c @@ -72,6 +72,7 @@ static inline void pileup_seq(const bam_pileup1_t *p, int pos, int ref_len, cons #define MPLP_IGNORE_RG 0x2000 #define MPLP_PRINT_POS 0x4000 #define MPLP_PRINT_MAPQ 0x8000 +#define MPLP_PER_SAMPLE 0x10000 void *bed_read(const char *fn); void bed_destroy(void *_h); @@ -271,6 +272,7 @@ static int mpileup(mplp_conf_t *conf, int n, char **fn) bca->openQ = conf->openQ, bca->extQ = conf->extQ, bca->tandemQ = conf->tandemQ; bca->min_frac = conf->min_frac; bca->min_support = conf->min_support; + bca->per_sample_flt = conf->flag & MPLP_PER_SAMPLE; } if (tid0 >= 0 && conf->fai) { // region is set ref = faidx_fetch_seq(conf->fai, h->target_name[tid0], 0, 0x7fffffff, &ref_len); @@ -449,7 +451,7 @@ int bam_mpileup(int argc, char *argv[]) mplp.openQ = 40; mplp.extQ = 20; mplp.tandemQ = 100; mplp.min_frac = 0.002; mplp.min_support = 1; mplp.flag = MPLP_NO_ORPHAN | MPLP_REALN; - while ((c = getopt(argc, argv, "Agf:r:l:M:q:Q:uaRC:BDSd:L:b:P:o:e:h:Im:F:EG:6OsV")) >= 0) { + while ((c = getopt(argc, argv, "Agf:r:l:M:q:Q:uaRC:BDSd:L:b:P:po:e:h:Im:F:EG:6OsV")) >= 0) { switch (c) { case 'f': mplp.fai = fai_load(optarg); @@ -459,6 +461,7 @@ int bam_mpileup(int argc, char *argv[]) case 'r': mplp.reg = strdup(optarg); break; case 'l': mplp.bed = bed_read(optarg); break; case 'P': mplp.pl_list = strdup(optarg); break; + case 'p': mplp.flag |= MPLP_PER_SAMPLE; break; case 'g': mplp.flag |= MPLP_GLF; break; case 'u': mplp.flag |= MPLP_NO_COMP | MPLP_GLF; break; case 'a': mplp.flag |= MPLP_NO_ORPHAN | MPLP_REALN; break; @@ -532,6 +535,7 @@ int bam_mpileup(int argc, char *argv[]) fprintf(stderr, " -L INT max per-sample depth for INDEL calling [%d]\n", mplp.max_indel_depth); fprintf(stderr, " -m INT minimum gapped reads for indel candidates [%d]\n", mplp.min_support); fprintf(stderr, " -o INT Phred-scaled gap open sequencing error probability [%d]\n", mplp.openQ); + fprintf(stderr, " -p apply -m and -F per-sample to increase sensitivity\n"); fprintf(stderr, " -P STR comma separated list of platforms for indels [all]\n"); fprintf(stderr, "\n"); fprintf(stderr, "Notes: Assuming diploid individuals.\n\n"); diff --git a/bam_tview.c b/bam_tview.c index 1967b7c..f8a1f2c 100644 --- a/bam_tview.c +++ b/bam_tview.c @@ -25,6 +25,9 @@ #include "faidx.h" #include "bam2bcf.h" #include "sam_header.h" +#include "khash.h" + +KHASH_MAP_INIT_STR(kh_rg, const char *) char bam_aux_getCEi(bam1_t *b, int i); char bam_aux_getCSi(bam1_t *b, int i); @@ -57,8 +60,7 @@ typedef struct { int ccol, last_pos, row_shift, base_for, color_for, is_dot, l_ref, ins, no_skip, show_name; char *ref; - char *sample; //TODO: multiple samples and read groups - void *rg2sm; + khash_t(kh_rg) *rg_hash; } tview_t; int tv_pl_func(uint32_t tid, uint32_t pos, int n, const bam_pileup1_t *pl, void *data) @@ -216,9 +218,28 @@ tview_t *tv_init(const char *fn, const char *fn_fa, char *samples) if ( samples ) { - tv->sample = samples; - tv->header->dict = sam_header_parse2(tv->header->text); - tv->rg2sm = sam_header2tbl(tv->header->dict, "RG", "ID", "SM"); + if ( !tv->header->dict ) tv->header->dict = sam_header_parse2(tv->header->text); + void *iter = tv->header->dict; + const char *key, *val; + int n = 0; + tv->rg_hash = kh_init(kh_rg); + while ( (iter = sam_header2key_val(iter, "RG","ID","SM", &key, &val)) ) + { + if ( !strcmp(samples,key) || (val && !strcmp(samples,val)) ) + { + khiter_t k = kh_get(kh_rg, tv->rg_hash, key); + if ( k != kh_end(tv->rg_hash) ) continue; + int ret; + k = kh_put(kh_rg, tv->rg_hash, key, &ret); + kh_value(tv->rg_hash, k) = val; + n++; + } + } + if ( !n ) + { + fprintf(stderr,"The sample or read group \"%s\" not present.\n", samples); + exit(-1); + } } initscr(); @@ -262,13 +283,12 @@ void tv_destroy(tview_t *tv) int tv_fetch_func(const bam1_t *b, void *data) { tview_t *tv = (tview_t*)data; - if ( tv->sample ) + if ( tv->rg_hash ) { const uint8_t *rg = bam_aux_get(b, "RG"); if ( !rg ) return 0; - const char *sm = sam_tbl_get(tv->rg2sm, (const char*)(rg + 1)); - if ( !sm ) return 0; - if ( strcmp(sm,tv->sample) ) return 0; + khiter_t k = kh_get(kh_rg, tv->rg_hash, (const char*)(rg + 1)); + if ( k == kh_end(tv->rg_hash) ) return 0; } if (tv->no_skip) { uint32_t *cigar = bam1_cigar(b); // this is cheating... @@ -442,7 +462,7 @@ void error(const char *format, ...) fprintf(stderr, "Usage: bamtk tview [options] [ref.fasta]\n"); fprintf(stderr, "Options:\n"); fprintf(stderr, " -p chr:pos go directly to this position\n"); - fprintf(stderr, " -s STR display only reads from this sample\n"); + fprintf(stderr, " -s STR display only reads from this sample or grou\n"); fprintf(stderr, "\n\n"); } else diff --git a/bcftools/bcf.c b/bcftools/bcf.c index 0524408..24728db 100644 --- a/bcftools/bcf.c +++ b/bcftools/bcf.c @@ -240,31 +240,24 @@ void bcf_fmt_core(const bcf_hdr_t *h, bcf1_t *b, kstring_t *s) } } for (j = 0; j < h->n_smpl; ++j) { - - // Determine GT with maximum PL (multiple ALT sites only) - int imax=-1; - if ( iPL!=-1 ) { - uint8_t *d = (uint8_t*)b->gi[iPL].data + j * x; - int k,identical=1; - imax=0; - for (k=1; kploidy ? b->ploidy[j] : 2; kputc('\t', s); for (i = 0; i < b->n_gi; ++i) { if (i) kputc(':', s); if (b->gi[i].fmt == bcf_str2int("PL", 2)) { uint8_t *d = (uint8_t*)b->gi[i].data + j * x; int k; - for (k = 0; k < x; ++k) { - if (k > 0) kputc(',', s); - kputw(d[k], s); - } + if ( ploidy==1 ) + for (k=0; kn_alleles; k++) + { + if (k>0) kputc(',', s); + kputw(d[(k+1)*(k+2)/2-1], s); + } + else + for (k = 0; k < x; ++k) { + if (k > 0) kputc(',', s); + kputw(d[k], s); + } } else if (b->gi[i].fmt == bcf_str2int("DP", 2) || b->gi[i].fmt == bcf_str2int("DV", 2)) { kputw(((uint16_t*)b->gi[i].data)[j], s); } else if (b->gi[i].fmt == bcf_str2int("GQ", 2)) { @@ -273,28 +266,22 @@ void bcf_fmt_core(const bcf_hdr_t *h, bcf1_t *b, kstring_t *s) kputw(((int32_t*)b->gi[i].data)[j], s); } else if (b->gi[i].fmt == bcf_str2int("GT", 2)) { int y = ((uint8_t*)b->gi[i].data)[j]; - if ( y>>7&1 ) - kputsn("./.", 3, s); - else if ( imax==-1 ) + if ( ploidy==1 ) { - kputc('0' + (y>>3&7), s); - kputc("/|"[y>>6&1], s); - kputc('0' + (y&7), s); + if ( y>>7&1 ) + kputc('.', s); + else + kputc('0' + (y>>3&7), s); } else { - // Arguably, the while loop will be faster than two sqrts - int n = 0; - int row = 1; - while ( n>7&1 ) + kputsn("./.", 3, s); + else { + kputc('0' + (y>>3&7), s); + kputc("/|"[y>>6&1], s); + kputc('0' + (y&7), s); } - row--; - kputw(imax-n+row, s); - kputc("/|"[y>>6&1], s); - kputw(row, s); } } else if (b->gi[i].fmt == bcf_str2int("GL", 2)) { float *d = (float*)b->gi[i].data + j * x; @@ -336,6 +323,50 @@ int bcf_append_info(bcf1_t *b, const char *info, int l) return 0; } +int remove_tag(char *str, const char *tag, char delim) +{ + char *tmp = str, *p; + int len_diff = 0, ori_len = strlen(str); + while ( *tmp && (p = strstr(tmp,tag)) ) + { + if ( p>str ) + { + if ( *(p-1)!=delim ) { tmp=p+1; continue; } // shared substring + p--; + } + char *q=p+1; + while ( *q && *q!=delim ) q++; + if ( p==str && *q ) q++; // the tag is first, don't move the delim char + len_diff += q-p; + if ( ! *q ) { *p = 0; break; } // the tag was last, no delim follows + else + memmove(p,q,ori_len-(int)(p-str)-(int)(q-p)); // *q==delim + } + if ( len_diff==ori_len ) + str[0]='.', str[1]=0, len_diff--; + + return len_diff; +} + + +void rm_info(kstring_t *s, const char *key) +{ + char *p = s->s; + int n = 0; + while ( n<4 ) + { + if ( !*p ) n++; + p++; + } + char *q = p+1; + while ( *q && q-s->sl ) q++; + + int nrm = remove_tag(p, key, ';'); + if ( nrm ) + memmove(q-nrm, q, s->s+s->l-q+1); + s->l -= nrm; +} + int bcf_cpy(bcf1_t *r, const bcf1_t *b) { char *t1 = r->str; diff --git a/bcftools/bcf.h b/bcftools/bcf.h index 822ae5c..8c52451 100644 --- a/bcftools/bcf.h +++ b/bcftools/bcf.h @@ -73,6 +73,7 @@ typedef struct { bcf_ginfo_t *gi; // array of geno fields int n_alleles, n_smpl; // number of alleles and samples // derived info: ref, alt, flt, info, fmt (<-str), n_gi (<-fmt), n_alleles (<-alt), n_smpl (<-bcf_hdr_t::n_smpl) + uint8_t *ploidy; // ploidy of all samples; if NULL, ploidy of 2 is assumed. } bcf1_t; typedef struct { @@ -122,6 +123,10 @@ extern "C" { char *bcf_fmt(const bcf_hdr_t *h, bcf1_t *b); // append more info int bcf_append_info(bcf1_t *b, const char *info, int l); + // remove tag + int remove_tag(char *string, const char *tag, char delim); + // remove info tag, string is the kstring holder of bcf1_t.str + void rm_info(kstring_t *string, const char *key); // copy int bcf_cpy(bcf1_t *r, const bcf1_t *b); @@ -142,6 +147,8 @@ extern "C" { // keep the first n alleles and discard the rest int bcf_shrink_alt(bcf1_t *b, int n); + // keep the masked alleles and discard the rest + void bcf_fit_alt(bcf1_t *b, int mask); // convert GL to PL int bcf_gl2pl(bcf1_t *b); // if the site is an indel diff --git a/bcftools/bcfutils.c b/bcftools/bcfutils.c index 0eab4c1..1321c13 100644 --- a/bcftools/bcfutils.c +++ b/bcftools/bcfutils.c @@ -1,5 +1,6 @@ #include #include +#include #include "bcf.h" #include "kstring.h" #include "khash.h" @@ -66,6 +67,113 @@ int bcf_str2id_add(void *_hash, const char *str) return kh_val(hash, k); } +void bcf_fit_alt(bcf1_t *b, int mask) +{ + mask |= 1; // REF must be always present + + int i,j,nals=0; + for (i=0; in_alleles <= nals ) return; + + // update ALT, in principle any of the alleles can be removed + char *p; + if ( nals>1 ) + { + char *dst, *src; + int n=0, nalts=nals-1; + for (src=dst=p=b->alt, i=1; *p; p++) + { + if ( *p!=',' ) continue; + + if ( mask&1<=nalts ) { *dst=0; break; } + src = p+1; + } + if ( nalt, *p = '\0'; + p++; + memmove(p, b->flt, b->str + b->l_str - b->flt); + b->l_str -= b->flt - p; + + // update PL and GT + int ipl=-1, igt=-1; + for (i = 0; i < b->n_gi; ++i) + { + bcf_ginfo_t *g = b->gi + i; + if (g->fmt == bcf_str2int("PL", 2)) ipl = i; + if (g->fmt == bcf_str2int("GT", 2)) igt = i; + } + + // .. create mapping between old and new indexes + int npl = nals * (nals+1) / 2; + int *map = malloc(sizeof(int)*(npl>b->n_alleles ? npl : b->n_alleles)); + int kori=0,knew=0; + for (i=0; in_alleles; i++) + { + for (j=0; j<=i; j++) + { + int skip=0; + if ( i && !(mask&1<n_smpl; + for (i = 0; i < b->n_gi; ++i) + { + bcf_ginfo_t *g = b->gi + i; + if (g->fmt == bcf_str2int("PL", 2)) + { + g->len = npl; + uint8_t *d = (uint8_t*)g->data; + int ismpl, npl_ori = b->n_alleles * (b->n_alleles + 1) / 2; + for (knew=ismpl=0; ismpln_alleles; i++) + map[i] = mask&1<gi[igt].data)[i]; + int a1 = (gt>>3)&7; + int a2 = gt&7; + assert( map[a1]>=0 && map[a2]>=0 ); + ((uint8_t*)b->gi[igt].data)[i] = ((1<<7|1<<6)>) | map[a1]<<3 | map[a2]; + } + free(map); + b->n_alleles = nals; + bcf_sync(b); +} + int bcf_shrink_alt(bcf1_t *b, int n) { char *p; diff --git a/bcftools/call1.c b/bcftools/call1.c index 6c53008..22ff2ac 100644 --- a/bcftools/call1.c +++ b/bcftools/call1.c @@ -40,7 +40,7 @@ typedef struct { uint32_t *trio_aux; char *prior_file, **subsam, *fn_dict; uint8_t *ploidy; - double theta, pref, indel_frac, min_perm_p, min_smpl_frac, min_lrt; + double theta, pref, indel_frac, min_perm_p, min_smpl_frac, min_lrt, min_ma_lrt; void *bed; } viewconf_t; @@ -48,11 +48,6 @@ void *bed_read(const char *fn); void bed_destroy(void *_h); int bed_overlap(const void *_h, const char *chr, int beg, int end); -typedef struct { - double p[4]; - int mq, depth, is_tested, d[4]; -} anno16_t; - static double ttest(int n1, int n2, int a[4]) { extern double kf_betai(double a, double b, double x); @@ -83,7 +78,7 @@ static int test16_core(int anno[16], anno16_t *a) return 0; } -static int test16(bcf1_t *b, anno16_t *a) +int test16(bcf1_t *b, anno16_t *a) { char *p; int i, anno[16]; @@ -100,17 +95,6 @@ static int test16(bcf1_t *b, anno16_t *a) return test16_core(anno, a); } -static void rm_info(bcf1_t *b, const char *key) -{ - char *p, *q; - if ((p = strstr(b->info, key)) == 0) return; - for (q = p; *q && *q != ';'; ++q); - if (p > b->info && *(p-1) == ';') --p; - memmove(p, q, b->l_str - (q - b->str)); - b->l_str -= q - p; - bcf_sync(b); -} - static int update_bcf1(bcf1_t *b, const bcf_p1aux_t *pa, const bcf_p1rst_t *pr, double pref, int flag, double em[10], int cons_llr, int64_t cons_gt) { kstring_t s; @@ -119,7 +103,7 @@ static int update_bcf1(bcf1_t *b, const bcf_p1aux_t *pa, const bcf_p1rst_t *pr, anno16_t a; has_I16 = test16(b, &a) >= 0? 1 : 0; - rm_info(b, "I16="); // FIXME: probably this function has a bug. If I move it below, I16 will not be removed! + //rm_info(b, "I16="); // FIXME: probably this function has a bug. If I move it below, I16 will not be removed! memset(&s, 0, sizeof(kstring_t)); kputc('\0', &s); kputs(b->ref, &s); kputc('\0', &s); @@ -170,6 +154,8 @@ static int update_bcf1(bcf1_t *b, const bcf_p1aux_t *pa, const bcf_p1rst_t *pr, } if (has_I16 && a.is_tested) ksprintf(&s, ";PV4=%.2g,%.2g,%.2g,%.2g", a.p[0], a.p[1], a.p[2], a.p[3]); kputc('\0', &s); + rm_info(&s, "QS="); + rm_info(&s, "I16="); kputs(b->fmt, &s); kputc('\0', &s); free(b->str); b->m_str = s.m; b->l_str = s.l; b->str = s.s; @@ -250,6 +236,12 @@ static void write_header(bcf_hdr_t *h) kputs("##INFO=\n", &str); if (!strstr(str.s, "##INFO=\n", &str); + if (!strstr(str.s, "##INFO=\n", &str); + if (!strstr(str.s, "##INFO=\n", &str); + if (!strstr(str.s, "##INFO=\n", &str); if (!strstr(str.s, "##INFO=\n", &str); if (!strstr(str.s, "##INFO== 0) { + while ((c = getopt(argc, argv, "FN1:l:cC:eHAGvbSuP:t:p:QgLi:IMs:D:U:X:d:T:Ywm:")) >= 0) { switch (c) { case '1': vc.n1 = atoi(optarg); break; case 'l': vc.bed = bed_read(optarg); break; @@ -341,6 +333,7 @@ int bcfview(int argc, char *argv[]) case 'w': vc.flag |= VC_INDEL_ONLY; break; case 'M': vc.flag |= VC_ANNO_MAX; break; case 'Y': vc.flag |= VC_QCNT; break; + case 'm': vc.min_ma_lrt = atof(optarg); break; case 't': vc.theta = atof(optarg); break; case 'p': vc.pref = atof(optarg); break; case 'i': vc.indel_frac = atof(optarg); break; @@ -396,6 +389,7 @@ int bcfview(int argc, char *argv[]) fprintf(stderr, " -g call genotypes at variant sites (force -c)\n"); fprintf(stderr, " -i FLOAT indel-to-substitution ratio [%.4g]\n", vc.indel_frac); fprintf(stderr, " -I skip indels\n"); + fprintf(stderr, " -m FLOAT alternative model for multiallelic and rare-variant calling, include if P(chi^2)>=FLOAT\n"); fprintf(stderr, " -p FLOAT variant if P(ref|D)=0 ) + { + bcf_p1_set_ploidy(b, p1); // could be improved: do this per site to allow pseudo-autosomal regions + int gts = call_multiallelic_gt(b,p1,vc.min_ma_lrt); + if ( gts<=1 && vc.flag & VC_VARONLY ) continue; + } + else if (vc.flag & VC_CALL) { // call variants bcf_p1rst_t pr; int calret = bcf_p1_cal(b, (em[7] >= 0 && em[7] < vc.min_lrt), p1, &pr); if (n_processed % 100000 == 0) { diff --git a/bcftools/prob1.c b/bcftools/prob1.c index 83bd8e2..e3d6b5e 100644 --- a/bcftools/prob1.c +++ b/bcftools/prob1.c @@ -4,7 +4,9 @@ #include #include #include +#include #include "prob1.h" +#include "kstring.h" #include "kseq.h" KSTREAM_INIT(gzFile, gzread, 16384) @@ -174,6 +176,13 @@ int bcf_p1_set_n1(bcf_p1aux_t *b, int n1) return 0; } +void bcf_p1_set_ploidy(bcf1_t *b, bcf_p1aux_t *ma) +{ + // bcf_p1aux_t fields are not visible outside of prob1.c, hence this wrapper. + // Ideally, this should set ploidy per site to allow pseudo-autosomal regions + b->ploidy = ma->ploidy; +} + void bcf_p1_destroy(bcf_p1aux_t *ma) { if (ma) { @@ -191,54 +200,292 @@ void bcf_p1_destroy(bcf_p1aux_t *ma) } } -static int cal_pdg(const bcf1_t *b, bcf_p1aux_t *ma) +extern double kf_gammap(double s, double z); +int test16(bcf1_t *b, anno16_t *a); + +int call_multiallelic_gt(bcf1_t *b, bcf_p1aux_t *ma, double threshold) { - int i, j; - int n = (b->n_alleles+1)*b->n_alleles/2; - double *lk = alloca(n * sizeof(long)); - memset(lk, 0, sizeof(double) * n); - for (j = 0; j < ma->n; ++j) { - const uint8_t *pi = ma->PL + j * ma->PL_len; - double *pdg = ma->pdg + j * 3; - pdg[0] = ma->q2p[pi[2]]; pdg[1] = ma->q2p[pi[1]]; pdg[2] = ma->q2p[pi[0]]; - for (i=0; ialt; *p; p++) + { + if ( *p=='X' || p[0]=='.' ) break; + if ( p[0]==',' ) nals++; + } + if ( b->alt[0] && !*p ) nals++; + + if ( nals==1 ) return 1; + + if ( nals>4 ) + { + if ( *b->ref=='N' ) return 0; + fprintf(stderr,"Not ready for this, more than 4 alleles at %d: %s, %s\n", b->pos+1, b->ref,b->alt); + exit(1); + } + + // find PL and DP FORMAT indexes + uint8_t *pl = NULL; + int npl = 0, idp=-1; + int i; + for (i = 0; i < b->n_gi; ++i) + { + if (b->gi[i].fmt == bcf_str2int("PL", 2)) + { + pl = (uint8_t*)b->gi[i].data; + npl = b->gi[i].len; + } + if (b->gi[i].fmt == bcf_str2int("DP", 2)) idp=i; + } + if ( !pl ) return -1; + + assert(ma->q2p[0] == 1); + + // Init P(D|G) + int npdg = nals*(nals+1)/2; + double *pdg,*_pdg; + _pdg = pdg = malloc(sizeof(double)*ma->n*npdg); + for (i=0; in; i++) + { + int j; + double sum = 0; + for (j=0; jq2p[pl[j]]; + sum += _pdg[j]; + } + if ( sum ) + for (j=0; jinfo, "QS=")) == 0) { fprintf(stderr,"INFO/QS is required with -m, exiting\n"); exit(1); } + double qsum[4]; + if ( sscanf(p+3,"%lf,%lf,%lf,%lf",&qsum[0],&qsum[1],&qsum[2],&qsum[3])!=4 ) { fprintf(stderr,"Could not parse %s\n",p); exit(1); } + + + // Calculate the most likely combination of alleles + int ia,ib,ic, max_als=0, max_als2=0; + double ref_lk = 0, max_lk = INT_MIN, max_lk2 = INT_MIN, lk_sum = INT_MIN; + for (ia=0; ian; isample++) + { + double *p = pdg + isample*npdg; + // assert( log(p[iaa]) <= 0 ); + lk_tot += log(p[iaa]); + } + if ( ia==0 ) ref_lk = lk_tot; + if ( max_lklk_sum ? lk_tot + log(1+exp(lk_sum-lk_tot)) : lk_sum + log(1+exp(lk_tot-lk_sum)); + } + if ( nals>1 ) + { + for (ia=0; ian; isample++) + { + if ( b->ploidy && b->ploidy[isample]==1 ) continue; + double *p = pdg + isample*npdg; + //assert( log(fa*p[iaa] + fb*p[ibb] + fab*p[iab]) <= 0 ); + lk_tot += log(fa*p[iaa] + fb*p[ibb] + fab*p[iab]); + } + if ( max_lklk_sum ? lk_tot + log(1+exp(lk_sum-lk_tot)) : lk_sum + log(1+exp(lk_tot-lk_sum)); + } + } } + if ( nals>2 ) + { + for (ia=0; ian; isample++) + { + if ( b->ploidy && b->ploidy[isample]==1 ) continue; + double *p = pdg + isample*npdg; + //assert( log(fa*p[iaa] + fb*p[ibb] + fc*p[icc] + fab*p[iab] + fac*p[iac] + fbc*p[ibc]) <= 0 ); + lk_tot += log(fa*p[iaa] + fb*p[ibb] + fc*p[icc] + fab*p[iab] + fac*p[iac] + fbc*p[ibc]); + } + if ( max_lklk_sum ? lk_tot + log(1+exp(lk_sum-lk_tot)) : lk_sum + log(1+exp(lk_tot-lk_sum)); + } + } + } + } + - double norm=lk[0]; - for (i=1; in_alleles; i++) + // Should we add another allele, does it increase the likelihood significantly? + int n1=0, n2=0; + for (i=0; iis_indel ? b->n_alleles : b->n_alleles-1; - for (i=0; in_gi; + s.m = b->m_str; s.l = b->l_str - 1; s.s = b->str; + kputs(":GT:GQ", &s); kputc('\0', &s); + b->m_str = s.m; b->l_str = s.l; b->str = s.s; + bcf_sync(b); + + // Call GTs + int isample, gts=0, ac[4] = {0,0,0,0}; + for (isample = 0; isample < b->n_smpl; isample++) { - double pr=0; - int k=i*(i+1)/2; - for (j=0; j<=i; j++) { pr+=lk[k]; k++; } - for (j=i+1; jn_alleles; j++) { k=j*(j+1)/2+i; pr+=lk[k]; } - #if DBG - printf("%d\t%e\n", i,pr); - #endif - if (pmaxploidy ? b->ploidy[isample] : 2; + double *p = pdg + isample*npdg; + int ia, als = 0; + double lk = 0, lk_sum=0; + for (ia=0; ia lk ) { lk = _lk; als = ia<<3 | ia; } + lk_sum += _lk; + } + if ( ploidy==2 ) + { + for (ia=0; ia lk ) { lk = _lk; als = ib<<3 | ia; } + lk_sum += _lk; + } + } + } + lk = -log(1-lk/lk_sum)/0.2302585; + if ( idp>=0 && ((uint16_t*)b->gi[idp].data)[isample]==0 ) + { + ((uint8_t*)b->gi[old_n_gi].data)[isample] = 1<<7; + ((uint8_t*)b->gi[old_n_gi+1].data)[isample] = 0; + continue; + } + ((uint8_t*)b->gi[old_n_gi].data)[isample] = als; + ((uint8_t*)b->gi[old_n_gi+1].data)[isample] = lk<100 ? (int)lk : 99; + + gts |= 1<<(als>>3&7) | 1<<(als&7); + ac[ als>>3&7 ]++; + ac[ als&7 ]++; } - return i-1; + bcf_fit_alt(b,max_als); + + + // Prepare BCF for output: ref, alt, filter, info, format + memset(&s, 0, sizeof(kstring_t)); kputc('\0', &s); + kputs(b->ref, &s); kputc('\0', &s); + kputs(b->alt, &s); kputc('\0', &s); kputc('\0', &s); + { + int an=0, nalts=0; + for (i=0; i0 && ac[i] ) nalts++; + } + ksprintf(&s, "AN=%d;", an); + if ( nalts ) + { + kputs("AC=", &s); + for (i=1; i0 ) kputc(',', &s); + } + kputc(';', &s); + } + kputs(b->info, &s); + anno16_t a; + int has_I16 = test16(b, &a) >= 0? 1 : 0; + if (has_I16 ) + { + if ( a.is_tested) ksprintf(&s, ";PV4=%.2g,%.2g,%.2g,%.2g", a.p[0], a.p[1], a.p[2], a.p[3]); + ksprintf(&s, ";DP4=%d,%d,%d,%d;MQ=%d", a.d[0], a.d[1], a.d[2], a.d[3], a.mq); + } + kputc('\0', &s); + rm_info(&s, "I16="); + rm_info(&s, "QS="); + } + kputs(b->fmt, &s); kputc('\0', &s); + free(b->str); + b->m_str = s.m; b->l_str = s.l; b->str = s.s; + b->qual = gts>1 ? -4.343*(ref_lk - lk_sum) : -4.343*(max_lk - lk_sum); + if ( b->qual>999 ) b->qual = 999; + bcf_sync(b); + + + free(pdg); + return gts; +} + +static int cal_pdg(const bcf1_t *b, bcf_p1aux_t *ma) +{ + int i, j; + long *p, tmp; + p = alloca(b->n_alleles * sizeof(long)); + memset(p, 0, sizeof(long) * b->n_alleles); + for (j = 0; j < ma->n; ++j) { + const uint8_t *pi = ma->PL + j * ma->PL_len; + double *pdg = ma->pdg + j * 3; + pdg[0] = ma->q2p[pi[2]]; pdg[1] = ma->q2p[pi[1]]; pdg[2] = ma->q2p[pi[0]]; + for (i = 0; i < b->n_alleles; ++i) + p[i] += (int)pi[(i+1)*(i+2)/2-1]; + } + for (i = 0; i < b->n_alleles; ++i) p[i] = p[i]<<4 | i; + for (i = 1; i < b->n_alleles; ++i) // insertion sort + for (j = i; j > 0 && p[j] < p[j-1]; --j) + tmp = p[j], p[j] = p[j-1], p[j-1] = tmp; + for (i = b->n_alleles - 1; i >= 0; --i) + if ((p[i]&0xf) == 0) break; + return i; } + int bcf_p1_call_gt(const bcf_p1aux_t *ma, double f0, int k) { double sum, g[3]; diff --git a/bcftools/prob1.h b/bcftools/prob1.h index 0a51a0a..eb0b145 100644 --- a/bcftools/prob1.h +++ b/bcftools/prob1.h @@ -14,6 +14,11 @@ typedef struct { double cmp[3], p_chi2, lrt; // used by contrast2() } bcf_p1rst_t; +typedef struct { + double p[4]; + int mq, depth, is_tested, d[4]; +} anno16_t; + #define MC_PTYPE_FULL 1 #define MC_PTYPE_COND2 2 #define MC_PTYPE_FLAT 3 @@ -26,7 +31,9 @@ extern "C" { void bcf_p1_init_prior(bcf_p1aux_t *ma, int type, double theta); void bcf_p1_init_subprior(bcf_p1aux_t *ma, int type, double theta); void bcf_p1_destroy(bcf_p1aux_t *ma); + void bcf_p1_set_ploidy(bcf1_t *b, bcf_p1aux_t *ma); int bcf_p1_cal(const bcf1_t *b, int do_contrast, bcf_p1aux_t *ma, bcf_p1rst_t *rst); + int call_multiallelic_gt(bcf1_t *b, bcf_p1aux_t *ma, double threshold); int bcf_p1_call_gt(const bcf_p1aux_t *ma, double f0, int k); void bcf_p1_dump_afs(bcf_p1aux_t *ma); int bcf_p1_read_prior(bcf_p1aux_t *ma, const char *fn); diff --git a/misc/bamcheck.c b/misc/bamcheck.c index 66a6861..532d105 100644 --- a/misc/bamcheck.c +++ b/misc/bamcheck.c @@ -293,7 +293,7 @@ void count_indels(stats_t *stats,bam1_t *bam_line) if ( cig==1 ) { - int idx = is_fwd ? icycle : read_len-icycle; + int idx = is_fwd ? icycle : read_len-icycle-ncig; if ( idx<0 ) error("FIXME: read_len=%d vs icycle=%d\n", read_len,icycle); if ( idx >= stats->nbases || idx<0 ) error("FIXME: %d vs %d, %s:%d %s\n", idx,stats->nbases, stats->sam->header->target_name[bam_line->core.tid],bam_line->core.pos+1,bam1_qname(bam_line)); @@ -1043,6 +1043,8 @@ void output_stats(stats_t *stats) printf("# Indels per cycle. Use `grep ^IC | cut -f 2-` to extract this part. The columns are: cycle, number of insertions (fwd), .. (rev) , number of deletions (fwd), .. (rev)\n"); for (ilen=0; ilen<=stats->nbases; ilen++) { + // For deletions we print the index of the cycle before the deleted base (1-based) and for insertions + // the index of the cycle of the first inserted base (also 1-based) if ( stats->ins_cycles_1st[ilen]>0 || stats->ins_cycles_2nd[ilen]>0 || stats->del_cycles_1st[ilen]>0 || stats->del_cycles_2nd[ilen]>0 ) printf("IC\t%d\t%ld\t%ld\t%ld\t%ld\n", ilen+1, (long)stats->ins_cycles_1st[ilen], (long)stats->ins_cycles_2nd[ilen], (long)stats->del_cycles_1st[ilen], (long)stats->del_cycles_2nd[ilen]); } diff --git a/sam_header.c b/sam_header.c index d348d10..a1b5181 100644 --- a/sam_header.c +++ b/sam_header.c @@ -669,6 +669,36 @@ char **sam_header2list(const void *_dict, char type[2], char key_tag[2], int *_n return ret; } +void *sam_header2key_val(void *iter, const char type[2], const char key_tag[2], const char value_tag[2], const char **_key, const char **_value) +{ + list_t *l = iter; + if ( !l ) return NULL; + + while (l) + { + HeaderLine *hline = l->data; + if ( hline->type[0]!=type[0] || hline->type[1]!=type[1] ) + { + l = l->next; + continue; + } + + HeaderTag *key, *value; + key = header_line_has_tag(hline,key_tag); + value = header_line_has_tag(hline,value_tag); + if ( !key && !value ) + { + l = l->next; + continue; + } + + *_key = key->value; + *_value = value->value; + return l->next; + } + return l; +} + const char *sam_tbl_get(void *h, const char *key) { khash_t(str) *tbl = (khash_t(str)*)h; diff --git a/sam_header.h b/sam_header.h index e5c754f..ebea12f 100644 --- a/sam_header.h +++ b/sam_header.h @@ -10,6 +10,13 @@ extern "C" { void sam_header_free(void *header); char *sam_header_write(const void *headerDict); // returns a newly allocated string + /* + // Usage example + const char *key, *val; + void *iter = sam_header_parse2(bam->header->text); + while ( iter = sam_header_key_val(iter, "RG","ID","SM" &key,&val) ) printf("%s\t%s\n", key,val); + */ + void *sam_header2key_val(void *iter, const char type[2], const char key_tag[2], const char value_tag[2], const char **key, const char **value); char **sam_header2list(const void *_dict, char type[2], char key_tag[2], int *_n); void *sam_header2tbl(const void *dict, char type[2], char key_tag[2], char value_tag[2]); diff --git a/sam_view.c b/sam_view.c index c7a4774..5283098 100644 --- a/sam_view.c +++ b/sam_view.c @@ -14,7 +14,7 @@ KHASH_SET_INIT_STR(rg) // data passed to the bam_fetch callback is encapsulated in this struct. typedef struct { bam_header_t *header; - int *count; + int64_t *count; // int does overflow for very big BAMs } count_func_data_t; typedef khash_t(rg) *rghash_t; @@ -128,7 +128,7 @@ int main_samview(int argc, char *argv[]) { int c, is_header = 0, is_header_only = 0, is_bamin = 1, ret = 0, compress_level = -1, is_bamout = 0, is_count = 0; int of_type = BAM_OFDEC, is_long_help = 0, n_threads = 0; - int count = 0; + int64_t count = 0; samfile_t *in = 0, *out = 0; char in_mode[5], out_mode[5], *fn_out = 0, *fn_list = 0, *fn_ref = 0, *fn_rg = 0, *q; @@ -274,7 +274,7 @@ int main_samview(int argc, char *argv[]) view_end: if (is_count && ret == 0) { - printf("%d\n", count); + printf("%ld\n", count); // compilers on some platforms may complain about printing int64_t with %ld } // close files, free and return free(fn_list); free(fn_ref); free(fn_out); free(g_library); free(g_rg); free(fn_rg);