bca->e = errmod_init(1. - theta);
bca->min_frac = 0.002;
bca->min_support = 1;
+ bca->per_sample_flt = 0;
return bca;
}
bca->bases = (uint16_t*)realloc(bca->bases, 2 * bca->max_bases);
}
// fill the bases array
- memset(r, 0, sizeof(bcf_callret1_t));
for (i = n = r->n_supp = 0; i < _n; ++i) {
const bam_pileup1_t *p = pl + i;
int q, b, mapQ, baseQ, is_diff, min_dist, seqQ;
if (min_dist > p->qpos) min_dist = p->qpos;
if (min_dist > CAP_DIST) min_dist = CAP_DIST;
r->anno[1<<2|is_diff<<1|0] += baseQ;
- r->anno[1<<2|is_diff<<1|1] += baseQ * baseQ;
+ r->anno[1<<2|is_diff<<1|1] += baseQ * baseQ; // FIXME: signed int is not enough for thousands of samples
r->anno[2<<2|is_diff<<1|0] += mapQ;
- r->anno[2<<2|is_diff<<1|1] += mapQ * mapQ;
+ r->anno[2<<2|is_diff<<1|1] += mapQ * mapQ; // FIXME: signed int is not enough for thousands of samples
r->anno[3<<2|is_diff<<1|0] += min_dist;
r->anno[3<<2|is_diff<<1|1] += min_dist * min_dist;
}
for (i = 0; i < n; ++i)
for (j = 0; j < 4; ++j)
qsum[j] += calls[i].qsum[j];
+ int qsum_tot=0;
+ for (j=0; j<4; j++) { qsum_tot += qsum[j]; call->qsum[j] = 0; }
for (j = 0; j < 4; ++j) qsum[j] = qsum[j] << 2 | j;
// find the top 2 alleles
for (i = 1; i < 4; ++i) // insertion sort
call->a[0] = ref4;
for (i = 3, j = 1; i >= 0; --i) {
if ((qsum[i]&3) != ref4) {
- if (qsum[i]>>2 != 0) call->a[j++] = qsum[i]&3;
+ if (qsum[i]>>2 != 0)
+ {
+ if ( j<4 ) call->qsum[j] = (float)(qsum[i]>>2)/qsum_tot; // ref N can make j>=4
+ call->a[j++] = qsum[i]&3;
+ }
else break;
}
+ else
+ call->qsum[0] = (float)(qsum[i]>>2)/qsum_tot;
}
if (ref_base >= 0) { // for SNPs, find the "unseen" base
if (((ref4 < 4 && j < 4) || (ref4 == 4 && j < 5)) && i >= 0)
}
kputc('\0', &s);
// INFO
- if (bc->ori_ref < 0) kputs("INDEL;", &s);
+ if (bc->ori_ref < 0) ksprintf(&s,"INDEL;IS=%d,%f;", bca->max_support, bca->max_frac);
kputs("DP=", &s); kputw(bc->ori_depth, &s); kputs(";I16=", &s);
for (i = 0; i < 16; ++i) {
if (i) kputc(',', &s);
kputw(bc->anno[i], &s);
}
+ ksprintf(&s,";QS=%f,%f,%f,%f", bc->qsum[0],bc->qsum[1],bc->qsum[2],bc->qsum[3]);
if (bc->vdb != 1)
ksprintf(&s, ";VDB=%.4f", bc->vdb);
kputc('\0', &s);
typedef struct __bcf_callaux_t {
int capQ, min_baseQ;
int openQ, extQ, tandemQ; // for indels
- int min_support; // for collecting indel candidates
- double min_frac; // for collecting indel candidates
+ int min_support, max_support; // for collecting indel candidates
+ double min_frac, max_frac; // for collecting indel candidates
+ int per_sample_flt; // indel filtering strategy
// for internal uses
int max_bases;
int indel_types[4];
typedef struct {
int a[5]; // alleles: ref, alt, alt2, alt3
+ float qsum[4];
int n, n_alleles, shift, ori_ref, unseen;
int n_supp; // number of supporting non-reference reads
int anno[16], depth, ori_depth;
if (s == n) return -1; // there is no indel at this position.
for (s = N = 0; s < n; ++s) N += n_plp[s]; // N is the total number of reads
{ // find out how many types of indels are present
- int m, n_alt = 0, n_tot = 0;
+ bca->max_support = bca->max_frac = 0;
+ int m, n_alt = 0, n_tot = 0, indel_support_ok = 0;
uint32_t *aux;
aux = calloc(N + 1, 4);
m = max_rd_len = 0;
aux[m++] = MINUS_CONST; // zero indel is always a type
for (s = 0; s < n; ++s) {
+ int na = 0, nt = 0;
for (i = 0; i < n_plp[s]; ++i) {
const bam_pileup1_t *p = plp[s] + i;
if (rghash == 0 || p->aux == 0) {
- ++n_tot;
+ ++nt;
if (p->indel != 0) {
- ++n_alt;
+ ++na;
aux[m++] = MINUS_CONST + p->indel;
}
}
j = bam_cigar2qlen(&p->b->core, bam1_cigar(p->b));
if (j > max_rd_len) max_rd_len = j;
}
+ float frac = (float)na/nt;
+ if ( !indel_support_ok && na >= bca->min_support && frac >= bca->min_frac )
+ indel_support_ok = 1;
+ if ( na > bca->max_support && frac > 0 ) bca->max_support = na, bca->max_frac = frac;
+ n_alt += na;
+ n_tot += nt;
}
// To prevent long stretches of N's to be mistaken for indels (sometimes thousands of bases),
// check the number of N's in the sequence and skip places where half or more reference bases are Ns.
int nN=0; for (i=pos; i-pos<max_rd_len && ref[i]; i++) if ( ref[i]=='N' ) nN++;
- if ( nN*2>i ) return -1;
+ if ( nN*2>i ) { free(aux); return -1; }
ks_introsort(uint32_t, m, aux);
// squeeze out identical types
for (i = 1, n_types = 1; i < m; ++i)
if (aux[i] != aux[i-1]) ++n_types;
- if (n_types == 1 || (double)n_alt / n_tot < bca->min_frac || n_alt < bca->min_support) { // then skip
- free(aux); return -1;
- }
+ // Taking totals makes it hard to call rare indels
+ if ( !bca->per_sample_flt )
+ indel_support_ok = ( (float)n_alt / n_tot < bca->min_frac || n_alt < bca->min_support ) ? 0 : 1;
+ if ( n_types == 1 || !indel_support_ok ) { // then skip
+ free(aux); return -1;
+ }
if (n_types >= 64) {
free(aux);
if (bam_verbose >= 2)
#define MPLP_IGNORE_RG 0x2000
#define MPLP_PRINT_POS 0x4000
#define MPLP_PRINT_MAPQ 0x8000
+#define MPLP_PER_SAMPLE 0x10000
void *bed_read(const char *fn);
void bed_destroy(void *_h);
bca->openQ = conf->openQ, bca->extQ = conf->extQ, bca->tandemQ = conf->tandemQ;
bca->min_frac = conf->min_frac;
bca->min_support = conf->min_support;
+ bca->per_sample_flt = conf->flag & MPLP_PER_SAMPLE;
}
if (tid0 >= 0 && conf->fai) { // region is set
ref = faidx_fetch_seq(conf->fai, h->target_name[tid0], 0, 0x7fffffff, &ref_len);
mplp.openQ = 40; mplp.extQ = 20; mplp.tandemQ = 100;
mplp.min_frac = 0.002; mplp.min_support = 1;
mplp.flag = MPLP_NO_ORPHAN | MPLP_REALN;
- while ((c = getopt(argc, argv, "Agf:r:l:M:q:Q:uaRC:BDSd:L:b:P:o:e:h:Im:F:EG:6OsV")) >= 0) {
+ while ((c = getopt(argc, argv, "Agf:r:l:M:q:Q:uaRC:BDSd:L:b:P:po:e:h:Im:F:EG:6OsV")) >= 0) {
switch (c) {
case 'f':
mplp.fai = fai_load(optarg);
case 'r': mplp.reg = strdup(optarg); break;
case 'l': mplp.bed = bed_read(optarg); break;
case 'P': mplp.pl_list = strdup(optarg); break;
+ case 'p': mplp.flag |= MPLP_PER_SAMPLE; break;
case 'g': mplp.flag |= MPLP_GLF; break;
case 'u': mplp.flag |= MPLP_NO_COMP | MPLP_GLF; break;
case 'a': mplp.flag |= MPLP_NO_ORPHAN | MPLP_REALN; break;
fprintf(stderr, " -L INT max per-sample depth for INDEL calling [%d]\n", mplp.max_indel_depth);
fprintf(stderr, " -m INT minimum gapped reads for indel candidates [%d]\n", mplp.min_support);
fprintf(stderr, " -o INT Phred-scaled gap open sequencing error probability [%d]\n", mplp.openQ);
+ fprintf(stderr, " -p apply -m and -F per-sample to increase sensitivity\n");
fprintf(stderr, " -P STR comma separated list of platforms for indels [all]\n");
fprintf(stderr, "\n");
fprintf(stderr, "Notes: Assuming diploid individuals.\n\n");
#include "faidx.h"
#include "bam2bcf.h"
#include "sam_header.h"
+#include "khash.h"
+
+KHASH_MAP_INIT_STR(kh_rg, const char *)
char bam_aux_getCEi(bam1_t *b, int i);
char bam_aux_getCSi(bam1_t *b, int i);
int ccol, last_pos, row_shift, base_for, color_for, is_dot, l_ref, ins, no_skip, show_name;
char *ref;
- char *sample; //TODO: multiple samples and read groups
- void *rg2sm;
+ khash_t(kh_rg) *rg_hash;
} tview_t;
int tv_pl_func(uint32_t tid, uint32_t pos, int n, const bam_pileup1_t *pl, void *data)
if ( samples )
{
- tv->sample = samples;
- tv->header->dict = sam_header_parse2(tv->header->text);
- tv->rg2sm = sam_header2tbl(tv->header->dict, "RG", "ID", "SM");
+ if ( !tv->header->dict ) tv->header->dict = sam_header_parse2(tv->header->text);
+ void *iter = tv->header->dict;
+ const char *key, *val;
+ int n = 0;
+ tv->rg_hash = kh_init(kh_rg);
+ while ( (iter = sam_header2key_val(iter, "RG","ID","SM", &key, &val)) )
+ {
+ if ( !strcmp(samples,key) || (val && !strcmp(samples,val)) )
+ {
+ khiter_t k = kh_get(kh_rg, tv->rg_hash, key);
+ if ( k != kh_end(tv->rg_hash) ) continue;
+ int ret;
+ k = kh_put(kh_rg, tv->rg_hash, key, &ret);
+ kh_value(tv->rg_hash, k) = val;
+ n++;
+ }
+ }
+ if ( !n )
+ {
+ fprintf(stderr,"The sample or read group \"%s\" not present.\n", samples);
+ exit(-1);
+ }
}
initscr();
int tv_fetch_func(const bam1_t *b, void *data)
{
tview_t *tv = (tview_t*)data;
- if ( tv->sample )
+ if ( tv->rg_hash )
{
const uint8_t *rg = bam_aux_get(b, "RG");
if ( !rg ) return 0;
- const char *sm = sam_tbl_get(tv->rg2sm, (const char*)(rg + 1));
- if ( !sm ) return 0;
- if ( strcmp(sm,tv->sample) ) return 0;
+ khiter_t k = kh_get(kh_rg, tv->rg_hash, (const char*)(rg + 1));
+ if ( k == kh_end(tv->rg_hash) ) return 0;
}
if (tv->no_skip) {
uint32_t *cigar = bam1_cigar(b); // this is cheating...
fprintf(stderr, "Usage: bamtk tview [options] <aln.bam> [ref.fasta]\n");
fprintf(stderr, "Options:\n");
fprintf(stderr, " -p chr:pos go directly to this position\n");
- fprintf(stderr, " -s STR display only reads from this sample\n");
+ fprintf(stderr, " -s STR display only reads from this sample or grou\n");
fprintf(stderr, "\n\n");
}
else
}
}
for (j = 0; j < h->n_smpl; ++j) {
-
- // Determine GT with maximum PL (multiple ALT sites only)
- int imax=-1;
- if ( iPL!=-1 ) {
- uint8_t *d = (uint8_t*)b->gi[iPL].data + j * x;
- int k,identical=1;
- imax=0;
- for (k=1; k<x; k++)
- {
- if ( identical && d[k]!=d[k-1] ) identical = 0;
- if ( d[k]<d[imax] ) imax = k;
- }
- // If all lks are identical, leave GT untouched
- if ( identical ) imax = -1;
- }
+ int ploidy = b->ploidy ? b->ploidy[j] : 2;
kputc('\t', s);
for (i = 0; i < b->n_gi; ++i) {
if (i) kputc(':', s);
if (b->gi[i].fmt == bcf_str2int("PL", 2)) {
uint8_t *d = (uint8_t*)b->gi[i].data + j * x;
int k;
- for (k = 0; k < x; ++k) {
- if (k > 0) kputc(',', s);
- kputw(d[k], s);
- }
+ if ( ploidy==1 )
+ for (k=0; k<b->n_alleles; k++)
+ {
+ if (k>0) kputc(',', s);
+ kputw(d[(k+1)*(k+2)/2-1], s);
+ }
+ else
+ for (k = 0; k < x; ++k) {
+ if (k > 0) kputc(',', s);
+ kputw(d[k], s);
+ }
} else if (b->gi[i].fmt == bcf_str2int("DP", 2) || b->gi[i].fmt == bcf_str2int("DV", 2)) {
kputw(((uint16_t*)b->gi[i].data)[j], s);
} else if (b->gi[i].fmt == bcf_str2int("GQ", 2)) {
kputw(((int32_t*)b->gi[i].data)[j], s);
} else if (b->gi[i].fmt == bcf_str2int("GT", 2)) {
int y = ((uint8_t*)b->gi[i].data)[j];
- if ( y>>7&1 )
- kputsn("./.", 3, s);
- else if ( imax==-1 )
+ if ( ploidy==1 )
{
- kputc('0' + (y>>3&7), s);
- kputc("/|"[y>>6&1], s);
- kputc('0' + (y&7), s);
+ if ( y>>7&1 )
+ kputc('.', s);
+ else
+ kputc('0' + (y>>3&7), s);
}
else
{
- // Arguably, the while loop will be faster than two sqrts
- int n = 0;
- int row = 1;
- while ( n<imax )
- {
- row++;
- n += row;
+ if ( y>>7&1 )
+ kputsn("./.", 3, s);
+ else {
+ kputc('0' + (y>>3&7), s);
+ kputc("/|"[y>>6&1], s);
+ kputc('0' + (y&7), s);
}
- row--;
- kputw(imax-n+row, s);
- kputc("/|"[y>>6&1], s);
- kputw(row, s);
}
} else if (b->gi[i].fmt == bcf_str2int("GL", 2)) {
float *d = (float*)b->gi[i].data + j * x;
return 0;
}
+/**
+ * Remove every occurrence of a delimited field from a string, in place.
+ *
+ * A field is matched when `tag` occurs at the very start of `str` or directly
+ * after `delim`; a match anywhere else (e.g. "AB=" while looking for "B=") is
+ * a shared substring and is left alone. The field runs up to the next `delim`
+ * or to the end of the string. One neighbouring delimiter is removed with it:
+ * the following one when the field is first, the preceding one otherwise.
+ * If nothing would remain, the string is set to ".".
+ *
+ * The string is always left NUL-terminated at its new length.
+ *
+ * @param str    writable NUL-terminated string, edited in place
+ * @param tag    text the field starts with, e.g. "I16="
+ * @param delim  field separator, e.g. ';'
+ * @return       number of characters by which the string became shorter
+ */
+int remove_tag(char *str, const char *tag, char delim)
+{
+    char *tmp = str, *p;
+    int len_diff = 0, ori_len = strlen(str);
+    if ( !ori_len ) return 0;   // nothing to remove, and no room for the "." placeholder
+    while ( *tmp && (p = strstr(tmp,tag)) )
+    {
+        if ( p>str )
+        {
+            if ( *(p-1)!=delim ) { tmp=p+1; continue; } // shared substring
+            p--;    // the preceding delimiter goes away together with the field
+        }
+        char *q=p+1;
+        while ( *q && *q!=delim ) q++;
+        if ( p==str && *q ) q++;        // the tag is first, don't move the delim char
+        len_diff += q-p;
+        if ( ! *q ) { *p = 0; break; }  // the tag was last, no delim follows
+        // Close the gap and carry the terminator along, so no stale copy of
+        // the tail is left behind for the next strstr() to find.
+        memmove(p, q, strlen(q)+1);
+        tmp = p;
+    }
+    if ( len_diff==ori_len )
+        str[0]='.', str[1]=0, len_diff--;
+
+    return len_diff;
+}
+
+
+/**
+ * Remove the INFO entry starting with `key` from a record string under
+ * construction.
+ *
+ * The INFO field is taken to be the text that follows the fourth NUL byte in
+ * `s`. Everything stored after the INFO field is shifted down by the number
+ * of characters removed and s->l is reduced accordingly.
+ *
+ * Nothing is done when `s` holds fewer than four NUL-terminated fields or
+ * when the INFO field is empty.
+ *
+ * @param s    kstring holding the NUL-separated record fields
+ * @param key  INFO key including the '=' sign, e.g. "I16="
+ */
+void rm_info(kstring_t *s, const char *key)
+{
+    char *end = s->s + s->l;    // one past the last stored byte
+    char *info = s->s;
+    int nsep = 0;
+    // step over the four NUL-terminated fields preceding INFO, never past the stored data
+    while ( nsep<4 && info<end )
+    {
+        if ( !*info ) nsep++;
+        info++;
+    }
+    if ( nsep<4 ) return;
+
+    // locate the end of INFO before it is edited
+    char *q = info;
+    while ( q<end && *q ) q++;
+    if ( q==info ) return;      // empty INFO: nothing to remove
+
+    int nrm = remove_tag(info, key, ';');
+    if ( nrm )
+        memmove(q-nrm, q, end-q+1);     // +1: also carries the byte at s->s[s->l]
+    s->l -= nrm;
+}
+
int bcf_cpy(bcf1_t *r, const bcf1_t *b)
{
char *t1 = r->str;
bcf_ginfo_t *gi; // array of geno fields
int n_alleles, n_smpl; // number of alleles and samples
// derived info: ref, alt, flt, info, fmt (<-str), n_gi (<-fmt), n_alleles (<-alt), n_smpl (<-bcf_hdr_t::n_smpl)
+ uint8_t *ploidy; // ploidy of all samples; if NULL, ploidy of 2 is assumed.
} bcf1_t;
typedef struct {
char *bcf_fmt(const bcf_hdr_t *h, bcf1_t *b);
// append more info
int bcf_append_info(bcf1_t *b, const char *info, int l);
+ // remove tag
+ int remove_tag(char *string, const char *tag, char delim);
+ // remove info tag, string is the kstring holder of bcf1_t.str
+ void rm_info(kstring_t *string, const char *key);
// copy
int bcf_cpy(bcf1_t *r, const bcf1_t *b);
// keep the first n alleles and discard the rest
int bcf_shrink_alt(bcf1_t *b, int n);
+ // keep the masked alleles and discard the rest
+ void bcf_fit_alt(bcf1_t *b, int mask);
// convert GL to PL
int bcf_gl2pl(bcf1_t *b);
// if the site is an indel
#include <string.h>
#include <math.h>
+#include <assert.h>
#include "bcf.h"
#include "kstring.h"
#include "khash.h"
return kh_val(hash, k);
}
+/**
+ * Keep only the alleles selected by `mask` and discard the rest.
+ *
+ * Bit i of `mask` selects allele i; bit 0 (REF) is always forced on. The ALT
+ * string is rewritten, the per-sample PL vectors are compacted to the
+ * genotypes made of the kept alleles, and the allele indexes stored in GT are
+ * renumbered. GL is not handled. Returns without changes when every allele
+ * of the record is selected.
+ *
+ * @param b     record to modify in place
+ * @param mask  bitmask of the alleles to keep
+ */
+void bcf_fit_alt(bcf1_t *b, int mask)
+{
+    mask |= 1; // REF must be always present
+
+    // Count the selected alleles. Only bits that refer to an allele of this
+    // record are counted.
+    int i,j,nals=0;
+    for (i=0; i<b->n_alleles; i++)
+        if ( mask&1<<i ) nals++;
+
+    if ( b->n_alleles <= nals ) return;
+
+    // update ALT, in principle any of the alleles can be removed
+    char *p;
+    if ( nals>1 )
+    {
+        char *dst, *src;
+        int n=0, nalts=nals-1;
+        for (src=dst=p=b->alt, i=1; *p; p++)
+        {
+            if ( *p!=',' ) continue;
+
+            if ( mask&1<<i )
+            {
+                n++;
+                if ( src!=dst )
+                {
+                    memmove(dst,src,p-src);
+                    dst += p-src;
+                }
+                else dst = p;
+                if ( n<nalts ) { *dst=','; dst++; }
+            }
+            i++;
+
+            if ( n>=nalts ) { *dst=0; break; }
+            src = p+1;
+        }
+        if ( n<nalts )
+        {
+            // the last ALT allele has no trailing comma and is copied here
+            memmove(dst,src,p-src);
+            dst += p-src;
+            *dst = 0;
+        }
+        p = dst;
+    }
+    else p = b->alt, *p = '\0';
+    p++;
+    // pull the remaining fields (from FILTER on) up behind the shortened ALT
+    memmove(p, b->flt, b->str + b->l_str - b->flt);
+    b->l_str -= b->flt - p;
+
+    // locate GT among the FORMAT fields
+    int igt=-1;
+    for (i = 0; i < b->n_gi; ++i)
+        if (b->gi[i].fmt == bcf_str2int("GT", 2)) igt = i;
+
+    // .. create mapping between old and new indexes
+    int npl = nals * (nals+1) / 2;
+    int *map = malloc(sizeof(int)*(npl>b->n_alleles ? npl : b->n_alleles));
+    int kori=0,knew=0;
+    for (i=0; i<b->n_alleles; i++)
+    {
+        for (j=0; j<=i; j++)
+        {
+            int skip=0;
+            if ( i && !(mask&1<<i) ) skip=1;
+            if ( j && !(mask&1<<j) ) skip=1;
+            if ( !skip ) { map[knew++] = kori; }
+            kori++;
+        }
+    }
+    // .. apply to all samples
+    int n_smpl = b->n_smpl;
+    for (i = 0; i < b->n_gi; ++i)
+    {
+        bcf_ginfo_t *g = b->gi + i;
+        if (g->fmt == bcf_str2int("PL", 2))
+        {
+            g->len = npl;
+            uint8_t *d = (uint8_t*)g->data;
+            int ismpl, npl_ori = b->n_alleles * (b->n_alleles + 1) / 2;
+            // compact in place: the write position never overtakes the read position
+            for (knew=ismpl=0; ismpl<n_smpl; ismpl++)
+            {
+                uint8_t *dl = d + ismpl * npl_ori;
+                for (j=0; j<npl; j++) d[knew++] = dl[map[j]];
+            }
+        } // FIXME: to add GL
+    }
+    // update GTs, if the record carries them
+    if ( igt>=0 )
+    {
+        map[0] = 0;
+        for (i=1, knew=0; i<b->n_alleles; i++)
+            map[i] = mask&1<<i ? ++knew : -1;
+        for (i=0; i<n_smpl; i++)
+        {
+            uint8_t gt = ((uint8_t*)b->gi[igt].data)[i];
+            int a1 = (gt>>3)&7;
+            int a2 = gt&7;
+            assert( map[a1]>=0 && map[a2]>=0 );
+            // bits 7 and 6 (missing and phased flags) are carried over unchanged
+            ((uint8_t*)b->gi[igt].data)[i] = ((1<<7|1<<6)&gt) | map[a1]<<3 | map[a2];
+        }
+    }
+    free(map);
+    b->n_alleles = nals;
+    bcf_sync(b);
+}
+
int bcf_shrink_alt(bcf1_t *b, int n)
{
char *p;
uint32_t *trio_aux;
char *prior_file, **subsam, *fn_dict;
uint8_t *ploidy;
- double theta, pref, indel_frac, min_perm_p, min_smpl_frac, min_lrt;
+ double theta, pref, indel_frac, min_perm_p, min_smpl_frac, min_lrt, min_ma_lrt;
void *bed;
} viewconf_t;
void bed_destroy(void *_h);
int bed_overlap(const void *_h, const char *chr, int beg, int end);
-typedef struct {
- double p[4];
- int mq, depth, is_tested, d[4];
-} anno16_t;
-
static double ttest(int n1, int n2, int a[4])
{
extern double kf_betai(double a, double b, double x);
return 0;
}
-static int test16(bcf1_t *b, anno16_t *a)
+int test16(bcf1_t *b, anno16_t *a)
{
char *p;
int i, anno[16];
return test16_core(anno, a);
}
-static void rm_info(bcf1_t *b, const char *key)
-{
- char *p, *q;
- if ((p = strstr(b->info, key)) == 0) return;
- for (q = p; *q && *q != ';'; ++q);
- if (p > b->info && *(p-1) == ';') --p;
- memmove(p, q, b->l_str - (q - b->str));
- b->l_str -= q - p;
- bcf_sync(b);
-}
-
static int update_bcf1(bcf1_t *b, const bcf_p1aux_t *pa, const bcf_p1rst_t *pr, double pref, int flag, double em[10], int cons_llr, int64_t cons_gt)
{
kstring_t s;
anno16_t a;
has_I16 = test16(b, &a) >= 0? 1 : 0;
- rm_info(b, "I16="); // FIXME: probably this function has a bug. If I move it below, I16 will not be removed!
+ //rm_info(b, "I16="); // FIXME: probably this function has a bug. If I move it below, I16 will not be removed!
memset(&s, 0, sizeof(kstring_t));
kputc('\0', &s); kputs(b->ref, &s); kputc('\0', &s);
}
if (has_I16 && a.is_tested) ksprintf(&s, ";PV4=%.2g,%.2g,%.2g,%.2g", a.p[0], a.p[1], a.p[2], a.p[3]);
kputc('\0', &s);
+ rm_info(&s, "QS=");
+ rm_info(&s, "I16=");
kputs(b->fmt, &s); kputc('\0', &s);
free(b->str);
b->m_str = s.m; b->l_str = s.l; b->str = s.s;
kputs("##INFO=<ID=AF1,Number=1,Type=Float,Description=\"Max-likelihood estimate of the first ALT allele frequency (assuming HWE)\">\n", &str);
if (!strstr(str.s, "##INFO=<ID=AC1,"))
kputs("##INFO=<ID=AC1,Number=1,Type=Float,Description=\"Max-likelihood estimate of the first ALT allele count (no HWE assumption)\">\n", &str);
+ if (!strstr(str.s, "##INFO=<ID=AN,"))
+ kputs("##INFO=<ID=AN,Number=1,Type=Integer,Description=\"Total number of alleles in called genotypes\">\n", &str);
+ if (!strstr(str.s, "##INFO=<ID=IS,"))
+ kputs("##INFO=<ID=IS,Number=2,Type=Float,Description=\"Maximum number of reads supporting an indel and fraction of indel reads\">\n", &str);
+ if (!strstr(str.s, "##INFO=<ID=AC,"))
+ kputs("##INFO=<ID=AC,Number=A,Type=Integer,Description=\"Allele count in genotypes for each ALT allele, in the same order as listed\">\n", &str);
if (!strstr(str.s, "##INFO=<ID=G3,"))
kputs("##INFO=<ID=G3,Number=3,Type=Float,Description=\"ML estimate of genotype frequencies\">\n", &str);
if (!strstr(str.s, "##INFO=<ID=HWE,"))
tid = begin = end = -1;
memset(&vc, 0, sizeof(viewconf_t));
- vc.prior_type = vc.n1 = -1; vc.theta = 1e-3; vc.pref = 0.5; vc.indel_frac = -1.; vc.n_perm = 0; vc.min_perm_p = 0.01; vc.min_smpl_frac = 0; vc.min_lrt = 1;
+ vc.prior_type = vc.n1 = -1; vc.theta = 1e-3; vc.pref = 0.5; vc.indel_frac = -1.; vc.n_perm = 0; vc.min_perm_p = 0.01; vc.min_smpl_frac = 0; vc.min_lrt = 1; vc.min_ma_lrt = -1;
memset(qcnt, 0, 8 * 256);
- while ((c = getopt(argc, argv, "FN1:l:cC:eHAGvbSuP:t:p:QgLi:IMs:D:U:X:d:T:Yw")) >= 0) {
+ while ((c = getopt(argc, argv, "FN1:l:cC:eHAGvbSuP:t:p:QgLi:IMs:D:U:X:d:T:Ywm:")) >= 0) {
switch (c) {
case '1': vc.n1 = atoi(optarg); break;
case 'l': vc.bed = bed_read(optarg); break;
case 'w': vc.flag |= VC_INDEL_ONLY; break;
case 'M': vc.flag |= VC_ANNO_MAX; break;
case 'Y': vc.flag |= VC_QCNT; break;
+ case 'm': vc.min_ma_lrt = atof(optarg); break;
case 't': vc.theta = atof(optarg); break;
case 'p': vc.pref = atof(optarg); break;
case 'i': vc.indel_frac = atof(optarg); break;
fprintf(stderr, " -g call genotypes at variant sites (force -c)\n");
fprintf(stderr, " -i FLOAT indel-to-substitution ratio [%.4g]\n", vc.indel_frac);
fprintf(stderr, " -I skip indels\n");
+ fprintf(stderr, " -m FLOAT alternative model for multiallelic and rare-variant calling, include if P(chi^2)>=FLOAT\n");
fprintf(stderr, " -p FLOAT variant if P(ref|D)<FLOAT [%.3g]\n", vc.pref);
fprintf(stderr, " -P STR type of prior: full, cond2, flat [full]\n");
fprintf(stderr, " -t FLOAT scaled substitution mutation rate [%.4g]\n", vc.theta);
int i;
for (i = 0; i < 9; ++i) em[i] = -1.;
}
- if (vc.flag & VC_CALL) { // call variants
+ if ( !(vc.flag&VC_KEEPALT) && vc.flag&VC_CALL && vc.min_ma_lrt>=0 )
+ {
+ bcf_p1_set_ploidy(b, p1); // could be improved: do this per site to allow pseudo-autosomal regions
+ int gts = call_multiallelic_gt(b,p1,vc.min_ma_lrt);
+ if ( gts<=1 && vc.flag & VC_VARONLY ) continue;
+ }
+ else if (vc.flag & VC_CALL) { // call variants
bcf_p1rst_t pr;
int calret = bcf_p1_cal(b, (em[7] >= 0 && em[7] < vc.min_lrt), p1, &pr);
if (n_processed % 100000 == 0) {
#include <stdio.h>
#include <errno.h>
#include <assert.h>
+#include <limits.h>
#include "prob1.h"
+#include "kstring.h"
#include "kseq.h"
KSTREAM_INIT(gzFile, gzread, 16384)
return 0;
}
+void bcf_p1_set_ploidy(bcf1_t *b, bcf_p1aux_t *ma)
+{ // Make the record use the ploidy array held by ma; the pointer is shared, nothing is copied.
+ // bcf_p1aux_t fields are not visible outside of prob1.c, hence this wrapper.
+ // Ideally, this should set ploidy per site to allow pseudo-autosomal regions
+ b->ploidy = ma->ploidy;
+}
+
void bcf_p1_destroy(bcf_p1aux_t *ma)
{
if (ma) {
}
}
-static int cal_pdg(const bcf1_t *b, bcf_p1aux_t *ma)
+extern double kf_gammap(double s, double z);
+int test16(bcf1_t *b, anno16_t *a);
+
+int call_multiallelic_gt(bcf1_t *b, bcf_p1aux_t *ma, double threshold)
+{ /* Multiallelic genotype caller.
+   * Counts the alleles listed in b->alt, scores candidate sets of one, two
+   * or three alleles from the per-sample PL values (weighted by INFO/QS),
+   * keeps the best-scoring set, then writes GT and GQ for each sample, drops
+   * the unused alleles with bcf_fit_alt() and rebuilds the INFO string.
+   * Returns 1 when no ALT allele is listed, 0 when there are more than four
+   * alleles and REF starts with 'N', -1 when the record has no PL field, and
+   * otherwise a bitmask of the allele indexes used in the called genotypes.
+   * Terminates the process when more than four alleles are present (REF not
+   * starting with 'N') or when INFO/QS is missing or cannot be parsed. */
- int i, j;
- int n = (b->n_alleles+1)*b->n_alleles/2;
- double *lk = alloca(n * sizeof(long));
- memset(lk, 0, sizeof(double) * n);
- for (j = 0; j < ma->n; ++j) {
- const uint8_t *pi = ma->PL + j * ma->PL_len;
- double *pdg = ma->pdg + j * 3;
- pdg[0] = ma->q2p[pi[2]]; pdg[1] = ma->q2p[pi[1]]; pdg[2] = ma->q2p[pi[0]];
- for (i=0; i<n; i++) lk[i] += pi[i];
+ int nals = 1;
+ char *p;
+ for (p=b->alt; *p; p++)
+ {
+ if ( *p=='X' || p[0]=='.' ) break; // counting stops at the first 'X' or '.'
+ if ( p[0]==',' ) nals++;
+ }
+ if ( b->alt[0] && !*p ) nals++; // the last allele has no trailing comma
+
+ if ( nals==1 ) return 1;
+
+ if ( nals>4 )
+ {
+ if ( *b->ref=='N' ) return 0;
+ fprintf(stderr,"Not ready for this, more than 4 alleles at %d: %s, %s\n", b->pos+1, b->ref,b->alt);
+ exit(1);
+ }
+
+ // find PL and DP FORMAT indexes
+ uint8_t *pl = NULL;
+ int npl = 0, idp=-1;
+ int i;
+ for (i = 0; i < b->n_gi; ++i)
+ {
+ if (b->gi[i].fmt == bcf_str2int("PL", 2))
+ {
+ pl = (uint8_t*)b->gi[i].data;
+ npl = b->gi[i].len;
+ }
+ if (b->gi[i].fmt == bcf_str2int("DP", 2)) idp=i;
+ }
+ if ( !pl ) return -1;
+
+ assert(ma->q2p[0] == 1);
+
+ // Init P(D|G)
+ int npdg = nals*(nals+1)/2;
+ double *pdg,*_pdg;
+ _pdg = pdg = malloc(sizeof(double)*ma->n*npdg);
+ for (i=0; i<ma->n; i++)
+ {
+ int j;
+ double sum = 0;
+ for (j=0; j<npdg; j++)
+ {
+ //_pdg[j] = pow(10,-0.1*pl[j]);
+ _pdg[j] = ma->q2p[pl[j]]; // table lookup, presumably equal to the pow() form above
+ sum += _pdg[j];
+ }
+ if ( sum )
+ for (j=0; j<npdg; j++) _pdg[j] /= sum; // normalise so the sample's values sum to 1
+ _pdg += npdg; // npdg values are kept per sample ...
+ pl += npl; // ... out of the npl values stored per sample in the record
+ }
+
+ if ((p = strstr(b->info, "QS=")) == 0) { fprintf(stderr,"INFO/QS is required with -m, exiting\n"); exit(1); }
+ double qsum[4];
+ if ( sscanf(p+3,"%lf,%lf,%lf,%lf",&qsum[0],&qsum[1],&qsum[2],&qsum[3])!=4 ) { fprintf(stderr,"Could not parse %s\n",p); exit(1); }
+
+
+ // Calculate the most likely combination of alleles
+ int ia,ib,ic, max_als=0, max_als2=0;
+ double ref_lk = 0, max_lk = INT_MIN, max_lk2 = INT_MIN, lk_sum = INT_MIN;
+ for (ia=0; ia<nals; ia++)
+ {
+ double lk_tot = 0;
+ int iaa = (ia+1)*(ia+2)/2-1; // index of the ia/ia (homozygous) entry
+ int isample;
+ for (isample=0; isample<ma->n; isample++)
+ {
+ double *p = pdg + isample*npdg;
+ // assert( log(p[iaa]) <= 0 );
+ lk_tot += log(p[iaa]);
+ }
+ if ( ia==0 ) ref_lk = lk_tot;
+ if ( max_lk<lk_tot ) { max_lk2 = max_lk; max_als2 = max_als; max_lk = lk_tot; max_als = 1<<ia; } // new best, old best becomes runner-up
+ else if ( max_lk2<lk_tot ) { max_lk2 = lk_tot; max_als2 = 1<<ia; }
+ lk_sum = lk_tot>lk_sum ? lk_tot + log(1+exp(lk_sum-lk_tot)) : lk_sum + log(1+exp(lk_tot-lk_sum)); // lk_sum = log(exp(lk_sum)+exp(lk_tot))
+ }
+ if ( nals>1 )
+ {
+ for (ia=0; ia<nals; ia++)
+ {
+ if ( qsum[ia]==0 ) continue;
+ int iaa = (ia+1)*(ia+2)/2-1;
+ for (ib=0; ib<ia; ib++)
+ {
+ if ( qsum[ib]==0 ) continue;
+ double lk_tot = 0;
+ double fa = qsum[ia]/(qsum[ia]+qsum[ib]);
+ double fb = qsum[ib]/(qsum[ia]+qsum[ib]);
+ double fab = 2*fa*fb; fa *= fa; fb *= fb; // weights fa^2, fb^2 and 2*fa*fb for the three genotypes
+ int isample, ibb = (ib+1)*(ib+2)/2-1, iab = iaa - ia + ib;
+ for (isample=0; isample<ma->n; isample++)
+ {
+ if ( b->ploidy && b->ploidy[isample]==1 ) continue; // samples with ploidy 1 add nothing to two-allele sets
+ double *p = pdg + isample*npdg;
+ //assert( log(fa*p[iaa] + fb*p[ibb] + fab*p[iab]) <= 0 );
+ lk_tot += log(fa*p[iaa] + fb*p[ibb] + fab*p[iab]);
+ }
+ if ( max_lk<lk_tot ) { max_lk2 = max_lk; max_als2 = max_als; max_lk = lk_tot; max_als = 1<<ia|1<<ib; }
+ else if ( max_lk2<lk_tot ) { max_lk2 = lk_tot; max_als2 = 1<<ia|1<<ib; }
+ lk_sum = lk_tot>lk_sum ? lk_tot + log(1+exp(lk_sum-lk_tot)) : lk_sum + log(1+exp(lk_tot-lk_sum));
+ }
+ }
}
+ if ( nals>2 )
+ {
+ for (ia=0; ia<nals; ia++)
+ {
+ if ( qsum[ia]==0 ) continue;
+ int iaa = (ia+1)*(ia+2)/2-1;
+ for (ib=0; ib<ia; ib++)
+ {
+ if ( qsum[ib]==0 ) continue;
+ int ibb = (ib+1)*(ib+2)/2-1;
+ int iab = iaa - ia + ib;
+ for (ic=0; ic<ib; ic++)
+ {
+ if ( qsum[ic]==0 ) continue;
+ double lk_tot = 0;
+ double fa = qsum[ia]/(qsum[ia]+qsum[ib]+qsum[ic]);
+ double fb = qsum[ib]/(qsum[ia]+qsum[ib]+qsum[ic]);
+ double fc = qsum[ic]/(qsum[ia]+qsum[ib]+qsum[ic]);
+ double fab = 2*fa*fb, fac = 2*fa*fc, fbc = 2*fb*fc; fa *= fa; fb *= fb; fc *= fc;
+ int isample, icc = (ic+1)*(ic+2)/2-1;
+ int iac = iaa - ia + ic, ibc = ibb - ib + ic;
+ for (isample=0; isample<ma->n; isample++)
+ {
+ if ( b->ploidy && b->ploidy[isample]==1 ) continue;
+ double *p = pdg + isample*npdg;
+ //assert( log(fa*p[iaa] + fb*p[ibb] + fc*p[icc] + fab*p[iab] + fac*p[iac] + fbc*p[ibc]) <= 0 );
+ lk_tot += log(fa*p[iaa] + fb*p[ibb] + fc*p[icc] + fab*p[iab] + fac*p[iac] + fbc*p[ibc]);
+ }
+ if ( max_lk<lk_tot ) { max_lk2 = max_lk; max_als2 = max_als; max_lk = lk_tot; max_als = 1<<ia|1<<ib|1<<ic; }
+ else if ( max_lk2<lk_tot ) { max_lk2 = lk_tot; max_als2 = 1<<ia|1<<ib|1<<ic; }
+ lk_sum = lk_tot>lk_sum ? lk_tot + log(1+exp(lk_sum-lk_tot)) : lk_sum + log(1+exp(lk_tot-lk_sum));
+ }
+ }
+ }
+ }
+
- double norm=lk[0];
- for (i=1; i<n; i++) if (lk[i]<norm) norm=lk[i];
- #if DBG
- for (i=0,j=0; i<b->n_alleles; i++)
+ // Should we add another allele, does it increase the likelihood significantly?
+ int n1=0, n2=0;
+ for (i=0; i<nals; i++) if ( max_als&1<<i) n1++;
+ for (i=0; i<nals; i++) if ( max_als2&1<<i) n2++;
+ if ( n2<n1 && kf_gammap(1,2.0*(max_lk-max_lk2))<threshold ) // the runner-up has fewer alleles and the test value is below the threshold
{
- int k; for (k=0; k<=i; k++) printf("%.0f\t", lk[j++]);
- printf("\n");
+ max_lk = max_lk2;
+ max_als = max_als2;
}
- #endif
- for (i=0; i<n; i++) lk[i] = pow(10,-0.1*(lk[i]-norm));
-
- // Find out the most likely alleles. In contrast to the original version,
- // ALT alleles may not be printed when they are more likely than REF but
- // significantly less likely than the most likely ALT. The only criterion
- // is the LK ratio now. To obtain behaviour similar to the original one,
- // use the pref variant below.
- double pmax=0; //,pref=0;
- n = ma->is_indel ? b->n_alleles : b->n_alleles-1;
- for (i=0; i<n; i++)
+
+ // Get the BCF record ready for GT and GQ
+ kstring_t s;
+ int old_n_gi = b->n_gi;
+ s.m = b->m_str; s.l = b->l_str - 1; s.s = b->str; // l_str-1: the append below starts on the record's final byte
+ kputs(":GT:GQ", &s); kputc('\0', &s);
+ b->m_str = s.m; b->l_str = s.l; b->str = s.s;
+ bcf_sync(b);
+
+ // Call GTs
+ int isample, gts=0, ac[4] = {0,0,0,0};
+ for (isample = 0; isample < b->n_smpl; isample++)
{
- double pr=0;
- int k=i*(i+1)/2;
- for (j=0; j<=i; j++) { pr+=lk[k]; k++; }
- for (j=i+1; j<b->n_alleles; j++) { k=j*(j+1)/2+i; pr+=lk[k]; }
- #if DBG
- printf("%d\t%e\n", i,pr);
- #endif
- if (pmax<pr) pmax=pr;
- // if (i==0) pref=pr;
- // if (pr<pref && pr/pmax < 1e-4) break;
- if (pr/pmax < 1e-4) break; // Assuming the alleles are sorted by the lk
+ int ploidy = b->ploidy ? b->ploidy[isample] : 2;
+ double *p = pdg + isample*npdg;
+ int ia, als = 0;
+ double lk = 0, lk_sum=0; // local lk_sum shadows the site-wide one inside this loop
+ for (ia=0; ia<nals; ia++)
+ {
+ if ( !(max_als&1<<ia) ) continue;
+ int iaa = (ia+1)*(ia+2)/2-1;
+ double _lk = p[iaa]*qsum[ia]*qsum[ia];
+ if ( _lk > lk ) { lk = _lk; als = ia<<3 | ia; }
+ lk_sum += _lk;
+ }
+ if ( ploidy==2 )
+ {
+ for (ia=0; ia<nals; ia++)
+ {
+ if ( !(max_als&1<<ia) ) continue;
+ int iaa = (ia+1)*(ia+2)/2-1;
+ for (ib=0; ib<ia; ib++)
+ {
+ if ( !(max_als&1<<ib) ) continue;
+ int iab = iaa - ia + ib;
+ double _lk = 2*qsum[ia]*qsum[ib]*p[iab];
+ if ( _lk > lk ) { lk = _lk; als = ib<<3 | ia; }
+ lk_sum += _lk;
+ }
+ }
+ }
+ lk = -log(1-lk/lk_sum)/0.2302585; // 0.2302585 ~ ln(10)/10: lk becomes -10*log10(1 - lk/lk_sum)
+ if ( idp>=0 && ((uint16_t*)b->gi[idp].data)[isample]==0 ) // DP is 0: store GT with bit 7 set and GQ 0
+ {
+ ((uint8_t*)b->gi[old_n_gi].data)[isample] = 1<<7;
+ ((uint8_t*)b->gi[old_n_gi+1].data)[isample] = 0;
+ continue;
+ }
+ ((uint8_t*)b->gi[old_n_gi].data)[isample] = als;
+ ((uint8_t*)b->gi[old_n_gi+1].data)[isample] = lk<100 ? (int)lk : 99; // GQ is capped at 99
+
+ gts |= 1<<(als>>3&7) | 1<<(als&7); // remember which alleles appear in called genotypes
+ ac[ als>>3&7 ]++;
+ ac[ als&7 ]++;
}
- return i-1;
+ bcf_fit_alt(b,max_als);
+
+
+ // Prepare BCF for output: ref, alt, filter, info, format
+ memset(&s, 0, sizeof(kstring_t)); kputc('\0', &s);
+ kputs(b->ref, &s); kputc('\0', &s);
+ kputs(b->alt, &s); kputc('\0', &s); kputc('\0', &s);
+ {
+ int an=0, nalts=0;
+ for (i=0; i<nals; i++)
+ {
+ an += ac[i];
+ if ( i>0 && ac[i] ) nalts++;
+ }
+ ksprintf(&s, "AN=%d;", an);
+ if ( nalts )
+ {
+ kputs("AC=", &s);
+ for (i=1; i<nals; i++)
+ {
+ if ( !(gts&1<<i) ) continue;
+ nalts--;
+ ksprintf(&s,"%d", ac[i]);
+ if ( nalts>0 ) kputc(',', &s);
+ }
+ kputc(';', &s);
+ }
+ kputs(b->info, &s);
+ anno16_t a;
+ int has_I16 = test16(b, &a) >= 0? 1 : 0;
+ if (has_I16 )
+ {
+ if ( a.is_tested) ksprintf(&s, ";PV4=%.2g,%.2g,%.2g,%.2g", a.p[0], a.p[1], a.p[2], a.p[3]);
+ ksprintf(&s, ";DP4=%d,%d,%d,%d;MQ=%d", a.d[0], a.d[1], a.d[2], a.d[3], a.mq);
+ }
+ kputc('\0', &s);
+ rm_info(&s, "I16=");
+ rm_info(&s, "QS=");
+ }
+ kputs(b->fmt, &s); kputc('\0', &s);
+ free(b->str);
+ b->m_str = s.m; b->l_str = s.l; b->str = s.s;
+ b->qual = gts>1 ? -4.343*(ref_lk - lk_sum) : -4.343*(max_lk - lk_sum); // 4.343 ~ 10/ln(10); gts>1 means a non-reference allele was called
+ if ( b->qual>999 ) b->qual = 999;
+ bcf_sync(b);
+
+
+ free(pdg);
+ return gts;
+}
+
+static int cal_pdg(const bcf1_t *b, bcf_p1aux_t *ma)
+{ /* Fills ma->pdg (three values per sample) from the first three PL values
+   * of every sample, and ranks the alleles by the summed PL of their
+   * homozygous genotype. Returns the position of allele 0 in that ranking
+   * (ascending summed PL). */
+ int i, j;
+ long *p, tmp;
+ p = alloca(b->n_alleles * sizeof(long));
+ memset(p, 0, sizeof(long) * b->n_alleles);
+ for (j = 0; j < ma->n; ++j) {
+ const uint8_t *pi = ma->PL + j * ma->PL_len;
+ double *pdg = ma->pdg + j * 3;
+ pdg[0] = ma->q2p[pi[2]]; pdg[1] = ma->q2p[pi[1]]; pdg[2] = ma->q2p[pi[0]]; // stored in reverse order of the PL entries
+ for (i = 0; i < b->n_alleles; ++i)
+ p[i] += (int)pi[(i+1)*(i+2)/2-1]; // PL of the i/i genotype, summed over samples
+ }
+ for (i = 0; i < b->n_alleles; ++i) p[i] = p[i]<<4 | i; // keep the allele index in the low 4 bits so it survives the sort
+ for (i = 1; i < b->n_alleles; ++i) // insertion sort
+ for (j = i; j > 0 && p[j] < p[j-1]; --j)
+ tmp = p[j], p[j] = p[j-1], p[j-1] = tmp;
+ for (i = b->n_alleles - 1; i >= 0; --i)
+ if ((p[i]&0xf) == 0) break; // stop at the entry that carries allele index 0
+ return i;
}
+
int bcf_p1_call_gt(const bcf_p1aux_t *ma, double f0, int k)
{
double sum, g[3];
double cmp[3], p_chi2, lrt; // used by contrast2()
} bcf_p1rst_t;
+typedef struct { // output of test16(); moved to the header so prob1.c can use it as well
+ double p[4]; // written out as INFO/PV4 when is_tested is non-zero
+ int mq, depth, is_tested, d[4]; // mq is written out as INFO/MQ and d[] as INFO/DP4
+} anno16_t;
+
#define MC_PTYPE_FULL 1
#define MC_PTYPE_COND2 2
#define MC_PTYPE_FLAT 3
void bcf_p1_init_prior(bcf_p1aux_t *ma, int type, double theta);
void bcf_p1_init_subprior(bcf_p1aux_t *ma, int type, double theta);
void bcf_p1_destroy(bcf_p1aux_t *ma);
+ void bcf_p1_set_ploidy(bcf1_t *b, bcf_p1aux_t *ma);
int bcf_p1_cal(const bcf1_t *b, int do_contrast, bcf_p1aux_t *ma, bcf_p1rst_t *rst);
+ int call_multiallelic_gt(bcf1_t *b, bcf_p1aux_t *ma, double threshold);
int bcf_p1_call_gt(const bcf_p1aux_t *ma, double f0, int k);
void bcf_p1_dump_afs(bcf_p1aux_t *ma);
int bcf_p1_read_prior(bcf_p1aux_t *ma, const char *fn);
if ( cig==1 )
{
- int idx = is_fwd ? icycle : read_len-icycle;
+ int idx = is_fwd ? icycle : read_len-icycle-ncig;
if ( idx<0 )
error("FIXME: read_len=%d vs icycle=%d\n", read_len,icycle);
if ( idx >= stats->nbases || idx<0 ) error("FIXME: %d vs %d, %s:%d %s\n", idx,stats->nbases, stats->sam->header->target_name[bam_line->core.tid],bam_line->core.pos+1,bam1_qname(bam_line));
printf("# Indels per cycle. Use `grep ^IC | cut -f 2-` to extract this part. The columns are: cycle, number of insertions (fwd), .. (rev) , number of deletions (fwd), .. (rev)\n");
for (ilen=0; ilen<=stats->nbases; ilen++)
{
+ // For deletions we print the index of the cycle before the deleted base (1-based) and for insertions
+ // the index of the cycle of the first inserted base (also 1-based)
if ( stats->ins_cycles_1st[ilen]>0 || stats->ins_cycles_2nd[ilen]>0 || stats->del_cycles_1st[ilen]>0 || stats->del_cycles_2nd[ilen]>0 )
printf("IC\t%d\t%ld\t%ld\t%ld\t%ld\n", ilen+1, (long)stats->ins_cycles_1st[ilen], (long)stats->ins_cycles_2nd[ilen], (long)stats->del_cycles_1st[ilen], (long)stats->del_cycles_2nd[ilen]);
}
return ret;
}
+/**
+ * Step through the parsed header and fetch one key/value pair per call.
+ *
+ * Starting at list node `iter`, finds the next header line of the given
+ * two-letter `type` that carries `key_tag` and/or `value_tag`, and stores
+ * pointers to the tag values in *_key and *_value. A tag that is absent from
+ * the line is reported as NULL, so callers must check both pointers before
+ * use.
+ *
+ * @return the node following the matched line, to be passed back in as `iter`
+ *         on the next call; NULL when no further line matches.
+ *
+ * NOTE(review): NULL is also returned when the matched line is the last node
+ * of the list, so a `while ((iter = sam_header2key_val(iter, ...)))` loop
+ * does not see a match on the very last header line.
+ */
+void *sam_header2key_val(void *iter, const char type[2], const char key_tag[2], const char value_tag[2], const char **_key, const char **_value)
+{
+    list_t *l;
+    for (l = iter; l; l = l->next)
+    {
+        HeaderLine *hline = l->data;
+        if ( hline->type[0]!=type[0] || hline->type[1]!=type[1] ) continue;
+
+        HeaderTag *key, *value;
+        key   = header_line_has_tag(hline,key_tag);
+        value = header_line_has_tag(hline,value_tag);
+        if ( !key && !value ) continue;
+
+        // only one of the two tags may be present; never dereference the missing one
+        *_key   = key   ? key->value   : NULL;
+        *_value = value ? value->value : NULL;
+        return l->next;
+    }
+    return NULL;
+}
+
const char *sam_tbl_get(void *h, const char *key)
{
khash_t(str) *tbl = (khash_t(str)*)h;
void sam_header_free(void *header);
char *sam_header_write(const void *headerDict); // returns a newly allocated string
+ /*
+ // Usage example
+ const char *key, *val;
+ void *iter = sam_header_parse2(bam->header->text);
+ while ( (iter = sam_header2key_val(iter, "RG","ID","SM", &key,&val)) ) printf("%s\t%s\n", key,val);
+ */
+ void *sam_header2key_val(void *iter, const char type[2], const char key_tag[2], const char value_tag[2], const char **key, const char **value);
char **sam_header2list(const void *_dict, char type[2], char key_tag[2], int *_n);
void *sam_header2tbl(const void *dict, char type[2], char key_tag[2], char value_tag[2]);
// data passed to the bam_fetch callback is encapsulated in this struct.
typedef struct {
bam_header_t *header;
- int *count;
+ int64_t *count; // int does overflow for very big BAMs
} count_func_data_t;
typedef khash_t(rg) *rghash_t;
{
int c, is_header = 0, is_header_only = 0, is_bamin = 1, ret = 0, compress_level = -1, is_bamout = 0, is_count = 0;
int of_type = BAM_OFDEC, is_long_help = 0, n_threads = 0;
- int count = 0;
+ int64_t count = 0;
samfile_t *in = 0, *out = 0;
char in_mode[5], out_mode[5], *fn_out = 0, *fn_list = 0, *fn_ref = 0, *fn_rg = 0, *q;
view_end:
if (is_count && ret == 0) {
- printf("%d\n", count);
+ printf("%ld\n", count); // compilers on some platforms may complain about printing int64_t with %ld
}
// close files, free and return
free(fn_list); free(fn_ref); free(fn_out); free(g_library); free(g_rg); free(fn_rg);