for (j = 0; j < 4; ++j)
qsum[j] += calls[i].qsum[j];
int qsum_tot=0;
- for (j=0; j<4; j++) qsum_tot += qsum[j];
+ for (j=0; j<4; j++) { qsum_tot += qsum[j]; call->qsum[j] = 0; }
for (j = 0; j < 4; ++j) qsum[j] = qsum[j] << 2 | j;
// find the top 2 alleles
for (i = 1; i < 4; ++i) // insertion sort
if ((qsum[i]&3) != ref4) {
if (qsum[i]>>2 != 0)
{
- call->qsum[j] = (float)(qsum[i]>>2)/qsum_tot;
+ if ( j<4 ) call->qsum[j] = (float)(qsum[i]>>2)/qsum_tot; // ref N can make j>=4
call->a[j++] = qsum[i]&3;
}
else break;
}
kputc('\0', &s);
// INFO
- if (bc->ori_ref < 0) kputs("INDEL;", &s);
+ if (bc->ori_ref < 0) ksprintf(&s,"INDEL;IS=%d,%f;", bca->max_support, bca->max_frac);
kputs("DP=", &s); kputw(bc->ori_depth, &s); kputs(";I16=", &s);
for (i = 0; i < 16; ++i) {
if (i) kputc(',', &s);
typedef struct __bcf_callaux_t {
int capQ, min_baseQ;
int openQ, extQ, tandemQ; // for indels
- int min_support; // for collecting indel candidates
- double min_frac; // for collecting indel candidates
+ int min_support, max_support; // for collecting indel candidates
+ double min_frac, max_frac; // for collecting indel candidates
int per_sample_flt; // indel filtering strategy
// for internal uses
int max_bases;
if (s == n) return -1; // there is no indel at this position.
for (s = N = 0; s < n; ++s) N += n_plp[s]; // N is the total number of reads
{ // find out how many types of indels are present
+ bca->max_support = bca->max_frac = 0;
int m, n_alt = 0, n_tot = 0, indel_support_ok = 0;
uint32_t *aux;
aux = calloc(N + 1, 4);
j = bam_cigar2qlen(&p->b->core, bam1_cigar(p->b));
if (j > max_rd_len) max_rd_len = j;
}
- if ( !indel_support_ok && na >= bca->min_support && (double)na/nt >= bca->min_frac )
+ float frac = (float)na/nt;
+ if ( !indel_support_ok && na >= bca->min_support && frac >= bca->min_frac )
indel_support_ok = 1;
+ if ( na > bca->max_support && frac > 0 ) bca->max_support = na, bca->max_frac = frac;
n_alt += na;
n_tot += nt;
}
// squeeze out identical types
for (i = 1, n_types = 1; i < m; ++i)
if (aux[i] != aux[i-1]) ++n_types;
- // Taking totals makes hard to call rare indels
- if ( !bca->per_sample_flt )
- indel_support_ok = ( (double)n_alt / n_tot < bca->min_frac || n_alt < bca->min_support ) ? 0 : 1;
- if ( n_types == 1 || !indel_support_ok ) { // then skip
- free(aux); return -1;
- }
+ // Taking totals makes it hard to call rare indels
+ if ( !bca->per_sample_flt )
+ indel_support_ok = ( (float)n_alt / n_tot < bca->min_frac || n_alt < bca->min_support ) ? 0 : 1;
+ if ( n_types == 1 || !indel_support_ok ) { // then skip
+ free(aux); return -1;
+ }
if (n_types >= 64) {
free(aux);
if (bam_verbose >= 2)
return 0;
}
+int remove_tag(char *str, char *tag, char delim)
+{
+ char *tmp = str, *p;
+ int len_diff = 0, ori_len = strlen(str);
+ while ( *tmp && (p = strstr(tmp,tag)) )
+ {
+ if ( p>str )
+ {
+ if ( *(p-1)!=delim ) { tmp=p+1; continue; } // shared substring
+ p--;
+ }
+ char *q=p+1;
+ while ( *q && *q!=delim ) q++;
+ if ( p==str && *q ) q++;
+ len_diff += q-p;
+ if ( ! *q ) { *p = 0; break; }
+ else
+ memmove(p,q,ori_len-(int)(q-p)+1);
+ }
+ if ( len_diff==ori_len )
+ str[0]='.', str[1]=0, len_diff--;
+
+ return len_diff;
+}
+
int bcf_cpy(bcf1_t *r, const bcf1_t *b)
{
char *t1 = r->str;
char *bcf_fmt(const bcf_hdr_t *h, bcf1_t *b);
// append more info
int bcf_append_info(bcf1_t *b, const char *info, int l);
+ // remove tag
+ int remove_tag(char *string, char *tag, char delim);
// copy
int bcf_cpy(bcf1_t *r, const bcf1_t *b);
if ( b->n_alleles <= nals ) return;
-#if DBG
- printf("fit_alt: %s, %s, mask=%d, nals=%d: ", b->ref, b->alt, mask, nals);
- for (i=0; i<sizeof(int); i++)
- if ( mask&1<<i) printf(" %d", i);
- printf("\n");
-#endif
// update ALT, in principle any of the alleles can be removed
char *p;
if ( nals>1 )
p = dst;
}
else p = b->alt, *p = '\0';
-#if DBG
- printf("fit_alt: %s, mask=%d\n", b->alt, mask);
-#endif
p++;
memmove(p, b->flt, b->str + b->l_str - b->flt);
b->l_str -= b->flt - p;
}
free(map);
b->n_alleles = nals;
+ bcf_sync(b);
}
int bcf_shrink_alt(bcf1_t *b, int n)
kputs("##INFO=<ID=AF1,Number=1,Type=Float,Description=\"Max-likelihood estimate of the first ALT allele frequency (assuming HWE)\">\n", &str);
if (!strstr(str.s, "##INFO=<ID=AC1,"))
kputs("##INFO=<ID=AC1,Number=1,Type=Float,Description=\"Max-likelihood estimate of the first ALT allele count (no HWE assumption)\">\n", &str);
+ if (!strstr(str.s, "##INFO=<ID=AN,"))
+ kputs("##INFO=<ID=AN,Number=1,Type=Integer,Description=\"Total number of alleles in called genotypes\">\n", &str);
+ if (!strstr(str.s, "##INFO=<ID=IS,"))
+ kputs("##INFO=<ID=IS,Number=2,Type=Float,Description=\"Maximum number of reads supporting an indel and fraction of indel reads\">\n", &str);
+ if (!strstr(str.s, "##INFO=<ID=AC,"))
+ kputs("##INFO=<ID=AC,Number=A,Type=Integer,Description=\"Allele count in genotypes for each ALT allele, in the same order as listed\">\n", &str);
if (!strstr(str.s, "##INFO=<ID=G3,"))
kputs("##INFO=<ID=G3,Number=3,Type=Float,Description=\"ML estimate of genotype frequencies\">\n", &str);
if (!strstr(str.s, "##INFO=<ID=HWE,"))
if ( !(vc.flag&VC_KEEPALT) && vc.flag&VC_CALL && vc.min_ma_lrt>=0 )
{
int gts = call_multiallelic_gt(b,p1,vc.min_ma_lrt);
- if ( gts==0 && vc.flag & VC_VARONLY ) continue;
+ if ( gts<=1 && vc.flag & VC_VARONLY ) continue;
}
else if (vc.flag & VC_CALL) { // call variants
bcf_p1rst_t pr;
if ( nals==1 ) return 1;
- if ( nals>4 ) { fprintf(stderr,"too many alts: %d\n", nals); exit(1); }
+ if ( nals>4 )
+ {
+ if ( *b->ref=='N' ) return 0;
+ fprintf(stderr,"Not ready for this, more than 4 alleles at %d: %s, %s\n", b->pos+1, b->ref,b->alt);
+ exit(1);
+ }
// set PL and PL_len
uint8_t *pl = NULL;
if ( max_lk<lk_tot ) { max_lk2 = max_lk; max_als2 = max_als; max_lk = lk_tot; max_als = 1<<ia|1<<ib|1<<ic; }
else if ( max_lk2<lk_tot ) { max_lk2 = lk_tot; max_als2 = 1<<ia|1<<ib|1<<ic; }
lk_sum = lk_tot>lk_sum ? lk_tot + log(1+exp(lk_sum-lk_tot)) : lk_sum + log(1+exp(lk_tot-lk_sum));
+ //printf(" %f\n", lk_sum);
}
}
}
max_als = max_als2;
}
- // Get the BCF record ready for output
+ // Get the BCF record ready for GT and GQ
kstring_t s;
- memset(&s, 0, sizeof(kstring_t));
- kputc('\0', &s); kputs(b->ref, &s); kputc('\0', &s);
- kputs(b->alt, &s); kputc('\0', &s); kputc('\0', &s);
- kputs(b->info, &s); if (b->info[0]) kputc(';', &s); kputc('\0', &s);
- kputs(b->fmt, &s); kputc('\0', &s);
- free(b->str);
- b->m_str = s.m; b->l_str = s.l; b->str = s.s;
- b->qual = -4.343*(log(1-exp(max_lk-lk_sum)));
- if ( b->qual>999 ) b->qual = 999;
- //bcf_sync(b);
-
- int x, old_n_gi = b->n_gi;
+ int old_n_gi = b->n_gi;
s.m = b->m_str; s.l = b->l_str - 1; s.s = b->str;
kputs(":GT:GQ", &s); kputc('\0', &s);
b->m_str = s.m; b->l_str = s.l; b->str = s.s;
bcf_sync(b);
- // Call GT
- int isample, gts=0;
+ // Call GTs
+ int isample, gts=0, ac[4] = {0,0,0,0};
for (isample = 0; isample < b->n_smpl; isample++)
{
int ploidy = b->ploidy ? b->ploidy[isample] : 2;
lk = -log(1-lk/lk_sum)/0.2302585;
if ( idp>=0 && ((uint16_t*)b->gi[idp].data)[isample]==0 )
{
- als |= 1<<7;
- lk = 0;
+ ((uint8_t*)b->gi[old_n_gi].data)[isample] = als | 1<<7;
+ ((uint8_t*)b->gi[old_n_gi+1].data)[isample] = 0;
+ continue;
}
- ((uint8_t*)b->gi[old_n_gi].data)[isample] = als;
+ ((uint8_t*)b->gi[old_n_gi].data)[isample] = als;
((uint8_t*)b->gi[old_n_gi+1].data)[isample] = lk<100 ? (int)lk : 99;
- gts |= (als>>3&7) | (als&7);
+ gts |= 1<<(als>>3&7) | 1<<(als&7);
+ ac[ als>>3&7 ]++;
+ ac[ als&7 ]++;
}
bcf_fit_alt(b,max_als);
+
+ // prepare ref, alt, filter, info, format
+ memset(&s, 0, sizeof(kstring_t)); kputc('\0', &s);
+ kputs(b->ref, &s); kputc('\0', &s);
+ kputs(b->alt, &s); kputc('\0', &s); kputc('\0', &s);
+ {
+ kstring_t info; memset(&info, 0, sizeof(kstring_t));
+ int an=0, nalts=0;
+ for (i=0; i<nals; i++)
+ {
+ an += ac[i];
+ if ( i>0 && ac[i] ) nalts++;
+ }
+ ksprintf(&info, "AN=%d;", an);
+ if ( nalts )
+ {
+ kputs("AC=", &info);
+ for (i=1; i<nals; i++)
+ {
+ if ( !(gts&1<<i) ) continue;
+ nalts--;
+ ksprintf(&info,"%d", ac[i]);
+ if ( nalts>0 ) kputc(',', &info);
+ }
+ kputc(';', &info);
+ }
+ kputs(b->info, &info);
+ info.l -= remove_tag(info.s, "I16=", ';');
+ info.l -= remove_tag(info.s, "QS=", ';');
+ kputc('\0', &info);
+ kputs(info.s, &s); kputc('\0', &s);
+ free(info.s);
+ }
+ kputs(b->fmt, &s); kputc('\0', &s);
+ free(b->str);
+ b->m_str = s.m; b->l_str = s.l; b->str = s.s;
+ b->qual = -4.343*(log(1-exp(max_lk-lk_sum)));
+ if ( b->qual>999 ) b->qual = 999;
bcf_sync(b);
+
free(pdg);
return gts;
}