#include <string.h>
#include <math.h>
+#include <assert.h>
#include "bcf.h"
#include "kstring.h"
#include "khash.h"
KHASH_MAP_INIT_STR(str2id, int)
+#ifdef _WIN32
+#define srand48(x) srand(x)
+#define drand48() ((double)rand() / RAND_MAX)
+#endif
+
+// FIXME: valgrind report a memory leak in this function. Probably it does not get deallocated...
void *bcf_build_refhash(bcf_hdr_t *h)
{
khash_t(str2id) *hash;
return kh_val(hash, k);
}
+void bcf_fit_alt(bcf1_t *b, int mask)
+{
+ mask |= 1; // REF must be always present
+
+ int i,j,nals=0;
+ for (i=0; i<sizeof(int); i++)
+ if ( mask&1<<i) nals++;
+
+ if ( b->n_alleles <= nals ) return;
+
+ // update ALT, in principle any of the alleles can be removed
+ char *p;
+ if ( nals>1 )
+ {
+ char *dst, *src;
+ int n=0, nalts=nals-1;
+ for (src=dst=p=b->alt, i=1; *p; p++)
+ {
+ if ( *p!=',' ) continue;
+
+ if ( mask&1<<i )
+ {
+ n++;
+ if ( src!=dst )
+ {
+ memmove(dst,src,p-src);
+ dst += p-src;
+ }
+ else dst = p;
+ if ( n<nalts ) { *dst=','; dst++; }
+ }
+ i++;
+
+ if ( n>=nalts ) { *dst=0; break; }
+ src = p+1;
+ }
+ if ( n<nalts )
+ {
+ memmove(dst,src,p-src);
+ dst += p-src;
+ *dst = 0;
+ }
+ p = dst;
+ }
+ else p = b->alt, *p = '\0';
+ p++;
+ memmove(p, b->flt, b->str + b->l_str - b->flt);
+ b->l_str -= b->flt - p;
+
+ // update PL and GT
+ int ipl=-1, igt=-1;
+ for (i = 0; i < b->n_gi; ++i)
+ {
+ bcf_ginfo_t *g = b->gi + i;
+ if (g->fmt == bcf_str2int("PL", 2)) ipl = i;
+ if (g->fmt == bcf_str2int("GT", 2)) igt = i;
+ }
+
+ // .. create mapping between old and new indexes
+ int npl = nals * (nals+1) / 2;
+ int *map = malloc(sizeof(int)*(npl>b->n_alleles ? npl : b->n_alleles));
+ int kori=0,knew=0;
+ for (i=0; i<b->n_alleles; i++)
+ {
+ for (j=0; j<=i; j++)
+ {
+ int skip=0;
+ if ( i && !(mask&1<<i) ) skip=1;
+ if ( j && !(mask&1<<j) ) skip=1;
+ if ( !skip ) { map[knew++] = kori; }
+ kori++;
+ }
+ }
+ // .. apply to all samples
+ int n_smpl = b->n_smpl;
+ for (i = 0; i < b->n_gi; ++i)
+ {
+ bcf_ginfo_t *g = b->gi + i;
+ if (g->fmt == bcf_str2int("PL", 2))
+ {
+ g->len = npl;
+ uint8_t *d = (uint8_t*)g->data;
+ int ismpl, npl_ori = b->n_alleles * (b->n_alleles + 1) / 2;
+ for (knew=ismpl=0; ismpl<n_smpl; ismpl++)
+ {
+ uint8_t *dl = d + ismpl * npl_ori;
+ for (j=0; j<npl; j++) d[knew++] = dl[map[j]];
+ }
+ } // FIXME: to add GL
+ }
+ // update GTs
+ map[0] = 0;
+ for (i=1, knew=0; i<b->n_alleles; i++)
+ map[i] = mask&1<<i ? ++knew : -1;
+ for (i=0; i<n_smpl; i++)
+ {
+ uint8_t gt = ((uint8_t*)b->gi[igt].data)[i];
+ int a1 = (gt>>3)&7;
+ int a2 = gt&7;
+ assert( map[a1]>=0 && map[a2]>=0 );
+ ((uint8_t*)b->gi[igt].data)[i] = ((1<<7|1<<6)>) | map[a1]<<3 | map[a2];
+ }
+ free(map);
+ b->n_alleles = nals;
+ bcf_sync(b);
+}
+
int bcf_shrink_alt(bcf1_t *b, int n)
{
char *p;
b->n_smpl = n_smpl;
return 0;
}
+
+static int8_t nt4_table[128] = {
+ 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
+ 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
+ 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4 /*'-'*/, 4, 4,
+ 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
+ 4, 0, 4, 1, 4, 4, 4, 2, 4, 4, 4, 4, 4, 4, 4, 4,
+ 4, 4, 4, 4, 3, 4, 4, 4, -1, 4, 4, 4, 4, 4, 4, 4,
+ 4, 0, 4, 1, 4, 4, 4, 2, 4, 4, 4, 4, 4, 4, 4, 4,
+ 4, 4, 4, 4, 3, 4, 4, 4, -1, 4, 4, 4, 4, 4, 4, 4
+};
+
+int bcf_gl10(const bcf1_t *b, uint8_t *gl)
+{
+ int a[4], k, l, map[4], k1, j, i;
+ const bcf_ginfo_t *PL;
+ char *s;
+ if (b->ref[1] != 0 || b->n_alleles > 4) return -1; // ref is not a single base or >4 alleles
+ for (i = 0; i < b->n_gi; ++i)
+ if (b->gi[i].fmt == bcf_str2int("PL", 2)) break;
+ if (i == b->n_gi) return -1; // no PL
+ PL = b->gi + i;
+ a[0] = nt4_table[(int)b->ref[0]];
+ if (a[0] > 3 || a[0] < 0) return -1; // ref is not A/C/G/T
+ a[1] = a[2] = a[3] = -2; // -1 has a special meaning
+ if (b->alt[0] == 0) return -1; // no alternate allele
+ map[0] = map[1] = map[2] = map[3] = -2;
+ map[a[0]] = 0;
+ for (k = 0, s = b->alt, k1 = -1; k < 3 && *s; ++k, s += 2) {
+ if (s[1] != ',' && s[1] != 0) return -1; // ALT is not single base
+ a[k+1] = nt4_table[(int)*s];
+ if (a[k+1] >= 0) map[a[k+1]] = k+1;
+ else k1 = k + 1;
+ if (s[1] == 0) break; // the end of the ALT string
+ }
+ for (k = 0; k < 4; ++k)
+ if (map[k] < 0) map[k] = k1;
+ for (i = 0; i < b->n_smpl; ++i) {
+ const uint8_t *p = PL->data + i * PL->len; // the PL for the i-th individual
+ uint8_t *g = gl + 10 * i;
+ for (k = j = 0; k < 4; ++k) {
+ for (l = k; l < 4; ++l) {
+ int t, x = map[k], y = map[l];
+ if (x > y) t = x, x = y, y = t; // make sure x is the smaller
+ g[j++] = p[y * (y+1) / 2 + x];
+ }
+ }
+ }
+ return 0;
+}
+
+int bcf_gl10_indel(const bcf1_t *b, uint8_t *gl)
+{
+ int k, l, j, i;
+ const bcf_ginfo_t *PL;
+ if (b->alt[0] == 0) return -1; // no alternate allele
+ for (i = 0; i < b->n_gi; ++i)
+ if (b->gi[i].fmt == bcf_str2int("PL", 2)) break;
+ if (i == b->n_gi) return -1; // no PL
+ PL = b->gi + i;
+ for (i = 0; i < b->n_smpl; ++i) {
+ const uint8_t *p = PL->data + i * PL->len; // the PL for the i-th individual
+ uint8_t *g = gl + 10 * i;
+ for (k = j = 0; k < 4; ++k) {
+ for (l = k; l < 4; ++l) {
+ int t, x = k, y = l;
+ if (x > y) t = x, x = y, y = t; // make sure x is the smaller
+ x = y * (y+1) / 2 + x;
+ g[j++] = x < PL->len? p[x] : 255;
+ }
+ }
+ }
+ return 0;
+}