+
+/*
+ * Randomly permute the per-sample data of one BCF record.
+ *
+ * A random permutation of the b->n_smpl sample indices is drawn with
+ * drand48(). Then, for every genotype field in b->gi, the gi->len bytes
+ * belonging to sample i are moved to slot perm[i] of a newly allocated
+ * buffer, which replaces gi->data (the old buffer is freed).
+ *
+ * b:    record whose genotype fields are shuffled in place.
+ * seed: if positive, srand48(seed) is called first; otherwise the current
+ *       drand48() state is used unchanged.
+ *
+ * Returns 0 on success and -1 if a memory allocation fails. On failure the
+ * fields handled before the failing allocation have already been permuted,
+ * so the record is no longer consistent and should be discarded.
+ *
+ * FIXME: only data are shuffled; the header is NOT
+ */
+int bcf_shuffle(bcf1_t *b, int seed)
+{
+    int i, j, *perm;
+    if (seed > 0) srand48(seed);
+    if (b->n_smpl < 2) return 0; // zero or one sample: every permutation is the identity
+    perm = malloc((size_t)b->n_smpl * sizeof *perm);
+    if (perm == NULL) return -1;
+    for (i = 0; i < b->n_smpl; ++i) perm[i] = i;
+    // Fisher-Yates shuffle: swap the last unfixed entry with a random entry at or before it
+    for (i = b->n_smpl; i > 1; --i) {
+        int tmp;
+        j = (int)(drand48() * i); // drand48() is in [0,1), hence 0 <= j < i
+        tmp = perm[j]; perm[j] = perm[i-1]; perm[i-1] = tmp;
+    }
+    for (j = 0; j < b->n_gi; ++j) {
+        bcf_ginfo_t *gi = b->gi + j;
+        size_t len = (size_t)gi->len;
+        uint8_t *swap, *data = (uint8_t*)gi->data;
+        if (len == 0) continue; // nothing to move for an empty field
+        swap = malloc(len * (size_t)b->n_smpl);
+        if (swap == NULL) {
+            free(perm);
+            return -1;
+        }
+        for (i = 0; i < b->n_smpl; ++i)
+            memcpy(swap + len * (size_t)perm[i], data + len * (size_t)i, len);
+        free(gi->data);
+        gi->data = swap;
+    }
+    free(perm);
+    return 0;
+}
+
+/*
+ * Build a new header that keeps only the requested samples.
+ *
+ * h0:      source header; it is not modified.
+ * n:       number of entries in samples.
+ * samples: names of the samples to keep.
+ * list:    output array with room for at least n ints. For every requested
+ *          name that exists in h0->sns, the index of that sample in h0 is
+ *          appended, in the order the names appear in samples. The number
+ *          of entries written is not returned by this function.
+ *
+ * Names that are not present in h0 are skipped and a warning with their
+ * count is printed to stderr.
+ *
+ * The returned header owns private copies of the name and txt buffers of
+ * h0 and a sname buffer made of the matched names, each NUL-terminated.
+ * bcf_hdr_sync() is called on it before returning (presumably to rebuild
+ * the derived fields such as ns/sns -- confirm against its definition).
+ *
+ * Returns the new header, or NULL if a memory allocation fails.
+ */
+bcf_hdr_t *bcf_hdr_subsam(const bcf_hdr_t *h0, int n, char *const* samples, int *list)
+{
+    int i, ret, j;
+    khint_t k;
+    bcf_hdr_t *h;
+    khash_t(str2id) *hash;
+    kstring_t s;
+    s.l = s.m = 0; s.s = 0;
+    // index the sample names of h0 by their position
+    hash = kh_init(str2id);
+    for (i = 0; i < h0->n_smpl; ++i) {
+        k = kh_put(str2id, hash, h0->sns[i], &ret);
+        kh_val(hash, k) = i;
+    }
+    // keep the requested names that exist, recording where each came from
+    for (i = j = 0; i < n; ++i) {
+        k = kh_get(str2id, hash, samples[i]);
+        if (k != kh_end(hash)) {
+            list[j++] = kh_val(hash, k);
+            kputs(samples[i], &s); kputc('\0', &s);
+        }
+    }
+    if (j < n) fprintf(stderr, "<%s> %d samples in the list but not in BCF.\n", __func__, n - j);
+    kh_destroy(str2id, hash);
+    h = calloc(1, sizeof(bcf_hdr_t));
+    if (h == NULL) goto fail;
+    *h = *h0;
+    h->ns = 0; h->sns = 0;
+    // drop the pointers copied from h0 so the failure path never frees h0's buffers
+    h->name = 0; h->txt = 0; h->sname = 0;
+    h->name = malloc(h->l_nm);
+    if (h->name == NULL && h->l_nm > 0) goto fail; // malloc(0) may legitimately return NULL
+    if (h->l_nm > 0) memcpy(h->name, h0->name, h->l_nm);
+    h->txt = calloc(1, h->l_txt + 1); // one extra zero byte keeps txt NUL-terminated
+    if (h->txt == NULL) goto fail;
+    if (h->l_txt > 0) memcpy(h->txt, h0->txt, h->l_txt);
+    h->l_smpl = s.l; h->sname = s.s; // the header takes ownership of the kstring buffer
+    bcf_hdr_sync(h);
+    return h;
+fail:
+    free(s.s);
+    if (h != NULL) {
+        free(h->name);
+        free(h->txt);
+        free(h);
+    }
+    return 0;
+}
+
+/*
+ * Reduce a BCF record to a subset of its samples.
+ *
+ * n_smpl: number of samples to keep.
+ * list:   n_smpl indices into the current samples of b; output sample i
+ *         receives the data of input sample list[i]. Indices may repeat.
+ * b:      record to modify. For every genotype field, gi->data is replaced
+ *         by a new buffer holding n_smpl entries of gi->len bytes, and
+ *         b->n_smpl is set to n_smpl.
+ *
+ * Returns 0 on success. Returns -1 without touching b if n_smpl is
+ * negative or any index lies outside [0, b->n_smpl). Returns -1 if a
+ * memory allocation fails; in that case the fields handled before the
+ * failure are already reduced while b->n_smpl is unchanged, so the record
+ * should be discarded.
+ */
+int bcf_subsam(int n_smpl, int *list, bcf1_t *b)
+{
+    int i, j;
+    if (n_smpl < 0) return -1;
+    // validate every index first so a bad list cannot cause an out-of-bounds read
+    for (i = 0; i < n_smpl; ++i)
+        if (list[i] < 0 || list[i] >= b->n_smpl) return -1;
+    for (j = 0; j < b->n_gi; ++j) {
+        bcf_ginfo_t *gi = b->gi + j;
+        size_t len = (size_t)gi->len;
+        size_t size = len * (size_t)n_smpl; // sized by the NEW sample count, which may exceed the old one
+        uint8_t *swap;
+        swap = malloc(size ? size : 1); // never request 0 bytes so that NULL always means failure
+        if (swap == NULL) return -1;
+        for (i = 0; i < n_smpl; ++i)
+            memcpy(swap + (size_t)i * len, (uint8_t*)gi->data + (size_t)list[i] * len, len);
+        free(gi->data);
+        gi->data = swap;
+    }
+    b->n_smpl = n_smpl;
+    return 0;
+}
+
+static int8_t nt4_table[128] = { // 7-bit ASCII -> base code: A/a=0, C/c=1, G/g=2, T/t=3, X/x=-1, everything else 4
+ 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, // 0x00-0x0f
+ 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, // 0x10-0x1f
+ 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4 /*'-'*/, 4, 4, // 0x20-0x2f; '-' is not special here, it maps to 4
+ 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, // 0x30-0x3f
+ 4, 0, 4, 1, 4, 4, 4, 2, 4, 4, 4, 4, 4, 4, 4, 4, // '@'..'O': A=0, C=1, G=2
+ 4, 4, 4, 4, 3, 4, 4, 4, -1, 4, 4, 4, 4, 4, 4, 4, // 'P'..'_': T=3, X=-1
+ 4, 0, 4, 1, 4, 4, 4, 2, 4, 4, 4, 4, 4, 4, 4, 4, // '`'..'o': a=0, c=1, g=2
+ 4, 4, 4, 4, 3, 4, 4, 4, -1, 4, 4, 4, 4, 4, 4, 4 // 'p'..0x7f: t=3, x=-1
+};
+
+/* Look up the nt4_table code of byte c; bytes outside the 128-entry table count as 4 ("other"). */
+static int bcf_gl10_code(char c)
+{
+    unsigned char u = (unsigned char)c;
+    return u < 128 ? nt4_table[u] : 4;
+}
+
+/*
+ * Extract the 10 diploid genotype likelihoods over A/C/G/T for each sample
+ * of a single-base record.
+ *
+ * b:  record to read. REF must be exactly one of A/C/G/T (either case);
+ *     there must be at least one ALT allele, every ALT allele examined must
+ *     be a single character among A/C/G/T/X, and b->n_alleles must not
+ *     exceed 4. The record must carry a "PL" genotype field.
+ * gl: output, 10 bytes per sample (10 * b->n_smpl bytes in total). For each
+ *     sample the genotypes are written in the order
+ *     AA, AC, AG, AT, CC, CG, CT, GG, GT, TT.
+ *
+ * Each base is mapped to its allele number in the record (REF = 0, the
+ * k-th ALT = k). A base that is not among REF/ALT is mapped to the allele
+ * whose ALT character is X, if there is one. The value for a genotype is
+ * read from PL at offset y*(y+1)/2 + x, where x <= y are the two allele
+ * numbers. When a base has no allele at all, or the offset lies beyond
+ * PL->len, 255 is stored (the same filler bcf_gl10_indel uses).
+ *
+ * Returns 0 on success and -1 if the record does not meet the
+ * requirements above; gl is not written in that case.
+ */
+int bcf_gl10(const bcf1_t *b, uint8_t *gl)
+{
+    int code, k, l, map[4], k1, j, i;
+    const bcf_ginfo_t *PL;
+    char *s;
+    // test ref[0] first so ref[1] is never read past the end of an empty string
+    if (b->ref[0] == 0 || b->ref[1] != 0 || b->n_alleles > 4) return -1; // ref is not a single base or >4 alleles
+    for (i = 0; i < b->n_gi; ++i)
+        if (b->gi[i].fmt == bcf_str2int("PL", 2)) break;
+    if (i == b->n_gi) return -1; // no PL
+    PL = b->gi + i;
+    code = bcf_gl10_code(b->ref[0]);
+    if (code > 3 || code < 0) return -1; // ref is not A/C/G/T
+    if (b->alt[0] == 0) return -1; // no alternate allele
+    map[0] = map[1] = map[2] = map[3] = -2; // -2: base not seen yet
+    map[code] = 0;
+    for (k = 0, s = b->alt, k1 = -1; k < 3 && *s; ++k, s += 2) {
+        if (s[1] != ',' && s[1] != 0) return -1; // ALT is not single base
+        code = bcf_gl10_code(*s);
+        if (code > 3) return -1; // ALT is not A/C/G/T/X; code 4 would index past the end of map[]
+        if (code >= 0) map[code] = k+1;
+        else k1 = k + 1; // negative code (X): remember which allele it is
+        if (s[1] == 0) break; // the end of the ALT string
+    }
+    // bases absent from REF/ALT fall back to the X allele, or stay negative (-1) if there is none
+    for (k = 0; k < 4; ++k)
+        if (map[k] < 0) map[k] = k1;
+    for (i = 0; i < b->n_smpl; ++i) {
+        const uint8_t *p = (const uint8_t*)PL->data + i * PL->len; // the PL for the i-th individual
+        uint8_t *g = gl + 10 * i;
+        for (k = j = 0; k < 4; ++k) {
+            for (l = k; l < 4; ++l) {
+                int t, x = map[k], y = map[l];
+                if (x > y) t = x, x = y, y = t; // make sure x is the smaller
+                t = y * (y+1) / 2 + x;
+                // x < 0 means a base with no allele; never index PL with it
+                g[j++] = (x >= 0 && t < PL->len)? p[t] : 255;
+            }
+        }
+    }
+    return 0;
+}
+
+/*
+ * Copy, for every sample, the PL values of the 10 genotypes that can be
+ * formed from alleles 0..3 into gl (10 bytes per sample).
+ *
+ * The genotypes are emitted in the order (0,0) (0,1) (0,2) (0,3) (1,1)
+ * (1,2) (1,3) (2,2) (2,3) (3,3); the value for the pair (x,y), x <= y, is
+ * read from offset y*(y+1)/2 + x of the sample's PL entry. Offsets that
+ * are not below PL->len produce 255.
+ *
+ * Returns -1 when the record has no alternate allele or no "PL" field,
+ * and 0 otherwise.
+ */
+int bcf_gl10_indel(const bcf1_t *b, uint8_t *gl)
+{
+    /* y*(y+1)/2 + x for the ten pairs, in output order */
+    static const int pl_offset[10] = { 0, 1, 3, 6, 2, 4, 7, 5, 8, 9 };
+    const bcf_ginfo_t *PL;
+    int fld, smpl, slot;
+
+    if (b->alt[0] == 0) return -1; // no alternate allele
+
+    /* find the PL genotype field */
+    fld = 0;
+    while (fld < b->n_gi && b->gi[fld].fmt != bcf_str2int("PL", 2))
+        ++fld;
+    if (fld == b->n_gi) return -1; // no PL
+    PL = b->gi + fld;
+
+    for (smpl = 0; smpl < b->n_smpl; ++smpl) {
+        const uint8_t *src = PL->data + smpl * PL->len; // PL entry of this sample
+        uint8_t *dst = gl + 10 * smpl;
+        for (slot = 0; slot < 10; ++slot) {
+            int off = pl_offset[slot];
+            if (off < PL->len)
+                dst[slot] = src[off];
+            else
+                dst[slot] = 255;
+        }
+    }
+    return 0;
+}