6 KHASH_MAP_INIT_STR(str2id, int)
8 // FIXME: valgrind report a memory leak in this function. Probably it does not get deallocated...
9 void *bcf_build_refhash(bcf_hdr_t *h)
11 khash_t(str2id) *hash;
13 hash = kh_init(str2id);
14 for (i = 0; i < h->n_ref; ++i) {
16 k = kh_put(str2id, hash, h->ns[i], &ret); // FIXME: check ret
22 void *bcf_str2id_init()
24 return kh_init(str2id);
27 void bcf_str2id_destroy(void *_hash)
29 khash_t(str2id) *hash = (khash_t(str2id)*)_hash;
30 if (hash) kh_destroy(str2id, hash); // Note that strings are not freed.
33 void bcf_str2id_thorough_destroy(void *_hash)
35 khash_t(str2id) *hash = (khash_t(str2id)*)_hash;
37 if (hash == 0) return;
38 for (k = 0; k < kh_end(hash); ++k)
39 if (kh_exist(hash, k)) free((char*)kh_key(hash, k));
40 kh_destroy(str2id, hash);
43 int bcf_str2id(void *_hash, const char *str)
45 khash_t(str2id) *hash = (khash_t(str2id)*)_hash;
48 k = kh_get(str2id, hash, str);
49 return k == kh_end(hash)? -1 : kh_val(hash, k);
52 int bcf_str2id_add(void *_hash, const char *str)
56 khash_t(str2id) *hash = (khash_t(str2id)*)_hash;
58 k = kh_put(str2id, hash, str, &ret);
59 if (ret == 0) return kh_val(hash, k);
60 kh_val(hash, k) = kh_size(hash) - 1;
61 return kh_val(hash, k);
64 int bcf_shrink_alt(bcf1_t *b, int n)
67 int i, j, k, n_smpl = b->n_smpl;
68 if (b->n_alleles <= n) return -1;
71 for (p = b->alt, k = 1; *p; ++p)
72 if (*p == ',' && ++k == n) break;
74 } else p = b->alt, *p = '\0';
76 memmove(p, b->flt, b->str + b->l_str - b->flt);
77 b->l_str -= b->flt - p;
79 for (i = 0; i < b->n_gi; ++i) {
80 bcf_ginfo_t *g = b->gi + i;
81 if (g->fmt == bcf_str2int("PL", 2)) {
82 int l, x = b->n_alleles * (b->n_alleles + 1) / 2;
83 uint8_t *d = (uint8_t*)g->data;
84 g->len = n * (n + 1) / 2;
85 for (l = k = 0; l < n_smpl; ++l) {
86 uint8_t *dl = d + l * x;
87 for (j = 0; j < g->len; ++j) d[k++] = dl[j];
96 int bcf_gl2pl(bcf1_t *b)
99 int i, n_smpl = b->n_smpl;
103 if (strstr(b->fmt, "PL")) return -1;
104 if ((p = strstr(b->fmt, "GL")) == 0) return -1;
106 for (i = 0; i < b->n_gi; ++i)
107 if (b->gi[i].fmt == bcf_str2int("GL", 2))
110 g->fmt = bcf_str2int("PL", 2);
111 g->len /= 4; // 4 == sizeof(float)
112 d0 = (float*)g->data; d1 = (uint8_t*)g->data;
113 for (i = 0; i < n_smpl * g->len; ++i) {
114 int x = (int)(-10. * d0[i] + .499);
115 if (x > 255) x = 255;
121 /* FIXME: this function will fail given AB:GTX:GT. BCFtools never
122 * produces such FMT, but others may do. */
123 int bcf_fix_gt(bcf1_t *b)
129 // check the presence of the GT FMT
130 if ((s = strstr(b->fmt, ":GT")) == 0) return 0; // no GT or GT is already the first
131 if (s[3] != '\0' && s[3] != ':') return 0; // :GTX in fact
132 tmp = bcf_str2int("GT", 2);
133 for (i = 0; i < b->n_gi; ++i)
134 if (b->gi[i].fmt == tmp) break;
135 if (i == b->n_gi) return 0; // no GT in b->gi; probably a bug...
137 // move GT to the first
138 for (; i > 0; --i) b->gi[i] = b->gi[i-1];
140 memmove(b->fmt + 3, b->fmt, s + 1 - b->fmt);
141 b->fmt[0] = 'G'; b->fmt[1] = 'T'; b->fmt[2] = ':';
145 int bcf_fix_pl(bcf1_t *b)
152 tmp = bcf_str2int("PL", 2);
153 for (i = 0; i < b->n_gi; ++i)
154 if (b->gi[i].fmt == tmp) break;
155 if (i == b->n_gi) return 0;
158 PL = (uint8_t*)gi->data;
159 swap = alloca(gi->len);
160 // loop through individuals
161 for (i = 0; i < b->n_smpl; ++i) {
163 uint8_t *PLi = PL + i * gi->len;
164 memcpy(swap, PLi, gi->len);
165 for (k = x = 0; k < b->n_alleles; ++k)
166 for (l = k; l < b->n_alleles; ++l)
167 PLi[l*(l+1)/2 + k] = swap[x++];
172 int bcf_smpl_covered(const bcf1_t *b)
178 tmp = bcf_str2int("PL", 2);
179 for (i = 0; i < b->n_gi; ++i)
180 if (b->gi[i].fmt == tmp) break;
181 if (i == b->n_gi) return 0;
182 // count how many samples having PL!=[0..0]
184 for (i = 0; i < b->n_smpl; ++i) {
185 uint8_t *PLi = ((uint8_t*)gi->data) + i * gi->len;
186 for (j = 0; j < gi->len; ++j)
188 if (j < gi->len) ++n;
193 static void *locate_field(const bcf1_t *b, const char *fmt, int l)
197 tmp = bcf_str2int(fmt, l);
198 for (i = 0; i < b->n_gi; ++i)
199 if (b->gi[i].fmt == tmp) break;
200 return i == b->n_gi? 0 : b->gi[i].data;
203 int bcf_anno_max(bcf1_t *b)
205 int k, max_gq, max_sp, n_het;
209 max_gq = max_sp = n_het = 0;
210 gt = locate_field(b, "GT", 2);
211 if (gt == 0) return -1;
212 gq = locate_field(b, "GQ", 2);
213 sp = locate_field(b, "SP", 2);
215 for (k = 0; k < b->n_smpl; ++k)
217 max_sp = max_sp > (int)sp[k]? max_sp : sp[k];
219 for (k = 0; k < b->n_smpl; ++k)
221 max_gq = max_gq > (int)gq[k]? max_gq : gq[k];
222 for (k = 0; k < b->n_smpl; ++k) {
224 a1 = gt[k]&7; a2 = gt[k]>>3&7;
225 if ((!a1 && a2) || (!a2 && a1)) { // a het
226 if (gq == 0) ++n_het;
227 else if (gq[k] >= 20) ++n_het;
230 if (n_het) max_sp -= (int)(4.343 * log(n_het) + .499);
231 if (max_sp < 0) max_sp = 0;
232 memset(&str, 0, sizeof(kstring_t));
233 if (*b->info) kputc(';', &str);
234 ksprintf(&str, "MXSP=%d;MXGQ=%d", max_sp, max_gq);
235 bcf_append_info(b, str.s, str.l);
240 // FIXME: only data are shuffled; the header is NOT
241 int bcf_shuffle(bcf1_t *b, int seed)
244 if (seed > 0) srand48(seed);
245 a = malloc(b->n_smpl * sizeof(int));
246 for (i = 0; i < b->n_smpl; ++i) a[i] = i;
247 for (i = b->n_smpl; i > 1; --i) {
249 j = (int)(drand48() * i);
250 tmp = a[j]; a[j] = a[i-1]; a[i-1] = tmp;
252 for (j = 0; j < b->n_gi; ++j) {
253 bcf_ginfo_t *gi = b->gi + j;
254 uint8_t *swap, *data = (uint8_t*)gi->data;
255 swap = malloc(gi->len * b->n_smpl);
256 for (i = 0; i < b->n_smpl; ++i)
257 memcpy(swap + gi->len * a[i], data + gi->len * i, gi->len);
265 bcf_hdr_t *bcf_hdr_subsam(const bcf_hdr_t *h0, int n, char *const* samples, int *list)
270 khash_t(str2id) *hash;
272 s.l = s.m = 0; s.s = 0;
273 hash = kh_init(str2id);
274 for (i = 0; i < h0->n_smpl; ++i) {
275 k = kh_put(str2id, hash, h0->sns[i], &ret);
278 for (i = j = 0; i < n; ++i) {
279 k = kh_get(str2id, hash, samples[i]);
280 if (k != kh_end(hash)) {
281 list[j++] = kh_val(hash, k);
282 kputs(samples[i], &s); kputc('\0', &s);
285 if (j < n) fprintf(stderr, "<%s> %d samples in the list but not in BCF.", __func__, n - j);
286 kh_destroy(str2id, hash);
287 h = calloc(1, sizeof(bcf_hdr_t));
289 h->ns = 0; h->sns = 0;
290 h->name = malloc(h->l_nm); memcpy(h->name, h0->name, h->l_nm);
291 h->txt = calloc(1, h->l_txt + 1); memcpy(h->txt, h0->txt, h->l_txt);
292 h->l_smpl = s.l; h->sname = s.s;
297 int bcf_subsam(int n_smpl, int *list, bcf1_t *b)
300 for (j = 0; j < b->n_gi; ++j) {
301 bcf_ginfo_t *gi = b->gi + j;
303 swap = malloc(gi->len * b->n_smpl);
304 for (i = 0; i < n_smpl; ++i)
305 memcpy(swap + i * gi->len, (uint8_t*)gi->data + list[i] * gi->len, gi->len);
313 static int8_t nt4_table[128] = {
314 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
315 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
316 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4 /*'-'*/, 4, 4,
317 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
318 4, 0, 4, 1, 4, 4, 4, 2, 4, 4, 4, 4, 4, 4, 4, 4,
319 4, 4, 4, 4, 3, 4, 4, 4, -1, 4, 4, 4, 4, 4, 4, 4,
320 4, 0, 4, 1, 4, 4, 4, 2, 4, 4, 4, 4, 4, 4, 4, 4,
321 4, 4, 4, 4, 3, 4, 4, 4, -1, 4, 4, 4, 4, 4, 4, 4
324 int bcf_gl10(const bcf1_t *b, uint8_t *gl)
326 int a[4], k, l, map[4], k1, j, i;
327 const bcf_ginfo_t *PL;
329 if (b->ref[1] != 0 || b->n_alleles > 4) return -1; // ref is not a single base or >4 alleles
330 for (i = 0; i < b->n_gi; ++i)
331 if (b->gi[i].fmt == bcf_str2int("PL", 2)) break;
332 if (i == b->n_gi) return -1; // no PL
334 a[0] = nt4_table[(int)b->ref[0]];
335 if (a[0] > 3 || a[0] < 0) return -1; // ref is not A/C/G/T
336 a[1] = a[2] = a[3] = -2; // -1 has a special meaning
337 if (b->alt[0] == 0) return -1; // no alternate allele
338 map[0] = map[1] = map[2] = map[3] = -2;
340 for (k = 0, s = b->alt, k1 = -1; k < 3 && *s; ++k, s += 2) {
341 if (s[1] != ',' && s[1] != 0) return -1; // ALT is not single base
342 a[k+1] = nt4_table[(int)*s];
343 if (a[k+1] >= 0) map[a[k+1]] = k+1;
345 if (s[1] == 0) break; // the end of the ALT string
347 for (k = 0; k < 4; ++k)
348 if (map[k] < 0) map[k] = k1;
349 for (i = 0; i < b->n_smpl; ++i) {
350 const uint8_t *p = PL->data + i * PL->len; // the PL for the i-th individual
351 uint8_t *g = gl + 10 * i;
352 for (j = 0; j < PL->len; ++j)
354 for (k = j = 0; k < 4; ++k) {
355 for (l = k; l < 4; ++l) {
356 int t, x = map[k], y = map[l];
357 if (x > y) t = x, x = y, y = t; // make sure x is the smaller
358 g[j++] = p[y * (y+1) / 2 + x];