6 KHASH_MAP_INIT_STR(str2id, int)
9 #define srand48(x) srand(x)
10 #define drand48() ((double)rand() / RAND_MAX)
13 // FIXME: valgrind report a memory leak in this function. Probably it does not get deallocated...
14 void *bcf_build_refhash(bcf_hdr_t *h)
16 khash_t(str2id) *hash;
18 hash = kh_init(str2id);
19 for (i = 0; i < h->n_ref; ++i) {
21 k = kh_put(str2id, hash, h->ns[i], &ret); // FIXME: check ret
27 void *bcf_str2id_init()
29 return kh_init(str2id);
32 void bcf_str2id_destroy(void *_hash)
34 khash_t(str2id) *hash = (khash_t(str2id)*)_hash;
35 if (hash) kh_destroy(str2id, hash); // Note that strings are not freed.
38 void bcf_str2id_thorough_destroy(void *_hash)
40 khash_t(str2id) *hash = (khash_t(str2id)*)_hash;
42 if (hash == 0) return;
43 for (k = 0; k < kh_end(hash); ++k)
44 if (kh_exist(hash, k)) free((char*)kh_key(hash, k));
45 kh_destroy(str2id, hash);
48 int bcf_str2id(void *_hash, const char *str)
50 khash_t(str2id) *hash = (khash_t(str2id)*)_hash;
53 k = kh_get(str2id, hash, str);
54 return k == kh_end(hash)? -1 : kh_val(hash, k);
57 int bcf_str2id_add(void *_hash, const char *str)
61 khash_t(str2id) *hash = (khash_t(str2id)*)_hash;
63 k = kh_put(str2id, hash, str, &ret);
64 if (ret == 0) return kh_val(hash, k);
65 kh_val(hash, k) = kh_size(hash) - 1;
66 return kh_val(hash, k);
69 int bcf_shrink_alt(bcf1_t *b, int n)
72 int i, j, k, n_smpl = b->n_smpl;
73 if (b->n_alleles <= n) return -1;
76 for (p = b->alt, k = 1; *p; ++p)
77 if (*p == ',' && ++k == n) break;
79 } else p = b->alt, *p = '\0';
81 memmove(p, b->flt, b->str + b->l_str - b->flt);
82 b->l_str -= b->flt - p;
84 for (i = 0; i < b->n_gi; ++i) {
85 bcf_ginfo_t *g = b->gi + i;
86 if (g->fmt == bcf_str2int("PL", 2)) {
87 int l, x = b->n_alleles * (b->n_alleles + 1) / 2;
88 uint8_t *d = (uint8_t*)g->data;
89 g->len = n * (n + 1) / 2;
90 for (l = k = 0; l < n_smpl; ++l) {
91 uint8_t *dl = d + l * x;
92 for (j = 0; j < g->len; ++j) d[k++] = dl[j];
101 int bcf_gl2pl(bcf1_t *b)
104 int i, n_smpl = b->n_smpl;
108 if (strstr(b->fmt, "PL")) return -1;
109 if ((p = strstr(b->fmt, "GL")) == 0) return -1;
111 for (i = 0; i < b->n_gi; ++i)
112 if (b->gi[i].fmt == bcf_str2int("GL", 2))
115 g->fmt = bcf_str2int("PL", 2);
116 g->len /= 4; // 4 == sizeof(float)
117 d0 = (float*)g->data; d1 = (uint8_t*)g->data;
118 for (i = 0; i < n_smpl * g->len; ++i) {
119 int x = (int)(-10. * d0[i] + .499);
120 if (x > 255) x = 255;
126 /* FIXME: this function will fail given AB:GTX:GT. BCFtools never
127 * produces such FMT, but others may do. */
128 int bcf_fix_gt(bcf1_t *b)
134 // check the presence of the GT FMT
135 if ((s = strstr(b->fmt, ":GT")) == 0) return 0; // no GT or GT is already the first
136 if (s[3] != '\0' && s[3] != ':') return 0; // :GTX in fact
137 tmp = bcf_str2int("GT", 2);
138 for (i = 0; i < b->n_gi; ++i)
139 if (b->gi[i].fmt == tmp) break;
140 if (i == b->n_gi) return 0; // no GT in b->gi; probably a bug...
142 // move GT to the first
143 for (; i > 0; --i) b->gi[i] = b->gi[i-1];
145 memmove(b->fmt + 3, b->fmt, s + 1 - b->fmt);
146 b->fmt[0] = 'G'; b->fmt[1] = 'T'; b->fmt[2] = ':';
150 int bcf_fix_pl(bcf1_t *b)
157 tmp = bcf_str2int("PL", 2);
158 for (i = 0; i < b->n_gi; ++i)
159 if (b->gi[i].fmt == tmp) break;
160 if (i == b->n_gi) return 0;
163 PL = (uint8_t*)gi->data;
164 swap = alloca(gi->len);
165 // loop through individuals
166 for (i = 0; i < b->n_smpl; ++i) {
168 uint8_t *PLi = PL + i * gi->len;
169 memcpy(swap, PLi, gi->len);
170 for (k = x = 0; k < b->n_alleles; ++k)
171 for (l = k; l < b->n_alleles; ++l)
172 PLi[l*(l+1)/2 + k] = swap[x++];
177 int bcf_smpl_covered(const bcf1_t *b)
183 tmp = bcf_str2int("PL", 2);
184 for (i = 0; i < b->n_gi; ++i)
185 if (b->gi[i].fmt == tmp) break;
186 if (i == b->n_gi) return 0;
187 // count how many samples having PL!=[0..0]
189 for (i = 0; i < b->n_smpl; ++i) {
190 uint8_t *PLi = ((uint8_t*)gi->data) + i * gi->len;
191 for (j = 0; j < gi->len; ++j)
193 if (j < gi->len) ++n;
198 static void *locate_field(const bcf1_t *b, const char *fmt, int l)
202 tmp = bcf_str2int(fmt, l);
203 for (i = 0; i < b->n_gi; ++i)
204 if (b->gi[i].fmt == tmp) break;
205 return i == b->n_gi? 0 : b->gi[i].data;
208 int bcf_anno_max(bcf1_t *b)
210 int k, max_gq, max_sp, n_het;
214 max_gq = max_sp = n_het = 0;
215 gt = locate_field(b, "GT", 2);
216 if (gt == 0) return -1;
217 gq = locate_field(b, "GQ", 2);
218 sp = locate_field(b, "SP", 2);
220 for (k = 0; k < b->n_smpl; ++k)
222 max_sp = max_sp > (int)sp[k]? max_sp : sp[k];
224 for (k = 0; k < b->n_smpl; ++k)
226 max_gq = max_gq > (int)gq[k]? max_gq : gq[k];
227 for (k = 0; k < b->n_smpl; ++k) {
229 a1 = gt[k]&7; a2 = gt[k]>>3&7;
230 if ((!a1 && a2) || (!a2 && a1)) { // a het
231 if (gq == 0) ++n_het;
232 else if (gq[k] >= 20) ++n_het;
235 if (n_het) max_sp -= (int)(4.343 * log(n_het) + .499);
236 if (max_sp < 0) max_sp = 0;
237 memset(&str, 0, sizeof(kstring_t));
238 if (*b->info) kputc(';', &str);
239 ksprintf(&str, "MXSP=%d;MXGQ=%d", max_sp, max_gq);
240 bcf_append_info(b, str.s, str.l);
245 // FIXME: only data are shuffled; the header is NOT
246 int bcf_shuffle(bcf1_t *b, int seed)
249 if (seed > 0) srand48(seed);
250 a = malloc(b->n_smpl * sizeof(int));
251 for (i = 0; i < b->n_smpl; ++i) a[i] = i;
252 for (i = b->n_smpl; i > 1; --i) {
254 j = (int)(drand48() * i);
255 tmp = a[j]; a[j] = a[i-1]; a[i-1] = tmp;
257 for (j = 0; j < b->n_gi; ++j) {
258 bcf_ginfo_t *gi = b->gi + j;
259 uint8_t *swap, *data = (uint8_t*)gi->data;
260 swap = malloc(gi->len * b->n_smpl);
261 for (i = 0; i < b->n_smpl; ++i)
262 memcpy(swap + gi->len * a[i], data + gi->len * i, gi->len);
270 bcf_hdr_t *bcf_hdr_subsam(const bcf_hdr_t *h0, int n, char *const* samples, int *list)
275 khash_t(str2id) *hash;
277 s.l = s.m = 0; s.s = 0;
278 hash = kh_init(str2id);
279 for (i = 0; i < h0->n_smpl; ++i) {
280 k = kh_put(str2id, hash, h0->sns[i], &ret);
283 for (i = j = 0; i < n; ++i) {
284 k = kh_get(str2id, hash, samples[i]);
285 if (k != kh_end(hash)) {
286 list[j++] = kh_val(hash, k);
287 kputs(samples[i], &s); kputc('\0', &s);
290 if (j < n) fprintf(stderr, "<%s> %d samples in the list but not in BCF.", __func__, n - j);
291 kh_destroy(str2id, hash);
292 h = calloc(1, sizeof(bcf_hdr_t));
294 h->ns = 0; h->sns = 0;
295 h->name = malloc(h->l_nm); memcpy(h->name, h0->name, h->l_nm);
296 h->txt = calloc(1, h->l_txt + 1); memcpy(h->txt, h0->txt, h->l_txt);
297 h->l_smpl = s.l; h->sname = s.s;
302 int bcf_subsam(int n_smpl, int *list, bcf1_t *b)
305 for (j = 0; j < b->n_gi; ++j) {
306 bcf_ginfo_t *gi = b->gi + j;
308 swap = malloc(gi->len * b->n_smpl);
309 for (i = 0; i < n_smpl; ++i)
310 memcpy(swap + i * gi->len, (uint8_t*)gi->data + list[i] * gi->len, gi->len);
318 static int8_t nt4_table[128] = {
319 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
320 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
321 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4 /*'-'*/, 4, 4,
322 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
323 4, 0, 4, 1, 4, 4, 4, 2, 4, 4, 4, 4, 4, 4, 4, 4,
324 4, 4, 4, 4, 3, 4, 4, 4, -1, 4, 4, 4, 4, 4, 4, 4,
325 4, 0, 4, 1, 4, 4, 4, 2, 4, 4, 4, 4, 4, 4, 4, 4,
326 4, 4, 4, 4, 3, 4, 4, 4, -1, 4, 4, 4, 4, 4, 4, 4
329 int bcf_gl10(const bcf1_t *b, uint8_t *gl)
331 int a[4], k, l, map[4], k1, j, i;
332 const bcf_ginfo_t *PL;
334 if (b->ref[1] != 0 || b->n_alleles > 4) return -1; // ref is not a single base or >4 alleles
335 for (i = 0; i < b->n_gi; ++i)
336 if (b->gi[i].fmt == bcf_str2int("PL", 2)) break;
337 if (i == b->n_gi) return -1; // no PL
339 a[0] = nt4_table[(int)b->ref[0]];
340 if (a[0] > 3 || a[0] < 0) return -1; // ref is not A/C/G/T
341 a[1] = a[2] = a[3] = -2; // -1 has a special meaning
342 if (b->alt[0] == 0) return -1; // no alternate allele
343 map[0] = map[1] = map[2] = map[3] = -2;
345 for (k = 0, s = b->alt, k1 = -1; k < 3 && *s; ++k, s += 2) {
346 if (s[1] != ',' && s[1] != 0) return -1; // ALT is not single base
347 a[k+1] = nt4_table[(int)*s];
348 if (a[k+1] >= 0) map[a[k+1]] = k+1;
350 if (s[1] == 0) break; // the end of the ALT string
352 for (k = 0; k < 4; ++k)
353 if (map[k] < 0) map[k] = k1;
354 for (i = 0; i < b->n_smpl; ++i) {
355 const uint8_t *p = PL->data + i * PL->len; // the PL for the i-th individual
356 uint8_t *g = gl + 10 * i;
357 for (k = j = 0; k < 4; ++k) {
358 for (l = k; l < 4; ++l) {
359 int t, x = map[k], y = map[l];
360 if (x > y) t = x, x = y, y = t; // make sure x is the smaller
361 g[j++] = p[y * (y+1) / 2 + x];
368 int bcf_gl10_indel(const bcf1_t *b, uint8_t *gl)
371 const bcf_ginfo_t *PL;
372 if (b->alt[0] == 0) return -1; // no alternate allele
373 for (i = 0; i < b->n_gi; ++i)
374 if (b->gi[i].fmt == bcf_str2int("PL", 2)) break;
375 if (i == b->n_gi) return -1; // no PL
377 for (i = 0; i < b->n_smpl; ++i) {
378 const uint8_t *p = PL->data + i * PL->len; // the PL for the i-th individual
379 uint8_t *g = gl + 10 * i;
380 for (k = j = 0; k < 4; ++k) {
381 for (l = k; l < 4; ++l) {
383 if (x > y) t = x, x = y, y = t; // make sure x is the smaller
384 x = y * (y+1) / 2 + x;
385 g[j++] = x < PL->len? p[x] : 255;