7 KHASH_MAP_INIT_STR(str2id, int)
10 #define srand48(x) srand(x)
11 #define drand48() ((double)rand() / RAND_MAX)
14 // FIXME: valgrind report a memory leak in this function. Probably it does not get deallocated...
15 void *bcf_build_refhash(bcf_hdr_t *h)
17 khash_t(str2id) *hash;
19 hash = kh_init(str2id);
20 for (i = 0; i < h->n_ref; ++i) {
22 k = kh_put(str2id, hash, h->ns[i], &ret); // FIXME: check ret
28 void *bcf_str2id_init()
30 return kh_init(str2id);
33 void bcf_str2id_destroy(void *_hash)
35 khash_t(str2id) *hash = (khash_t(str2id)*)_hash;
36 if (hash) kh_destroy(str2id, hash); // Note that strings are not freed.
39 void bcf_str2id_thorough_destroy(void *_hash)
41 khash_t(str2id) *hash = (khash_t(str2id)*)_hash;
43 if (hash == 0) return;
44 for (k = 0; k < kh_end(hash); ++k)
45 if (kh_exist(hash, k)) free((char*)kh_key(hash, k));
46 kh_destroy(str2id, hash);
49 int bcf_str2id(void *_hash, const char *str)
51 khash_t(str2id) *hash = (khash_t(str2id)*)_hash;
54 k = kh_get(str2id, hash, str);
55 return k == kh_end(hash)? -1 : kh_val(hash, k);
58 int bcf_str2id_add(void *_hash, const char *str)
62 khash_t(str2id) *hash = (khash_t(str2id)*)_hash;
64 k = kh_put(str2id, hash, str, &ret);
65 if (ret == 0) return kh_val(hash, k);
66 kh_val(hash, k) = kh_size(hash) - 1;
67 return kh_val(hash, k);
70 void bcf_fit_alt(bcf1_t *b, int mask)
72 mask |= 1; // REF must be always present
75 for (i=0; i<sizeof(int); i++)
76 if ( mask&1<<i) nals++;
78 if ( b->n_alleles <= nals ) return;
81 printf("fit_alt: %s, %s, mask=%d, nals=%d: ", b->ref, b->alt, mask, nals);
82 for (i=0; i<sizeof(int); i++)
83 if ( mask&1<<i) printf(" %d", i);
86 // update ALT, in principle any of the alleles can be removed
91 int n=0, nalts=nals-1;
92 for (src=dst=p=b->alt, i=1; *p; p++)
94 if ( *p!=',' ) continue;
101 memmove(dst,src,p-src);
105 if ( n<nalts ) { *dst=','; dst++; }
109 if ( n>=nalts ) { *dst=0; break; }
114 memmove(dst,src,p-src);
120 else p = b->alt, *p = '\0';
122 printf("fit_alt: %s, mask=%d\n", b->alt, mask);
125 memmove(p, b->flt, b->str + b->l_str - b->flt);
126 b->l_str -= b->flt - p;
130 for (i = 0; i < b->n_gi; ++i)
132 bcf_ginfo_t *g = b->gi + i;
133 if (g->fmt == bcf_str2int("PL", 2)) ipl = i;
134 if (g->fmt == bcf_str2int("GT", 2)) igt = i;
137 // .. create mapping between old and new indexes
138 int npl = nals * (nals+1) / 2;
139 int *map = malloc(sizeof(int)*(npl>b->n_alleles ? npl : b->n_alleles));
141 for (i=0; i<b->n_alleles; i++)
146 if ( i && !(mask&1<<i) ) skip=1;
147 if ( j && !(mask&1<<j) ) skip=1;
148 if ( !skip ) { map[knew++] = kori; }
152 // .. apply to all samples
153 int n_smpl = b->n_smpl;
154 for (i = 0; i < b->n_gi; ++i)
156 bcf_ginfo_t *g = b->gi + i;
157 if (g->fmt == bcf_str2int("PL", 2))
160 uint8_t *d = (uint8_t*)g->data;
161 int ismpl, npl_ori = b->n_alleles * (b->n_alleles + 1) / 2;
162 for (knew=ismpl=0; ismpl<n_smpl; ismpl++)
164 uint8_t *dl = d + ismpl * npl_ori;
165 for (j=0; j<npl; j++) d[knew++] = dl[map[j]];
167 } // FIXME: to add GL
171 for (i=1, knew=0; i<b->n_alleles; i++)
172 map[i] = mask&1<<i ? ++knew : -1;
173 for (i=0; i<n_smpl; i++)
175 uint8_t gt = ((uint8_t*)b->gi[igt].data)[i];
178 assert( map[a1]>=0 && map[a2]>=0 );
179 ((uint8_t*)b->gi[igt].data)[i] = ((1<<7|1<<6)>) | map[a1]<<3 | map[a2];
185 int bcf_shrink_alt(bcf1_t *b, int n)
188 int i, j, k, n_smpl = b->n_smpl;
189 if (b->n_alleles <= n) return -1;
192 for (p = b->alt, k = 1; *p; ++p)
193 if (*p == ',' && ++k == n) break;
195 } else p = b->alt, *p = '\0';
197 memmove(p, b->flt, b->str + b->l_str - b->flt);
198 b->l_str -= b->flt - p;
200 for (i = 0; i < b->n_gi; ++i) {
201 bcf_ginfo_t *g = b->gi + i;
202 if (g->fmt == bcf_str2int("PL", 2)) {
203 int l, x = b->n_alleles * (b->n_alleles + 1) / 2;
204 uint8_t *d = (uint8_t*)g->data;
205 g->len = n * (n + 1) / 2;
206 for (l = k = 0; l < n_smpl; ++l) {
207 uint8_t *dl = d + l * x;
208 for (j = 0; j < g->len; ++j) d[k++] = dl[j];
210 } // FIXME: to add GL
217 int bcf_gl2pl(bcf1_t *b)
220 int i, n_smpl = b->n_smpl;
224 if (strstr(b->fmt, "PL")) return -1;
225 if ((p = strstr(b->fmt, "GL")) == 0) return -1;
227 for (i = 0; i < b->n_gi; ++i)
228 if (b->gi[i].fmt == bcf_str2int("GL", 2))
231 g->fmt = bcf_str2int("PL", 2);
232 g->len /= 4; // 4 == sizeof(float)
233 d0 = (float*)g->data; d1 = (uint8_t*)g->data;
234 for (i = 0; i < n_smpl * g->len; ++i) {
235 int x = (int)(-10. * d0[i] + .499);
236 if (x > 255) x = 255;
242 /* FIXME: this function will fail given AB:GTX:GT. BCFtools never
243 * produces such FMT, but others may do. */
244 int bcf_fix_gt(bcf1_t *b)
250 // check the presence of the GT FMT
251 if ((s = strstr(b->fmt, ":GT")) == 0) return 0; // no GT or GT is already the first
252 if (s[3] != '\0' && s[3] != ':') return 0; // :GTX in fact
253 tmp = bcf_str2int("GT", 2);
254 for (i = 0; i < b->n_gi; ++i)
255 if (b->gi[i].fmt == tmp) break;
256 if (i == b->n_gi) return 0; // no GT in b->gi; probably a bug...
258 // move GT to the first
259 for (; i > 0; --i) b->gi[i] = b->gi[i-1];
261 memmove(b->fmt + 3, b->fmt, s + 1 - b->fmt);
262 b->fmt[0] = 'G'; b->fmt[1] = 'T'; b->fmt[2] = ':';
266 int bcf_fix_pl(bcf1_t *b)
273 tmp = bcf_str2int("PL", 2);
274 for (i = 0; i < b->n_gi; ++i)
275 if (b->gi[i].fmt == tmp) break;
276 if (i == b->n_gi) return 0;
279 PL = (uint8_t*)gi->data;
280 swap = alloca(gi->len);
281 // loop through individuals
282 for (i = 0; i < b->n_smpl; ++i) {
284 uint8_t *PLi = PL + i * gi->len;
285 memcpy(swap, PLi, gi->len);
286 for (k = x = 0; k < b->n_alleles; ++k)
287 for (l = k; l < b->n_alleles; ++l)
288 PLi[l*(l+1)/2 + k] = swap[x++];
293 int bcf_smpl_covered(const bcf1_t *b)
299 tmp = bcf_str2int("PL", 2);
300 for (i = 0; i < b->n_gi; ++i)
301 if (b->gi[i].fmt == tmp) break;
302 if (i == b->n_gi) return 0;
303 // count how many samples having PL!=[0..0]
305 for (i = 0; i < b->n_smpl; ++i) {
306 uint8_t *PLi = ((uint8_t*)gi->data) + i * gi->len;
307 for (j = 0; j < gi->len; ++j)
309 if (j < gi->len) ++n;
314 static void *locate_field(const bcf1_t *b, const char *fmt, int l)
318 tmp = bcf_str2int(fmt, l);
319 for (i = 0; i < b->n_gi; ++i)
320 if (b->gi[i].fmt == tmp) break;
321 return i == b->n_gi? 0 : b->gi[i].data;
324 int bcf_anno_max(bcf1_t *b)
326 int k, max_gq, max_sp, n_het;
330 max_gq = max_sp = n_het = 0;
331 gt = locate_field(b, "GT", 2);
332 if (gt == 0) return -1;
333 gq = locate_field(b, "GQ", 2);
334 sp = locate_field(b, "SP", 2);
336 for (k = 0; k < b->n_smpl; ++k)
338 max_sp = max_sp > (int)sp[k]? max_sp : sp[k];
340 for (k = 0; k < b->n_smpl; ++k)
342 max_gq = max_gq > (int)gq[k]? max_gq : gq[k];
343 for (k = 0; k < b->n_smpl; ++k) {
345 a1 = gt[k]&7; a2 = gt[k]>>3&7;
346 if ((!a1 && a2) || (!a2 && a1)) { // a het
347 if (gq == 0) ++n_het;
348 else if (gq[k] >= 20) ++n_het;
351 if (n_het) max_sp -= (int)(4.343 * log(n_het) + .499);
352 if (max_sp < 0) max_sp = 0;
353 memset(&str, 0, sizeof(kstring_t));
354 if (*b->info) kputc(';', &str);
355 ksprintf(&str, "MXSP=%d;MXGQ=%d", max_sp, max_gq);
356 bcf_append_info(b, str.s, str.l);
361 // FIXME: only data are shuffled; the header is NOT
362 int bcf_shuffle(bcf1_t *b, int seed)
365 if (seed > 0) srand48(seed);
366 a = malloc(b->n_smpl * sizeof(int));
367 for (i = 0; i < b->n_smpl; ++i) a[i] = i;
368 for (i = b->n_smpl; i > 1; --i) {
370 j = (int)(drand48() * i);
371 tmp = a[j]; a[j] = a[i-1]; a[i-1] = tmp;
373 for (j = 0; j < b->n_gi; ++j) {
374 bcf_ginfo_t *gi = b->gi + j;
375 uint8_t *swap, *data = (uint8_t*)gi->data;
376 swap = malloc(gi->len * b->n_smpl);
377 for (i = 0; i < b->n_smpl; ++i)
378 memcpy(swap + gi->len * a[i], data + gi->len * i, gi->len);
386 bcf_hdr_t *bcf_hdr_subsam(const bcf_hdr_t *h0, int n, char *const* samples, int *list)
391 khash_t(str2id) *hash;
393 s.l = s.m = 0; s.s = 0;
394 hash = kh_init(str2id);
395 for (i = 0; i < h0->n_smpl; ++i) {
396 k = kh_put(str2id, hash, h0->sns[i], &ret);
399 for (i = j = 0; i < n; ++i) {
400 k = kh_get(str2id, hash, samples[i]);
401 if (k != kh_end(hash)) {
402 list[j++] = kh_val(hash, k);
403 kputs(samples[i], &s); kputc('\0', &s);
406 if (j < n) fprintf(stderr, "<%s> %d samples in the list but not in BCF.", __func__, n - j);
407 kh_destroy(str2id, hash);
408 h = calloc(1, sizeof(bcf_hdr_t));
410 h->ns = 0; h->sns = 0;
411 h->name = malloc(h->l_nm); memcpy(h->name, h0->name, h->l_nm);
412 h->txt = calloc(1, h->l_txt + 1); memcpy(h->txt, h0->txt, h->l_txt);
413 h->l_smpl = s.l; h->sname = s.s;
418 int bcf_subsam(int n_smpl, int *list, bcf1_t *b)
421 for (j = 0; j < b->n_gi; ++j) {
422 bcf_ginfo_t *gi = b->gi + j;
424 swap = malloc(gi->len * b->n_smpl);
425 for (i = 0; i < n_smpl; ++i)
426 memcpy(swap + i * gi->len, (uint8_t*)gi->data + list[i] * gi->len, gi->len);
434 static int8_t nt4_table[128] = {
435 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
436 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
437 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4 /*'-'*/, 4, 4,
438 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
439 4, 0, 4, 1, 4, 4, 4, 2, 4, 4, 4, 4, 4, 4, 4, 4,
440 4, 4, 4, 4, 3, 4, 4, 4, -1, 4, 4, 4, 4, 4, 4, 4,
441 4, 0, 4, 1, 4, 4, 4, 2, 4, 4, 4, 4, 4, 4, 4, 4,
442 4, 4, 4, 4, 3, 4, 4, 4, -1, 4, 4, 4, 4, 4, 4, 4
445 int bcf_gl10(const bcf1_t *b, uint8_t *gl)
447 int a[4], k, l, map[4], k1, j, i;
448 const bcf_ginfo_t *PL;
450 if (b->ref[1] != 0 || b->n_alleles > 4) return -1; // ref is not a single base or >4 alleles
451 for (i = 0; i < b->n_gi; ++i)
452 if (b->gi[i].fmt == bcf_str2int("PL", 2)) break;
453 if (i == b->n_gi) return -1; // no PL
455 a[0] = nt4_table[(int)b->ref[0]];
456 if (a[0] > 3 || a[0] < 0) return -1; // ref is not A/C/G/T
457 a[1] = a[2] = a[3] = -2; // -1 has a special meaning
458 if (b->alt[0] == 0) return -1; // no alternate allele
459 map[0] = map[1] = map[2] = map[3] = -2;
461 for (k = 0, s = b->alt, k1 = -1; k < 3 && *s; ++k, s += 2) {
462 if (s[1] != ',' && s[1] != 0) return -1; // ALT is not single base
463 a[k+1] = nt4_table[(int)*s];
464 if (a[k+1] >= 0) map[a[k+1]] = k+1;
466 if (s[1] == 0) break; // the end of the ALT string
468 for (k = 0; k < 4; ++k)
469 if (map[k] < 0) map[k] = k1;
470 for (i = 0; i < b->n_smpl; ++i) {
471 const uint8_t *p = PL->data + i * PL->len; // the PL for the i-th individual
472 uint8_t *g = gl + 10 * i;
473 for (k = j = 0; k < 4; ++k) {
474 for (l = k; l < 4; ++l) {
475 int t, x = map[k], y = map[l];
476 if (x > y) t = x, x = y, y = t; // make sure x is the smaller
477 g[j++] = p[y * (y+1) / 2 + x];
484 int bcf_gl10_indel(const bcf1_t *b, uint8_t *gl)
487 const bcf_ginfo_t *PL;
488 if (b->alt[0] == 0) return -1; // no alternate allele
489 for (i = 0; i < b->n_gi; ++i)
490 if (b->gi[i].fmt == bcf_str2int("PL", 2)) break;
491 if (i == b->n_gi) return -1; // no PL
493 for (i = 0; i < b->n_smpl; ++i) {
494 const uint8_t *p = PL->data + i * PL->len; // the PL for the i-th individual
495 uint8_t *g = gl + 10 * i;
496 for (k = j = 0; k < 4; ++k) {
497 for (l = k; l < 4; ++l) {
499 if (x > y) t = x, x = y, y = t; // make sure x is the smaller
500 x = y * (y+1) / 2 + x;
501 g[j++] = x < PL->len? p[x] : 255;