7 KHASH_MAP_INIT_STR(str2id, int)
10 #define srand48(x) srand(x)
11 #define drand48() ((double)rand() / RAND_MAX)
14 // FIXME: valgrind report a memory leak in this function. Probably it does not get deallocated...
15 void *bcf_build_refhash(bcf_hdr_t *h)
17 khash_t(str2id) *hash;
19 hash = kh_init(str2id);
20 for (i = 0; i < h->n_ref; ++i) {
22 k = kh_put(str2id, hash, h->ns[i], &ret); // FIXME: check ret
28 void *bcf_str2id_init()
30 return kh_init(str2id);
33 void bcf_str2id_destroy(void *_hash)
35 khash_t(str2id) *hash = (khash_t(str2id)*)_hash;
36 if (hash) kh_destroy(str2id, hash); // Note that strings are not freed.
39 void bcf_str2id_thorough_destroy(void *_hash)
41 khash_t(str2id) *hash = (khash_t(str2id)*)_hash;
43 if (hash == 0) return;
44 for (k = 0; k < kh_end(hash); ++k)
45 if (kh_exist(hash, k)) free((char*)kh_key(hash, k));
46 kh_destroy(str2id, hash);
49 int bcf_str2id(void *_hash, const char *str)
51 khash_t(str2id) *hash = (khash_t(str2id)*)_hash;
54 k = kh_get(str2id, hash, str);
55 return k == kh_end(hash)? -1 : kh_val(hash, k);
58 int bcf_str2id_add(void *_hash, const char *str)
62 khash_t(str2id) *hash = (khash_t(str2id)*)_hash;
64 k = kh_put(str2id, hash, str, &ret);
65 if (ret == 0) return kh_val(hash, k);
66 kh_val(hash, k) = kh_size(hash) - 1;
67 return kh_val(hash, k);
70 void bcf_fit_alt(bcf1_t *b, int mask)
72 mask |= 1; // REF must be always present
75 for (i=0; i<sizeof(int); i++)
76 if ( mask&1<<i) nals++;
77 if ( b->n_alleles <= nals ) return;
79 // update ALT, in principle any of the alleles can be removed
84 int n=0, nalts=nals-1;
85 for (src=dst=p=b->alt, i=1; *p; p++)
87 if ( *p!=',' ) continue;
94 memmove(dst,src,p-src);
98 if ( n<nalts ) { *dst=','; dst++; }
102 if ( n>=nalts ) { *dst=0; break; }
107 memmove(dst,src,p-src);
113 else p = b->alt, *p = '\0';
115 memmove(p, b->flt, b->str + b->l_str - b->flt);
116 b->l_str -= b->flt - p;
120 for (i = 0; i < b->n_gi; ++i)
122 bcf_ginfo_t *g = b->gi + i;
123 if (g->fmt == bcf_str2int("PL", 2)) ipl = i;
124 if (g->fmt == bcf_str2int("GT", 2)) igt = i;
127 // .. create mapping between old and new indexes
128 int npl = nals * (nals+1) / 2;
129 int *map = malloc(sizeof(int)*(npl>b->n_alleles ? npl : b->n_alleles));
131 for (i=0; i<b->n_alleles; i++)
136 if ( i && !(mask&1<<i) ) skip=1;
137 if ( j && !(mask&1<<j) ) skip=1;
138 if ( !skip ) { map[knew++] = kori; }
142 // .. apply to all samples
143 int n_smpl = b->n_smpl;
144 for (i = 0; i < b->n_gi; ++i)
146 bcf_ginfo_t *g = b->gi + i;
147 if (g->fmt == bcf_str2int("PL", 2))
150 uint8_t *d = (uint8_t*)g->data;
151 int ismpl, npl_ori = b->n_alleles * (b->n_alleles + 1) / 2;
152 for (knew=ismpl=0; ismpl<n_smpl; ismpl++)
154 uint8_t *dl = d + ismpl * npl_ori;
155 for (j=0; j<npl; j++) d[knew++] = dl[map[j]];
157 } // FIXME: to add GL
161 for (i=1, knew=0; i<b->n_alleles; i++)
162 map[i] = mask&1<<i ? ++knew : -1;
163 for (i=0; i<n_smpl; i++)
165 uint8_t gt = ((uint8_t*)b->gi[igt].data)[i];
168 assert( map[a1]>=0 && map[a2]>=0 );
169 ((uint8_t*)b->gi[igt].data)[i] = ((1<<7|1<<6)>) | map[a1]<<3 | map[a2];
176 int bcf_shrink_alt(bcf1_t *b, int n)
179 int i, j, k, n_smpl = b->n_smpl;
180 if (b->n_alleles <= n) return -1;
183 for (p = b->alt, k = 1; *p; ++p)
184 if (*p == ',' && ++k == n) break;
186 } else p = b->alt, *p = '\0';
188 memmove(p, b->flt, b->str + b->l_str - b->flt);
189 b->l_str -= b->flt - p;
191 for (i = 0; i < b->n_gi; ++i) {
192 bcf_ginfo_t *g = b->gi + i;
193 if (g->fmt == bcf_str2int("PL", 2)) {
194 int l, x = b->n_alleles * (b->n_alleles + 1) / 2;
195 uint8_t *d = (uint8_t*)g->data;
196 g->len = n * (n + 1) / 2;
197 for (l = k = 0; l < n_smpl; ++l) {
198 uint8_t *dl = d + l * x;
199 for (j = 0; j < g->len; ++j) d[k++] = dl[j];
201 } // FIXME: to add GL
208 int bcf_gl2pl(bcf1_t *b)
211 int i, n_smpl = b->n_smpl;
215 if (strstr(b->fmt, "PL")) return -1;
216 if ((p = strstr(b->fmt, "GL")) == 0) return -1;
218 for (i = 0; i < b->n_gi; ++i)
219 if (b->gi[i].fmt == bcf_str2int("GL", 2))
222 g->fmt = bcf_str2int("PL", 2);
223 g->len /= 4; // 4 == sizeof(float)
224 d0 = (float*)g->data; d1 = (uint8_t*)g->data;
225 for (i = 0; i < n_smpl * g->len; ++i) {
226 int x = (int)(-10. * d0[i] + .499);
227 if (x > 255) x = 255;
233 /* FIXME: this function will fail given AB:GTX:GT. BCFtools never
234 * produces such FMT, but others may do. */
235 int bcf_fix_gt(bcf1_t *b)
241 // check the presence of the GT FMT
242 if ((s = strstr(b->fmt, ":GT")) == 0) return 0; // no GT or GT is already the first
243 assert(s[3] == '\0' || s[3] == ':'); // :GTX in fact
244 tmp = bcf_str2int("GT", 2);
245 for (i = 0; i < b->n_gi; ++i)
246 if (b->gi[i].fmt == tmp) break;
247 if (i == b->n_gi) return 0; // no GT in b->gi; probably a bug...
249 // move GT to the first
250 for (; i > 0; --i) b->gi[i] = b->gi[i-1];
253 memmove(b->fmt + 3, b->fmt, s - b->fmt); // :GT
255 memmove(b->fmt + 3, b->fmt, s - b->fmt + 1); // :GT:
256 b->fmt[0] = 'G'; b->fmt[1] = 'T'; b->fmt[2] = ':';
260 int bcf_fix_pl(bcf1_t *b)
267 tmp = bcf_str2int("PL", 2);
268 for (i = 0; i < b->n_gi; ++i)
269 if (b->gi[i].fmt == tmp) break;
270 if (i == b->n_gi) return 0;
273 PL = (uint8_t*)gi->data;
274 swap = alloca(gi->len);
275 // loop through individuals
276 for (i = 0; i < b->n_smpl; ++i) {
278 uint8_t *PLi = PL + i * gi->len;
279 memcpy(swap, PLi, gi->len);
280 for (k = x = 0; k < b->n_alleles; ++k)
281 for (l = k; l < b->n_alleles; ++l)
282 PLi[l*(l+1)/2 + k] = swap[x++];
287 int bcf_smpl_covered(const bcf1_t *b)
293 tmp = bcf_str2int("PL", 2);
294 for (i = 0; i < b->n_gi; ++i)
295 if (b->gi[i].fmt == tmp) break;
296 if (i == b->n_gi) return 0;
297 // count how many samples having PL!=[0..0]
299 for (i = 0; i < b->n_smpl; ++i) {
300 uint8_t *PLi = ((uint8_t*)gi->data) + i * gi->len;
301 for (j = 0; j < gi->len; ++j)
303 if (j < gi->len) ++n;
308 static void *locate_field(const bcf1_t *b, const char *fmt, int l)
312 tmp = bcf_str2int(fmt, l);
313 for (i = 0; i < b->n_gi; ++i)
314 if (b->gi[i].fmt == tmp) break;
315 return i == b->n_gi? 0 : b->gi[i].data;
318 int bcf_anno_max(bcf1_t *b)
320 int k, max_gq, max_sp, n_het;
324 max_gq = max_sp = n_het = 0;
325 gt = locate_field(b, "GT", 2);
326 if (gt == 0) return -1;
327 gq = locate_field(b, "GQ", 2);
328 sp = locate_field(b, "SP", 2);
330 for (k = 0; k < b->n_smpl; ++k)
332 max_sp = max_sp > (int)sp[k]? max_sp : sp[k];
334 for (k = 0; k < b->n_smpl; ++k)
336 max_gq = max_gq > (int)gq[k]? max_gq : gq[k];
337 for (k = 0; k < b->n_smpl; ++k) {
339 a1 = gt[k]&7; a2 = gt[k]>>3&7;
340 if ((!a1 && a2) || (!a2 && a1)) { // a het
341 if (gq == 0) ++n_het;
342 else if (gq[k] >= 20) ++n_het;
345 if (n_het) max_sp -= (int)(4.343 * log(n_het) + .499);
346 if (max_sp < 0) max_sp = 0;
347 memset(&str, 0, sizeof(kstring_t));
348 if (*b->info) kputc(';', &str);
349 ksprintf(&str, "MXSP=%d;MXGQ=%d", max_sp, max_gq);
350 bcf_append_info(b, str.s, str.l);
355 // FIXME: only data are shuffled; the header is NOT
356 int bcf_shuffle(bcf1_t *b, int seed)
359 if (seed > 0) srand48(seed);
360 a = malloc(b->n_smpl * sizeof(int));
361 for (i = 0; i < b->n_smpl; ++i) a[i] = i;
362 for (i = b->n_smpl; i > 1; --i) {
364 j = (int)(drand48() * i);
365 tmp = a[j]; a[j] = a[i-1]; a[i-1] = tmp;
367 for (j = 0; j < b->n_gi; ++j) {
368 bcf_ginfo_t *gi = b->gi + j;
369 uint8_t *swap, *data = (uint8_t*)gi->data;
370 swap = malloc(gi->len * b->n_smpl);
371 for (i = 0; i < b->n_smpl; ++i)
372 memcpy(swap + gi->len * a[i], data + gi->len * i, gi->len);
380 bcf_hdr_t *bcf_hdr_subsam(const bcf_hdr_t *h0, int n, char *const* samples, int *list)
385 khash_t(str2id) *hash;
387 s.l = s.m = 0; s.s = 0;
388 hash = kh_init(str2id);
389 for (i = 0; i < h0->n_smpl; ++i) {
390 k = kh_put(str2id, hash, h0->sns[i], &ret);
393 for (i = j = 0; i < n; ++i) {
394 k = kh_get(str2id, hash, samples[i]);
395 if (k != kh_end(hash)) {
396 list[j++] = kh_val(hash, k);
397 kputs(samples[i], &s); kputc('\0', &s);
402 fprintf(stderr, "<%s> %d samples in the list but not in BCF.", __func__, n - j);
405 kh_destroy(str2id, hash);
406 h = calloc(1, sizeof(bcf_hdr_t));
408 h->ns = 0; h->sns = 0;
409 h->name = malloc(h->l_nm); memcpy(h->name, h0->name, h->l_nm);
410 h->txt = calloc(1, h->l_txt + 1); memcpy(h->txt, h0->txt, h->l_txt);
411 h->l_smpl = s.l; h->sname = s.s;
416 int bcf_subsam(int n_smpl, int *list, bcf1_t *b)
419 for (j = 0; j < b->n_gi; ++j) {
420 bcf_ginfo_t *gi = b->gi + j;
422 swap = malloc(gi->len * b->n_smpl);
423 for (i = 0; i < n_smpl; ++i)
424 memcpy(swap + i * gi->len, (uint8_t*)gi->data + list[i] * gi->len, gi->len);
432 static int8_t nt4_table[128] = {
433 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
434 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
435 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4 /*'-'*/, 4, 4,
436 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
437 4, 0, 4, 1, 4, 4, 4, 2, 4, 4, 4, 4, 4, 4, 4, 4,
438 4, 4, 4, 4, 3, 4, 4, 4, -1, 4, 4, 4, 4, 4, 4, 4,
439 4, 0, 4, 1, 4, 4, 4, 2, 4, 4, 4, 4, 4, 4, 4, 4,
440 4, 4, 4, 4, 3, 4, 4, 4, -1, 4, 4, 4, 4, 4, 4, 4
443 int bcf_gl10(const bcf1_t *b, uint8_t *gl)
445 int a[4], k, l, map[4], k1, j, i;
446 const bcf_ginfo_t *PL;
448 if (b->ref[1] != 0 || b->n_alleles > 4) return -1; // ref is not a single base or >4 alleles
449 for (i = 0; i < b->n_gi; ++i)
450 if (b->gi[i].fmt == bcf_str2int("PL", 2)) break;
451 if (i == b->n_gi) return -1; // no PL
453 a[0] = nt4_table[(int)b->ref[0]];
454 if (a[0] > 3 || a[0] < 0) return -1; // ref is not A/C/G/T
455 a[1] = a[2] = a[3] = -2; // -1 has a special meaning
456 if (b->alt[0] == 0) return -1; // no alternate allele
457 map[0] = map[1] = map[2] = map[3] = -2;
459 for (k = 0, s = b->alt, k1 = -1; k < 3 && *s; ++k, s += 2) {
460 if (s[1] != ',' && s[1] != 0) return -1; // ALT is not single base
461 a[k+1] = nt4_table[(int)*s];
462 if (a[k+1] >= 0) map[a[k+1]] = k+1;
464 if (s[1] == 0) break; // the end of the ALT string
466 for (k = 0; k < 4; ++k)
467 if (map[k] < 0) map[k] = k1;
468 for (i = 0; i < b->n_smpl; ++i) {
469 const uint8_t *p = PL->data + i * PL->len; // the PL for the i-th individual
470 uint8_t *g = gl + 10 * i;
471 for (k = j = 0; k < 4; ++k) {
472 for (l = k; l < 4; ++l) {
473 int t, x = map[k], y = map[l];
474 if (x > y) t = x, x = y, y = t; // make sure x is the smaller
475 g[j++] = p[y * (y+1) / 2 + x];
482 int bcf_gl10_indel(const bcf1_t *b, uint8_t *gl)
485 const bcf_ginfo_t *PL;
486 if (b->alt[0] == 0) return -1; // no alternate allele
487 for (i = 0; i < b->n_gi; ++i)
488 if (b->gi[i].fmt == bcf_str2int("PL", 2)) break;
489 if (i == b->n_gi) return -1; // no PL
491 for (i = 0; i < b->n_smpl; ++i) {
492 const uint8_t *p = PL->data + i * PL->len; // the PL for the i-th individual
493 uint8_t *g = gl + 10 * i;
494 for (k = j = 0; k < 4; ++k) {
495 for (l = k; l < 4; ++l) {
497 if (x > y) t = x, x = y, y = t; // make sure x is the smaller
498 x = y * (y+1) / 2 + x;
499 g[j++] = x < PL->len? p[x] : 255;