X-Git-Url: https://git.donarmstrong.com/?p=rsem.git;a=blobdiff_plain;f=sam%2Fbcftools%2Fbcfutils.c;h=7638085d4ffacce35f26b401ff1a90f48a975454;hp=0eab4c1f322b398750fdda9da4caec6eea8e7579;hb=dbcf1cfb8ad1086c21d64e249f012809403e7ddc;hpb=946f9a6adb2a82048c8453d44693cd3838d32939 diff --git a/sam/bcftools/bcfutils.c b/sam/bcftools/bcfutils.c index 0eab4c1..7638085 100644 --- a/sam/bcftools/bcfutils.c +++ b/sam/bcftools/bcfutils.c @@ -1,5 +1,6 @@ #include #include +#include #include "bcf.h" #include "kstring.h" #include "khash.h" @@ -66,6 +67,112 @@ int bcf_str2id_add(void *_hash, const char *str) return kh_val(hash, k); } +void bcf_fit_alt(bcf1_t *b, int mask) +{ + mask |= 1; // REF must be always present + + int i,j,nals=0; + for (i=0; in_alleles <= nals ) return; + + // update ALT, in principle any of the alleles can be removed + char *p; + if ( nals>1 ) + { + char *dst, *src; + int n=0, nalts=nals-1; + for (src=dst=p=b->alt, i=1; *p; p++) + { + if ( *p!=',' ) continue; + + if ( mask&1<=nalts ) { *dst=0; break; } + src = p+1; + } + if ( nalt, *p = '\0'; + p++; + memmove(p, b->flt, b->str + b->l_str - b->flt); + b->l_str -= b->flt - p; + + // update PL and GT + int ipl=-1, igt=-1; + for (i = 0; i < b->n_gi; ++i) + { + bcf_ginfo_t *g = b->gi + i; + if (g->fmt == bcf_str2int("PL", 2)) ipl = i; + if (g->fmt == bcf_str2int("GT", 2)) igt = i; + } + + // .. create mapping between old and new indexes + int npl = nals * (nals+1) / 2; + int *map = malloc(sizeof(int)*(npl>b->n_alleles ? npl : b->n_alleles)); + int kori=0,knew=0; + for (i=0; in_alleles; i++) + { + for (j=0; j<=i; j++) + { + int skip=0; + if ( i && !(mask&1<n_smpl; + for (i = 0; i < b->n_gi; ++i) + { + bcf_ginfo_t *g = b->gi + i; + if (g->fmt == bcf_str2int("PL", 2)) + { + g->len = npl; + uint8_t *d = (uint8_t*)g->data; + int ismpl, npl_ori = b->n_alleles * (b->n_alleles + 1) / 2; + for (knew=ismpl=0; ismpln_alleles; i++) + map[i] = mask&1<gi[igt].data)[i]; + int a1 = (gt>>3)&7; + int a2 = gt&7; + assert( map[a1]>=0 && map[a2]>=0 ); + ((uint8_t*)b->gi[igt].data)[i] = ((1<<7|1<<6)>) | map[a1]<<3 | map[a2]; + } + free(map); + b->n_alleles = nals; + bcf_sync(b); +} + int bcf_shrink_alt(bcf1_t *b, int n) { char *p; @@ -133,7 +240,7 @@ int bcf_fix_gt(bcf1_t *b) bcf_ginfo_t gt; // check the presence of the GT FMT if ((s = strstr(b->fmt, ":GT")) == 0) return 0; // no GT or GT is already the first - if (s[3] != '\0' && s[3] != ':') return 0; // :GTX in fact + assert(s[3] == '\0' || s[3] == ':'); // :GTX in fact tmp = bcf_str2int("GT", 2); for (i = 0; i < b->n_gi; ++i) if (b->gi[i].fmt == tmp) break; @@ -142,7 +249,10 @@ int bcf_fix_gt(bcf1_t *b) // move GT to the first for (; i > 0; --i) b->gi[i] = b->gi[i-1]; b->gi[0] = gt; - memmove(b->fmt + 3, b->fmt, s + 1 - b->fmt); + if ( s[3]==0 ) + memmove(b->fmt + 3, b->fmt, s - b->fmt); // :GT + else + memmove(b->fmt + 3, b->fmt, s - b->fmt + 1); // :GT: b->fmt[0] = 'G'; b->fmt[1] = 'T'; b->fmt[2] = ':'; return 0; } @@ -287,7 +397,11 @@ bcf_hdr_t *bcf_hdr_subsam(const bcf_hdr_t *h0, int n, char *const* samples, int kputs(samples[i], &s); kputc('\0', &s); } } - if (j < n) fprintf(stderr, "<%s> %d samples in the list but not in BCF.", __func__, n - j); + if (j < n) + { + fprintf(stderr, "<%s> %d samples in the list but not in BCF.", __func__, n - j); + exit(1); + } kh_destroy(str2id, hash); h = calloc(1, sizeof(bcf_hdr_t)); *h = *h0;