X-Git-Url: https://git.donarmstrong.com/?p=samtools.git;a=blobdiff_plain;f=bam_aux.c;h=28b22e3059273da57dd8395fba5e29287dc3228b;hp=41be81910d89478dc160fada8fe2b4c38f8052c5;hb=307c147168f7154e3755712797078c513e0b242a;hpb=11b0656ab8ca22edb2fb7fb54fe761ea4c9e05e3 diff --git a/bam_aux.c b/bam_aux.c index 41be819..28b22e3 100644 --- a/bam_aux.c +++ b/bam_aux.c @@ -1,7 +1,9 @@ #include #include "bam.h" #include "khash.h" +typedef char *str_p; KHASH_MAP_INIT_STR(s, int) +KHASH_MAP_INIT_STR(r2l, str_p) void bam_aux_append(bam1_t *b, const char tag[2], char type, int len, uint8_t *data) { @@ -17,27 +19,59 @@ void bam_aux_append(bam1_t *b, const char tag[2], char type, int len, uint8_t *d b->data[ori_len + 2] = type; memcpy(b->data + ori_len + 3, data, len); } -/* + uint8_t *bam_aux_get_core(bam1_t *b, const char tag[2]) { return bam_aux_get(b, tag); } -*/ -uint8_t *bam_aux_get(bam1_t *b, const char tag[2]) + +#define __skip_tag(s) do { \ + int type = toupper(*(s)); \ + ++(s); \ + if (type == 'Z' || type == 'H') { while (*(s)) ++(s); ++(s); } \ + else if (type == 'B') (s) += 5 + bam_aux_type2size(*(s)) * (*(int32_t*)((s)+1)); \ + else (s) += bam_aux_type2size(type); \ + } while(0) + +uint8_t *bam_aux_get(const bam1_t *b, const char tag[2]) { uint8_t *s; int y = tag[0]<<8 | tag[1]; s = bam1_aux(b); while (s < b->data + b->data_len) { - int type, x = (int)s[0]<<8 | s[1]; + int x = (int)s[0]<<8 | s[1]; s += 2; if (x == y) return s; - type = toupper(*s); ++s; - if (type == 'C') ++s; - else if (type == 'S') s += 2; - else if (type == 'I' || type == 'F') s += 4; - else if (type == 'D') s += 8; - else if (type == 'Z' || type == 'H') { while (*s) putchar(*s++); ++s; } + __skip_tag(s); + } + return 0; +} +// s MUST BE returned by bam_aux_get() +int bam_aux_del(bam1_t *b, uint8_t *s) +{ + uint8_t *p, *aux; + aux = bam1_aux(b); + p = s - 2; + __skip_tag(s); + memmove(p, s, b->l_aux - (s - aux)); + b->data_len -= s - p; + b->l_aux -= s - p; + return 0; +} + +int bam_aux_drop_other(bam1_t *b, uint8_t *s) +{ + if (s) { + uint8_t *p, *aux; + aux = bam1_aux(b); + p = s - 2; + __skip_tag(s); + memmove(aux, p, s - p); + b->data_len -= b->l_aux - (s - p); + b->l_aux = s - p; + } else { + b->data_len -= b->l_aux; + b->l_aux = 0; } return 0; } @@ -70,43 +104,56 @@ int32_t bam_get_tid(const bam_header_t *header, const char *seq_name) return k == kh_end(h)? -1 : kh_value(h, k); } -void bam_parse_region(bam_header_t *header, const char *str, int *ref_id, int *begin, int *end) +int bam_parse_region(bam_header_t *header, const char *str, int *ref_id, int *beg, int *end) { - char *s, *p; - int i, l, k; + char *s; + int i, l, k, name_end; khiter_t iter; khash_t(s) *h; bam_init_header_hash(header); h = (khash_t(s)*)header->hash; - l = strlen(str); - p = s = (char*)malloc(l+1); - /* squeeze out "," */ - for (i = k = 0; i != l; ++i) - if (str[i] != ',' && !isspace(str[i])) s[k++] = str[i]; - s[k] = 0; - for (i = 0; i != k; ++i) if (s[i] == ':') break; - s[i] = 0; - iter = kh_get(s, h, s); /* get the ref_id */ - if (iter == kh_end(h)) { // name not found - *ref_id = -1; free(s); - return; - } - *ref_id = kh_value(h, iter); - if (i == k) { /* dump the whole sequence */ - *begin = 0; *end = 1<<29; free(s); - return; - } - for (p = s + i + 1; i != k; ++i) if (s[i] == '-') break; - *begin = atoi(p); - if (i < k) { - p = s + i + 1; - *end = atoi(p); - } else *end = 1<<29; - if (*begin > 0) --*begin; - assert(*begin <= *end); + *ref_id = *beg = *end = -1; + name_end = l = strlen(str); + s = (char*)malloc(l+1); + // remove space + for (i = k = 0; i < l; ++i) + if (!isspace(str[i])) s[k++] = str[i]; + s[k] = 0; l = k; + // determine the sequence name + for (i = l - 1; i >= 0; --i) if (s[i] == ':') break; // look for colon from the end + if (i >= 0) name_end = i; + if (name_end < l) { // check if this is really the end + int n_hyphen = 0; + for (i = name_end + 1; i < l; ++i) { + if (s[i] == '-') ++n_hyphen; + else if (!isdigit(s[i]) && s[i] != ',') break; + } + if (i < l || n_hyphen > 1) name_end = l; // malformated region string; then take str as the name + s[name_end] = 0; + iter = kh_get(s, h, s); + if (iter == kh_end(h)) { // cannot find the sequence name + iter = kh_get(s, h, str); // try str as the name + if (iter == kh_end(h)) { + if (bam_verbose >= 2) fprintf(stderr, "[%s] fail to determine the sequence name.\n", __func__); + free(s); return -1; + } else s[name_end] = ':', name_end = l; + } + } else iter = kh_get(s, h, str); + *ref_id = kh_val(h, iter); + // parse the interval + if (name_end < l) { + for (i = k = name_end + 1; i < l; ++i) + if (s[i] != ',') s[k++] = s[i]; + s[k] = 0; + *beg = atoi(s + name_end + 1); + for (i = name_end + 1; i != k; ++i) if (s[i] == '-') break; + *end = i < k? atoi(s + i + 1) : 1<<29; + if (*beg > 0) --*beg; + } else *beg = 0, *end = 1<<29; free(s); + return *beg <= *end? 0 : -1; } int32_t bam_aux2i(const uint8_t *s) @@ -158,104 +205,9 @@ char *bam_aux2Z(const uint8_t *s) else return 0; } -char bam_aux_getCSi(bam1_t *b, int i) -{ - uint8_t *c = bam_aux_get(b, "CS"); - char *cs = NULL; - - // return the base if the tag was not found - if(0 == c) return 0; - - cs = bam_aux2Z(c); - // adjust for strandedness and leading adaptor - if(bam1_strand(b)) i = strlen(cs) - 1 - i; - else i++; - return cs[i]; -} - -char bam_aux_getCQi(bam1_t *b, int i) +#ifdef _WIN32 +double drand48() { - uint8_t *c = bam_aux_get(b, "CQ"); - char *cq = NULL; - - // return the base if the tag was not found - if(0 == c) return 0; - - cq = bam_aux2Z(c); - // adjust for strandedness - if(bam1_strand(b)) i = strlen(cq) - 1 - i; - return cq[i]; -} - -char bam_aux_nt2int(char a) -{ - switch(toupper(a)) { - case 'A': - return 0; - break; - case 'C': - return 1; - break; - case 'G': - return 2; - break; - case 'T': - return 3; - break; - default: - return 4; - break; - } -} - -char bam_aux_ntnt2cs(char a, char b) -{ - a = bam_aux_nt2int(a); - b = bam_aux_nt2int(b); - if(4 == a || 4 == b) return '4'; - return "0123"[(int)(a ^ b)]; -} - -char bam_aux_getCEi(bam1_t *b, int i) -{ - int cs_i; - uint8_t *c = bam_aux_get(b, "CS"); - char *cs = NULL; - char prev_b, cur_b; - char cur_color, cor_color; - - // return the base if the tag was not found - if(0 == c) return 0; - - cs = bam_aux2Z(c); - - // adjust for strandedness and leading adaptor - if(bam1_strand(b)) { //reverse strand - cs_i = strlen(cs) - 1 - i; - // get current color - cur_color = cs[cs_i]; - // get previous base - prev_b = (0 == cs_i) ? cs[0] : bam_nt16_rev_table[bam1_seqi(bam1_seq(b), i+1)]; - // get current base - cur_b = bam_nt16_rev_table[bam1_seqi(bam1_seq(b), i)]; - } - else { - cs_i=i+1; - // get current color - cur_color = cs[cs_i]; - // get previous base - prev_b = (0 == i) ? cs[0] : bam_nt16_rev_table[bam1_seqi(bam1_seq(b), i-1)]; - // get current base - cur_b = bam_nt16_rev_table[bam1_seqi(bam1_seq(b), i)]; - } - - // corrected color - cor_color = bam_aux_ntnt2cs(prev_b, cur_b); - - if(cur_color == cor_color) { - return '-'; - } - else { - return cur_color; - } + return (double)rand() / RAND_MAX; } +#endif