From: Heng Li Date: Mon, 18 Apr 2011 01:55:37 +0000 (+0000) Subject: Added the support for the new SAM/BAM type "B" X-Git-Url: https://git.donarmstrong.com/?p=samtools.git;a=commitdiff_plain;h=ea89c550a4ca078a462dc66dfcf77e65b09e1f2f Added the support for the new SAM/BAM type "B" --- diff --git a/bam.c b/bam.c index a65aed5..5fac17d 100644 --- a/bam.c +++ b/bam.c @@ -160,6 +160,19 @@ static void swap_endian_data(const bam1_core_t *c, int data_len, uint8_t *data) else if (type == 'I' || type == 'F') { bam_swap_endian_4p(s); s += 4; } else if (type == 'D') { bam_swap_endian_8p(s); s += 8; } else if (type == 'Z' || type == 'H') { while (*s) ++s; ++s; } + else if (type == 'B') { + int32_t n, Bsize = bam_aux_type2size(*s); + memcpy(&n, s + 1, 4); + if (1 == Bsize) { + } else if (2 == Bsize) { + for (i = 0; i < n; i += 2) + bam_swap_endian_2p(s + 5 + i); + } else if (4 == Bsize) { + for (i = 0; i < n; i += 4) + bam_swap_endian_4p(s + 5 + i); + } + bam_swap_endian_4p(s+1); + } } } @@ -289,6 +302,23 @@ char *bam_format1_core(const bam_header_t *header, const bam1_t *b, int of) else if (type == 'f') { ksprintf(&str, "f:%g", *(float*)s); s += 4; } else if (type == 'd') { ksprintf(&str, "d:%lg", *(double*)s); s += 8; } else if (type == 'Z' || type == 'H') { kputc(type, &str); kputc(':', &str); while (*s) kputc(*s++, &str); ++s; } + else if (type == 'B') { + uint8_t sub_type = *(s++); + int32_t n; + memcpy(&n, s, 4); + s += 4; // no point to the start of the array + kputc(type, &str); kputc(':', &str); kputc(sub_type, &str); // write the typing + for (i = 0; i < n; ++i) { + kputc(',', &str); + if ('c' == sub_type || 'c' == sub_type) { kputw(*(int8_t*)s, &str); ++s; } + else if ('C' == sub_type) { kputw(*(uint8_t*)s, &str); ++s; } + else if ('s' == sub_type) { kputw(*(int16_t*)s, &str); s += 2; } + else if ('S' == sub_type) { kputw(*(uint16_t*)s, &str); s += 2; } + else if ('i' == sub_type) { kputw(*(int32_t*)s, &str); s += 4; } + else if ('I' == sub_type) { kputuw(*(uint32_t*)s, &str); s += 4; } + else if ('f' == sub_type) { ksprintf(&str, "%g", *(float*)s); s += 4; } + } + } } return str.s; } diff --git a/bam.h b/bam.h index 87e3de3..2de5b09 100644 --- a/bam.h +++ b/bam.h @@ -746,4 +746,13 @@ static inline bam1_t *bam_dup1(const bam1_t *src) return b; } +static inline int bam_aux_type2size(int x) +{ + if (x == 'C' || x == 'c' || x == 'A') return 1; + else if (x == 'S' || x == 's') return 2; + else if (x == 'I' || x == 'i' || x == 'f') return 4; + else return 0; +} + + #endif diff --git a/bam_aux.c b/bam_aux.c index 4fd3190..b6a8d88 100644 --- a/bam_aux.c +++ b/bam_aux.c @@ -26,14 +26,12 @@ uint8_t *bam_aux_get_core(bam1_t *b, const char tag[2]) } #define __skip_tag(s) do { \ - int type = toupper(*(s)); \ - ++(s); \ - if (type == 'C' || type == 'A') ++(s); \ - else if (type == 'S') (s) += 2; \ - else if (type == 'I' || type == 'F') (s) += 4; \ - else if (type == 'D') (s) += 8; \ - else if (type == 'Z' || type == 'H') { while (*(s)) ++(s); ++(s); } \ - } while (0) + int type = toupper(*(s)); \ + ++(s); \ + if (type == 'Z' || type == 'H') { while (*(s)) ++(s); ++(s); } \ + else if (type == 'B') (s) += 5 + bam_aux_type2size(*(s)) * (*(int32_t*)((s)+1)); \ + else (s) += bam_aux_type2size(type); \ + } while(0) uint8_t *bam_aux_get(const bam1_t *b, const char tag[2]) { diff --git a/bam_import.c b/bam_import.c index 9d84328..8c24692 100644 --- a/bam_import.c +++ b/bam_import.c @@ -427,6 +427,27 @@ int sam_read1(tamFile fp, bam_header_t *header, bam1_t *b) memcpy(s, str->s + 5, str->l - 5); s[str->l - 5] = 0; doff += size; + } else if (type == 'B') { + int32_t n = 0, Bsize, k = 0, size; + char *p; + if (str->l < 8) parse_error(fp->n_lines, "too few values in aux type B"); + Bsize = bam_aux_type2size(str->s[5]); // the size of each element + for (p = (char*)str->s + 6; *p; ++p) // count the number of elements in the array + if (*p == ',') ++n; + p = str->s + 7; // now p points to the first number in the array + size = 6 + Bsize * n; // total number of bytes allocated to this tag + s = alloc_data(b, doff + 6 * Bsize * n) + doff; // allocate memory + *s++ = 'B'; *s++ = str->s[5]; + memcpy(s, &n, 4); s += 4; // write the number of elements + if (str->s[5] == 'c') while (p < str->s + str->l) ((int8_t*)s)[k++] = (int8_t)strtol(p, &p, 0), ++p; + else if (str->s[5] == 'C') while (p < str->s + str->l) ((uint8_t*)s)[k++] = (uint8_t)strtol(p, &p, 0), ++p; + else if (str->s[5] == 's') while (p < str->s + str->l) ((int16_t*)s)[k++] = (int16_t)strtol(p, &p, 0), ++p; // FIXME: avoid unaligned memory + else if (str->s[5] == 'S') while (p < str->s + str->l) ((uint16_t*)s)[k++] = (uint16_t)strtol(p, &p, 0), ++p; + else if (str->s[5] == 'i') while (p < str->s + str->l) ((int32_t*)s)[k++] = (int32_t)strtol(p, &p, 0), ++p; + else if (str->s[5] == 'I') while (p < str->s + str->l) ((uint32_t*)s)[k++] = (uint32_t)strtol(p, &p, 0), ++p; + else if (str->s[5] == 'f') while (p < str->s + str->l) ((float*)s)[k++] = (float)strtof(p, &p), ++p; + else parse_error(fp->n_lines, "unrecognized array type"); + s += Bsize * n; doff += size; } else parse_error(fp->n_lines, "unrecognized type"); if (dret == '\n' || dret == '\r') break; } diff --git a/bamtk.c b/bamtk.c index 5886570..0941f6f 100644 --- a/bamtk.c +++ b/bamtk.c @@ -31,46 +31,6 @@ int main_depth(int argc, char *argv[]); int faidx_main(int argc, char *argv[]); int glf3_view_main(int argc, char *argv[]); -int bam_tagview(int argc, char *argv[]) -{ - bamFile fp; - bam_header_t *header; - bam1_t *b; - char tag[2]; - int ret; - if (argc < 3) { - fprintf(stderr, "Usage: samtools tagview \n"); - return 1; - } - fp = strcmp(argv[1], "-")? bam_open(argv[1], "r") : bam_dopen(fileno(stdin), "r"); - assert(fp); - header = bam_header_read(fp); - if (header == 0) { - fprintf(stderr, "[bam_view] fail to read the BAM header. Abort!\n"); - return 1; - } - tag[0] = argv[2][0]; tag[1] = argv[2][1]; - b = (bam1_t*)calloc(1, sizeof(bam1_t)); - while ((ret = bam_read1(fp, b)) >= 0) { - uint8_t *d = bam_aux_get(b, tag); - if (d) { - printf("%s\t%d\t", bam1_qname(b), b->core.flag); - if (d[0] == 'Z' || d[0] == 'H') printf("%s\n", bam_aux2Z(d)); - else if (d[0] == 'f') printf("%f\n", bam_aux2f(d)); - else if (d[0] == 'd') printf("%lf\n", bam_aux2d(d)); - else if (d[0] == 'A') printf("%c\n", bam_aux2A(d)); - else if (d[0] == 'c' || d[0] == 's' || d[0] == 'i') printf("%d\n", bam_aux2i(d)); - else if (d[0] == 'C' || d[0] == 'S' || d[0] == 'I') printf("%u\n", bam_aux2i(d)); - else printf("\n"); - } - } - if (ret < -1) fprintf(stderr, "[bam_view] truncated file? Continue anyway. (%d)\n", ret); - free(b->data); free(b); - bam_header_destroy(header); - bam_close(fp); - return 0; -} - static int usage() { fprintf(stderr, "\n"); @@ -131,7 +91,6 @@ int main(int argc, char *argv[]) else if (strcmp(argv[1], "rmdup") == 0) return bam_rmdup(argc-1, argv+1); else if (strcmp(argv[1], "glfview") == 0) return glf3_view_main(argc-1, argv+1); else if (strcmp(argv[1], "flagstat") == 0) return bam_flagstat(argc-1, argv+1); - else if (strcmp(argv[1], "tagview") == 0) return bam_tagview(argc-1, argv+1); else if (strcmp(argv[1], "calmd") == 0) return bam_fillmd(argc-1, argv+1); else if (strcmp(argv[1], "fillmd") == 0) return bam_fillmd(argc-1, argv+1); else if (strcmp(argv[1], "reheader") == 0) return main_reheader(argc-1, argv+1); diff --git a/examples/toy.sam b/examples/toy.sam index 1aff220..33449b1 100644 --- a/examples/toy.sam +++ b/examples/toy.sam @@ -1,6 +1,6 @@ @SQ SN:ref LN:45 @SQ SN:ref2 LN:40 -r001 163 ref 7 30 8M4I4M1D3M = 37 39 TTAGATAAAGAGGATACTG * +r001 163 ref 7 30 8M4I4M1D3M = 37 39 TTAGATAAAGAGGATACTG * XX:B:S,12561,2,20,112 r002 0 ref 9 30 1S2I6M1P1I1P1I4M2I * 0 0 AAAAGATAAGGGATAAA * r003 0 ref 9 30 5H6M * 0 0 AGCTAA * r004 0 ref 16 30 6M14N1I5M * 0 0 ATAGCTCTCAGC * diff --git a/sam_header.c b/sam_header.c index f49b2d4..f4c8a3b 100644 --- a/sam_header.c +++ b/sam_header.c @@ -38,7 +38,7 @@ const char *o_sq_tags[] = {"AS","M5","UR","SP",NULL}; const char *r_sq_tags[] = {"SN","LN",NULL}; const char *u_sq_tags[] = {"SN",NULL}; -const char *o_rg_tags[] = {"LB","DS","PU","PI","CN","DT","PL",NULL}; +const char *o_rg_tags[] = {"CN","DS","DT","FO","KS","LB","PG","PI","PL","PU","SM",NULL}; const char *r_rg_tags[] = {"ID",NULL}; const char *u_rg_tags[] = {"ID",NULL};