From 78de989ec56e7c17e1fa854cb8c1c4f9ae08856c Mon Sep 17 00:00:00 2001 From: Heng Li Date: Fri, 12 Jun 2009 23:15:24 +0000 Subject: [PATCH] * samtools-0.1.4-8 (r338) * parse the @RG header lines and allow to choose library at the "samtools view" command line --- bam.c | 1 + bam.h | 11 +++++++-- bam_aux.c | 68 +++++++++++++++++++++++++++++++++++++++++++++++++++- bam_import.c | 51 +++++++++++++++++++++++++++++++++++++++ bamtk.c | 2 +- kstring.h | 11 ++++++--- sam.c | 2 ++ sam_view.c | 25 ++++++++++++++----- 8 files changed, 158 insertions(+), 13 deletions(-) diff --git a/bam.c b/bam.c index 22e09cf..e3cb6bd 100644 --- a/bam.c +++ b/bam.c @@ -79,6 +79,7 @@ void bam_header_destroy(bam_header_t *header) } free(header->text); #ifndef BAM_NO_HASH + if (header->rg2lib) bam_strmap_destroy(header->rg2lib); bam_destroy_header_hash(header); #endif free(header); diff --git a/bam.h b/bam.h index 9330075..c635e72 100644 --- a/bam.h +++ b/bam.h @@ -100,7 +100,7 @@ typedef struct { int32_t n_targets; char **target_name; uint32_t *target_len; - void *hash; + void *hash, *rg2lib; int l_text; char *text; } bam_header_t; @@ -317,9 +317,16 @@ extern "C" { bam_header_t *sam_header_read(tamFile fp); int sam_header_parse(bam_header_t *h); + int sam_header_parse_rg(bam_header_t *h); #define sam_write1(header, b) bam_view1(header, b) + int bam_strmap_put(void *strmap, const char *rg, const char *lib); + const char *bam_strmap_get(const void *strmap, const char *rg); + void *bam_strmap_dup(const void*); + void *bam_strmap_init(); + void bam_strmap_destroy(void *strmap); + /*! @abstract Initialize a header structure. @return the pointer to the header structure @@ -593,7 +600,7 @@ extern "C" { void bam_aux_append(bam1_t *b, const char tag[2], char type, int len, uint8_t *data); // uint8_t *bam_aux_get_core(bam1_t *b, const char tag[2]); // an alias of bam_aux_get() - uint8_t *bam_aux_get(bam1_t *b, const char tag[2]); + uint8_t *bam_aux_get(const bam1_t *b, const char tag[2]); int32_t bam_aux2i(const uint8_t *s); float bam_aux2f(const uint8_t *s); double bam_aux2d(const uint8_t *s); diff --git a/bam_aux.c b/bam_aux.c index 41be819..137f9af 100644 --- a/bam_aux.c +++ b/bam_aux.c @@ -1,7 +1,9 @@ #include #include "bam.h" #include "khash.h" +typedef char *str_p; KHASH_MAP_INIT_STR(s, int) +KHASH_MAP_INIT_STR(r2l, str_p) void bam_aux_append(bam1_t *b, const char tag[2], char type, int len, uint8_t *data) { @@ -23,7 +25,7 @@ uint8_t *bam_aux_get_core(bam1_t *b, const char tag[2]) return bam_aux_get(b, tag); } */ -uint8_t *bam_aux_get(bam1_t *b, const char tag[2]) +uint8_t *bam_aux_get(const bam1_t *b, const char tag[2]) { uint8_t *s; int y = tag[0]<<8 | tag[1]; @@ -158,6 +160,70 @@ char *bam_aux2Z(const uint8_t *s) else return 0; } +/****************** + * rg2lib related * + ******************/ + +int bam_strmap_put(void *rg2lib, const char *rg, const char *lib) +{ + int ret; + khint_t k; + khash_t(r2l) *h = (khash_t(r2l)*)rg2lib; + char *key = strdup(rg); + k = kh_put(r2l, h, key, &ret); + if (ret) kh_val(h, k) = strdup(lib); + else { + fprintf(stderr, "[bam_rg2lib_put] duplicated @RG ID: %s\n", rg); + free(key); + } + return 0; +} + +const char *bam_strmap_get(const void *rg2lib, const char *rg) +{ + const khash_t(r2l) *h = (const khash_t(r2l)*)rg2lib; + khint_t k; + k = kh_get(r2l, h, rg); + if (k != kh_end(h)) return (const char*)kh_val(h, k); + else return 0; +} + +void *bam_strmap_dup(const void *rg2lib) +{ + const khash_t(r2l) *h = (const khash_t(r2l)*)rg2lib; + khash_t(r2l) *g = kh_init(r2l); + khint_t k, l; + int ret; + for (k = kh_begin(h); k < kh_end(h); ++k) { + if (kh_exist(h, k)) { + char *key = strdup(kh_key(h, k)); + l = kh_put(r2l, g, key, &ret); + kh_val(g, l) = strdup(kh_val(h, k)); + } + } + return g; +} + +void *bam_strmap_init() +{ + return (void*)kh_init(r2l); +} + +void bam_strmap_destroy(void *rg2lib) +{ + khash_t(r2l) *h = (khash_t(r2l)*)rg2lib; + khint_t k; + if (h == 0) return; + for (k = kh_begin(h); k < kh_end(h); ++k) { + if (kh_exist(h, k)) { + free((char*)kh_key(h, k)); free(kh_val(h, k)); + } + } + kh_destroy(r2l, h); +} + +/*** The following routines were implemented by Nils Homer for color-space support in tview ***/ + char bam_aux_getCSi(bam1_t *b, int i) { uint8_t *c = bam_aux_get(b, "CS"); diff --git a/bam_import.c b/bam_import.c index 77e3243..c085473 100644 --- a/bam_import.c +++ b/bam_import.c @@ -5,6 +5,7 @@ #include #include #include +#include "kstring.h" #include "bam.h" #include "kseq.h" #include "khash.h" @@ -146,6 +147,55 @@ static inline void append_text(bam_header_t *header, kstring_t *str) header->text[header->l_text] = 0; } +int sam_header_parse_rg(bam_header_t *h) +{ + kstring_t *rgid, *rglib; + char *p, *q, *s, *r; + int n = 0; + + bam_strmap_destroy(h->rg2lib); + // parse @RG lines + h->rg2lib = bam_strmap_init(); + rgid = calloc(1, sizeof(kstring_t)); + rglib = calloc(1, sizeof(kstring_t)); + s = h->text; + while ((s = strstr(s, "@RG")) != 0) { + if (rgid->l && rglib->l) { + bam_strmap_put(h->rg2lib, rgid->s, rglib->s); + ++n; + } + rgid->l = rglib->l = 0; + s += 3; + r = s; + if ((p = strstr(s, "ID:")) != 0) { + q = p + 3; + for (p = q; *p && *p != '\t' && *p != '\r' && *p != '\n'; ++p); + kputsn(q, p - q, rgid); + } else { + fprintf(stderr, "[bam_header_parse] missing ID tag in @RG lines.\n"); + break; + } + if (r < p) r = p; + if ((p = strstr(s, "LB:")) != 0) { + q = p + 3; + for (p = q; *p && *p != '\t' && *p != '\r' && *p != '\n'; ++p); + kputsn(q, p - q, rglib); + } else { + fprintf(stderr, "[bam_header_parse] missing LB tag in @RG lines.\n"); + break; + } + if (r < p) r = p; + s = r + 3; + } + if (rgid->l && rglib->l) { + bam_strmap_put(h->rg2lib, rgid->s, rglib->s); + ++n; + } + free(rgid->s); free(rgid); + free(rglib->s); free(rglib); + return n; +} + int sam_header_parse(bam_header_t *h) { int i; @@ -183,6 +233,7 @@ int sam_header_parse(bam_header_t *h) s = r + 3; ++i; } + sam_header_parse_rg(h); return h->n_targets; header_err_ret: diff --git a/bamtk.c b/bamtk.c index 878562c..852213e 100644 --- a/bamtk.c +++ b/bamtk.c @@ -3,7 +3,7 @@ #include "bam.h" #ifndef PACKAGE_VERSION -#define PACKAGE_VERSION "0.1.4-7 (r337)" +#define PACKAGE_VERSION "0.1.4-8 (r338)" #endif int bam_taf2baf(int argc, char *argv[]); diff --git a/kstring.h b/kstring.h index 47d63f7..221ade2 100644 --- a/kstring.h +++ b/kstring.h @@ -19,19 +19,24 @@ typedef struct __kstring_t { int ksprintf(kstring_t *s, const char *fmt, ...); int ksplit_core(char *s, int delimiter, int *_max, int **_offsets); -static inline int kputs(const char *p, kstring_t *s) +static inline int kputsn(const char *p, int l, kstring_t *s) { - int l = strlen(p); if (s->l + l + 1 >= s->m) { s->m = s->l + l + 2; kroundup32(s->m); s->s = (char*)realloc(s->s, s->m); } - strcpy(s->s + s->l, p); + strncpy(s->s + s->l, p, l); s->l += l; + s->s[s->l] = 0; return l; } +static inline int kputs(const char *p, kstring_t *s) +{ + return kputsn(p, strlen(p), s); +} + static inline int kputc(int c, kstring_t *s) { if (s->l + 1 >= s->m) { diff --git a/sam.c b/sam.c index 04be5c5..4c16b02 100644 --- a/sam.c +++ b/sam.c @@ -19,6 +19,7 @@ bam_header_t *bam_header_dup(const bam_header_t *h0) h->target_len[i] = h0->target_len[i]; h->target_name[i] = strdup(h0->target_name[i]); } + if (h0->rg2lib) h->rg2lib = bam_strmap_dup(h0->rg2lib); return h; } @@ -46,6 +47,7 @@ samfile_t *samopen(const char *fn, const char *mode, const void *aux) fprintf(stderr, "[samopen] no @SQ lines in the header.\n"); } else fprintf(stderr, "[samopen] SAM header is present: %d sequences.\n", fp->header->n_targets); } + sam_header_parse_rg(fp->header); } else if (mode[0] == 'w') { // write fp->header = bam_header_dup((const bam_header_t*)aux); if (mode[1] == 'b') { // binary diff --git a/sam_view.c b/sam_view.c index 4858610..50192f1 100644 --- a/sam_view.c +++ b/sam_view.c @@ -5,14 +5,26 @@ #include "sam.h" static int g_min_mapQ = 0, g_flag_on = 0, g_flag_off = 0; +static char *g_library; -#define __g_skip_aln(b) (((b)->core.qual < g_min_mapQ) || ((b->core.flag & g_flag_on) != g_flag_on) \ - || (b->core.flag & g_flag_off)) +static inline int __g_skip_aln(const bam_header_t *h, const bam1_t *b) +{ + if (b->core.qual < g_min_mapQ || ((b->core.flag & g_flag_on) != g_flag_on) || (b->core.flag & g_flag_off)) + return 1; + if (g_library) { + uint8_t *s = bam_aux_get(b, "RG"); + if (s) { + const char *p = bam_strmap_get(h->rg2lib, s + 1); + return (p && strcmp(p, g_library) == 0)? 0 : 1; + } else return 1; + } else return 0; +} // callback function for bam_fetch() static int view_func(const bam1_t *b, void *data) { - if (!__g_skip_aln(b)) samwrite((samfile_t*)data, b); + if (!__g_skip_aln(((samfile_t*)data)->header, b)) + samwrite((samfile_t*)data, b); return 0; } @@ -26,7 +38,7 @@ int main_samview(int argc, char *argv[]) /* parse command-line options */ strcpy(in_mode, "r"); strcpy(out_mode, "w"); - while ((c = getopt(argc, argv, "Sbt:hHo:q:f:F:u")) >= 0) { + while ((c = getopt(argc, argv, "Sbt:hHo:q:f:F:ul:")) >= 0) { switch (c) { case 'S': is_bamin = 0; break; case 'b': is_bamout = 1; break; @@ -38,6 +50,7 @@ int main_samview(int argc, char *argv[]) case 'F': g_flag_off = strtol(optarg, 0, 0); break; case 'q': g_min_mapQ = atoi(optarg); break; case 'u': is_uncompressed = 1; break; + case 'l': g_library = strdup(optarg); break; default: return usage(); } } @@ -64,7 +77,7 @@ int main_samview(int argc, char *argv[]) bam1_t *b = bam_init1(); int r; while ((r = samread(in, b)) >= 0) // read one alignment from `in' - if (!__g_skip_aln(b)) + if (!__g_skip_aln(in->header, b)) samwrite(out, b); // write the alignment to `out' if (r < -1) fprintf(stderr, "[main_samview] truncated file.\n"); bam_destroy1(b); @@ -91,7 +104,7 @@ int main_samview(int argc, char *argv[]) view_end: // close files, free and return - free(fn_list); free(fn_out); + free(fn_list); free(fn_out); free(g_library); samclose(in); samclose(out); return ret; -- 2.39.2