From d0e30eec1158752010659982342a611fc91ae8e3 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Sat, 24 Oct 2009 04:17:10 +0000 Subject: [PATCH] * This revision is SERIOUSLY BUGGY. Please NOT use it. * Start to incorporate header parsing from Petr Danecek --- Makefile | 7 ++-- bam.c | 13 +++++-- bam.h | 5 ++- bam_aux.c | 67 -------------------------------- bam_import.c | 104 +++++++++----------------------------------------- bam_rmdup.c | 4 +- bam_rmdupse.c | 4 +- kaln.c | 11 ------ sam.c | 1 - sam_header.c | 75 +++++++++++++++++++++++++++++++++--- sam_header.h | 53 +++++++------------------ sam_view.c | 3 +- 12 files changed, 123 insertions(+), 224 deletions(-) diff --git a/Makefile b/Makefile index 90f613e..ad2cfb1 100644 --- a/Makefile +++ b/Makefile @@ -1,9 +1,9 @@ CC= gcc -CFLAGS= -g -Wall -O2 #-m64 #-arch ppc +CFLAGS= -g -Wall #-O2 #-m64 #-arch ppc DFLAGS= -D_FILE_OFFSET_BITS=64 -D_USE_KNETFILE -D_CURSES_LIB=1 LOBJS= bgzf.o kstring.o bam_aux.o bam.o bam_import.o sam.o bam_index.o \ bam_pileup.o bam_lpileup.o bam_md.o glf.o razf.o faidx.o knetfile.o \ - bam_sort.o + bam_sort.o sam_header.o AOBJS= bam_tview.o bam_maqcns.o bam_plcmd.o sam_view.o \ bam_rmdup.o bam_rmdupse.o bam_mate.o bam_stat.o bam_color.o \ bamtk.o kaln.o @@ -45,7 +45,7 @@ bgzip:bgzip.o bgzf.o $(CC) $(CFLAGS) -o $@ bgzf.o bgzip.o -lz razip.o:razf.h -bam.o:bam.h razf.h bam_endian.h kstring.h +bam.o:bam.h razf.h bam_endian.h kstring.h sam_header.h sam.o:sam.h bam.h bam_import.o:bam.h kseq.h khash.h razf.h bam_pileup.o:bam.h razf.h ksort.h @@ -57,6 +57,7 @@ bam_maqcns.o:bam.h ksort.h bam_maqcns.h bam_sort.o:bam.h ksort.h razf.h bam_md.o:bam.h faidx.h glf.o:glf.h +sam_header.o:sam_header.h khash.h faidx.o:faidx.h razf.h khash.h faidx_main.o:faidx.h razf.h diff --git a/bam.c b/bam.c index 837b956..8b51756 100644 --- a/bam.c +++ b/bam.c @@ -5,6 +5,7 @@ #include "bam.h" #include "bam_endian.h" #include "kstring.h" +#include "sam_header.h" int bam_is_be = 0; char *bam_flag2char_table = "pPuUrR12sfd\0\0\0\0\0"; @@ -59,10 +60,9 @@ void bam_header_destroy(bam_header_t *header) free(header->target_len); } free(header->text); -#ifndef BAM_NO_HASH - if (header->rg2lib) bam_strmap_destroy(header->rg2lib); + if (header->dict) sam_header_free(header->dict); + if (header->rg2lib) sam_tbl_destroy(header->rg2lib); bam_destroy_header_hash(header); -#endif free(header); } @@ -291,3 +291,10 @@ void bam_view1(const bam_header_t *header, const bam1_t *b) printf("%s\n", s); free(s); } + +const char *bam_get_library(const bam_header_t *header, const bam1_t *b) +{ + const uint8_t *rg; + rg = bam_aux_get(b, "RG"); + return (rg == 0)? 0 : sam_tbl_get(header->rg2lib, (const char*)(rg + 1)); +} diff --git a/bam.h b/bam.h index a7e0f08..4d30dcc 100644 --- a/bam.h +++ b/bam.h @@ -73,6 +73,7 @@ typedef gzFile bamFile; @field n_targets number of reference sequences @field target_name names of the reference sequences @field target_len lengths of the referene sequences + @field dict header dictionary @field hash hash table for fast name lookup @field rg2lib hash table for @RG-ID -> LB lookup @field l_text length of the plain text in the header @@ -85,7 +86,7 @@ typedef struct { int32_t n_targets; char **target_name; uint32_t *target_len; - void *hash, *rg2lib; + void *dict, *hash, *rg2lib; int l_text; char *text; } bam_header_t; @@ -437,6 +438,8 @@ extern "C" { char *bam_format1_core(const bam_header_t *header, const bam1_t *b, int of); + const char *bam_get_library(const bam_header_t *header, const bam1_t *b); + /*! @typedef @abstract Structure for one alignment covering the pileup position. @field b pointer to the alignment diff --git a/bam_aux.c b/bam_aux.c index d0d733f..89e99f2 100644 --- a/bam_aux.c +++ b/bam_aux.c @@ -180,70 +180,3 @@ char *bam_aux2Z(const uint8_t *s) if (type == 'Z' || type == 'H') return (char*)s; else return 0; } - -/****************** - * rg2lib related * - ******************/ - -int bam_strmap_put(void *rg2lib, const char *rg, const char *lib) -{ - int ret; - khint_t k; - khash_t(r2l) *h = (khash_t(r2l)*)rg2lib; - char *key; - if (h == 0) return 1; - key = strdup(rg); - k = kh_put(r2l, h, key, &ret); - if (ret) kh_val(h, k) = strdup(lib); - else { - fprintf(stderr, "[bam_rg2lib_put] duplicated @RG ID: %s\n", rg); - free(key); - } - return 0; -} - -const char *bam_strmap_get(const void *rg2lib, const char *rg) -{ - const khash_t(r2l) *h = (const khash_t(r2l)*)rg2lib; - khint_t k; - if (h == 0) return 0; - k = kh_get(r2l, h, rg); - if (k != kh_end(h)) return (const char*)kh_val(h, k); - else return 0; -} - -void *bam_strmap_dup(const void *rg2lib) -{ - const khash_t(r2l) *h = (const khash_t(r2l)*)rg2lib; - khash_t(r2l) *g; - khint_t k, l; - int ret; - if (h == 0) return 0; - g = kh_init(r2l); - for (k = kh_begin(h); k < kh_end(h); ++k) { - if (kh_exist(h, k)) { - char *key = strdup(kh_key(h, k)); - l = kh_put(r2l, g, key, &ret); - kh_val(g, l) = strdup(kh_val(h, k)); - } - } - return g; -} - -void *bam_strmap_init() -{ - return (void*)kh_init(r2l); -} - -void bam_strmap_destroy(void *rg2lib) -{ - khash_t(r2l) *h = (khash_t(r2l)*)rg2lib; - khint_t k; - if (h == 0) return; - for (k = kh_begin(h); k < kh_end(h); ++k) { - if (kh_exist(h, k)) { - free((char*)kh_key(h, k)); free(kh_val(h, k)); - } - } - kh_destroy(r2l, h); -} diff --git a/bam_import.c b/bam_import.c index eaa9452..427140d 100644 --- a/bam_import.c +++ b/bam_import.c @@ -10,6 +10,7 @@ #endif #include "kstring.h" #include "bam.h" +#include "sam_header.h" #include "kseq.h" #include "khash.h" @@ -172,104 +173,35 @@ static inline void append_text(bam_header_t *header, kstring_t *str) int sam_header_parse_rg(bam_header_t *h) { - kstring_t *rgid, *rglib; - char *p, *q, *s, *r; - int n = 0; - - // free - if (h == 0) return 0; - bam_strmap_destroy(h->rg2lib); h->rg2lib = 0; - if (h->l_text < 3) return 0; - // parse @RG lines - h->rg2lib = bam_strmap_init(); - rgid = calloc(1, sizeof(kstring_t)); - rglib = calloc(1, sizeof(kstring_t)); - s = h->text; - while ((s = strstr(s, "@RG")) != 0) { - if (rgid->l && rglib->l) { - bam_strmap_put(h->rg2lib, rgid->s, rglib->s); - ++n; - } - rgid->l = rglib->l = 0; - s += 3; - r = s; - if ((p = strstr(s, "ID:")) != 0) { - q = p + 3; - for (p = q; *p && *p != '\t' && *p != '\r' && *p != '\n'; ++p); - kputsn(q, p - q, rgid); - } else { - fprintf(stderr, "[bam_header_parse] missing ID tag in @RG lines.\n"); - break; - } - if (r < p) r = p; - if ((p = strstr(s, "LB:")) != 0) { - q = p + 3; - for (p = q; *p && *p != '\t' && *p != '\r' && *p != '\n'; ++p); - kputsn(q, p - q, rglib); - } else { - fprintf(stderr, "[bam_header_parse] missing LB tag in @RG lines.\n"); - break; - } - if (r < p) r = p; - } - if (rgid->l && rglib->l) { - bam_strmap_put(h->rg2lib, rgid->s, rglib->s); - ++n; - } - free(rgid->s); free(rgid); - free(rglib->s); free(rglib); - if (n == 0) { - bam_strmap_destroy(h->rg2lib); - h->rg2lib = 0; - } - return n; + if (h->dict == 0) h->dict = sam_header_parse2(h->text); + if (h->rg2lib) h->rg2lib = sam_header2tbl(h->dict, "RG", "ID", "LB"); + return sam_tbl_size(h->rg2lib); } int sam_header_parse(bam_header_t *h) { + void *tbl; + char **tmp; int i; - char *s, *p, *q, *r; - - // free free(h->target_len); free(h->target_name); h->n_targets = 0; h->target_len = 0; h->target_name = 0; if (h->l_text < 3) return 0; - // count number of @SQ - s = h->text; - while ((s = strstr(s, "@SQ")) != 0) { - ++h->n_targets; - s += 3; + if (h->dict == 0) h->dict = sam_header_parse2(h->text); + tbl = sam_header2tbl(h->dict, "SQ", "SN", "LN"); + h->n_targets = sam_tbl_size(tbl); + if (h->n_targets == 0) { + sam_tbl_destroy(tbl); + return 0; } - if (h->n_targets == 0) return 0; h->target_len = (uint32_t*)calloc(h->n_targets, 4); h->target_name = (char**)calloc(h->n_targets, sizeof(void*)); - // parse @SQ lines - i = 0; - s = h->text; - while ((s = strstr(s, "@SQ")) != 0) { - s += 3; - r = s; - if ((p = strstr(s, "SN:")) != 0) { - q = p + 3; - for (p = q; *p && *p != '\t' && *p != '\r' && *p != '\n'; ++p); - h->target_name[i] = (char*)calloc(p - q + 1, 1); - strncpy(h->target_name[i], q, p - q); - } else goto header_err_ret; - if (r < p) r = p; - if ((p = strstr(s, "LN:")) != 0) h->target_len[i] = strtol(p + 3, 0, 10); - else goto header_err_ret; - if (r < p) r = p; - s = r + 3; - ++i; - } - sam_header_parse_rg(h); + tmp = (char**)calloc(h->n_targets, sizeof(void*)); + sam_tbl_pair(tbl, h->target_name, tmp); + for (i = 0; i < h->n_targets; ++i) + h->target_len[i] = atoi(tmp[i]); + free(tmp); + sam_tbl_destroy(tbl); return h->n_targets; - -header_err_ret: - fprintf(stderr, "[bam_header_parse] missing SN or LN tag in @SQ lines.\n"); - free(h->target_len); free(h->target_name); - h->n_targets = 0; h->target_len = 0; h->target_name = 0; - return 0; } bam_header_t *sam_header_read(tamFile fp) diff --git a/bam_rmdup.c b/bam_rmdup.c index fe6fbef..f0d2b5d 100644 --- a/bam_rmdup.c +++ b/bam_rmdup.c @@ -127,11 +127,9 @@ void bam_rmdup_core(samfile_t *in, samfile_t *out) } else if (c->isize > 0) { // paired, head uint64_t key = (uint64_t)c->pos<<32 | c->isize; const char *lib; - const uint8_t *rg; lib_aux_t *q; int ret; - rg = bam_aux_get(b, "RG"); - lib = (rg == 0)? 0 : bam_strmap_get(in->header->rg2lib, (char*)(rg + 1)); + lib = bam_get_library(in->header, b); q = lib? get_aux(aux, lib) : get_aux(aux, "\t"); ++q->n_checked; k = kh_put(pos, q->best_hash, key, &ret); diff --git a/bam_rmdupse.c b/bam_rmdupse.c index edc73a4..e7dbdc7 100644 --- a/bam_rmdupse.c +++ b/bam_rmdupse.c @@ -117,13 +117,11 @@ void bam_rmdupse_core(samfile_t *in, samfile_t *out, int force_se) push_queue(queue, b, endpos, score); } else { const char *lib; - const uint8_t *rg; lib_aux_t *q; besthash_t *h; uint32_t key; int ret; - rg = bam_aux_get(b, "RG"); - lib = (rg == 0)? 0 : bam_strmap_get(in->header->rg2lib, (char*)(rg + 1)); + lib = bam_get_library(in->header, b); q = lib? get_aux(aux, lib) : get_aux(aux, "\t"); ++q->n_checked; h = (c->flag&BAM_FREVERSE)? q->rght : q->left; diff --git a/kaln.c b/kaln.c index 015030e..9fa40d0 100644 --- a/kaln.c +++ b/kaln.c @@ -182,17 +182,6 @@ typedef struct { int M, I, D; } dpscore_t; -/* build score profile for accelerating alignment, in theory */ -static void aln_init_score_array(uint8_t *seq, int len, int row, int *score_matrix, int **s_array) -{ - int *tmp, *tmp2, i, k; - for (i = 0; i != row; ++i) { - tmp = score_matrix + i * row; - tmp2 = s_array[i]; - for (k = 0; k != len; ++k) - tmp2[k] = tmp[seq[k]]; - } -} /*************************** * banded global alignment * ***************************/ diff --git a/sam.c b/sam.c index a74c557..07524c0 100644 --- a/sam.c +++ b/sam.c @@ -21,7 +21,6 @@ bam_header_t *bam_header_dup(const bam_header_t *h0) h->target_len[i] = h0->target_len[i]; h->target_name[i] = strdup(h0->target_name[i]); } - if (h0->rg2lib) h->rg2lib = bam_strmap_dup(h0->rg2lib); return h; } static void append_header_text(bam_header_t *header, char* text, int len) diff --git a/sam_header.c b/sam_header.c index b9ed0f9..fa80050 100644 --- a/sam_header.c +++ b/sam_header.c @@ -5,6 +5,31 @@ #include #include +#include "khash.h" +KHASH_MAP_INIT_STR(str, const char *) + +struct _HeaderList +{ + struct _HeaderList *next; + void *data; +}; +typedef struct _HeaderList list_t; +typedef list_t HeaderDict; + +typedef struct +{ + char key[2]; + char *value; +} +HeaderTag; + +typedef struct +{ + char type[2]; + list_t *tags; +} +HeaderLine; + const char *o_hd_tags[] = {"SO","GO",NULL}; const char *r_hd_tags[] = {"VN",NULL}; @@ -13,7 +38,7 @@ const char *r_sq_tags[] = {"SN","LN",NULL}; const char *u_sq_tags[] = {"SN",NULL}; const char *o_rg_tags[] = {"LB","DS","PU","PI","CN","DT","PL",NULL}; -const char *r_rg_tags[] = {"ID","SM",NULL}; +const char *r_rg_tags[] = {"ID",NULL}; const char *u_rg_tags[] = {"ID",NULL}; const char *o_pg_tags[] = {"VN","CL",NULL}; @@ -401,8 +426,9 @@ void print_header_line(FILE *fp, HeaderLine *hline) } -void sam_header_free(HeaderDict *header) +void sam_header_free(void *_header) { + HeaderDict *header = (HeaderDict*)_header; list_t *hlines = header; while (hlines) { @@ -435,8 +461,9 @@ HeaderDict *sam_header_clone(const HeaderDict *dict) } // Returns a newly allocated string -char *sam_header_write(const HeaderDict *header) +char *sam_header_write(const void *_header) { + const HeaderDict *header = (const HeaderDict*)_header; char *out = NULL; int len=0, nout=0; const list_t *hlines; @@ -486,7 +513,7 @@ char *sam_header_write(const HeaderDict *header) return out; } -HeaderDict *sam_header_parse(const char *headerText) +void *sam_header_parse2(const char *headerText) { list_t *hlines = NULL; HeaderLine *hline; @@ -514,8 +541,9 @@ HeaderDict *sam_header_parse(const char *headerText) return hlines; } -khash_t(str) *sam_header_lookup_table(const HeaderDict *dict, char type[2], char key_tag[2], char value_tag[2]) +void *sam_header2tbl(const void *_dict, char type[2], char key_tag[2], char value_tag[2]) { + const HeaderDict *dict = (const HeaderDict*)_dict; const list_t *l = dict; khash_t(str) *tbl = kh_init(str); khiter_t k; @@ -550,9 +578,44 @@ khash_t(str) *sam_header_lookup_table(const HeaderDict *dict, char type[2], char return tbl; } +const char *sam_tbl_get(void *h, const char *key) +{ + khash_t(str) *tbl = (khash_t(str)*)h; + khint_t k; + k = kh_get(str, tbl, key); + return k == kh_end(tbl)? 0 : kh_val(tbl, k); +} + +int sam_tbl_size(void *h) +{ + khash_t(str) *tbl = (khash_t(str)*)h; + return h? kh_size(tbl) : 0; +} + +int sam_tbl_pair(void *h, char **keys, char **vals) +{ + khash_t(str) *tbl = (khash_t(str)*)h; + int i = 0; + khint_t k; + if (h == 0) return -1; + for (k = kh_begin(tbl); k != kh_end(tbl); ++k) { + if (kh_exist(tbl, k)) { + keys[i] = (char*)kh_key(tbl, k); + vals[i++] = (char*)kh_val(tbl, k); + } + } + return kh_size(tbl); +} + +void sam_tbl_destroy(void *h) +{ + khash_t(str) *tbl = (khash_t(str)*)h; + kh_destroy(str, tbl); +} -HeaderDict *sam_header_merge(int n, const HeaderDict **dicts) +void *sam_header_merge(int n, const void **_dicts) { + const HeaderDict **dicts = (const HeaderDict**)_dicts; HeaderDict *out_dict; int idict, status; diff --git a/sam_header.h b/sam_header.h index 442a989..5045616 100644 --- a/sam_header.h +++ b/sam_header.h @@ -1,48 +1,23 @@ #ifndef __SAM_HEADER_H__ #define __SAM_HEADER_H__ -#include "khash.h" -KHASH_MAP_INIT_STR(str,const char *) +#ifdef __cplusplus +extern "C" { +#endif -// HeaderDict is a list_t of header lines. Each HeaderLine holds a list of tags. -struct _list_t -{ - struct _list_t *next; - void *data; -}; -typedef struct _list_t list_t; -typedef list_t HeaderDict; + void *sam_header_parse2(const char *headerText); + void *sam_header_merge(int n, const void **dicts); + void sam_header_free(void *header); + char *sam_header_write(const void *headerDict); // returns a newly allocated string -typedef struct -{ - char key[2]; - char *value; -} -HeaderTag; + void *sam_header2tbl(const void *dict, char type[2], char key_tag[2], char value_tag[2]); + const char *sam_tbl_get(void *h, const char *key); + int sam_tbl_size(void *h); + int sam_tbl_pair(void *h, char **keys, char **vals); + void sam_tbl_destroy(void *h); -typedef struct -{ - char type[2]; - list_t *tags; +#ifdef __cplusplus } -HeaderLine; - - -void debug(const char *format, ...); -void error(const char *format, ...); - -HeaderDict *sam_header_parse(const char *headerText); -HeaderDict *sam_header_merge(int n, const HeaderDict **dicts); -void sam_header_free(HeaderDict *header); -char *sam_header_write(const HeaderDict *headerDict); // returns a newly allocated string - -khash_t(str) *sam_header_lookup_table(const HeaderDict *dict, char type[2], char key_tag[2], char value_tag[2]); - -list_t *list_append(list_t *root, void *data); -void list_free(list_t *root); - -//char *sam_header_get(const HeaderDict *d, char type[2], int i, char tag[2]); -//int sam_header_ins(HeaderDict *d, char tp[2], int i, char tg[2], const char *s); -//int sam_header_del(HeaderDict *d, char type[2], int i, char tag[2]); +#endif #endif diff --git a/sam_view.c b/sam_view.c index 113c6c4..105b532 100644 --- a/sam_view.c +++ b/sam_view.c @@ -2,6 +2,7 @@ #include #include #include +#include "sam_header.h" #include "sam.h" #include "faidx.h" @@ -17,7 +18,7 @@ static inline int __g_skip_aln(const bam_header_t *h, const bam1_t *b) if (s) { if (g_rg && strcmp(g_rg, (char*)(s + 1)) == 0) return 0; if (g_library) { - const char *p = bam_strmap_get(h->rg2lib, (char*)(s + 1)); + const char *p = sam_tbl_get(h->rg2lib, (const char*)(s + 1)); return (p && strcmp(p, g_library) == 0)? 0 : 1; } return 1; } else return 1; -- 2.39.2