X-Git-Url: https://git.donarmstrong.com/?p=samtools.git;a=blobdiff_plain;f=sam_header.c;h=a1b518101ad20aefb297c240a68756da8eebc371;hp=b9ed0f912fa5a1ead872eb95228989d73109d151;hb=0ccd9d36ebf35ce620a8248ecf4336c84065e6c0;hpb=bb9a8233e309f483c565dac4cb7193decdf324a5 diff --git a/sam_header.c b/sam_header.c index b9ed0f9..a1b5181 100644 --- a/sam_header.c +++ b/sam_header.c @@ -5,6 +5,32 @@ #include #include +#include "khash.h" +KHASH_MAP_INIT_STR(str, const char *) + +struct _HeaderList +{ + struct _HeaderList *last; // Hack: Used and maintained only by list_append_to_end. Maintained in the root node only. + struct _HeaderList *next; + void *data; +}; +typedef struct _HeaderList list_t; +typedef list_t HeaderDict; + +typedef struct +{ + char key[2]; + char *value; +} +HeaderTag; + +typedef struct +{ + char type[2]; + list_t *tags; +} +HeaderLine; + const char *o_hd_tags[] = {"SO","GO",NULL}; const char *r_hd_tags[] = {"VN",NULL}; @@ -12,8 +38,8 @@ const char *o_sq_tags[] = {"AS","M5","UR","SP",NULL}; const char *r_sq_tags[] = {"SN","LN",NULL}; const char *u_sq_tags[] = {"SN",NULL}; -const char *o_rg_tags[] = {"LB","DS","PU","PI","CN","DT","PL",NULL}; -const char *r_rg_tags[] = {"ID","SM",NULL}; +const char *o_rg_tags[] = {"CN","DS","DT","FO","KS","LB","PG","PI","PL","PU","SM",NULL}; +const char *r_rg_tags[] = {"ID",NULL}; const char *u_rg_tags[] = {"ID",NULL}; const char *o_pg_tags[] = {"VN","CL",NULL}; @@ -25,7 +51,7 @@ const char **required_tags[] = {r_hd_tags,r_sq_tags,r_rg_tags,r_pg_tags,NULL,NUL const char **unique_tags[] = {NULL, u_sq_tags,u_rg_tags,NULL,NULL,NULL}; -void debug(const char *format, ...) +static void debug(const char *format, ...) { va_list ap; va_start(ap, format); @@ -33,16 +59,35 @@ void debug(const char *format, ...) va_end(ap); } -void error(const char *format, ...) +#if 0 +// Replaced by list_append_to_end +static list_t *list_prepend(list_t *root, void *data) { - va_list ap; - va_start(ap, format); - vfprintf(stderr, format, ap); - va_end(ap); - exit(-1); + list_t *l = malloc(sizeof(list_t)); + l->next = root; + l->data = data; + return l; +} +#endif + +// Relies on the root->last being correct. Do not use with the other list_* +// routines unless they are fixed to modify root->last as well. +static list_t *list_append_to_end(list_t *root, void *data) +{ + list_t *l = malloc(sizeof(list_t)); + l->last = l; + l->next = NULL; + l->data = data; + + if ( !root ) + return l; + + root->last->next = l; + root->last = l; + return root; } -list_t *list_append(list_t *root, void *data) +static list_t *list_append(list_t *root, void *data) { list_t *l = root; while (l && l->next) @@ -62,7 +107,7 @@ list_t *list_append(list_t *root, void *data) return root; } -void list_free(list_t *root) +static void list_free(list_t *root) { list_t *l = root; while (root) @@ -76,7 +121,7 @@ void list_free(list_t *root) // Look for a tag "XY" in a predefined const char *[] array. -int tag_exists(const char *tag, const char **tags) +static int tag_exists(const char *tag, const char **tags) { int itag=0; if ( !tags ) return -1; @@ -93,7 +138,7 @@ int tag_exists(const char *tag, const char **tags) // Mimics the behaviour of getline, except it returns pointer to the next chunk of the text // or NULL if everything has been read. The lineptr should be freed by the caller. The // newline character is stripped. -const char *nextline(char **lineptr, size_t *n, const char *text) +static const char *nextline(char **lineptr, size_t *n, const char *text) { int len; const char *to = text; @@ -122,8 +167,10 @@ const char *nextline(char **lineptr, size_t *n, const char *text) *lineptr = realloc(*lineptr, len); *n = len; } - if ( !*lineptr ) - error("FIXME\n"); + if ( !*lineptr ) { + debug("[nextline] Insufficient memory!\n"); + return 0; + } memcpy(*lineptr,text,len); (*lineptr)[len-1] = 0; @@ -133,7 +180,7 @@ const char *nextline(char **lineptr, size_t *n, const char *text) // name points to "XY", value_from points to the first character of the value string and // value_to points to the last character of the value string. -HeaderTag *new_tag(const char *name, const char *value_from, const char *value_to) +static HeaderTag *new_tag(const char *name, const char *value_from, const char *value_to) { HeaderTag *tag = malloc(sizeof(HeaderTag)); int len = value_to-value_from+1; @@ -146,7 +193,7 @@ HeaderTag *new_tag(const char *name, const char *value_from, const char *value_t return tag; } -HeaderTag *header_line_has_tag(HeaderLine *hline, const char *key) +static HeaderTag *header_line_has_tag(HeaderLine *hline, const char *key) { list_t *tags = hline->tags; while (tags) @@ -164,7 +211,7 @@ HeaderTag *header_line_has_tag(HeaderLine *hline, const char *key) // 1 .. all tags identical -> no need to merge, drop one // 2 .. the unique tags match and there are some conflicting tags (same tag, different value) -> error, cannot be merged nor duplicated // 3 .. there are some missing complementary tags and no unique conflict -> can be merged into a single line -int sam_header_compare_lines(HeaderLine *hline1, HeaderLine *hline2) +static int sam_header_compare_lines(HeaderLine *hline1, HeaderLine *hline2) { HeaderTag *t1, *t2; @@ -172,7 +219,10 @@ int sam_header_compare_lines(HeaderLine *hline1, HeaderLine *hline2) return 0; int itype = tag_exists(hline1->type,types); - if ( itype==-1 ) error("[sam_header_compare_lines] Unknown type [%c%c]\n", hline1->type[0],hline1->type[1]); + if ( itype==-1 ) { + debug("[sam_header_compare_lines] Unknown type [%c%c]\n", hline1->type[0],hline1->type[1]); + return -1; // FIXME (lh3): error; I do not know how this will be handled in Petr's code + } if ( unique_tags[itype] ) { @@ -236,7 +286,7 @@ int sam_header_compare_lines(HeaderLine *hline1, HeaderLine *hline2) } -HeaderLine *sam_header_line_clone(const HeaderLine *hline) +static HeaderLine *sam_header_line_clone(const HeaderLine *hline) { list_t *tags; HeaderLine *out = malloc(sizeof(HeaderLine)); @@ -260,7 +310,7 @@ HeaderLine *sam_header_line_clone(const HeaderLine *hline) return out; } -int sam_header_line_merge_with(HeaderLine *out_hline, const HeaderLine *tmpl_hline) +static int sam_header_line_merge_with(HeaderLine *out_hline, const HeaderLine *tmpl_hline) { list_t *tmpl_tags; @@ -286,18 +336,24 @@ int sam_header_line_merge_with(HeaderLine *out_hline, const HeaderLine *tmpl_hli } -HeaderLine *sam_header_line_parse(const char *headerLine) +static HeaderLine *sam_header_line_parse(const char *headerLine) { HeaderLine *hline; HeaderTag *tag; const char *from, *to; from = headerLine; - if ( *from != '@' ) error("[sam_header_line_parse] expected '@', got [%s]\n", headerLine); + if ( *from != '@' ) { + debug("[sam_header_line_parse] expected '@', got [%s]\n", headerLine); + return 0; + } to = ++from; while (*to && *to!='\t') to++; - if ( to-from != 2 ) error("[sam_header_line_parse] expected '@XY', got [%s]\n", headerLine); + if ( to-from != 2 ) { + debug("[sam_header_line_parse] expected '@XY', got [%s]\nHint: The header tags must be tab-separated.\n", headerLine); + return 0; + } hline = malloc(sizeof(HeaderLine)); hline->type[0] = from[0]; @@ -308,15 +364,21 @@ HeaderLine *sam_header_line_parse(const char *headerLine) from = to; while (*to && *to=='\t') to++; - if ( to-from != 1 ) - error("[sam_header_line_parse] multiple tabs on line [%s] (%d)\n", headerLine,(int)(to-from)); + if ( to-from != 1 ) { + debug("[sam_header_line_parse] multiple tabs on line [%s] (%d)\n", headerLine,(int)(to-from)); + return 0; + } from = to; while (*from) { while (*to && *to!='\t') to++; if ( !required_tags[itype] && !optional_tags[itype] ) + { + // CO is a special case, it can contain anything, including tabs + if ( *to ) { to++; continue; } tag = new_tag(" ",from,to-1); + } else tag = new_tag(from,from+3,to-1); @@ -326,8 +388,10 @@ HeaderLine *sam_header_line_parse(const char *headerLine) from = to; while (*to && *to=='\t') to++; - if ( *to && to-from != 1 ) - error("[sam_header_line_parse] multiple tabs on line [%s] (%d)\n", headerLine,(int)(to-from)); + if ( *to && to-from != 1 ) { + debug("[sam_header_line_parse] multiple tabs on line [%s] (%d)\n", headerLine,(int)(to-from)); + return 0; + } from = to; } @@ -336,7 +400,7 @@ HeaderLine *sam_header_line_parse(const char *headerLine) // Must be of an existing type, all tags must be recognised and all required tags must be present -int sam_header_line_validate(HeaderLine *hline) +static int sam_header_line_validate(HeaderLine *hline) { list_t *tags; HeaderTag *tag; @@ -370,8 +434,14 @@ int sam_header_line_validate(HeaderLine *hline) tag = tags->data; if ( !tag_exists(tag->key,required_tags[itype]) && !tag_exists(tag->key,optional_tags[itype]) ) { - debug("Unknown tag [%c%c] for [%c%c].\n", tag->key[0],tag->key[1], hline->type[0],hline->type[1]); - return 0; + // Lower case tags are user-defined values. + if( !(islower(tag->key[0]) || islower(tag->key[1])) ) + { + // Neither is lower case, but tag was not recognized. + debug("Unknown tag [%c%c] for [%c%c].\n", tag->key[0],tag->key[1], hline->type[0],hline->type[1]); + // return 0; // Even unknown tags are allowed - for forward compatibility with new attributes + } + // else - allow user defined tag } tags = tags->next; } @@ -380,7 +450,7 @@ int sam_header_line_validate(HeaderLine *hline) } -void print_header_line(FILE *fp, HeaderLine *hline) +static void print_header_line(FILE *fp, HeaderLine *hline) { list_t *tags = hline->tags; HeaderTag *tag; @@ -401,22 +471,27 @@ void print_header_line(FILE *fp, HeaderLine *hline) } -void sam_header_free(HeaderDict *header) +static void sam_header_line_free(HeaderLine *hline) +{ + list_t *tags = hline->tags; + while (tags) + { + HeaderTag *tag = tags->data; + free(tag->value); + free(tag); + tags = tags->next; + } + list_free(hline->tags); + free(hline); +} + +void sam_header_free(void *_header) { + HeaderDict *header = (HeaderDict*)_header; list_t *hlines = header; while (hlines) { - HeaderLine *hline = hlines->data; - list_t *tags = hline->tags; - while (tags) - { - HeaderTag *tag = tags->data; - free(tag->value); - free(tag); - tags = tags->next; - } - list_free(hline->tags); - free(hline); + sam_header_line_free(hlines->data); hlines = hlines->next; } list_free(header); @@ -435,8 +510,9 @@ HeaderDict *sam_header_clone(const HeaderDict *dict) } // Returns a newly allocated string -char *sam_header_write(const HeaderDict *header) +char *sam_header_write(const void *_header) { + const HeaderDict *header = (const HeaderDict*)_header; char *out = NULL; int len=0, nout=0; const list_t *hlines; @@ -486,26 +562,30 @@ char *sam_header_write(const HeaderDict *header) return out; } -HeaderDict *sam_header_parse(const char *headerText) +void *sam_header_parse2(const char *headerText) { list_t *hlines = NULL; HeaderLine *hline; const char *text; char *buf=NULL; size_t nbuf = 0; + int tovalidate = 0; if ( !headerText ) - error("FIXME"); + return 0; text = headerText; while ( (text=nextline(&buf, &nbuf, text)) ) { hline = sam_header_line_parse(buf); - if ( sam_header_line_validate(hline) ) - hlines = list_append(hlines, hline); + if ( hline && (!tovalidate || sam_header_line_validate(hline)) ) + // With too many (~250,000) reference sequences the header parsing was too slow with list_append. + hlines = list_append_to_end(hlines, hline); else { - sam_header_free(hlines); + if (hline) sam_header_line_free(hline); + sam_header_free(hlines); + if ( buf ) free(buf); return NULL; } } @@ -514,13 +594,15 @@ HeaderDict *sam_header_parse(const char *headerText) return hlines; } -khash_t(str) *sam_header_lookup_table(const HeaderDict *dict, char type[2], char key_tag[2], char value_tag[2]) +void *sam_header2tbl(const void *_dict, char type[2], char key_tag[2], char value_tag[2]) { + const HeaderDict *dict = (const HeaderDict*)_dict; const list_t *l = dict; khash_t(str) *tbl = kh_init(str); khiter_t k; int ret; + if (_dict == 0) return tbl; // return an empty (not null) hash table while (l) { HeaderLine *hline = l->data; @@ -550,9 +632,96 @@ khash_t(str) *sam_header_lookup_table(const HeaderDict *dict, char type[2], char return tbl; } +char **sam_header2list(const void *_dict, char type[2], char key_tag[2], int *_n) +{ + const HeaderDict *dict = (const HeaderDict*)_dict; + const list_t *l = dict; + int max, n; + char **ret; + + ret = 0; *_n = max = n = 0; + while (l) + { + HeaderLine *hline = l->data; + if ( hline->type[0]!=type[0] || hline->type[1]!=type[1] ) + { + l = l->next; + continue; + } + + HeaderTag *key; + key = header_line_has_tag(hline,key_tag); + if ( !key ) + { + l = l->next; + continue; + } + + if (n == max) { + max = max? max<<1 : 4; + ret = realloc(ret, max * sizeof(void*)); + } + ret[n++] = key->value; + + l = l->next; + } + *_n = n; + return ret; +} + +void *sam_header2key_val(void *iter, const char type[2], const char key_tag[2], const char value_tag[2], const char **_key, const char **_value) +{ + list_t *l = iter; + if ( !l ) return NULL; + + while (l) + { + HeaderLine *hline = l->data; + if ( hline->type[0]!=type[0] || hline->type[1]!=type[1] ) + { + l = l->next; + continue; + } + + HeaderTag *key, *value; + key = header_line_has_tag(hline,key_tag); + value = header_line_has_tag(hline,value_tag); + if ( !key && !value ) + { + l = l->next; + continue; + } + + *_key = key->value; + *_value = value->value; + return l->next; + } + return l; +} + +const char *sam_tbl_get(void *h, const char *key) +{ + khash_t(str) *tbl = (khash_t(str)*)h; + khint_t k; + k = kh_get(str, tbl, key); + return k == kh_end(tbl)? 0 : kh_val(tbl, k); +} + +int sam_tbl_size(void *h) +{ + khash_t(str) *tbl = (khash_t(str)*)h; + return h? kh_size(tbl) : 0; +} + +void sam_tbl_destroy(void *h) +{ + khash_t(str) *tbl = (khash_t(str)*)h; + kh_destroy(str, tbl); +} -HeaderDict *sam_header_merge(int n, const HeaderDict **dicts) +void *sam_header_merge(int n, const void **_dicts) { + const HeaderDict **dicts = (const HeaderDict**)_dicts; HeaderDict *out_dict; int idict, status; @@ -581,7 +750,8 @@ HeaderDict *sam_header_merge(int n, const HeaderDict **dicts) { print_header_line(stderr,tmpl_hlines->data); print_header_line(stderr,out_hlines->data); - error("Conflicting lines, cannot merge the headers.\n"); + debug("Conflicting lines, cannot merge the headers.\n"); + return 0; } if ( status==3 ) sam_header_line_merge_with(out_hlines->data, tmpl_hlines->data);