From: Petr Danecek Date: Thu, 22 Oct 2009 15:41:56 +0000 (+0000) Subject: (no commit message) X-Git-Url: https://git.donarmstrong.com/?a=commitdiff_plain;h=d410ecbf60b5aed90ee20c6ba40637ca50241edd;p=samtools.git --- diff --git a/sam_header.c b/sam_header.c new file mode 100644 index 0000000..b74b52b --- /dev/null +++ b/sam_header.c @@ -0,0 +1,438 @@ +#include "sam_header.h" +#include +#include +#include +#include + +const char *o_hd_tags[] = {"SO","GO",NULL}; +const char *r_hd_tags[] = {"VN",NULL}; +const char *o_sq_tags[] = {"AS","M5","UR","SP",NULL}; +const char *r_sq_tags[] = {"SN","LN",NULL}; +const char *o_rg_tags[] = {"LB","DS","PU","PI","CN","DT","PL",NULL}; +const char *r_rg_tags[] = {"ID","SM",NULL}; +const char *o_pg_tags[] = {"VN","CL",NULL}; +const char *r_pg_tags[] = {"ID",NULL}; +const char *types[] = {"HD","SQ","RG","PG","CO",NULL}; +const char **optional_tags[] = {o_hd_tags,o_sq_tags,o_rg_tags,o_pg_tags,NULL,NULL}; +const char **required_tags[] = {r_hd_tags,r_sq_tags,r_rg_tags,r_pg_tags,NULL,NULL}; + + +list_t *list_append(list_t *root, void *data) +{ + list_t *l = root; + while (l && l->next) + l = l->next; + if ( l ) + { + l->next = malloc(sizeof(list_t)); + l = l->next; + } + else + { + l = malloc(sizeof(list_t)); + root = l; + } + l->data = data; + l->next = NULL; + return root; +} + +void list_free(list_t *root) +{ + list_t *l = root; + while (root) + { + l = root; + root = root->next; + free(l); + } +} + + + +// Look for a tag "XY" in a predefined const char *[] array. +int tag_exists(const char *tag, const char **tags) +{ + int itag=0; + if ( !tags ) return -1; + while ( tags[itag] ) + { + if ( tags[itag][0]==tag[0] && tags[itag][1]==tag[1] ) return itag; + itag++; + } + return -1; +} + + + +// Mimics the behaviour of getline, except it returns pointer to the next chunk of the text +// or NULL if everything has been read. The lineptr should be freed by the caller. The +// newline character is stripped. +const char *nextline(char **lineptr, size_t *n, const char *text) +{ + int len; + const char *to = text; + + if ( !*to ) return NULL; + + while ( *to && *to!='\n' && *to!='\r' ) to++; + len = to - text + 1; + + if ( *to ) + { + // Advance the pointer for the next call + if ( *to=='\n' ) to++; + else if ( *to=='\r' && *(to+1)=='\n' ) to+=2; + } + if ( !len ) + return to; + + if ( !*lineptr ) + { + *lineptr = malloc(len); + *n = len; + } + else if ( *nkey[0] = name[0]; + tag->key[1] = name[1]; + tag->value = malloc(len+1); + memcpy(tag->value,value_from,len+1); + tag->value[len] = 0; + return tag; +} + +HeaderTag *header_line_has_tag(HeaderLine *hline, const char *key) +{ + list_t *tags = hline->tags; + while (tags) + { + HeaderTag *tag = tags->data; + if ( tag->key[0]==key[0] && tag->key[1]==key[1] ) return tag; + tags = tags->next; + } + return NULL; +} + +#if 0 +// Is there a HeaderLine with all required fields identical to those given in the hline? +HeaderLine *sam_header_has_line(HeaderDict *dict, HeaderLine *hline) +{ + HeaderLine *found=NULL; + + while (dict) + { + HeaderLine *dline = dict->data; + + if ( hline->type[0]!=dline->type[0] || hline->type[1]!=dline->type[1] ) + { + dict = dict->next; + continue; + } + + int itype = tag_exists(hline->type,types); + if ( itype==-1 ) error("[sam_header_has_line] Unknown type [%c%c]\n", hline->type[0],hline->type[1]); + + int ireq=0, differ=0; + while ( required_tags[itype] && required_tags[itype][ireq] ) + { + HeaderTag *t1, *t2; + t1 = header_line_has_tag(hline,required_tags[itype][ireq]); + t2 = header_line_has_tag(dline,required_tags[itype][ireq]); + if ( !t1 || !t2 ) error("[sam_header_has_line] Missing a required tag [%c%c]\n", + required_tags[itype][ireq][0],required_tags[itype][ireq][1]); + if ( strcmp(t1->value,t2->value) ) + ireq++; + } + dict = dict->next; + } + return found; +} +#endif + +HeaderLine *sam_header_line_parse(const char *headerLine) +{ + HeaderLine *hline; + HeaderTag *tag; + const char *from, *to; + from = headerLine; + + if ( *from != '@' ) error("[sam_header_line_parse] expected '@', got [%s]\n", headerLine); + to = ++from; + + while (*to && *to!='\t') to++; + if ( to-from != 2 ) error("[sam_header_line_parse] expected '@XY', got [%s]\n", headerLine); + + hline = malloc(sizeof(HeaderLine)); + hline->type[0] = from[0]; + hline->type[1] = from[1]; + hline->tags = NULL; + + int itype = tag_exists(hline->type, types); + + from = to; + while (*to && *to=='\t') to++; + if ( to-from != 1 ) + error("[sam_header_line_parse] multiple tabs on line [%s] (%d)\n", headerLine,(int)(to-from)); + from = to; + while (*from) + { + while (*to && *to!='\t') to++; + + if ( !required_tags[itype] && !optional_tags[itype] ) + tag = new_tag(" ",from,to-1); + else + tag = new_tag(from,from+3,to-1); + + if ( header_line_has_tag(hline,tag->key) ) + debug("The tag '%c%c' present (at least) twice on line [%s]\n", tag->key[0],tag->key[1], headerLine); + hline->tags = list_append(hline->tags, tag); + + from = to; + while (*to && *to=='\t') to++; + if ( *to && to-from != 1 ) + error("[sam_header_line_parse] multiple tabs on line [%s] (%d)\n", headerLine,(int)(to-from)); + + from = to; + } + return hline; +} + + +// Must be of an existing type, all tags must be recognised and all required tags must be present +int sam_header_line_validate(HeaderLine *hline) +{ + list_t *tags; + HeaderTag *tag; + int itype, itag; + + // Is the type correct? + itype = tag_exists(hline->type, types); + if ( itype==-1 ) + { + debug("The type [%c%c] not recognised.\n", hline->type[0],hline->type[1]); + return 0; + } + + // Has all required tags? + itag = 0; + while ( required_tags[itype] && required_tags[itype][itag] ) + { + if ( !header_line_has_tag(hline,required_tags[itype][itag]) ) + { + debug("The tag [%c%c] required for [%c%c] not present.\n", required_tags[itype][itag][0],required_tags[itype][itag][1], + hline->type[0],hline->type[1]); + return 0; + } + itag++; + } + + // Are all tags recognised? + tags = hline->tags; + while ( tags ) + { + tag = tags->data; + if ( !tag_exists(tag->key,required_tags[itype]) && !tag_exists(tag->key,optional_tags[itype]) ) + { + debug("Unknown tag [%c%c] for [%c%c].\n", tag->key[0],tag->key[1], hline->type[0],hline->type[1]); + return 0; + } + tags = tags->next; + } + + return 1; +} + +void print_header_line(HeaderLine *hline) +{ + list_t *tags = hline->tags; + HeaderTag *tag; + + printf("@%c%c", hline->type[0],hline->type[1]); + while (tags) + { + tag = tags->data; + printf("\t%c%c:%s", tag->key[0],tag->key[1],tag->value); + tags = tags->next; + } + printf("\n"); +} + + +void sam_header_free(HeaderDict *header) +{ + list_t *hlines = header; + while (hlines) + { + HeaderLine *hline = hlines->data; + list_t *tags = hline->tags; + while (tags) + { + HeaderTag *tag = tags->data; + free(tag->value); + free(tag); + tags = tags->next; + } + list_free(hline->tags); + free(hline); + hlines = hlines->next; + } + list_free(header); +} + +// Returns a newly allocated string +char *sam_header_write(const HeaderDict *header) +{ + char *out = NULL; + int len=0, nout=0; + const list_t *hlines; + + // Calculate the length of the string to allocate + hlines = header; + while (hlines) + { + len += 4; // @XY and \n + + HeaderLine *hline = hlines->data; + list_t *tags = hline->tags; + while (tags) + { + HeaderTag *tag = tags->data; + len += strlen(tag->value) + 1; // \t + if ( tag->key[0]!=' ' || tag->key[1]!=' ' ) + len += strlen(tag->value) + 3; // XY: + tags = tags->next; + } + hlines = hlines->next; + } + + nout = 0; + out = malloc(len+1); + hlines = header; + while (hlines) + { + HeaderLine *hline = hlines->data; + + nout += sprintf(out+nout,"@%c%c",hline->type[0],hline->type[1]); + + list_t *tags = hline->tags; + while (tags) + { + HeaderTag *tag = tags->data; + nout += sprintf(out+nout,"\t"); + if ( tag->key[0]!=' ' || tag->key[1]!=' ' ) + nout += sprintf(out+nout,"%c%c:", tag->key[0],tag->key[1]); + nout += sprintf(out+nout,"%s", tag->value); + tags = tags->next; + } + hlines = hlines->next; + nout += sprintf(out+nout,"\n"); + } + out[len] = 0; + return out; +} + +HeaderDict *sam_header_parse(const char *headerText) +{ + list_t *hlines = NULL; + HeaderLine *hline; + const char *text; + char *buf=NULL; + size_t nbuf = 0; + + if ( !headerText ) + error("FIXME"); + + text = headerText; + while ( (text=nextline(&buf, &nbuf, text)) ) + { + hline = sam_header_line_parse(buf); + if ( sam_header_line_validate(hline) ) + hlines = list_append(hlines, hline); + else + { + sam_header_free(hlines); + return NULL; + } + } + if ( buf ) free(buf); + + return hlines; +} + +khash_t(str) *sam_header_lookup_table(const HeaderDict *dict, char type[2], char key_tag[2], char value_tag[2]) +{ + const list_t *l = dict; + khash_t(str) *tbl = kh_init(str); + khiter_t k; + int ret; + + while (l) + { + HeaderLine *hline = l->data; + if ( hline->type[0]!=type[0] || hline->type[1]!=type[1] ) + { + l = l->next; + continue; + } + + HeaderTag *key, *value; + key = header_line_has_tag(hline,key_tag); + value = header_line_has_tag(hline,value_tag); + if ( !key || !value ) + { + l = l->next; + continue; + } + + k = kh_get(str, tbl, key->value); + if ( k != kh_end(tbl) ) + debug("[sam_header_lookup_table] They key %s not unique.\n", key->value); + k = kh_put(str, tbl, key->value, &ret); + kh_value(tbl, k) = value->value; + + l = l->next; + } + return tbl; +} + + +#if 0 +TODO +HeaderDict *sam_header_merge(int n, const HeaderDict **dicts) +{ + HeaderDict *out=NULL; + int idict; + + for (idict=0; idictdata); + sam_header_line_merge(hline,hlines->data); + hlines = hlines->next; + } + } +} +#endif + diff --git a/sam_header.h b/sam_header.h new file mode 100644 index 0000000..4f633ed --- /dev/null +++ b/sam_header.h @@ -0,0 +1,48 @@ +#ifndef __SAM_HEADER_H__ +#define __SAM_HEADER_H__ + +#include "khash.h" +KHASH_MAP_INIT_STR(str,const char *) + +// HeaderDict is a list_t of header lines. Each HeaderLine holds a list of tags. +struct _list_t +{ + struct _list_t *next; + void *data; +}; +typedef struct _list_t list_t; +typedef list_t HeaderDict; + +typedef struct +{ + char key[2]; + char *value; +} +HeaderTag; + +typedef struct +{ + char type[2]; + list_t *tags; +} +HeaderLine; + + +void debug(const char *format, ...); +void error(const char *format, ...); + +HeaderDict *sam_header_parse(const char *headerText); +void sam_header_free(HeaderDict *header); +char *sam_header_write(const HeaderDict *headerDict); // returns a newly allocated string + +khash_t(str) *sam_header_lookup_table(const HeaderDict *dict, char type[2], char key_tag[2], char value_tag[2]); + +list_t *list_append(list_t *root, void *data); +void list_free(list_t *root); + +//HeaderDict *sam_header_merge(int n, const HeaderDict **dicts); // if n=1, the func clones the dictionary +//char *sam_header_get(const HeaderDict *d, char type[2], int i, char tag[2]); +//int sam_header_ins(HeaderDict *d, char tp[2], int i, char tg[2], const char *s); +//int sam_header_del(HeaderDict *d, char type[2], int i, char tag[2]); + +#endif