#include <stdlib.h>
#include <stdarg.h>
+#include "khash.h"
+KHASH_MAP_INIT_STR(str, const char *)
+
+struct _HeaderList // Singly linked list node; the same type serves as list_t and as HeaderDict (the list of header lines).
+{
+ struct _HeaderList *last; // Hack: tail pointer used and maintained only by list_append_to_end, and only in the root node; other nodes merely point at themselves from creation time.
+ struct _HeaderList *next; // Following node; NULL in the tail node.
+ void *data; // Payload pointer stored as given; the list routines never look inside it.
+};
+typedef struct _HeaderList list_t;
+typedef list_t HeaderDict;
+
+typedef struct // One "XY:value" tag of a header line.
+{
+ char key[2]; // Two-character tag name; there is no room for a terminating NUL.
+ char *value; // Heap-allocated value string; released with free() in sam_header_line_free.
+}
+HeaderTag;
+
+typedef struct // One parsed "@XY" header line.
+{
+ char type[2]; // Two-character record type following '@'; not NUL-terminated.
+ list_t *tags; // List whose data pointers are HeaderTag*; released in sam_header_line_free.
+}
+HeaderLine;
+
const char *o_hd_tags[] = {"SO","GO",NULL};
const char *r_hd_tags[] = {"VN",NULL};
const char *u_sq_tags[] = {"SN",NULL};
const char *o_rg_tags[] = {"LB","DS","PU","PI","CN","DT","PL",NULL};
-const char *r_rg_tags[] = {"ID","SM",NULL};
+const char *r_rg_tags[] = {"ID",NULL}; // NOTE(review): presumably the required tags of an @RG line; "SM" is no longer listed here after this change - confirm that is intended.
const char *u_rg_tags[] = {"ID",NULL};
const char *o_pg_tags[] = {"VN","CL",NULL};
const char **unique_tags[] = {NULL, u_sq_tags,u_rg_tags,NULL,NULL,NULL};
-void debug(const char *format, ...)
+static void debug(const char *format, ...)
{
va_list ap;
va_start(ap, format);
va_end(ap);
}
-void error(const char *format, ...)
+#if 0
+// Disabled: replaced by list_append_to_end. Allocates a node, links it in front of root and returns it as the new head.
+static list_t *list_prepend(list_t *root, void *data)
{
- va_list ap;
- va_start(ap, format);
- vfprintf(stderr, format, ap);
- va_end(ap);
- exit(-1);
+ list_t *l = malloc(sizeof(list_t));
+ l->next = root; // l->last is never set here, so a list built this way must not be handed to list_append_to_end
+ l->data = data;
+ return l;
+}
+#endif
+
+// Constant-time append: the root node caches the current tail in root->last,
+// so no traversal is needed. Only the root's `last` is kept correct; do not
+// combine this with the other list_* routines unless they are fixed to
+// maintain root->last as well.
+// Returns the root of the list, which is the new node when the list was empty.
+// NOTE(review): the malloc result is used without a NULL check.
+static list_t *list_append_to_end(list_t *root, void *data)
+{
+    list_t *node = malloc(sizeof(list_t));
+    node->data = data;
+    node->next = NULL;
+    node->last = node;
+
+    if ( root )
+    {
+        // Hook the node behind the cached tail, then advance the cache.
+        root->last->next = node;
+        root->last = node;
+        return root;
+    }
+    return node;
}
-list_t *list_append(list_t *root, void *data)
+static list_t *list_append(list_t *root, void *data)
{
list_t *l = root;
while (l && l->next)
return root;
}
-void list_free(list_t *root)
+static void list_free(list_t *root)
{
list_t *l = root;
while (root)
// Look for a tag "XY" in a predefined const char *[] array.
-int tag_exists(const char *tag, const char **tags)
+static int tag_exists(const char *tag, const char **tags)
{
int itag=0;
if ( !tags ) return -1;
// Mimics the behaviour of getline, except it returns pointer to the next chunk of the text
// or NULL if everything has been read. The lineptr should be freed by the caller. The
// newline character is stripped.
-const char *nextline(char **lineptr, size_t *n, const char *text)
+static const char *nextline(char **lineptr, size_t *n, const char *text)
{
int len;
const char *to = text;
*lineptr = realloc(*lineptr, len);
*n = len;
}
- if ( !*lineptr )
- error("FIXME\n");
+ if ( !*lineptr ) {
+ debug("[nextline] Insufficient memory!\n");
+ return 0;
+ }
memcpy(*lineptr,text,len);
(*lineptr)[len-1] = 0;
// name points to "XY", value_from points to the first character of the value string and
// value_to points to the last character of the value string.
-HeaderTag *new_tag(const char *name, const char *value_from, const char *value_to)
+static HeaderTag *new_tag(const char *name, const char *value_from, const char *value_to)
{
HeaderTag *tag = malloc(sizeof(HeaderTag));
int len = value_to-value_from+1;
return tag;
}
-HeaderTag *header_line_has_tag(HeaderLine *hline, const char *key)
+static HeaderTag *header_line_has_tag(HeaderLine *hline, const char *key)
{
list_t *tags = hline->tags;
while (tags)
// 1 .. all tags identical -> no need to merge, drop one
// 2 .. the unique tags match and there are some conflicting tags (same tag, different value) -> error, cannot be merged nor duplicated
// 3 .. there are some missing complementary tags and no unique conflict -> can be merged into a single line
-int sam_header_compare_lines(HeaderLine *hline1, HeaderLine *hline2)
+static int sam_header_compare_lines(HeaderLine *hline1, HeaderLine *hline2)
{
HeaderTag *t1, *t2;
return 0;
int itype = tag_exists(hline1->type,types);
- if ( itype==-1 ) error("[sam_header_compare_lines] Unknown type [%c%c]\n", hline1->type[0],hline1->type[1]);
+ if ( itype==-1 ) {
+ debug("[sam_header_compare_lines] Unknown type [%c%c]\n", hline1->type[0],hline1->type[1]);
+ return -1; // FIXME (lh3): error; I do not know how this will be handled in Petr's code
+ }
if ( unique_tags[itype] )
{
}
-HeaderLine *sam_header_line_clone(const HeaderLine *hline)
+static HeaderLine *sam_header_line_clone(const HeaderLine *hline)
{
list_t *tags;
HeaderLine *out = malloc(sizeof(HeaderLine));
return out;
}
-int sam_header_line_merge_with(HeaderLine *out_hline, const HeaderLine *tmpl_hline)
+static int sam_header_line_merge_with(HeaderLine *out_hline, const HeaderLine *tmpl_hline)
{
list_t *tmpl_tags;
}
-HeaderLine *sam_header_line_parse(const char *headerLine)
+static HeaderLine *sam_header_line_parse(const char *headerLine)
{
HeaderLine *hline;
HeaderTag *tag;
const char *from, *to;
from = headerLine;
- if ( *from != '@' ) error("[sam_header_line_parse] expected '@', got [%s]\n", headerLine);
+ if ( *from != '@' ) {
+ debug("[sam_header_line_parse] expected '@', got [%s]\n", headerLine);
+ return 0;
+ }
to = ++from;
while (*to && *to!='\t') to++;
- if ( to-from != 2 ) error("[sam_header_line_parse] expected '@XY', got [%s]\n", headerLine);
+ if ( to-from != 2 ) {
+ debug("[sam_header_line_parse] expected '@XY', got [%s]\nHint: The header tags must be tab-separated.\n", headerLine);
+ return 0;
+ }
hline = malloc(sizeof(HeaderLine));
hline->type[0] = from[0];
from = to;
while (*to && *to=='\t') to++;
- if ( to-from != 1 )
- error("[sam_header_line_parse] multiple tabs on line [%s] (%d)\n", headerLine,(int)(to-from));
+ if ( to-from != 1 ) {
+ debug("[sam_header_line_parse] multiple tabs on line [%s] (%d)\n", headerLine,(int)(to-from));
+ return 0;
+ }
from = to;
while (*from)
{
while (*to && *to!='\t') to++;
if ( !required_tags[itype] && !optional_tags[itype] )
+ {
+ // CO is a special case, it can contain anything, including tabs
+ if ( *to ) { to++; continue; }
tag = new_tag(" ",from,to-1);
+ }
else
tag = new_tag(from,from+3,to-1);
from = to;
while (*to && *to=='\t') to++;
- if ( *to && to-from != 1 )
- error("[sam_header_line_parse] multiple tabs on line [%s] (%d)\n", headerLine,(int)(to-from));
+ if ( *to && to-from != 1 ) {
+ debug("[sam_header_line_parse] multiple tabs on line [%s] (%d)\n", headerLine,(int)(to-from));
+ return 0;
+ }
from = to;
}
// Must be of an existing type, all tags must be recognised and all required tags must be present
-int sam_header_line_validate(HeaderLine *hline)
+static int sam_header_line_validate(HeaderLine *hline)
{
list_t *tags;
HeaderTag *tag;
}
-void print_header_line(FILE *fp, HeaderLine *hline)
+static void print_header_line(FILE *fp, HeaderLine *hline)
{
list_t *tags = hline->tags;
HeaderTag *tag;
}
-void sam_header_free(HeaderDict *header)
+// Releases a header line and everything reachable from it: every tag's value
+// string, every HeaderTag, the tag list (via list_free) and finally the
+// HeaderLine itself. A NULL hline is a no-op, mirroring free(), and list
+// nodes that carry no tag are skipped.
+static void sam_header_line_free(HeaderLine *hline)
+{
+    list_t *node;
+
+    if ( !hline ) return;
+
+    for (node = hline->tags; node; node = node->next)
+    {
+        HeaderTag *tag = node->data;
+        if ( !tag ) continue;
+        free(tag->value);
+        free(tag);
+    }
+    // The payloads were released above; now release the list itself.
+    list_free(hline->tags);
+    free(hline);
+}
+
+void sam_header_free(void *_header) // Frees a whole header passed as an opaque pointer: every node's HeaderLine first, then the list via list_free.
{
+ HeaderDict *header = (HeaderDict*)_header; // the opaque handle is really the root node of the header-line list
list_t *hlines = header;
while (hlines)
{
- HeaderLine *hline = hlines->data;
- list_t *tags = hline->tags;
- while (tags)
- {
- HeaderTag *tag = tags->data;
- free(tag->value);
- free(tag);
- tags = tags->next;
- }
- list_free(hline->tags);
- free(hline);
+ sam_header_line_free(hlines->data); // releases the line's tags, its tag list and the line itself
hlines = hlines->next;
}
list_free(header);
}
// Returns a newly allocated string
-char *sam_header_write(const HeaderDict *header)
+char *sam_header_write(const void *_header)
{
+ const HeaderDict *header = (const HeaderDict*)_header;
char *out = NULL;
int len=0, nout=0;
const list_t *hlines;
return out;
}
-HeaderDict *sam_header_parse(const char *headerText)
+void *sam_header_parse2(const char *headerText)
{
list_t *hlines = NULL;
HeaderLine *hline;
size_t nbuf = 0;
if ( !headerText )
- error("FIXME");
+ return 0;
text = headerText;
while ( (text=nextline(&buf, &nbuf, text)) )
{
hline = sam_header_line_parse(buf);
- if ( sam_header_line_validate(hline) )
- hlines = list_append(hlines, hline);
+ if ( hline && sam_header_line_validate(hline) )
+ // With too many (~250,000) reference sequences the header parsing was too slow with list_append.
+ hlines = list_append_to_end(hlines, hline);
else
{
- sam_header_free(hlines);
+ if (hline) sam_header_line_free(hline);
+ sam_header_free(hlines);
+ if ( buf ) free(buf);
return NULL;
}
}
return hlines;
}
-khash_t(str) *sam_header_lookup_table(const HeaderDict *dict, char type[2], char key_tag[2], char value_tag[2])
+void *sam_header2tbl(const void *_dict, char type[2], char key_tag[2], char value_tag[2])
{
+ const HeaderDict *dict = (const HeaderDict*)_dict;
const list_t *l = dict;
khash_t(str) *tbl = kh_init(str);
khiter_t k;
int ret;
+ if (_dict == 0) return tbl; // return an empty (not null) hash table
while (l)
{
HeaderLine *hline = l->data;
return tbl;
}
+// Collects, in list order, the value of tag `key_tag` from every header line
+// whose type equals `type`; lines without that tag are skipped.
+//
+// Returns a heap-allocated array holding *_n pointers. The entries point at
+// the header's own tag values (no copies are made), so they stay valid only
+// as long as the header does. Returns NULL with *_n == 0 when nothing matched
+// or when the array could not be grown.
+char **sam_header2list(const void *_dict, char type[2], char key_tag[2], int *_n)
+{
+    const list_t *l;
+    char **ret = NULL;
+    int max = 0, n = 0;
+
+    *_n = 0;
+    for (l = (const HeaderDict*)_dict; l; l = l->next)
+    {
+        HeaderLine *hline = l->data;
+        HeaderTag *key;
+
+        if ( hline->type[0]!=type[0] || hline->type[1]!=type[1] )
+            continue;
+
+        key = header_line_has_tag(hline,key_tag);
+        if ( !key )
+            continue;
+
+        if (n == max) {
+            // Grow through a temporary so the existing array is neither
+            // leaked nor dereferenced as NULL if realloc fails.
+            int new_max = max ? max<<1 : 4;
+            char **tmp = realloc(ret, new_max * sizeof *tmp);
+            if ( !tmp ) {
+                free(ret);
+                return NULL;
+            }
+            ret = tmp;
+            max = new_max;
+        }
+        ret[n++] = key->value;
+    }
+    *_n = n;
+    return ret;
+}
+
+// Looks up `key` in a table handle (really a khash_t(str)*) and returns the
+// stored value, or NULL when the key is not present. A NULL handle or a NULL
+// key also yields NULL, in line with sam_tbl_size treating a NULL handle as
+// an empty table.
+const char *sam_tbl_get(void *h, const char *key)
+{
+    khash_t(str) *tbl = (khash_t(str)*)h;
+    khint_t k;
+
+    if ( !tbl || !key ) return NULL;
+
+    k = kh_get(str, tbl, key);
+    return k == kh_end(tbl) ? NULL : kh_val(tbl, k);
+}
+
+// Number of entries in a table handle (really a khash_t(str)*); a NULL handle
+// is reported as size 0.
+int sam_tbl_size(void *h)
+{
+    khash_t(str) *tbl;
+
+    if ( !h ) return 0;
+
+    tbl = (khash_t(str)*)h;
+    return kh_size(tbl);
+}
+
+// Hands a table handle (really a khash_t(str)*) to kh_destroy.
+void sam_tbl_destroy(void *h)
+{
+    kh_destroy(str, (khash_t(str)*)h);
+}
-HeaderDict *sam_header_merge(int n, const HeaderDict **dicts)
+void *sam_header_merge(int n, const void **_dicts)
{
+ const HeaderDict **dicts = (const HeaderDict**)_dicts;
HeaderDict *out_dict;
int idict, status;
{
print_header_line(stderr,tmpl_hlines->data);
print_header_line(stderr,out_hlines->data);
- error("Conflicting lines, cannot merge the headers.\n");
+ debug("Conflicting lines, cannot merge the headers.\n");
+ return 0;
}
if ( status==3 )
sam_header_line_merge_with(out_hlines->data, tmpl_hlines->data);