Fix memory leaks:

[samtools.git] / sam_header.c
diff --git a/sam_header.c b/sam_header.c

index b74b52b5919a1ec69df95dfb66e73c1707c77220..ddc2c38569f4ee0430439bc10a6e34240939ed5d 100644 (file)
--- a/sam_header.c
+++ b/sam_header.c
@@ -3,21 +3,91 @@
  #include <string.h>
  #include <ctype.h>
  #include <stdlib.h>
+#include <stdarg.h>
+
+#include "khash.h"
+KHASH_MAP_INIT_STR(str, const char *)
+
+struct _HeaderList   // singly linked list node; the same type serves as the whole header dictionary (HeaderDict)
+{
+    struct _HeaderList *last;   // Hack: tail pointer, used and maintained only by list_append_to_end. Maintained in the root node only.
+    struct _HeaderList *next;   // following node, NULL at the end of the list
+    void *data;                 // payload; this file stores HeaderLine* (dictionary) or HeaderTag* (tag list) here
+};
+typedef struct _HeaderList list_t;
+typedef list_t HeaderDict;      // a header is a list whose nodes carry HeaderLine* payloads
+
+typedef struct
+{
+    char key[2];    // two-character tag name, not NUL-terminated; two spaces mark the text of a comment line
+    char *value;    // NUL-terminated tag value, released with free() in sam_header_line_free
+}
+HeaderTag;
+
+typedef struct
+{
+    char type[2];   // two-character record type (see types[]), not NUL-terminated
+    list_t *tags;   // list of HeaderTag* payloads, NULL while the line has no tags
+}
+HeaderLine;
  
  const char *o_hd_tags[] = {"SO","GO",NULL};
  const char *r_hd_tags[] = {"VN",NULL};
+
  const char *o_sq_tags[] = {"AS","M5","UR","SP",NULL};
  const char *r_sq_tags[] = {"SN","LN",NULL};
-const char *o_rg_tags[] = {"LB","DS","PU","PI","CN","DT","PL",NULL};
-const char *r_rg_tags[] = {"ID","SM",NULL};
+const char *u_sq_tags[] = {"SN",NULL};   // tag whose value identifies an @SQ line (see sam_header_compare_lines)
+
+const char *o_rg_tags[] = {"CN","DS","DT","FO","KS","LB","PG","PI","PL","PU","SM",NULL};
+const char *r_rg_tags[] = {"ID",NULL};
+const char *u_rg_tags[] = {"ID",NULL};   // tag whose value identifies an @RG line
+
  const char *o_pg_tags[] = {"VN","CL",NULL};
  const char *r_pg_tags[] = {"ID",NULL};
+
  const char *types[]          = {"HD","SQ","RG","PG","CO",NULL};
  const char **optional_tags[] = {o_hd_tags,o_sq_tags,o_rg_tags,o_pg_tags,NULL,NULL};
  const char **required_tags[] = {r_hd_tags,r_sq_tags,r_rg_tags,r_pg_tags,NULL,NULL};
+const char **unique_tags[]   = {NULL,     u_sq_tags,u_rg_tags,NULL,NULL,NULL};   // indexed like types[]; NULL = no unique tag for that type
+
+
+static void debug(const char *format, ...)   // printf-style diagnostics, always written to stderr
+{
+    va_list args;                            // the variadic arguments that follow format
+    va_start(args, format);
+    (void) vfprintf(stderr, format, args);   // the result of the write is not examined
+    va_end(args);
+}
+
+#if 0
+// Compiled out: replaced by list_append_to_end. Pushed a node in front of root and returned it as the new root.
+static list_t *list_prepend(list_t *root, void *data)
+{
+    list_t *l = malloc(sizeof(list_t));
+    l->next = root;   // l->last is never set by this routine
+    l->data = data;
+    return l;
+}
+#endif
+
+// Appends data using the tail pointer kept in root->last and returns the root (the new node when root is NULL).
+//  Relies on root->last being correct: do not mix with the other list_* routines unless they are fixed to modify root->last as well.
+static list_t *list_append_to_end(list_t *root, void *data)
+{
+    list_t *l = malloc(sizeof(list_t));   // NOTE(review): the result is not checked for NULL
+    l->last = l;                          // a lone node is its own tail
+    l->next = NULL;
+    l->data = data;
  
+    if ( !root )
+        return l;                         // first element: the new node becomes the root
  
-list_t *list_append(list_t *root, void *data)
+    root->last->next = l;                 // link behind the current tail ...
+    root->last = l;                       // ... and record the new tail in the root
+    return root;
+}
+
+static list_t *list_append(list_t *root, void *data)
  {
      list_t *l = root;
      while (l && l->next)
@@ -37,7 +107,7 @@ list_t *list_append(list_t *root, void *data)
      return root;
  }
  
-void list_free(list_t *root)
+static void list_free(list_t *root)
  {
      list_t *l = root;
      while (root)
@@ -51,7 +121,7 @@ void list_free(list_t *root)
  
  
  // Look for a tag "XY" in a predefined const char *[] array.
-int tag_exists(const char *tag, const char **tags)
+static int tag_exists(const char *tag, const char **tags)
  {
      int itag=0;
      if ( !tags ) return -1;
@@ -68,7 +138,7 @@ int tag_exists(const char *tag, const char **tags)
  // Mimics the behaviour of getline, except it returns pointer to the next chunk of the text
  //  or NULL if everything has been read. The lineptr should be freed by the caller. The
  //  newline character is stripped.
-const char *nextline(char **lineptr, size_t *n, const char *text)
+static const char *nextline(char **lineptr, size_t *n, const char *text)
  {
      int len;
      const char *to = text;
@@ -97,8 +167,10 @@ const char *nextline(char **lineptr, size_t *n, const char *text)
          *lineptr = realloc(*lineptr, len);
          *n = len;
      }
-    if ( !*lineptr )
-            error("FIXME\n");
+    if ( !*lineptr ) {
+               debug("[nextline] Insufficient memory!\n");
+               return 0;
+       }
  
      memcpy(*lineptr,text,len);
      (*lineptr)[len-1] = 0;
@@ -108,7 +180,7 @@ const char *nextline(char **lineptr, size_t *n, const char *text)
  
  // name points to "XY", value_from points to the first character of the value string and
  //  value_to points to the last character of the value string.
-HeaderTag *new_tag(const char *name, const char *value_from, const char *value_to)
+static HeaderTag *new_tag(const char *name, const char *value_from, const char *value_to)
  {
      HeaderTag *tag = malloc(sizeof(HeaderTag));
      int len = value_to-value_from+1;
@@ -121,7 +193,7 @@ HeaderTag *new_tag(const char *name, const char *value_from, const char *value_t
      return tag;
  }
  
-HeaderTag *header_line_has_tag(HeaderLine *hline, const char *key)
+static HeaderTag *header_line_has_tag(HeaderLine *hline, const char *key)
  {
      list_t *tags = hline->tags;
      while (tags)
@@ -133,54 +205,155 @@ HeaderTag *header_line_has_tag(HeaderLine *hline, const char *key)
      return NULL;
  }
  
-#if 0
-// Is there a HeaderLine with all required fields identical to those given in the hline?
-HeaderLine *sam_header_has_line(HeaderDict *dict, HeaderLine *hline)
+
+// Compares two header lines to decide how sam_header_merge should treat them. Return codes:
+//  -1 .. the line type is not listed in types[] (reported through debug)
+//   0 .. different types or unique tags differ or conflicting tags, cannot be merged
+//   1 .. all tags identical -> no need to merge, drop one
+//   2 .. the unique tags match and there are some conflicting tags (same tag, different value) -> error, cannot be merged nor duplicated
+//   3 .. there are some missing complementary tags and no unique conflict -> can be merged into a single line
+static int sam_header_compare_lines(HeaderLine *hline1, HeaderLine *hline2)
  {
-    HeaderLine *found=NULL;
+    HeaderTag *t1, *t2;
  
-    while (dict)
+    if ( hline1->type[0]!=hline2->type[0] || hline1->type[1]!=hline2->type[1] )
+        return 0;
+
+    int itype = tag_exists(hline1->type,types);   // index into the *_tags tables
+    if ( itype==-1 ) {
+        debug("[sam_header_compare_lines] Unknown type [%c%c]\n", hline1->type[0],hline1->type[1]);
+        return -1; // FIXME (lh3): error; I do not know how this will be handled in Petr's code
+    }
+    if ( unique_tags[itype] )
+    {
+        t1 = header_line_has_tag(hline1,unique_tags[itype][0]);
+        t2 = header_line_has_tag(hline2,unique_tags[itype][0]);
+        if ( !t1 || !t2 ) // this should never happen, the unique tags are required
+            return 2;
+
+        if ( strcmp(t1->value,t2->value) )
+            return 0;   // the unique tags differ, cannot be merged
+    }
+    if ( !required_tags[itype] && !optional_tags[itype] )
      {
-        HeaderLine *dline = dict->data;
+        if ( !hline1->tags || !hline2->tags )   // a comment line without any tag node: equal only to another such line
+            return ( !hline1->tags && !hline2->tags ) ? 1 : 0;
+        t1 = hline1->tags->data; t2 = hline2->tags->data;   // a comment is compared through its first tag
+        return strcmp(t1->value,t2->value) ? 0 : 1;   // 1 .. identical comments
+    }
  
-        if ( hline->type[0]!=dline->type[0] || hline->type[1]!=dline->type[1] )
+    int missing=0, itag=0;
+    while ( required_tags[itype] && required_tags[itype][itag] )
+    {
+        t1 = header_line_has_tag(hline1,required_tags[itype][itag]);
+        t2 = header_line_has_tag(hline2,required_tags[itype][itag]);
+        if ( !t1 && !t2 )
+            return 2;       // this should never happen
+        else if ( !t1 || !t2 )
+            missing = 1;    // there is some tag missing in one of the hlines
+        else if ( strcmp(t1->value,t2->value) )
          {
-            dict = dict->next;
+            if ( unique_tags[itype] )
+                return 2;   // the lines have a matching unique tag but have a conflicting tag
+
+            return 0;    // the lines contain conflicting tags, cannot be merged
+        }
+        itag++;
+    }
+    itag = 0;
+    while ( optional_tags[itype] && optional_tags[itype][itag] )
+    {
+        t1 = header_line_has_tag(hline1,optional_tags[itype][itag]);
+        t2 = header_line_has_tag(hline2,optional_tags[itype][itag]);
+        if ( !t1 && !t2 )
+        {
+            itag++;
              continue;
          }
+        if ( !t1 || !t2 )
+            missing = 1;    // there is some tag missing in one of the hlines
+        else if ( strcmp(t1->value,t2->value) )
+        {
+            if ( unique_tags[itype] )
+                return 2;   // the lines have a matching unique tag but have a conflicting tag
  
-        int itype = tag_exists(hline->type,types);
-        if ( itype==-1 ) error("[sam_header_has_line] Unknown type [%c%c]\n", hline->type[0],hline->type[1]);
+            return 0;   // the lines contain conflicting tags, cannot be merged
+        }
+        itag++;
+    }
+    if ( missing ) return 3;    // there are some missing complementary tags with no conflicts, can be merged
+    return 1;
+}
+
+
+static HeaderLine *sam_header_line_clone(const HeaderLine *hline)   // deep copy of one header line
+{
+    list_t *tags;
+    HeaderLine *out = malloc(sizeof(HeaderLine));   // NOTE(review): allocation results are not checked in this function
+    out->type[0] = hline->type[0];
+    out->type[1] = hline->type[1];
+    out->tags = NULL;
+
+    tags = hline->tags;
+    while (tags)
+    {
+        HeaderTag *old = tags->data;
  
-        int ireq=0, differ=0;
-        while ( required_tags[itype] && required_tags[itype][ireq] )
+        HeaderTag *new = malloc(sizeof(HeaderTag));
+        new->key[0] = old->key[0];
+        new->key[1] = old->key[1];
+        new->value  = strdup(old->value);   // the copy gets its own value string
+        out->tags = list_append(out->tags, new);
+
+        tags = tags->next;
+    }
+    return out;
+}
+
+static int sam_header_line_merge_with(HeaderLine *out_hline, const HeaderLine *tmpl_hline)   // returns 0 on a type mismatch, 1 otherwise
+{
+    list_t *tmpl_tags;
+
+    if ( out_hline->type[0]!=tmpl_hline->type[0] || out_hline->type[1]!=tmpl_hline->type[1] )
+        return 0;
+    
+    tmpl_tags = tmpl_hline->tags;
+    while (tmpl_tags)
+    {
+        HeaderTag *tmpl_tag = tmpl_tags->data;
+        HeaderTag *out_tag  = header_line_has_tag(out_hline, tmpl_tag->key);
+        if ( !out_tag )   // only tags missing from out_hline are copied; existing ones are left untouched
          {
-            HeaderTag *t1, *t2;
-            t1 = header_line_has_tag(hline,required_tags[itype][ireq]);
-            t2 = header_line_has_tag(dline,required_tags[itype][ireq]);
-            if ( !t1 || !t2 ) error("[sam_header_has_line] Missing a required tag [%c%c]\n",
-                required_tags[itype][ireq][0],required_tags[itype][ireq][1]);
-            if ( strcmp(t1->value,t2->value) )
-            ireq++;
+            HeaderTag *tag = malloc(sizeof(HeaderTag));   // NOTE(review): not checked for NULL
+            tag->key[0] = tmpl_tag->key[0];
+            tag->key[1] = tmpl_tag->key[1];
+            tag->value  = strdup(tmpl_tag->value);   // out_hline gets its own copy of the value
+            out_hline->tags = list_append(out_hline->tags,tag);
          }
-        dict = dict->next; 
+        tmpl_tags = tmpl_tags->next;
      }
-    return found;
+    return 1;
  }
-#endif
  
-HeaderLine *sam_header_line_parse(const char *headerLine)
+
+static HeaderLine *sam_header_line_parse(const char *headerLine)
  {
      HeaderLine *hline;
      HeaderTag *tag;
      const char *from, *to;
      from = headerLine;
  
-    if ( *from != '@' ) error("[sam_header_line_parse] expected '@', got [%s]\n", headerLine);
+    if ( *from != '@' ) {
+               debug("[sam_header_line_parse] expected '@', got [%s]\n", headerLine);
+               return 0;
+       }
      to = ++from;
  
      while (*to && *to!='\t') to++;
-    if ( to-from != 2 ) error("[sam_header_line_parse] expected '@XY', got [%s]\n", headerLine);
+    if ( to-from != 2 ) {
+               debug("[sam_header_line_parse] expected '@XY', got [%s]\nHint: The header tags must be tab-separated.\n", headerLine);
+               return 0;
+       }
      
      hline = malloc(sizeof(HeaderLine));
      hline->type[0] = from[0];
@@ -191,15 +364,22 @@ HeaderLine *sam_header_line_parse(const char *headerLine)
      
      from = to;
      while (*to && *to=='\t') to++;
-    if ( to-from != 1 ) 
-        error("[sam_header_line_parse] multiple tabs on line [%s] (%d)\n", headerLine,(int)(to-from));
+    if ( to-from != 1 ) {
+        debug("[sam_header_line_parse] multiple tabs on line [%s] (%d)\n", headerLine,(int)(to-from));
+        free(hline);
+               return 0;
+       }
      from = to;
      while (*from)
      {
          while (*to && *to!='\t') to++;
  
          if ( !required_tags[itype] && !optional_tags[itype] )
+        {
+            // CO is a special case, it can contain anything, including tabs
+            if ( *to ) { to++; continue; }
              tag = new_tag("  ",from,to-1);
+        }
          else
              tag = new_tag(from,from+3,to-1);
  
@@ -209,8 +389,10 @@ HeaderLine *sam_header_line_parse(const char *headerLine)
  
          from = to;
          while (*to && *to=='\t') to++;
-        if ( *to && to-from != 1 ) 
-                error("[sam_header_line_parse] multiple tabs on line [%s] (%d)\n", headerLine,(int)(to-from));
+        if ( *to && to-from != 1 ) {
+                       debug("[sam_header_line_parse] multiple tabs on line [%s] (%d)\n", headerLine,(int)(to-from));
+                       return 0;
+               }
  
          from = to;
      }
@@ -219,7 +401,7 @@ HeaderLine *sam_header_line_parse(const char *headerLine)
  
  
  // Must be of an existing type, all tags must be recognised and all required tags must be present
-int sam_header_line_validate(HeaderLine *hline)
+static int sam_header_line_validate(HeaderLine *hline)
  {
      list_t *tags;
      HeaderTag *tag;
@@ -253,8 +435,14 @@ int sam_header_line_validate(HeaderLine *hline)
          tag = tags->data;
          if ( !tag_exists(tag->key,required_tags[itype]) && !tag_exists(tag->key,optional_tags[itype]) )
          {
-            debug("Unknown tag [%c%c] for [%c%c].\n", tag->key[0],tag->key[1], hline->type[0],hline->type[1]);
-            return 0;
+            // Lower case tags are user-defined values.
+            if( !(islower(tag->key[0]) || islower(tag->key[1])) )
+            {
+                // Neither is lower case, but tag was not recognized.
+                debug("Unknown tag [%c%c] for [%c%c].\n", tag->key[0],tag->key[1], hline->type[0],hline->type[1]);
+                // return 0; // Even unknown tags are allowed - for forward compatibility with new attributes
+            }
+            // else - allow user defined tag
          }
          tags = tags->next;
      }
@@ -262,46 +450,70 @@ int sam_header_line_validate(HeaderLine *hline)
      return 1;
  }
  
-void print_header_line(HeaderLine *hline)
+
+static void print_header_line(FILE *fp, HeaderLine *hline)   // writes the line to fp as "@XY<TAB>tag..." plus a newline
  {
      list_t *tags = hline->tags;
      HeaderTag *tag;
  
-    printf("@%c%c", hline->type[0],hline->type[1]);
+    fprintf(fp, "@%c%c", hline->type[0],hline->type[1]);
      while (tags)
      {
          tag = tags->data;
-        printf("\t%c%c:%s", tag->key[0],tag->key[1],tag->value);
+
+        fprintf(fp, "\t");
+        if ( tag->key[0]!=' ' || tag->key[1]!=' ' )   // a key of two spaces has no "XY:" prefix printed
+            fprintf(fp, "%c%c:", tag->key[0],tag->key[1]);
+        fprintf(fp, "%s", tag->value);
+
          tags = tags->next;
      }
-    printf("\n");
+    fprintf(fp,"\n");
  }
  
  
-void sam_header_free(HeaderDict *header)
+static void sam_header_line_free(HeaderLine *hline)   // releases the line, every tag with its value, and the tag list
  {
+    list_t *node;
+    for (node = hline->tags; node; node = node->next)
+    {
+        HeaderTag *tag = node->data;
+        free(tag->value);   // the value string first, then the tag holding it
+        free(tag);
+    }
+    // Only the payloads were released above; the list nodes themselves are handed to list_free.
+    list_free(hline->tags);
+    free(hline);
+}
+
+void sam_header_free(void *_header)   // releases a whole header: every line first, then the list holding them
+{
+       HeaderDict *header = (HeaderDict*)_header;   // the public interface passes the dictionary as an opaque pointer
      list_t *hlines = header;
      while (hlines)
      {
-        HeaderLine *hline = hlines->data;
-        list_t *tags = hline->tags;
-        while (tags)
-        {
-            HeaderTag *tag = tags->data;
-            free(tag->value);
-            free(tag);
-            tags = tags->next;
-        }
-        list_free(hline->tags);
-        free(hline);
+        sam_header_line_free(hlines->data);   // per-line cleanup now lives in its own helper
          hlines = hlines->next;
      }
      list_free(header);
  }
  
+HeaderDict *sam_header_clone(const HeaderDict *dict)   // deep copy of a header; NULL in gives NULL out
+{
+    const list_t *node;
+    HeaderDict *copy = NULL;
+
+    // Walk the source from its first line and append a clone of each line to the copy.
+    for (node = dict; node; node = node->next)
+        copy = list_append(copy, sam_header_line_clone(node->data));
+
+    return copy;
+}
+
  // Returns a newly allocated string
-char *sam_header_write(const HeaderDict *header)
+char *sam_header_write(const void *_header)
  {
+       const HeaderDict *header = (const HeaderDict*)_header;
      char *out = NULL;
      int len=0, nout=0;
      const list_t *hlines;
@@ -351,26 +563,30 @@ char *sam_header_write(const HeaderDict *header)
      return out;
  }
  
-HeaderDict *sam_header_parse(const char *headerText)
+void *sam_header_parse2(const char *headerText)
  {
      list_t *hlines = NULL;
      HeaderLine *hline;
      const char *text;
      char *buf=NULL;
      size_t nbuf = 0;
+       int tovalidate = 0;
  
      if ( !headerText )
-        error("FIXME");
+               return 0;
  
      text = headerText;
      while ( (text=nextline(&buf, &nbuf, text)) )
      {
          hline = sam_header_line_parse(buf);
-        if ( sam_header_line_validate(hline) )
-            hlines = list_append(hlines, hline);
+        if ( hline && (!tovalidate || sam_header_line_validate(hline)) )
+            // With too many (~250,000) reference sequences the header parsing was too slow with list_append.
+            hlines = list_append_to_end(hlines, hline);
          else
          {
-            sam_header_free(hlines);
+                       if (hline) sam_header_line_free(hline);
+                       sam_header_free(hlines);
+            if ( buf ) free(buf);
              return NULL;
          }
      }
@@ -379,13 +595,15 @@ HeaderDict *sam_header_parse(const char *headerText)
      return hlines;
  }
  
-khash_t(str) *sam_header_lookup_table(const HeaderDict *dict, char type[2], char key_tag[2], char value_tag[2])
+void *sam_header2tbl(const void *_dict, char type[2], char key_tag[2], char value_tag[2])
  {
+       const HeaderDict *dict = (const HeaderDict*)_dict;
      const list_t *l   = dict;
      khash_t(str) *tbl = kh_init(str);
      khiter_t k;
      int ret;
  
+       if (_dict == 0) return tbl; // return an empty (not null) hash table
      while (l)
      {
          HeaderLine *hline = l->data;
@@ -415,24 +633,141 @@ khash_t(str) *sam_header_lookup_table(const HeaderDict *dict, char type[2], char
      return tbl;
  }
  
+// Collects the values of key_tag from every header line of the given type, in list order.
+// Returns a heap array of *_n pointers that point into the header itself (the strings are not copied):
+// the caller frees the array only. Lines of another type or without key_tag are skipped.
+// Returns NULL with *_n == 0 when nothing matches or when the array cannot be grown.
+char **sam_header2list(const void *_dict, char type[2], char key_tag[2], int *_n)
+{
  
-#if 0
-TODO
-HeaderDict *sam_header_merge(int n, const HeaderDict **dicts)
+    const list_t *l = (const HeaderDict*)_dict;
+    char **ret = NULL;
+    int max = 0, n = 0;   // allocated slots and used slots
+    *_n = 0;
+    for ( ; l; l = l->next)
+    {
+        HeaderLine *hline = l->data;
+        HeaderTag *key;
+
+        if ( hline->type[0]!=type[0] || hline->type[1]!=type[1] )
+            continue;   // a line of another type
+
+        key = header_line_has_tag(hline,key_tag);
+        if ( !key )
+            continue;   // the requested tag is absent from this line
+
+        if (n == max) {
+            // Grow through a temporary: assigning realloc's result to ret directly would leak the array on failure.
+            char **tmp;
+            max = max ? max<<1 : 4;
+            tmp = realloc(ret, max * sizeof(*tmp));
+            if ( !tmp ) { free(ret); return NULL; }   // *_n is still 0 at this point
+            ret = tmp;
+        }
+        ret[n++] = key->value;
+    }
+    *_n = n;
+    return ret;
+}
+
+void *sam_header2key_val(void *iter, const char type[2], const char key_tag[2], const char value_tag[2], const char **_key, const char **_value)   // iterator over (key_tag, value_tag) value pairs
  {
-    HeaderDict *out=NULL;
-    int idict;
+    list_t *l = iter;           // current position: a header dictionary or the value returned by a previous call
+    if ( !l ) return NULL;      // nothing (left) to scan; *_key and *_value are not touched
  
-    for (idict=0; idict<n; idict++)
+    while (l)
      {
-        list_t *hlines = dicts[idict];
-        while (hlines)
+        HeaderLine *hline = l->data;
+        if ( hline->type[0]!=type[0] || hline->type[1]!=type[1] )
          {
-            HeaderLine *hline = sam_header_has_line(out, hlines->data);
-            sam_header_line_merge(hline,hlines->data);
-            hlines = hlines->next;
+            l = l->next;
+            continue;
          }
+        // Both tags are dereferenced below, so a line lacking either of them has to be skipped.
+        HeaderTag *key, *value;
+        key   = header_line_has_tag(hline,key_tag);
+        value = header_line_has_tag(hline,value_tag);
+        if ( !key || !value )
+        {
+            l = l->next;
+            continue;
+        }
+        // Hand out pointers into the header (not copies) and return the position to resume from.
+        *_key = key->value;
+        *_value = value->value;
+        return l->next;   // NOTE(review): NULL when the match was on the last line, same as "no match"
      }
+    return l;             // always NULL here: the list was exhausted without a match
  }
-#endif
+
+const char *sam_tbl_get(void *h, const char *key)   // value stored under key, or NULL when the key is absent
+{
+    khash_t(str) *tbl = (khash_t(str)*)h;
+    if ( !tbl ) return NULL;   // tolerate a missing table, as sam_tbl_size does; kh_get would dereference it
+    khint_t k = kh_get(str, tbl, key);
+    return k == kh_end(tbl) ? NULL : kh_val(tbl, k);   // kh_end marks "not found"
+}
+
+int sam_tbl_size(void *h)   // number of entries in the table; 0 for a NULL table
+{
+    if ( !h ) return 0;
+    return kh_size((khash_t(str)*)h);
+}
+
+void sam_tbl_destroy(void *h)   // releases a string table such as the one returned by sam_header2tbl
+{
+    // The opaque handle is really a khash_t(str); nothing besides kh_destroy is called on it here.
+    kh_destroy(str, (khash_t(str)*)h);
+}
+
+// Merges n (at least two) parsed headers into one newly allocated header, to be released with sam_header_free.
+// Lines already present are dropped, lines which only complement each other are combined, all other lines
+// are appended. Returns NULL when n<2 or when two lines conflict; the partial result is freed in that case.
+void *sam_header_merge(int n, const void **_dicts)
+{
+    const HeaderDict **dicts = (const HeaderDict**)_dicts;
+    HeaderDict *out_dict;
+    int idict, status;
+
+    if ( n<2 ) return NULL;
+
+    out_dict = sam_header_clone(dicts[0]);   // the first header seeds the result
+
+    for (idict=1; idict<n; idict++)
+    {
+        const list_t *tmpl_hlines;
+        for (tmpl_hlines = dicts[idict]; tmpl_hlines; tmpl_hlines = tmpl_hlines->next)
+        {
+            list_t *out_hlines = out_dict;
+            int inserted = 0;
+            while ( out_hlines )
+            {
+                status = sam_header_compare_lines(tmpl_hlines->data, out_hlines->data);
+                if ( status==0 )
+                {
+                    out_hlines = out_hlines->next;   // unrelated line, keep looking
+                    continue;
+                }
+                if ( status==2 )
+                {
+                    print_header_line(stderr,tmpl_hlines->data);
+                    print_header_line(stderr,out_hlines->data);
+                    debug("Conflicting lines, cannot merge the headers.\n");
+                    sam_header_free(out_dict);   // the partial result would otherwise be leaked
+                    return NULL;
+                }
+                if ( status==3 )
+                    sam_header_line_merge_with(out_hlines->data, tmpl_hlines->data);
+                // NOTE(review): any other status, including -1 (unknown type), is treated as a duplicate here.
+                inserted = 1;
+                break;
+            }
+            if ( !inserted )
+                out_dict = list_append(out_dict, sam_header_line_clone(tmpl_hlines->data));
+        }
+    }
+
+    return out_dict;
+}
+