1 #include "sam_header.h"
// Tables of the two-letter tags known for each header record type. Every list is
// NULL-terminated. The o_* lists are collected in optional_tags[] and the r_* lists in
// required_tags[]; both tables are laid out in the same order as types[], so an index
// into types[] selects the matching tag lists. "CO" has a NULL entry in both tables, and
// the last NULL of each table lines up with the NULL that ends types[].
7 const char *o_hd_tags[] = {"SO","GO",NULL};
8 const char *r_hd_tags[] = {"VN",NULL};
9 const char *o_sq_tags[] = {"AS","M5","UR","SP",NULL};
10 const char *r_sq_tags[] = {"SN","LN",NULL};
11 const char *o_rg_tags[] = {"LB","DS","PU","PI","CN","DT","PL",NULL};
12 const char *r_rg_tags[] = {"ID","SM",NULL};
13 const char *o_pg_tags[] = {"VN","CL",NULL};
14 const char *r_pg_tags[] = {"ID",NULL};
15 const char *types[] = {"HD","SQ","RG","PG","CO",NULL};
16 const char **optional_tags[] = {o_hd_tags,o_sq_tags,o_rg_tags,o_pg_tags,NULL,NULL};
17 const char **required_tags[] = {r_hd_tags,r_sq_tags,r_rg_tags,r_pg_tags,NULL,NULL};
// Presumably appends data to the list that starts at root. Callers in this file store the
// returned pointer back into their list head. Most of the body is not shown in this excerpt.
20 list_t *list_append(list_t *root, void *data)
// A new node is allocated with malloc in both visible paths: either hung off an
// existing node ...
27 l->next = malloc(sizeof(list_t));
// ... or assigned to l directly.
// NOTE(review): no check of either malloc result is visible here — confirm one exists.
32 l = malloc(sizeof(list_t));
// Releases the list that starts at root (body not shown in this excerpt).
// NOTE(review): sam_header_free walks the tag list itself before calling this, so this
// presumably does not release the data held by the nodes — confirm.
40 void list_free(list_t *root)
53 // Look for a tag "XY" in a predefined const char *[] array.
// Only the first two characters of tag and of each entry are compared.
// Returns the index of the matching entry, or -1 when tags is NULL. Callers in this
// file compare the result against -1 to detect "not found". The result should not be
// tested for truthiness: -1 is non-zero, and a match at index 0 (presumably the first
// entry) would read as false.
54 int tag_exists(const char *tag, const char **tags)
// A missing table (such as the NULL entry for "CO") counts as "not found".
57 if ( !tags ) return -1;
60 if ( tags[itag][0]==tag[0] && tags[itag][1]==tag[1] ) return itag;
68 // Mimics the behaviour of getline, except that it returns a pointer to the next chunk of the text
69 // or NULL if everything has been read. The lineptr should be freed by the caller. The
70 // newline character is stripped.
// *lineptr is allocated with malloc or resized with realloc to len bytes, and the last of
// those bytes is overwritten with the terminating 0 (len is computed on a line that is
// not shown in this excerpt).
71 const char *nextline(char **lineptr, size_t *n, const char *text)
74 const char *to = text;
// Nothing left to read.
76 if ( !*to ) return NULL;
// Find the end of the current line: end of text, '\n' or '\r'.
78 while ( *to && *to!='\n' && *to!='\r' ) to++;
83 // Advance the pointer for the next call
// Only "\n" and "\r\n" are skipped here.
// NOTE(review): a '\r' that is not followed by '\n' is not skipped by these two
// branches — check how the calling loop behaves on such input.
84 if ( *to=='\n' ) to++;
85 else if ( *to=='\r' && *(to+1)=='\n' ) to+=2;
92 *lineptr = malloc(len);
// NOTE(review): the realloc result overwrites *lineptr directly, so the old buffer is
// lost if realloc fails.
97 *lineptr = realloc(*lineptr, len);
// Copy len bytes starting at text, then replace the last copied byte with the terminator.
103 memcpy(*lineptr,text,len);
104 (*lineptr)[len-1] = 0;
109 // name points to "XY", value_from points to the first character of the value string and
110 // value_to points to the last character of the value string.
// Returns a HeaderTag allocated with malloc; its value is a separately allocated copy of
// the given character range.
// NOTE(review): no check of either malloc result is visible in this excerpt.
111 HeaderTag *new_tag(const char *name, const char *value_from, const char *value_to)
113 HeaderTag *tag = malloc(sizeof(HeaderTag));
// Number of characters in the inclusive range [value_from, value_to].
114 int len = value_to-value_from+1;
// Only the first two characters of name are used as the key.
116 tag->key[0] = name[0];
117 tag->key[1] = name[1];
118 tag->value = malloc(len+1);
// Copies len+1 bytes, i.e. one byte beyond value_to, so that byte must be readable.
// The terminator is presumably written on a line not shown here — confirm.
119 memcpy(tag->value,value_from,len+1);
// Searches the tags of hline for one whose two-character key equals the first two
// characters of key and returns it. Callers in this file treat a NULL result as
// "tag not present" (the no-match return is not shown in this excerpt).
124 HeaderTag *header_line_has_tag(HeaderLine *hline, const char *key)
126 list_t *tags = hline->tags;
129 HeaderTag *tag = tags->data;
130 if ( tag->key[0]==key[0] && tag->key[1]==key[1] ) return tag;
137 // Is there a HeaderLine with all required fields identical to those given in the hline?
// The candidates are the lines stored in dict. For each one, the values of the required
// tags of the record type are compared with strcmp.
138 HeaderLine *sam_header_has_line(HeaderDict *dict, HeaderLine *hline)
140 HeaderLine *found=NULL;
144 HeaderLine *dline = dict->data;
// Only lines of the same two-character record type are of interest.
146 if ( hline->type[0]!=dline->type[0] || hline->type[1]!=dline->type[1] )
// Position of the record type in types[]; it selects the list of required tags.
152 int itype = tag_exists(hline->type,types);
153 if ( itype==-1 ) error("[sam_header_has_line] Unknown type [%c%c]\n", hline->type[0],hline->type[1]);
155 int ireq=0, differ=0;
// Record types without a required-tag list (NULL entry) skip this loop entirely.
156 while ( required_tags[itype] && required_tags[itype][ireq] )
159 t1 = header_line_has_tag(hline,required_tags[itype][ireq]);
160 t2 = header_line_has_tag(dline,required_tags[itype][ireq]);
// Both lines are expected to carry every required tag; a missing one is reported via error().
161 if ( !t1 || !t2 ) error("[sam_header_has_line] Missing a required tag [%c%c]\n",
162 required_tags[itype][ireq][0],required_tags[itype][ireq][1]);
// A non-zero strcmp result means the two values differ.
163 if ( strcmp(t1->value,t2->value) )
// Parses one header line of the form "@XY<TAB>field<TAB>field..." into a HeaderLine
// allocated with malloc. Each field becomes a HeaderTag appended to hline->tags.
// Malformed input (no leading '@', a record type that is not two characters long, or
// runs of several tabs) is reported through error().
172 HeaderLine *sam_header_line_parse(const char *headerLine)
176 const char *from, *to;
179 if ( *from != '@' ) error("[sam_header_line_parse] expected '@', got [%s]\n", headerLine);
// The record type runs up to the first tab (or the end of the line) and must be
// exactly two characters long.
182 while (*to && *to!='\t') to++;
183 if ( to-from != 2 ) error("[sam_header_line_parse] expected '@XY', got [%s]\n", headerLine);
185 hline = malloc(sizeof(HeaderLine));
186 hline->type[0] = from[0];
187 hline->type[1] = from[1];
// NOTE(review): tag_exists returns -1 for an unknown type, and itype is later used to
// index required_tags[] and optional_tags[] — confirm that a guard exists on a line
// not shown here.
190 int itype = tag_exists(hline->type, types);
// Skip the tab separator; more than one tab in a row is an error.
193 while (*to && *to=='\t') to++;
195 error("[sam_header_line_parse] multiple tabs on line [%s] (%d)\n", headerLine,(int)(to-from));
// A field extends up to the next tab or the end of the line.
199 while (*to && *to!='\t') to++;
// Record types with neither a required nor an optional tag list ("CO" in the tables)
// have no "XY:" prefix: the whole field is stored as the value under a placeholder key.
201 if ( !required_tags[itype] && !optional_tags[itype] )
202 tag = new_tag(" ",from,to-1);
// Otherwise the field is read as "XY:value": the key is at from, the value starts at from+3.
// NOTE(review): this assumes the field is at least three characters long — confirm.
204 tag = new_tag(from,from+3,to-1);
// A repeated tag is only reported through debug(); it is still appended below.
206 if ( header_line_has_tag(hline,tag->key) )
207 debug("The tag '%c%c' present (at least) twice on line [%s]\n", tag->key[0],tag->key[1], headerLine);
208 hline->tags = list_append(hline->tags, tag);
// Skip the separator before the next field; again, only a single tab is accepted.
211 while (*to && *to=='\t') to++;
212 if ( *to && to-from != 1 )
213 error("[sam_header_line_parse] multiple tabs on line [%s] (%d)\n", headerLine,(int)(to-from));
221 // Must be of an existing type, all tags must be recognised and all required tags must be present
// Problems are reported through debug(). sam_header_parse appends a line to the header
// only when this function returns non-zero.
222 int sam_header_line_validate(HeaderLine *hline)
228 // Is the type correct?
229 itype = tag_exists(hline->type, types);
232 debug("The type [%c%c] not recognised.\n", hline->type[0],hline->type[1]);
236 // Has all required tags?
// Record types without a required-tag list (NULL entry) skip this loop.
238 while ( required_tags[itype] && required_tags[itype][itag] )
240 if ( !header_line_has_tag(hline,required_tags[itype][itag]) )
242 debug("The tag [%c%c] required for [%c%c] not present.\n", required_tags[itype][itag][0],required_tags[itype][itag][1],
243 hline->type[0],hline->type[1]);
249 // Are all tags recognised?
// NOTE(review): tag_exists reports "not found" as -1, which is non-zero, so
// !tag_exists(...) is true only for a match at index 0. As written, this condition
// looks unable to flag an unknown tag; comparing each result against -1 is presumably
// what was intended — confirm.
254 if ( !tag_exists(tag->key,required_tags[itype]) && !tag_exists(tag->key,optional_tags[itype]) )
256 debug("Unknown tag [%c%c] for [%c%c].\n", tag->key[0],tag->key[1], hline->type[0],hline->type[1]);
// Prints one header line to stdout with printf: "@XY" followed by "\tXY:value" for each
// tag of the line. The line ending is not among the lines shown in this excerpt.
265 void print_header_line(HeaderLine *hline)
267 list_t *tags = hline->tags;
270 printf("@%c%c", hline->type[0],hline->type[1]);
274 printf("\t%c%c:%s", tag->key[0],tag->key[1],tag->value);
// Releases a header: walks every header line and, within each line, every tag, then
// hands the line's tag list to list_free. The statements that free the tags and the
// lines themselves are not shown in this excerpt.
281 void sam_header_free(HeaderDict *header)
// The header is traversed as a list_t whose data members are HeaderLine pointers.
283 list_t *hlines = header;
286 HeaderLine *hline = hlines->data;
287 list_t *tags = hline->tags;
290 HeaderTag *tag = tags->data;
295 list_free(hline->tags);
297 hlines = hlines->next;
302 // Returns a newly allocated string
// The text is built in two passes over the header lines: the first adds up the length of
// the output, the second formats every line with sprintf. Each line is written as
// "@XY", then "\t" plus "XY:value" per tag (just "value" when the key is two spaces),
// then "\n".
303 char *sam_header_write(const HeaderDict *header)
307 const list_t *hlines;
309 // Calculate the length of the string to allocate
313 len += 4; // @XY and \n
315 HeaderLine *hline = hlines->data;
316 list_t *tags = hline->tags;
319 HeaderTag *tag = tags->data;
320 len += strlen(tag->value) + 1; // \t
// Tags with a real key (anything other than two spaces) also need room for "XY:".
// NOTE(review): strlen(tag->value) is added a second time here, so len over-estimates
// the space needed for keyed tags; "+ 3" alone looks like what was intended — confirm.
321 if ( tag->key[0]!=' ' || tag->key[1]!=' ' )
322 len += strlen(tag->value) + 3; // XY:
325 hlines = hlines->next;
// Second pass: nout tracks the number of characters written to out so far.
333 HeaderLine *hline = hlines->data;
335 nout += sprintf(out+nout,"@%c%c",hline->type[0],hline->type[1]);
337 list_t *tags = hline->tags;
340 HeaderTag *tag = tags->data;
341 nout += sprintf(out+nout,"\t");
// The "XY:" prefix is omitted for tags stored under the two-space key.
342 if ( tag->key[0]!=' ' || tag->key[1]!=' ' )
343 nout += sprintf(out+nout,"%c%c:", tag->key[0],tag->key[1]);
344 nout += sprintf(out+nout,"%s", tag->value);
347 hlines = hlines->next;
348 nout += sprintf(out+nout,"\n");
// Parses a complete header text into a list of HeaderLine entries: the text is split
// into lines with nextline, each line is parsed with sam_header_line_parse and, if it
// passes sam_header_line_validate, appended to the result.
354 HeaderDict *sam_header_parse(const char *headerText)
356 list_t *hlines = NULL;
// buf is the line buffer managed by nextline; the loop ends when nextline returns NULL.
366 while ( (text=nextline(&buf, &nbuf, text)) )
368 hline = sam_header_line_parse(buf);
369 if ( sam_header_line_validate(hline) )
370 hlines = list_append(hlines, hline);
// Presumably the failure path for a line that does not validate: everything collected so
// far is released — confirm against the lines not shown here.
373 sam_header_free(hlines);
// The line buffer belongs to this function and is released once parsing is done.
377 if ( buf ) free(buf);
// Builds a khash string table from the header lines of the given record type: for each
// such line, the value of its key_tag tag is mapped to the value of its value_tag tag.
// The table stores the tags' own value pointers (no copies are made in the visible lines),
// so its entries point into dict.
382 khash_t(str) *sam_header_lookup_table(const HeaderDict *dict, char type[2], char key_tag[2], char value_tag[2])
384 const list_t *l = dict;
385 khash_t(str) *tbl = kh_init(str);
391 HeaderLine *hline = l->data;
// Lines of any other record type are not of interest.
392 if ( hline->type[0]!=type[0] || hline->type[1]!=type[1] )
398 HeaderTag *key, *value;
399 key = header_line_has_tag(hline,key_tag);
400 value = header_line_has_tag(hline,value_tag);
// Both tags must be present on the line (the handling of this case is not shown here).
401 if ( !key || !value )
// A key that is already in the table is only reported through debug().
// NOTE(review): the message text reads "They key" — presumably a typo for "The key".
407 k = kh_get(str, tbl, key->value);
408 if ( k != kh_end(tbl) )
409 debug("[sam_header_lookup_table] They key %s not unique.\n", key->value);
410 k = kh_put(str, tbl, key->value, &ret);
411 kh_value(tbl, k) = value->value;
421 HeaderDict *sam_header_merge(int n, const HeaderDict **dicts)
423 HeaderDict *out=NULL;
426 for (idict=0; idict<n; idict++)
428 list_t *hlines = dicts[idict];
431 HeaderLine *hline = sam_header_has_line(out, hlines->data);
432 sam_header_line_merge(hline,hlines->data);
433 hlines = hlines->next;