1 #include "sam_header.h"
// Two-letter tag tables, one optional (o_*) and one required (r_*) array per record type.
// Every array is terminated by NULL.
8 const char *o_hd_tags[] = {"SO","GO",NULL};
9 const char *r_hd_tags[] = {"VN",NULL};
10 const char *o_sq_tags[] = {"AS","M5","UR","SP",NULL};
11 const char *r_sq_tags[] = {"SN","LN",NULL};
12 const char *o_rg_tags[] = {"LB","DS","PU","PI","CN","DT","PL",NULL};
13 const char *r_rg_tags[] = {"ID","SM",NULL};
14 const char *o_pg_tags[] = {"VN","CL",NULL};
15 const char *r_pg_tags[] = {"ID",NULL};
// Known record types. The position of a type in this array is the index used with
// optional_tags[] and required_tags[] below.
16 const char *types[] = {"HD","SQ","RG","PG","CO",NULL};
// Per-type tag tables, parallel to types[]. The slot that lines up with "CO" is NULL in
// both tables, i.e. that type has no tag tables.
17 const char **optional_tags[] = {o_hd_tags,o_sq_tags,o_rg_tags,o_pg_tags,NULL,NULL};
18 const char **required_tags[] = {r_hd_tags,r_sq_tags,r_rg_tags,r_pg_tags,NULL,NULL};
// Write a printf-style formatted message to stderr.
20 void debug(const char *format, ...)
24 vfprintf(stderr, format, ap);
// Write a printf-style formatted message to stderr.
// NOTE(review): callers in this file go on to dereference pointers they have just reported
// as missing via error(), so this function presumably does not return - confirm.
28 void error(const char *format, ...)
32 vfprintf(stderr, format, ap);
// Add data to the list that starts at root by allocating a new list_t node.
// Callers assign the result back to their list head, so the return value is presumably the
// (possibly newly created) root - confirm.
// NOTE(review): neither malloc() result below is visibly checked for NULL.
37 list_t *list_append(list_t *root, void *data)
// A node already exists: hang the new node behind it.
44 l->next = malloc(sizeof(list_t));
// No node to attach to: allocate a standalone node.
49 l = malloc(sizeof(list_t));
// Release the list given by its root node.
// NOTE(review): whether the nodes' data payloads are released as well cannot be told from
// this line - confirm before relying on it.
57 void list_free(list_t *root)
70 // Look for a tag "XY" in a predefined const char *[] array.
// Only the first two characters of tag and of each array entry are compared.
// Returns the index of the matching entry; returns -1 when tags is NULL.
// NOTE(review): callers compare the result with -1, so -1 is presumably the "not found"
// result as well - confirm. Index 0 is a valid hit, so the result must never be tested
// with '!'.
71 int tag_exists(const char *tag, const char **tags)
74 if ( !tags ) return -1;
77 if ( tags[itag][0]==tag[0] && tags[itag][1]==tag[1] ) return itag;
// Mimics the behaviour of getline on an in-memory text: copies the line that starts at
// text into *lineptr (line terminator stripped) and returns a pointer to the next chunk
// of the text, or NULL if everything has been read.
//
//  lineptr .. address of the line buffer. *lineptr may be NULL, in which case a buffer is
//             allocated; it is grown when too small. The buffer should be freed by the caller.
//  n       .. address of the buffer capacity in bytes, updated on every (re)allocation.
//  text    .. NUL-terminated text to read from.
//
// A line ends at "\n", "\r\n", a lone "\r" or at the end of the text.
// Allocation failure is fatal: a message is written to stderr and the process exits.
const char *nextline(char **lineptr, size_t *n, const char *text)
{
    const char *to = text;
    size_t len;

    if ( !*to ) return NULL;

    while ( *to && *to!='\n' && *to!='\r' ) to++;
    len = (size_t)(to - text) + 1;      // characters of the line plus the terminating 0

    // Advance the pointer for the next call. A lone '\r' has to be consumed as well,
    // otherwise every following call would return the same empty line again.
    if ( *to=='\r' && *(to+1)=='\n' ) to += 2;
    else if ( *to ) to++;

    if ( !*lineptr || *n<len )
    {
        // realloc(NULL,..) behaves like malloc. The old pointer is kept until the call
        // has succeeded so that it is never overwritten with NULL.
        char *grown = realloc(*lineptr, len);
        if ( !grown )
        {
            fprintf(stderr, "[nextline] could not allocate %lu bytes\n", (unsigned long)len);
            exit(-1);
        }
        *lineptr = grown;
        *n = len;
    }

    memcpy(*lineptr, text, len-1);
    (*lineptr)[len-1] = 0;

    return to;
}
126 // name points to "XY", value_from points to the first character of the value string and
127 // value_to points to the last character of the value string.
128 HeaderTag *new_tag(const char *name, const char *value_from, const char *value_to)
130 HeaderTag *tag = malloc(sizeof(HeaderTag));
131 int len = value_to-value_from+1;
133 tag->key[0] = name[0];
134 tag->key[1] = name[1];
135 tag->value = malloc(len+1);
136 memcpy(tag->value,value_from,len+1);
// Look up a tag of the header line by its two-character key (only key[0] and key[1] are read).
// Returns the first tag in hline->tags whose key matches.
// NOTE(review): callers test the result against NULL, so NULL is presumably returned when
// no tag matches - confirm.
141 HeaderTag *header_line_has_tag(HeaderLine *hline, const char *key)
143 list_t *tags = hline->tags;
146 HeaderTag *tag = tags->data;
147 if ( tag->key[0]==key[0] && tag->key[1]==key[1] ) return tag;
154 // Is there a HeaderLine with all required fields identical to those given in the hline?
// Lines of dict are compared with hline by record type first and then by the values of
// the tags listed in required_tags[] for that type.
155 HeaderLine *sam_header_has_line(HeaderDict *dict, HeaderLine *hline)
157 HeaderLine *found=NULL;
161 HeaderLine *dline = dict->data;
// Only lines of the same two-character type are candidates.
163 if ( hline->type[0]!=dline->type[0] || hline->type[1]!=dline->type[1] )
// NOTE(review): itype indexes required_tags[] below; this is only safe if error() does not
// return when the type is unknown (-1) - confirm.
169 int itype = tag_exists(hline->type,types);
170 if ( itype==-1 ) error("[sam_header_has_line] Unknown type [%c%c]\n", hline->type[0],hline->type[1]);
172 int ireq=0, differ=0;
// A type whose required table is NULL has nothing to compare here.
173 while ( required_tags[itype] && required_tags[itype][ireq] )
176 t1 = header_line_has_tag(hline,required_tags[itype][ireq]);
177 t2 = header_line_has_tag(dline,required_tags[itype][ireq]);
// NOTE(review): t1 and t2 are dereferenced right below, so a returning error() would lead
// to a NULL dereference - confirm that error() terminates.
178 if ( !t1 || !t2 ) error("[sam_header_has_line] Missing a required tag [%c%c]\n",
179 required_tags[itype][ireq][0],required_tags[itype][ireq][1]);
// Non-zero strcmp result: the two lines carry different values for this required tag.
180 if ( strcmp(t1->value,t2->value) )
// Parse one line of header text ("@XY" followed by tab-separated fields) into a newly
// allocated HeaderLine. Malformed input is reported through error().
189 HeaderLine *sam_header_line_parse(const char *headerLine)
193 const char *from, *to;
196 if ( *from != '@' ) error("[sam_header_line_parse] expected '@', got [%s]\n", headerLine);
// The record type is everything up to the first tab and must be exactly two characters.
199 while (*to && *to!='\t') to++;
200 if ( to-from != 2 ) error("[sam_header_line_parse] expected '@XY', got [%s]\n", headerLine);
// NOTE(review): the malloc() result is not visibly checked before it is written to.
202 hline = malloc(sizeof(HeaderLine));
203 hline->type[0] = from[0];
204 hline->type[1] = from[1];
// NOTE(review): itype is used further down as an index into required_tags[] and
// optional_tags[]; no check against -1 (unknown type) is visible before that use - confirm
// one exists, otherwise an unknown type reads outside of both tables.
207 int itype = tag_exists(hline->type, types);
// Skip the separator; the error below reports a run of several tabs.
210 while (*to && *to=='\t') to++;
212 error("[sam_header_line_parse] multiple tabs on line [%s] (%d)\n", headerLine,(int)(to-from));
// Find the end of the current field.
216 while (*to && *to!='\t') to++;
// A type with neither a required nor an optional table keeps the whole field as the value
// of a tag with a blank key.
// NOTE(review): the key literal below reads as a single space, whereas sam_header_write
// compares both key characters with ' ' - confirm the literal really holds two spaces.
218 if ( !required_tags[itype] && !optional_tags[itype] )
219 tag = new_tag(" ",from,to-1);
// Otherwise the key is the two characters at from and the value starts at from+3
// (presumably an "XY:value" field - confirm the separator is validated somewhere).
221 tag = new_tag(from,from+3,to-1);
// A repeated key is only reported; the tag is appended regardless.
223 if ( header_line_has_tag(hline,tag->key) )
224 debug("The tag '%c%c' present (at least) twice on line [%s]\n", tag->key[0],tag->key[1], headerLine);
225 hline->tags = list_append(hline->tags, tag);
// Skip the separator in front of the next field; several tabs in a row are an error unless
// the end of the line has been reached.
228 while (*to && *to=='\t') to++;
229 if ( *to && to-from != 1 )
230 error("[sam_header_line_parse] multiple tabs on line [%s] (%d)\n", headerLine,(int)(to-from));
238 // Must be of an existing type, all tags must be recognised and all required tags must be present
239 int sam_header_line_validate(HeaderLine *hline)
245 // Is the type correct?
246 itype = tag_exists(hline->type, types);
249 debug("The type [%c%c] not recognised.\n", hline->type[0],hline->type[1]);
253 // Has all required tags?
255 while ( required_tags[itype] && required_tags[itype][itag] )
257 if ( !header_line_has_tag(hline,required_tags[itype][itag]) )
259 debug("The tag [%c%c] required for [%c%c] not present.\n", required_tags[itype][itag][0],required_tags[itype][itag][1],
260 hline->type[0],hline->type[1]);
266 // Are all tags recognised?
271 if ( !tag_exists(tag->key,required_tags[itype]) && !tag_exists(tag->key,optional_tags[itype]) )
273 debug("Unknown tag [%c%c] for [%c%c].\n", tag->key[0],tag->key[1], hline->type[0],hline->type[1]);
// Print the header line to stdout: "@XY" for the type, then "\tXY:value" for tags.
282 void print_header_line(HeaderLine *hline)
284 list_t *tags = hline->tags;
287 printf("@%c%c", hline->type[0],hline->type[1]);
291 printf("\t%c%c:%s", tag->key[0],tag->key[1],tag->value);
// Release a header: walks the list of header lines and, for each line, visits its tags
// before handing the tag list to list_free().
298 void sam_header_free(HeaderDict *header)
300 list_t *hlines = header;
303 HeaderLine *hline = hlines->data;
304 list_t *tags = hline->tags;
307 HeaderTag *tag = tags->data;
// The tag list nodes themselves are released here.
312 list_free(hline->tags);
314 hlines = hlines->next;
319 // Returns a newly allocated string
320 char *sam_header_write(const HeaderDict *header)
324 const list_t *hlines;
326 // Calculate the length of the string to allocate
330 len += 4; // @XY and \n
332 HeaderLine *hline = hlines->data;
333 list_t *tags = hline->tags;
336 HeaderTag *tag = tags->data;
337 len += strlen(tag->value) + 1; // \t
338 if ( tag->key[0]!=' ' || tag->key[1]!=' ' )
339 len += strlen(tag->value) + 3; // XY:
342 hlines = hlines->next;
350 HeaderLine *hline = hlines->data;
352 nout += sprintf(out+nout,"@%c%c",hline->type[0],hline->type[1]);
354 list_t *tags = hline->tags;
357 HeaderTag *tag = tags->data;
358 nout += sprintf(out+nout,"\t");
359 if ( tag->key[0]!=' ' || tag->key[1]!=' ' )
360 nout += sprintf(out+nout,"%c%c:", tag->key[0],tag->key[1]);
361 nout += sprintf(out+nout,"%s", tag->value);
364 hlines = hlines->next;
365 nout += sprintf(out+nout,"\n");
// Parse the header text into a list of header lines, one entry per line of text.
371 HeaderDict *sam_header_parse(const char *headerText)
373 list_t *hlines = NULL;
// nextline() copies the current line into buf and returns the start of the next one;
// the loop ends when it returns NULL.
383 while ( (text=nextline(&buf, &nbuf, text)) )
385 hline = sam_header_line_parse(buf);
// Only lines that pass validation are appended.
386 if ( sam_header_line_validate(hline) )
387 hlines = list_append(hlines, hline);
// NOTE(review): this releases everything collected so far, presumably on the path taken
// when a line fails validation; the failing hline itself is not in that list - confirm it
// is released too.
390 sam_header_free(hlines);
// buf is the line buffer allocated by nextline() and is owned by this function.
394 if ( buf ) free(buf);
// Build a hash table from the header lines of the given two-character type: the value of
// the tag key_tag is mapped to the value of the tag value_tag.
// The table stores the tags' own value pointers (key->value and value->value), no copies
// are made here, so it refers to memory owned by dict.
399 khash_t(str) *sam_header_lookup_table(const HeaderDict *dict, char type[2], char key_tag[2], char value_tag[2])
401 const list_t *l = dict;
402 khash_t(str) *tbl = kh_init(str);
408 HeaderLine *hline = l->data;
// Lines of a different type are not considered.
409 if ( hline->type[0]!=type[0] || hline->type[1]!=type[1] )
415 HeaderTag *key, *value;
416 key = header_line_has_tag(hline,key_tag);
417 value = header_line_has_tag(hline,value_tag);
// Both tags have to be present on the line.
418 if ( !key || !value )
// A key that is already in the table is reported; the entry is then written again with
// the value from the current line.
// NOTE(review): "They key" in the message below looks like a typo for "The key".
424 k = kh_get(str, tbl, key->value);
425 if ( k != kh_end(tbl) )
426 debug("[sam_header_lookup_table] They key %s not unique.\n", key->value);
427 k = kh_put(str, tbl, key->value, &ret);
428 kh_value(tbl, k) = value->value;
438 HeaderDict *sam_header_merge(int n, const HeaderDict **dicts)
440 HeaderDict *out=NULL;
443 for (idict=0; idict<n; idict++)
445 list_t *hlines = dicts[idict];
448 HeaderLine *hline = sam_header_has_line(out, hlines->data);
449 sam_header_line_merge(hline,hlines->data);
450 hlines = hlines->next;