1 #include "sam_header.h"
/*
 * Two-letter tag tables, one group per header record type.
 *   o_*  tags that may appear on the record (optional)
 *   r_*  tags that must appear on the record (required)
 *   u_*  tag whose value identifies the record when lines are compared
 * Every table is terminated by a NULL entry.
 */

/* HD records */
const char *o_hd_tags[] = { "SO", "GO", NULL };
const char *r_hd_tags[] = { "VN", NULL };

/* SQ records */
const char *o_sq_tags[] = { "AS", "M5", "UR", "SP", NULL };
const char *r_sq_tags[] = { "SN", "LN", NULL };
const char *u_sq_tags[] = { "SN", NULL };

/* RG records */
const char *o_rg_tags[] = { "LB", "DS", "PU", "PI", "CN", "DT", "PL", NULL };
const char *r_rg_tags[] = { "ID", "SM", NULL };
const char *u_rg_tags[] = { "ID", NULL };

/* PG records */
const char *o_pg_tags[] = { "VN", "CL", NULL };
const char *r_pg_tags[] = { "ID", NULL };

/*
 * Known record types. The three dispatch arrays below are indexed by the
 * position of a type in types[]; a NULL slot means the type has no table of
 * that kind (CO has none at all).
 */
const char *types[] = {
    "HD",
    "SQ",
    "RG",
    "PG",
    "CO",
    NULL
};

const char **optional_tags[] = {
    o_hd_tags,  /* HD */
    o_sq_tags,  /* SQ */
    o_rg_tags,  /* RG */
    o_pg_tags,  /* PG */
    NULL,       /* CO */
    NULL
};

const char **required_tags[] = {
    r_hd_tags,  /* HD */
    r_sq_tags,  /* SQ */
    r_rg_tags,  /* RG */
    r_pg_tags,  /* PG */
    NULL,       /* CO */
    NULL
};

const char **unique_tags[] = {
    NULL,       /* HD */
    u_sq_tags,  /* SQ */
    u_rg_tags,  /* RG */
    NULL,       /* PG */
    NULL,       /* CO */
    NULL
};
// Writes a printf-style formatted diagnostic message to stderr.
// format: printf-style format string; the variadic arguments supply its values.
28 void debug(const char *format, ...)
32 vfprintf(stderr, format, ap); // ap carries the caller's variadic arguments
// Writes a printf-style formatted error message to stderr.
// format: printf-style format string; the variadic arguments supply its values.
// NOTE(review): callers such as sam_header_compare_lines keep using an index
// they have just reported as invalid, so this presumably does not return --
// confirm against the part of the body that is not visible here.
36 void error(const char *format, ...)
40 vfprintf(stderr, format, ap); // ap carries the caller's variadic arguments
// Adds a node for `data` to the list `root` and returns the list to use from
// now on. Callers in this file pass NULL for an empty list and always store
// the returned pointer back into their list variable.
45 list_t *list_append(list_t *root, void *data)
52 l->next = malloc(sizeof(list_t)); // new node linked after the existing node l
57 l = malloc(sizeof(list_t)); // new node not linked from any existing node (presumably the empty-list case)
// NOTE(review): neither malloc result is checked in the lines visible here -- confirm.
// Releases the list that starts at `root`.
// NOTE(review): sam_header_free visits every tag payload itself before calling
// this on the tag list, so presumably only the list nodes (not their data) are
// freed here -- confirm against the body, which is not visible.
65 void list_free(list_t *root)
78 // Look for a tag "XY" in a predefined const char *[] array.
// Only the first two characters of `tag` and of each array entry are compared.
// Returns the index of the matching entry (which can be 0), or -1 when `tags`
// is NULL. Callers compare the result with -1 to detect "not found".
79 int tag_exists(const char *tag, const char **tags)
82 if ( !tags ) return -1; // a type without a table cannot contain the tag
85 if ( tags[itag][0]==tag[0] && tags[itag][1]==tag[1] ) return itag; // match on both characters
// Mimics the behaviour of getline() on an in-memory text, except that it
// returns a pointer to the next chunk of the text, or NULL if everything has
// been read.
//
//   lineptr  in/out line buffer. Pass *lineptr == NULL on the first call; the
//            buffer is grown on demand and must be freed by the caller.
//   n        in/out capacity of *lineptr in bytes.
//   text     NUL-terminated text positioned at the start of a line.
//
// The line is stored without its terminator. "\n", "\r\n" and a lone "\r" are
// all accepted as line terminators. An empty line yields an empty string.
// The program is terminated with a message if the buffer cannot be grown.
const char *nextline(char **lineptr, size_t *n, const char *text)
{
    const char *end = text;
    size_t need;

    if ( !*end ) return NULL;

    while ( *end && *end!='\n' && *end!='\r' ) end++;
    need = (size_t)(end - text) + 1;    // line characters plus the terminating NUL

    // Advance the pointer for the next call. A lone '\r' has to be consumed
    // too: the scan above stops on it, so leaving it in place would hand the
    // very same position back to the caller on every call.
    if ( *end=='\r' && end[1]=='\n' ) end += 2;
    else if ( *end=='\n' || *end=='\r' ) end++;

    if ( !*lineptr || *n < need )
    {
        // Keep the old buffer reachable until the new one is known to be valid.
        char *grown = realloc(*lineptr, need);
        if ( !grown )
        {
            fprintf(stderr, "[nextline] out of memory\n");
            exit(EXIT_FAILURE);
        }
        *lineptr = grown;
        *n = need;
    }

    memcpy(*lineptr, text, need-1);
    (*lineptr)[need-1] = 0;

    return end;
}
134 // name points to "XY", value_from points to the first character of the value string and
135 // value_to points to the last character of the value string.
136 HeaderTag *new_tag(const char *name, const char *value_from, const char *value_to)
138 HeaderTag *tag = malloc(sizeof(HeaderTag));
139 int len = value_to-value_from+1;
141 tag->key[0] = name[0];
142 tag->key[1] = name[1];
143 tag->value = malloc(len+1);
144 memcpy(tag->value,value_from,len+1);
// Searches the tag list of `hline` for a tag whose two-character key equals the
// first two characters of `key` and returns that tag. Callers in this file
// treat a NULL result as "the line does not carry this tag".
149 HeaderTag *header_line_has_tag(HeaderLine *hline, const char *key)
151 list_t *tags = hline->tags;
154 HeaderTag *tag = tags->data;
155 if ( tag->key[0]==key[0] && tag->key[1]==key[1] ) return tag; // returns as soon as a tag matches
// Compares two header lines and classifies how they relate for merging.
// Return values:
163 // 0 .. different types or unique tags differ or conflicting tags, cannot be merged
164 // 1 .. all tags identical -> no need to merge, drop one
165 // 2 .. the unique tags match and there are some conflicting tags (same tag, different value) -> error, cannot be merged nor duplicated
166 // 3 .. there are some missing complementary tags and no unique conflict -> can be merged into a single line
// A value conflict yields 2 for types that define a unique tag and 0 otherwise.
167 int sam_header_compare_lines(HeaderLine *hline1, HeaderLine *hline2)
// Lines of different record types are never comparable.
171 if ( hline1->type[0]!=hline2->type[0] || hline1->type[1]!=hline2->type[1] )
// itype indexes the unique/required/optional tag tables for this record type.
174 int itype = tag_exists(hline1->type,types);
175 if ( itype==-1 ) error("[sam_header_compare_lines] Unknown type [%c%c]\n", hline1->type[0],hline1->type[1]);
// Types with a unique tag: the lines can only relate if that tag's value matches.
// Only the first entry of the unique-tag table is consulted.
177 if ( unique_tags[itype] )
179 t1 = header_line_has_tag(hline1,unique_tags[itype][0]);
180 t2 = header_line_has_tag(hline2,unique_tags[itype][0]);
181 if ( !t1 || !t2 ) // this should never happen, the unique tags are required
184 if ( strcmp(t1->value,t2->value) )
185 return 0; // the unique tags differ, cannot be merged
// Types with no tag tables at all: compare the value of the first tag of each line.
187 if ( !required_tags[itype] && !optional_tags[itype] )
189 t1 = hline1->tags->data;
190 t2 = hline2->tags->data;
191 if ( !strcmp(t1->value,t2->value) ) return 1; // identical comments
// `missing` records that some tag is present on only one of the two lines.
195 int missing=0, itag=0;
// Pass over the required tags of this type.
196 while ( required_tags[itype] && required_tags[itype][itag] )
198 t1 = header_line_has_tag(hline1,required_tags[itype][itag]);
199 t2 = header_line_has_tag(hline2,required_tags[itype][itag]);
201 return 2; // this should never happen
202 else if ( !t1 || !t2 )
203 missing = 1; // there is some tag missing in one of the hlines
204 else if ( strcmp(t1->value,t2->value) )
206 if ( unique_tags[itype] )
207 return 2; // the lines have a matching unique tag but have a conflicting tag
209 return 0; // the lines contain conflicting tags, cannot be merged
// Pass over the optional tags of this type.
// NOTE(review): the reset of itag between the two passes is not visible here -- confirm.
214 while ( optional_tags[itype] && optional_tags[itype][itag] )
216 t1 = header_line_has_tag(hline1,optional_tags[itype][itag]);
217 t2 = header_line_has_tag(hline2,optional_tags[itype][itag]);
224 missing = 1; // there is some tag missing in one of the hlines
225 else if ( strcmp(t1->value,t2->value) )
227 if ( unique_tags[itype] )
228 return 2; // the lines have a matching unique tag but have a conflicting tag
230 return 0; // the lines contain conflicting tags, cannot be merged
234 if ( missing ) return 3; // there are some missing complementary tags with no conflicts, can be merged
// Returns a newly allocated copy of `hline`. The two-character type is copied
// and each visited tag is duplicated into a fresh HeaderTag with its own
// strdup'ed value, so the copy shares no tag storage with the original.
// NOTE(review): the malloc/strdup results are not checked in the lines visible here.
239 HeaderLine *sam_header_line_clone(const HeaderLine *hline)
242 HeaderLine *out = malloc(sizeof(HeaderLine));
243 out->type[0] = hline->type[0];
244 out->type[1] = hline->type[1];
250 HeaderTag *old = tags->data;
// `new` is a valid identifier in C but a keyword in C++.
252 HeaderTag *new = malloc(sizeof(HeaderTag));
253 new->key[0] = old->key[0];
254 new->key[1] = old->key[1];
255 new->value = strdup(old->value); // independent copy of the value string
256 out->tags = list_append(out->tags, new); // list_append returns the list to keep
// Copies tags from the template line `tmpl_hline` into `out_hline`. Each copied
// tag is a fresh HeaderTag with a strdup'ed value appended to out_hline's tags.
// The two lines are first compared by their two-character type.
// NOTE(review): the condition guarding the copy is not visible here; presumably
// only tags that out_hline does not already carry (out_tag == NULL) are added,
// and a type mismatch makes the function bail out -- confirm.
263 int sam_header_line_merge_with(HeaderLine *out_hline, const HeaderLine *tmpl_hline)
267 if ( out_hline->type[0]!=tmpl_hline->type[0] || out_hline->type[1]!=tmpl_hline->type[1] )
270 tmpl_tags = tmpl_hline->tags;
273 HeaderTag *tmpl_tag = tmpl_tags->data;
274 HeaderTag *out_tag = header_line_has_tag(out_hline, tmpl_tag->key); // does the output line already have this tag?
277 HeaderTag *tag = malloc(sizeof(HeaderTag));
278 tag->key[0] = tmpl_tag->key[0];
279 tag->key[1] = tmpl_tag->key[1];
280 tag->value = strdup(tmpl_tag->value); // independent copy of the value string
281 out_hline->tags = list_append(out_hline->tags,tag);
283 tmpl_tags = tmpl_tags->next; // move on to the next template tag
// Parses one header line into a newly allocated HeaderLine.
// The line must start with '@' followed by a two-character record type; the
// remaining fields are separated by single tab characters (a run of several
// tabs is reported through error()).
// For record types that have neither a required nor an optional tag table the
// whole field becomes the value of a tag with a blank key. For all other types
// a field is read as "KK:value": the key is the first two characters and the
// value starts at offset 3.
// A tag that occurs more than once is reported through debug() and still appended.
289 HeaderLine *sam_header_line_parse(const char *headerLine)
293 const char *from, *to;
296 if ( *from != '@' ) error("[sam_header_line_parse] expected '@', got [%s]\n", headerLine);
// Find the end of the record type.
299 while (*to && *to!='\t') to++;
300 if ( to-from != 2 ) error("[sam_header_line_parse] expected '@XY', got [%s]\n", headerLine);
302 hline = malloc(sizeof(HeaderLine));
303 hline->type[0] = from[0];
304 hline->type[1] = from[1];
// NOTE(review): itype is used below to index the tag tables; the handling of
// an unknown type (tag_exists returning -1) is not visible here -- confirm.
307 int itype = tag_exists(hline->type, types);
// Skip the separator after the type.
310 while (*to && *to=='\t') to++;
312 error("[sam_header_line_parse] multiple tabs on line [%s] (%d)\n", headerLine,(int)(to-from));
// Find the end of the current field.
316 while (*to && *to!='\t') to++;
318 if ( !required_tags[itype] && !optional_tags[itype] )
// NOTE(review): print_header_line and sam_header_write test for a key of two
// blanks; confirm that this literal really holds two spaces in the file.
319 tag = new_tag(" ",from,to-1);
// NOTE(review): from+3 assumes the field is at least "KK:" long; a shorter
// field would place the value start beyond the field end -- confirm a guard exists.
321 tag = new_tag(from,from+3,to-1);
323 if ( header_line_has_tag(hline,tag->key) )
324 debug("The tag '%c%c' present (at least) twice on line [%s]\n", tag->key[0],tag->key[1], headerLine);
325 hline->tags = list_append(hline->tags, tag);
// Skip the separator before the next field; exactly one tab is expected.
328 while (*to && *to=='\t') to++;
329 if ( *to && to-from != 1 )
330 error("[sam_header_line_parse] multiple tabs on line [%s] (%d)\n", headerLine,(int)(to-from));
338 // Must be of an existing type, all tags must be recognised and all required tags must be present
339 int sam_header_line_validate(HeaderLine *hline)
345 // Is the type correct?
346 itype = tag_exists(hline->type, types);
349 debug("The type [%c%c] not recognised.\n", hline->type[0],hline->type[1]);
353 // Has all required tags?
355 while ( required_tags[itype] && required_tags[itype][itag] )
357 if ( !header_line_has_tag(hline,required_tags[itype][itag]) )
359 debug("The tag [%c%c] required for [%c%c] not present.\n", required_tags[itype][itag][0],required_tags[itype][itag][1],
360 hline->type[0],hline->type[1]);
366 // Are all tags recognised?
371 if ( !tag_exists(tag->key,required_tags[itype]) && !tag_exists(tag->key,optional_tags[itype]) )
373 debug("Unknown tag [%c%c] for [%c%c].\n", tag->key[0],tag->key[1], hline->type[0],hline->type[1]);
// Writes `hline` to `fp`: "@" and the two-character type first, then its tags.
// A tag is written as "KK:value"; the "KK:" prefix is left out when the key
// consists of two blanks, leaving just the value.
// NOTE(review): the separators between tags and the end of line are emitted by
// lines that are not visible here.
383 void print_header_line(FILE *fp, HeaderLine *hline)
385 list_t *tags = hline->tags;
388 fprintf(fp, "@%c%c", hline->type[0],hline->type[1]);
394 if ( tag->key[0]!=' ' || tag->key[1]!=' ' ) // skip the key prefix for blank-key tags
395 fprintf(fp, "%c%c:", tag->key[0],tag->key[1]);
396 fprintf(fp, "%s", tag->value);
// Releases a header: walks every line, visits each of its tags, then hands the
// line's tag list to list_free() before moving on to the next line.
// NOTE(review): the free() calls for tag values, tags, lines and the outer list
// are not visible here -- confirm each allocation is released exactly once.
404 void sam_header_free(HeaderDict *header)
406 list_t *hlines = header;
409 HeaderLine *hline = hlines->data;
410 list_t *tags = hline->tags;
413 HeaderTag *tag = tags->data;
418 list_free(hline->tags); // release the tag list of this line
420 hlines = hlines->next;
// Builds a copy of the header `dict`: each visited line is duplicated with
// sam_header_line_clone() and appended to a new list, which starts out as NULL.
// NOTE(review): presumably an empty `dict` therefore yields NULL -- the loop and
// the return statement are not visible here.
425 HeaderDict *sam_header_clone(const HeaderDict *dict)
427 HeaderDict *out = NULL;
430 HeaderLine *hline = dict->data;
431 out = list_append(out, sam_header_line_clone(hline)); // list_append returns the list to keep
437 // Returns a newly allocated string
438 char *sam_header_write(const HeaderDict *header)
442 const list_t *hlines;
444 // Calculate the length of the string to allocate
448 len += 4; // @XY and \n
450 HeaderLine *hline = hlines->data;
451 list_t *tags = hline->tags;
454 HeaderTag *tag = tags->data;
455 len += strlen(tag->value) + 1; // \t
456 if ( tag->key[0]!=' ' || tag->key[1]!=' ' )
457 len += strlen(tag->value) + 3; // XY:
460 hlines = hlines->next;
468 HeaderLine *hline = hlines->data;
470 nout += sprintf(out+nout,"@%c%c",hline->type[0],hline->type[1]);
472 list_t *tags = hline->tags;
475 HeaderTag *tag = tags->data;
476 nout += sprintf(out+nout,"\t");
477 if ( tag->key[0]!=' ' || tag->key[1]!=' ' )
478 nout += sprintf(out+nout,"%c%c:", tag->key[0],tag->key[1]);
479 nout += sprintf(out+nout,"%s", tag->value);
482 hlines = hlines->next;
483 nout += sprintf(out+nout,"\n");
// Parses a complete header text into a HeaderDict. The text is split into lines
// with nextline(); each line is parsed with sam_header_line_parse() and, when it
// passes sam_header_line_validate(), appended to the result list.
// NOTE(review): sam_header_free(hlines) presumably sits on the branch taken when
// validation fails. In the lines visible here neither the just-parsed `hline`
// nor `buf` is released on that path -- check for a leak.
489 HeaderDict *sam_header_parse(const char *headerText)
491 list_t *hlines = NULL;
// nextline() refills buf with the current line and returns the start of the next one.
501 while ( (text=nextline(&buf, &nbuf, text)) )
503 hline = sam_header_line_parse(buf);
504 if ( sam_header_line_validate(hline) )
505 hlines = list_append(hlines, hline);
508 sam_header_free(hlines); // discard the lines collected so far
512 if ( buf ) free(buf); // the line buffer is owned by this function
// Builds a khash string table from the lines of record type `type`: the value of
// the tag `key_tag` becomes the hash key and the value of the tag `value_tag`
// becomes the stored value.
// The visible code stores the pointers key->value and value->value as they are,
// without copying, so the table refers to strings owned by `dict`.
// A key that is already in the table is reported through debug() and its stored
// value is then overwritten.
517 khash_t(str) *sam_header_lookup_table(const HeaderDict *dict, char type[2], char key_tag[2], char value_tag[2])
519 const list_t *l = dict;
520 khash_t(str) *tbl = kh_init(str);
526 HeaderLine *hline = l->data;
// Only lines of the requested record type are of interest.
527 if ( hline->type[0]!=type[0] || hline->type[1]!=type[1] )
533 HeaderTag *key, *value;
534 key = header_line_has_tag(hline,key_tag);
535 value = header_line_has_tag(hline,value_tag);
// Both tags must be present on the line for it to contribute an entry.
536 if ( !key || !value )
542 k = kh_get(str, tbl, key->value);
543 if ( k != kh_end(tbl) ) // kh_get found an existing entry for this key
544 debug("[sam_header_lookup_table] They key %s not unique.\n", key->value);
545 k = kh_put(str, tbl, key->value, &ret);
546 kh_value(tbl, k) = value->value;
// Merges the `n` headers in `dicts` into a newly built header.
// Returns NULL when fewer than two headers are given. Otherwise the result
// starts as a clone of dicts[0]; every line of each further header is compared
// against the lines collected so far with sam_header_compare_lines().
// NOTE(review): the branch conditions on `status` are not visible here. Judging
// from the calls below, a line is either reported as conflicting (both lines are
// printed to stderr, then error()), merged into the matching output line, or
// cloned and appended to the output -- confirm which status leads where.
554 HeaderDict *sam_header_merge(int n, const HeaderDict **dicts)
556 HeaderDict *out_dict;
559 if ( n<2 ) return NULL;
561 out_dict = sam_header_clone(dicts[0]);
563 for (idict=1; idict<n; idict++)
565 const list_t *tmpl_hlines = dicts[idict];
567 while ( tmpl_hlines )
// Restart the scan of the output lines for every template line.
569 list_t *out_hlines = out_dict;
573 status = sam_header_compare_lines(tmpl_hlines->data, out_hlines->data);
576 out_hlines = out_hlines->next;
582 print_header_line(stderr,tmpl_hlines->data);
583 print_header_line(stderr,out_hlines->data);
584 error("Conflicting lines, cannot merge the headers.\n");
587 sam_header_line_merge_with(out_hlines->data, tmpl_hlines->data);
593 out_dict = list_append(out_dict, sam_header_line_clone(tmpl_hlines->data));
595 tmpl_hlines = tmpl_hlines->next;