CC= gcc
-CFLAGS= -g -Wall -O2 #-m64 #-arch ppc
+CFLAGS= -g -Wall #-O2 #-m64 #-arch ppc
DFLAGS= -D_FILE_OFFSET_BITS=64 -D_USE_KNETFILE -D_CURSES_LIB=1
LOBJS= bgzf.o kstring.o bam_aux.o bam.o bam_import.o sam.o bam_index.o \
bam_pileup.o bam_lpileup.o bam_md.o glf.o razf.o faidx.o knetfile.o \
- bam_sort.o
+ bam_sort.o sam_header.o
AOBJS= bam_tview.o bam_maqcns.o bam_plcmd.o sam_view.o \
bam_rmdup.o bam_rmdupse.o bam_mate.o bam_stat.o bam_color.o \
bamtk.o kaln.o
$(CC) $(CFLAGS) -o $@ bgzf.o bgzip.o -lz
razip.o:razf.h
-bam.o:bam.h razf.h bam_endian.h kstring.h
+bam.o:bam.h razf.h bam_endian.h kstring.h sam_header.h
sam.o:sam.h bam.h
bam_import.o:bam.h kseq.h khash.h razf.h
bam_pileup.o:bam.h razf.h ksort.h
bam_sort.o:bam.h ksort.h razf.h
bam_md.o:bam.h faidx.h
glf.o:glf.h
+sam_header.o:sam_header.h khash.h
faidx.o:faidx.h razf.h khash.h
faidx_main.o:faidx.h razf.h
#include "bam.h"
#include "bam_endian.h"
#include "kstring.h"
+#include "sam_header.h"
int bam_is_be = 0;
char *bam_flag2char_table = "pPuUrR12sfd\0\0\0\0\0";
free(header->target_len);
}
free(header->text);
-#ifndef BAM_NO_HASH
- if (header->rg2lib) bam_strmap_destroy(header->rg2lib);
+ if (header->dict) sam_header_free(header->dict);
+ if (header->rg2lib) sam_tbl_destroy(header->rg2lib);
bam_destroy_header_hash(header);
-#endif
free(header);
}
printf("%s\n", s);
free(s);
}
+
+const char *bam_get_library(const bam_header_t *header, const bam1_t *b)
+{
+ const uint8_t *rg;
+ rg = bam_aux_get(b, "RG");
+ return (rg == 0)? 0 : sam_tbl_get(header->rg2lib, (const char*)(rg + 1));
+}
@field n_targets number of reference sequences
@field target_name names of the reference sequences
@field target_len lengths of the referene sequences
+ @field dict header dictionary
@field hash hash table for fast name lookup
@field rg2lib hash table for @RG-ID -> LB lookup
@field l_text length of the plain text in the header
int32_t n_targets;
char **target_name;
uint32_t *target_len;
- void *hash, *rg2lib;
+ void *dict, *hash, *rg2lib;
int l_text;
char *text;
} bam_header_t;
char *bam_format1_core(const bam_header_t *header, const bam1_t *b, int of);
+ const char *bam_get_library(const bam_header_t *header, const bam1_t *b);
+
/*! @typedef
@abstract Structure for one alignment covering the pileup position.
@field b pointer to the alignment
if (type == 'Z' || type == 'H') return (char*)s;
else return 0;
}
-
-/******************
- * rg2lib related *
- ******************/
-
-int bam_strmap_put(void *rg2lib, const char *rg, const char *lib)
-{
- int ret;
- khint_t k;
- khash_t(r2l) *h = (khash_t(r2l)*)rg2lib;
- char *key;
- if (h == 0) return 1;
- key = strdup(rg);
- k = kh_put(r2l, h, key, &ret);
- if (ret) kh_val(h, k) = strdup(lib);
- else {
- fprintf(stderr, "[bam_rg2lib_put] duplicated @RG ID: %s\n", rg);
- free(key);
- }
- return 0;
-}
-
-const char *bam_strmap_get(const void *rg2lib, const char *rg)
-{
- const khash_t(r2l) *h = (const khash_t(r2l)*)rg2lib;
- khint_t k;
- if (h == 0) return 0;
- k = kh_get(r2l, h, rg);
- if (k != kh_end(h)) return (const char*)kh_val(h, k);
- else return 0;
-}
-
-void *bam_strmap_dup(const void *rg2lib)
-{
- const khash_t(r2l) *h = (const khash_t(r2l)*)rg2lib;
- khash_t(r2l) *g;
- khint_t k, l;
- int ret;
- if (h == 0) return 0;
- g = kh_init(r2l);
- for (k = kh_begin(h); k < kh_end(h); ++k) {
- if (kh_exist(h, k)) {
- char *key = strdup(kh_key(h, k));
- l = kh_put(r2l, g, key, &ret);
- kh_val(g, l) = strdup(kh_val(h, k));
- }
- }
- return g;
-}
-
-void *bam_strmap_init()
-{
- return (void*)kh_init(r2l);
-}
-
-void bam_strmap_destroy(void *rg2lib)
-{
- khash_t(r2l) *h = (khash_t(r2l)*)rg2lib;
- khint_t k;
- if (h == 0) return;
- for (k = kh_begin(h); k < kh_end(h); ++k) {
- if (kh_exist(h, k)) {
- free((char*)kh_key(h, k)); free(kh_val(h, k));
- }
- }
- kh_destroy(r2l, h);
-}
#endif
#include "kstring.h"
#include "bam.h"
+#include "sam_header.h"
#include "kseq.h"
#include "khash.h"
int sam_header_parse_rg(bam_header_t *h)
{
- kstring_t *rgid, *rglib;
- char *p, *q, *s, *r;
- int n = 0;
-
- // free
- if (h == 0) return 0;
- bam_strmap_destroy(h->rg2lib); h->rg2lib = 0;
- if (h->l_text < 3) return 0;
- // parse @RG lines
- h->rg2lib = bam_strmap_init();
- rgid = calloc(1, sizeof(kstring_t));
- rglib = calloc(1, sizeof(kstring_t));
- s = h->text;
- while ((s = strstr(s, "@RG")) != 0) {
- if (rgid->l && rglib->l) {
- bam_strmap_put(h->rg2lib, rgid->s, rglib->s);
- ++n;
- }
- rgid->l = rglib->l = 0;
- s += 3;
- r = s;
- if ((p = strstr(s, "ID:")) != 0) {
- q = p + 3;
- for (p = q; *p && *p != '\t' && *p != '\r' && *p != '\n'; ++p);
- kputsn(q, p - q, rgid);
- } else {
- fprintf(stderr, "[bam_header_parse] missing ID tag in @RG lines.\n");
- break;
- }
- if (r < p) r = p;
- if ((p = strstr(s, "LB:")) != 0) {
- q = p + 3;
- for (p = q; *p && *p != '\t' && *p != '\r' && *p != '\n'; ++p);
- kputsn(q, p - q, rglib);
- } else {
- fprintf(stderr, "[bam_header_parse] missing LB tag in @RG lines.\n");
- break;
- }
- if (r < p) r = p;
- }
- if (rgid->l && rglib->l) {
- bam_strmap_put(h->rg2lib, rgid->s, rglib->s);
- ++n;
- }
- free(rgid->s); free(rgid);
- free(rglib->s); free(rglib);
- if (n == 0) {
- bam_strmap_destroy(h->rg2lib);
- h->rg2lib = 0;
- }
- return n;
+ if (h->dict == 0) h->dict = sam_header_parse2(h->text);
+ if (h->rg2lib) h->rg2lib = sam_header2tbl(h->dict, "RG", "ID", "LB");
+ return sam_tbl_size(h->rg2lib);
}
int sam_header_parse(bam_header_t *h)
{
+ void *tbl;
+ char **tmp;
int i;
- char *s, *p, *q, *r;
-
- // free
free(h->target_len); free(h->target_name);
h->n_targets = 0; h->target_len = 0; h->target_name = 0;
if (h->l_text < 3) return 0;
- // count number of @SQ
- s = h->text;
- while ((s = strstr(s, "@SQ")) != 0) {
- ++h->n_targets;
- s += 3;
+ if (h->dict == 0) h->dict = sam_header_parse2(h->text);
+ tbl = sam_header2tbl(h->dict, "SQ", "SN", "LN");
+ h->n_targets = sam_tbl_size(tbl);
+ if (h->n_targets == 0) {
+ sam_tbl_destroy(tbl);
+ return 0;
}
- if (h->n_targets == 0) return 0;
h->target_len = (uint32_t*)calloc(h->n_targets, 4);
h->target_name = (char**)calloc(h->n_targets, sizeof(void*));
- // parse @SQ lines
- i = 0;
- s = h->text;
- while ((s = strstr(s, "@SQ")) != 0) {
- s += 3;
- r = s;
- if ((p = strstr(s, "SN:")) != 0) {
- q = p + 3;
- for (p = q; *p && *p != '\t' && *p != '\r' && *p != '\n'; ++p);
- h->target_name[i] = (char*)calloc(p - q + 1, 1);
- strncpy(h->target_name[i], q, p - q);
- } else goto header_err_ret;
- if (r < p) r = p;
- if ((p = strstr(s, "LN:")) != 0) h->target_len[i] = strtol(p + 3, 0, 10);
- else goto header_err_ret;
- if (r < p) r = p;
- s = r + 3;
- ++i;
- }
- sam_header_parse_rg(h);
+ tmp = (char**)calloc(h->n_targets, sizeof(void*));
+ sam_tbl_pair(tbl, h->target_name, tmp);
+ for (i = 0; i < h->n_targets; ++i)
+ h->target_len[i] = atoi(tmp[i]);
+ free(tmp);
+ sam_tbl_destroy(tbl);
return h->n_targets;
-
-header_err_ret:
- fprintf(stderr, "[bam_header_parse] missing SN or LN tag in @SQ lines.\n");
- free(h->target_len); free(h->target_name);
- h->n_targets = 0; h->target_len = 0; h->target_name = 0;
- return 0;
}
bam_header_t *sam_header_read(tamFile fp)
} else if (c->isize > 0) { // paired, head
uint64_t key = (uint64_t)c->pos<<32 | c->isize;
const char *lib;
- const uint8_t *rg;
lib_aux_t *q;
int ret;
- rg = bam_aux_get(b, "RG");
- lib = (rg == 0)? 0 : bam_strmap_get(in->header->rg2lib, (char*)(rg + 1));
+ lib = bam_get_library(in->header, b);
q = lib? get_aux(aux, lib) : get_aux(aux, "\t");
++q->n_checked;
k = kh_put(pos, q->best_hash, key, &ret);
push_queue(queue, b, endpos, score);
} else {
const char *lib;
- const uint8_t *rg;
lib_aux_t *q;
besthash_t *h;
uint32_t key;
int ret;
- rg = bam_aux_get(b, "RG");
- lib = (rg == 0)? 0 : bam_strmap_get(in->header->rg2lib, (char*)(rg + 1));
+ lib = bam_get_library(in->header, b);
q = lib? get_aux(aux, lib) : get_aux(aux, "\t");
++q->n_checked;
h = (c->flag&BAM_FREVERSE)? q->rght : q->left;
int M, I, D;
} dpscore_t;
-/* build score profile for accelerating alignment, in theory */
-static void aln_init_score_array(uint8_t *seq, int len, int row, int *score_matrix, int **s_array)
-{
- int *tmp, *tmp2, i, k;
- for (i = 0; i != row; ++i) {
- tmp = score_matrix + i * row;
- tmp2 = s_array[i];
- for (k = 0; k != len; ++k)
- tmp2[k] = tmp[seq[k]];
- }
-}
/***************************
* banded global alignment *
***************************/
h->target_len[i] = h0->target_len[i];
h->target_name[i] = strdup(h0->target_name[i]);
}
- if (h0->rg2lib) h->rg2lib = bam_strmap_dup(h0->rg2lib);
return h;
}
static void append_header_text(bam_header_t *header, char* text, int len)
#include <stdlib.h>
#include <stdarg.h>
+#include "khash.h"
+KHASH_MAP_INIT_STR(str, const char *)
+
+struct _HeaderList
+{
+ struct _HeaderList *next;
+ void *data;
+};
+typedef struct _HeaderList list_t;
+typedef list_t HeaderDict;
+
+typedef struct
+{
+ char key[2];
+ char *value;
+}
+HeaderTag;
+
+typedef struct
+{
+ char type[2];
+ list_t *tags;
+}
+HeaderLine;
+
const char *o_hd_tags[] = {"SO","GO",NULL};
const char *r_hd_tags[] = {"VN",NULL};
const char *u_sq_tags[] = {"SN",NULL};
const char *o_rg_tags[] = {"LB","DS","PU","PI","CN","DT","PL",NULL};
-const char *r_rg_tags[] = {"ID","SM",NULL};
+const char *r_rg_tags[] = {"ID",NULL};
const char *u_rg_tags[] = {"ID",NULL};
const char *o_pg_tags[] = {"VN","CL",NULL};
}
-void sam_header_free(HeaderDict *header)
+void sam_header_free(void *_header)
{
+ HeaderDict *header = (HeaderDict*)_header;
list_t *hlines = header;
while (hlines)
{
}
// Returns a newly allocated string
-char *sam_header_write(const HeaderDict *header)
+char *sam_header_write(const void *_header)
{
+ const HeaderDict *header = (const HeaderDict*)_header;
char *out = NULL;
int len=0, nout=0;
const list_t *hlines;
return out;
}
-HeaderDict *sam_header_parse(const char *headerText)
+void *sam_header_parse2(const char *headerText)
{
list_t *hlines = NULL;
HeaderLine *hline;
return hlines;
}
-khash_t(str) *sam_header_lookup_table(const HeaderDict *dict, char type[2], char key_tag[2], char value_tag[2])
+void *sam_header2tbl(const void *_dict, char type[2], char key_tag[2], char value_tag[2])
{
+ const HeaderDict *dict = (const HeaderDict*)_dict;
const list_t *l = dict;
khash_t(str) *tbl = kh_init(str);
khiter_t k;
return tbl;
}
+const char *sam_tbl_get(void *h, const char *key)
+{
+ khash_t(str) *tbl = (khash_t(str)*)h;
+ khint_t k;
+ k = kh_get(str, tbl, key);
+ return k == kh_end(tbl)? 0 : kh_val(tbl, k);
+}
+
+int sam_tbl_size(void *h)
+{
+ khash_t(str) *tbl = (khash_t(str)*)h;
+ return h? kh_size(tbl) : 0;
+}
+
+int sam_tbl_pair(void *h, char **keys, char **vals)
+{
+ khash_t(str) *tbl = (khash_t(str)*)h;
+ int i = 0;
+ khint_t k;
+ if (h == 0) return -1;
+ for (k = kh_begin(tbl); k != kh_end(tbl); ++k) {
+ if (kh_exist(tbl, k)) {
+ keys[i] = (char*)kh_key(tbl, k);
+ vals[i++] = (char*)kh_val(tbl, k);
+ }
+ }
+ return kh_size(tbl);
+}
+
+void sam_tbl_destroy(void *h)
+{
+ khash_t(str) *tbl = (khash_t(str)*)h;
+ kh_destroy(str, tbl);
+}
-HeaderDict *sam_header_merge(int n, const HeaderDict **dicts)
+void *sam_header_merge(int n, const void **_dicts)
{
+ const HeaderDict **dicts = (const HeaderDict**)_dicts;
HeaderDict *out_dict;
int idict, status;
#ifndef __SAM_HEADER_H__
#define __SAM_HEADER_H__
-#include "khash.h"
-KHASH_MAP_INIT_STR(str,const char *)
+#ifdef __cplusplus
+extern "C" {
+#endif
-// HeaderDict is a list_t of header lines. Each HeaderLine holds a list of tags.
-struct _list_t
-{
- struct _list_t *next;
- void *data;
-};
-typedef struct _list_t list_t;
-typedef list_t HeaderDict;
+ void *sam_header_parse2(const char *headerText);
+ void *sam_header_merge(int n, const void **dicts);
+ void sam_header_free(void *header);
+ char *sam_header_write(const void *headerDict); // returns a newly allocated string
-typedef struct
-{
- char key[2];
- char *value;
-}
-HeaderTag;
+ void *sam_header2tbl(const void *dict, char type[2], char key_tag[2], char value_tag[2]);
+ const char *sam_tbl_get(void *h, const char *key);
+ int sam_tbl_size(void *h);
+ int sam_tbl_pair(void *h, char **keys, char **vals);
+ void sam_tbl_destroy(void *h);
-typedef struct
-{
- char type[2];
- list_t *tags;
+#ifdef __cplusplus
}
-HeaderLine;
-
-
-void debug(const char *format, ...);
-void error(const char *format, ...);
-
-HeaderDict *sam_header_parse(const char *headerText);
-HeaderDict *sam_header_merge(int n, const HeaderDict **dicts);
-void sam_header_free(HeaderDict *header);
-char *sam_header_write(const HeaderDict *headerDict); // returns a newly allocated string
-
-khash_t(str) *sam_header_lookup_table(const HeaderDict *dict, char type[2], char key_tag[2], char value_tag[2]);
-
-list_t *list_append(list_t *root, void *data);
-void list_free(list_t *root);
-
-//char *sam_header_get(const HeaderDict *d, char type[2], int i, char tag[2]);
-//int sam_header_ins(HeaderDict *d, char tp[2], int i, char tg[2], const char *s);
-//int sam_header_del(HeaderDict *d, char type[2], int i, char tag[2]);
+#endif
#endif
#include <string.h>
#include <stdio.h>
#include <unistd.h>
+#include "sam_header.h"
#include "sam.h"
#include "faidx.h"
if (s) {
if (g_rg && strcmp(g_rg, (char*)(s + 1)) == 0) return 0;
if (g_library) {
- const char *p = bam_strmap_get(h->rg2lib, (char*)(s + 1));
+ const char *p = sam_tbl_get(h->rg2lib, (const char*)(s + 1));
return (p && strcmp(p, g_library) == 0)? 0 : 1;
} return 1;
} else return 1;