From: Heng Li Date: Tue, 10 Mar 2009 22:06:45 +0000 (+0000) Subject: * samtools-0.1.2-14 X-Git-Url: https://git.donarmstrong.com/?a=commitdiff_plain;h=1fff3215f3a8beeb98a8025af7a86c7839f0b100;p=samtools.git * samtools-0.1.2-14 * implemented GLFv3 --- diff --git a/bam_plcmd.c b/bam_plcmd.c index 34c48de..98a1e49 100644 --- a/bam_plcmd.c +++ b/bam_plcmd.c @@ -52,57 +52,61 @@ static khash_t(64) *load_pos(const char *fn, bam_header_t *h) } // an analogy to pileup_func() below -static int glt_func(uint32_t tid, uint32_t pos, int n, const bam_pileup1_t *pu, void *data) +static int glt3_func(uint32_t tid, uint32_t pos, int n, const bam_pileup1_t *pu, void *data) { pu_data_t *d = (pu_data_t*)data; bam_maqindel_ret_t *r = 0; int rb; glf1_t *g; - glf3_t g2; + glf3_t *g3; if (d->fai == 0) { - fprintf(stderr, "[glt_func] reference sequence is required for generating GLT. Abort!\n"); + fprintf(stderr, "[glt3_func] reference sequence is required for generating GLT. Abort!\n"); exit(1); } if (d->hash && kh_get(64, d->hash, (uint64_t)tid<<32|pos) == kh_end(d->hash)) return 0; + g3 = glf3_init1(); if (d->fai && (int)tid != d->tid) { if (d->ref) { // then write the end mark - memset(&g2, 0, sizeof(glf3_t)); - g2.type = GLF_TYPE_END; - bgzf_write(d->fp, &g2, sizeof(glf3_t)); + g3->rtype = GLF3_RTYPE_END; + glf3_write1(d->fp, g3); } - glf_ref_write(d->fp, d->h->target_name[tid]); // write reference + glf3_ref_write(d->fp, d->h->target_name[tid], d->h->target_len[tid]); // write reference free(d->ref); d->ref = fai_fetch(d->fai, d->h->target_name[tid], &d->len); d->tid = tid; + d->last_pos = 0; } rb = (d->ref && (int)pos < d->len)? d->ref[pos] : 'N'; g = bam_maqcns_glfgen(n, pu, bam_nt16_table[rb], d->c); - memcpy(&g2, g, sizeof(glf1_t)); - g2.type = GLF_TYPE_NORMAL; - g2.offset = pos - d->last_pos; + memcpy(g3, g, sizeof(glf1_t)); + g3->rtype = GLF3_RTYPE_SUB; + g3->offset = pos - d->last_pos; d->last_pos = pos; - bgzf_write(d->fp, &g2, sizeof(glf3_t)); + glf3_write1(d->fp, g3); r = bam_maqindel(n, pos, d->ido, pu, d->ref); if (r) { // then write indel line - int g3 = 3 * n, min; - min = g3; + int het = 3 * n, min; + min = het; if (min > r->gl[0]) min = r->gl[0]; if (min > r->gl[1]) min = r->gl[1]; - g2.ref_base = 0; - g2.type = GLF_TYPE_INDEL; - memset(g2.lk, 0, 10); - g2.lk[0] = r->gl[0] - min < 255? r->gl[0] - min : 255; - g2.lk[1] = r->gl[1] - min < 255? r->gl[1] - min : 255; - g2.lk[2] = g3 - min < 255? g3 - min : 255; - *(int16_t*)(g2.lk + 3) = (int16_t)r->indel1; - *(int16_t*)(g2.lk + 5) = (int16_t)r->indel2; - g2.min_lk = min < 255? min : 255; - bgzf_write(d->fp, &g2, sizeof(glf3_t)); - if (r->indel1) bgzf_write(d->fp, r->s[0]+1, r->indel1>0? r->indel1 : -r->indel1); - if (r->indel2) bgzf_write(d->fp, r->s[1]+1, r->indel2>0? r->indel2 : -r->indel2); + g3->ref_base = 0; + g3->rtype = GLF3_RTYPE_INDEL; + memset(g3->lk, 0, 10); + g3->lk[0] = r->gl[0] - min < 255? r->gl[0] - min : 255; + g3->lk[1] = r->gl[1] - min < 255? r->gl[1] - min : 255; + g3->lk[2] = het - min < 255? het - min : 255; + g3->offset = 0; + g3->indel_len[0] = r->indel1; + g3->indel_len[1] = r->indel2; + g3->min_lk = min < 255? min : 255; + g3->max_len = (abs(r->indel1) > abs(r->indel2)? abs(r->indel1) : abs(r->indel2)) + 1; + g3->indel_seq[0] = strdup(r->s[0]+1); + g3->indel_seq[1] = strdup(r->s[1]+1); + glf3_write1(d->fp, g3); bam_maqindel_ret_destroy(r); } free(g); + glf3_destroy1(g3); return 0; } @@ -113,7 +117,7 @@ static int pileup_func(uint32_t tid, uint32_t pos, int n, const bam_pileup1_t *p int i, j, rb, max_mapq = 0; uint32_t x; if (d->hash && kh_get(64, d->hash, (uint64_t)tid<<32|pos) == kh_end(d->hash)) return 0; - if (d->format & BAM_PLF_GLF) return glt_func(tid, pos, n, pu, data); + if (d->format & BAM_PLF_GLF) return glt3_func(tid, pos, n, pu, data); if (d->fai && (int)tid != d->tid) { free(d->ref); d->ref = fai_fetch(d->fai, d->h->target_name[tid], &d->len); @@ -247,11 +251,11 @@ int bam_pileup(int argc, char *argv[]) free(fn_fa); bam_maqcns_prepare(d->c); if (d->format & BAM_PLF_GLF) { - glf_header_t *h; - h = glf_header_init(); + glf3_header_t *h; + h = glf3_header_init(); d->fp = bgzf_fdopen(fileno(stdout), "w"); - glf_header_write(d->fp, h); - glf_header_destroy(h); + glf3_header_write(d->fp, h); + glf3_header_destroy(h); } if (fn_list) { tamFile fp; diff --git a/bamtk.c b/bamtk.c index 1025bef..2fe679b 100644 --- a/bamtk.c +++ b/bamtk.c @@ -3,7 +3,7 @@ #include "bam.h" #ifndef PACKAGE_VERSION -#define PACKAGE_VERSION "0.1.2-13" +#define PACKAGE_VERSION "0.1.2-14" #endif int bam_taf2baf(int argc, char *argv[]); @@ -17,7 +17,7 @@ int bam_rmdup(int argc, char *argv[]); int bam_flagstat(int argc, char *argv[]); int faidx_main(int argc, char *argv[]); -int glf_view_main(int argc, char *argv[]); +int glf3_view_main(int argc, char *argv[]); static int view_aux(const bam1_t *b, void *data) { @@ -125,7 +125,7 @@ int main(int argc, char *argv[]) else if (strcmp(argv[1], "faidx") == 0) return faidx_main(argc-1, argv+1); else if (strcmp(argv[1], "fixmate") == 0) return bam_mating(argc-1, argv+1); else if (strcmp(argv[1], "rmdup") == 0) return bam_rmdup(argc-1, argv+1); - else if (strcmp(argv[1], "glfview") == 0) return glf_view_main(argc-1, argv+1); + else if (strcmp(argv[1], "glfview") == 0) return glf3_view_main(argc-1, argv+1); else if (strcmp(argv[1], "flagstat") == 0) return bam_flagstat(argc-1, argv+1); #ifndef _NO_CURSES else if (strcmp(argv[1], "tview") == 0) return bam_tview_main(argc-1, argv+1); diff --git a/glf.c b/glf.c index 731e872..4a6c667 100644 --- a/glf.c +++ b/glf.c @@ -6,32 +6,43 @@ // then alias bgzf_*() functions #endif +static int glf3_is_BE = 0; + +static inline uint32_t bam_swap_endian_4(uint32_t v) +{ + v = ((v & 0x0000FFFFU) << 16) | (v >> 16); + return ((v & 0x00FF00FFU) << 8) | ((v & 0xFF00FF00U) >> 8); +} + +static inline uint16_t bam_swap_endian_2(uint16_t v) +{ + return (uint16_t)(((v & 0x00FF00FFU) << 8) | ((v & 0xFF00FF00U) >> 8)); +} + static inline int bam_is_big_endian() { long one= 1; return !(*((char *)(&one))); } -glf_header_t *glf_header_init() +glf3_header_t *glf3_header_init() { - return (glf_header_t*)calloc(1, sizeof(glf_header_t)); + glf3_is_BE = bam_is_big_endian(); + return (glf3_header_t*)calloc(1, sizeof(glf3_header_t)); } -glf_header_t *glf_header_read(glfFile fp) +glf3_header_t *glf3_header_read(glfFile fp) { - glf_header_t *h; + glf3_header_t *h; char magic[4]; - if (bam_is_big_endian()) { - fprintf(stderr, "[glf_header_read] Big endian is detected. Abort.\n"); - exit(1); - } - h = (glf_header_t*)calloc(1, sizeof(glf_header_t)); + h = glf3_header_init(); bgzf_read(fp, magic, 4); if (strncmp(magic, "GLF\3", 4)) { - fprintf(stderr, "[glf_header_read] invalid magic. Abort.\n"); + fprintf(stderr, "[glf3_header_read] invalid magic. Abort.\n"); exit(1); } bgzf_read(fp, &h->l_text, 4); + if (glf3_is_BE) h->l_text = bam_swap_endian_4(h->l_text); if (h->l_text) { h->text = (uint8_t*)calloc(h->l_text + 1, 1); bgzf_read(fp, h->text, h->l_text); @@ -39,101 +50,167 @@ glf_header_t *glf_header_read(glfFile fp) return h; } -void glf_header_write(glfFile fp, const glf_header_t *h) +void glf3_header_write(glfFile fp, const glf3_header_t *h) { + int32_t x; bgzf_write(fp, "GLF\3", 4); - bgzf_write(fp, &h->l_text, 4); + x = glf3_is_BE? bam_swap_endian_4(h->l_text) : h->l_text; + bgzf_write(fp, &x, 4); if (h->l_text) bgzf_write(fp, h->text, h->l_text); } -void glf_header_destroy(glf_header_t *h) +void glf3_header_destroy(glf3_header_t *h) { free(h->text); free(h); } -char *glf_ref_read(glfFile fp) +char *glf3_ref_read(glfFile fp, int *len) { - int32_t n; + int32_t n, x; char *str; + *len = 0; if (bgzf_read(fp, &n, 4) != 4) return 0; + if (glf3_is_BE) n = bam_swap_endian_4(n); if (n < 0) { - fprintf(stderr, "[glf_ref_read] invalid reference name length: %d.\n", n); + fprintf(stderr, "[glf3_ref_read] invalid reference name length: %d.\n", n); return 0; } str = (char*)calloc(n + 1, 1); // not necesarily n+1 in fact - bgzf_read(fp, str, n); + x = bgzf_read(fp, str, n); + x += bgzf_read(fp, len, 4); + if (x != n + 4) { + free(str); *len = -1; return 0; // truncated + } + if (glf3_is_BE) *len = bam_swap_endian_4(*len); return str; } -void glf_ref_write(glfFile fp, const char *str) +void glf3_ref_write(glfFile fp, const char *str, int len) { - int32_t n = strlen(str); - ++n; - bgzf_write(fp, &n, 4); + int32_t m, n = strlen(str) + 1; + m = glf3_is_BE? bam_swap_endian_4(n) : n; + bgzf_write(fp, &m, 4); bgzf_write(fp, str, n); + if (glf3_is_BE) len = bam_swap_endian_4(len); + bgzf_write(fp, &len, 4); } -void glf_view_normal(const char *ref_name, glf3_t *g1, int pos) +void glf3_view1(const char *ref_name, const glf3_t *g3, int pos) { int j; - printf("%s\t%d\t%c\t%d\t%d\t%d", ref_name, pos + 1, "XACMGRSVTWYHKDBN"[g1->ref_base], - g1->depth, g1->max_mapQ, g1->min_lk); - for (j = 0; j != 10; ++j) printf("\t%d", g1->lk[j]); + if (g3->rtype == GLF3_RTYPE_END) return; + printf("%s\t%d\t%c\t%d\t%d\t%d", ref_name, pos + 1, + g3->rtype == GLF3_RTYPE_INDEL? '*' : "XACMGRSVTWYHKDBN"[g3->ref_base], + g3->depth, g3->rms_mapQ, g3->min_lk); + if (g3->rtype == GLF3_RTYPE_SUB) + for (j = 0; j != 10; ++j) printf("\t%d", g3->lk[j]); + else { + printf("\t%d\t%d\t%d\t%d\t%d\t%s\t%s\t", g3->lk[0], g3->lk[1], g3->lk[2], g3->indel_len[0], g3->indel_len[1], + g3->indel_len[0]? g3->indel_seq[0] : "*", g3->indel_len[1]? g3->indel_seq[1] : "*"); + } printf("\n"); } -static char *glf_read_indel(glfFile fp, char *str, int *max, int16_t indel) +int glf3_write1(glfFile fp, const glf3_t *g3) { - int l = indel > 0? indel : -indel; - if (l + 1 > *max) { - *max = l + 1; - str = (char*)realloc(str, *max); + int r; + uint8_t c; + uint32_t y[2]; + c = g3->rtype<<4 | g3->ref_base; + r = bgzf_write(fp, &c, 1); + if (g3->rtype == GLF3_RTYPE_END) return r; + y[0] = g3->offset; + y[1] = g3->min_lk<<24 | g3->depth; + if (glf3_is_BE) { + y[0] = bam_swap_endian_4(y[0]); + y[1] = bam_swap_endian_4(y[1]); } - bgzf_read(fp, str, l); - str[l] = 0; - return str; + r += bgzf_write(fp, y, 8); + r += bgzf_write(fp, &g3->rms_mapQ, 1); + if (g3->rtype == GLF3_RTYPE_SUB) r += bgzf_write(fp, g3->lk, 10); + else { + int16_t x[2]; + r += bgzf_write(fp, g3->lk, 3); + x[0] = glf3_is_BE? bam_swap_endian_2(g3->indel_len[0]) : g3->indel_len[0]; + x[1] = glf3_is_BE? bam_swap_endian_2(g3->indel_len[1]) : g3->indel_len[1]; + r += bgzf_write(fp, x, 4); + if (g3->indel_len[0]) r += bgzf_write(fp, g3->indel_seq[0], abs(g3->indel_len[0])); + if (g3->indel_len[1]) r += bgzf_write(fp, g3->indel_seq[1], abs(g3->indel_len[1])); + } + return r; +} + +#ifndef kv_roundup32 +#define kv_roundup32(x) (--(x), (x)|=(x)>>1, (x)|=(x)>>2, (x)|=(x)>>4, (x)|=(x)>>8, (x)|=(x)>>16, ++(x)) +#endif + +int glf3_read1(glfFile fp, glf3_t *g3) +{ + int r; + uint8_t c; + uint32_t y[2]; + r = bgzf_read(fp, &c, 1); + if (r == 0) return 0; + g3->ref_base = c & 0xf; + g3->rtype = c>>4; + if (g3->rtype == GLF3_RTYPE_END) return r; + r += bgzf_read(fp, y, 8); + if (glf3_is_BE) { + y[0] = bam_swap_endian_4(y[0]); + y[1] = bam_swap_endian_4(y[1]); + } + g3->offset = y[0]; + g3->min_lk = y[1]>>24; + g3->depth = y[1]<<8>>8; + r += bgzf_read(fp, &g3->rms_mapQ, 1); + if (g3->rtype == GLF3_RTYPE_SUB) r += bgzf_read(fp, g3->lk, 10); + else { + int16_t x[2], max; + r += bgzf_read(fp, g3->lk, 3); + r += bgzf_read(fp, x, 4); + if (glf3_is_BE) { + x[0] = bam_swap_endian_2(x[0]); + x[1] = bam_swap_endian_2(x[1]); + } + g3->indel_len[0] = x[0]; + g3->indel_len[1] = x[1]; + x[0] = abs(x[0]); x[1] = abs(x[1]); + max = (x[0] > x[1]? x[0] : x[1]) + 1; + if (g3->max_len < max) { + g3->max_len = max; + kv_roundup32(g3->max_len); + g3->indel_seq[0] = (char*)realloc(g3->indel_seq[0], g3->max_len); + g3->indel_seq[1] = (char*)realloc(g3->indel_seq[1], g3->max_len); + } + r += bgzf_read(fp, g3->indel_seq[0], x[0]); + r += bgzf_read(fp, g3->indel_seq[1], x[1]); + g3->indel_seq[0][x[0]] = g3->indel_seq[1][x[1]] = 0; + } + return r; } -void glf_view(glfFile fp) +void glf3_view(glfFile fp) { - glf_header_t *h; - char *name, *str; - glf3_t g2; - int max; - - h = glf_header_read(fp); - str = 0; max = 0; - while ((name = glf_ref_read(fp)) != 0) { + glf3_header_t *h; + char *name; + glf3_t *g3; + int len; + h = glf3_header_read(fp); + g3 = glf3_init1(); + while ((name = glf3_ref_read(fp, &len)) != 0) { int pos = 0; - while (bgzf_read(fp, &g2, sizeof(glf3_t))) { - if (g2.type == GLF_TYPE_END) break; - pos += g2.offset; - if (g2.type == GLF_TYPE_NORMAL) glf_view_normal(name, &g2, pos); - else if (g2.type == GLF_TYPE_INDEL) { - int16_t indel1, indel2; - printf("%s\t%d\t*\t%d\t%d\t%d\t", name, pos + 1, g2.depth, g2.max_mapQ, g2.min_lk); - printf("%d\t%d\t%d\t", g2.lk[0], g2.lk[1], g2.lk[2]); - indel1 = *(int16_t*)(g2.lk + 3); - indel2 = *(int16_t*)(g2.lk + 5); - printf("%d\t%d\t", indel1, indel2); - if (indel1) { - str = glf_read_indel(fp, str, &max, indel1); - printf("%c%d%s\t", indel1>0? '+':'-', indel1>0?indel1:-indel1, str); - } else printf("*\t"); - if (indel2) { - str = glf_read_indel(fp, str, &max, indel2); - printf("%c%d%s\n", indel2>0? '+':'-', indel2>0?indel2:-indel2, str); - } else printf("*\n"); - } + while (glf3_read1(fp, g3) && g3->rtype != GLF3_RTYPE_END) { + pos += g3->offset; + glf3_view1(name, g3, pos); } free(name); } - glf_header_destroy(h); - free(str); + glf3_header_destroy(h); + glf3_destroy1(g3); } -int glf_view_main(int argc, char *argv[]) +int glf3_view_main(int argc, char *argv[]) { glfFile fp; if (argc == 1) { @@ -145,7 +222,7 @@ int glf_view_main(int argc, char *argv[]) fprintf(stderr, "Fail to open file '%s'\n", argv[1]); return 1; } - glf_view(fp); + glf3_view(fp); bgzf_close(fp); return 0; } @@ -153,6 +230,6 @@ int glf_view_main(int argc, char *argv[]) #ifdef GLFVIEW_MAIN int main(int argc, char *argv[]) { - return glf_view_main(argc, argv); + return glf3_view_main(argc, argv); } #endif diff --git a/glf.h b/glf.h index 99acd3a..12e5400 100644 --- a/glf.h +++ b/glf.h @@ -12,33 +12,42 @@ typedef struct { #include "bgzf.h" typedef BGZF *glfFile; -#define GLF_TYPE_NORMAL 0 -#define GLF_TYPE_INDEL 1 -#define GLF_TYPE_END 15 +#define GLF3_RTYPE_END 0 +#define GLF3_RTYPE_SUB 1 +#define GLF3_RTYPE_INDEL 2 typedef struct { - unsigned char ref_base:4, type:4; /** "XACMGRSVTWYHKDBN"[ref_base] gives the reference base */ - unsigned char max_mapQ; /** maximum mapping quality */ - unsigned char lk[10]; /** log likelihood ratio, capped at 255 */ - unsigned min_lk:8, depth:24; /** minimum lk capped at 255, and the number of mapped reads */ - unsigned offset; /** the first base in a chromosome has offset zero. */ + uint8_t ref_base:4, rtype:4; /** "XACMGRSVTWYHKDBN"[ref_base] gives the reference base */ + uint8_t rms_mapQ; /** RMS mapping quality */ + uint8_t lk[10]; /** log likelihood ratio, capped at 255 */ + uint32_t min_lk:8, depth:24; /** minimum lk capped at 255, and the number of mapped reads */ + int32_t offset; /** the first base in a chromosome has offset zero. */ + // for indel (lkHom1, lkHom2 and lkHet are the first three elements in lk[10]) + int16_t indel_len[2]; + int32_t max_len; // maximum indel len; will be modified by glf3_read1() + char *indel_seq[2]; } glf3_t; typedef struct { int32_t l_text; uint8_t *text; -} glf_header_t; +} glf3_header_t; #ifdef __cplusplus extern "C" { #endif - glf_header_t *glf_header_init(); - glf_header_t *glf_header_read(glfFile fp); - void glf_header_write(glfFile fp, const glf_header_t *h); - void glf_header_destroy(glf_header_t *h); - char *glf_ref_read(glfFile fp); - void glf_ref_write(glfFile fp, const char *str); +#define glf3_init1() ((glf3_t*)calloc(1, sizeof(glf3_t))) +#define glf3_destroy1(g3) do { free((g3)->indel_seq[0]); free((g3)->indel_seq[1]); free(g3); } while (0) + + glf3_header_t *glf3_header_init(); + glf3_header_t *glf3_header_read(glfFile fp); + void glf3_header_write(glfFile fp, const glf3_header_t *h); + void glf3_header_destroy(glf3_header_t *h); + char *glf3_ref_read(glfFile fp, int *len); + void glf3_ref_write(glfFile fp, const char *name, int len); + int glf3_write1(glfFile fp, const glf3_t *g3); + int glf3_read1(glfFile fp, glf3_t *g3); #ifdef __cplusplus } diff --git a/misc/Makefile b/misc/Makefile index 792cbbb..0f65881 100644 --- a/misc/Makefile +++ b/misc/Makefile @@ -4,7 +4,7 @@ CFLAGS= -g -Wall -O2 -m64 #-arch ppc CXXFLAGS= $(CFLAGS) DFLAGS= #-D_FILE_OFFSET_BITS=64 OBJS= -PROG= faidx md5sum-lite md5fa maq2sam-short maq2sam-long wgsim fa2csfa +PROG= faidx md5sum-lite md5fa maq2sam-short maq2sam-long wgsim INCLUDES= -I.. LIBS= -lm -lz SUBDIRS= . @@ -34,9 +34,6 @@ wgsim:wgsim.o faidx:../faidx.c ../faidx.h $(CC) $(CFLAGS) -D_NO_RAZF -DFAIDX_MAIN -o $@ ../faidx.c -fa2csfa:fa2csfa.o - $(CC) $(CFLAGS) -o $@ fa2csfa.o -lz - md5fa:md5.o md5fa.o md5.h ../kseq.h $(CC) $(CFLAGS) -o $@ md5.o md5fa.o -lz @@ -46,8 +43,6 @@ md5sum-lite:md5sum-lite.o md5sum-lite.o:md5.c md5.h $(CC) -c $(CFLAGS) -DMD5SUM_MAIN -o $@ md5.c -fa2csfa:fa2csfa.c ../kseq.h - maq2sam-short:maq2sam.c $(CC) $(CFLAGS) -o $@ maq2sam.c -lz