From 63e4e54189bc9e7f43852edf865f95e713b93dc4 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Fri, 24 Apr 2009 15:40:18 +0000 Subject: [PATCH] * samtools-0.1.3-3 (r240) * generate MD tag * generate "=" bases * the plain pileup now support "=" bases, but consensus calling and glfgen may fail --- Makefile | 3 +- Makefile.lite | 3 +- bam.h | 1 + bam_aux.c | 15 ++++++++ bam_md.c | 95 +++++++++++++++++++++++++++++++++++++++++++++++++++ bam_plcmd.c | 2 +- bamtk.c | 5 ++- 7 files changed, 120 insertions(+), 4 deletions(-) create mode 100644 bam_md.c diff --git a/Makefile b/Makefile index d4a4101..25897db 100644 --- a/Makefile +++ b/Makefile @@ -5,7 +5,7 @@ CXXFLAGS= $(CFLAGS) DFLAGS= -D_IOLIB=2 -D_FILE_OFFSET_BITS=64 #-D_NO_RAZF #-D_NO_CURSES OBJS= bam.o bam_import.o bam_pileup.o bam_lpileup.o bam_sort.o bam_index.o \ razf.o bgzf.o faidx.o bam_tview.o bam_maqcns.o bam_aux.o bam_plcmd.o \ - bam_mate.o bam_rmdup.o glf.o bam_stat.o kstring.o + bam_mate.o bam_rmdup.o glf.o bam_stat.o kstring.o bam_md.o PROG= razip bgzip samtools INCLUDES= LIBS= -lm -lz @@ -52,6 +52,7 @@ bam_lpileup.o:bam.h ksort.h bam_tview.o:bam.h faidx.h bam_maqcns.h bam_maqcns.o:bam.h ksort.h bam_maqcns.h bam_sort.o:bam.h ksort.h razf.h +bam_md.o:bam.h faidx.h razf.o:razf.h glf.o:glf.h diff --git a/Makefile.lite b/Makefile.lite index ac2e8de..5ddb284 100644 --- a/Makefile.lite +++ b/Makefile.lite @@ -5,7 +5,7 @@ CXXFLAGS= $(CFLAGS) DFLAGS= -D_IOLIB=2 -D_FILE_OFFSET_BITS=64 -D_NO_CURSES -D_NO_RAZF OBJS= bam.o bam_import.o bam_pileup.o bam_lpileup.o bam_sort.o bam_index.o \ bgzf.o faidx.o bam_tview.o bam_maqcns.o bam_aux.o bam_plcmd.o \ - bam_mate.o bam_rmdup.o glf.o bam_stat.o kstring.o + bam_mate.o bam_rmdup.o glf.o bam_stat.o kstring.o bam_md.o PROG= samtools INCLUDES= LIBS= -lm -lz @@ -52,6 +52,7 @@ bam_lpileup.o:bam.h ksort.h bam_tview.o:bam.h faidx.h bam_maqcns.h bam_maqcns.o:bam.h ksort.h bam_maqcns.h bam_sort.o:bam.h ksort.h razf.h +bam_md.o:bam.h faidx.h razf.o:razf.h faidx.o:faidx.h razf.h khash.h diff --git a/bam.h b/bam.h index c0bde4b..92e7a36 100644 --- a/bam.h +++ b/bam.h @@ -600,6 +600,7 @@ extern "C" { */ void bam_parse_region(bam_header_t *header, const char *str, int *ref_id, int *begin, int *end); + void bam_aux_append(bam1_t *b, const char tag[2], char type, int len, uint8_t *data); uint8_t *bam_aux_get(bam1_t *b, const char tag[2]); int32_t bam_aux2i(const uint8_t *s); float bam_aux2f(const uint8_t *s); diff --git a/bam_aux.c b/bam_aux.c index f9be398..d6f18ea 100644 --- a/bam_aux.c +++ b/bam_aux.c @@ -3,6 +3,21 @@ #include "khash.h" KHASH_MAP_INIT_STR(s, int) +void bam_aux_append(bam1_t *b, const char tag[2], char type, int len, uint8_t *data) +{ + int ori_len = b->data_len; + b->data_len += 3 + len; + b->l_aux += 3 + len; + if (b->m_data < b->data_len) { + b->m_data = b->data_len; + kroundup32(b->m_data); + b->data = (uint8_t*)realloc(b->data, b->m_data); + } + b->data[ori_len] = tag[0]; b->data[ori_len + 1] = tag[1]; + b->data[ori_len + 2] = type; + memcpy(b->data + ori_len + 3, data, len); +} + uint8_t *bam_aux_get(bam1_t *b, const char tag[2]) { uint8_t *s; diff --git a/bam_md.c b/bam_md.c new file mode 100644 index 0000000..a998f1b --- /dev/null +++ b/bam_md.c @@ -0,0 +1,95 @@ +#include +#include "faidx.h" +#include "bam.h" +#include "kstring.h" + +void bam_fillmd1(bam1_t *b, char *ref, int is_equal) +{ + uint8_t *seq = bam1_seq(b); + uint32_t *cigar = bam1_cigar(b); + bam1_core_t *c = &b->core; + int i, x, y, u = 0, has_md = 0; + kstring_t *str; + + if (bam_aux_get(b, "MD")) has_md = 1; + if (has_md == 1 && !is_equal) return; // no need to add MD + str = (kstring_t*)calloc(1, sizeof(kstring_t)); + if (c->flag & BAM_FUNMAP) return; + for (i = y = 0, x = c->pos; i < c->n_cigar; ++i) { + int j, l = cigar[i]>>4, op = cigar[i]&0xf; + if (op == BAM_CMATCH) { + for (j = 0; j < l; ++j) { + int z = y + j; + int c1 = bam1_seqi(seq, z), c2 = bam_nt16_table[(int)ref[x+j]]; + if (ref[x+j] == 0) break; // out of boundary + if ((c1 == c2 && c1 != 15 && c2 != 15) || c1 == 0) { + if (is_equal) seq[z/2] &= (z&1)? 0xf0 : 0x0f; + ++u; + } else { + ksprintf(str, "%d", u); + kputc(ref[x+j], str); + u = 0; + } + } + if (j < l) break; + x += l; y += l; + } else if (op == BAM_CDEL) { + ksprintf(str, "%d", u); + kputc('^', str); + for (j = 0; j < l; ++j) { + if (ref[x+j] == 0) break; + kputc(ref[x+j], str); + } + if (j < l) break; + x += l; + } else if (op == BAM_CINS || op == BAM_CSOFT_CLIP) { + y += l; + } else if (op == BAM_CREF_SKIP) { + x += l; + } + } + ksprintf(str, "%d", u); + if (!has_md) bam_aux_append(b, "MD", 'Z', str->l + 1, (uint8_t*)str->s); + free(str->s); free(str); +} + +int bam_fillmd(int argc, char *argv[]) +{ + int c, is_equal = 0, tid = -1, ret, len; + bamFile fp, fpout = 0; + bam_header_t *header; + faidx_t *fai; + char *ref = 0; + bam1_t *b; + + while ((c = getopt(argc, argv, "e")) >= 0) { + switch (c) { + case 'e': is_equal = 1; break; + default: fprintf(stderr, "[bam_fillmd] unrecognized option '-%c'\n", c); return 1; + } + } + if (optind + 1 >= argc) { + fprintf(stderr, "Usage: bam fillmd [-e] \n"); + return 1; + } + fp = strcmp(argv[optind], "-")? bam_open(argv[optind], "r") : bam_dopen(fileno(stdin), "r"); + assert(fp); + header = bam_header_read(fp); + fpout = bam_dopen(fileno(stdout), "w"); + bam_header_write(fpout, header); + fai = fai_load(argv[optind+1]); + + b = bam_init1(); + while ((ret = bam_read1(fp, b)) >= 0) { + if (tid != b->core.tid) + ref = fai_fetch(fai, header->target_name[b->core.tid], &len); + bam_fillmd1(b, ref, is_equal); + bam_write1(fpout, b); + } + bam_destroy1(b); + + fai_destroy(fai); + bam_header_destroy(header); + bam_close(fp); bam_close(fpout); + return 0; +} diff --git a/bam_plcmd.c b/bam_plcmd.c index 38680df..b1ffdf9 100644 --- a/bam_plcmd.c +++ b/bam_plcmd.c @@ -195,7 +195,7 @@ static int pileup_func(uint32_t tid, uint32_t pos, int n, const bam_pileup1_t *p if (p->is_head) printf("^%c", p->b->core.qual > 93? 126 : p->b->core.qual + 33); if (!p->is_del) { int c = bam_nt16_rev_table[bam1_seqi(bam1_seq(p->b), p->qpos)]; - if (toupper(c) == toupper(rb)) c = bam1_strand(p->b)? ',' : '.'; + if (c == '=' || toupper(c) == toupper(rb)) c = bam1_strand(p->b)? ',' : '.'; else c = bam1_strand(p->b)? tolower(c) : toupper(c); putchar(c); if (p->indel > 0) { diff --git a/bamtk.c b/bamtk.c index f12f900..a9daa52 100644 --- a/bamtk.c +++ b/bamtk.c @@ -3,7 +3,7 @@ #include "bam.h" #ifndef PACKAGE_VERSION -#define PACKAGE_VERSION "0.1.3-2" +#define PACKAGE_VERSION "0.1.3-3" #endif int bam_taf2baf(int argc, char *argv[]); @@ -15,6 +15,7 @@ int bam_tview_main(int argc, char *argv[]); int bam_mating(int argc, char *argv[]); int bam_rmdup(int argc, char *argv[]); int bam_flagstat(int argc, char *argv[]); +int bam_fillmd(int argc, char *argv[]); int faidx_main(int argc, char *argv[]); int glf3_view_main(int argc, char *argv[]); @@ -161,6 +162,7 @@ static int usage() fprintf(stderr, " rmdup remove PCR duplicates\n"); fprintf(stderr, " glfview print GLFv2 file\n"); fprintf(stderr, " flagstat simple stats\n"); + fprintf(stderr, " fillmd fill the MD tag and change identical base to =\n"); fprintf(stderr, "\n"); return 1; } @@ -180,6 +182,7 @@ int main(int argc, char *argv[]) else if (strcmp(argv[1], "glfview") == 0) return glf3_view_main(argc-1, argv+1); else if (strcmp(argv[1], "flagstat") == 0) return bam_flagstat(argc-1, argv+1); else if (strcmp(argv[1], "tagview") == 0) return bam_tagview(argc-1, argv+1); + else if (strcmp(argv[1], "fillmd") == 0) return bam_fillmd(argc-1, argv+1); #ifndef _NO_CURSES else if (strcmp(argv[1], "tview") == 0) return bam_tview_main(argc-1, argv+1); #endif -- 2.39.2