From 2f8de7def998e2c804213aaf16cc42b3f4095167 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Tue, 12 May 2009 13:17:58 +0000 Subject: [PATCH] * samtools-0.1.3-17 (r271) * added 'rmdupse' command --- Makefile.am | 2 +- Makefile.generic | 5 +- Makefile.lite | 3 +- bam_rmdupse.c | 177 +++++++++++++++++++++++++++++++++++++++++++++++ bamtk.c | 4 +- configure.ac | 2 +- 6 files changed, 187 insertions(+), 6 deletions(-) create mode 100644 bam_rmdupse.c diff --git a/Makefile.am b/Makefile.am index 91c6a56..b55403d 100644 --- a/Makefile.am +++ b/Makefile.am @@ -7,7 +7,7 @@ bgzip_LDADD = -lm -lz samtools_SOURCES = bamtk.c bam.c bam_aux.c bam_import.c bam_index.c \ bam_lpileup.c bam_maqcns.c bam_mate.c bam_md.c bam_pileup.c \ - bam_plcmd.c bam_rmdup.c bam_sort.c bam_stat.c \ + bam_plcmd.c bam_rmdup.c bam_sort.c bam_stat.c bam_rmdupse.c \ bam_tview.c bgzf.c faidx.c glf.c kstring.c razf.c sam.c sam_view.c \ bam.h bam_endian.h bam_maqcns.h bgzf.h faidx.h glf.h \ khash.h kseq.h kstring.h ksort.h razf.h zutil.h sam.h diff --git a/Makefile.generic b/Makefile.generic index 1d8e7a5..6bf4227 100644 --- a/Makefile.generic +++ b/Makefile.generic @@ -1,11 +1,12 @@ CC= gcc CXX= g++ -CFLAGS= -g -Wall -O2 -m64 #-arch ppc +CFLAGS= -g -Wall #-O2 -m64 #-arch ppc CXXFLAGS= $(CFLAGS) DFLAGS= -D_IOLIB=2 -D_LARGEFILE_SOURCE -D_FILE_OFFSET_BITS=64 #-D_NO_RAZF #-D_NO_CURSES OBJS= bam.o bam_import.o bam_pileup.o bam_lpileup.o bam_sort.o bam_index.o \ razf.o bgzf.o faidx.o bam_tview.o bam_maqcns.o bam_aux.o bam_plcmd.o \ - bam_mate.o bam_rmdup.o glf.o bam_stat.o kstring.o bam_md.o sam.o sam_view.o + bam_mate.o bam_rmdup.o glf.o bam_stat.o kstring.o bam_md.o sam.o sam_view.o \ + bam_rmdupse.o PROG= razip bgzip samtools INCLUDES= LIBS= -lm -lz diff --git a/Makefile.lite b/Makefile.lite index 229565f..749b0a8 100644 --- a/Makefile.lite +++ b/Makefile.lite @@ -5,7 +5,8 @@ CXXFLAGS= $(CFLAGS) DFLAGS= -D_IOLIB=2 -D_LARGEFILE_SOURCE -D_FILE_OFFSET_BITS=64 -D_NO_CURSES -D_NO_RAZF OBJS= bam.o bam_import.o bam_pileup.o bam_lpileup.o bam_sort.o bam_index.o \ bgzf.o faidx.o bam_tview.o bam_maqcns.o bam_aux.o bam_plcmd.o \ - bam_mate.o bam_rmdup.o glf.o bam_stat.o kstring.o bam_md.o sam.o sam_view.o + bam_mate.o bam_rmdup.o glf.o bam_stat.o kstring.o bam_md.o sam.o sam_view.o \ + bam_rmdupse.o PROG= samtools INCLUDES= LIBS= -lm -lz diff --git a/bam_rmdupse.c b/bam_rmdupse.c new file mode 100644 index 0000000..ed0ef47 --- /dev/null +++ b/bam_rmdupse.c @@ -0,0 +1,177 @@ +#include +#include "sam.h" +#include "khash.h" + +typedef struct { + int n, m; + int *a; +} listelem_t; + +KHASH_MAP_INIT_INT(32, listelem_t) + +//#define BLOCK_SIZE 65536 +#define BLOCK_SIZE 100 + +typedef struct { + bam1_t *b; + int rpos, score; +} elem_t; + +typedef struct { + int n, max, x; + elem_t *buf; +} buffer_t; + +static int fill_buf(samfile_t *in, buffer_t *buf) +{ + int i, ret, last_tid, min_rpos = 0x7fffffff, capacity; + bam1_t *b = bam_init1(); + bam1_core_t *c = &b->core; + // squeeze out the empty cells at the beginning + for (i = 0; i < buf->n; ++i) + if (buf->buf[i].b) break; + if (i < buf->n) { // squeeze + if (i > 0) { + memmove(buf->buf, buf->buf + i, sizeof(elem_t) * (buf->n - i)); + buf->n = buf->n - i; + } + } else buf->n = 0; + // calculate min_rpos + for (i = 0; i < buf->n; ++i) { + elem_t *e = buf->buf + i; + if (e->b && e->rpos >= 0 && e->rpos < min_rpos) + min_rpos = buf->buf[i].rpos; + } + // fill the buffer + buf->x = -1; + last_tid = buf->n? buf->buf[0].b->core.tid : -1; + capacity = buf->n + BLOCK_SIZE; + while ((ret = samread(in, b)) >= 0) { + elem_t *e; + uint8_t *qual = bam1_qual(b); + int is_mapped; + if (last_tid < 0) last_tid = c->tid; + if (c->tid != last_tid) { + if (buf->x < 0) buf->x = buf->n; + } + if (buf->n >= buf->max) { // enlarge + buf->max = buf->max? buf->max<<1 : 8; + buf->buf = (elem_t*)realloc(buf->buf, sizeof(elem_t) * buf->max); + } + e = &buf->buf[buf->n++]; + e->b = bam_dup1(b); + e->rpos = -1; e->score = 0; + for (i = 0; i < c->l_qseq; ++i) e->score += qual[i] + 1; + e->score = (double)e->score / sqrt(c->l_qseq + 1); + is_mapped = (c->tid < 0 || c->tid >= in->header->n_targets || (c->flag&BAM_FUNMAP))? 0 : 1; + if (is_mapped && (c->flag & BAM_FREVERSE)) { + e->rpos = b->core.pos + bam_calend(&b->core, bam1_cigar(b)); + if (min_rpos > e->rpos) min_rpos = e->rpos; + } + if (buf->n >= capacity) { + if (c->pos <= min_rpos) capacity += BLOCK_SIZE; + else break; + } + } + if (ret >= 0 && buf->x < 0) buf->x = buf->n; + bam_destroy1(b); + return buf->n; +} + +static void rmdupse_buf(buffer_t *buf) +{ + khash_t(32) *h; + uint32_t key; + khint_t k; + int mpos, i, upper; + listelem_t *p; + mpos = 0x7fffffff; + mpos = (buf->x == buf->n)? buf->buf[buf->x-1].b->core.pos : 0x7fffffff; + upper = (buf->x < 0)? buf->n : buf->x; + // fill the hash table + h = kh_init(32); + for (i = 0; i < upper; ++i) { + elem_t *e = buf->buf + i; + int ret; + if (e->score < 0) continue; + if (e->rpos >= 0) { + if (e->rpos <= mpos) key = (uint32_t)e->rpos<<1 | 1; + else continue; + } else { + if (e->b->core.pos < mpos) key = (uint32_t)e->b->core.pos<<1; + else continue; + } + k = kh_put(32, h, key, &ret); + p = &kh_val(h, k); + if (ret == 0) { // present in the hash table + if (p->n == p->m) { + p->m <<= 1; + p->a = (int*)realloc(p->a, p->m * sizeof(int)); + } + p->a[p->n++] = i; + } else { + p->m = p->n = 1; + p->a = (int*)calloc(p->m, sizeof(int)); + p->a[0] = i; + } + } + // rmdup + for (k = kh_begin(h); k < kh_end(h); ++k) { + if (kh_exist(h, k)) { + int max, maxi; + p = &kh_val(h, k); + // get the max + for (i = max = 0, maxi = -1; i < p->n; ++i) { + if (buf->buf[p->a[i]].score > max) { + max = buf->buf[p->a[i]].score; + maxi = i; + } + } + // mark the elements + for (i = 0; i < p->n; ++i) { + buf->buf[p->a[i]].score = -1; + if (i != maxi) { + bam_destroy1(buf->buf[p->a[i]].b); + buf->buf[p->a[i]].b = 0; + } + } + // free + free(p->a); + } + } + kh_destroy(32, h); +} + +static void dump_buf(buffer_t *buf, samfile_t *out) +{ + int i; + for (i = 0; i < buf->n; ++i) { + elem_t *e = buf->buf + i; + if (e->score != -1) break; + if (e->b) { + samwrite(out, e->b); + bam_destroy1(e->b); + e->b = 0; + } + } +} + +int bam_rmdupse(int argc, char *argv[]) +{ + samfile_t *in, *out; + buffer_t *buf; + if (argc < 3) { + fprintf(stderr, "Usage: samtools rmdupse \n"); + return 1; + } + buf = calloc(1, sizeof(buffer_t)); + in = samopen(argv[1], "rb", 0); + out = samopen(argv[2], "wb", in->header); + while (fill_buf(in, buf)) { + rmdupse_buf(buf); + dump_buf(buf, out); + } + samclose(in); samclose(out); + free(buf->buf); free(buf); + return 0; +} diff --git a/bamtk.c b/bamtk.c index 3a89dfb..6f030a6 100644 --- a/bamtk.c +++ b/bamtk.c @@ -3,7 +3,7 @@ #include "bam.h" #ifndef PACKAGE_VERSION -#define PACKAGE_VERSION "0.1.3-16 (r267)" +#define PACKAGE_VERSION "0.1.3-17 (r271)" #endif int bam_taf2baf(int argc, char *argv[]); @@ -14,6 +14,7 @@ int bam_sort(int argc, char *argv[]); int bam_tview_main(int argc, char *argv[]); int bam_mating(int argc, char *argv[]); int bam_rmdup(int argc, char *argv[]); +int bam_rmdupse(int argc, char *argv[]); int bam_flagstat(int argc, char *argv[]); int bam_fillmd(int argc, char *argv[]); @@ -100,6 +101,7 @@ int main(int argc, char *argv[]) else if (strcmp(argv[1], "faidx") == 0) return faidx_main(argc-1, argv+1); else if (strcmp(argv[1], "fixmate") == 0) return bam_mating(argc-1, argv+1); else if (strcmp(argv[1], "rmdup") == 0) return bam_rmdup(argc-1, argv+1); + else if (strcmp(argv[1], "rmdupse") == 0) return bam_rmdupse(argc-1, argv+1); else if (strcmp(argv[1], "glfview") == 0) return glf3_view_main(argc-1, argv+1); else if (strcmp(argv[1], "flagstat") == 0) return bam_flagstat(argc-1, argv+1); else if (strcmp(argv[1], "tagview") == 0) return bam_tagview(argc-1, argv+1); diff --git a/configure.ac b/configure.ac index eace603..bde5ed2 100644 --- a/configure.ac +++ b/configure.ac @@ -1,4 +1,4 @@ -AC_INIT(SAMTOOLS, 0.1.3) +AC_INIT(SAMTOOLS, 0.1.3-17) AM_CONFIG_HEADER(config.h) AM_INIT_AUTOMAKE([no-dependencies]) AC_CANONICAL_HOST -- 2.39.5