]> git.donarmstrong.com Git - samtools.git/commitdiff
* samtools-0.1.3-17 (r271)
authorHeng Li <lh3@live.co.uk>
Tue, 12 May 2009 13:17:58 +0000 (13:17 +0000)
committerHeng Li <lh3@live.co.uk>
Tue, 12 May 2009 13:17:58 +0000 (13:17 +0000)
 * added 'rmdupse' command

Makefile.am
Makefile.generic
Makefile.lite
bam_rmdupse.c [new file with mode: 0644]
bamtk.c
configure.ac

index 91c6a564ed13d342263c48e5a21193ba93502334..b55403debc288e65d49f507720d96d2df39321c8 100644 (file)
@@ -7,7 +7,7 @@ bgzip_LDADD = -lm -lz
 
 samtools_SOURCES = bamtk.c bam.c bam_aux.c bam_import.c bam_index.c \
                                   bam_lpileup.c bam_maqcns.c bam_mate.c bam_md.c bam_pileup.c \
-                                  bam_plcmd.c bam_rmdup.c bam_sort.c bam_stat.c \
+                                  bam_plcmd.c bam_rmdup.c bam_sort.c bam_stat.c bam_rmdupse.c \
                                   bam_tview.c bgzf.c faidx.c glf.c kstring.c razf.c sam.c sam_view.c \
                                   bam.h bam_endian.h bam_maqcns.h bgzf.h faidx.h glf.h \
                                   khash.h kseq.h kstring.h ksort.h razf.h zutil.h sam.h
index 1d8e7a57e51893a0335b6b6fe42e835b31d9d79d..6bf4227753c1e29844701fa897d631fa6f3b39d8 100644 (file)
@@ -1,11 +1,12 @@
 CC=                    gcc
 CXX=           g++
-CFLAGS=                -g -Wall -O2 -m64 #-arch ppc
+CFLAGS=                -g -Wall #-O2 -m64 #-arch ppc
 CXXFLAGS=      $(CFLAGS)
 DFLAGS=                -D_IOLIB=2 -D_LARGEFILE_SOURCE -D_FILE_OFFSET_BITS=64 #-D_NO_RAZF #-D_NO_CURSES
 OBJS=          bam.o bam_import.o bam_pileup.o bam_lpileup.o bam_sort.o bam_index.o \
                        razf.o bgzf.o faidx.o bam_tview.o bam_maqcns.o bam_aux.o bam_plcmd.o \
-                       bam_mate.o bam_rmdup.o glf.o bam_stat.o kstring.o bam_md.o sam.o sam_view.o
+                       bam_mate.o bam_rmdup.o glf.o bam_stat.o kstring.o bam_md.o sam.o sam_view.o \
+                       bam_rmdupse.o
 PROG=          razip bgzip samtools
 INCLUDES=      
 LIBS=          -lm -lz
index 229565f8e9d5c843d3f6aa69dc264faf36eb39b8..749b0a846b14a0463801241f99c82c7982630137 100644 (file)
@@ -5,7 +5,8 @@ CXXFLAGS=       $(CFLAGS)
 DFLAGS=                -D_IOLIB=2 -D_LARGEFILE_SOURCE -D_FILE_OFFSET_BITS=64 -D_NO_CURSES -D_NO_RAZF
 OBJS=          bam.o bam_import.o bam_pileup.o bam_lpileup.o bam_sort.o bam_index.o \
                        bgzf.o faidx.o bam_tview.o bam_maqcns.o bam_aux.o bam_plcmd.o \
-                       bam_mate.o bam_rmdup.o glf.o bam_stat.o kstring.o bam_md.o sam.o sam_view.o
+                       bam_mate.o bam_rmdup.o glf.o bam_stat.o kstring.o bam_md.o sam.o sam_view.o \
+                       bam_rmdupse.o
 PROG=          samtools
 INCLUDES=      
 LIBS=          -lm -lz
diff --git a/bam_rmdupse.c b/bam_rmdupse.c
new file mode 100644 (file)
index 0000000..ed0ef47
--- /dev/null
@@ -0,0 +1,177 @@
+#include <math.h>
+#include "sam.h"
+#include "khash.h"
+
+typedef struct {
+       int n, m;
+       int *a;
+} listelem_t;
+
+KHASH_MAP_INIT_INT(32, listelem_t)
+
+//#define BLOCK_SIZE 65536
+#define BLOCK_SIZE 100
+
+typedef struct {
+       bam1_t *b;
+       int rpos, score;
+} elem_t;
+
+typedef struct {
+       int n, max, x;
+       elem_t *buf;
+} buffer_t;
+
+static int fill_buf(samfile_t *in, buffer_t *buf)
+{
+       int i, ret, last_tid, min_rpos = 0x7fffffff, capacity;
+       bam1_t *b = bam_init1();
+       bam1_core_t *c = &b->core;
+       // squeeze out the empty cells at the beginning
+       for (i = 0; i < buf->n; ++i)
+               if (buf->buf[i].b) break;
+       if (i < buf->n) { // squeeze
+               if (i > 0) {
+                       memmove(buf->buf, buf->buf + i, sizeof(elem_t) * (buf->n - i));
+                       buf->n = buf->n - i;
+               }
+       } else buf->n = 0;
+       // calculate min_rpos
+       for (i = 0; i < buf->n; ++i) {
+               elem_t *e = buf->buf + i;
+               if (e->b && e->rpos >= 0 && e->rpos < min_rpos)
+                       min_rpos = buf->buf[i].rpos;
+       }
+       // fill the buffer
+       buf->x = -1;
+       last_tid = buf->n? buf->buf[0].b->core.tid : -1;
+       capacity = buf->n + BLOCK_SIZE;
+       while ((ret = samread(in, b)) >= 0) {
+               elem_t *e;
+               uint8_t *qual = bam1_qual(b);
+               int is_mapped;
+               if (last_tid < 0) last_tid = c->tid;
+               if (c->tid != last_tid) {
+                       if (buf->x < 0) buf->x = buf->n;
+               }
+               if (buf->n >= buf->max) { // enlarge
+                       buf->max = buf->max? buf->max<<1 : 8;
+                       buf->buf = (elem_t*)realloc(buf->buf, sizeof(elem_t) * buf->max);
+               }
+               e = &buf->buf[buf->n++];
+               e->b = bam_dup1(b);
+               e->rpos = -1; e->score = 0;
+               for (i = 0; i < c->l_qseq; ++i) e->score += qual[i] + 1;
+               e->score = (double)e->score / sqrt(c->l_qseq + 1);
+               is_mapped = (c->tid < 0 || c->tid >= in->header->n_targets || (c->flag&BAM_FUNMAP))? 0 : 1;
+               if (is_mapped && (c->flag & BAM_FREVERSE)) {
+                       e->rpos = b->core.pos + bam_calend(&b->core, bam1_cigar(b));
+                       if (min_rpos > e->rpos) min_rpos = e->rpos;
+               }
+               if (buf->n >= capacity) {
+                       if (c->pos <= min_rpos) capacity += BLOCK_SIZE;
+                       else break;
+               }
+       }
+       if (ret >= 0 && buf->x < 0) buf->x = buf->n;
+       bam_destroy1(b);
+       return buf->n;
+}
+
+static void rmdupse_buf(buffer_t *buf)
+{
+       khash_t(32) *h;
+       uint32_t key;
+       khint_t k;
+       int mpos, i, upper;
+       listelem_t *p;
+       mpos = 0x7fffffff;
+       mpos = (buf->x == buf->n)? buf->buf[buf->x-1].b->core.pos : 0x7fffffff;
+       upper = (buf->x < 0)? buf->n : buf->x;
+       // fill the hash table
+       h = kh_init(32);
+       for (i = 0; i < upper; ++i) {
+               elem_t *e = buf->buf + i;
+               int ret;
+               if (e->score < 0) continue;
+               if (e->rpos >= 0) {
+                       if (e->rpos <= mpos) key = (uint32_t)e->rpos<<1 | 1;
+                       else continue;
+               } else {
+                       if (e->b->core.pos < mpos) key = (uint32_t)e->b->core.pos<<1;
+                       else continue;
+               }
+               k = kh_put(32, h, key, &ret);
+               p = &kh_val(h, k);
+               if (ret == 0) { // present in the hash table
+                       if (p->n == p->m) {
+                               p->m <<= 1;
+                               p->a = (int*)realloc(p->a, p->m * sizeof(int));
+                       }
+                       p->a[p->n++] = i;
+               } else {
+                       p->m = p->n = 1;
+                       p->a = (int*)calloc(p->m, sizeof(int));
+                       p->a[0] = i;
+               }
+       }
+       // rmdup
+       for (k = kh_begin(h); k < kh_end(h); ++k) {
+               if (kh_exist(h, k)) {
+                       int max, maxi;
+                       p = &kh_val(h, k);
+                       // get the max
+                       for (i = max = 0, maxi = -1; i < p->n; ++i) {
+                               if (buf->buf[p->a[i]].score > max) {
+                                       max = buf->buf[p->a[i]].score;
+                                       maxi = i;
+                               }
+                       }
+                       // mark the elements
+                       for (i = 0; i < p->n; ++i) {
+                               buf->buf[p->a[i]].score = -1;
+                               if (i != maxi) {
+                                       bam_destroy1(buf->buf[p->a[i]].b);
+                                       buf->buf[p->a[i]].b = 0;
+                               }
+                       }
+                       // free
+                       free(p->a);
+               }
+       }
+       kh_destroy(32, h);
+}
+
+static void dump_buf(buffer_t *buf, samfile_t *out)
+{
+       int i;
+       for (i = 0; i < buf->n; ++i) {
+               elem_t *e = buf->buf + i;
+               if (e->score != -1) break;
+               if (e->b) {
+                       samwrite(out, e->b);
+                       bam_destroy1(e->b);
+                       e->b = 0;
+               }
+       }
+}
+
+int bam_rmdupse(int argc, char *argv[])
+{
+       samfile_t *in, *out;
+       buffer_t *buf;
+       if (argc < 3) {
+               fprintf(stderr, "Usage: samtools rmdupse <in.bam> <out.bam>\n");
+               return 1;
+       }
+       buf = calloc(1, sizeof(buffer_t));
+       in = samopen(argv[1], "rb", 0);
+       out = samopen(argv[2], "wb", in->header);
+       while (fill_buf(in, buf)) {
+               rmdupse_buf(buf);
+               dump_buf(buf, out);
+       }
+       samclose(in); samclose(out);
+       free(buf->buf); free(buf);
+       return 0;
+}
diff --git a/bamtk.c b/bamtk.c
index 3a89dfbbe714ac3b986f5ffc5a39791febbc5265..6f030a654ecbc62fe5eb5ce81c1f333b8c2682a0 100644 (file)
--- a/bamtk.c
+++ b/bamtk.c
@@ -3,7 +3,7 @@
 #include "bam.h"
 
 #ifndef PACKAGE_VERSION
-#define PACKAGE_VERSION "0.1.3-16 (r267)"
+#define PACKAGE_VERSION "0.1.3-17 (r271)"
 #endif
 
 int bam_taf2baf(int argc, char *argv[]);
@@ -14,6 +14,7 @@ int bam_sort(int argc, char *argv[]);
 int bam_tview_main(int argc, char *argv[]);
 int bam_mating(int argc, char *argv[]);
 int bam_rmdup(int argc, char *argv[]);
+int bam_rmdupse(int argc, char *argv[]);
 int bam_flagstat(int argc, char *argv[]);
 int bam_fillmd(int argc, char *argv[]);
 
@@ -100,6 +101,7 @@ int main(int argc, char *argv[])
        else if (strcmp(argv[1], "faidx") == 0) return faidx_main(argc-1, argv+1);
        else if (strcmp(argv[1], "fixmate") == 0) return bam_mating(argc-1, argv+1);
        else if (strcmp(argv[1], "rmdup") == 0) return bam_rmdup(argc-1, argv+1);
+       else if (strcmp(argv[1], "rmdupse") == 0) return bam_rmdupse(argc-1, argv+1);
        else if (strcmp(argv[1], "glfview") == 0) return glf3_view_main(argc-1, argv+1);
        else if (strcmp(argv[1], "flagstat") == 0) return bam_flagstat(argc-1, argv+1);
        else if (strcmp(argv[1], "tagview") == 0) return bam_tagview(argc-1, argv+1);
index eace603adfaecfebf994f6f1de6243e873a4cd7d..bde5ed253bbec9e125046ce71621baa0b744d9ab 100644 (file)
@@ -1,4 +1,4 @@
-AC_INIT(SAMTOOLS, 0.1.3)
+AC_INIT(SAMTOOLS, 0.1.3-17)
 AM_CONFIG_HEADER(config.h)
 AM_INIT_AUTOMAKE([no-dependencies])
 AC_CANONICAL_HOST