]> git.donarmstrong.com Git - samtools.git/commitdiff
* samtools-0.1.6-15 (r482)
authorHeng Li <lh3@live.co.uk>
Mon, 19 Oct 2009 17:03:34 +0000 (17:03 +0000)
committerHeng Li <lh3@live.co.uk>
Mon, 19 Oct 2009 17:03:34 +0000 (17:03 +0000)
 * rewrite rmdupse
 * rmdupse is now library aware

bam.h
bam_rmdup.c
bam_rmdupse.c
bamtk.c
klist.h [new file with mode: 0644]

diff --git a/bam.h b/bam.h
index 48156888c94760cdab3910a12eb38a4cb2802908..a7e0f086b546b9e35d4bd14c1f0e693e21cd6f97 100644 (file)
--- a/bam.h
+++ b/bam.h
@@ -423,8 +423,8 @@ extern "C" {
          @abstract  Free the memory allocated for an alignment.
          @param  b  pointer to an alignment
         */
-#define bam_destroy1(b) do {           \
-               free((b)->data); free(b);       \
+#define bam_destroy1(b) do {                                   \
+               if (b) { free((b)->data); free(b); }    \
        } while (0)
 
        /*!
index f0dc1c4a9afb71451166d9e4b90b7d17a8b5c534..52ccd4cd1a70e982aec4de88da8941822e05354a 100644 (file)
@@ -179,12 +179,12 @@ void bam_rmdup_core(bamFile in, bamFile out)
        free(stack.a);
        bam_destroy1(b);
 }
+
 int bam_rmdup(int argc, char *argv[])
 {
        bamFile in, out;
        if (argc < 3) {
-               fprintf(stderr, "Usage: samtools rmdup <input.srt.bam> <output.bam>\n\n");
-               fprintf(stderr, "Note: Picard is recommended for this task.\n");
+               fprintf(stderr, "Usage: samtools rmdup <input.srt.bam> <output.bam>\n");
                return 1;
        }
        in = (strcmp(argv[1], "-") == 0)? bam_dopen(fileno(stdin), "r") : bam_open(argv[1], "r");
index cf1b7bd7b71461938bf43033b8c452eb4a914eaf..e23de1c4e17928f85c520549e8db56de9f40bb9d 100644 (file)
 #include <math.h>
 #include "sam.h"
 #include "khash.h"
+#include "klist.h"
 
-typedef struct {
-       int n, m;
-       int *a;
-} listelem_t;
-
-KHASH_MAP_INIT_INT(32, listelem_t)
-
-#define BLOCK_SIZE 65536
+#define QUEUE_CLEAR_SIZE 0x100000
+#define MAX_POS 0x7fffffff
 
 typedef struct {
+       int endpos;
+       uint32_t score:31, discarded:1;
        bam1_t *b;
-       int rpos, score;
-} elem_t;
+} elem_t, *elem_p;
+#define __free_elem(p) bam_destroy1((p)->data.b)
+KLIST_INIT(q, elem_t, __free_elem)
+typedef klist_t(q) queue_t;
+
+KHASH_MAP_INIT_INT(best, elem_p)
+typedef khash_t(best) besthash_t;
 
 typedef struct {
-       int n, max, x;
-       elem_t *buf;
-} buffer_t;
+       uint64_t n_checked, n_removed;
+       besthash_t *left, *rght;
+} lib_aux_t;
+KHASH_MAP_INIT_STR(lib, lib_aux_t)
 
-static int fill_buf(samfile_t *in, buffer_t *buf)
+static lib_aux_t *get_aux(khash_t(lib) *aux, const char *lib)
 {
-       int i, ret, last_tid, min_rpos = 0x7fffffff, capacity;
-       bam1_t *b = bam_init1();
-       bam1_core_t *c = &b->core;
-       // squeeze out the empty cells at the beginning
-       for (i = 0; i < buf->n; ++i)
-               if (buf->buf[i].b) break;
-       if (i < buf->n) { // squeeze
-               if (i > 0) {
-                       memmove(buf->buf, buf->buf + i, sizeof(elem_t) * (buf->n - i));
-                       buf->n = buf->n - i;
-               }
-       } else buf->n = 0;
-       // calculate min_rpos
-       for (i = 0; i < buf->n; ++i) {
-               elem_t *e = buf->buf + i;
-               if (e->b && e->rpos >= 0 && e->rpos < min_rpos)
-                       min_rpos = buf->buf[i].rpos;
-       }
-       // fill the buffer
-       buf->x = -1;
-       last_tid = buf->n? buf->buf[0].b->core.tid : -1;
-       capacity = buf->n + BLOCK_SIZE;
-       while ((ret = samread(in, b)) >= 0) {
-               elem_t *e;
-               uint8_t *qual = bam1_qual(b);
-               int is_mapped;
-               if (last_tid < 0) last_tid = c->tid;
-               if (c->tid != last_tid) {
-                       if (buf->x < 0) buf->x = buf->n;
-               }
-               if (buf->n >= buf->max) { // enlarge
-                       buf->max = buf->max? buf->max<<1 : 8;
-                       buf->buf = (elem_t*)realloc(buf->buf, sizeof(elem_t) * buf->max);
-               }
-               e = &buf->buf[buf->n++];
-               e->b = bam_dup1(b);
-               e->rpos = -1; e->score = 0;
-               for (i = 0; i < c->l_qseq; ++i) e->score += qual[i] + 1;
-               e->score = (double)e->score / sqrt(c->l_qseq + 1);
-               is_mapped = (c->tid < 0 || c->tid >= in->header->n_targets || (c->flag&BAM_FUNMAP))? 0 : 1;
-               if (!is_mapped) e->score = -1;
-               if (is_mapped && (c->flag & BAM_FREVERSE)) {
-                       e->rpos = b->core.pos + bam_calend(&b->core, bam1_cigar(b));
-                       if (min_rpos > e->rpos) min_rpos = e->rpos;
-               }
-               if (buf->n >= capacity) {
-                       if (is_mapped && c->pos <= min_rpos) capacity += BLOCK_SIZE;
-                       else break;
-               }
-       }
-       if (ret >= 0 && buf->x < 0) buf->x = buf->n;
-       bam_destroy1(b);
-       return buf->n;
+       khint_t k = kh_get(lib, aux, lib);
+       if (k == kh_end(aux)) {
+               int ret;
+               char *p = strdup(lib);
+               lib_aux_t *q;
+               k = kh_put(lib, aux, p, &ret);
+               q = &kh_val(aux, k);
+               q->left = kh_init(best);
+               q->rght = kh_init(best);
+               q->n_checked = q->n_removed = 0;
+               return q;
+       } else return &kh_val(aux, k);
+}
+
+static inline int sum_qual(const bam1_t *b)
+{
+       int i, q;
+       uint8_t *qual = bam1_qual(b);
+       for (i = q = 0; i < b->core.l_qseq; ++i) q += qual[i];
+       return q;
+}
+
+static inline elem_t *push_queue(queue_t *queue, const bam1_t *b, int endpos, int score)
+{
+       elem_t *p = kl_pushp(q, queue);
+       p->discarded = 0;
+       p->endpos = endpos; p->score = score;
+       p->b = bam_dup1(b);
+       return p;
 }
 
-static void rmdupse_buf(buffer_t *buf)
+static void clear_besthash(besthash_t *h, int32_t pos)
 {
-       khash_t(32) *h;
-       uint32_t key;
        khint_t k;
-       int mpos, i, upper;
-       listelem_t *p;
-       mpos = 0x7fffffff;
-       mpos = (buf->x == buf->n)? buf->buf[buf->x-1].b->core.pos : 0x7fffffff;
-       upper = (buf->x < 0)? buf->n : buf->x;
-       // fill the hash table
-       h = kh_init(32);
-       for (i = 0; i < upper; ++i) {
-               elem_t *e = buf->buf + i;
-               int ret;
-               if (e->score < 0) continue;
-               if (e->rpos >= 0) {
-                       if (e->rpos <= mpos) key = (uint32_t)e->rpos<<1 | 1;
-                       else continue;
-               } else {
-                       if (e->b->core.pos < mpos) key = (uint32_t)e->b->core.pos<<1;
-                       else continue;
-               }
-               k = kh_put(32, h, key, &ret);
-               p = &kh_val(h, k);
-               if (ret == 0) { // present in the hash table
-                       if (p->n == p->m) {
-                               p->m <<= 1;
-                               p->a = (int*)realloc(p->a, p->m * sizeof(int));
+       for (k = kh_begin(h); k != kh_end(h); ++k)
+               if (kh_exist(h, k) && kh_val(h, k)->endpos <= pos)
+                       kh_del(best, h, k);
+}
+
+static void dump_alignment(samfile_t *out, queue_t *queue, int32_t pos, khash_t(lib) *h)
+{
+       if (queue->size > QUEUE_CLEAR_SIZE || pos == MAX_POS) {
+               khint_t k;
+               while (1) {
+                       elem_t *q;
+                       if (queue->head == queue->tail) break;
+                       q = &kl_val(queue->head);
+                       if (q->discarded) {
+                               q->b->data_len = 0;
+                               kl_shift(q, queue, 0);
+                               continue;
                        }
-                       p->a[p->n++] = i;
-               } else {
-                       p->m = p->n = 1;
-                       p->a = (int*)calloc(p->m, sizeof(int));
-                       p->a[0] = i;
+                       if ((q->b->core.flag&BAM_FREVERSE) && q->endpos > pos) break;
+                       samwrite(out, q->b);
+                       q->b->data_len = 0;
+                       kl_shift(q, queue, 0);
                }
-       }
-       // rmdup
-       for (k = kh_begin(h); k < kh_end(h); ++k) {
-               if (kh_exist(h, k)) {
-                       int max, maxi;
-                       p = &kh_val(h, k);
-                       // get the max
-                       for (i = max = 0, maxi = -1; i < p->n; ++i) {
-                               if (buf->buf[p->a[i]].score > max) {
-                                       max = buf->buf[p->a[i]].score;
-                                       maxi = i;
-                               }
-                       }
-                       // mark the elements
-                       for (i = 0; i < p->n; ++i) {
-                               buf->buf[p->a[i]].score = -1;
-                               if (i != maxi) {
-                                       bam_destroy1(buf->buf[p->a[i]].b);
-                                       buf->buf[p->a[i]].b = 0;
-                               }
+               for (k = kh_begin(h); k != kh_end(h); ++k) {
+                       if (kh_exist(h, k)) {
+                               clear_besthash(kh_val(h, k).left, pos);
+                               clear_besthash(kh_val(h, k).rght, pos);
                        }
-                       // free
-                       free(p->a);
                }
        }
-       kh_destroy(32, h);
 }
 
-static void dump_buf(buffer_t *buf, samfile_t *out)
+void bam_rmdupse_core(samfile_t *in, samfile_t *out, int force_se)
 {
-       int i;
-       for (i = 0; i < buf->n; ++i) {
-               elem_t *e = buf->buf + i;
-               if (e->score != -1) break;
-               if (e->b) {
-                       samwrite(out, e->b);
-                       bam_destroy1(e->b);
-                       e->b = 0;
+       bam1_t *b;
+       queue_t *queue;
+       khint_t k;
+       int last_tid = -2;
+       khash_t(lib) *aux;
+
+       aux = kh_init(lib);
+       b = bam_init1();
+       queue = kl_init(q);
+       while (samread(in, b) >= 0) {
+               bam1_core_t *c = &b->core;
+               int endpos = bam_calend(c, bam1_cigar(b));
+               int score = sum_qual(b);
+               
+               if (last_tid != c->tid) {
+                       if (last_tid >= 0) dump_alignment(out, queue, MAX_POS, aux);
+                       last_tid = c->tid;
+               } else dump_alignment(out, queue, c->pos, aux);
+               if ((c->flag&BAM_FUNMAP) || ((c->flag&BAM_FPAIRED) && !force_se)) {
+                       push_queue(queue, b, endpos, score);
+               } else {
+                       const char *lib;
+                       const uint8_t *rg;
+                       lib_aux_t *q;
+                       besthash_t *h;
+                       uint32_t key;
+                       int ret;
+                       rg = bam_aux_get(b, "RG");
+                       lib = (rg == 0)? 0 : bam_strmap_get(in->header->rg2lib, (char*)(rg + 1));
+                       q = lib? get_aux(aux, lib) : get_aux(aux, "\t");
+                       h = (c->flag&BAM_FREVERSE)? q->rght : q->left;
+                       key = (c->flag&BAM_FREVERSE)? endpos : c->pos;
+                       k = kh_put(best, h, key, &ret);
+                       if (ret == 0) { // in the hash table
+                               elem_t *p = kh_val(h, k);
+                               if (p->score < score) {
+                                       if (c->flag&BAM_FREVERSE) { // mark "discarded" and push the queue
+                                               p->discarded = 1;
+                                               kh_val(h, k) = push_queue(queue, b, endpos, score);
+                                       } else { // replace
+                                               p->score = score; p->endpos = endpos;
+                                               bam_copy1(p->b, b);
+                                       }
+                               } // otherwise, discard the alignment
+                       } else kh_val(h, k) = push_queue(queue, b, endpos, score);
                }
        }
+       dump_alignment(out, queue, MAX_POS, aux);
+
+       for (k = kh_begin(aux); k != kh_end(aux); ++k) {
+               if (kh_exist(aux, k)) {
+                       kh_destroy(best, kh_val(aux, k).rght);
+                       kh_destroy(best, kh_val(aux, k).left);
+                       free((char*)kh_key(aux, k));
+               }
+       }
+       kh_destroy(lib, aux);
+       bam_destroy1(b);
+       kl_destroy(q, queue);
 }
 
 int bam_rmdupse(int argc, char *argv[])
 {
        samfile_t *in, *out;
-       buffer_t *buf;
        if (argc < 3) {
-               fprintf(stderr, "Usage: samtools rmdupse <in.bam> <out.bam>\n\n");
-               fprintf(stderr, "Note: Picard is recommended for this task.\n");
+               fprintf(stderr, "Usage: samtools rmdupse <in.bam> <out.bam>\n");
                return 1;
        }
-       buf = calloc(1, sizeof(buffer_t));
        in = samopen(argv[1], "rb", 0);
        out = samopen(argv[2], "wb", in->header);
-       while (fill_buf(in, buf)) {
-               rmdupse_buf(buf);
-               dump_buf(buf, out);
-       }
+       bam_rmdupse_core(in, out, 1);
        samclose(in); samclose(out);
-       free(buf->buf); free(buf);
        return 0;
 }
diff --git a/bamtk.c b/bamtk.c
index 86bfd873b089a7d943003962fd85e8b8b0a577b6..8fba4d0fd2877a881ac42987909d5871aa8ae5d0 100644 (file)
--- a/bamtk.c
+++ b/bamtk.c
@@ -9,7 +9,7 @@
 #endif
 
 #ifndef PACKAGE_VERSION
-#define PACKAGE_VERSION "0.1.6-14 (r480)"
+#define PACKAGE_VERSION "0.1.6-15 (r482)"
 #endif
 
 int bam_taf2baf(int argc, char *argv[]);
diff --git a/klist.h b/klist.h
new file mode 100644 (file)
index 0000000..2f17016
--- /dev/null
+++ b/klist.h
@@ -0,0 +1,96 @@
+#ifndef _LH3_KLIST_H
+#define _LH3_KLIST_H
+
+#include <stdlib.h>
+
+#define KMEMPOOL_INIT(name, kmptype_t, kmpfree_f)                                              \
+       typedef struct {                                                                                                        \
+               size_t cnt, n, max;                                                                                             \
+               kmptype_t **buf;                                                                                                \
+       } kmp_##name##_t;                                                                                                       \
+       static inline kmp_##name##_t *kmp_init_##name() {                                       \
+               return calloc(1, sizeof(kmp_##name##_t));                                               \
+       }                                                                                                                                       \
+       static inline void kmp_destroy_##name(kmp_##name##_t *mp) {                     \
+               size_t k;                                                                                                               \
+               for (k = 0; k < mp->n; ++k) {                                                                   \
+                       kmpfree_f(mp->buf[k]); free(mp->buf[k]);                                        \
+               }                                                                                                                               \
+               free(mp->buf); free(mp);                                                                                \
+       }                                                                                                                                       \
+       static inline kmptype_t *kmp_alloc_##name(kmp_##name##_t *mp) {         \
+               ++mp->cnt;                                                                                                              \
+               if (mp->n == 0) return calloc(1, sizeof(kmptype_t));                    \
+               return mp->buf[--mp->n];                                                                                \
+       }                                                                                                                                       \
+       static inline void kmp_free_##name(kmp_##name##_t *mp, kmptype_t *p) { \
+               --mp->cnt;                                                                                                              \
+               if (mp->n == mp->max) {                                                                                 \
+                       mp->max = mp->max? mp->max<<1 : 16;                                                     \
+                       mp->buf = realloc(mp->buf, sizeof(void*) * mp->max);            \
+               }                                                                                                                               \
+               mp->buf[mp->n++] = p;                                                                                   \
+       }
+
+#define kmempool_t(name) kmp_##name##_t
+#define kmp_init(name) kmp_init_##name()
+#define kmp_destroy(name, mp) kmp_destroy_##name(mp)
+#define kmp_alloc(name, mp) kmp_alloc_##name(mp)
+#define kmp_free(name, mp, p) kmp_free_##name(mp, p)
+
+#define KLIST_INIT(name, kltype_t, kmpfree_t)                                                  \
+       struct __kl1_##name {                                                                                           \
+               kltype_t data;                                                                                                  \
+               struct __kl1_##name *next;                                                                              \
+       };                                                                                                                                      \
+       typedef struct __kl1_##name kl1_##name;                                                         \
+       KMEMPOOL_INIT(name, kl1_##name, kmpfree_t)                                                      \
+       typedef struct {                                                                                                        \
+               kl1_##name *head, *tail;                                                                                \
+               kmp_##name##_t *mp;                                                                                             \
+               size_t size;                                                                                                    \
+       } kl_##name##_t;                                                                                                        \
+       static inline kl_##name##_t *kl_init_##name() {                                         \
+               kl_##name##_t *kl = calloc(1, sizeof(kl_##name##_t));                   \
+               kl->mp = kmp_init(name);                                                                                \
+               kl->head = kl->tail = kmp_alloc(name, kl->mp);                                  \
+               kl->head->next = 0;                                                                                             \
+               return kl;                                                                                                              \
+       }                                                                                                                                       \
+       static inline void kl_destroy_##name(kl_##name##_t *kl) {                       \
+               kl1_##name *p;                                                                                                  \
+               for (p = kl->head; p != kl->tail; p = p->next)                                  \
+                       kmp_free(name, kl->mp, p);                                                                      \
+               kmp_free(name, kl->mp, p);                                                                              \
+               kmp_destroy(name, kl->mp);                                                                              \
+               free(kl);                                                                                                               \
+       }                                                                                                                                       \
+       static inline kltype_t *kl_pushp_##name(kl_##name##_t *kl) {            \
+               kl1_##name *q, *p = kmp_alloc(name, kl->mp);                                    \
+               q = kl->tail; p->next = 0; kl->tail->next = p; kl->tail = p;    \
+               ++kl->size;                                                                                                             \
+               return &q->data;                                                                                                \
+       }                                                                                                                                       \
+       static inline int kl_shift_##name(kl_##name##_t *kl, kltype_t *d) { \
+               kl1_##name *p;                                                                                                  \
+               if (kl->head->next == 0) return -1;                                                             \
+               --kl->size;                                                                                                             \
+               p = kl->head; kl->head = kl->head->next;                                                \
+               if (d) *d = p->data;                                                                                    \
+               kmp_free(name, kl->mp, p);                                                                              \
+               return 0;                                                                                                               \
+       }
+
+#define kliter_t(name) kl1_##name
+#define klist_t(name) kl_##name##_t
+#define kl_val(iter) ((iter)->data)
+#define kl_next(iter) ((iter)->next)
+#define kl_begin(kl) ((kl)->head)
+#define kl_end(kl) ((kl)->tail)
+
+#define kl_init(name) kl_init_##name()
+#define kl_destroy(name, kl) kl_destroy_##name(kl)
+#define kl_pushp(name, kl) kl_pushp_##name(kl)
+#define kl_shift(name, kl, d) kl_shift_##name(kl, d)
+
+#endif