]> git.donarmstrong.com Git - samtools.git/commitdiff
* samtools-0.1.6-14 (r480)
authorHeng Li <lh3@live.co.uk>
Sun, 18 Oct 2009 04:07:21 +0000 (04:07 +0000)
committerHeng Li <lh3@live.co.uk>
Sun, 18 Oct 2009 04:07:21 +0000 (04:07 +0000)
 * rmdup is now RG aware

bam_rmdup.c
bamtk.c

index 5da946011f10ec72758c18cc10af75180a7d8f4e..f0dc1c4a9afb71451166d9e4b90b7d17a8b5c534 100644 (file)
@@ -2,15 +2,22 @@
 #include <string.h>
 #include <stdio.h>
 #include <zlib.h>
-#include "bam.h"
+#include "sam.h"
 
 typedef bam1_t *bam1_p;
+
 #include "khash.h"
 KHASH_SET_INIT_STR(name)
 KHASH_MAP_INIT_INT64(pos, bam1_p)
 
 #define BUFFER_SIZE 0x40000
 
+typedef struct {
+       uint64_t n_checked, n_removed;
+       khash_t(pos) *best_hash;
+} lib_aux_t;
+KHASH_MAP_INIT_STR(lib, lib_aux_t)
+
 typedef struct {
        int n, max;
        bam1_t **a;
@@ -25,7 +32,7 @@ static inline void stack_insert(tmp_stack_t *stack, bam1_t *b)
        stack->a[stack->n++] = b;
 }
 
-static inline void dump_best(tmp_stack_t *stack, khash_t(pos) *best_hash, bamFile out)
+static inline void dump_best(tmp_stack_t *stack, bamFile out)
 {
        int i;
        for (i = 0; i != stack->n; ++i) {
@@ -33,7 +40,6 @@ static inline void dump_best(tmp_stack_t *stack, khash_t(pos) *best_hash, bamFil
                bam_destroy1(stack->a[i]);
        }
        stack->n = 0;
-       if (kh_size(best_hash) > BUFFER_SIZE) kh_clear(pos, best_hash);
 }
 
 static void clear_del_set(khash_t(name) *del_set)
@@ -45,32 +51,67 @@ static void clear_del_set(khash_t(name) *del_set)
        kh_clear(name, del_set);
 }
 
+static lib_aux_t *get_aux(khash_t(lib) *aux, const char *lib)
+{
+       khint_t k = kh_get(lib, aux, lib);
+       if (k == kh_end(aux)) {
+               int ret;
+               char *p = strdup(lib);
+               lib_aux_t *q;
+               k = kh_put(lib, aux, p, &ret);
+               q = &kh_val(aux, k);
+               q->n_checked = q->n_removed = 0;
+               q->best_hash = kh_init(pos);
+               return q;
+       } else return &kh_val(aux, k);
+}
+
+static void clear_best(khash_t(lib) *aux, int max)
+{
+       khint_t k;
+       for (k = kh_begin(aux); k != kh_end(aux); ++k) {
+               if (kh_exist(aux, k)) {
+                       lib_aux_t *q = &kh_val(aux, k);
+                       if (kh_size(q->best_hash) >= max)
+                               kh_clear(pos, q->best_hash);
+               }
+       }
+}
+
+static inline int sum_qual(const bam1_t *b)
+{
+       int i, q;
+       uint8_t *qual = bam1_qual(b);
+       for (i = q = 0; i < b->core.l_qseq; ++i) q += qual[i];
+       return q;
+}
+
 void bam_rmdup_core(bamFile in, bamFile out)
 {
        bam_header_t *header;
        bam1_t *b;
        int last_tid = -1, last_pos = -1;
-       uint64_t n_checked = 0, n_removed = 0;
        tmp_stack_t stack;
        khint_t k;
-       khash_t(pos) *best_hash;
+       khash_t(lib) *aux;
        khash_t(name) *del_set;
-
-       best_hash = kh_init(pos);
+       
+       aux = kh_init(lib);
        del_set = kh_init(name);
        b = bam_init1();
        memset(&stack, 0, sizeof(tmp_stack_t));
        header = bam_header_read(in);
+       sam_header_parse_rg(header);
        bam_header_write(out, header);
 
        kh_resize(name, del_set, 4 * BUFFER_SIZE);
-       kh_resize(pos, best_hash, 3 * BUFFER_SIZE);
        while (bam_read1(in, b) >= 0) {
                bam1_core_t *c = &b->core;
                if (c->tid != last_tid || last_pos != c->pos) {
-                       dump_best(&stack, best_hash, out); // write the result
+                       dump_best(&stack, out); // write the result
+                       clear_best(aux, BUFFER_SIZE);
                        if (c->tid != last_tid) {
-                               kh_clear(pos, best_hash);
+                               clear_best(aux, 0);
                                if (kh_size(del_set)) { // check
                                        fprintf(stderr, "[bam_rmdup_core] %llu unmatched pairs\n", (long long)kh_size(del_set));
                                        clear_del_set(del_set);
@@ -88,21 +129,27 @@ void bam_rmdup_core(bamFile in, bamFile out)
                        bam_write1(out, b);
                } else if (c->isize > 0) { // paired, head
                        uint64_t key = (uint64_t)c->pos<<32 | c->isize;
+                       const char *lib;
+                       const uint8_t *rg;
+                       lib_aux_t *q;
                        int ret;
-                       ++n_checked;
-                       k = kh_put(pos, best_hash, key, &ret);
+                       rg = bam_aux_get(b, "RG");
+                       lib = (rg == 0)? 0 : bam_strmap_get(header->rg2lib, (char*)(rg + 1));
+                       q = lib? get_aux(aux, lib) : get_aux(aux, "\t");
+                       ++q->n_checked;
+                       k = kh_put(pos, q->best_hash, key, &ret);
                        if (ret == 0) { // found in best_hash
-                               bam1_t *p = kh_val(best_hash, k);
-                               ++n_removed;
-                               if (p->core.qual < c->qual) { // the current alignment is better
+                               bam1_t *p = kh_val(q->best_hash, k);
+                               ++q->n_removed;
+                               if (sum_qual(p) < sum_qual(b)) { // the current alignment is better; this can be accelerated in principle
                                        kh_put(name, del_set, strdup(bam1_qname(p)), &ret); // p will be removed
                                        bam_copy1(p, b); // replaced as b
                                } else kh_put(name, del_set, strdup(bam1_qname(b)), &ret); // b will be removed
                                if (ret == 0)
                                        fprintf(stderr, "[bam_rmdup_core] inconsistent BAM file for pair '%s'. Continue anyway.\n", bam1_qname(b));
                        } else { // not found in best_hash
-                               kh_val(best_hash, k) = bam_dup1(b);
-                               stack_insert(&stack, kh_val(best_hash, k));
+                               kh_val(q->best_hash, k) = bam_dup1(b);
+                               stack_insert(&stack, kh_val(q->best_hash, k));
                        }
                } else { // paired, tail
                        k = kh_get(name, del_set, bam1_qname(b));
@@ -113,16 +160,24 @@ void bam_rmdup_core(bamFile in, bamFile out)
                }
                last_pos = c->pos;
        }
-       dump_best(&stack, best_hash, out);
+
+       for (k = kh_begin(aux); k != kh_end(aux); ++k) {
+               if (kh_exist(aux, k)) {
+                       lib_aux_t *q = &kh_val(aux, k);                 
+                       dump_best(&stack, out);
+                       fprintf(stderr, "[bam_rmdup_core] %lld / %lld = %.4lf in library '%s'\n", (long long)q->n_removed,
+                                       (long long)q->n_checked, (double)q->n_removed/q->n_checked, kh_key(aux, k));
+                       kh_destroy(pos, q->best_hash);
+                       free((char*)kh_key(aux, k));
+               }
+       }
+       kh_destroy(lib, aux);
 
        bam_header_destroy(header);
        clear_del_set(del_set);
        kh_destroy(name, del_set);
-       kh_destroy(pos, best_hash);
        free(stack.a);
        bam_destroy1(b);
-       fprintf(stderr, "[bam_rmdup_core] %lld / %lld = %.4lf\n", (long long)n_removed, (long long)n_checked,
-                       (double)n_removed/n_checked);
 }
 int bam_rmdup(int argc, char *argv[])
 {
diff --git a/bamtk.c b/bamtk.c
index b67971fd3ffec0b31b3ec8fc099de82ce9d82b62..86bfd873b089a7d943003962fd85e8b8b0a577b6 100644 (file)
--- a/bamtk.c
+++ b/bamtk.c
@@ -9,7 +9,7 @@
 #endif
 
 #ifndef PACKAGE_VERSION
-#define PACKAGE_VERSION "0.1.6-13 (r479)"
+#define PACKAGE_VERSION "0.1.6-14 (r480)"
 #endif
 
 int bam_taf2baf(int argc, char *argv[]);