From 228debef77a6dff27e0b34f6d14232eb74331402 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Sun, 18 Oct 2009 04:07:21 +0000 Subject: [PATCH] * samtools-0.1.6-14 (r480) * rmdup is now RG aware --- bam_rmdup.c | 97 +++++++++++++++++++++++++++++++++++++++++------------ bamtk.c | 2 +- 2 files changed, 77 insertions(+), 22 deletions(-) diff --git a/bam_rmdup.c b/bam_rmdup.c index 5da9460..f0dc1c4 100644 --- a/bam_rmdup.c +++ b/bam_rmdup.c @@ -2,15 +2,22 @@ #include #include #include -#include "bam.h" +#include "sam.h" typedef bam1_t *bam1_p; + #include "khash.h" KHASH_SET_INIT_STR(name) KHASH_MAP_INIT_INT64(pos, bam1_p) #define BUFFER_SIZE 0x40000 +typedef struct { + uint64_t n_checked, n_removed; + khash_t(pos) *best_hash; +} lib_aux_t; +KHASH_MAP_INIT_STR(lib, lib_aux_t) + typedef struct { int n, max; bam1_t **a; @@ -25,7 +32,7 @@ static inline void stack_insert(tmp_stack_t *stack, bam1_t *b) stack->a[stack->n++] = b; } -static inline void dump_best(tmp_stack_t *stack, khash_t(pos) *best_hash, bamFile out) +static inline void dump_best(tmp_stack_t *stack, bamFile out) { int i; for (i = 0; i != stack->n; ++i) { @@ -33,7 +40,6 @@ static inline void dump_best(tmp_stack_t *stack, khash_t(pos) *best_hash, bamFil bam_destroy1(stack->a[i]); } stack->n = 0; - if (kh_size(best_hash) > BUFFER_SIZE) kh_clear(pos, best_hash); } static void clear_del_set(khash_t(name) *del_set) @@ -45,32 +51,67 @@ static void clear_del_set(khash_t(name) *del_set) kh_clear(name, del_set); } +static lib_aux_t *get_aux(khash_t(lib) *aux, const char *lib) +{ + khint_t k = kh_get(lib, aux, lib); + if (k == kh_end(aux)) { + int ret; + char *p = strdup(lib); + lib_aux_t *q; + k = kh_put(lib, aux, p, &ret); + q = &kh_val(aux, k); + q->n_checked = q->n_removed = 0; + q->best_hash = kh_init(pos); + return q; + } else return &kh_val(aux, k); +} + +static void clear_best(khash_t(lib) *aux, int max) +{ + khint_t k; + for (k = kh_begin(aux); k != kh_end(aux); ++k) { + if (kh_exist(aux, k)) { + lib_aux_t *q = &kh_val(aux, k); + if (kh_size(q->best_hash) >= max) + kh_clear(pos, q->best_hash); + } + } +} + +static inline int sum_qual(const bam1_t *b) +{ + int i, q; + uint8_t *qual = bam1_qual(b); + for (i = q = 0; i < b->core.l_qseq; ++i) q += qual[i]; + return q; +} + void bam_rmdup_core(bamFile in, bamFile out) { bam_header_t *header; bam1_t *b; int last_tid = -1, last_pos = -1; - uint64_t n_checked = 0, n_removed = 0; tmp_stack_t stack; khint_t k; - khash_t(pos) *best_hash; + khash_t(lib) *aux; khash_t(name) *del_set; - - best_hash = kh_init(pos); + + aux = kh_init(lib); del_set = kh_init(name); b = bam_init1(); memset(&stack, 0, sizeof(tmp_stack_t)); header = bam_header_read(in); + sam_header_parse_rg(header); bam_header_write(out, header); kh_resize(name, del_set, 4 * BUFFER_SIZE); - kh_resize(pos, best_hash, 3 * BUFFER_SIZE); while (bam_read1(in, b) >= 0) { bam1_core_t *c = &b->core; if (c->tid != last_tid || last_pos != c->pos) { - dump_best(&stack, best_hash, out); // write the result + dump_best(&stack, out); // write the result + clear_best(aux, BUFFER_SIZE); if (c->tid != last_tid) { - kh_clear(pos, best_hash); + clear_best(aux, 0); if (kh_size(del_set)) { // check fprintf(stderr, "[bam_rmdup_core] %llu unmatched pairs\n", (long long)kh_size(del_set)); clear_del_set(del_set); @@ -88,21 +129,27 @@ void bam_rmdup_core(bamFile in, bamFile out) bam_write1(out, b); } else if (c->isize > 0) { // paired, head uint64_t key = (uint64_t)c->pos<<32 | c->isize; + const char *lib; + const uint8_t *rg; + lib_aux_t *q; int ret; - ++n_checked; - k = kh_put(pos, best_hash, key, &ret); + rg = bam_aux_get(b, "RG"); + lib = (rg == 0)? 0 : bam_strmap_get(header->rg2lib, (char*)(rg + 1)); + q = lib? get_aux(aux, lib) : get_aux(aux, "\t"); + ++q->n_checked; + k = kh_put(pos, q->best_hash, key, &ret); if (ret == 0) { // found in best_hash - bam1_t *p = kh_val(best_hash, k); - ++n_removed; - if (p->core.qual < c->qual) { // the current alignment is better + bam1_t *p = kh_val(q->best_hash, k); + ++q->n_removed; + if (sum_qual(p) < sum_qual(b)) { // the current alignment is better; this can be accelerated in principle kh_put(name, del_set, strdup(bam1_qname(p)), &ret); // p will be removed bam_copy1(p, b); // replaced as b } else kh_put(name, del_set, strdup(bam1_qname(b)), &ret); // b will be removed if (ret == 0) fprintf(stderr, "[bam_rmdup_core] inconsistent BAM file for pair '%s'. Continue anyway.\n", bam1_qname(b)); } else { // not found in best_hash - kh_val(best_hash, k) = bam_dup1(b); - stack_insert(&stack, kh_val(best_hash, k)); + kh_val(q->best_hash, k) = bam_dup1(b); + stack_insert(&stack, kh_val(q->best_hash, k)); } } else { // paired, tail k = kh_get(name, del_set, bam1_qname(b)); @@ -113,16 +160,24 @@ void bam_rmdup_core(bamFile in, bamFile out) } last_pos = c->pos; } - dump_best(&stack, best_hash, out); + + for (k = kh_begin(aux); k != kh_end(aux); ++k) { + if (kh_exist(aux, k)) { + lib_aux_t *q = &kh_val(aux, k); + dump_best(&stack, out); + fprintf(stderr, "[bam_rmdup_core] %lld / %lld = %.4lf in library '%s'\n", (long long)q->n_removed, + (long long)q->n_checked, (double)q->n_removed/q->n_checked, kh_key(aux, k)); + kh_destroy(pos, q->best_hash); + free((char*)kh_key(aux, k)); + } + } + kh_destroy(lib, aux); bam_header_destroy(header); clear_del_set(del_set); kh_destroy(name, del_set); - kh_destroy(pos, best_hash); free(stack.a); bam_destroy1(b); - fprintf(stderr, "[bam_rmdup_core] %lld / %lld = %.4lf\n", (long long)n_removed, (long long)n_checked, - (double)n_removed/n_checked); } int bam_rmdup(int argc, char *argv[]) { diff --git a/bamtk.c b/bamtk.c index b67971f..86bfd87 100644 --- a/bamtk.c +++ b/bamtk.c @@ -9,7 +9,7 @@ #endif #ifndef PACKAGE_VERSION -#define PACKAGE_VERSION "0.1.6-13 (r479)" +#define PACKAGE_VERSION "0.1.6-14 (r480)" #endif int bam_taf2baf(int argc, char *argv[]); -- 2.39.5