]> git.donarmstrong.com Git - samtools.git/commitdiff
* samtools-0.1.6-16 (r483)
authorHeng Li <lh3@live.co.uk>
Mon, 19 Oct 2009 17:22:48 +0000 (17:22 +0000)
committerHeng Li <lh3@live.co.uk>
Mon, 19 Oct 2009 17:22:48 +0000 (17:22 +0000)
 * unify the interface of rmdup and rmdupse
 * a new bug found in rg2lib(). Have not been fixed yet.

bam_rmdup.c
bam_rmdupse.c
bamtk.c

index 52ccd4cd1a70e982aec4de88da8941822e05354a..fe6fbeff97f13f427e84d6de72ab2d7e1f5e7deb 100644 (file)
@@ -2,6 +2,7 @@
 #include <string.h>
 #include <stdio.h>
 #include <zlib.h>
+#include <unistd.h>
 #include "sam.h"
 
 typedef bam1_t *bam1_p;
@@ -32,11 +33,11 @@ static inline void stack_insert(tmp_stack_t *stack, bam1_t *b)
        stack->a[stack->n++] = b;
 }
 
-static inline void dump_best(tmp_stack_t *stack, bamFile out)
+static inline void dump_best(tmp_stack_t *stack, samfile_t *out)
 {
        int i;
        for (i = 0; i != stack->n; ++i) {
-               bam_write1(out, stack->a[i]);
+               samwrite(out, stack->a[i]);
                bam_destroy1(stack->a[i]);
        }
        stack->n = 0;
@@ -86,9 +87,8 @@ static inline int sum_qual(const bam1_t *b)
        return q;
 }
 
-void bam_rmdup_core(bamFile in, bamFile out)
+void bam_rmdup_core(samfile_t *in, samfile_t *out)
 {
-       bam_header_t *header;
        bam1_t *b;
        int last_tid = -1, last_pos = -1;
        tmp_stack_t stack;
@@ -100,12 +100,9 @@ void bam_rmdup_core(bamFile in, bamFile out)
        del_set = kh_init(name);
        b = bam_init1();
        memset(&stack, 0, sizeof(tmp_stack_t));
-       header = bam_header_read(in);
-       sam_header_parse_rg(header);
-       bam_header_write(out, header);
 
        kh_resize(name, del_set, 4 * BUFFER_SIZE);
-       while (bam_read1(in, b) >= 0) {
+       while (samread(in, b) >= 0) {
                bam1_core_t *c = &b->core;
                if (c->tid != last_tid || last_pos != c->pos) {
                        dump_best(&stack, out); // write the result
@@ -117,16 +114,16 @@ void bam_rmdup_core(bamFile in, bamFile out)
                                        clear_del_set(del_set);
                                }
                                if ((int)c->tid == -1) { // append unmapped reads
-                                       bam_write1(out, b);
-                                       while (bam_read1(in, b) >= 0) bam_write1(out, b);
+                                       samwrite(out, b);
+                                       while (samread(in, b) >= 0) samwrite(out, b);
                                        break;
                                }
                                last_tid = c->tid;
-                               fprintf(stderr, "[bam_rmdup_core] processing reference %s...\n", header->target_name[c->tid]);
+                               fprintf(stderr, "[bam_rmdup_core] processing reference %s...\n", in->header->target_name[c->tid]);
                        }
                }
                if (!(c->flag&BAM_FPAIRED) || (c->flag&(BAM_FUNMAP|BAM_FMUNMAP)) || (c->mtid >= 0 && c->tid != c->mtid)) {
-                       bam_write1(out, b);
+                       samwrite(out, b);
                } else if (c->isize > 0) { // paired, head
                        uint64_t key = (uint64_t)c->pos<<32 | c->isize;
                        const char *lib;
@@ -134,7 +131,7 @@ void bam_rmdup_core(bamFile in, bamFile out)
                        lib_aux_t *q;
                        int ret;
                        rg = bam_aux_get(b, "RG");
-                       lib = (rg == 0)? 0 : bam_strmap_get(header->rg2lib, (char*)(rg + 1));
+                       lib = (rg == 0)? 0 : bam_strmap_get(in->header->rg2lib, (char*)(rg + 1));
                        q = lib? get_aux(aux, lib) : get_aux(aux, "\t");
                        ++q->n_checked;
                        k = kh_put(pos, q->best_hash, key, &ret);
@@ -156,7 +153,7 @@ void bam_rmdup_core(bamFile in, bamFile out)
                        if (k != kh_end(del_set)) {
                                free((char*)kh_key(del_set, k));
                                kh_del(name, del_set, k);
-                       } else bam_write1(out, b);
+                       } else samwrite(out, b);
                }
                last_pos = c->pos;
        }
@@ -173,28 +170,39 @@ void bam_rmdup_core(bamFile in, bamFile out)
        }
        kh_destroy(lib, aux);
 
-       bam_header_destroy(header);
        clear_del_set(del_set);
        kh_destroy(name, del_set);
        free(stack.a);
        bam_destroy1(b);
 }
 
+void bam_rmdupse_core(samfile_t *in, samfile_t *out, int force_se);
+
 int bam_rmdup(int argc, char *argv[])
 {
-       bamFile in, out;
-       if (argc < 3) {
-               fprintf(stderr, "Usage: samtools rmdup <input.srt.bam> <output.bam>\n");
+       int c, is_se = 0, force_se = 0;
+       samfile_t *in, *out;
+       while ((c = getopt(argc, argv, "sS")) >= 0) {
+               switch (c) {
+               case 's': is_se = 1; break;
+               case 'S': force_se = is_se = 1; break;
+               }
+       }
+       if (optind + 2 > argc) {
+               fprintf(stderr, "\n");
+               fprintf(stderr, "Usage:  samtools rmdup [-sS] <input.srt.bam> <output.bam>\n\n");
+               fprintf(stderr, "Option: -s    rmdup for SE reads\n");
+               fprintf(stderr, "        -S    treat PE reads as SE in rmdup (force -s)\n\n");
                return 1;
        }
-       in = (strcmp(argv[1], "-") == 0)? bam_dopen(fileno(stdin), "r") : bam_open(argv[1], "r");
-       out = (strcmp(argv[2], "-") == 0)? bam_dopen(fileno(stdout), "w") : bam_open(argv[2], "w");
+       in = samopen(argv[optind], "rb", 0);
+       out = samopen(argv[optind+1], "wb", in->header);
        if (in == 0 || out == 0) {
                fprintf(stderr, "[bam_rmdup] fail to read/write input files\n");
                return 1;
        }
-       bam_rmdup_core(in, out);
-       bam_close(in);
-       bam_close(out);
+       if (is_se) bam_rmdupse_core(in, out, force_se);
+       else bam_rmdup_core(in, out);
+       samclose(in); samclose(out);
        return 0;
 }
index e23de1c4e17928f85c520549e8db56de9f40bb9d..149d844c976dbe3b0561b61656e29a0fa7dd5255 100644 (file)
@@ -124,11 +124,13 @@ void bam_rmdupse_core(samfile_t *in, samfile_t *out, int force_se)
                        rg = bam_aux_get(b, "RG");
                        lib = (rg == 0)? 0 : bam_strmap_get(in->header->rg2lib, (char*)(rg + 1));
                        q = lib? get_aux(aux, lib) : get_aux(aux, "\t");
+                       ++q->n_checked;
                        h = (c->flag&BAM_FREVERSE)? q->rght : q->left;
                        key = (c->flag&BAM_FREVERSE)? endpos : c->pos;
                        k = kh_put(best, h, key, &ret);
                        if (ret == 0) { // in the hash table
                                elem_t *p = kh_val(h, k);
+                               ++q->n_removed;
                                if (p->score < score) {
                                        if (c->flag&BAM_FREVERSE) { // mark "discarded" and push the queue
                                                p->discarded = 1;
@@ -145,8 +147,10 @@ void bam_rmdupse_core(samfile_t *in, samfile_t *out, int force_se)
 
        for (k = kh_begin(aux); k != kh_end(aux); ++k) {
                if (kh_exist(aux, k)) {
-                       kh_destroy(best, kh_val(aux, k).rght);
-                       kh_destroy(best, kh_val(aux, k).left);
+                       lib_aux_t *q = &kh_val(aux, k);
+                       fprintf(stderr, "[bam_rmdupse_core] %lld / %lld = %.4lf in library '%s'\n", (long long)q->n_removed,
+                                       (long long)q->n_checked, (double)q->n_removed/q->n_checked, kh_key(aux, k));
+                       kh_destroy(best, q->left); kh_destroy(best, q->rght);
                        free((char*)kh_key(aux, k));
                }
        }
@@ -154,17 +158,3 @@ void bam_rmdupse_core(samfile_t *in, samfile_t *out, int force_se)
        bam_destroy1(b);
        kl_destroy(q, queue);
 }
-
-int bam_rmdupse(int argc, char *argv[])
-{
-       samfile_t *in, *out;
-       if (argc < 3) {
-               fprintf(stderr, "Usage: samtools rmdupse <in.bam> <out.bam>\n");
-               return 1;
-       }
-       in = samopen(argv[1], "rb", 0);
-       out = samopen(argv[2], "wb", in->header);
-       bam_rmdupse_core(in, out, 1);
-       samclose(in); samclose(out);
-       return 0;
-}
diff --git a/bamtk.c b/bamtk.c
index 8fba4d0fd2877a881ac42987909d5871aa8ae5d0..abb9015ab366a6d8dd6d7e256715f25bebd15869 100644 (file)
--- a/bamtk.c
+++ b/bamtk.c
@@ -9,7 +9,7 @@
 #endif
 
 #ifndef PACKAGE_VERSION
-#define PACKAGE_VERSION "0.1.6-15 (r482)"
+#define PACKAGE_VERSION "0.1.6-16 (r483)"
 #endif
 
 int bam_taf2baf(int argc, char *argv[]);
@@ -20,7 +20,6 @@ int bam_sort(int argc, char *argv[]);
 int bam_tview_main(int argc, char *argv[]);
 int bam_mating(int argc, char *argv[]);
 int bam_rmdup(int argc, char *argv[]);
-int bam_rmdupse(int argc, char *argv[]);
 int bam_flagstat(int argc, char *argv[]);
 int bam_fillmd(int argc, char *argv[]);
 
@@ -88,8 +87,8 @@ static int usage()
        fprintf(stderr, "         glfview     print GLFv3 file\n");
        fprintf(stderr, "         flagstat    simple stats\n");
        fprintf(stderr, "         calmd       recalculate MD/NM tags and '=' bases\n");
-       fprintf(stderr, "         merge       merge sorted alignments (Picard recommended)\n");
-       fprintf(stderr, "         rmdup       remove PCR duplicates (Picard recommended)\n");
+       fprintf(stderr, "         merge       merge sorted alignments\n");
+       fprintf(stderr, "         rmdup       remove PCR duplicates\n");
        fprintf(stderr, "\n");
        return 1;
 }
@@ -113,7 +112,6 @@ int main(int argc, char *argv[])
        else if (strcmp(argv[1], "faidx") == 0) return faidx_main(argc-1, argv+1);
        else if (strcmp(argv[1], "fixmate") == 0) return bam_mating(argc-1, argv+1);
        else if (strcmp(argv[1], "rmdup") == 0) return bam_rmdup(argc-1, argv+1);
-       else if (strcmp(argv[1], "rmdupse") == 0) return bam_rmdupse(argc-1, argv+1);
        else if (strcmp(argv[1], "glfview") == 0) return glf3_view_main(argc-1, argv+1);
        else if (strcmp(argv[1], "flagstat") == 0) return bam_flagstat(argc-1, argv+1);
        else if (strcmp(argv[1], "tagview") == 0) return bam_tagview(argc-1, argv+1);