From: Heng Li Date: Mon, 19 Oct 2009 17:22:48 +0000 (+0000) Subject: * samtools-0.1.6-16 (r483) X-Git-Url: https://git.donarmstrong.com/?a=commitdiff_plain;h=8bee1f18418f068dc0d73ad6ec47084c2f830b32;p=samtools.git * samtools-0.1.6-16 (r483) * unify the interface of rmdup and rmdupse * a new bug found in rg2lib(). Have not been fixed yet. --- diff --git a/bam_rmdup.c b/bam_rmdup.c index 52ccd4c..fe6fbef 100644 --- a/bam_rmdup.c +++ b/bam_rmdup.c @@ -2,6 +2,7 @@ #include #include #include +#include #include "sam.h" typedef bam1_t *bam1_p; @@ -32,11 +33,11 @@ static inline void stack_insert(tmp_stack_t *stack, bam1_t *b) stack->a[stack->n++] = b; } -static inline void dump_best(tmp_stack_t *stack, bamFile out) +static inline void dump_best(tmp_stack_t *stack, samfile_t *out) { int i; for (i = 0; i != stack->n; ++i) { - bam_write1(out, stack->a[i]); + samwrite(out, stack->a[i]); bam_destroy1(stack->a[i]); } stack->n = 0; @@ -86,9 +87,8 @@ static inline int sum_qual(const bam1_t *b) return q; } -void bam_rmdup_core(bamFile in, bamFile out) +void bam_rmdup_core(samfile_t *in, samfile_t *out) { - bam_header_t *header; bam1_t *b; int last_tid = -1, last_pos = -1; tmp_stack_t stack; @@ -100,12 +100,9 @@ void bam_rmdup_core(bamFile in, bamFile out) del_set = kh_init(name); b = bam_init1(); memset(&stack, 0, sizeof(tmp_stack_t)); - header = bam_header_read(in); - sam_header_parse_rg(header); - bam_header_write(out, header); kh_resize(name, del_set, 4 * BUFFER_SIZE); - while (bam_read1(in, b) >= 0) { + while (samread(in, b) >= 0) { bam1_core_t *c = &b->core; if (c->tid != last_tid || last_pos != c->pos) { dump_best(&stack, out); // write the result @@ -117,16 +114,16 @@ void bam_rmdup_core(bamFile in, bamFile out) clear_del_set(del_set); } if ((int)c->tid == -1) { // append unmapped reads - bam_write1(out, b); - while (bam_read1(in, b) >= 0) bam_write1(out, b); + samwrite(out, b); + while (samread(in, b) >= 0) samwrite(out, b); break; } last_tid = c->tid; - fprintf(stderr, "[bam_rmdup_core] processing reference %s...\n", header->target_name[c->tid]); + fprintf(stderr, "[bam_rmdup_core] processing reference %s...\n", in->header->target_name[c->tid]); } } if (!(c->flag&BAM_FPAIRED) || (c->flag&(BAM_FUNMAP|BAM_FMUNMAP)) || (c->mtid >= 0 && c->tid != c->mtid)) { - bam_write1(out, b); + samwrite(out, b); } else if (c->isize > 0) { // paired, head uint64_t key = (uint64_t)c->pos<<32 | c->isize; const char *lib; @@ -134,7 +131,7 @@ void bam_rmdup_core(bamFile in, bamFile out) lib_aux_t *q; int ret; rg = bam_aux_get(b, "RG"); - lib = (rg == 0)? 0 : bam_strmap_get(header->rg2lib, (char*)(rg + 1)); + lib = (rg == 0)? 0 : bam_strmap_get(in->header->rg2lib, (char*)(rg + 1)); q = lib? get_aux(aux, lib) : get_aux(aux, "\t"); ++q->n_checked; k = kh_put(pos, q->best_hash, key, &ret); @@ -156,7 +153,7 @@ void bam_rmdup_core(bamFile in, bamFile out) if (k != kh_end(del_set)) { free((char*)kh_key(del_set, k)); kh_del(name, del_set, k); - } else bam_write1(out, b); + } else samwrite(out, b); } last_pos = c->pos; } @@ -173,28 +170,39 @@ void bam_rmdup_core(bamFile in, bamFile out) } kh_destroy(lib, aux); - bam_header_destroy(header); clear_del_set(del_set); kh_destroy(name, del_set); free(stack.a); bam_destroy1(b); } +void bam_rmdupse_core(samfile_t *in, samfile_t *out, int force_se); + int bam_rmdup(int argc, char *argv[]) { - bamFile in, out; - if (argc < 3) { - fprintf(stderr, "Usage: samtools rmdup \n"); + int c, is_se = 0, force_se = 0; + samfile_t *in, *out; + while ((c = getopt(argc, argv, "sS")) >= 0) { + switch (c) { + case 's': is_se = 1; break; + case 'S': force_se = is_se = 1; break; + } + } + if (optind + 2 > argc) { + fprintf(stderr, "\n"); + fprintf(stderr, "Usage: samtools rmdup [-sS] \n\n"); + fprintf(stderr, "Option: -s rmdup for SE reads\n"); + fprintf(stderr, " -S treat PE reads as SE in rmdup (force -s)\n\n"); return 1; } - in = (strcmp(argv[1], "-") == 0)? bam_dopen(fileno(stdin), "r") : bam_open(argv[1], "r"); - out = (strcmp(argv[2], "-") == 0)? bam_dopen(fileno(stdout), "w") : bam_open(argv[2], "w"); + in = samopen(argv[optind], "rb", 0); + out = samopen(argv[optind+1], "wb", in->header); if (in == 0 || out == 0) { fprintf(stderr, "[bam_rmdup] fail to read/write input files\n"); return 1; } - bam_rmdup_core(in, out); - bam_close(in); - bam_close(out); + if (is_se) bam_rmdupse_core(in, out, force_se); + else bam_rmdup_core(in, out); + samclose(in); samclose(out); return 0; } diff --git a/bam_rmdupse.c b/bam_rmdupse.c index e23de1c..149d844 100644 --- a/bam_rmdupse.c +++ b/bam_rmdupse.c @@ -124,11 +124,13 @@ void bam_rmdupse_core(samfile_t *in, samfile_t *out, int force_se) rg = bam_aux_get(b, "RG"); lib = (rg == 0)? 0 : bam_strmap_get(in->header->rg2lib, (char*)(rg + 1)); q = lib? get_aux(aux, lib) : get_aux(aux, "\t"); + ++q->n_checked; h = (c->flag&BAM_FREVERSE)? q->rght : q->left; key = (c->flag&BAM_FREVERSE)? endpos : c->pos; k = kh_put(best, h, key, &ret); if (ret == 0) { // in the hash table elem_t *p = kh_val(h, k); + ++q->n_removed; if (p->score < score) { if (c->flag&BAM_FREVERSE) { // mark "discarded" and push the queue p->discarded = 1; @@ -145,8 +147,10 @@ void bam_rmdupse_core(samfile_t *in, samfile_t *out, int force_se) for (k = kh_begin(aux); k != kh_end(aux); ++k) { if (kh_exist(aux, k)) { - kh_destroy(best, kh_val(aux, k).rght); - kh_destroy(best, kh_val(aux, k).left); + lib_aux_t *q = &kh_val(aux, k); + fprintf(stderr, "[bam_rmdupse_core] %lld / %lld = %.4lf in library '%s'\n", (long long)q->n_removed, + (long long)q->n_checked, (double)q->n_removed/q->n_checked, kh_key(aux, k)); + kh_destroy(best, q->left); kh_destroy(best, q->rght); free((char*)kh_key(aux, k)); } } @@ -154,17 +158,3 @@ void bam_rmdupse_core(samfile_t *in, samfile_t *out, int force_se) bam_destroy1(b); kl_destroy(q, queue); } - -int bam_rmdupse(int argc, char *argv[]) -{ - samfile_t *in, *out; - if (argc < 3) { - fprintf(stderr, "Usage: samtools rmdupse \n"); - return 1; - } - in = samopen(argv[1], "rb", 0); - out = samopen(argv[2], "wb", in->header); - bam_rmdupse_core(in, out, 1); - samclose(in); samclose(out); - return 0; -} diff --git a/bamtk.c b/bamtk.c index 8fba4d0..abb9015 100644 --- a/bamtk.c +++ b/bamtk.c @@ -9,7 +9,7 @@ #endif #ifndef PACKAGE_VERSION -#define PACKAGE_VERSION "0.1.6-15 (r482)" +#define PACKAGE_VERSION "0.1.6-16 (r483)" #endif int bam_taf2baf(int argc, char *argv[]); @@ -20,7 +20,6 @@ int bam_sort(int argc, char *argv[]); int bam_tview_main(int argc, char *argv[]); int bam_mating(int argc, char *argv[]); int bam_rmdup(int argc, char *argv[]); -int bam_rmdupse(int argc, char *argv[]); int bam_flagstat(int argc, char *argv[]); int bam_fillmd(int argc, char *argv[]); @@ -88,8 +87,8 @@ static int usage() fprintf(stderr, " glfview print GLFv3 file\n"); fprintf(stderr, " flagstat simple stats\n"); fprintf(stderr, " calmd recalculate MD/NM tags and '=' bases\n"); - fprintf(stderr, " merge merge sorted alignments (Picard recommended)\n"); - fprintf(stderr, " rmdup remove PCR duplicates (Picard recommended)\n"); + fprintf(stderr, " merge merge sorted alignments\n"); + fprintf(stderr, " rmdup remove PCR duplicates\n"); fprintf(stderr, "\n"); return 1; } @@ -113,7 +112,6 @@ int main(int argc, char *argv[]) else if (strcmp(argv[1], "faidx") == 0) return faidx_main(argc-1, argv+1); else if (strcmp(argv[1], "fixmate") == 0) return bam_mating(argc-1, argv+1); else if (strcmp(argv[1], "rmdup") == 0) return bam_rmdup(argc-1, argv+1); - else if (strcmp(argv[1], "rmdupse") == 0) return bam_rmdupse(argc-1, argv+1); else if (strcmp(argv[1], "glfview") == 0) return glf3_view_main(argc-1, argv+1); else if (strcmp(argv[1], "flagstat") == 0) return bam_flagstat(argc-1, argv+1); else if (strcmp(argv[1], "tagview") == 0) return bam_tagview(argc-1, argv+1);