#include <string.h>
#include <stdio.h>
#include <zlib.h>
+#include <unistd.h>
#include "sam.h"
typedef bam1_t *bam1_p;
stack->a[stack->n++] = b;
}
-static inline void dump_best(tmp_stack_t *stack, bamFile out)
+static inline void dump_best(tmp_stack_t *stack, samfile_t *out)
{
int i;
for (i = 0; i != stack->n; ++i) {
- bam_write1(out, stack->a[i]);
+ samwrite(out, stack->a[i]);
bam_destroy1(stack->a[i]);
}
stack->n = 0;
return q;
}
-void bam_rmdup_core(bamFile in, bamFile out)
+void bam_rmdup_core(samfile_t *in, samfile_t *out)
{
- bam_header_t *header;
bam1_t *b;
int last_tid = -1, last_pos = -1;
tmp_stack_t stack;
del_set = kh_init(name);
b = bam_init1();
memset(&stack, 0, sizeof(tmp_stack_t));
- header = bam_header_read(in);
- sam_header_parse_rg(header);
- bam_header_write(out, header);
kh_resize(name, del_set, 4 * BUFFER_SIZE);
- while (bam_read1(in, b) >= 0) {
+ while (samread(in, b) >= 0) {
bam1_core_t *c = &b->core;
if (c->tid != last_tid || last_pos != c->pos) {
dump_best(&stack, out); // write the result
clear_del_set(del_set);
}
if ((int)c->tid == -1) { // append unmapped reads
- bam_write1(out, b);
- while (bam_read1(in, b) >= 0) bam_write1(out, b);
+ samwrite(out, b);
+ while (samread(in, b) >= 0) samwrite(out, b);
break;
}
last_tid = c->tid;
- fprintf(stderr, "[bam_rmdup_core] processing reference %s...\n", header->target_name[c->tid]);
+ fprintf(stderr, "[bam_rmdup_core] processing reference %s...\n", in->header->target_name[c->tid]);
}
}
if (!(c->flag&BAM_FPAIRED) || (c->flag&(BAM_FUNMAP|BAM_FMUNMAP)) || (c->mtid >= 0 && c->tid != c->mtid)) {
- bam_write1(out, b);
+ samwrite(out, b);
} else if (c->isize > 0) { // paired, head
uint64_t key = (uint64_t)c->pos<<32 | c->isize;
const char *lib;
lib_aux_t *q;
int ret;
rg = bam_aux_get(b, "RG");
- lib = (rg == 0)? 0 : bam_strmap_get(header->rg2lib, (char*)(rg + 1));
+ lib = (rg == 0)? 0 : bam_strmap_get(in->header->rg2lib, (char*)(rg + 1));
q = lib? get_aux(aux, lib) : get_aux(aux, "\t");
++q->n_checked;
k = kh_put(pos, q->best_hash, key, &ret);
if (k != kh_end(del_set)) {
free((char*)kh_key(del_set, k));
kh_del(name, del_set, k);
- } else bam_write1(out, b);
+ } else samwrite(out, b);
}
last_pos = c->pos;
}
}
kh_destroy(lib, aux);
- bam_header_destroy(header);
clear_del_set(del_set);
kh_destroy(name, del_set);
free(stack.a);
bam_destroy1(b);
}
+void bam_rmdupse_core(samfile_t *in, samfile_t *out, int force_se);
+
int bam_rmdup(int argc, char *argv[])
{
- bamFile in, out;
- if (argc < 3) {
- fprintf(stderr, "Usage: samtools rmdup <input.srt.bam> <output.bam>\n");
+ int c, is_se = 0, force_se = 0;
+ samfile_t *in, *out;
+ while ((c = getopt(argc, argv, "sS")) >= 0) {
+ switch (c) {
+ case 's': is_se = 1; break;
+ case 'S': force_se = is_se = 1; break;
+ }
+ }
+ if (optind + 2 > argc) {
+ fprintf(stderr, "\n");
+ fprintf(stderr, "Usage: samtools rmdup [-sS] <input.srt.bam> <output.bam>\n\n");
+ fprintf(stderr, "Option: -s rmdup for SE reads\n");
+ fprintf(stderr, " -S treat PE reads as SE in rmdup (force -s)\n\n");
return 1;
}
- in = (strcmp(argv[1], "-") == 0)? bam_dopen(fileno(stdin), "r") : bam_open(argv[1], "r");
- out = (strcmp(argv[2], "-") == 0)? bam_dopen(fileno(stdout), "w") : bam_open(argv[2], "w");
+ in = samopen(argv[optind], "rb", 0);
+ out = samopen(argv[optind+1], "wb", in->header);
if (in == 0 || out == 0) {
fprintf(stderr, "[bam_rmdup] fail to read/write input files\n");
return 1;
}
- bam_rmdup_core(in, out);
- bam_close(in);
- bam_close(out);
+ if (is_se) bam_rmdupse_core(in, out, force_se);
+ else bam_rmdup_core(in, out);
+ samclose(in); samclose(out);
return 0;
}
rg = bam_aux_get(b, "RG");
lib = (rg == 0)? 0 : bam_strmap_get(in->header->rg2lib, (char*)(rg + 1));
q = lib? get_aux(aux, lib) : get_aux(aux, "\t");
+ ++q->n_checked;
h = (c->flag&BAM_FREVERSE)? q->rght : q->left;
key = (c->flag&BAM_FREVERSE)? endpos : c->pos;
k = kh_put(best, h, key, &ret);
if (ret == 0) { // in the hash table
elem_t *p = kh_val(h, k);
+ ++q->n_removed;
if (p->score < score) {
if (c->flag&BAM_FREVERSE) { // mark "discarded" and push the queue
p->discarded = 1;
for (k = kh_begin(aux); k != kh_end(aux); ++k) {
if (kh_exist(aux, k)) {
- kh_destroy(best, kh_val(aux, k).rght);
- kh_destroy(best, kh_val(aux, k).left);
+ lib_aux_t *q = &kh_val(aux, k);
+ fprintf(stderr, "[bam_rmdupse_core] %lld / %lld = %.4lf in library '%s'\n", (long long)q->n_removed,
+ (long long)q->n_checked, (double)q->n_removed/q->n_checked, kh_key(aux, k));
+ kh_destroy(best, q->left); kh_destroy(best, q->rght);
free((char*)kh_key(aux, k));
}
}
bam_destroy1(b);
kl_destroy(q, queue);
}
-
-int bam_rmdupse(int argc, char *argv[])
-{
- samfile_t *in, *out;
- if (argc < 3) {
- fprintf(stderr, "Usage: samtools rmdupse <in.bam> <out.bam>\n");
- return 1;
- }
- in = samopen(argv[1], "rb", 0);
- out = samopen(argv[2], "wb", in->header);
- bam_rmdupse_core(in, out, 1);
- samclose(in); samclose(out);
- return 0;
-}
#endif
#ifndef PACKAGE_VERSION
-#define PACKAGE_VERSION "0.1.6-15 (r482)"
+#define PACKAGE_VERSION "0.1.6-16 (r483)"
#endif
int bam_taf2baf(int argc, char *argv[]);
int bam_tview_main(int argc, char *argv[]);
int bam_mating(int argc, char *argv[]);
int bam_rmdup(int argc, char *argv[]);
-int bam_rmdupse(int argc, char *argv[]);
int bam_flagstat(int argc, char *argv[]);
int bam_fillmd(int argc, char *argv[]);
fprintf(stderr, " glfview print GLFv3 file\n");
fprintf(stderr, " flagstat simple stats\n");
fprintf(stderr, " calmd recalculate MD/NM tags and '=' bases\n");
- fprintf(stderr, " merge merge sorted alignments (Picard recommended)\n");
- fprintf(stderr, " rmdup remove PCR duplicates (Picard recommended)\n");
+ fprintf(stderr, " merge merge sorted alignments\n");
+ fprintf(stderr, " rmdup remove PCR duplicates\n");
fprintf(stderr, "\n");
return 1;
}
else if (strcmp(argv[1], "faidx") == 0) return faidx_main(argc-1, argv+1);
else if (strcmp(argv[1], "fixmate") == 0) return bam_mating(argc-1, argv+1);
else if (strcmp(argv[1], "rmdup") == 0) return bam_rmdup(argc-1, argv+1);
- else if (strcmp(argv[1], "rmdupse") == 0) return bam_rmdupse(argc-1, argv+1);
else if (strcmp(argv[1], "glfview") == 0) return glf3_view_main(argc-1, argv+1);
else if (strcmp(argv[1], "flagstat") == 0) return bam_flagstat(argc-1, argv+1);
else if (strcmp(argv[1], "tagview") == 0) return bam_tagview(argc-1, argv+1);