10 KHASH_MAP_INIT_INT(32, listelem_t)
12 #define BLOCK_SIZE 65536
24 static int fill_buf(samfile_t *in, buffer_t *buf)
26 int i, ret, last_tid, min_rpos = 0x7fffffff, capacity;
27 bam1_t *b = bam_init1();
28 bam1_core_t *c = &b->core;
29 // squeeze out the empty cells at the beginning
30 for (i = 0; i < buf->n; ++i)
31 if (buf->buf[i].b) break;
32 if (i < buf->n) { // squeeze
34 memmove(buf->buf, buf->buf + i, sizeof(elem_t) * (buf->n - i));
39 for (i = 0; i < buf->n; ++i) {
40 elem_t *e = buf->buf + i;
41 if (e->b && e->rpos >= 0 && e->rpos < min_rpos)
42 min_rpos = buf->buf[i].rpos;
46 last_tid = buf->n? buf->buf[0].b->core.tid : -1;
47 capacity = buf->n + BLOCK_SIZE;
48 while ((ret = samread(in, b)) >= 0) {
50 uint8_t *qual = bam1_qual(b);
52 if (last_tid < 0) last_tid = c->tid;
53 if (c->tid != last_tid) {
54 if (buf->x < 0) buf->x = buf->n;
56 if (buf->n >= buf->max) { // enlarge
57 buf->max = buf->max? buf->max<<1 : 8;
58 buf->buf = (elem_t*)realloc(buf->buf, sizeof(elem_t) * buf->max);
60 e = &buf->buf[buf->n++];
62 e->rpos = -1; e->score = 0;
63 for (i = 0; i < c->l_qseq; ++i) e->score += qual[i] + 1;
64 e->score = (double)e->score / sqrt(c->l_qseq + 1);
65 is_mapped = (c->tid < 0 || c->tid >= in->header->n_targets || (c->flag&BAM_FUNMAP))? 0 : 1;
66 if (!is_mapped) e->score = -1;
67 if (is_mapped && (c->flag & BAM_FREVERSE)) {
68 e->rpos = b->core.pos + bam_calend(&b->core, bam1_cigar(b));
69 if (min_rpos > e->rpos) min_rpos = e->rpos;
71 if (buf->n >= capacity) {
72 if (is_mapped && c->pos <= min_rpos) capacity += BLOCK_SIZE;
76 if (ret >= 0 && buf->x < 0) buf->x = buf->n;
81 static void rmdupse_buf(buffer_t *buf)
89 mpos = (buf->x == buf->n)? buf->buf[buf->x-1].b->core.pos : 0x7fffffff;
90 upper = (buf->x < 0)? buf->n : buf->x;
91 // fill the hash table
93 for (i = 0; i < upper; ++i) {
94 elem_t *e = buf->buf + i;
96 if (e->score < 0) continue;
98 if (e->rpos <= mpos) key = (uint32_t)e->rpos<<1 | 1;
101 if (e->b->core.pos < mpos) key = (uint32_t)e->b->core.pos<<1;
104 k = kh_put(32, h, key, &ret);
106 if (ret == 0) { // present in the hash table
109 p->a = (int*)realloc(p->a, p->m * sizeof(int));
114 p->a = (int*)calloc(p->m, sizeof(int));
119 for (k = kh_begin(h); k < kh_end(h); ++k) {
120 if (kh_exist(h, k)) {
124 for (i = max = 0, maxi = -1; i < p->n; ++i) {
125 if (buf->buf[p->a[i]].score > max) {
126 max = buf->buf[p->a[i]].score;
131 for (i = 0; i < p->n; ++i) {
132 buf->buf[p->a[i]].score = -1;
134 bam_destroy1(buf->buf[p->a[i]].b);
135 buf->buf[p->a[i]].b = 0;
145 static void dump_buf(buffer_t *buf, samfile_t *out)
148 for (i = 0; i < buf->n; ++i) {
149 elem_t *e = buf->buf + i;
150 if (e->score != -1) break;
159 int bam_rmdupse(int argc, char *argv[])
164 fprintf(stderr, "Usage: samtools rmdupse <in.bam> <out.bam>\n");
167 buf = calloc(1, sizeof(buffer_t));
168 in = samopen(argv[1], "rb", 0);
169 out = samopen(argv[2], "wb", in->header);
170 while (fill_buf(in, buf)) {
174 samclose(in); samclose(out);
175 free(buf->buf); free(buf);